In this lab, we are going to 1.) simulate geostatistical data, 2.) examine spatial dependence in errors, 3.) fit a Bayesian geostatistical model to simulated data using JAGS, 4.) obtain spatial predictions, 5.) compare spatial predictions to the “true” simulated process, and 6.) create a Bayesian model for Iowa temperature data and obtain predictions in a rectangular region encompassing the data.
The objectives of this lab are to:
Gain familiarity with spatial processes and models.
Obtain experience simulating Gaussian spatial processes.
Learn how to manipulate spatial data in R.
Use variograms to assess spatial structure in data and/or residuals.
See how to fit Bayesian geostatistical models and obtain spatial predictions.
In what follows, assume the Bayesian geostatistical model:
\[\begin{align} \mathbf{y} &\sim \text{N}\left(\mathbf{X}\boldsymbol\beta,\frac{1}{\tau}\exp\left(-\frac{\mathbf{D}}{\phi}\right)\right) \notag \\ \boldsymbol\beta &\sim \text{N}(\boldsymbol\mu_\beta,\boldsymbol\Sigma_\beta) \notag \\ \tau &\sim \text{Gamma}(0.01,0.01) \notag \\ \phi &\sim \text{Gamma}(a,b) \notag \end{align}\]
Note that a strong prior on the range parameter \(\phi\) is often helpful in geostatistical models; especially when the sample size is not very large. This is usually ok because, for the exponential covariance model, the effective range of spatial structure (i.e., <95% of the structure) occurs within distance \(3\phi\). Thus, we would not expect to be able to learn about \(\phi\) if \(3\phi\) is larger than about half the maximum distance in the spatial domain where data are collected. Also, more often than not, the range of spatial structure is smaller than you think because it is the dependence in the errors rather than in the data themselves. Usually most of the pattern in the response data is explained by the covariates, leaving small scale spatial structure for the covariance to accommodate.
####
#### Load Libraries
####
library(maps)
library(mvtnorm)
library(mvnfast)
library(rjags)
library(gstat)
library(MCMCvis)
library(sf)
library(ggplot2)
library(gridExtra)
####
#### Change Projection to UTM
####
ut.latlon <- st_as_sf(map("state", regions="utah", fill = T, plot = F))
ut.utm <- st_transform(ut.latlon, crs = st_crs("+proj=utm +zone=12 +datum=WGS84"))
ggplot(ut.utm) +
geom_sf() +
theme_bw() +
coord_sf(datum = st_crs(ut.utm))
####
#### Specify points locations that span the spatial domain
####
utgrid.locs <- st_make_grid(ut.utm, n = c(25, 30), what = 'centers')
####
#### Map these locations
####
ggplot(ut.utm) +
geom_sf() +
geom_sf(data = utgrid.locs) +
theme_bw() +
coord_sf(datum = st_crs(ut.utm))
ntot <- length(utgrid.locs)
####
#### Create Euclidean distance matrix
####
D <- as.matrix(st_distance(utgrid.locs))
units(D) <- NULL
####
#### Specify an exponential covariance model
####
s2 <- 2
phi <- 1.5*10^5
Sigma <- s2*exp(-D/phi)
plot(seq(0, max(D), length.out = 20),
s2*exp(-seq(0, max(D), length.out = 20)/phi),
type = "o", ylab = "cov", xlab = "distance")
####
#### Simulate the spatially correlated error process from a MVN
####
set.seed(13)
eps <- as.vector(rmvnorm(1,matrix(0,ntot,1),Sigma,method="chol")) # may take some time
utgrid.locs <- st_as_sf(utgrid.locs)
utgrid.locs$eps <- eps
ggplot(utgrid.locs) +
geom_sf(data=ut.utm,fill=NA)+
geom_sf(aes(colour = eps)) +
scale_colour_distiller(palette = 'RdYlBu') +
theme_bw() +
coord_sf(datum = st_crs(utgrid.locs))
####
#### Specify how much data we want to keep
####
sample.idx <- sample.int(ntot, size = 0.2*ntot)
utgrid.sample <- utgrid.locs[sample.idx, ]
####
#### Make map of these correlated errors
####
ggplot(utgrid.sample) +
geom_sf(data=ut.utm,fill=NA)+
geom_sf(aes(colour = eps)) +
scale_colour_distiller(palette = 'RdYlBu') +
theme_bw() +
coord_sf(datum = st_crs(utgrid.sample))
####
#### Examine empirical spatial structure in simulated process
####
n <- nrow(utgrid.sample)
var <- variogram(object = eps~1,
data = as_Spatial(utgrid.sample), covariogram=TRUE)
plot(var)
####
#### Simulate (or specify) spatial covariates and then calculate data
####
p <- 3
X <- matrix(1, nrow = ntot, ncol = p)
utgrid.coords <- st_coordinates(utgrid.locs)
X[,2] <- -cos(scale(utgrid.coords[,1])-.5)*cos(scale(utgrid.coords[,2]))
X[,3] <- scale(utgrid.coords[,1])+scale(utgrid.coords[,2])
####
#### Choose a known set of values for the regression coefficients beta
####
beta <- c(1, -2, 1)
####
#### Create observed data vector y
####
y <- X %*% beta + eps
####
#### Obtain maps of y, epsilon, and covariates
####
utgrid.locs$y <- y
utgrid.locs$x1 <- X[, 2]
utgrid.locs$x2 <- X[, 3]
p1 <- ggplot(utgrid.locs) +
geom_sf(data=ut.utm,fill=NA)+
geom_sf(aes(colour = y)) +
scale_colour_distiller(palette = 'RdYlBu') +
theme_bw() +
coord_sf(datum = st_crs(utgrid.locs))
p2 <- ggplot(utgrid.locs) +
geom_sf(data=ut.utm,fill=NA)+
geom_sf(aes(colour = eps)) +
scale_colour_distiller(palette = 'RdYlBu') +
theme_bw()+
coord_sf(datum = st_crs(utgrid.locs))
p3 <- ggplot(utgrid.locs) +
geom_sf(aes(colour = x1)) +
scale_colour_distiller(palette = 'RdYlBu') +
geom_sf(data=ut.utm,fill=NA)+
theme_bw()+
coord_sf(datum = st_crs(utgrid.locs))
p4 <- ggplot(utgrid.locs) +
geom_sf(aes(colour = x2)) +
scale_colour_distiller(palette = 'RdYlBu') +
geom_sf(data=ut.utm,fill=NA)+
theme_bw()+
coord_sf(datum = st_crs(utgrid.locs))
grid.arrange(p1, p2, p3, p4, nrow = 2)
##Just for the sampled locations
p1b <- ggplot(utgrid.locs[sample.idx, ]) +
geom_sf(data=ut.utm,fill=NA)+
geom_sf(aes(colour = y)) +
scale_colour_distiller(palette = 'RdYlBu') +
theme_bw() +
coord_sf(datum = st_crs(utgrid.locs))
p2b <- ggplot(utgrid.locs[sample.idx, ]) +
geom_sf(data=ut.utm,fill=NA)+
geom_sf(aes(colour = eps)) +
scale_colour_distiller(palette = 'RdYlBu') +
theme_bw()+
coord_sf(datum = st_crs(utgrid.locs))
grid.arrange(p1b, p2b, p3, p4, nrow = 2)
####
#### Set-up Variables
####
y.sm=y[sample.idx]
X.sm=X[sample.idx,]
D.sm=D[sample.idx,sample.idx]
phi.mn=max(D.sm)/8
phi.var=(max(D.sm)/8*.5)^2
a=(phi.mn^2)/phi.var
b=phi.mn/phi.var
data=list(y = y.sm, X = X.sm, n = n, p = p, D = D.sm,a=a,b=b)
inits <- list(beta = rep(0,p), tau = .5, phi = phi.mn)
var.names <- c("beta","phi","tau")
####
#### Specify Model for JAGS
####
m.jags <- "model{
#### Gaussian data model with mean=mu and Precision matrix = Prec.mat
y[1:n]~dmnorm(mu,Prec.mat)
mu<-X%*%beta
Prec.mat <- inverse(Cov.mat)
# defining the exponential covariance matrix
for(i in 1:n){
for(j in 1:n){
Cov.mat[i,j]<-sigma.sq*exp(-D[i,j]/phi)
}
}
# Inverse-Gamma prior on sigma-squared
sigma.sq<-1/tau
tau~dgamma(.01,.01)
# Gamma prior on phi
phi~dgamma(a,b)
# Exchangeable vague Gaussian priors on each beta[i]
for(i in 1:p){
beta[i]~dnorm(0,0.000001)
}
}
"
mod <- textConnection(m.jags)
####
#### Fit Model Using JAGS ; this may take a bit of time
####
jags.sp.m <- jags.model(mod, data = data, inits = inits, n.chains = 1, n.adapt = 400)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 1
## Unobserved stochastic nodes: 5
## Total graph size: 35238
##
## Initializing model
jags.out <- jags.samples(model=jags.sp.m,variable.names=var.names,n.iter=2000)
####
#### View MCMC Trace Plots
####
par(mfrow=c(2,3))
for(i in 1:p){
plot(jags.out$beta[i,,1],type="l",col=i,main=paste(c("beta",i-1)))
abline(h=beta[i],lwd=3)
}
plot(jags.out$phi[1,,1],type="l",col=4,main="phi")
abline(h=phi,lwd=3)
plot(1/jags.out$tau[1,,1],type="l",col=5,main="sigma^2")
abline(h=s2,lwd=3)
####
#### Posterior for phi
####
hist(jags.out$phi[1,,1],breaks=40,xlim=c(0,max(D)/2),prob=TRUE)
curve(dgamma(x,a,b),col=2,add=TRUE)
####
#### Read in Data
####
iowa.df <- read.table("iowa_temperature.txt")
names(iowa.df) <- c("long","lat","temp")
head(iowa.df)
## long lat temp
## 1 -90.74 41.24 51.6
## 2 -89.87 39.29 55.9
## 3 -89.02 39.84 54.7
## 4 -89.52 41.84 51.1
## 5 -90.05 41.17 52.2
## 6 -90.74 39.72 55.2
####
#### Plot Data in Default Lat/Long Projection
####
states.latlon <- st_as_sf(map("state", fill = T, plot = F))
iowa.sf <- st_as_sf(iowa.df, coords = c('long', 'lat'),
crs = 4326)
data.bb <- st_bbox(iowa.sf)
ggplot(iowa.sf) +
geom_sf(data = states.latlon) +
geom_sf(aes(size = temp)) +
theme_bw() +
coord_sf(xlim = c(data.bb$xmin, data.bb$xmax),
ylim = c(data.bb$ymin, data.bb$ymax))
####
#### Convert to UTM
####
iowa.data.utm <- st_transform(iowa.sf, crs = st_crs("+proj=utm +zone=15 +datum=WGS84"))
iowa.grid <- st_make_grid(iowa.data.utm, n = c(50, 45), what = 'centers')
ia.latlon <- st_as_sf(map("state", region = 'iowa', fill = T, plot = F))
ia.utm <- st_transform(ia.latlon, crs = st_crs("+proj=utm +zone=15 +datum=WGS84"))
ggplot(iowa.data.utm) +
geom_sf(data = iowa.grid, colour = 'gray') +
geom_sf(data = ia.utm, fill = 'NA') +
geom_sf(colour = 'red') +
theme_bw()
n.grid <- length(iowa.grid)
n.data <- nrow(iowa.data.utm)
all.locs <- c(st_geometry(iowa.data.utm), iowa.grid)
D.all <- as.matrix(st_distance(all.locs))
units(D.all) <- NULL
D.max <- max(D.all)
####
#### Create Design Matrix using Locs as Covariates
####
p <- 3
X <- matrix(1, n.data, p)
X[, 2:3] <- st_coordinates(all.locs)[1:n.data, ]
y <- as.vector(iowa.df$temp)
####
#### Euclidean Distance Matrix
####
D <- D.all[1:n.data, 1:n.data]
####
#### Set-up Variables
####
phi.mn <- D.max/8
phi.var <- (D.max/8*.5)^2
a <- (phi.mn^2)/phi.var
b <- phi.mn/phi.var
data <- list(y = y, X = X, n = length(y), p = p, D = D,a=a,b=b)
inits <- list(beta = rep(0, p), tau = .5, phi = phi.mn)
var.names <- c("beta","phi","tau","sigma.sq")
####
#### Specify Model for JAGS
####
m.jags <- "model{
#### Gaussian data model with mean=mu and Precision matrix = Prec.mat
y[1:n]~dmnorm(mu,Prec.mat)
mu<-X%*%beta
Prec.mat <- inverse(Cov.mat)
# defining the exponential covariance matrix
for(i in 1:n){
for(j in 1:n){
Cov.mat[i,j]<-sigma.sq*exp(-D[i,j]/phi)
}
}
# Inverse-Gamma prior on sigma-squared
sigma.sq<-1/tau
tau~dgamma(.01,.01)
# Uniform prior on phi
phi~dgamma(a,b)
# Exchangeable vague Gaussian priors on each beta[i]
for(i in 1:p){
beta[i]~dnorm(0,0.000001)
}
}
"
mod <- textConnection(m.jags)
####
#### Fit Model Using JAGS ; this may take a bit of time
####
n.mcmc <- 1000
jags.sp.m <- jags.model(mod, data = data, inits = inits, n.chains = 1, n.adapt = 100)
## Compiling model graph
## Resolving undeclared variables
## Allocating nodes
## Graph information:
## Observed stochastic nodes: 1
## Unobserved stochastic nodes: 5
## Total graph size: 51638
##
## Initializing model
jags.out <- coda.samples(model = jags.sp.m, variable.names = var.names, n.iter = n.mcmc)
s2.chain <- MCMCchains(jags.out, params = "sigma.sq")
phi.chain <- MCMCchains(jags.out, params = "phi")
beta.chains <- MCMCpstr(jags.out, params = "beta", type = "chains")$beta
####
#### Create Bayesian Predictive Surface Map via Bayesian Kriging
####
## Set-up for Kriging
D.full <- D.all
D.pred <- D.full[-c(1:n.data), -c(1:n.data)]
D.uo <- D.full[-(1:n.data), 1:n.data]
n.pred <- n.grid
y.pred.save <- matrix(0, n.pred, n.mcmc)
X.pred <- matrix(1, n.pred, p)
X.pred[, 2:3] <- st_coordinates(all.locs)[-c(1:n.data), ]
## Obtain predictions ; this will certainly take some time
for(k in 1:n.mcmc){
## get kth iteration's estimates
s2.tmp <- s2.chain[k]
phi.tmp <- phi.chain[k]
beta.tmp <- as.matrix(beta.chains[, k])
## obtain moments for multivariate normal
Sig.inv.tmp <- solve(s2.tmp*exp(-D/phi.tmp))
tmp.mn <- X.pred%*%beta.tmp + s2.tmp*exp(-D.uo/phi.tmp)%*%Sig.inv.tmp%*%(y - X%*%beta.tmp)
tmp.var <- s2.tmp*exp(-D.pred/phi.tmp) - s2.tmp*exp(-D.uo/phi.tmp)%*%Sig.inv.tmp%*%t(s2.tmp*exp(-D.uo/phi.tmp))
## obtain the prediction for the kth iteration
y.pred.save[,k] <- as.vector(rmvn(1, tmp.mn, tmp.var))
}
y.pred.mn=apply(y.pred.save,1,mean)
y.pred.sd=apply(y.pred.save,1,sd)
## Plot the predictive surfaces
y.pred.sf <- st_make_grid(iowa.data.utm, n = c(50, 45))
y.pred.sf <- st_as_sf(y.pred.sf)
y.pred.sf$mean <- y.pred.mn
y.pred.sf$sd <- y.pred.sd
pred.p1 <- ggplot(y.pred.sf) +
geom_sf(aes(fill = mean), colour = NA) +
geom_sf(data = iowa.data.utm) +
theme_bw() +
scale_fill_distiller(palette = 'RdYlBu') +
geom_sf(data = ia.utm, fill = 'NA') +
coord_sf(datum = st_crs(y.pred.sf))
pred.p2 <- ggplot(y.pred.sf) +
geom_sf(aes(fill = sd), colour = NA) +
geom_sf(data = iowa.data.utm) +
theme_bw() +
scale_fill_distiller(palette = 'RdYlBu') +
geom_sf(data = ia.utm, fill = 'NA') +
coord_sf(datum = st_crs(y.pred.sf))
grid.arrange(pred.p1, pred.p2, nrow = 1)