First install the stat536 library and other packages.
library(devtools)
install_github('andyhoegh/stat536')
library(stat536)
library(ggplot2)
library(dplyr)
library(dlm)
library(datasets)
The local level model is written as
\[\begin{eqnarray*}
y_t &=& \theta_t + v_t\\
\theta_t &=& \theta_{t-1} + w_t,
\end{eqnarray*}\]
where \(v_t \sim N(0, V)\) and \(w_t \sim N(0, W)\).
This model is appropriate when there is no clear trend or seasonality.
set.seed(11152018)
num.pts <- 100
V <- 10
W <- 3
sim.ll <- sim_locallevel(num.pts = num.pts, V = V, W = W, mu.0 = 0)
ll <- data.frame( val = c(sim.ll$y, sim.ll$mu), t = rep(1:num.pts, 2), label = rep(c('y','mu'), each = num.pts))
ggplot(data = ll, aes(y = val, x = t ) ) + geom_line(aes(color=(label)))
To fit this model there are three iterative steps that are repeated for each time point:

1. Prior: \[\theta_{t-1}|y_{t-1}, \dots, y_{0} \sim N(\mu_{t-1}, C_{t-1})\]
2. Prediction: \[\theta_t|y_{t-1}, \dots, y_{0} \sim N(a_t, R_t),\] where in this case \(a_t = \mu_{t-1}\) and \(R_t = C_{t-1} + W\), and \[y_t|y_{t-1}, \dots, y_{0} \sim N(f_t, Q_t),\] where in this case \(f_t = a_t\) and \(Q_t = R_t + V\).
3. Estimation (filtering): \[\theta_t|y_t, \dots, y_0 \sim N(\mu_t, C_t),\] where \(\mu_t = a_t + \frac{R_t}{R_t + V} (y_t - f_t)\) and \(C_t = R_t - \frac{R_t}{R_t + V} R_t\).
The code below works through these equations, assuming V and W are known, to obtain the predictive distribution.
## Code extracted from filter_ll()
y <- sim.ll$y
mu0 <- 0
C0 <- 100
a <- R <- f <- Q <- mu <- C <- rep(0, num.pts)
# evolution dist parameters (theta_t|y_{t-1})
a[1] <- mu0
R[1] <- C0 + W
# predictive dist parameters (y_t|y_{t-1})
f[1] <- a[1]
Q[1] <- R[1] + V
# filtering dist parameters (theta_{t}|y_t)
mu[1] <- a[1] + (R[1] / (R[1] + V)) * (y[1] - f[1])
C[1] <- R[1] - (R[1]/ (R[1] + V)) * R[1]
for (time.pts in 2:num.pts){
  # evolution dist parameters (theta_t|y_{t-1})
  a[time.pts] <- mu[time.pts - 1]
  R[time.pts] <- C[time.pts - 1] + W
  # predictive dist parameters (y_t|y_{t-1})
  f[time.pts] <- a[time.pts]
  Q[time.pts] <- R[time.pts] + V
  # filtering dist parameters (theta_t|y_t)
  mu[time.pts] <- a[time.pts] + (R[time.pts] / (R[time.pts] + V)) * (y[time.pts] - f[time.pts])
  C[time.pts] <- R[time.pts] - (R[time.pts] / (R[time.pts] + V)) * R[time.pts]
}
obs <- ll %>% filter(label == 'y') %>% ggplot(aes(y = val, x = t )) + geom_point() + geom_line(alpha=.2) + ggtitle('Observed Values') + ylab('')
preds <- data.frame(upper = f + 1.96 * sqrt(Q), lower = f - 1.96 * sqrt(Q), mean = f, t = 1:num.pts)
obs + geom_line(data=preds, aes(y = upper, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = lower, x=t), col='red', linetype=3 ) + geom_line(data=preds, aes(y = mean, x=t), col='red', linetype=2 )
Q: Why is the initial variance so large?
Q: Why is the predictive distribution plotted? What about the other two distributions; how might we visualize them?
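One way to answer the second question is to overlay the filtering distribution in the same way as the predictive distribution; this is a sketch that reuses the mu and C vectors computed above (the 95% bands and the colors are arbitrary choices).
# filtering distribution: mean mu with approximate 95% bands from C
filt <- data.frame(upper = mu + 1.96 * sqrt(C), lower = mu - 1.96 * sqrt(C), mean = mu, t = 1:num.pts)
obs + geom_line(data = filt, aes(y = upper, x = t), col = 'blue', linetype = 3) + geom_line(data = filt, aes(y = lower, x = t), col = 'blue', linetype = 3) + geom_line(data = filt, aes(y = mean, x = t), col = 'blue', linetype = 2)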
Now we also need to estimate the variance parameters (V and W). This is accomplished using the forward-filtering backward-sampling (FFBS) algorithm, available through the dlm package.
mcmc.output <- dlmGibbsDIG(y = y, mod = dlmModPoly(order = 1), a.y = 10, b.y = 10000, a.theta = 10, b.theta = 10000, n.sample = 5000)
We need to look at the posterior estimates for the parameters in the model.
- First V:
plot(mcmc.output$dV, type = 'l', main = 'Trace for V')
abline(h = V, col = 'red', lwd = 2)
- Then W:
plot(mcmc.output$dW, type = 'l', main = 'Trace for W')
abline(h = W, col = 'red', lwd = 2)
In this case we also have posterior samples of the states \(\theta_t\), conditional on all of the observed data.
dim(mcmc.output$theta)
## [1] 101 1 5000
sample.paths <- sample(5000,15)
plot(mcmc.output$theta[,1,sample.paths[1]], type='l', ylab =expression(theta), xlab = 'time', main = 'Sample paths of theta')
for (i in 2:15){
  lines(1:101, mcmc.output$theta[,1,sample.paths[i]])
}
lines(2:101, sim.ll$mu, col = 'red', lwd=2, lty = 2)
We can also get predictions from this model using the posterior parameters, but how??
The mean of the predictive distribution \(y_{t+1}|y_{t}, \dots, y_1\) is just \(\mu_t\) and the variance is \(C_t + W + V\).
Thus we can create samples from the predictive distribution by using the samples of \(\theta\) and propagating the error. However, note that we are using all of the data to estimate V and W.
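To see where the mean and variance come from, substitute the evolution equation into the observation equation for the local level model:
\[\begin{eqnarray*}
y_{t+1} &=& \theta_{t+1} + v_{t+1} = \theta_t + w_{t+1} + v_{t+1},\\
E(y_{t+1}|y_1, \dots, y_t) &=& \mu_t, \qquad Var(y_{t+1}|y_1, \dots, y_t) = C_t + W + V.
\end{eqnarray*}\]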
# simulate observation and evolution errors; each column pairs with one MCMC draw of V and W
V.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(mcmc.output$dV, 101))), nrow = 101, ncol = 5000, byrow = T)
W.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(mcmc.output$dW, 101))), nrow = 101, ncol = 5000, byrow = T)
y.pred <- mcmc.output$theta[,1,] + V.matrix + W.matrix
pred.mean <- rowMeans(y.pred)
pred.quant <- apply(y.pred, 1, quantile, probs = c(.025, .975))
plot(sim.ll$y, ylim = c(min(pred.quant), max(pred.quant)), pch =16, ylab = '', xlab = 'Time', main = 'Predictive Distribution')
lines(1:100, pred.quant[1,-1], col = 'red', lty = 2)
lines(1:100, pred.quant[2,-1], col = 'red', lty = 2)
lines(1:100, pred.mean[-1], col = 'red')
data(Nile)
nile.output <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 1), a.y = 10, b.y = 10000, a.theta = 10, b.theta = 10000, n.sample = 5000)
As before, we can create samples from the predictive distribution by using the posterior samples of \(\theta\) and propagating the error (again noting that all of the data were used to estimate V and W).
V.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(nile.output$dV, 101))), nrow = 101, ncol = 5000, byrow = T)
W.matrix <- matrix(rnorm(5000 * 101, mean = 0, sd = sqrt(rep(nile.output$dW, 101))), nrow = 101, ncol = 5000, byrow = T)
nile.pred <- nile.output$theta[,1,] + V.matrix + W.matrix
pred.mean <- rowMeans(nile.pred)
pred.quant <- apply(nile.pred, 1, quantile, probs = c(.025, .975))
plot(Nile, ylim = c(min(pred.quant), max(pred.quant)), pch =16, ylab = '', xlab = 'Time', main = 'Predictive Distribution')
lines(1871:1970, pred.quant[1,-1], col = 'red', lty = 2)
lines(1871:1970, pred.quant[2,-1], col = 'red', lty = 2)
lines(1871:1970, pred.mean[-1], col = 'red')
dlmModPoly has the capability to fit more complicated models too. Consider the linear growth model:
\[\begin{eqnarray*}
y_t &=& \mu_t + v_t\\
\mu_t &=& \mu_{t-1} + \beta_{t-1} + w_{t,1}\\
\beta_t &=& \beta_{t-1} + w_{t,2}.
\end{eqnarray*}\]
This can be fit using dlmModPoly(order = 2):
nile.output2 <- dlmGibbsDIG(y = Nile, mod = dlmModPoly(order = 2), a.y = 10, b.y = 10000, a.theta = 10, b.theta = 10000, n.sample = 5000)
mean.nile <- rowMeans(nile.output2$theta[,1,])
nile.quant <- apply(nile.output2$theta[,1,],1, quantile, probs=c(.025,.975))
plot(Nile,ylim = c(min(c(nile.quant,Nile)), max(c(nile.quant,Nile))), pch =16, ylab = '', xlab = 'Time', main = 'Filtering Distribution')
lines(1871:1970, mean.nile[-1], col = 'red')
lines(1871:1970, nile.quant[1,-1], col = 'red', lty = 2)
lines(1871:1970, nile.quant[2,-1], col = 'red', lty = 2)
With the dlm package, the model type needs to be specified. We have seen dlmModPoly(), but dlm also contains:

- dlmModSeas()
- dlmModReg()
- dlmModARMA()

Furthermore, these models can be combined to form a model with trends, regression components, ARMA elements, etc., as sketched below.
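As a minimal sketch (the specific order and frequency here are illustrative choices, not from the notes), dlm model objects are combined by adding them with +:
# local linear trend plus a quarterly seasonal component
combined.mod <- dlmModPoly(order = 2) + dlmModSeas(frequency = 4)
# the result is itself a dlm model object and can be used wherever a mod argument is expected,
# for example in the Kalman filter routine
nile.filter <- dlmFilter(Nile, combined.mod)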
Another nice feature of state-space models is the ability to have time-varying regression coefficients. Furthermore, there is a straightforward way to model time series data that are non-normal.
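For the first point, a regression with time-varying coefficients can be set up directly in dlm by giving the regression module a nonzero system variance; this is a sketch, where the covariate x.sim and the dW values are illustrative assumptions.
# hypothetical covariate and a regression model whose slope follows a random walk
x.sim <- rnorm(100)
tvp.mod <- dlmModReg(x.sim, addInt = TRUE, dW = c(0, 0.1))
# dW holds the evolution variances for (intercept, slope); the intercept is held fixed here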
Consider the following model; describe the parameters in this model and explain what the model does.
\[\begin{eqnarray*}
y_t &\sim& Bernoulli(\pi_t)\\
logit(\pi_t) &=& x_{t} \beta_{t}\\
\beta_{t} &=& \beta_{t-1} + w_{t},
\end{eqnarray*}\]
where \(w_{t} \sim N(0, W)\).
set.seed(11192018)
num.pts <- 500
beta <- rep(3,num.pts)
x <- runif(num.pts)
W <- .05
for (time.pts in 2:num.pts){
  beta[time.pts] <- beta[time.pts - 1] + rnorm(1, mean = 0, sd = sqrt(W))
}
pi.t <- exp( x * beta) / (1 + exp(x * beta) )
y <- rbinom(num.pts, 1 ,pi.t)
plot(pi.t, type = 'l', lty = 2, ylim=c(0,1), xlab='time')
points(1:num.pts,y, pch = 16, cex = .8)
In this situation, the prediction and filtering equations do not have closed-form solutions. Technically, these distributions could be computed with a series of (generally intractable) integrals.
Another solution is to use a class of algorithms known as sequential Monte Carlo, in particular the particle filter: a set of particles is pushed through the evolution equation, each particle is weighted by how well it explains the new observation, and the particles are then resampled according to those weights.
The algorithm is initialized with a distribution for the parameters at time 0. (For simplicity we will assume that \(W\) is known, but this is not necessary.)
# Storage
num.particles <- 10000
beta.samples <- matrix(0,nrow=num.particles, ncol = num.pts + 1)
# initialize particles at time 0
beta.samples[,1] <- rnorm(num.particles, mean = 0, sd=4)
hist(beta.samples[,1], main = 'Prior at time 0', xlab = expression(beta), breaks = 'FD')
Next we want to consider where the state distribution is at the next time point, by pushing each particle through the evolution equation.
# Storage
beta.evolution <- matrix(0, num.particles, num.pts)
# evolve the particles from time 0 to time 1
beta.evolution[,1] <- beta.samples[,1] + rnorm(num.particles, mean = 0, sd=sqrt(W))
hist(beta.evolution[,1], main = 'Evolution Dist at time 1', xlab = expression(beta), breaks = 'FD')
Using the evolved particles, we can also look at the implied distribution for \(\pi_1\) and the predictive distribution for \(y_1\).
xb <- beta.evolution[,1] * x[1]
probs <- exp(xb) / (1 + exp(xb))
y1 <- rbinom(num.particles, 1, probs)
par(mfcol=c(1,2))
hist(probs, main = 'distribution for pi', xlab = expression(pi[1]), breaks = 'FD')
hist(y1, breaks = c(-.5,.5,1.5),main = 'distribution for y', xlab = expression(y[1]), probability = TRUE)
The particles are then weighted by how well they explain the observed \(y_1\); with the evolution distribution as the proposal, the importance weight is just the likelihood. Resampling with these weights gives the filtered particles at time 1.
# importance weights: likelihood of the observed y[1] for each particle
w <- dbinom(y[1], 1, prob = probs)
w.norm <- w / sum(w)
# resample particles proportional to the normalized weights
sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)
beta.samples[,2] <- beta.evolution[sample.pos,1]
hist(beta.samples[,2], main = 'Posterior after time 1', xlab = expression(beta), breaks = 'FD')
abline(v=beta[1], col = 'red')
for (time.pts in 2:num.pts){
  # evolution
  beta.evolution[,time.pts] <- beta.samples[,time.pts] + rnorm(num.particles, mean = 0, sd = sqrt(W))
  # weights (likelihood of the observed y) and resample
  xb <- beta.evolution[,time.pts] * x[time.pts]
  probs <- exp(xb) / (1 + exp(xb))
  w <- dbinom(y[time.pts], 1, prob = probs)
  w.norm <- w / sum(w)
  sample.pos <- base::sample(num.particles, prob = w.norm, replace = T)
  beta.samples[,time.pts + 1] <- beta.evolution[sample.pos,time.pts]
}
beta.q <- apply(beta.samples, 2, quantile, prob=c(.025,.975))
par(mfcol = c(1,1))
plot(1:num.pts, beta, ylim=c(min(beta.samples), max(beta.samples)), pch = 16, xlab = 'time')
lines(1:num.pts, beta.q[1, -1], lty = 2)
lines(1:num.pts, beta.q[2, -1], lty = 2)
x.matrix <- matrix(x, nrow = num.particles, ncol = num.pts, byrow = T)
prob.samples <- exp(beta.samples[,-1] * x.matrix) / (1 + exp(beta.samples[,-1] * x.matrix))
prob.q <- apply(prob.samples, 2, quantile, prob = c(.025,.975))
plot(1:num.pts, pi.t, ylim=c(0, 1), pch = 16, xlab = 'time', type = 'b')
lines(1:num.pts, colMeans(prob.samples), lty = 2, col = 'red', lwd = 2)