Bayesian Data Analysis

Linear Regression

Extending from the idea of how we estimated the three participants’ IQ scores, simple linear regression can be conducted in Bayesian data analysis as well. Recall the linear regression model is normally described as \(\hat{y}_{ij}=b_0+b_1x_j\), in which \(b_0\) and \(b_1\) are two parameters. Also, every observed value of \(y\) given \(x_j\) can be derived as \(y_{ij}=\hat{y}_{ij}+\epsilon_{ij}\), in which \(\epsilon_{ij}\sim N(0,\sigma_\epsilon)\). That is, \(y_{ij}\sim N(\hat{y}_j,\sigma_\epsilon)\). Thus, in our JAGS model, the likelihood function for the observed data is a t distribution with the predicted value as the mean, \(\sigma_\epsilon\) as the standard deviation, and \(\tau\) as the shape parameter.

Numeric example

Ten subjects’ weights at 8 years old and 20 years old are listed as follow. With their weights at 8 years old as the predictor and the other weights as the dependent variable, a simple linear regression can be run.

# Body weight at 20 years old
y<-c(55,44,55,51,85,57,59,60,66,49)
# Body weight at 8 years old
x<-c(31,25,29,20,40,32,27,33,28,21)

We can make the scatter plot with the body weights at year 8 in the axis X and those at year 20 in the axis Y. Obviously, there is a positive correlation between the body weights at these two ages for the same participants.

library(ggplot2)
ggplot(data.frame(x=x,y=y),aes(x,y))+
  geom_point(shape=1)+
  geom_smooth(method=lm,se=F)

## `geom_smooth()` using formula 'y ~ x'

Let’s see how we can do regression analysis in Bayesian methods. As usual, we firstly prepare our data. Thereafter, we can set up our JAGS model.

datalist<-list(y=y,x=x,N=length(x))
modelString<-"
model{
    for(i in 1:N){
       mu[i]<-b0+b1*x[i]
       y[i]~dt(mu[i],1/sigma^2,tau)
       # Posterior predictive 
       pred[i]~dt(mu[i],1/sigma^2,tau)
    }
    # Priors
    b0~dnorm(0,0.0001)
    b1~dnorm(0,0.0001)
    sigma~dunif(0.001,1000)
    tau<-ntau+1
    ntau~dexp(1/29)
}"
writeLines(modelString,con="e10_model.txt")

Start modeling with the below codes.

library(rjags)

## Loading required package: coda

## Linked to JAGS 4.3.0

## Loaded modules: basemod,bugs

e10_model<-jags.model(file="e10_model.txt",data=datalist,
                      n.chains=2,n.adapt=1000)

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 14
##    Total graph size: 66
## 
## Initializing model

update(e10_model,n.iter=10000,thin=10)
post_e10<-coda.samples(e10_model,c("b0","b1","sigma","tau","pred"),
                       n.iter=10000,thin=10)

Let’s check the posterior samples of the parameter estimates.

library(ggmcmc)

## Loading required package: dplyr

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Loading required package: tidyr

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

sample_e10<-ggs(post_e10)
b0<-filter(sample_e10,Parameter=="b0")$value
b1<-filter(sample_e10,Parameter=="b1")$value
sigma<-filter(sample_e10,Parameter=="sigma")$value
tau<-filter(sample_e10,Parameter=="tau")$value
predy<-sapply(1:length(y),function(j){
        temp<-paste0("pred[",j,"]")
        filter(sample_e10,Parameter==temp)$value
})
fb0<-ggplot(data.frame(b0=b0),aes(b0))+geom_density()
fb1<-ggplot(data.frame(b1=b1),aes(b1))+geom_density()
fsigma<-ggplot(data.frame(sigma=sigma),aes(sigma))+geom_density()
ftau<-ggplot(data.frame(tau),aes(tau))+geom_density()
library(gridExtra)

## 
## Attaching package: 'gridExtra'

## The following object is masked from 'package:dplyr':
## 
##     combine

grid.arrange(fb0,fb1,fsigma,ftau,ncol=2)

The mean estimates of \(b0\), \(b1\), \(\sigma_\epsilon\), and \(\tau\) are computed as follows. Also, the estimates are listed as well.

c(mean(b0),mean(b1),mean(sigma),mean(tau))

## [1] 16.741157  1.440962  8.385104 33.471001

Db0<-density(b0)
Db1<-density(b1)
Dsigma<-density(sigma)
Dtau<-density(tau)
# MAP of b0
Db0$x[which.max(Db0$y)]

## [1] 14.78993

# MAP of b1
Db1$x[which.max(Db1$y)]

## [1] 1.425122

# MAP of sigma
Dsigma$x[which.max(Dsigma$y)]

## [1] 6.733343

# MAP of tau
Dtau$x[which.max(Dtau$y)]

## [1] 10.77895

Also, we can compare the estimates of these parameters and the coefficients in regular regression analysis. In R lm( ) is used to do regression analysis. As shown in the below results, the intercept is 14.67 and the slope is 1.52. These two coefficients are close to our MAP estimates of \(b_0\) and \(b_1\). The standard error of estimate is actually \(\sigma_\epsilon\).

summary(lm(y~x))

## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6336 -5.8923 -0.6336  5.3014  9.5897 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  14.6724    11.8810   1.235  0.25189   
## x             1.5184     0.4077   3.725  0.00583 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.228 on 8 degrees of freedom
## Multiple R-squared:  0.6343, Adjusted R-squared:  0.5886 
## F-statistic: 13.87 on 1 and 8 DF,  p-value: 0.005831

We can check how well this model predicts the observed data. One way is to check the \(R^2\) between the predicted data and observed data. First, we compute the mean of predicted values for each observed data point. Second, the sum of squared errors is computed as \(\sum(y-\hat{y})^2\). Third, the total sum of squares is computed as \(\sum(y-\bar{y})^2\). \(R^2=\frac{SST-SSE}{SST}\).

prediction<-colMeans(predy)
SSE<-sum((y-prediction)^2)
SST<-sum((y-mean(y))^2)
R2<-(SST-SSE)/SST
R2

## [1] 0.6288658

dd<-data.frame(y=c(y,prediction),x=c(x,x),group=rep(c("Obs","Pred"),each=length(y)))
ggplot(dd,aes(x,y))+
  geom_point(aes(color=group))+
  geom_smooth(data=subset(dd,group=="Obs"),aes(x,y),method=lm,se=F,color="black",size=0.5)

## `geom_smooth()` using formula 'y ~ x'

Also, according to the assumption of linear regression analysis that the prediction residual and predictor are independent. We can also check for this assumption with our posterior predictive data. The result supports this assumption.

cor.test(y-prediction,x)

## 
##  Pearson's product-moment correlation
## 
## data:  y - prediction and x
## t = 0.19492, df = 8, p-value = 0.8503
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5862523  0.6694010
## sample estimates:
##        cor 
## 0.06875164

Another numeric example

Atkinson (1982) published a paper in Journal of the Royal Statistical Society, in which a data of 31 black cherry tress were reported. In this data set, the volume, height and diameter of these tress are included. We can estimate the volume of a tree given its height and diameter. Let’s do the regular regression analysis. The results show that all coefficients including the intercept are significantly different from 0.

dta<-read.table("http://www.statsci.org/data/general/cherry.txt",header=T)
summary(lm(Volume~Diam+Height,dta))

## 
## Call:
## lm(formula = Volume ~ Diam + Height, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -6.4065 -2.6493 -0.2876  2.2003  8.4847 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -57.9877     8.6382  -6.713 2.75e-07 ***
## Diam          4.7082     0.2643  17.816  < 2e-16 ***
## Height        0.3393     0.1302   2.607   0.0145 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.882 on 28 degrees of freedom
## Multiple R-squared:  0.948,  Adjusted R-squared:  0.9442 
## F-statistic:   255 on 2 and 28 DF,  p-value: < 2.2e-16

In Bayesian data analysis, we fit the regression model to the observed data in order to get the best-fit parameter values. Let’s prepare the data first.

x1<-as.numeric(dta$Diam)
x2<-as.numeric(dta$Height)
y<-as.numeric(dta$Volume)
datalist<-list(y=y,x1=x1,x2=x2,n=length(y))

We need to fix the regression model in the previous model to conduct the multiple regression analysis. Note this model is basically the same as the simple linear model, except we have two predictors instead of one now.

modelString<-"
model{
   for(i in 1:n){
      muy[i]<-b0+b1*x1[i]+b2*x2[i]
      y[i]~dt(muy[i],1/sigma^2,tau)
      predy[i]~dt(muy[i],1/sigma^2,tau)
   }
   # Priors
   b0~dnorm(0,0.0001)
   b1~dnorm(0,0.0001)
   b2~dnorm(0,0.0001)
   sigma~dunif(0.001,1000)
   tau<-ntau+1
   ntau~dexp(1/29)
}"
writeLines(modelString,con="e10_model1.txt")

Start modeling. This time we have five parameters to estimate, namely \(b_0\), \(b_1\), \(b_2\), \(\sigma_\epsilon\), and \(\tau\).

e10_model1<-jags.model(file="e10_model1.txt",data=datalist,
                       n.chains=2,n.adapt=100)

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 31
##    Unobserved stochastic nodes: 36
##    Total graph size: 218
## 
## Initializing model

update(e10_model1,n.iter=1000)
post1_e10<-coda.samples(e10_model1,c("b0","b1","b2","sigma","tau","predy"),
                        n.iter=1000)

Let’s check the posterior samples of these parameter estimates. The mean of posterior estimates of \(b_0\), \(b_1\), and \(b_2\) are close to the analytic results of the multiple regression analysis.

sample1_e10<-ggs(post1_e10)
b0<-filter(sample1_e10,Parameter=="b0")$value
b1<-filter(sample1_e10,Parameter=="b1")$value
b2<-filter(sample1_e10,Parameter=="b2")$value
# Mean of posterior sample of parameter
c(mean(b0),mean(b1),mean(b2))

## [1] -54.1571403   4.7156712   0.2869488

The mean of posterior estimates of the standard error of estimate is computed by the following codes. The scatter plot for the combinations of observed and predicted data shows a strong positive relation between the model predictions and the observed data. The \(R^2\) is pretty high too.

sigma<-filter(sample1_e10,Parameter=="sigma")$value
mean(sigma)

## [1] 3.937295

tau<-filter(sample1_e10,Parameter=="tau")$value
mean(tau)

## [1] 32.09333

predy<-sapply(1:length(y),function(s){
  temp<-paste0("predy[",s,"]")
  filter(sample1_e10,Parameter==temp)$value
})
dd1<-data.frame(obs=y,pred=colMeans(predy))
ggplot(dd1,aes(obs,pred))+
  geom_point(shape=1)

SST<-sum((y-mean(y))^2)
SSE<-sum((y-colMeans(predy))^2)
R2<-(SST-SSE)/SST
R2

## [1] 0.9478809

Bayesian Counterpart of Pearson’s Correlation Test

In the classical Pearson’s correlation test, it is assumed that data follow a bivariate normal distribution. This distribution is shown in the below figure.

Estimate Parameters of Bivariate Normal Distribution

One problem is that the bivariate normal distribution and, more general, the multivariate normal distribution isn’t parameterized using \(\rho\), that is, we cannot estimate \(\rho\) directly!

The bivariate normal distribution is parameterized using \(\mu_1\) and \(\mu_2\) and a covariance matrix \(\sum\), which is

\[\begin{align} \begin{bmatrix} \sigma_1^2 & \rho\sigma_1\sigma_2\\ \rho\sigma_1\sigma_2 & \sigma_2^2 \end{bmatrix}. \end{align}\]

So the model or likelihood function we use to generate predictions is

\[\begin{align} [x_{i1},x_{i2}]\sim MultivariateNormal([\mu_1,\mu_2],\sum). \end{align}\]

Let’s do some practice. First, we create the data set of two variables sampled from the population of the bivariate normal distribution with \(\rho=-0.7\).

library(mvtnorm) # generate correlated data
mu<-c(10,30)
sigma<-c(20,40)
rho<--0.7
cov_mat<-rbind(c(sigma[1]^2,rho*sigma[1]*sigma[2]),
               c(rho*sigma[1]*sigma[2],sigma[2]^2))
set.seed(123)
v<-rmvnorm(30,mu,cov_mat)
colnames(v)<-c("x","y")
ggplot(data=data.frame(v),aes(x=x,y=y))+
  geom_point()

Now we prepare the data, set up the JAGS model, and do the modeling.

datalist<-list(K=v,N1=dim(v)[1],N2=dim(v)[2])
modelString<-"
model{
   for(i in 1:N1){
      K[i,1:2]~dmnorm(mu,prec)
   }
   # Construct the covariance matrix
   prec<-inverse(cov)
   cov[1,1]<-sigma[1]^2
   cov[2,2]<-sigma[2]^2
   cov[1,2]<-sigma[1]*sigma[2]*rho
   cov[2,1]<-sigma[1]*sigma[2]*rho
   # Priors
   for(j in 1:N2){
      mu[j]~dnorm(0,1/10000)
      sigma[j]~dunif(0.001,1000)
   }
   rho~dunif(-1,1)
}"
writeLines(modelString,con="e11_model.txt")

Initialize the JAGS model. Also, we transfer the posterior samples generated by JAGS to an R object.

e11_model<-jags.model(file="e11_model.txt",data=datalist,
                      n.chains=2,n.adapt=1000)

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 30
##    Unobserved stochastic nodes: 5
##    Total graph size: 51
## 
## Initializing model

update(e11_model,n.iter=1000)
post_e11<-coda.samples(e11_model,c("mu","sigma","rho"),
                       n.iter=1000)
sample_e11<-ggs(post_e11)

The best-fit correlation coefficient can be computed by the following codes, which is close to the correlation coefficient of the sample. Also, the estimated means and standard deviations are close to those of this sample.

rho<-filter(sample_e11,Parameter=="rho")$value
mean(rho)

## [1] -0.7776889

mus<-sapply(1:2,function(x){
  temp<-paste0("mu[",x,"]")
  filter(sample_e11,Parameter==temp)$value
})
colMeans(mus)

## [1]  9.063791 34.367480

colMeans(v)

##         x         y 
##  9.350477 33.903283

sigmas<-sapply(1:2,function(x){
  temp<-paste0("sigma[",x,"]")
  filter(sample_e11,Parameter==temp)$value
})
colMeans(sigmas)

## [1] 19.75708 42.93574

apply(v,2,sd)

##        x        y 
## 18.99117 41.28755

Precision vs. Standard Deviation

In JAGS, normal distribution is called Gaussian distribution which has two parameters: \(\mu\) and \(\lambda\) and \(\lambda=\frac{1}{\sigma^2}\). In the textbook of Lee and Wagenmakers (2013), it is suggested to set up the prior distribution of \(\lambda\) as \(\Gamma\) distribution. For fitting with a mulinormal distribution, we can also change the prior distribution to \(Gamma\) distribution.

modelString<-"
model{
   for(i in 1:N1){
      K[i,1:2]~dmnorm(mu,precI)
   }
   precI<-inverse(prec)
   prec[1,1]<-sigma[1]^2
   prec[2,2]<-sigma[2]^2
   prec[1,2]<-rho*sigma[1]*sigma[2]
   prec[2,1]<-rho*sigma[1]*sigma[2]
   # Priors
   for(j in 1:N2){
      mu[j]~dnorm(0,0.0001)
      lambda[j]~dgamma(0.001,0.001)
      sigma[j]<-pow(lambda[j],-0.5)
   }
   rho~dunif(-1,1)
}"
writeLines(modelString,con="e11_model2.txt")

Now we rerun this model. The mean estimates of these parameters are close to the analytic results for the sample.

e11_model2<-jags.model(file="e11_model2.txt",data=datalist,
                       n.chains=2,n.adapt=1000)

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 30
##    Unobserved stochastic nodes: 5
##    Total graph size: 53
## 
## Initializing model

update(e11_model2,n.iter=1000)
post2_e11<-coda.samples(e11_model2,c("mu","sigma","rho"),n.iter=1000)
sample2_e11<-ggs(post2_e11)
# Mean estimates of parameters
rho<-filter(sample2_e11,Parameter=="rho")$value
mean(rho)

## [1] -0.7783691

mus<-sapply(1:2,function(x){
  filter(sample2_e11,Parameter==paste0("mu[",x,"]"))$value
})
colMeans(mus)

## [1]  9.753596 32.935279

apply(v,2,mean)

##         x         y 
##  9.350477 33.903283

sigmas<-sapply(1:2,function(x){
  filter(sample2_e11,Parameter==paste0("sigma[",x,"]"))$value
})
colMeans(sigmas)

## [1] 19.37766 42.10914

apply(v,2,sd)

##        x        y 
## 18.99117 41.28755

Pearson Correlation with Uncertainty

In the previous example, the measurement errors are not taken into consideration. However, in the real testing conditions, the measurement error or the precision of the measuring tools should be taken into account.

That is, the observed data point \(k=(k_x,k_y)\) was drawn from the normal distributions over two dimensions centered on the true score \(t=(t_x,t_y)\) with \(\lambda=(\lambda_x,\lambda_y)\) as the indices of measurement precision.

We can fix our model to take this unvertainty into consideration.

modelString<-"
model{
    for(i in 1:N1){
       t[i,1:2]~dmnorm(mu,precI)
       for(j in 1:N2){
          K[i,j]~dnorm(t[i,j],lambdaerror[j])
       }
    }
    # Priors
    for(j in 1:N2){
       mu[j]~dnorm(0,0.001)
       lambdaerror[j]~dgamma(0.001,0.001)
       lambda[j]~dgamma(0.001,0.001)
    }
    precI<-inverse(prec)
    prec[1,1]<-1/lambda[1]
    prec[2,2]<-1/lambda[2]
    prec[1,2]<-rho/(pow(lambda[1],0.5)*pow(lambda[2],0.5))
    prec[2,1]<-rho/(pow(lambda[1],0.5)*pow(lambda[2],0.5))
    rho~dunif(-1,1)
}"
writeLines(modelString,con="e11_model3.txt")

Let’s re-do the modeling. As usual, we can check the mean estimates of all parameters.

e11_model3<-jags.model(file="e11_model3.txt",data=datalist,
                       n.chains=2,n.adapt=1000)

## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 60
##    Unobserved stochastic nodes: 37
##    Total graph size: 173
## 
## Initializing model

update(e11_model3,n.iter=1000)
post3_e11<-coda.samples(e11_model3,c("mu","lambda","lambdaerror","rho"),
                        n.iter=1000)
sample3_e11<-ggs(post3_e11)
mus<-sapply(1:2,function(x){
  filter(sample3_e11,Parameter==paste0("mu[",x,"]"))$value
})
colMeans(mus)

## [1]  9.881058 32.666797

apply(v,2,mean)

##         x         y 
##  9.350477 33.903283

lambdas<-sapply(1:2,function(x){
  filter(sample3_e11,Parameter==paste0("lambda[",x,"]"))$value
})
sigmas<-1/lambdas^0.5
colMeans(sigmas)

## [1] 18.68698 40.36361

apply(v,2,sd)

##        x        y 
## 18.99117 41.28755

lambdaerrors<-sapply(1:2,function(x){
  filter(sample3_e11,Parameter==paste0("lambdaerror[",x,"]"))$value
})
dim.sigmas<-1/lambdaerrors^0.5
colMeans(dim.sigmas)

## [1] 2.594826 5.883846

rhos<-filter(sample3_e11,Parameter=="rho")$value
mean(rhos)

## [1] -0.8098328

Scatter Plot with Error Bars

Since the estimated measurement error represents the measurement uncertainty, it can be added to the data points as the error bars on the two dimensions.

xy.dta<-data.frame(x=v[,1],y=v[,2],errorx=rep(colMeans(dim.sigmas)[1],nrow(v)),
           errory=rep(colMeans(dim.sigmas)[2],nrow(v)))
ggplot(xy.dta,aes(x=x,y=y))+
  geom_point(shape=1)+
  geom_errorbar(aes(xmin=x-errorx,xmax=x+errorx))+
  geom_errorbar(aes(ymin=y-errory,ymax=y+errory))