Linear Regression

Extending from the idea of how we estimated the three participants’ IQ scores, simple linear regression can be conducted in Bayesian data analysis as well. Recall the linear regression model is normally described as \(\hat{y}_{ij}=b_0+b_1x_j\), in which \(b_0\) and \(b_1\) are two parameters. Also, every observed value of \(y\) given \(x_j\) can be derived as \(y_{ij}=\hat{y}_{ij}+\epsilon_{ij}\), in which \(\epsilon_{ij}\sim N(0,\sigma_\epsilon)\). That is, \(y_{ij}\sim N(\hat{y}_j,\sigma_\epsilon)\). Thus, in our JAGS model, the likelihood function for the observed data is a t distribution with the predicted value as the mean, \(\sigma_\epsilon\) as the standard deviation, and \(\tau\) as the shape parameter.

Numeric example

Ten subjects’ weights at 8 years old and 20 years old are listed as follow. With their weights at 8 years old as the predictor and the other weights as the dependent variable, a simple linear regression can be run.

# Body weight at 20 years old
y<-c(55,44,55,51,85,57,59,60,66,49)
# Body weight at 8 years old
x<-c(31,25,29,20,40,32,27,33,28,21)

We can make the scatter plot with the body weights at year 8 in the axis X and those at year 20 in the axis Y. Obviously, there is a positive correlation between the body weights at these two ages for the same participants.

library(ggplot2)
ggplot(data.frame(x=x,y=y),aes(x,y))+
  geom_point(shape=1)+
  geom_smooth(method=lm,se=F)
## `geom_smooth()` using formula = 'y ~ x'

Let’s see how we can do regression analysis in Bayesian methods. As usual, we firstly prepare our data. Thereafter, we can set up our JAGS model.

datalist<-list(y=y,x=x,N=length(x))
modelString<-"
model{
    for(i in 1:N){
       mu[i]<-b0+b1*x[i]
       y[i]~dt(mu[i],1/sigma^2,tau)
       # Posterior predictive 
       pred[i]~dt(mu[i],1/sigma^2,tau)
    }
    # Priors
    b0~dnorm(0,0.0001)
    b1~dnorm(0,0.0001)
    sigma~dunif(0.001,1000)
    tau<-ntau+1
    ntau~dexp(1/29)
}"
writeLines(modelString,con="e10_model.txt")

Start modeling with the below codes.

library(rjags)
## Loading required package: coda
## Linked to JAGS 4.3.1
## Loaded modules: basemod,bugs
e10_model<-jags.model(file="e10_model.txt",data=datalist,
                      n.chains=2,n.adapt=1000)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 10
##    Unobserved stochastic nodes: 14
##    Total graph size: 66
## 
## Initializing model
update(e10_model,n.iter=10000,thin=10)
post_e10<-coda.samples(e10_model,c("b0","b1","sigma","tau","pred"),
                       n.iter=10000,thin=10)

Let’s check the posterior samples of the parameter estimates.

library(ggmcmc)
## Loading required package: dplyr
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: tidyr
sample_e10<-ggs(post_e10)
b0<-filter(sample_e10,Parameter=="b0")$value
b1<-filter(sample_e10,Parameter=="b1")$value
sigma<-filter(sample_e10,Parameter=="sigma")$value
tau<-filter(sample_e10,Parameter=="tau")$value
predy<-sapply(1:length(y),function(j){
        temp<-paste0("pred[",j,"]")
        filter(sample_e10,Parameter==temp)$value
})
fb0<-ggplot(data.frame(b0=b0),aes(b0))+geom_density()
fb1<-ggplot(data.frame(b1=b1),aes(b1))+geom_density()
fsigma<-ggplot(data.frame(sigma=sigma),aes(sigma))+geom_density()
ftau<-ggplot(data.frame(tau),aes(tau))+geom_density()
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
grid.arrange(fb0,fb1,fsigma,ftau,ncol=2)

The mean estimates of \(b0\), \(b1\), \(\sigma_\epsilon\), and \(\tau\) are computed as follows. Also, the estimates are listed as well.

c(mean(b0),mean(b1),mean(sigma),mean(tau))
## [1] 15.937060  1.467469  8.321066 34.110986
Db0<-density(b0)
Db1<-density(b1)
Dsigma<-density(sigma)
Dtau<-density(tau)
# MAP of b0
Db0$x[which.max(Db0$y)]
## [1] 15.81824
# MAP of b1
Db1$x[which.max(Db1$y)]
## [1] 1.501708
# MAP of sigma
Dsigma$x[which.max(Dsigma$y)]
## [1] 7.193413
# MAP of tau
Dtau$x[which.max(Dtau$y)]
## [1] 11.49413

Also, we can compare the estimates of these parameters and the coefficients in regular regression analysis. In R lm( ) is used to do regression analysis. As shown in the below results, the intercept is 14.67 and the slope is 1.52. These two coefficients are close to our MAP estimates of \(b_0\) and \(b_1\). The standard error of estimate is actually \(\sigma_\epsilon\).

summary(lm(y~x))
## 
## Call:
## lm(formula = y ~ x)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -8.6336 -5.8923 -0.6336  5.3014  9.5897 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  14.6724    11.8810   1.235  0.25189   
## x             1.5184     0.4077   3.725  0.00583 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 7.228 on 8 degrees of freedom
## Multiple R-squared:  0.6343, Adjusted R-squared:  0.5886 
## F-statistic: 13.87 on 1 and 8 DF,  p-value: 0.005831

We can check how well this model predicts the observed data. One way is to check the \(R^2\) between the predicted data and observed data. First, we compute the mean of predicted values for each observed data point. Second, the sum of squared errors is computed as \(\sum(y-\hat{y})^2\). Third, the total sum of squares is computed as \(\sum(y-\bar{y})^2\). \(R^2=\frac{SST-SSE}{SST}\).

prediction<-colMeans(predy)
SSE<-sum((y-prediction)^2)
SST<-sum((y-mean(y))^2)
R2<-(SST-SSE)/SST
R2
## [1] 0.6280167
dd<-data.frame(y=c(y,prediction),x=c(x,x),group=rep(c("Obs","Pred"),each=length(y)))
ggplot(dd,aes(x,y))+
  geom_point(aes(color=group))+
  geom_smooth(data=subset(dd,group=="Obs"),aes(x,y),method=lm,se=F,color="black",size=0.5)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `geom_smooth()` using formula = 'y ~ x'

Also, according to the assumption of linear regression analysis that the prediction residual and predictor are independent. We can also check for this assumption with our posterior predictive data. The result supports this assumption.

cor.test(y-prediction,x)
## 
##  Pearson's product-moment correlation
## 
## data:  y - prediction and x
## t = 0.14737, df = 8, p-value = 0.8865
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.5971567  0.6600359
## sample estimates:
##        cor 
## 0.05203339

Another numeric example

This file collects participants’ biking behavior, smoking behavior, and their propensity to have a hear disease. Let’s conduct a multiple regression analysis for this data set.

dta<-read.csv("http://cml.nccu.edu.tw/lxyang/Bayes/heart.data.csv",header=T)
summary(lm(heart.disease~biking+smoking,dta))
## 
## Call:
## lm(formula = heart.disease ~ biking + smoking, data = dta)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1789 -0.4463  0.0362  0.4422  1.9331 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 14.984658   0.080137  186.99   <2e-16 ***
## biking      -0.200133   0.001366 -146.53   <2e-16 ***
## smoking      0.178334   0.003539   50.39   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.654 on 495 degrees of freedom
## Multiple R-squared:  0.9796, Adjusted R-squared:  0.9795 
## F-statistic: 1.19e+04 on 2 and 495 DF,  p-value: < 2.2e-16

In Bayesian data analysis, we fit the regression model to the observed data in order to get the best-fit parameter values. Let’s prepare the data first.

x1<-as.numeric(dta$biking)
x2<-as.numeric(dta$smoking)
y<-as.numeric(dta$heart.disease)
datalist<-list(y=y,x1=x1,x2=x2,n=length(y))

We need to fix the regression model in the previous model to conduct the multiple regression analysis. Note this model is basically the same as the simple linear model, except we have two predictors instead of one now.

modelString<-"
model{
   for(i in 1:n){
      muy[i]<-b0+b1*x1[i]+b2*x2[i]
      y[i]~dt(muy[i],1/sigma^2,tau)
      predy[i]~dt(muy[i],1/sigma^2,tau)
   }
   # Priors
   b0~dnorm(0,0.0001)
   b1~dnorm(0,0.0001)
   b2~dnorm(0,0.0001)
   sigma~dunif(0.001,1000)
   tau<-ntau+1
   ntau~dexp(1/29)
}"
writeLines(modelString,con="e10_model1.txt")

Start modeling. This time we have five parameters to estimate, namely \(b_0\), \(b_1\), \(b_2\), \(\sigma_\epsilon\), and \(\tau\).

e10_model1<-jags.model(file="e10_model1.txt",data=datalist,
                       n.chains=2,n.adapt=100)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 498
##    Unobserved stochastic nodes: 503
##    Total graph size: 3503
## 
## Initializing model
update(e10_model1,n.iter=1000)
post1_e10<-coda.samples(e10_model1,c("b0","b1","b2","sigma","tau","predy"),
                        n.iter=1000)

Let’s check the posterior samples of these parameter estimates. The mean of posterior estimates of \(b_0\), \(b_1\), and \(b_2\) are close to the analytic results of the multiple regression analysis.

sample1_e10<-ggs(post1_e10)
b0<-filter(sample1_e10,Parameter=="b0")$value
b1<-filter(sample1_e10,Parameter=="b1")$value
b2<-filter(sample1_e10,Parameter=="b2")$value
# Mean of posterior sample of parameter
c(mean(b0),mean(b1),mean(b2))
## [1] 14.9848651 -0.2000857  0.1783741

The mean of posterior estimates of the standard error of estimate is computed by the following codes. The scatter plot for the combinations of observed and predicted data shows a strong positive relation between the model predictions and the observed data. The \(R^2\) is pretty high too.

sigma<-filter(sample1_e10,Parameter=="sigma")$value
mean(sigma)
## [1] 0.6303119
tau<-filter(sample1_e10,Parameter=="tau")$value
mean(tau)
## [1] 37.25534
predy<-sapply(1:length(y),function(s){
  temp<-paste0("predy[",s,"]")
  filter(sample1_e10,Parameter==temp)$value
})
dd1<-data.frame(obs=y,pred=colMeans(predy))
ggplot(dd1,aes(obs,pred))+
  geom_point(shape=1)

SST<-sum((y-mean(y))^2)
SSE<-sum((y-colMeans(predy))^2)
R2<-(SST-SSE)/SST
R2
## [1] 0.9796136

Bayesian Counterpart of Pearson’s Correlation Test

In the classical Pearson’s correlation test, it is assumed that data follow a bivariate normal distribution. This distribution is shown in the below figure.

Estimate Parameters of Bivariate Normal Distribution

One problem is that the bivariate normal distribution and, more general, the multivariate normal distribution isn’t parameterized using \(\rho\), that is, we cannot estimate \(\rho\) directly!

The bivariate normal distribution is parameterized using \(\mu_1\) and \(\mu_2\) and a covariance matrix \(\sum\), which is

\[\begin{align} \begin{bmatrix} \sigma_1^2 & \rho\sigma_1\sigma_2\\ \rho\sigma_1\sigma_2 & \sigma_2^2 \end{bmatrix}. \end{align}\]

So the model or likelihood function we use to generate predictions is

\[\begin{align} [x_{i1},x_{i2}]\sim MultivariateNormal([\mu_1,\mu_2],\sum). \end{align}\]

Let’s do some practice. First, we create the data set of two variables sampled from the population of the bivariate normal distribution with \(\rho=-0.7\).

library(mvtnorm) # generate correlated data
mu<-c(10,30)
sigma<-c(20,40)
rho<--0.7
cov_mat<-rbind(c(sigma[1]^2,rho*sigma[1]*sigma[2]),
               c(rho*sigma[1]*sigma[2],sigma[2]^2))
set.seed(123)
v<-rmvnorm(30,mu,cov_mat)
colnames(v)<-c("x","y")
ggplot(data=data.frame(v),aes(x=x,y=y))+
  geom_point()

Now we prepare the data, set up the JAGS model, and do the modeling.

datalist<-list(K=v,N1=dim(v)[1],N2=dim(v)[2])
modelString<-"
model{
   for(i in 1:N1){
      K[i,1:2]~dmnorm(mu,prec)
   }
   # Construct the covariance matrix
   prec<-inverse(cov)
   cov[1,1]<-sigma[1]^2
   cov[2,2]<-sigma[2]^2
   cov[1,2]<-sigma[1]*sigma[2]*rho
   cov[2,1]<-sigma[1]*sigma[2]*rho
   # Priors
   for(j in 1:N2){
      mu[j]~dnorm(0,1/10000)
      sigma[j]~dunif(0.001,1000)
   }
   rho~dunif(-1,1)
}"
writeLines(modelString,con="e11_model.txt")

Initialize the JAGS model. Also, we transfer the posterior samples generated by JAGS to an R object.

e11_model<-jags.model(file="e11_model.txt",data=datalist,
                      n.chains=2,n.adapt=1000)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 30
##    Unobserved stochastic nodes: 5
##    Total graph size: 51
## 
## Initializing model
update(e11_model,n.iter=1000)
post_e11<-coda.samples(e11_model,c("mu","sigma","rho"),
                       n.iter=1000)
sample_e11<-ggs(post_e11)

The best-fit correlation coefficient can be computed by the following codes, which is close to the correlation coefficient of the sample. Also, the estimated means and standard deviations are close to those of this sample.

rho<-filter(sample_e11,Parameter=="rho")$value
mean(rho)
## [1] -0.7854011
mus<-sapply(1:2,function(x){
  temp<-paste0("mu[",x,"]")
  filter(sample_e11,Parameter==temp)$value
})
colMeans(mus)
## [1]  9.356965 33.688078
colMeans(v)
##         x         y 
##  9.350477 33.903283
sigmas<-sapply(1:2,function(x){
  temp<-paste0("sigma[",x,"]")
  filter(sample_e11,Parameter==temp)$value
})
colMeans(sigmas)
## [1] 19.87791 43.10286
apply(v,2,sd)
##        x        y 
## 18.99117 41.28755

Precision vs. Standard Deviation

In JAGS, normal distribution is called Gaussian distribution which has two parameters: \(\mu\) and \(\lambda\) and \(\lambda=\frac{1}{\sigma^2}\). In the textbook of Lee and Wagenmakers (2013), it is suggested to set up the prior distribution of \(\lambda\) as \(\Gamma\) distribution. For fitting with a mulinormal distribution, we can also change the prior distribution to \(Gamma\) distribution.

modelString<-"
model{
   for(i in 1:N1){
      K[i,1:2]~dmnorm(mu,precI)
   }
   precI<-inverse(prec)
   prec[1,1]<-sigma[1]^2
   prec[2,2]<-sigma[2]^2
   prec[1,2]<-rho*sigma[1]*sigma[2]
   prec[2,1]<-rho*sigma[1]*sigma[2]
   # Priors
   for(j in 1:N2){
      mu[j]~dnorm(0,0.0001)
      lambda[j]~dgamma(0.001,0.001)
      sigma[j]<-pow(lambda[j],-0.5)
   }
   rho~dunif(-1,1)
}"
writeLines(modelString,con="e11_model2.txt")

Now we rerun this model. The mean estimates of these parameters are close to the analytic results for the sample.

e11_model2<-jags.model(file="e11_model2.txt",data=datalist,
                       n.chains=2,n.adapt=1000)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 30
##    Unobserved stochastic nodes: 5
##    Total graph size: 53
## 
## Initializing model
update(e11_model2,n.iter=1000)
post2_e11<-coda.samples(e11_model2,c("mu","sigma","rho"),n.iter=1000)
sample2_e11<-ggs(post2_e11)
# Mean estimates of parameters
rho<-filter(sample2_e11,Parameter=="rho")$value
mean(rho)
## [1] -0.7685626
mus<-sapply(1:2,function(x){
  filter(sample2_e11,Parameter==paste0("mu[",x,"]"))$value
})
colMeans(mus)
## [1]  9.332905 33.845685
apply(v,2,mean)
##         x         y 
##  9.350477 33.903283
sigmas<-sapply(1:2,function(x){
  filter(sample2_e11,Parameter==paste0("sigma[",x,"]"))$value
})
colMeans(sigmas)
## [1] 19.12581 41.46891
apply(v,2,sd)
##        x        y 
## 18.99117 41.28755

Pearson Correlation with Uncertainty

In the previous example, the measurement errors are not taken into consideration. However, in the real testing conditions, the measurement error or the precision of the measuring tools should be taken into account.

That is, the observed data point \(k=(k_x,k_y)\) was drawn from the normal distributions over two dimensions centered on the true score \(t=(t_x,t_y)\) with \(\lambda=(\lambda_x,\lambda_y)\) as the indices of measurement precision.

We can fix our model to take this unvertainty into consideration.

modelString<-"
model{
    for(i in 1:N1){
       t[i,1:2]~dmnorm(mu,precI)
       for(j in 1:N2){
          K[i,j]~dnorm(t[i,j],lambdaerror[j])
       }
    }
    # Priors
    for(j in 1:N2){
       mu[j]~dnorm(0,0.001)
       lambdaerror[j]~dgamma(0.001,0.001)
       lambda[j]~dgamma(0.001,0.001)
    }
    precI<-inverse(prec)
    prec[1,1]<-1/lambda[1]
    prec[2,2]<-1/lambda[2]
    prec[1,2]<-rho/(pow(lambda[1],0.5)*pow(lambda[2],0.5))
    prec[2,1]<-rho/(pow(lambda[1],0.5)*pow(lambda[2],0.5))
    rho~dunif(-1,1)
}"
writeLines(modelString,con="e11_model3.txt")

Let’s re-do the modeling. As usual, we can check the mean estimates of all parameters.

e11_model3<-jags.model(file="e11_model3.txt",data=datalist,
                       n.chains=2,n.adapt=1000)
## Compiling model graph
##    Resolving undeclared variables
##    Allocating nodes
## Graph information:
##    Observed stochastic nodes: 60
##    Unobserved stochastic nodes: 37
##    Total graph size: 173
## 
## Initializing model
update(e11_model3,n.iter=1000)
post3_e11<-coda.samples(e11_model3,c("mu","lambda","lambdaerror","rho"),
                        n.iter=1000)
sample3_e11<-ggs(post3_e11)
mus<-sapply(1:2,function(x){
  filter(sample3_e11,Parameter==paste0("mu[",x,"]"))$value
})
colMeans(mus)
## [1]  9.560108 32.997886
apply(v,2,mean)
##         x         y 
##  9.350477 33.903283
lambdas<-sapply(1:2,function(x){
  filter(sample3_e11,Parameter==paste0("lambda[",x,"]"))$value
})
sigmas<-1/lambdas^0.5
colMeans(sigmas)
## [1] 19.01023 41.54767
apply(v,2,sd)
##        x        y 
## 18.99117 41.28755
lambdaerrors<-sapply(1:2,function(x){
  filter(sample3_e11,Parameter==paste0("lambdaerror[",x,"]"))$value
})
dim.sigmas<-1/lambdaerrors^0.5
colMeans(dim.sigmas)
## [1] 1.104503 1.376748
rhos<-filter(sample3_e11,Parameter=="rho")$value
mean(rhos)
## [1] -0.781488

Scatter Plot with Error Bars

Since the estimated measurement error represents the measurement uncertainty, it can be added to the data points as the error bars on the two dimensions.

xy.dta<-data.frame(x=v[,1],y=v[,2],errorx=rep(colMeans(dim.sigmas)[1],nrow(v)),
           errory=rep(colMeans(dim.sigmas)[2],nrow(v)))
ggplot(xy.dta,aes(x=x,y=y))+
  geom_point(shape=1)+
  geom_errorbar(aes(xmin=x-errorx,xmax=x+errorx))+
  geom_errorbar(aes(ymin=y-errory,ymax=y+errory))