library(gamboostLSS)
library(mboost)
library(mgcv)
library(gamlss)
library(moments)
7: Statistical Boosting for GAMLSS
Illustrating the behaviour of statistical boosting
Figure 7.6
Figure 7.6 illustrates the fitting of non-linear effects via statistical boosting by highlighting how the number of iterations influences the roughness or smoothness of the fit.
The code first simulates an artificial data example; afterwards, a univariate GAM is fitted and evaluated at different stopping iterations.
###########################################################
## Simulate data
###########################################################
## uniformly distributed covariate
x <- runif(150, -0.2, 0.2)

## outcome following a pre-defined curve + noise
y <- -(.4 - 0.95 * exp(-50 * x^2)) * x + 0.02 * rnorm(150)

y <- y[order(x)] ## order obs by size of x
x <- x[order(x)] ## just for easier plotting
plot(x, y, pch = 19)
###########################################################
## Plotting
###########################################################
par(
  mfrow = c(4, 3),
  mar = c(2, 3, 2.5, 0.5),
  cex.main = 1.3
)
plot(
  x,
  y,
  las = 1,
  main = "m = 0",
  pch = 19,
  xlab = "",
  ylab = "",
  col = rgb(0.7, 0.7, 0.7, 0.75) ## observations
)
## fit the boosting model, starting with zero iterations
gam1 <- gamboost(y ~ x, control = boost_control(mstop = 0))
lines(x, fitted(gam1), lwd = 3, col = 2)
## add true function
curve(
  -(.4 - 0.95 * exp(-50 * x ^ 2)) * x,
  add = TRUE,
  from = -.2,
  to = .2,
  lty = 4,
  lwd = 3,
  col = 4
)
## for a sequence of iterations
it_values <- c(1, 3, 5, 15, 50, 70, 150, 500,
               1000, 5000, 50000)

for (j in it_values) {
  plot(
    x,
    y,
    las = 1,
    main = paste("m =", j),
    pch = 19,
    xlab = "",
    ylab = "",
    col = rgb(0.7, 0.7, 0.7, 0.75) ## observations
  )
  lines(x, fitted(gam1[j]), lwd = 3, col = 2)
  curve(
    -(.4 - 0.95 * exp(-50 * x ^ 2)) * x,
    add = TRUE,
    from = -.2,
    to = .2,
    lty = 4,
    lwd = 3,
    col = 4
  )
}
Determine the optimal stopping iteration mstop
set.seed(2305)
## set initial value
mstop(gam1) <- 500
## 25-fold bootstrap
cvr <- cvrisk(gam1)
## results
plot(cvr)
## resulting optimal mstop
mstop(cvr)
[1] 99
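The model can then be set to this cross-validated optimum before it is used further. A minimal sketch, re-using the objects from above (the plotting call is purely illustrative):

## set the model to the optimal number of iterations
mstop(gam1) <- mstop(cvr)
## redraw the fit at the optimal stopping iteration
plot(x, y, pch = 19, col = rgb(0.7, 0.7, 0.7, 0.75))
lines(x, fitted(gam1), lwd = 3, col = 2)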
Birthweight prediction
load(file="data/Ultra.RData")
ultra$logAC <- log(ultra$AC)
ultra$logbw <- log(ultra$birthweight)
ultra$int   <- rep(1, nrow(ultra))
Table 7.1
Table 7.1 compares the results of boosted prediction models with classical inference schemes. Four different models are estimated with both approaches in a 10-fold cross-validation. The performance is compared via the median absolute percentage error (MAPE), the mean percentage error (MPE) and RE (the standard deviation of the percentage error).
The fitting is embedded in a function ‘AMP’ that is applied in parallel (if a parallel computing environment is available) to each of the folds.
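For clarity, all three performance measures are computed from the out-of-sample percentage errors. A minimal sketch, with pred denoting the back-transformed predictions and bw the observed birthweights of a test fold (both hypothetical placeholders here):

pe   <- 100 * (bw - pred) / bw  # percentage error
MAPE <- median(abs(pe))         # median absolute percentage error
MPE  <- mean(pe)                # mean percentage error
RE   <- sd(pe)                  # sd of the percentage error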
AMP <- function(id) {
  set.seed(2305)
  cvd <- cv(type = "kfold", weights = rep(1, nrow(ultra)))

  dat.train <- ultra[cvd[, id] == 1, ]
  dat.test  <- ultra[cvd[, id] == 0, ]

  set.seed(id)
  # classical linear model
  glm1 <-
    glmboost(logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age,
             data = dat.train)
  cvr <-
    cvrisk(glm1,
           folds = cv(weights = model.weights(glm1), type = "bootstrap"),
           grid = 1:1000)
  mstop(glm1) <- mstop(cvr)
  pred_glm1 <- exp(predict(glm1, newdata = dat.test))
  pe_glm1 <- 100 * (dat.test$babyweight - pred_glm1) / dat.test$babyweight
  aep_glm1 <- abs(pe_glm1)
  length_glm1 <- length(coef(glm1))

  lm1 <-
    lm(logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age,
       data = dat.train)
  pred_lm1 <- exp(predict(lm1, newdata = dat.test))
  pe_lm1 <- 100 * (dat.test$babyweight - pred_lm1) / dat.test$babyweight
  aep_lm1 <- abs(pe_lm1)
  length_lm1 <- length(coef(lm1))

  set.seed(id)
  # linear model with pairwise interactions
  glm2 <-
    glmboost(
      logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age +
        logAC:HC + logAC:FL + logAC:BPD + logAC:Daysbeforedelivery + logAC:parity + logAC:age +
        HC:FL + HC:BPD + HC:Daysbeforedelivery + HC:parity + HC:age +
        FL:BPD + FL:Daysbeforedelivery + FL:parity + FL:age +
        BPD:Daysbeforedelivery + BPD:parity + BPD:age +
        Daysbeforedelivery:parity + Daysbeforedelivery:age + parity:age,
      data = dat.train
    )
  cvr <-
    cvrisk(glm2,
           folds = cv(weights = model.weights(glm2), type = "bootstrap"),
           grid = 1:5000)
  mstop(glm2) <- mstop(cvr)
  pred_glm2 <- exp(predict(glm2, newdata = dat.test))
  pe_glm2 <- 100 * (dat.test$babyweight - pred_glm2) / dat.test$babyweight
  aep_glm2 <- abs(pe_glm2)
  length_glm2 <- length(coef(glm2))

  lm2 <-
    lm(
      logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age +
        logAC:HC + logAC:FL + logAC:BPD + logAC:Daysbeforedelivery + logAC:parity + logAC:age +
        HC:FL + HC:BPD + HC:Daysbeforedelivery + HC:parity + HC:age +
        FL:BPD + FL:Daysbeforedelivery + FL:parity + FL:age +
        BPD:Daysbeforedelivery + BPD:parity + BPD:age +
        Daysbeforedelivery:parity + Daysbeforedelivery:age + parity:age,
      data = dat.train
    )
  pred_lm2 <- exp(predict(lm2, newdata = dat.test))
  pe_lm2 <- 100 * (dat.test$babyweight - pred_lm2) / dat.test$babyweight
  aep_lm2 <- abs(pe_lm2)
  length_lm2 <- length(coef(lm2))

  set.seed(id)
  # linear model with all interactions
  glm3 <-
    glmboost(logbw ~ logAC * HC * FL * BPD * Daysbeforedelivery * parity * age,
             data = dat.train)
  cvr <-
    cvrisk(glm3,
           folds = cv(weights = model.weights(glm3), type = "bootstrap"),
           grid = 1:5000)
  mstop(glm3) <- mstop(cvr)
  pred_glm3 <- exp(predict(glm3, newdata = dat.test))
  pe_glm3 <- 100 * (dat.test$babyweight - pred_glm3) / dat.test$babyweight
  aep_glm3 <- abs(pe_glm3)
  length_glm3 <- length(coef(glm3))

  lm3 <-
    lm(logbw ~ logAC * HC * FL * BPD * Daysbeforedelivery * parity * age,
       data = dat.train)
  pred_lm3 <- exp(predict(lm3, newdata = dat.test))
  pe_lm3 <- 100 * (dat.test$babyweight - pred_lm3) / dat.test$babyweight
  aep_lm3 <- abs(pe_lm3)
  length_lm3 <- length(coef(lm3))

  set.seed(id)
  # non-linear model
  gam1 <-
    gamboost(
      logbw ~ bols(int, intercept = FALSE) + logAC + HC + FL + BPD + Daysbeforedelivery +
        bols(parity) + age,
      data = dat.train
    )
  cvr <-
    cvrisk(gam1,
           folds = cv(weights = model.weights(gam1), type = "bootstrap"),
           grid = 1:1000)
  mstop(gam1) <- mstop(cvr)
  pred_gam1 <- exp(predict(gam1, newdata = dat.test))
  pe_gam1 <- 100 * (dat.test$babyweight - pred_gam1) / dat.test$babyweight
  aep_gam1 <- abs(pe_gam1)
  length_gam1 <- length(coef(gam1))

  mgcv1 <-
    gam(logbw ~ s(logAC) + s(HC) + s(FL) + s(BPD) + s(Daysbeforedelivery) + parity + s(age),
        data = dat.train)
  pred_mgcv <- exp(predict(mgcv1, newdata = dat.test))
  pe_mgcv <- 100 * (dat.test$babyweight - pred_mgcv) / dat.test$babyweight
  aep_mgcv <- abs(pe_mgcv)
  length_mgcv <- length(coef(mgcv1))

  return(
    list(
      aep_glm1, aep_lm1, aep_glm2, aep_lm2, aep_glm3, aep_lm3, aep_gam1, aep_mgcv,
      pe_glm1, pe_lm1, pe_glm2, pe_lm2, pe_glm3, pe_lm3, pe_gam1, pe_mgcv,
      pred_glm1, pred_lm1, pred_glm2, pred_lm2, pred_glm3, pred_lm3, pred_gam1, pred_mgcv,
      dat.test$babyweight,
      length_glm1, length_lm1, length_glm2, length_lm2,
      length_glm3, length_lm3, length_gam1, length_mgcv
    )
  )
}
This is now executed 10 times, once on each fold; the results are stored in a list.
library(parallel)
id  <- 1:10
ERG <- mclapply(id, FUN = AMP, mc.cores = 10, mc.preschedule = FALSE)
Now the results are extracted and the Table is built.
aep_glm1 <- unlist(lapply(ERG, function(x) x[[1]]))
aep_lm1  <- unlist(lapply(ERG, function(x) x[[2]]))
aep_glm2 <- unlist(lapply(ERG, function(x) x[[3]]))
aep_lm2  <- unlist(lapply(ERG, function(x) x[[4]]))
aep_glm3 <- unlist(lapply(ERG, function(x) x[[5]]))
aep_lm3  <- unlist(lapply(ERG, function(x) x[[6]]))
aep_gam1 <- unlist(lapply(ERG, function(x) x[[7]]))
aep_mgcv <- unlist(lapply(ERG, function(x) x[[8]]))

pe_glm1 <- unlist(lapply(ERG, function(x) x[[9]]))
pe_lm1  <- unlist(lapply(ERG, function(x) x[[10]]))
pe_glm2 <- unlist(lapply(ERG, function(x) x[[11]]))
pe_lm2  <- unlist(lapply(ERG, function(x) x[[12]]))
pe_glm3 <- unlist(lapply(ERG, function(x) x[[13]]))
pe_lm3  <- unlist(lapply(ERG, function(x) x[[14]]))
pe_gam1 <- unlist(lapply(ERG, function(x) x[[15]]))
pe_mgcv <- unlist(lapply(ERG, function(x) x[[16]]))

pred_glm1 <- unlist(lapply(ERG, function(x) x[[17]]))
pred_lm1  <- unlist(lapply(ERG, function(x) x[[18]]))
pred_glm2 <- unlist(lapply(ERG, function(x) x[[19]]))
pred_lm2  <- unlist(lapply(ERG, function(x) x[[20]]))
pred_glm3 <- unlist(lapply(ERG, function(x) x[[21]]))
pred_lm3  <- unlist(lapply(ERG, function(x) x[[22]]))
pred_gam1 <- unlist(lapply(ERG, function(x) x[[23]]))
pred_mgcv <- unlist(lapply(ERG, function(x) x[[24]]))

bw <- unlist(lapply(ERG, function(x) x[[25]]))

length_glm1 <- unlist(lapply(ERG, function(x) x[[26]]))
length_lm1  <- unlist(lapply(ERG, function(x) x[[27]]))
length_glm2 <- unlist(lapply(ERG, function(x) x[[28]]))
length_lm2  <- unlist(lapply(ERG, function(x) x[[29]]))
length_glm3 <- unlist(lapply(ERG, function(x) x[[30]]))
length_lm3  <- unlist(lapply(ERG, function(x) x[[31]]))
length_gam1 <- unlist(lapply(ERG, function(x) x[[32]]))
length_mgcv <- unlist(lapply(ERG, function(x) x[[33]]))
tab <- cbind(
  median(aep_lm1), median(aep_glm1),
  median(aep_lm2), median(aep_glm2),
  median(aep_lm3), median(aep_glm3),
  median(aep_mgcv), median(aep_gam1)
)
tab <- rbind(tab, c(
  mean(pe_lm1), mean(pe_glm1),
  mean(pe_lm2), mean(pe_glm2),
  mean(pe_lm3), mean(pe_glm3),
  mean(pe_mgcv), mean(pe_gam1)
))
tab <- rbind(tab, c(
  sd(pe_lm1), sd(pe_glm1),
  sd(pe_lm2), sd(pe_glm2),
  sd(pe_lm3), sd(pe_glm3),
  sd(pe_mgcv), sd(pe_gam1)
))
tab <- rbind(tab, c(
  mean(length_lm1), mean(length_glm1),
  mean(length_lm2), mean(length_glm2),
  mean(length_lm3), mean(length_glm3),
  mean(length_mgcv), mean(length_gam1) - 1
))

colnames(tab) <- c(
  "model1", "model1 boosting",
  "model2", "model2 boosting",
  "model3", "model3 boosting",
  "model4", "model4 boosting"
)
rownames(tab) <- c(
  "median absolute percentage error",
  "mean percentage error",
  "sd percentage error",
  "length"
)
round(t(tab), 2)
median absolute percentage error mean percentage error
model1 5.92 -0.45
model1 boosting 5.90 -0.45
model2 5.63 -0.46
model2 boosting 5.66 -0.44
model3 5.87 -1.34
model3 boosting 5.43 -0.49
model4 5.80 -0.44
model4 boosting 5.79 -0.46
sd percentage error length
model1 9.54 7.0
model1 boosting 9.55 7.0
model2 9.22 28.0
model2 boosting 9.16 17.0
model3 19.94 127.0
model3 boosting 9.31 19.4
model4 9.32 55.0
model4 boosting 9.43 6.0
Table 7.2 (L2 vs L1)
Table 7.2 of the book compares the performance of L2-boosting with boosting for median regression, i.e. optimizing the L1 loss. The prediction accuracy is compared via 10-fold cross-validation.
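In mboost, the two approaches differ only in the family argument: Gaussian() boosts the squared-error (L2) loss and models the conditional mean, while QuantReg(tau = 0.5) boosts the absolute-error (check) loss and models the conditional median. A minimal sketch on the full data (the two covariates are chosen purely for illustration):

mod_L2 <- glmboost(logbw ~ logAC + FL, data = ultra, family = Gaussian())
mod_L1 <- glmboost(logbw ~ logAC + FL, data = ultra, family = QuantReg(tau = 0.5))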
AMP <- function(id) {
  set.seed(2305)
  cvd <- cv(type = "kfold", weights = rep(1, nrow(ultra)))

  # generate train and test data
  dat.train <- ultra[cvd[, id] == 1, ]
  dat.test  <- ultra[cvd[, id] == 0, ]

  set.seed(id)
  # quantile regression model
  glm1 <-
    glmboost(
      logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age,
      data = dat.train,
      family = QuantReg()
    )
  cvr <-
    cvrisk(glm1,
           folds = cv(weights = model.weights(glm1), type = "bootstrap"),
           grid = 1:5000)
  mstop(glm1) <- mstop(cvr)
  pred_glm1 <- exp(predict(glm1, newdata = dat.test))
  pe_glm1 <- 100 * (dat.test$babyweight - pred_glm1) / dat.test$babyweight
  aep_glm1 <- abs(pe_glm1)
  length_glm1 <- length(coef(glm1))

  set.seed(id)
  lm1 <-
    glmboost(
      logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age,
      data = dat.train,
      family = Gaussian()
    )
  cvr <-
    cvrisk(lm1,
           folds = cv(weights = model.weights(lm1), type = "bootstrap"),
           grid = 1:1000)
  mstop(lm1) <- mstop(cvr)
  pred_lm1 <- exp(predict(lm1, newdata = dat.test))
  pe_lm1 <- 100 * (dat.test$babyweight - pred_lm1) / dat.test$babyweight
  aep_lm1 <- abs(pe_lm1)
  length_lm1 <- length(coef(lm1))

  set.seed(id)
  # linear model with pairwise interactions
  glm2 <-
    glmboost(
      logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age +
        logAC:HC + logAC:FL + logAC:BPD + logAC:Daysbeforedelivery + logAC:parity + logAC:age +
        HC:FL + HC:BPD + HC:Daysbeforedelivery + HC:parity + HC:age +
        FL:BPD + FL:Daysbeforedelivery + FL:parity + FL:age +
        BPD:Daysbeforedelivery + BPD:parity + BPD:age +
        Daysbeforedelivery:parity + Daysbeforedelivery:age + parity:age,
      data = dat.train,
      family = QuantReg()
    )
  cvr <-
    cvrisk(glm2,
           folds = cv(weights = model.weights(glm2), type = "bootstrap"),
           grid = 1:10000)
  mstop(glm2) <- mstop(cvr)
  pred_glm2 <- exp(predict(glm2, newdata = dat.test))
  pe_glm2 <- 100 * (dat.test$babyweight - pred_glm2) / dat.test$babyweight
  aep_glm2 <- abs(pe_glm2)
  length_glm2 <- length(coef(glm2))

  set.seed(id)
  # classical L2 model
  lm2 <-
    glmboost(
      logbw ~ logAC + HC + FL + BPD + Daysbeforedelivery + parity + age +
        logAC:HC + logAC:FL + logAC:BPD + logAC:Daysbeforedelivery + logAC:parity + logAC:age +
        HC:FL + HC:BPD + HC:Daysbeforedelivery + HC:parity + HC:age +
        FL:BPD + FL:Daysbeforedelivery + FL:parity + FL:age +
        BPD:Daysbeforedelivery + BPD:parity + BPD:age +
        Daysbeforedelivery:parity + Daysbeforedelivery:age + parity:age,
      data = dat.train,
      family = Gaussian()
    )
  cvr <-
    cvrisk(lm2,
           folds = cv(weights = model.weights(lm2), type = "bootstrap"),
           grid = 1:5000)
  mstop(lm2) <- mstop(cvr)
  pred_lm2 <- exp(predict(lm2, newdata = dat.test))
  pe_lm2 <- 100 * (dat.test$babyweight - pred_lm2) / dat.test$babyweight
  aep_lm2 <- abs(pe_lm2)
  length_lm2 <- length(coef(lm2))

  set.seed(id)
  # linear model with all interactions
  glm3 <-
    glmboost(
      logbw ~ logAC * HC * FL * BPD * Daysbeforedelivery * parity * age,
      family = QuantReg(),
      data = dat.train
    )
  cvr <-
    cvrisk(glm3,
           folds = cv(weights = model.weights(glm3), type = "bootstrap"),
           grid = 1:10000)
  mstop(glm3) <- mstop(cvr)
  pred_glm3 <- exp(predict(glm3, newdata = dat.test))
  pe_glm3 <- 100 * (dat.test$babyweight - pred_glm3) / dat.test$babyweight
  aep_glm3 <- abs(pe_glm3)
  length_glm3 <- length(coef(glm3))

  set.seed(id)
  lm3 <-
    glmboost(
      logbw ~ logAC * HC * FL * BPD * Daysbeforedelivery * parity * age,
      family = Gaussian(),
      data = dat.train
    )
  cvr <-
    cvrisk(lm3,
           folds = cv(weights = model.weights(lm3), type = "bootstrap"),
           grid = 1:5000)
  mstop(lm3) <- mstop(cvr)
  pred_lm3 <- exp(predict(lm3, newdata = dat.test))
  pe_lm3 <- 100 * (dat.test$babyweight - pred_lm3) / dat.test$babyweight
  aep_lm3 <- abs(pe_lm3)
  length_lm3 <- length(coef(lm3))

  set.seed(id)
  # non-linear model
  gam1 <-
    gamboost(
      logbw ~ bols(int, intercept = FALSE) + logAC + HC + FL + BPD + Daysbeforedelivery +
        bols(parity) + age,
      family = QuantReg(),
      data = dat.train
    )
  cvr <-
    cvrisk(gam1,
           folds = cv(weights = model.weights(gam1), type = "bootstrap"),
           grid = 1:1000)
  mstop(gam1) <- mstop(cvr)
  pred_gam1 <- exp(predict(gam1, newdata = dat.test))
  pe_gam1 <- 100 * (dat.test$babyweight - pred_gam1) / dat.test$babyweight
  aep_gam1 <- abs(pe_gam1)
  length_gam1 <- length(coef(gam1))

  set.seed(id)
  gaml2 <-
    gamboost(
      logbw ~ bols(int, intercept = FALSE) + logAC + HC + FL + BPD + Daysbeforedelivery +
        bols(parity) + age,
      family = Gaussian(),
      data = dat.train
    )
  cvr <-
    cvrisk(gaml2,
           folds = cv(weights = model.weights(gaml2), type = "bootstrap"),
           grid = 1:1000)
  mstop(gaml2) <- mstop(cvr)
  pred_gaml2 <- exp(predict(gaml2, newdata = dat.test))
  pe_gaml2 <- 100 * (dat.test$babyweight - pred_gaml2) / dat.test$babyweight
  aep_gaml2 <- abs(pe_gaml2)
  length_gaml2 <- length(coef(gaml2))

  return(
    list(
      aep_glm1, aep_lm1, aep_glm2, aep_lm2, aep_glm3, aep_lm3, aep_gam1, aep_gaml2,
      pe_glm1, pe_lm1, pe_glm2, pe_lm2, pe_glm3, pe_lm3, pe_gam1, pe_gaml2,
      pred_glm1, pred_lm1, pred_glm2, pred_lm2, pred_glm3, pred_lm3, pred_gam1, pred_gaml2,
      dat.test$babyweight,
      length_glm1, length_lm1, length_lm2, length_glm2,
      length_lm3, length_glm3, length_gam1, length_gaml2
    )
  )
}
This can then be executed:
library(parallel)
id  <- 1:10
ERG <- mclapply(id, FUN = AMP, mc.cores = 10,
                mc.preschedule = FALSE)
Now we read all the information to analyze the results and build Table 7.2:
aep_qlm1  <- unlist(lapply(ERG, function(x) x[[1]]))
aep_lm1   <- unlist(lapply(ERG, function(x) x[[2]]))
aep_qlm2  <- unlist(lapply(ERG, function(x) x[[3]]))
aep_lm2   <- unlist(lapply(ERG, function(x) x[[4]]))
aep_qlm3  <- unlist(lapply(ERG, function(x) x[[5]]))
aep_lm3   <- unlist(lapply(ERG, function(x) x[[6]]))
aep_qgam1 <- unlist(lapply(ERG, function(x) x[[7]]))
aep_gam1  <- unlist(lapply(ERG, function(x) x[[8]]))

pe_qlm1  <- unlist(lapply(ERG, function(x) x[[9]]))
pe_lm1   <- unlist(lapply(ERG, function(x) x[[10]]))
pe_qlm2  <- unlist(lapply(ERG, function(x) x[[11]]))
pe_lm2   <- unlist(lapply(ERG, function(x) x[[12]]))
pe_qlm3  <- unlist(lapply(ERG, function(x) x[[13]]))
pe_lm3   <- unlist(lapply(ERG, function(x) x[[14]]))
pe_qgam1 <- unlist(lapply(ERG, function(x) x[[15]]))
pe_gam1  <- unlist(lapply(ERG, function(x) x[[16]]))

pred_qglm1 <- unlist(lapply(ERG, function(x) x[[17]]))
pred_lm1   <- unlist(lapply(ERG, function(x) x[[18]]))
pred_qglm2 <- unlist(lapply(ERG, function(x) x[[19]]))
pred_lm2   <- unlist(lapply(ERG, function(x) x[[20]]))
pred_qglm3 <- unlist(lapply(ERG, function(x) x[[21]]))
pred_lm3   <- unlist(lapply(ERG, function(x) x[[22]]))
pred_qgam1 <- unlist(lapply(ERG, function(x) x[[23]]))
pred_gam1  <- unlist(lapply(ERG, function(x) x[[24]]))

bw <- unlist(lapply(ERG, function(x) x[[25]]))
# table
tab <- cbind(
  median(aep_lm1, na.rm = TRUE), median(aep_qlm1, na.rm = TRUE),
  median(aep_lm2, na.rm = TRUE), median(aep_qlm2, na.rm = TRUE),
  median(aep_lm3, na.rm = TRUE), median(aep_qlm3, na.rm = TRUE),
  median(aep_gam1, na.rm = TRUE), median(aep_qgam1, na.rm = TRUE)
)
tab <- rbind(tab, c(
  mean(pe_lm1, na.rm = TRUE), mean(pe_qlm1, na.rm = TRUE),
  mean(pe_lm2, na.rm = TRUE), mean(pe_qlm2, na.rm = TRUE),
  mean(pe_lm3, na.rm = TRUE), mean(pe_qlm3, na.rm = TRUE),
  mean(pe_gam1, na.rm = TRUE), mean(pe_qgam1, na.rm = TRUE)
))
tab <- rbind(tab, c(
  sd(pe_lm1, na.rm = TRUE), sd(pe_qlm1, na.rm = TRUE),
  sd(pe_lm2, na.rm = TRUE), sd(pe_qlm2, na.rm = TRUE),
  sd(pe_lm3, na.rm = TRUE), sd(pe_qlm3, na.rm = TRUE),
  sd(pe_gam1, na.rm = TRUE), sd(pe_qgam1, na.rm = TRUE)
))

colnames(tab) <- c(
  "model1", "model1 quant",
  "model2", "model2 quant",
  "model3", "model3 quant",
  "model4", "model4 quant"
)
rownames(tab) <- c(
  "median absolute percentage error",
  "mean percentage error",
  "sd percentage error"
)
round(t(tab), 2)
median absolute percentage error mean percentage error
model1 5.90 -0.45
model1 quant 5.95 -0.15
model2 5.66 -0.44
model2 quant 5.65 -0.34
model3 5.43 -0.49
model3 quant 5.58 -0.42
model4 5.79 -0.46
model4 quant 5.86 -0.38
sd percentage error
model1 9.55
model1 quant 9.50
model2 9.16
model2 quant 9.14
model3 9.31
model3 quant 9.43
model4 9.43
model4 quant 9.60
Table 7.3
Table 7.3 in the book presents the results of fitting the JSU distribution with different types of candidate models via statistical boosting. This is evaluated via a 10-fold cross-validation.
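The JSU (Johnson's SU) family from gamlss.dist models all four distribution parameters: mu (location), sigma (scale), nu (skewness) and tau (tail weight). A quick way to get a feel for it is to plot the density for hypothetical parameter values (the values below are purely illustrative, not estimates):

curve(dJSU(x, mu = 3400, sigma = 500, nu = 0, tau = 1),
      from = 1500, to = 5500, xlab = "birthweight", ylab = "density")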
AMP <- function(id) {
  set.seed(2305)
  cvd <- cv(type = "kfold", weights = rep(1, nrow(ultra)))

  dat.train <- ultra[cvd[, id] == 1, ]
  dat.test  <- ultra[cvd[, id] == 0, ]

  # linear model (main effects only)
  glm1 <- glmboostLSS(
    babyweight ~ AC + HC + FL + BPD + Daysbeforedelivery + parity + age,
    data = dat.train,
    families = as.families("JSU", mu.link = "log", stabilization = "MAD"),
    method = "noncyclic"
  )
  cvr <- cvrisk(glm1,
                folds = cv(weights = model.weights(glm1), type = "kfold"),
                grid = 1:3000, papply = lapply)
  mstop(glm1) <- mstop(cvr)
  pred_glm1 <- predict(glm1, newdata = dat.test, type = "response")$mu
  pe_glm1 <- 100 * (dat.test$babyweight - pred_glm1) / dat.test$babyweight
  aep_glm1 <- abs(pe_glm1)
  length_glm1 <- c(length(coef(glm1$mu)),
                   length(coef(glm1$sigma)),
                   length(coef(glm1$nu)),
                   length(coef(glm1$tau))) - 1
  ls_glm1 <- mean(dJSU(
    x = dat.test$babyweight,
    mu = predict(glm1, newdata = dat.test, type = "response")$mu,
    sigma = predict(glm1, newdata = dat.test, type = "response")$sigma,
    nu = predict(glm1, newdata = dat.test, type = "response")$nu,
    tau = predict(glm1, newdata = dat.test, type = "response")$tau,
    log = TRUE
  ))

  # linear model with pairwise interactions
  glm2 <- glmboostLSS(
    babyweight ~ AC + HC + FL + BPD + Daysbeforedelivery + parity + age +
      AC:HC + AC:FL + AC:BPD + AC:Daysbeforedelivery + AC:parity + AC:age +
      HC:FL + HC:BPD + HC:Daysbeforedelivery + HC:parity + HC:age +
      FL:BPD + FL:Daysbeforedelivery + FL:parity + FL:age +
      BPD:Daysbeforedelivery + BPD:parity + BPD:age +
      Daysbeforedelivery:parity + Daysbeforedelivery:age + parity:age,
    data = dat.train,
    families = as.families("JSU", mu.link = "log", stabilization = "MAD"),
    method = "noncyclic"
  )
  cvr <- cvrisk(glm2,
                folds = cv(weights = model.weights(glm2), type = "kfold"),
                grid = 1:3000, papply = lapply)
  mstop(glm2) <- mstop(cvr)
  pred_glm2 <- predict(glm2, newdata = dat.test, type = "response")$mu
  pe_glm2 <- 100 * (dat.test$babyweight - pred_glm2) / dat.test$babyweight
  aep_glm2 <- abs(pe_glm2)
  length_glm2 <- c(length(coef(glm2$mu)),
                   length(coef(glm2$sigma)),
                   length(coef(glm2$nu)),
                   length(coef(glm2$tau))) - 1
  ls_glm2 <- mean(dJSU(
    x = dat.test$babyweight,
    mu = predict(glm2, newdata = dat.test, type = "response")$mu,
    sigma = predict(glm2, newdata = dat.test, type = "response")$sigma,
    nu = predict(glm2, newdata = dat.test, type = "response")$nu,
    tau = predict(glm2, newdata = dat.test, type = "response")$tau,
    log = TRUE
  ))

  # linear model with all interactions
  glm3 <- glmboostLSS(
    babyweight ~ AC * HC * FL * BPD * Daysbeforedelivery * parity * age,
    data = dat.train,
    families = as.families("JSU", mu.link = "log", stabilization = "MAD"),
    method = "noncyclic"
  )
  cvr <- cvrisk(glm3,
                folds = cv(weights = model.weights(glm3), type = "kfold"),
                grid = 1:3000, papply = lapply)
  mstop(glm3) <- mstop(cvr)
  pred_glm3 <- predict(glm3, newdata = dat.test, type = "response")$mu
  pe_glm3 <- 100 * (dat.test$babyweight - pred_glm3) / dat.test$babyweight
  aep_glm3 <- abs(pe_glm3)
  length_glm3 <- c(length(coef(glm3$mu)),
                   length(coef(glm3$sigma)),
                   length(coef(glm3$nu)),
                   length(coef(glm3$tau))) - 1
  ls_glm3 <- mean(dJSU(
    x = dat.test$babyweight,
    mu = predict(glm3, newdata = dat.test, type = "response")$mu,
    sigma = predict(glm3, newdata = dat.test, type = "response")$sigma,
    nu = predict(glm3, newdata = dat.test, type = "response")$nu,
    tau = predict(glm3, newdata = dat.test, type = "response")$tau,
    log = TRUE
  ))

  # non-linear model
  gam1 <- gamboostLSS(
    babyweight ~ AC + HC + FL + BPD + Daysbeforedelivery + parity + age,
    data = dat.train,
    families = as.families("JSU", mu.link = "log", stabilization = "MAD"),
    method = "noncyclic"
  )
  cvr <- cvrisk(gam1,
                folds = cv(weights = model.weights(gam1), type = "kfold"),
                grid = 1:1000, papply = lapply)
  mstop(gam1) <- mstop(cvr)
  pred_gam1 <- predict(gam1, newdata = dat.test, type = "response")$mu
  pe_gam1 <- 100 * (dat.test$babyweight - pred_gam1) / dat.test$babyweight
  aep_gam1 <- abs(pe_gam1)
  length_gam1 <- c(length(coef(gam1$mu)),
                   length(coef(gam1$sigma)),
                   length(coef(gam1$nu)),
                   length(coef(gam1$tau)))
  ls_gam1 <- mean(dJSU(
    x = dat.test$babyweight,
    mu = predict(gam1, newdata = dat.test, type = "response")$mu,
    sigma = predict(gam1, newdata = dat.test, type = "response")$sigma,
    nu = predict(gam1, newdata = dat.test, type = "response")$nu,
    tau = predict(gam1, newdata = dat.test, type = "response")$tau,
    log = TRUE
  ))

  return(list(aep_glm1, aep_glm2, aep_glm3, aep_gam1,
              pe_glm1, pe_glm2, pe_glm3, pe_gam1,
              pred_glm1, pred_glm2, pred_glm3, pred_gam1,
              mstop(glm1), mstop(glm2), mstop(glm3), mstop(gam1),
              length_glm1, length_glm2, length_glm3, length_gam1,
              ls_glm1, ls_glm2, ls_glm3, ls_gam1))
}
Run on a cluster in parallel:
library(parallel)
id  <- 1:10
ERG <- mclapply(id, FUN = AMP, mc.cores = 10,
                mc.preschedule = FALSE)
Analyse the results and build the table:
aep_glm1 <- unlist(lapply(ERG, function(x) x[[1]]))
aep_glm2 <- unlist(lapply(ERG, function(x) x[[2]]))
aep_glm3 <- unlist(lapply(ERG, function(x) x[[3]]))
aep_gam1 <- unlist(lapply(ERG, function(x) x[[4]]))
summary(cbind(aep_glm1, aep_glm2, aep_glm3, aep_gam1))
aep_glm1 aep_glm2 aep_glm3 aep_gam1
Min. : 0.01664 Min. : 0.00595 Min. : 0.00451 Min. : 0.00319
1st Qu.: 2.56624 1st Qu.: 2.36707 1st Qu.: 2.41227 1st Qu.: 2.51316
Median : 6.08711 Median : 5.65487 Median : 5.47726 Median : 5.90358
Mean : 7.49147 Mean : 7.09282 Mean : 7.12070 Mean : 7.44914
3rd Qu.:10.80068 3rd Qu.:10.25353 3rd Qu.:10.11213 3rd Qu.:10.44822
Max. :39.63177 Max. :44.97803 Max. :54.95630 Max. :60.24528
pe_glm1 <- unlist(lapply(ERG, function(x) x[[5]]))
pe_glm2 <- unlist(lapply(ERG, function(x) x[[6]]))
pe_glm3 <- unlist(lapply(ERG, function(x) x[[7]]))
pe_gam1 <- unlist(lapply(ERG, function(x) x[[8]]))
summary(cbind(pe_glm1, pe_glm2, pe_glm3, pe_gam1))
pe_glm1 pe_glm2 pe_glm3 pe_gam1
Min. :-39.6318 Min. :-44.9780 Min. :-54.9563 Min. :-60.2453
1st Qu.: -6.7315 1st Qu.: -6.6353 1st Qu.: -6.3108 1st Qu.: -6.6977
Median : -0.5891 Median : -0.5015 Median : -0.4331 Median : -0.4905
Mean : -0.6506 Mean : -0.9567 Mean : -0.9579 Mean : -1.1448
3rd Qu.: 5.3179 3rd Qu.: 4.8514 3rd Qu.: 4.9422 3rd Qu.: 5.3121
Max. : 38.9190 Max. : 31.2456 Max. : 31.7973 Max. : 32.0935
mstop_glm1 <- unlist(lapply(ERG, function(x) x[[13]]))
mstop_glm2 <- unlist(lapply(ERG, function(x) x[[14]]))
mstop_glm3 <- unlist(lapply(ERG, function(x) x[[15]]))
mstop_gam1 <- unlist(lapply(ERG, function(x) x[[16]]))
summary(cbind(mstop_glm1, mstop_glm2, mstop_glm3, mstop_gam1))
mstop_glm1 mstop_glm2 mstop_glm3 mstop_gam1
Min. : 654.0 Min. :1495 Min. : 587 Min. :298.0
1st Qu.: 838.8 1st Qu.:2328 1st Qu.: 912 1st Qu.:396.5
Median : 963.5 Median :2873 Median :1569 Median :603.0
Mean :1524.3 Mean :2572 Mean :1478 Mean :555.3
3rd Qu.:2442.0 3rd Qu.:2960 3rd Qu.:1896 3rd Qu.:643.0
Max. :3000.0 Max. :2994 Max. :2790 Max. :909.0
ls_glm1 <- unlist(lapply(ERG, function(x) x[[21]]))
ls_glm2 <- unlist(lapply(ERG, function(x) x[[22]]))
ls_glm3 <- unlist(lapply(ERG, function(x) x[[23]]))
ls_gam1 <- unlist(lapply(ERG, function(x) x[[24]]))
round(apply(cbind(ls_glm1, ls_glm2, ls_glm3, ls_gam1), MARGIN = 2, FUN = mean), 2)
ls_glm1 ls_glm2 ls_glm3 ls_gam1
-7.12 -7.09 -7.09 -7.11
l_glm1_mu  <- unlist(lapply(ERG, function(x) x[[17]][1])) - 1
l_glm1_si  <- unlist(lapply(ERG, function(x) x[[17]][2])) - 1
l_glm1_nu  <- unlist(lapply(ERG, function(x) x[[17]][3])) - 1
l_glm1_tau <- unlist(lapply(ERG, function(x) x[[17]][4])) - 1
l_glm2_mu  <- unlist(lapply(ERG, function(x) x[[18]][1])) - 1
l_glm2_si  <- unlist(lapply(ERG, function(x) x[[18]][2])) - 1
l_glm2_nu  <- unlist(lapply(ERG, function(x) x[[18]][3])) - 1
l_glm2_tau <- unlist(lapply(ERG, function(x) x[[18]][4])) - 1
l_glm3_mu  <- unlist(lapply(ERG, function(x) x[[19]][1])) - 1
l_glm3_si  <- unlist(lapply(ERG, function(x) x[[19]][2])) - 1
l_glm3_nu  <- unlist(lapply(ERG, function(x) x[[19]][3])) - 1
l_glm3_tau <- unlist(lapply(ERG, function(x) x[[19]][4])) - 1
l_gam1_mu  <- unlist(lapply(ERG, function(x) x[[20]][1]))
l_gam1_si  <- unlist(lapply(ERG, function(x) x[[20]][2]))
l_gam1_nu  <- unlist(lapply(ERG, function(x) x[[20]][3]))
l_gam1_tau <- unlist(lapply(ERG, function(x) x[[20]][4]))
tab <- c(median(aep_glm1), median(aep_glm2), median(aep_glm3), median(aep_gam1))
tab <- rbind(tab, c(mean(pe_glm1), mean(pe_glm2), mean(pe_glm3), mean(pe_gam1)))
tab <- rbind(tab, c(sd(pe_glm1), sd(pe_glm2), sd(pe_glm3), sd(pe_gam1)))

tab <- rbind(tab, c(mean(l_glm1_mu), mean(l_glm2_mu), mean(l_glm3_mu), mean(l_gam1_mu)))
tab <- rbind(tab, c(mean(l_glm1_si), mean(l_glm2_si), mean(l_glm3_si), mean(l_gam1_si)))
tab <- rbind(tab, c(mean(l_glm1_nu), mean(l_glm2_nu), mean(l_glm3_nu), mean(l_gam1_nu)))
tab <- rbind(tab, c(mean(l_glm1_tau), mean(l_glm2_tau), mean(l_glm3_tau), mean(l_gam1_tau)))

colnames(tab) <- c("model1", "model2", "model3", "model4")
rownames(tab) <- c("MAE", "MPE", "RE", "variables mu", "variables sigma",
                   "variables nu", "variables tau")
This leads to Table 7.3 in the book:
round(tab, 2)
model1 model2 model3 model4
MAE 6.09 5.65 5.48 5.90
MPE -0.65 -0.96 -0.96 -1.14
RE 9.71 9.30 9.42 9.95
variables mu 6.00 19.70 20.90 7.00
variables sigma 5.90 15.60 16.50 5.30
variables nu 4.40 10.90 14.40 5.90
variables tau 4.90 10.70 7.50 4.60
NCI 60 data
Introduction
To illustrate variable selection in the context of distributional regression in Chapter 7, the cancer cell panel of the National Cancer Institute (NCI) is used. The so-called NCI-60 data contains gene expression profiles of 60 tumor cell lines and can be downloaded in different formats from http://discover.nci.nih.gov/cellminer/. These 60 human tumor cell lines represent different tumor entities: they were derived from patients with leukemia (blood cancer), melanoma (skin cancer), and lung, colon, central nervous system, ovarian, renal, breast and prostate cancers.
Data handling
In our case, the aim is to model the protein expression (outcome), measured via antibodies, with gene expression data as the predictors (Affymetrix HG-U133A-B chip). We have downloaded two datasets, containing the gene expression data and the protein expression (the objects NCI60 and KRT19_protein are also available on the Datasets page of this website). They now need to be cleaned and merged:
load("data/raw_data_CM.RData")
# delete one column with empty entries:
<- NCI60[,-which(names(NCI60)=="LC.NCI.H23")]
NCI60 # Harmonize the identifer
$Gene.name.d <- ifelse(as.character(NCI60$Gene.name.d) == "-",
NCI60paste("id_", as.character(NCI60$Identifier.c), sep = ""),
as.character(NCI60$Gene.name.d))
Combine the same gene expression variables from multiple different probes into one by taking their median. For details, see Sun, Zhou, and Fan (2020).
NCI60_new <- apply(NCI60[, 8:66], MARGIN = 2, FUN = function(x)
  tapply(x, factor(NCI60$Gene.name.d), FUN = median))

# log2 transform of gene-expression values
NCI60_new <- apply(abs(NCI60_new), MARGIN = 2, FUN = log2)
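To see what the nested apply()/tapply() call above does, here is a toy example with hypothetical values: two probes map to gene "A" and one probe to gene "B", and tapply() collapses them to one (median) value per gene within a sample column.

probe_gene <- factor(c("A", "A", "B"))
sample1 <- c(1.0, 3.0, 5.0)
tapply(sample1, probe_gene, median)
A B 
2 5 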
Figure 7.7: Histograms of kurtosis
Illustrate the distributional shape of the data by computing the kurtosis of each variable.
# kurtosis of expression values
kurt_exp <- apply(NCI60_new, MARGIN = 1, FUN = kurtosis)
# kurtosis of protein values (re-transformed)
kurt_pro <- apply(2^KRT19_protein[, 7:66], MARGIN = 1, FUN = kurtosis)
Now plot histograms of these values:
par(mfrow=c(1,2))
hist(kurt_pro, breaks = 100, col = "darkgrey",
main = "Protein expression values", xlab = "kurtosis")
hist(kurt_exp, breaks =100, col = "darkgrey", ylim = c(0, 1400),
main = "Gene expression values", xlab = "kurtosis")
Check which of the proteins has the highest variability (standard deviation):
KRT19_protein$Gene.name.d[which.max(
  apply(2^KRT19_protein[, 7:66], MARGIN = 1, FUN = sd))]
[1] "KRT19"
This will be our outcome.
Merge gene expression and protein data
library(data.table)
# transpose data and take care of names
NCI60 <- as.data.frame(NCI60_new)
NCI60$Gene.name.d <- rownames(NCI60_new)
NCI60t <- transpose(NCI60)
colnames(NCI60t) <- rownames(NCI60)
rownames(NCI60t) <- colnames(NCI60)
rm(NCI60_new)

# get the right protein values
KRT19_protein_n <- KRT19_protein[KRT19_protein$Gene.name.d == "KRT19", c(7:66)]
KRT19_protein_n <- KRT19_protein_n[, names(KRT19_protein_n) %in% names(NCI60)]

# add the protein to the gene expression data
NCI60 <- NCI60t[1:59, ]
NCI60[, 1] <- as.numeric(2^KRT19_protein_n[, rownames(NCI60)])
names(NCI60)[1] <- "KRT19_protein"
NCI60 <- data.frame(apply(NCI60, 2, function(x) as.numeric(x)))
rm(KRT19_protein, KRT19_protein_n)
Figure 7.8: Histogram and density of outcome
hist(NCI60$KRT19_protein, breaks = 100, prob = TRUE, col = "darkgrey",
xlab ="KRT 19 protein", main = "", las = 1)
lines(density(NCI60$KRT19_protein), lwd = 2)
Model fitting
Choose suitable distribution
gam1 <- gamlss(KRT19_protein ~ KRT19, data = NCI60,
               sigma.formula = ~ KRT19,
               nu.formula = ~ KRT19,
               tau.formula = ~ KRT19)
GAMLSS-RS iteration 1: Global Deviance = 453.2066
GAMLSS-RS iteration 2: Global Deviance = 422.2179
GAMLSS-RS iteration 3: Global Deviance = 420.2152
GAMLSS-RS iteration 4: Global Deviance = 420.1806
GAMLSS-RS iteration 5: Global Deviance = 420.1802
<- chooseDist(gam1, type = "realplus") findD
minimum GAIC(k= 2 ) family: GG
minimum GAIC(k= 3.84 ) family: GG
minimum GAIC(k= 4.08 ) family: GG
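chooseDist() selects the generalized gamma (GG) distribution. The chosen family can be handed to the boosting algorithm via as.families(); a minimal sketch with a single covariate (purely illustrative; the full model over all genes follows below):

glm_GG <- glmboostLSS(KRT19_protein ~ KRT19, data = NCI60,
                      families = as.families("GG"), method = "noncyclic")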
Function to run boosting with leave-one-out CV
AMP <- function(i) {
  # for reproducibility
  set.seed(i)
  # just for information
  print(i)
  # fit initial model without observation i
  glm1 <- glmboostLSS(KRT19_protein ~ .,
                      data = NCI60[-i, ],
                      control = boost_control(mstop = 1, nu = 0.01),
                      method = "noncyclic", families = as.families("GG"))
  # 25-fold subsampling to determine mstop
  cvrisk_glm1 <- cvrisk(glm1,
                        folds = cv(weights = model.weights(glm1),
                                   type = "subsampling"),
                        grid = 1:150, papply = lapply)
  # set mstop to the 'optimal' value
  mstop(glm1) <- mstop(cvrisk_glm1)
  # compute prediction for the left-out observation
  preds <- predict(glm1, type = "response", newdata = NCI60[i, ])

  return(c(mu = preds$mu, sigma = preds$sigma, nu = preds$nu,
           mstop = mstop(cvrisk_glm1),
           sel_mu = length(coef(glm1$mu)),
           sel_sigma = length(coef(glm1$sigma)),
           sel_nu = length(coef(glm1$nu))))
}
Run in parallel on multiple cores (Linux environment, takes some time…)
library(parallel)
i   <- 1:nrow(NCI60) # one index per observation
ERG <- mclapply(i, FUN = AMP, mc.cores = 5, mc.preschedule = FALSE)
Check how many genes were selected
# For the mu parameter
summary(unlist(lapply(ERG, function(x) x[5])))
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.000 2.000 3.000 2.475 3.500 6.000
# for sigma
summary(unlist(lapply(ERG, function(x) x[6])))
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 10.00 12.00 10.76 14.50 18.00
# for nu
table(unlist(lapply(ERG, function(x) x[7])))
0 2
58 1