13: Variable Selection for Gene Expression Data
Introduction
This application uses the riboflavin data from the hdi package.
library(gamboostLSS)
library(hdi)
data("riboflavin")
# number of observations and gene expression levels
dim(riboflavin$x)
[1] 71 4088
Coefficient paths
In Figure 13.1 of the book, we display the coefficient paths (one panel each for mu and sigma) over the first 100 iterations when fitting a Gaussian location and scale model.
dat <- riboflavin
# change the variable names
attr(dat$x, "dimnames")[[2]] <- paste("", 1:4088, sep = "")
# fit a boosting model (remove intercept for nicer plot)
glm1 <- glmboostLSS(y ~ -1 + x, data = dat,
                    method = "noncyclic",
                    families = GaussianLSS(mu = NULL,
                                           sigma = NULL,
                                           stabilization = "MAD"),
                    control = boost_control(nu = 0.1))
par(mfrow = c(1, 2))
plot(glm1, off2int = TRUE, col = 1, las = 1, ylim = c(-1, 1))
Leave-one-out cross-validation
We compare a classical boosting approach (mean regression) with a Gaussian location and scale model and with a Gumbel model. Each approach is wrapped in its own function so that the leave-one-out runs can be executed in parallel on a computing cluster.
First, classical boosting:
AMP <- function(id)
{
  set.seed(id)
  print(id)
  # classic L2 boosting without one obs (id)
  glm1 <- glmboost(y ~ x, data = dat[-id, ])
  # bootstrapping
  cvr <- cvrisk(glm1, grid = 1:500, papply = lapply)
  mstop(glm1) <- mstop(cvr)
  # predict on the left out obs
  pred <- predict(glm1, newdata = dat[id, ])
  # compute msep
  msep <- (dat[id, ]$y - pred)^2
  coefs <- coef(glm1, which = "")
  return(list(pred = pred, msep = msep,
              mstop = mstop(cvr), coef = coefs))
}
For executing in parallel:
library(parallel)
id <- 1:nrow(dat)
ERG <- mclapply(id, FUN = AMP, mc.cores = 11,
                mc.preschedule = FALSE)
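Note that mclapply() relies on forking and therefore runs sequentially on Windows. A minimal sketch of a socket-cluster alternative, assuming the AMP() wrapper and the objects dat and id defined above:
# socket-cluster alternative to mclapply() (e.g. on Windows)
cl <- makeCluster(11)
clusterEvalQ(cl, library(mboost))   # glmboost() and cvrisk() are provided by mboost
clusterExport(cl, c("AMP", "dat"))  # ship the wrapper and the data to the workers
ERG <- parLapply(cl, id, AMP)
stopCluster(cl)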
Now check the MSEP, MAEP and selected variables:
mean(unlist(sapply(ERG, function(x)x[2]))) # msep
[1] 0.2033454
median(sqrt(unlist(sapply(ERG, function(x)x[2])))) # maep
[1] 0.2405357
mean(sapply(ERG, function(x) sum(x[4]$coef != 0)))
[1] 33.21127
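To see which genes are selected consistently across the 71 leave-one-out fits, the stored coefficient vectors can be tabulated; a small sketch (not part of the original analysis; the intercept, if present, appears in every fit):
# frequency of each variable among the non-zero coefficients of the 71 fits
selfreq <- table(unlist(lapply(ERG, function(x) names(x$coef)[x$coef != 0])))
head(sort(selfreq, decreasing = TRUE))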
Gaussian location and scale model
AMP_LSS <- function(id)
{
  set.seed(id)
  print(id)
  glm1 <- glmboostLSS(y ~ x, data = dat[-id, ],
                      method = "noncyclic",
                      families = GaussianLSS(),
                      control = boost_control(mstop = 50))
  cvr <- cvrisk(glm1, papply = lapply)
  mstop(glm1) <- mstop(cvr)
  pred <- predict(glm1, newdata = dat[id, ],
                  type = "response")
  msep <- (dat[id, ]$y - pred$mu)^2
  mae <- abs(dat[id, ]$y - pred$mu)
  coefs <- coef(glm1, which = "")
  return(list(pred = pred, msep = msep,
              mstop = mstop(glm1),
              coef = coefs))
}
Execute in parallel:
ERG <- mclapply(id, FUN = AMP_LSS, mc.cores = 11,
                mc.preschedule = FALSE)
mean(unlist(sapply(ERG, function(x)x[2]))) # msep
[1] 0.2140843
median(sqrt(unlist(sapply(ERG, function(x)x[2])))) # maep
[1] 0.2342696
mean(sapply(ERG, function(x) sum(x[4]$coef$mu != 0)))
[1] 19.35211
mean(sapply(ERG, function(x) sum(x[4]$coef$sigma != 0)))
[1] 3.014085
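The later sections also report a predictive log-score; for the Gaussian location and scale fits it can be computed from the stored predictions, for example (a sketch, not part of the original output):
# mean log-score of each left-out observation under its predictive N(mu, sigma)
mean(sapply(seq_along(ERG), function(i)
  dnorm(dat$y[i], mean = ERG[[i]]$pred$mu,
        sd = ERG[[i]]$pred$sigma, log = TRUE)))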
Gumbel distribution
AMP_LSS_GU <- function(id)
{
  set.seed(id)
  print(id)
  # as.families("GU") interfaces the Gumbel family from the gamlss.dist package
  glm1 <- glmboostLSS(y ~ x, data = dat[-id, ],
                      method = "noncyclic",
                      families = as.families("GU",
                                             stabilization = "MAD"),
                      control = boost_control(mstop = 1, nu = 0.01))
  cvr <- cvrisk(glm1, grid = 1:300, papply = lapply)
  mstop(glm1) <- mstop(cvr)
  pred <- predict(glm1, newdata = dat[id, ],
                  type = "response")
  msep <- (dat[id, ]$y - pred$mu)^2
  coefs <- coef(glm1, which = "")
  return(list(pred = pred, msep = msep,
              mstop = mstop(glm1), coef = coefs))
}
After running in parallel:
ERG <- mclapply(id, FUN = AMP_LSS_GU, mc.cores = 11,
                mc.preschedule = FALSE)
mean(as.numeric(unlist(sapply(ERG, function(x)x[2]))))# msep
[1] 0.3758644
median(sqrt(as.numeric(unlist(sapply(ERG, function(x)x[2]))))) # maep
[1] 0.3587012
mean(sapply(ERG, function(x) sum(x[4]$coef$mu != 0)))
[1] 24.53521
mean(sapply(ERG, function(x) sum(x[4]$coef$sigma != 0)))
[1] 9.042254
Comparison with cyclical approach
AMP_LSS_cyc <- function(id)
{
  set.seed(id)
  print(id)
  glm1 <- glmboostLSS(y ~ x, data = dat[-id, ],
                      method = "cyclic",
                      families = GaussianLSS(stabilization = "MAD"),
                      control = boost_control(mstop = 200, nu = 0.01))
  cvr <- cvrisk(glm1, papply = lapply)
  mstop(glm1) <- mstop(cvr)
  pred <- predict(glm1, newdata = dat[id, ],
                  type = "response")
  msep <- (dat[id, ]$y - pred$mu)^2
  mae <- abs(dat[id, ]$y - pred$mu)
  coefs <- coef(glm1, which = "")
  return(list(pred = pred, msep = msep,
              mstop = mstop(glm1),
              coef = coefs))
}
Executing as above:
ERG <- mclapply(id, FUN = AMP_LSS_cyc, mc.cores = 11,
                mc.preschedule = FALSE)
# removing one failing run
ERG[[43]] <- NULL
mean(as.numeric(unlist(sapply(ERG, function(x)x[2]))))# msep
[1] 0.2449231
median(sqrt(as.numeric(unlist(sapply(ERG, function(x)x[2]))))) # maep
[1] 0.2955776
mean(sapply(ERG, function(x) sum(x[4]$coef$mu != 0)))
[1] 28.18571
mean(sapply(ERG, function(x) sum(x[4]$coef$sigma != 0)))
[1] 6.742857
One standard error rule
Instead of the mstop value that minimizes the cross-validated risk, the one standard error rule chooses the smallest mstop whose mean out-of-bag risk lies within one standard error of that minimum, which yields sparser models.
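The functions in this and the following sections also use a data frame called ribo, which is not constructed in this excerpt. Judging from the variable names referenced below (y and x.1, ..., x.4088), it is presumably a plain data-frame version of the renamed riboflavin data; a sketch under that assumption:
# assumed construction of ribo (not shown in the original code):
# data.frame() expands the expression matrix into columns x.1, ..., x.4088
ribo <- data.frame(y = dat$y, x = dat$x)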
AMP_LSS_ose <- function(id)
{
  set.seed(id)
  print(id)
  glm1 <- glmboostLSS(y ~ x, data = dat[-id, ],
                      method = "noncyclic",
                      families = GaussianLSS(),
                      control = boost_control(mstop = 1))
  cvr <- cvrisk(glm1, grid = 1:100, papply = lapply)
  # mean out-of-bag risk per iteration and classical CV-optimal mstop
  means <- apply(cvr, 2, mean)
  mstop_orig <- which.min(means)
  risk_orig <- min(means)
  # standard error of the risk at the optimal mstop (25 bootstrap folds)
  ose <- sd(cvr[, mstop(cvr)]) / sqrt(25)
  # smallest mstop with mean risk within one standard error of the minimum
  mstop_ose <- min(which(means <= min(means) + ose))
  mstop(glm1) <- mstop_ose
  pred <- predict(glm1, newdata = dat[id, ],
                  type = "response")
  msep <- (dat[id, ]$y - pred$mu)^2
  mae <- abs(dat[id, ]$y - pred$mu)
  coefs <- coef(glm1, which = "")
  selmu <- length(coef(glm1)$mu) - 1
  selsi <- length(coef(glm1)$sigma) - 1
  # predictive log-score of the left-out observation
  ls_ose <- dnorm(x = ribo[id, "y"], mean = pred$mu,
                  sd = pred$sigma, log = TRUE)
  return(list(pred = pred, msep = msep,
              mstop = mstop(glm1),
              selmu = selmu, selsi = selsi,
              ls_ose = ls_ose))
}
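To illustrate the rule in isolation, a toy example with made-up risk values (not taken from the riboflavin data):
means <- c(1.00, 0.80, 0.70, 0.68, 0.67, 0.69)  # mean CV risk per mstop
ose <- 0.02                                     # standard error at the optimum
which.min(means)                       # CV-optimal mstop: 5
min(which(means <= min(means) + ose))  # one standard error rule picks mstop 4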
ERG <- mclapply(id, FUN = AMP_LSS_ose, mc.cores = 11,
                mc.preschedule = FALSE)
mean(unlist(sapply(ERG, function(x)x$selmu)))
[1] 15
mean(unlist(sapply(ERG, function(x)x$selsi)))
[1] 1.43662
mean(unlist(sapply(ERG, function(x)x$msep)))
[1] 0.2848565
median(sqrt(unlist(sapply(ERG, function(x)x$msep))))
[1] 0.288833
median(unlist(sapply(ERG, function(x)x$mstop)))
[1] 36
mean(unlist(sapply(ERG, function(x)x$ls_ose)))
[1] -0.7925668
Probing
Probing adds a shuffled copy (a "probe") of every covariate and stops the boosting algorithm as soon as the first probe is selected.
AMP_LSS_probing <- function(id)
{
  set.seed(id)
  print(id)
  glm1 <- glmboostLSS(y ~ x, data = dat[-id, ],
                      method = "noncyclic",
                      families = GaussianLSS(),
                      control = boost_control(mstop = 1))
  set.seed(id)
  # shuffle all covariates (leaving out the response in column 1) to create probes
  probes.train <- as.data.frame(sapply(ribo[, -1], sample))
  names(probes.train) <- paste(names(ribo)[-1], "probe", sep = "_")
  probes.train <- cbind(ribo, probes.train)
  glm1 <- glmboostLSS(y ~ ., data = probes.train[-id, ],
                      method = "noncyclic",
                      families = GaussianLSS(),
                      control = boost_control(mstop = 100))
  if (!any(selected(glm1, merge = TRUE) > length(names(ribo)[-1]))) glm1[100]
  # stop one iteration before the first probe was selected
  mstop_probes <- min(which(selected(glm1, merge = TRUE) >
                              length(names(ribo)[-1]))) - 1
  glm1[mstop_probes]
  coefs <- coef(glm1)
  selmu <- length(coef(glm1)$mu) - 1
  selsi <- length(coef(glm1)$sigma) - 1
  pred_mu <- predict(glm1$mu, newdata = probes.train[id, ],
                     type = "response")
  pred_sigma <- predict(glm1$sigma, newdata = probes.train[id, ],
                        type = "response")
  msep_prob <- (dat[id, ]$y - pred_mu)^2
  mae_prob <- abs(dat[id, ]$y - pred_mu)
  ls_prob <- dnorm(x = ribo[id, "y"], mean = pred_mu,
                   sd = pred_sigma, log = TRUE)
  return(list(selmu = selmu, selsi = selsi,
              mstop = mstop_probes,
              pred_mu = pred_mu,
              pred_sigma = pred_sigma,
              msep = msep_prob, mae = mae_prob,
              ls_prob = ls_prob))
}
ERG <- mclapply(id, FUN = AMP_LSS_probing, mc.cores = 11,
                mc.preschedule = FALSE)
mean(unlist(sapply(ERG, function(x)x$selmu)))
[1] 5.295775
mean(unlist(sapply(ERG, function(x)x$selsi)))
[1] 0.3661972
mean(unlist(sapply(ERG, function(x)x$msep)))
[1] 0.5452482
median(sqrt(unlist(sapply(ERG, function(x)x$msep))))
[1] 0.4737379
median(unlist(sapply(ERG, function(x)x$mstop)))
[1] 16
mean(unlist(sapply(ERG, function(x)x$ls_prob)))
[1] -1.136055
Stability selection
Stability selection fits the model on repeated subsamples and keeps only base-learners whose selection frequency exceeds a threshold while controlling the per-family error rate (PFER); the model is then refitted on the stable set.
AMP_LSS_stabsel <- function(id)
{
  set.seed(id)
  print(id)
  glm1 <- glmboostLSS(y ~ x, data = dat[-id, ],
                      method = "noncyclic",
                      families = GaussianLSS(),
                      control = boost_control(mstop = 100))
  set.seed(id)
  s1 <- stabsel(glm1, q = 20, PFER = 1, assumption = 'none')
  sel <- names(s1$selected)
  # drop the intercept from the stable set, if present
  if (any(sel %in% (grep("(Intercept)", sel, value = TRUE)))) {
    sel <- sel[-which(sel %in% (grep("(Intercept)", sel, value = TRUE)))]
  }
  # build the formula for mu from the stable base-learners
  if (any(grepl('.mu', sel))) {
    sel_mu <- gsub(".mu", "\\1", sel[grep('mu', sel)])
    sel_mu <- gsub("x", "\\1", sel_mu)
    form_mu <- as.formula(paste("y~", paste(paste0("x.", sel_mu), collapse = '+'), sep = ''))
  } else {
    sel_mu <- 0
    form_mu <- as.formula(paste("y~1"))
  }
  # build the formula for sigma from the stable base-learners
  if (any(grepl('sigma', sel))) {
    sel_sigma <- gsub(".sigma", "\\1", sel[grep('sigma', sel)])
    sel_sigma <- gsub("x", "\\1", sel_sigma)
    form_sigma <- as.formula(paste("y~", paste(paste0("x.", sel_sigma), collapse = '+'), sep = ''))
  } else {
    sel_sigma <- 0
    form_sigma <- as.formula(paste("y~1"))
  }
  form <- list(mu = form_mu,
               sigma = form_sigma)
  # refit on the stable set only and tune mstop
  glm1 <- glmboostLSS(form, data = ribo[-id, ],
                      method = "noncyclic",
                      families = GaussianLSS(),
                      control = boost_control(mstop = 1))
  cvr <- cvrisk(glm1, grid = 1:100, papply = lapply)
  mstop(glm1) <- mstop(cvr)
  coefs <- coef(glm1)
  selmu <- length(coef(glm1)$mu) - 1
  selsi <- length(coef(glm1)$sigma) - 1
  pred_mu <- predict(glm1$mu, newdata = ribo[id, ],
                     type = "response")
  pred_sigma <- predict(glm1$sigma, newdata = ribo[id, ],
                        type = "response")
  msep_prob <- (dat[id, 'y'] - pred_mu)^2
  mae_prob <- abs(dat[id, 'y'] - pred_mu)
  ls_stab <- dnorm(x = ribo[id, 'y'], mean = pred_mu,
                   sd = pred_sigma, log = TRUE)
  return(list(selmu = selmu, selsi = selsi,
              mstop = mstop(glm1),
              pred_mu = pred_mu, pred_sigma = pred_sigma,
              msep = msep_prob, mae = mae_prob,
              ls_stab = ls_stab))
}
ERG <- mclapply(id, FUN = AMP_LSS_stabsel, mc.cores = 11,
                mc.preschedule = FALSE)
mean(unlist(sapply(ERG, function(x)x$selmu)))
[1] 1.929577
mean(unlist(sapply(ERG, function(x)x$selsi)))
[1] 0
mean(unlist(sapply(ERG, function(x)x$msep)))
[1] 0.4384487
median(sqrt(unlist(sapply(ERG, function(x)x$msep))))
[1] 0.3580849
mean(unlist(sapply(ERG, function(x)x$ls_stab)))
[1] -1.065873
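Beyond the leave-one-out loop, the stability selection result can also be inspected on the full data set; a minimal sketch with the same q and PFER as above (plot() for stabsel objects is provided by the stabs package):
glm_full <- glmboostLSS(y ~ x, data = dat, method = "noncyclic",
                        families = GaussianLSS(),
                        control = boost_control(mstop = 100))
s_full <- stabsel(glm_full, q = 20, PFER = 1, assumption = "none")
plot(s_full)      # selection frequencies of the base-learners
s_full$selected   # the stable base-learners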
Deselection
Deselection first fits and tunes the model as usual and then refits it after removing all base-learners whose attributable risk reduction falls below a small threshold tau.
AMP_LSS_desel <- function(id)
{
  set.seed(id)
  print(id)
  glm1 <- glmboostLSS(y ~ x, data = dat[-id, ],
                      method = "noncyclic",
                      families = GaussianLSS(),
                      control = boost_control(mstop = 1))
  cvr <- cvrisk(glm1, grid = 1:100, papply = lapply)
  mstop(glm1) <- mstop(cvr)
  pred <- predict(glm1, newdata = dat[id, ],
                  type = "response")
  msep <- (dat[id, ]$y - pred$mu)^2
  mae <- abs(dat[id, ]$y - pred$mu)
  coefs <- coef(glm1, which = "")
  selmu <- length(coef(glm1)$mu) - 1
  selsi <- length(coef(glm1)$sigma) - 1
  # refit after deselecting base-learners with negligible risk reduction
  desel <- DeselectBoostLSS_2(glm1, tau = 0.01,
                              fam = GaussianLSS())
  pred_mu <- predict(desel$mu, newdata = ribo[id, ],
                     type = "response")
  pred_sigma <- predict(desel$sigma, newdata = ribo[id, ],
                        type = "response")
  msep_desel <- (ribo[id, 'y'] - pred_mu)^2
  mae_desel <- abs(ribo[id, 'y'] - pred_mu)
  selmu_desel <- length(coef(desel$mu)) - 1
  selsi_desel <- length(coef(desel$sigma)) - 1
  ls_desel <- dnorm(x = ribo[id, 'y'], mean = pred_mu,
                    sd = pred_sigma, log = TRUE)
  return(list(pred = pred, msep = msep,
              mstop = mstop(glm1),
              selmu = selmu, selsi = selsi,
              selmu_desel = selmu_desel,
              selsi_desel = selsi_desel,
              pred_mu = pred_mu, pred_sigma = pred_sigma,
              msep_desel = msep_desel, mae_desel = mae_desel,
              ls_desel = ls_desel))
}
This calls the function DeselectBoostLSS_2 (note that it relies on count(), presumably plyr::count(), so the plyr package needs to be loaded):
DeselectBoostLSS_2 <- function(object,
                               data = NULL,
                               fam, tau = NULL,
                               method = c('attributable', 'cumulative')) {
  # the data set is taken from the fitted model object
  data <- attr(object, 'data')

  mstop <- ifelse(inherits(object, 'nc_mboostLSS'),
                  mstop(object)[1],
                  sum(mstop(object)))
  if (length(risk(object, merge = TRUE)) > (mstop + 2))
    stop("risk cannot contain more entries than mstop")

  parameter <- names(object)
  which.response <- colnames(object[[1]]$response) == colnames(data)
  name.response <- colnames(data)[which.response]

  select <- selected(object, parameter = names(object))
  select1 <- selected(object, parameter = names(object))
  if (inherits(object, 'glmboostLSS')) {
    select[[1]] <- select[[1]] - 1
    select[[2]] <- select[[2]] - 1
    select[[1]] <- select[[1]][select[[1]] != 0]
    select[[2]] <- select[[2]][select[[2]] != 0]

    # risk reduction attributable to each boosting iteration
    RiskRed <- risk(object, merge = TRUE)
    totalRiskRed <-
      (risk(object, merge = TRUE)[1] -
         risk(object, merge = TRUE)[length(risk(object, merge = TRUE))])
    diffRiskRed_all <- sapply(2:(length(RiskRed) - 1), function(k) {
      RiskRed[k] - RiskRed[k + 1]
    })
    names(diffRiskRed_all) <- names(risk(object, merge = TRUE)[-c(1:2)])
    diffRiskRed <- vector("list")
    diffRiskRed[[1]] <-
      diffRiskRed_all[names(diffRiskRed_all) == parameter[[1]]][select1[[1]] - 1 != 0]
    diffRiskRed[[2]] <-
      diffRiskRed_all[names(diffRiskRed_all) == parameter[[2]]][select1[[2]] - 1 != 0]
  } else {
    RiskRed <- risk(object, merge = TRUE)
    diffRiskRed_all <- sapply(1:(length(RiskRed) - 1), function(k) {
      RiskRed[k] - RiskRed[k + 1]
    })
    diffRiskRed_all <- diffRiskRed_all[-c(1:4)]
    names(diffRiskRed_all) <- names(risk(object, merge = TRUE)[-c(1:2)])
    diffRiskRed <- vector("list")
    diffRiskRed[[1]] <- diffRiskRed_all[names(diffRiskRed_all) == parameter[[1]]]
    diffRiskRed[[2]] <- diffRiskRed_all[names(diffRiskRed_all) == parameter[[2]]]
  }

  if (is.null(coef(object)[[1]]))
    select[[1]] <- 0
  else
    select[[1]] <- select[[1]]
  if (is.null(coef(object)[[2]]))
    select[[2]] <- 0
  else
    select[[2]] <- select[[2]]

  # tabulate how often each base-learner was selected
  # (count() is assumed to come from the plyr package)
  Var <- lapply(1:length(parameter), function(i) {
    count(select[[i]])[2]
  })
  # If a parameter is not selected
  # zero <- sapply(1:length(parameter),function(i){length(select[[i]])})
  # Risk.Var <- lapply(which(zero != 0),function(i){sapply(1:dim(Var[[i]])[1],function(j){sum(diffRiskRed[[i]][which(count(select[[i]])[[1]][j] == select[[i]])])})})

  # total risk reduction attributable to each selected base-learner
  Risk.Var <- lapply(1:2, function(i) {
    sapply(1:dim(Var[[i]])[1], function(j) {
      sum(diffRiskRed[[i]][which(count(select[[i]])[[1]][j] == select[[i]])])
    })
  })

  w.parameter <- c(rep(parameter[1], length(count(select[[1]])[[1]])),
                   rep(parameter[2], length(count(select[[2]])[[1]])))
  n.parameter <- c(names(object[[1]]$coef()), names(object[[2]]$coef()))
  if ('(Intercept)' %in% n.parameter)
    n.parameter <- n.parameter[-which(n.parameter == '(Intercept)')]

  Risk.Var_1 <- unlist(Risk.Var)[!is.na(unlist(Risk.Var))]
  if (any(Risk.Var_1 == 0))
    Risk.Var_1 <- Risk.Var_1[!Risk.Var_1 == 0]

  Risk.order <- data.frame(w.parameter, n.parameter, Risk.Var_1)
  colnames(Risk.order) <- c('parameter', 'VarName', 'Risk')
  Risk.order <- Risk.order[order(Risk.order$Risk), ]
  Risk.order$CumRisk <- cumsum(Risk.order$Risk)

  # keep base-learners whose (cumulative) risk reduction exceeds tau * total
  perc <- ifelse(is.null(tau), 0.01, tau)
  percRiskRed <- sum(Risk.order$Risk) * perc
  if (method[1] == 'attributable') {
    RiskRedOver <- Risk.order[which(Risk.order$Risk > percRiskRed), ]
  } else {
    RiskRedOver <- Risk.order[which(Risk.order$CumRisk > percRiskRed), ]
  }

  if (is.na(parameter[2]))
    parameter[2] <- 0
  if (is.na(parameter[1]))
    parameter[1] <- 0

  # if(length(name.response) > 1) name.response <- paste("cbind(",paste(name.response, collapse = ","),")")
  name.response <- "y"
  # formulas containing only the retained base-learners
  if (any(RiskRedOver$parameter == parameter[1])) {
    form1 <- as.formula(paste(name.response, " ~ ", paste(
      gsub("x", "x.", RiskRedOver$VarName[RiskRedOver$parameter == parameter[1]]),
      collapse = "+")))
  } else {
    form1 <- as.formula(paste(name.response, '~1'))
  }
  if (any(RiskRedOver$parameter == parameter[2])) {
    form2 <- as.formula(paste(name.response, " ~ ", paste(
      gsub("x", "x.", RiskRedOver$VarName[RiskRedOver$parameter == parameter[2]]),
      collapse = "+")))
  } else {
    form2 <- as.formula(paste(name.response, '~1'))
  }
  formula <- list(form1, form2)
  names(formula) <- names(object)

  dfbase <-
    environment(environment(environment(object[[1]][["fitted"]])[["RET"]][["baselearner"]][[1]][["model.frame"]])[["ret"]][["model.frame"]])[["df"]]
  colnames(data$x) <- paste0("x.", colnames(data$x))
  data <- data.frame(cbind(y = data$y, x = data$x))

  # refit the model with the reduced formulas, keeping method, weights and step lengths
  if (inherits(object, "nc_mboostLSS")) {
    if (inherits(object, "glmboostLSS")) {
      model_after <- glmboostLSS(
        formula, data = data, families = fam, method = 'noncyclic',
        weights = model.weights(object),
        control = boost_control(mstop = mstop(object),
                                nu = list(as.numeric(object[[1]]$control$nu),
                                          as.numeric(object[[2]]$control$nu))))
    } else {
      model_after <- gamboostLSS(
        formula, data = data, families = fam, method = 'noncyclic',
        weights = model.weights(object),
        control = boost_control(mstop = mstop(object),
                                nu = list(as.numeric(object[[1]]$control$nu),
                                          as.numeric(object[[2]]$control$nu))))
    }
  } else {
    if (inherits(object, "glmboostLSS")) {
      model_after <- glmboostLSS(
        formula, data = data, families = fam, method = 'cyclic',
        weights = model.weights(object),
        control = boost_control(mstop = mstop(object),
                                nu = list(as.numeric(object[[1]]$control$nu),
                                          as.numeric(object[[2]]$control$nu))))
    } else {
      model_after <- gamboostLSS(
        formula, data = data, families = fam, method = 'cyclic',
        weights = model.weights(object),
        control = boost_control(mstop = mstop(object),
                                nu = list(as.numeric(object[[1]]$control$nu),
                                          as.numeric(object[[2]]$control$nu))))
    }
  }

  out <- model_after
  Coef <- coef(model_after)
  deselect_para <- list(coef = Coef,
                        tau = perc,
                        deselectmethod = method[1])
  out <- append(x = out, values = deselect_para)
  class(out) <- c(class(out))
  return(out)
}
ERG <- mclapply(id, FUN = AMP_LSS_desel, mc.cores = 11,
                mc.preschedule = FALSE)
mean(unlist(sapply(ERG, function(x)x$selmu)))
[1] 19.14085
mean(unlist(sapply(ERG, function(x)x$selsi)))
[1] 2.140845
mean(unlist(sapply(ERG, function(x)x$msep)))
[1] 0.2174377
median(sqrt(unlist(sapply(ERG, function(x)x$msep))))
[1] 0.2377528
median(unlist(sapply(ERG, function(x)x$mstop)))
[1] 49
mean(unlist(sapply(ERG, function(x)x$ls_desel)))
[1] -0.7983609