library(gamlss)
library(gamlss.ggplots)
library(ggplot2)
9: Speech intelligibility testing
The dataset speech
is in the gamlss.data package.
data(speech)
Figure 9.1: Histogram of proportion of words correctly recognised
ggplot(speech, aes(x=y)) +
geom_histogram(binwidth = 0.04, colour="darkblue", fill="lightblue") +
xlab("Proportion correct") +
ylab("Frequency") +
theme_bw(base_size=14)
Section 9.2.1: Choice of response distribution
Use the gamlss function chooseDist()
to pick the distribution for y
\(\in (0,1)\), starting with the Beta (BE
) distribution:
.01 <- subset(speech, (y>0&y<1))
speech<- gamlss(y~snr,
m1 sigma.formula =~ snr,
nu.formula =~ snr,
tau.formula =~ snr,
family=BE, data=speech.01, n.cyc=100, trace=FALSE)
<- chooseDist(m1, type="real0to1") a
minimum GAIC(k= 2 ) family: SIMPLEX
minimum GAIC(k= 3.84 ) family: SIMPLEX
minimum GAIC(k= 7.67 ) family: SIMPLEX
The SIMPLEX
is the preferred distribution for all three criteria.
Create the zero-and-one-inflated SIMPLEX
distribution:
library(gamlss.inf)
gen.Inf0to1(family="SIMPLEX", type.of.Inflation = "Zero&One")
A 0to1 inflated SIMPLEX distribution has been generated
and saved under the names:
dSIMPLEXInf0to1 pSIMPLEXInf0to1 qSIMPLEXInf0to1 rSIMPLEXInf0to1
plotSIMPLEXInf0to1
9.2.2: Variable selection
library(gtools)
library(sjmisc)
Generate all model formulae to be considered:
<-c('snr', 'noise_dir','noise_gender','algorithm')
vars<-unique(apply(combinations(length(vars), length(vars), repeats=T), 1, unique))
indexes<-function(x) paste('~',paste( vars[x],collapse=' + '))
gen.form<-function(x) paste( interaction.vars[x],collapse=' + ')
gen.form.int<-sapply(indexes, gen.form) ## main effects
formulas.main <- c(formulas.main, "~ 1")
formulas.main <- c(indexes,0)
indexes <- formulas.main
formulas
# interaction terms (only interactions with algorithm)
for(i in 1:length(formulas))
{<- length(indexes[[i]])
m if(m>1)
{<- combinations(m,2, v=indexes[[i]])
interaction.indexes <- paste(vars[interaction.indexes[,1]], ":",vars[interaction.indexes[,2]])
interaction.vars # exclude interactions without algorithm
<- NULL
d for(j in 1:length(interaction.vars))d <- c(d, str_contains(interaction.vars[j], "algorithm"))
<- interaction.vars[d]
interaction.vars if(sum(d)>0)
{<-unique(apply(combinations(length(interaction.vars), length(interaction.vars),
i.indexesrepeats=T), 1, unique))
<-sapply(i.indexes, gen.form.int)
interaction.terms<- c(formulas, (paste(formulas[i]," + ", interaction.terms)))
formulas
}
}
}
<- length(formulas) n
In Figure 9.2 we show the results for \(\kappa\in \{2, 3, 4, 5, 6, 7, 8, \log(n)\}\). Here we illustrate \(\kappa=4\):
<- 4
k
# starting formulas for sigma, xi0 and xi1
<- "~ 1"
sigma.form <- xi1.form <- "~ snr"
xi0.form
for(N in 1:2) ## number of times the selection is repeated
{<- NULL
mu.AIC for(i in 1:n)
{eval(parse(text=paste0("m1 <- gamlssInf0to1(y=y, mu.formula =",
formulas[i], " + random(subject), sigma.formula =", sigma.form,
", xi0.formula =",xi0.form,
"+ random(subject), xi1.formula =",xi1.form,
"+ random(subject), family=SIMPLEX, data=speech, n.cyc=100)")))
<- c(mu.AIC, GAIC(m1, k=k))
mu.AIC
}<- formulas[which.min(mu.AIC)]
mu.form
<- NULL
sigma.AIC for(i in 1:n)
{eval(parse(text=paste0("m1 <- gamlssInf0to1(y=y, mu.formula =",mu.form,
", sigma.formula =", formulas[i],
", xi0.formula =",xi0.form, ", xi1.formula =",xi1.form,
",family=SIMPLEX, data=speech, n.cyc=100)")))
<- c(sigma.AIC, GAIC(m1, k=k))
sigma.AIC
}<- formulas[which.min(sigma.AIC)]
sigma.form
<- NULL
xi0.AIC for(i in 1:n)
{eval(parse(text=paste0("m1 <- gamlssInf0to1(y=y, mu.formula =",mu.form,
" + random(subject), sigma.formula =", sigma.form,
", xi0.formula =",formulas[i],
" + random(subject), xi1.formula =",xi1.form,
" + random(subject),family=SIMPLEX, data=speech, n.cyc=100)")))
<- c(xi0.AIC, GAIC(m1, k=k))
xi0.AIC
}<- formulas[which.min(xi0.AIC)]
xi0.form
<- NULL
xi1.AIC for(i in 1:n)
{eval(parse(text=paste0("m1 <- gamlssInf0to1(y=y, mu.formula =",mu.form,
" + random(subject), sigma.formula =", sigma.form,
", xi0.formula =",xi0.form,
" + random(subject), xi1.formula =",formulas[i],
" + random(subject),family=SIMPLEX, data=speech, n.cyc=100)")))
<- c(xi1.AIC, GAIC(m1, k=k))
xi1.AIC
}<- formulas[which.min(xi1.AIC)]
xi1.form
eval(parse(text=paste0("model <- gamlssInf0to1(y=y, mu.formula =",mu.form,
" + random(subject), sigma.formula =", sigma.form,
", xi0.formula =",xi0.form,
" + random(subject), xi1.formula =",xi1.form,
" + random(subject),family=SIMPLEX, data=speech, n.cyc=100)")))
## end N }
This is the chosen model, given in Table 9.2:
summary(model)
*******************************************************************
Family: "InfSIMPLEX"
Call: gamlssInf0to1(y = correct, mu.formula = ~snr + noise_gender +
random(subject), sigma.formula = ~1, xi0.formula = ~snr +
noise_gender + algorithm + snr:algorithm + random(subject),
xi1.formula = ~snr + noise_gender + algorithm + random(subject),
data = bilateral, family = SIMPLEX, n.cyc = 100)
Fitting method: RS()
-------------------------------------------------------------------
Mu link function: logit
Mu Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.211299 0.038645 -5.468 4.72e-08 ***
snr 0.057692 0.006882 8.383 < 2e-16 ***
noise_genderM 0.109124 0.038102 2.864 0.0042 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-------------------------------------------------------------------
Sigma link function: log
Sigma Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 0.82636 0.01925 42.93 <2e-16 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-------------------------------------------------------------------
xi0 link function: log
xi0 Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.13948 0.08344 13.656 < 2e-16 ***
snr -0.29338 0.02197 -13.352 < 2e-16 ***
noise_genderM -0.31877 0.05819 -5.478 4.45e-08 ***
algorithmB -0.33815 0.10217 -3.310 0.000939 ***
algorithmC -0.38921 0.10495 -3.708 0.000210 ***
snr:algorithmB 0.12101 0.02760 4.385 1.18e-05 ***
snr:algorithmC 0.03431 0.02914 1.178 0.239037
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-------------------------------------------------------------------
xi1 link function: log
xi1 Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -0.87799 0.07645 -11.485 < 2e-16 ***
snr 0.22358 0.01110 20.151 < 2e-16 ***
noise_genderM 0.30756 0.05790 5.311 1.12e-07 ***
algorithmB -0.09953 0.06847 -1.454 0.146
algorithmC -0.37059 0.06889 -5.380 7.72e-08 ***
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
-------------------------------------------------------------------
NOTE: Additive smoothing terms exist in the formulas:
i) Std. Error for smoothers are for the linear effect only.
ii) Std. Error for the linear terms may not be reliable.
-------------------------------------------------------------------
No. of observations in the fit: 6720
Degrees of Freedom for the fit: 32.65723
Residual Deg. of Freedom: 6687.343
at cycle:
Global Deviance: 12228.7
AIC: 12294.02
SBC: 12516.51
*******************************************************************
Figure 9.3: Worm plot
resid_wp(model, title=NULL, value=10)
Section 9.3: Compute fitted values and bootstrap confidence intervals
Function pred.re() to compute fitted values
The generic pred()
function was found not to work when the model includes random effects. The following function pred.re()
does it:
<- function(mod, newdata, z=0, random=FALSE, seed=3876)
pred.re
{## obtains predictions for SIMPLEX01 (predict doesn't work with random effects)
<- newdata
preds <- data.frame(subject=unique(preds$subject))
re <- nrow(re)
m if(random)
{set.seed(seed)
$mu.re <- rnorm(m)
re$sigma.re <- rnorm(m)
re$xi0.re <- rnorm(m)
re$xi1.re <- rnorm(m)
reelse re$mu.re <- re$sigma.re <- re$xi0.re <- re$xi1.re <- z
} <- rep(re$mu.re, times=unclass(table(preds$subject)))
mu.re.expanded <- rep(re$sigma.re, times=unclass(table(preds$subject)))
sigma.re.expanded <- rep(re$xi0.re, times=unclass(table(preds$subject)))
xi0.re.expanded <- rep(re$xi1.re, times=unclass(table(preds$subject)))
xi1.re.expanded
# leave out random effects
$mu.coefficients <- na.omit(mod$mu.coefficients)
mod$sigma.coefficients <- na.omit(mod$sigma.coefficients)
mod$xi0.coefficients <- na.omit(mod$xi0.coefficients)
mod$xi1.coefficients <- na.omit(mod$xi1.coefficients)
mod
<- length(mod$mu.coefficients)
pmu <- length(mod$sigma.coefficients)
psigma <- length(mod$xi0.coefficients)
pxi0 <- length(mod$xi1.coefficients)
pxi1
# obtain summary object, but suppress summary output
sink(file="temp")
=summary(mod)
asink()
file.remove("temp")
<- a[1:pmu,2]
mu.se <- a[(pmu+1):(pmu+psigma),2]
sigma.se <- a[(pmu+psigma+1):(pmu+psigma+pxi0),2]
xi0.se <- a[(pmu+psigma+pxi0+1):(pmu+psigma+pxi0+pxi1),2]
xi1.se
<- rep(mod$mu.coefficients[1], nrow(newdata))
mu.lp for(j in 2:length(mod$mu.coefficients))
{<- names(mod$mu.coefficients)[j]
var if(var!="random(subject)")
<- mu.lp + newdata[,which(names(newdata)==var)]*mod$mu.coefficients[j]
mu.lp else mu.lp <- mu.lp + mu.re.expanded*mu.se[j]
}$mu <- exp(mu.lp)/(1+exp(mu.lp))
preds
<- length(mod$sigma.coefficients)
p <- rep(mod$sigma.coefficients[1], nrow(newdata))
sigma.lp if(p>1)for(j in 2:(p-1))
<-
sigma.lp +
sigma.lp which(names(newdata)==names(mod$sigma.coefficients)[j])]*mod$sigma.coefficients[j]
newdata[,$sigma <- exp(sigma.lp)
preds
<- rep(mod$xi0.coefficients[1], nrow(newdata))
xi0.lp for(j in 2:length(mod$xi0.coefficients))
{<- names(mod$xi0.coefficients)[j]
var if(var!="random(subject)")
<- xi0.lp + newdata[,which(names(newdata)==var)]*mod$xi0.coefficients[j]
xi0.lp else xi0.lp <- xi0.lp + xi0.re.expanded*xi0.se[j]
}$xi0 <- exp(xi0.lp)
preds
<- rep(mod$xi1.coefficients[1], nrow(newdata))
xi1.lp for(j in 2:length(mod$xi1.coefficients))
{<- names(mod$xi1.coefficients)[j]
var if(var!="random(subject)")
<- xi1.lp + newdata[,which(names(newdata)==var)]*mod$xi1.coefficients[j]
xi1.lp else xi1.lp <- xi1.lp + xi1.re.expanded*xi1.se[j]
}$xi1 <- exp(xi1.lp)
preds
$p0 <- preds$xi0/(1+preds$xi0+preds$xi1)
preds$p1 <- preds$xi1/(1+preds$xi0+preds$xi1)
preds$overallmean <- preds$mu*(1-preds$p0-preds$p1) + preds$p1
preds
return(preds)
}
Compute and plot fitted values for \(\xi_0\), \(\xi_1\) and percent overall correct
Set up the data frame newdata
for prediction:
library(fastDummies)
<- c("F","M")
g <- c("A","B","C")
a <- seq(-5,15, by=0.25)
s <- data.frame(expand.grid(g,a,s))
newdata names(newdata) <- c("noise_gender", "algorithm","snr")
$noise_dir <- "F"
newdata$correct <- 0
newdata$subject <- "B2"
newdata<- dummy_cols(newdata, select_columns=c("noise_gender", "algorithm","noise_dir"),
newdata remove_first_dummy = TRUE)
names(newdata)[7:10] <- c("noise_genderM", "algorithmB" , "algorithmC" , "noise_dirS")
$'snr:algorithmB' <- newdata$snr*newdata$algorithmB
newdata$'snr:algorithmC' <- newdata$snr*newdata$algorithmC
newdata$'noise_dirS:algorithmB' <- newdata$noise_dirS*newdata$algorithmB
newdata$'noise_dirS:algorithmC' <- newdata$noise_dirS*newdata$algorithmC
newdata$'noise_genderM:algorithmB' <- newdata$noise_genderM*newdata$algorithmB
newdata$'noise_genderM:algorithmC' <- newdata$noise_genderM*newdata$algorithmC newdata
Compute predictions:
<- pred.re(model, newdata, z=0) preds
Plot predictions
- Fitted proportion zero correct: \(\hat\xi_0\)
ggplot(preds, aes(x=snr, y=p0, colour=algorithm)) +
geom_line(size=1.0) +
facet_grid(.~noise_gender)+
ylab("Fitted proportion zero correct") + xlab("SNR") +
scale_y_continuous(labels=scales::percent) +
theme_bw()
- Fitted proportion all correct: \(\hat\xi_1\)
ggplot(preds, aes(x=snr, y=p1, colour=algorithm)) +
geom_line(size=1.0) +
facet_grid(.~noise_gender)+
ylab("Fitted proportion all correct") + xlab("SNR") +
scale_y_continuous(labels=scales::percent) +
theme_bw()
- Fitted overall mean correct (equation (9.4)) \[ \hat{\mathbb{E}}(y)=\frac{\hat{\mu}+\hat{\xi}_1}{1+\hat{\xi}_0+\hat{\xi}_1} \]
ggplot(preds, aes(x=snr, y=overallmean, colour=algorithm)) +
geom_line(size=1.0) +
facet_grid(.~noise_gender)+
ylab("Fitted overall mean") + xlab("SNR") +
scale_y_continuous(labels=scales::percent) +
theme_bw()
Figure 9.4
Compute bootstrap confidence intervals
# original sample
<- speech[,c("noise_gender","algorithm" , "snr" , "noise_dir" , "y" ,"subject" )]
original <-
original dummy_cols(original, select_columns=c("noise_gender", "algorithm", "noise_dir"),
remove_first_dummy = TRUE)
names(original)[7:10] <- c("noise_genderM", "algorithmB", "algorithmC", "noise_dirS")
$'snr:algorithmB' <- original$snr*original$algorithmB
original$'snr:algorithmC' <- original$snr*original$algorithmC
original$'noise_dirS:algorithmB' <- original$noise_dirS*original$algorithmB
original$'noise_dirS:algorithmC' <- original$noise_dirS*original$algorithmC
original$'noise_genderM:algorithmB' <- original$noise_genderM*original$algorithmB
original$'noise_genderM:algorithmC' <- original$noise_genderM*original$algorithmC
original
# this takes a while!
<- 500
B <- nrow(original)
n <- original
bootsam <- bootp0 <- bootp1 <- NULL
bootmeans set.seed(9371)
<- pred.re(model, original, random=TRUE)
preds
for(b in 1:B)
{cat(b,"\n")
<- rSIMPLEXInf0to1(n, mu=preds$mu, sigma=preds$sigma, xi0=preds$xi0, xi1=preds$xi1)
yboot $correct <- yboot
bootsameval(parse(text=paste0("mboot <- gamlssInf0to1(y=correct, mu.formula =",
paste("~ ",paste(names(model$mu.coefficients)[-1], collapse=" + ")),
", sigma.formula = ~1, xi0.formula =",
paste("~ ",paste(names(model$xi0.coefficients)[-1], collapse=" + ")),
", xi1.formula =",
paste("~ ",paste(names(model$xi1.coefficients)[-1], collapse=" + ")),
", family=SIMPLEX, data=bootsam, n.cyc=100)")))
<- cbind(bootmeans, pred.re(mboot, newdata)$overallmean )
bootmeans <- cbind(bootp0, pred.re(mboot, newdata)$p0 )
bootp0 <- cbind(bootp1, pred.re(mboot, newdata)$p1 )
bootp1
}
<- pred.re(model, newdata)
bootdata <-
bootdata cbind(bootdata,
lower=apply(bootmeans, 1, quantile, probs=0.025),
upper=apply(bootmeans, 1, quantile, probs=0.975),
p0lower=apply(bootp0, 1, quantile, probs=0.025),
p0upper=apply(bootp0, 1, quantile, probs=0.975),
p1lower=apply(bootp1, 1, quantile, probs=0.025),
p1upper=apply(bootp1, 1, quantile, probs=0.975))
Plot predictions with bootstrap confidence intervals
- Fitted proportion zero correct: \(\hat\xi_0\)
ggplot(bootdata, aes(x=snr,y=p0,colour=algorithm, fill=algorithm)) +
geom_line(linewidth=1.0) +
geom_ribbon(aes(ymin = p0lower, ymax = p0upper), alpha = 0.3)+
facet_grid(.~noise_gender)+
ylab("Fitted proportion zero correct") + xlab("SNR") +
scale_y_continuous()+
theme_bw()
- Fitted proportion all correct: \(\hat\xi_1\)
ggplot(bootdata, aes(x=snr,y=p1,colour=algorithm, fill=algorithm)) +
geom_line(linewidth=1.0) +
geom_ribbon(aes(ymin = p1lower, ymax = p1upper), alpha = 0.3)+
facet_grid(.~noise_gender)+
ylab("Fitted proportion all correct") + xlab("SNR") +
scale_y_continuous()+
theme_bw()
- Fitted overall mean correct (equation (9.4)) \[ \hat{\mathbb{E}}(y)=\frac{\hat{\mu}+\hat{\xi}_1}{1+\hat{\xi}_0+\hat{\xi}_1} \]
ggplot(bootdata, aes(x=snr,y=overallmean,colour=algorithm, fill=algorithm)) +
geom_line(linewidth=1.0) +
geom_ribbon(aes(ymin = lower, ymax = upper), alpha = 0.3)+
facet_grid(.~noise_gender)+
ylab("Fitted mean proportion correct") + xlab("SNR") +
scale_y_continuous()+
theme_bw()