library(gamlss)
library(gamlss.ggplots)
library(gamlss.add)
library(ggplot2)
library(dplyr)
library(gamboostLSS)
10: Social media post performance
data(fbdata)
Figure 10.1
Facebook data: histogram of the number of shares, truncated for display at 100 and overlaid with density estimate.
%>% dplyr::select(share) %>%
fbdata filter(share<101) %>% ## truncate at 100
ggplot(aes(share)) +
geom_histogram(aes(y=after_stat(density)), colour="darkblue", fill="lightblue", binwidth=1) +
theme_bw(base_size=12) +
xlab("Number of shares") +
ylab("Density") +
stat_density(alpha=.2, fill="#FF6666", bw=1.8, outline.type="upper", colour="black")+
theme(axis.text.x=element_text(size=14))+
theme(axis.text.y=element_text(size=14))+
theme(axis.title.x = element_text(size=14),
axis.title.y = element_text(size=14))
Model building
Select the response distribution
The gamlss cyclic P-spline function
pbc()
has been used for the smooth cyclic terms forPost.Hour
,Post.Weekday
andPost.Month
.Because of numerical instability of the
pbc(Post.Month)
term, the number of break points (knots) was set at 5 for this term (inter=5
).
<- gamlss(share ~ Category + pbc(Post.Month, control=pbc.control(inter=5)) + pbc(Post.Weekday),
m0 sigma.formula = ~ pbc(Post.Month, control=pbc.control(inter=5)) +
+ pbc(Post.Weekday),
Category nu.formula = ~ Category,
tau.formula = ~ Category,
data = fbdata, family=PO)
GAMLSS-RS iteration 1: Global Deviance = 14158.62
GAMLSS-RS iteration 2: Global Deviance = 14158.62
<- chooseDist(m0, type="count") T1
minimum GAIC(k= 2 ) family: ZASICHEL
minimum GAIC(k= 3.84 ) family: ZASICHEL
minimum GAIC(k= 6.2 ) family: SI
order(T1[,1])[1:5],] ## top 5 in order for AIC T1[
2 3.84 6.2
ZASICHEL 4087.826 4117.266 4155.026
ZISICHEL 4092.669 4122.109 4159.869
ZABNB 4092.753 4122.193 4159.953
SI 4095.530 4119.450 4150.130
SICHEL 4097.770 4121.690 4152.370
order(T1[,3])[1:5],] ## top 5 in order for BIC T1[
2 3.84 6.2
SI 4095.530 4119.450 4150.130
SICHEL 4097.770 4121.690 4152.370
ZASICHEL 4087.826 4117.266 4155.026
BNB 4102.049 4125.969 4156.649
ZISICHEL 4092.669 4122.109 4159.869
Select covariates
The gamlss function stepGAICAll.A()
was used for covariate selection, with the AIC as criterion (the default). The lengthy output of stepGAICAll.A()
shown below can be suppressed with the argument trace=FALSE
.
# starting model
<- gamlss(share~1, data = fbdata, family=SICHEL, method=mixed(20,100), trace=FALSE)
m1
<- stepGAICAll.A(m1,
m.SICHEL scope=list(lower=~1,
upper = ~ Category + Type + pbc(Post.Weekday) +
pbc(Post.Month, control=pbc.control(inter=5)) +
pbc(Post.Hour) + Paid))
---------------------------------------------------
Distribution parameter: mu
Start: AIC= 4280
share ~ 1
Df AIC
+ Category 2.0000 4186.3
+ pbc(Post.Hour) 8.3218 4252.8
+ pbc(Post.Weekday) 1.0000 4266.6
+ Type 2.0000 4272.2
+ Paid 1.0000 4275.0
+ pbc(Post.Month, control = pbc.control(inter = 5)) 1.0000 4278.9
<none> 4280.0
Step: AIC= 4186.32
share ~ Category
Df AIC
+ pbc(Post.Weekday) 1.0000 4168.5
+ pbc(Post.Hour) 1.0003 4184.4
+ pbc(Post.Month, control = pbc.control(inter = 5)) 1.0000 4184.6
<none> 4186.3
+ Paid 1.0000 4186.4
+ Type 2.0000 4189.0
Step: AIC= 4168.46
share ~ Category + pbc(Post.Weekday)
Df AIC
+ pbc(Post.Month, control = pbc.control(inter = 5)) 1.0000 4166.3
+ pbc(Post.Hour) 1.0003 4167.0
<none> 4168.5
+ Paid 1.0000 4170.1
+ Type 2.0000 4171.0
Step: AIC= 4166.25
share ~ Category + pbc(Post.Weekday) + pbc(Post.Month, control = pbc.control(inter = 5))
Df AIC
+ pbc(Post.Hour) 4.5673 4156.3
<none> 4166.3
+ Paid 1.0000 4167.5
+ Type 2.0000 4167.8
Step: AIC= 4156.28
share ~ Category + pbc(Post.Weekday) + pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour)
Df AIC
+ Type 2.2164 4155.3
<none> 4156.3
+ Paid 1.0240 4157.8
Step: AIC= 4155.31
share ~ Category + pbc(Post.Weekday) + pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour) + Type
Df AIC
<none> 4155.3
+ Paid 0.97896 4156.8
---------------------------------------------------
Distribution parameter: sigma
Start: AIC= 4155.31
~1
Df AIC
+ Category 1.14140 4134.8
+ pbc(Post.Month, control = pbc.control(inter = 5)) 1.17363 4147.4
+ pbc(Post.Weekday) 1.01504 4151.0
+ pbc(Post.Hour) 0.62380 4151.8
<none> 4155.3
+ Paid 0.97953 4157.2
+ Type 1.94709 4157.6
Step: AIC= 4134.76
~Category
Df AIC
+ pbc(Post.Month, control = pbc.control(inter = 5)) 1.64616 4127.2
+ pbc(Post.Weekday) 0.85811 4131.8
+ Paid 1.18304 4133.7
+ pbc(Post.Hour) 0.84724 4134.4
<none> 4134.8
+ Type 1.95036 4135.9
Step: AIC= 4127.24
share ~ Category + pbc(Post.Month, control = pbc.control(inter = 5))
Df AIC
+ pbc(Post.Hour) 0.76260 4118.3
+ pbc(Post.Weekday) 0.89129 4123.1
<none> 4127.2
+ Paid 0.98746 4127.9
+ Type 1.89642 4130.1
Step: AIC= 4118.3
share ~ Category + pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour)
Df AIC
+ pbc(Post.Weekday) 0.93208 4113.8
<none> 4118.3
+ Paid 0.98401 4119.1
+ Type 1.99804 4121.5
Step: AIC= 4113.8
share ~ Category + pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour) + pbc(Post.Weekday)
Df AIC
<none> 4113.8
+ Paid 0.96737 4114.6
+ Type 1.99660 4117.7
---------------------------------------------------
Distribution parameter: nu
Start: AIC= 4113.8
~1
Df AIC
+ pbc(Post.Month, control = pbc.control(inter = 5)) -2.2657 4089.5
+ Category -1.2657 4090.7
+ pbc(Post.Hour) -2.2656 4098.6
+ pbc(Post.Weekday) -2.2657 4099.8
+ Paid -2.2656 4108.5
+ Type 2.4285 4110.0
<none> 4113.8
Step: AIC= 4089.55
share ~ pbc(Post.Month, control = pbc.control(inter = 5))
Df AIC
+ Category 2.0001 4069.6
+ Type 2.0000 4081.0
+ Paid 1.0000 4085.5
+ pbc(Post.Weekday) 1.0000 4088.3
+ pbc(Post.Hour) 1.0000 4089.3
<none> 4089.5
- pbc(Post.Month, control = pbc.control(inter = 5)) -2.2657 4113.8
Step: AIC= 4069.59
share ~ pbc(Post.Month, control = pbc.control(inter = 5)) + Category
Df AIC
+ Type 1.99999 4061.9
<none> 4069.6
+ pbc(Post.Hour) 1.00002 4070.1
+ pbc(Post.Weekday) 1.00000 4070.6
+ Paid 0.99994 4072.0
- Category 2.00008 4089.5
- pbc(Post.Month, control = pbc.control(inter = 5)) 1.00010 4090.7
Step: AIC= 4061.92
share ~ pbc(Post.Month, control = pbc.control(inter = 5)) + Category +
Type
Df AIC
+ pbc(Post.Weekday) 0.99998 4058.8
+ pbc(Post.Hour) 1.00010 4060.3
<none> 4061.9
+ Paid 0.99995 4064.9
- Type 1.99999 4069.6
- Category 2.00004 4081.0
- pbc(Post.Month, control = pbc.control(inter = 5)) 1.00010 4085.9
Step: AIC= 4058.83
share ~ pbc(Post.Month, control = pbc.control(inter = 5)) + Category +
Type + pbc(Post.Weekday)
Df AIC
+ pbc(Post.Hour) 1.00008 4058.3
<none> 4058.8
- pbc(Post.Weekday) 0.99998 4061.9
+ Paid 0.99997 4066.3
- Category 2.00000 4069.0
- Type 1.99997 4070.6
- pbc(Post.Month, control = pbc.control(inter = 5)) 1.00008 4082.3
Step: AIC= 4058.34
share ~ pbc(Post.Month, control = pbc.control(inter = 5)) + Category +
Type + pbc(Post.Weekday) + pbc(Post.Hour)
Df AIC
<none> 4058.3
- pbc(Post.Hour) 1.00008 4058.8
- pbc(Post.Weekday) 0.99996 4060.3
+ Paid 0.99993 4066.3
- Category 2.00005 4069.3
- pbc(Post.Month, control = pbc.control(inter = 5)) 1.00005 4070.3
- Type 2.00003 4071.3
Final SICHEL model
summary(m.SICHEL)
******************************************************************
Family: c("SICHEL", "Sichel")
Call: gamlss(formula = share ~ Category + pbc(Post.Weekday) +
pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour) + Type, sigma.formula = ~Category +
pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour) + pbc(Post.Weekday), nu.formula = ~pbc(Post.Month,
control = pbc.control(inter = 5)) + Category +
Type + pbc(Post.Weekday) + pbc(Post.Hour), family = SICHEL,
data = fbdata, method = mixed(20, 100), trace = FALSE)
Fitting method: mixed(20, 100)
------------------------------------------------------------------
Mu link function: log
Mu Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 2.85863 0.19238 14.860 < 2e-16 ***
Category2 0.37931 0.10547 3.597 0.000356 ***
Category3 0.52690 0.09557 5.513 5.8e-08 ***
TypePhoto/Video 0.29027 0.19541 1.485 0.138094
TypeStatus 0.63849 0.22728 2.809 0.005172 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
------------------------------------------------------------------
Sigma link function: log
Sigma Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.9660 0.2794 7.037 6.96e-12 ***
Category2 -0.3813 0.4483 -0.850 0.39555
Category3 -1.1623 0.4073 -2.854 0.00451 **
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
------------------------------------------------------------------
Nu link function: identity
Nu Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) 1.4902 0.3444 4.327 1.85e-05 ***
Category2 -3.6145 0.3292 -10.981 < 2e-16 ***
Category3 -3.1493 0.2565 -12.280 < 2e-16 ***
TypePhoto/Video -1.3031 0.3560 -3.660 0.00028 ***
TypeStatus 0.5194 0.5144 1.010 0.31321
---
Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
------------------------------------------------------------------
NOTE: Additive smoothing terms exist in the formulas:
i) Std. Error for smoothers are for the linear effect only.
ii) Std. Error for the linear terms maybe are not accurate.
------------------------------------------------------------------
No. of observations in the fit: 495
Degrees of Freedom for the fit: 22.00045
Residual Deg. of Freedom: 472.9996
at cycle: 9
Global Deviance: 4014.34
AIC: 4058.341
SBC: 4150.843
******************************************************************
Figure 10.3
Facebook data: Worm plot, Sichel response distribution model (10.1). Twenty realizations of randomized quantile residuals are shown.
rqres.plot(m.SICHEL, howmany=20, plot.type="all", ylim=c(-1.5,1.5))
Table 10.2: Prediction scores
The prediction scores in Table 10.2 (MSEP, LS, CRPS, MAPE) are all based on \(K\)-fold cross-validation. The random number generation for obtaining the folds is platform-dependent, so the reader may get results which are numerically slightly different to those obtained here (and reported in Table 10.2).
Alternative models
Neural network: m.NN
# predictors are standardized for more stable results and faster convergence in nn():
<- mutate(fbdata,
fbdata type3 = as.numeric(Type),
Category.s = as.numeric(Category)/3,
Post.Weekday.s = Post.Weekday/7,
Post.Month.s = Post.Month/12,
Post.Hour.s = Post.Hour/24,
type3.s = type3/3)
set.seed(1494)
<- gamlss(share ~ nn(~ Category.s + Post.Weekday.s + Post.Month.s +
m.NN + type3, size=3, decay=0.1),
Post.Hour.s sigma.formula = ~ nn(~ Category.s + Post.Weekday.s + Post.Month.s +
+ type3, size=3, decay=0.1),
Post.Hour.s nu.formula = nn(~Category.s + Post.Weekday.s + Post.Month.s +
+ type3, size=3, decay=0.1),
Post.Hour.s family = SICHEL, data = fbdata, n.cyc=100)
Boosting: m.boost
<- gamboostLSS(share ~ bols(Category) + bbs(Post.Weekday) + bbs(Post.Month) +
m.boost bbs(Post.Hour) + bols(Type),
data = fbdata,
families = as.families("SICHEL"),
method = "noncyclic",
control = boost_control(nu = 0.1))
Functions for computing the prediction scores
<- function(data, K, seed=2346){
kfold ## sets up vector of indices for K-fold cross-validation
set.seed(seed)
<- nrow(data)
n <- floor(n/K)
m <- n%%K
rem $row <- 1:n
data<- data.frame(y=data$share, pred=NA)
CV <- sample(c(rep(1:K, each=m), sample(K, rem)))
I return(I)
}
<- function(model, data, K, seed=123)
kCV
{## produces model predictions using K-fold cross-validation
$row <- 1:nrow(data)
data<- data.frame(y=model$y, pred=NA)
CV <- kfold(data, K, seed) # vector of indices for K folds
I
for(k in 1:K)
{<- I!=k
Itrain <- data[Itrain,]
training.data <- data[!Itrain,]
testdata <- update(model, data=training.data)
mtrain $pred[testdata$row] <- predict(mtrain, newdata=testdata, type="response", data=training.data)
CV
}return(CV)
}
<- function(model, data, K, seed=123)
kCV.boost
{## produces model predictions using K-fold cross-validation
$row <- 1:nrow(data)
data<- data.frame(y=model$mu$response, pred=NA)
CV <- kfold(data, K, seed) # vector of indices for K folds
I
for(k in 1:K)
{<- I!=k
Itrain <- data[Itrain,]
training.data <- data[!Itrain,]
testdata ## The boosting model m.boost has to be hard-coded as it is not
## possible for the update() function to work for boosting
<- gamboostLSS(share ~ bols(Category) + bbs(Post.Weekday) + bbs(Post.Month) +
mtrain bbs(Post.Hour) + bols(Type),
data = training.data,
families = as.families("SICHEL"), method = "noncyclic",
control = boost_control(nu = 0.1))
<- cvrisk(mtrain, grid = 1:200)
cvr mstop(mtrain) <- mstop(cvr)
$pred[testdata$row] <- predict(mtrain, newdata=testdata, parameter="mu", type="response",
CVdata=training.data)
}return(CV)
}
Evaluation of MAPE and MSEP for model m.SICHEL
:
<- kCV(model=m.SICHEL, data=fbdata, K=10, seed=2345) kfold.gamlss
new prediction
new prediction
new prediction
new prediction
new prediction
new prediction
new prediction
new prediction
new prediction
new prediction
<- median(100*abs((kfold.gamlss$pred-kfold.gamlss$y)/(kfold.gamlss$y+1)))
MAPE <- mean((kfold.gamlss$pred-kfold.gamlss$y)^2)
MSEP MAPE
[1] 59.66588
MSEP
[1] 1810.404
Evaluation of MAPE and MSEP for model m.boost
:
<- kCV.boost(model=m.boost, data=fbdata, K=10, seed=2345) kfold.boost
Starting cross-validation...Starting cross-validation...Starting cross-validation...Starting cross-validation...Starting cross-validation...Starting cross-validation...Starting cross-validation...Starting cross-validation...Starting cross-validation...Starting cross-validation...
<- median(100*abs((kfold.boost$pred-kfold.boost$y)/(kfold.boost$y+1)))
MAPE <- mean((kfold.boost$pred-kfold.boost$y)^2)
MSEP MAPE
[1] 67.87735
MSEP
[1] 1784.676
Evaluation of MAPE and MSEP for model m.NN
:
<- kCV(model=m.NN, data=fbdata, K=10, seed=2345) kfold.NN
We encountered a problem in that the command
<- update(model, data=training.data) mtrain
does not work for the m.NN
model (even though it worked for m.SICHEL
). Even more strange, it worked when stepping through the commands manually. We will attempt to fix this, but in the meantime kfold.NN
was obtained by stepping through kCV()
manually.
<- median(100*abs((kfold.NN$pred-kfold.NN$y)/(kfold.NN$y+1)))
MAPE <- mean((kfold.NN$pred-kfold.NN$y)^2)
MSEP MAPE
[1] 59.62527
MSEP
[1] 1785.128
CRPS
<- function(model, data, K, seed, ymax=150)
kCRPS
{<- kfold(data, K, seed) # vector of indices for K folds
I $row <- 1:nrow(data)
data<- 0:ymax
y <- data.frame(row=1:nrow(data), y=model$y, mu=NA, sigma=NA, nu=NA)
predicted 6:(6+ymax)] <- NA
predicted[,
for(k in 1:K)
{<- I!=k
Itrain <- data[Itrain,]
training.data <- data[!Itrain,]
test.data <- try(update(model, data=training.data)) # doesn't work for NN model
mtrain <- predictAll(mtrain, data=training.data, newdata=test.data, type="response")
predtest $row, "mu"] <- predtest$mu
predicted[test.data$row, "sigma"] <- predtest$sigma
predicted[test.data$row, "nu"] <- predtest$nu
predicted[test.data
}
for(yi in y) predicted[,6+yi] <-
pSICHEL(yi, predicted$mu, predicted$sigma, predicted$nu)-as.numeric(yi>=predicted$y))^2
($CRPS <- -rowSums(predicted[,-(1:5)])
predictedreturn(sum(predicted$CRPS))
}
<- kCRPS(model=m.SICHEL, data=fbdata, K=10, seed=2345, ymax=791) crps
crps
[1] -6259.246
Again the cross-validation did not work for model m.NN
and CRPS had to be obtained manually.
Log score
We use the gamlss function gamlssCV()
to extract the log scores.
Model m.SICHEL
<- gamlssCV(formula = share ~ Category + Type + pbc(Post.Weekday) +
m.SICHEL.CV pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour),
sigma.formula = ~Category +
pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Hour) + pbc(Post.Weekday),
nu.formula = ~ Category + Type +
pbc(Post.Month, control = pbc.control(inter = 5)) +
pbc(Post.Weekday) + pbc(Post.Hour),
family = SICHEL, data = fbdata, method = mixed(20, 100),
K.fold=10, set.seed=123)
$CV/(-2) # log likelihood m.SICHEL.CV
[1] -2064.038
Model m.NN
The gamlssCV()
function does not work for models including nn()
terms:
<- gamlssCV(share ~ nn(~ Category.s + Post.Weekday.s + Post.Month.s +
m2 + type3, size=3, decay=0.1),
Post.Hour.s sigma.formula = ~ nn(~ Category.s + Post.Weekday.s +
+ Post.Hour.s + type3, size=3, decay=0.1),
Post.Month.s nu.formula = nn(~Category.s + Post.Weekday.s + Post.Month.s +
+ type3, size=3, decay=0.1),
Post.Hour.s family = SICHEL, data = fbdata, n.cyc=200,
K.fold=10, set.seed=123)
At present we have to step through the computations manually. (Hopefully this will be fixed.)
Figure 10.4: Term plots for Category and Post.Month
term.plot(m.SICHEL, what="mu", pages=1, ylim="common",
terms=c("Category", "pbc(Post.Month, control = pbc.control(inter = 5))"),
ylab=c("Partial for Category","Partial for pbc(Post.Month)"), mar=c(4, 1, 1, 2) + 0.1)
term.plot(m.SICHEL, what="sigma", pages=1, ylim="common",
terms=c("Category", "pbc(Post.Month, control = pbc.control(inter = 5))"),
ylab=c("Partial for Category","Partial for pbc(Post.Month)"))
term.plot(m.SICHEL, what="nu", pages=1, ylim="common",
terms=c("Category", "pbc(Post.Month, control = pbc.control(inter = 5))"),
ylab=c("Partial for Category","Partial for pbc(Post.Month)"))
Figure 10.5: Plots of fitted response distribution
library(distreg.vis)
These plots were created using the distreg.vis package, which is interactive. The following is the code generated by the package.
(a) Category
The following scenarios were selected: Post.Month
= 10 (October), Post.Weekday
= 6 (Friday), Post.Hour
= 3, Type
= “Photo/Video”, Category
= 1, 2, 3 (“action”, “product”, “inspiration”).
<- structure(list(Post.Month = c(10L, 10L, 10L),
covariate_data Post.Weekday = c(6L, 6L, 6L),
Post.Hour = c(3L, 3L, 3L),
Category = structure(c(1L, 2L, 3L),
levels = c("1", "2", "3"), class = "factor"),
Type = c("Photo/Video", "Photo/Video", "Photo/Video")),
class = "data.frame",
row.names = c("1 action", "2 product", "3 inspiration"))
<- preds(model = m.SICHEL, newdata = covariate_data) pred_data
new prediction
new prediction
new prediction
plot_dist(model = m.SICHEL, pred_params = pred_data, type = "pdf", palette = "Set1") +
lims(x=c(0,150)) +
labs(title=NULL) +
guides(fill=guide_legend(title='Category')) +
theme(text = element_text(size=14))
(b) Month
The following scenarios were selected: Post.Month
= 1, 5, 10 (January, May, October), Post.Weekday
= 6 (Friday), Post.Hour
= 3, Category
= 1 (“action”), Type
= “Photo/Video”.
<- structure(list(Post.Month = c(1L, 5L, 10L),
covariate_data Post.Weekday = c(6L, 6L, 6L),
Post.Hour = c(3L, 3L, 3L),
Category = structure(c(1L, 1L, 1L),
levels = c("1", "2", "3"), class = "factor"),
Type = c("Photo/Video", "Photo/Video", "Photo/Video")),
class = "data.frame",
row.names = c("January", "May", "October"))
<- preds(model = m.SICHEL, newdata = covariate_data) pred_data
new prediction
new prediction
new prediction
plot_dist(model = m.SICHEL, pred_params = pred_data, type = "pdf", palette = "Set1") +
lims(x=c(0,150)) +
labs(title=NULL) +
guides(fill=guide_legend(title='Month')) +
theme(text = element_text(size=14))