# ANALYSES:
# ARTICLE "Models, forests and trees of York English: 
#          Was/were variation as a case study for statistical practice" 
# Sali A. Tagliamonte and R. Harald Baayen, February 2012
# to appear in Language Variation and Change

# LIBRARIES
library(party)
library(lme4)
library(rms)    

#load the data
york = read.csv("data/york.csv", header=TRUE)


#-------------------------------------------------------------------
# FIGURE 1. Deviance residuals for a standard logistic model 
# and a logistic mixed model for individuals contributing at 
# least 10 utterances. Each boxplot should be centered around zero. 
#-------------------------------------------------------------------


tab = table(york$Individual)
obs10orMore = names(tab[tab>=10])
dat10plus = york[york$Individual %in% obs10orMore,]
dat10plus$Individual=dat10plus$Individual[drop=TRUE]

dat10plus.lmer0 = lmer(Form~Adjacency + Polarity + poly(Age, 2, raw=FALSE) + (1|Individual), 
data=dat10plus, family="binomial")
dat10plus.lm = glm(Form~Adjacency + Polarity + poly(Age, 2, raw=FALSE), 
data=dat10plus, family="binomial")

dat10plus$DevLmer = resid(dat10plus.lmer0)
dat10plus$DevGlm = resid(dat10plus.lm)

dfr1 = dat10plus[,c("DevLmer", "DevGlm", "Individual")]
dfr2 = dat10plus[,c("DevLmer", "DevGlm", "Individual")]
dfr1$DevResid = dfr1$DevLmer
dfr1$Model=rep("glmm",nrow(dfr1))
dfr2$DevResid = dfr2$DevGlm
dfr2$Model=rep("glm",nrow(dfr2))

dfr1m = tapply(dfr1$DevLmer, dfr1$Individual, median)
dfr2m = tapply(dfr2$DevGlm, dfr2$Individual, median)

dfr1$median = dfr1m[dfr1$Individual]
dfr2$median = dfr2m[dfr2$Individual]
dfr = rbind(dfr1,dfr2)
sortedSubjects = names(sort(dfr2m))
newNames = unlist(strsplit("abcdefghijklmnopqrstuvwxyz",""))[1:length(sortedSubjects)]
tmp = data.frame(Individual=sortedSubjects, Subject=newNames)
dfr2 = merge(dfr, tmp, by="Individual")
print(bwplot(DevResid~Subject|Model, data=dfr2, ylab="Deviance Residuals",
 strip = strip.custom(factor.levels=c('standard logistic model','mixed-effects logistic model')),
 panel = function(...) {
                panel.bwplot(...)
                panel.abline(h=0)
            }))

#-------------------------------------------------------------------
# FIGURE 2 
# Effects of the predictors for a standard variable rule analysis 
# with sum coding, and partial effects of the same predictors in a 
# logistic model with treatment coding. 
#-------------------------------------------------------------------

source("plainvaRb.fnc.R")
york$AgeGroup = ordered(york$AgeGroup, levels=c("20-30","31-50","51-70", "70+"))

allSubjects = plainvaRb.fnc(Form~Polarity+Adjacency+Sex+AgeGroup, data=york, plot=FALSE)
york.dd = datadist(york)
options(datadist="york.dd")
options(contrasts=c(unordered="contr.treatment",ordered="contr.treatment"))
york.lrm = lrm(Form ~ AgeGroup + Polarity + Adjacency + Sex, data=york)


plot(Predict(york.lrm, fun=plogis))

#par(mfcol=c(3,2))

# lrm plots, intercept has youngest age group, Affirmative polarity, adjacent adjacency, and females
coefs = coef(york.lrm)
# adjacency

x = (1:nlevels(york$Adjacency))+0.2-1
y = as.numeric(c(coefs["Intercept"], coefs["Intercept"]+coefs["Adjacency=Non-Adjacent"]))
plot(x, plogis(y), ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability",
  xlim=c(0, 1.4), type="b", yaxp=c(0, 1, 4),bty="l")
mtext(levels(york$Adjacency), side = 1, line=1, at = x, cex=0.8)
mtext("Adjacency", side = 1, line=2.5,  cex=0.8)

x = (1:nlevels(york$Polarity))+0.2-1
y = as.numeric(c(coefs["Intercept"], coefs["Intercept"]+coefs["Polarity=Negative"]))
plot(x, plogis(y), ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability",
  xlim=c(0, 1.4), type="b", yaxp=c(0, 1, 4),bty="l")
mtext(levels(york$Polarity), side = 1, line=1, at = x, cex=0.8)
mtext("Polarity", side = 1, line=2.5,  cex=0.8)

x = (1:nlevels(york$AgeGroup))+0.2-1
y = as.numeric(c(coefs["Intercept"], 
                 coefs["Intercept"]+coefs["AgeGroup=31-50"],
                 coefs["Intercept"]+coefs["AgeGroup=51-70"],
                 coefs["Intercept"]+coefs["AgeGroup=70+"]))
plot(x, plogis(y), ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability",
  xlim=c(0, 3.4), type="b", yaxp=c(0, 1, 4),bty="l")
mtext(levels(york$AgeGroup), side = 1, line=1, at = x, cex=0.8)
mtext("Age Group", side = 1, line=2.5,  cex=0.8)


cex.adj=0.4)

# varbrul plots
dfr=allSubjects$varbrul
f="Adjacency"
d = dfr[dfr$Factor==f,]
d$Levels=d$Levels[drop=TRUE]
x=0.2
plot((1:nlevels(d$Levels))+x-1, d$Probs, ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability",
  xlim=c(0, length(as.numeric(d$Levels))+x*2-1), type="b", yaxp=c(0, 1, 4),bty="l")
mtext(as.character(d$Levels), side = 1, line=1, at = (1:nlevels(d$Levels))+x-1, cex=0.8)
mtext(f, side = 1, line=2.5,  cex=0.8)

f="Polarity"
d = dfr[dfr$Factor==f,]
d$Levels=d$Levels[drop=TRUE]
plot((1:nlevels(d$Levels))+x-1, d$Probs, ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability",
  xlim=c(0, length(as.numeric(d$Levels))+x*2-1), type="b", yaxp=c(0, 1, 4),bty="l")
mtext(as.character(d$Levels), side = 1, line=1, at = (1:nlevels(d$Levels))+x-1, cex=0.8)
mtext(f, side = 1, line=2.5,  cex=0.8)

f="AgeGroup"
x=0.2
d = dfr[dfr$Factor==f,]
d$Levels=d$Levels[drop=TRUE]
plot((1:nlevels(d$Levels))+x-1, d$Probs, ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability", 
  xlim=c(0, length(as.numeric(d$Levels))+x*2-1), type="b", yaxp=c(0, 1, 4),bty="l")
mtext(as.character(d$Levels), side = 1, line=1, at = (1:nlevels(d$Levels))+x-1, cex=0.8)
mtext("AgeGroup", side = 1, line=2.5,  cex=0.8)

#par(mfrow=c(1,1))


#-------------------------------------------------------------------
# FIGURE 3
# Linear and Quadratic fits to a non-linear trend (proportion was as 
# a function of age group, compare the lower panels of Figure 2.
#-------------------------------------------------------------------


x=table(york$AgeGroup, york$Form)
y=apply(x,1,function(v)return(v/sum(v)))
was = y[2,]
groups=1:4
plot(1:4, was,xaxt="n", xlab="x", ylab="y", ylim=c(0.45, 0.85))
mtext("(AgeGroups)", 1, 4, cex=0.8)
mtext("(Proportion 'was')", 2, 2, cex=0.8)
axis(side=1, at=groups, labels=groups)
mtext(c("(20-30)", "(31-50)", "(51-70)", "(70+)"), side=1, line=2, at=groups)
abline(lm(as.numeric(was)~groups))
lines(groups, fitted(lm(as.numeric(was)~poly(groups, 2, raw=TRUE))),lty=2)
legend(2.2, 0.8, lty=c(1,2), c(expression(y == 0.85 - 0.10 * x), expression(y == 1.07 - 0.32 * x + 0.04 * x^2)))
rsquaredQ = round(summary(lm(as.numeric(was)~poly(groups, 2, raw=TRUE)))$r.sq,2)
rsquaredL = round(summary(lm(as.numeric(was)~groups))$r.sq,2)

#-------------------------------------------------------------------
# FIGURE 4
# Partial effect of Age for sentences with affirmative (black) and 
# negative (gray) polarity, with 95% confidence intervals.
#-------------------------------------------------------------------


york$FormBin = (york$Form=="S")+0
require(rms, quietly=TRUE) 
york.dd=datadist(york)
options(datadist='york.dd')
york.lrm  = lrm (Form~Adjacency+pol(Age, 2) * Polarity, data=york,x=T,y=T)
p1 = Predict(york.lrm, Age, Polarity, fun=plogis)
plot(p1)

#rug(york$Age)

p1a = p1[p1$Polarity=="Affirmative",]
p1n = p1[p1$Polarity=="Negative",]
plot(p1a$Age, p1a$yhat, type="l", ylim=c(0,1), col="red", xlab="Age", ylab="probability of 'was'")
lines(p1a$Age, p1a$lower, lty=2, col="red")
lines(p1a$Age, p1a$upper, lty=2, col="red")
lines(p1n$Age, p1n$yhat, lty=1, col="blue")
lines(p1n$Age, p1n$lower, lty=2, col="blue")
lines(p1n$Age, p1n$upper, lty=2, col="blue")
#rug(unique(york$Age))


#-------------------------------------------------------------------
# FIGURE 5
# Conditional permutation variable importance for the random forest 
# with all predictors. Predictors to the right of the rightmost 
# vertical gray line are significant.
#-------------------------------------------------------------------

york.cforest = cforest(Form ~ Adjacency + Polarity + Age + Individual, data=york, control = cforest_unbiased(mtry = 2))

#save(york.cforest, file="models/york.cforest.rda")
york.cforest.varimp = varimp(york.cforest, conditional=TRUE)
#save(york.cforest.varimp, file="york.cforest.varimp.rda")

load("models/york.cforest.rda")

york.trp = treeresponse(york.cforest) 
york$PredFOREST = sapply(york.trp, FUN=function(v)return(v[2]))
aaa = somers2(york$PredFOREST, york$FormBin) 
load("models/york.cforest2.rda")
#york.cforest2 = cforest(Form ~ Adjacency + Polarity + Age + 
#Sex  +  Education + Proximate1.adj + DP.Constituency + Proximate1 + Proximate2 + Individual, data=datall)
#save(york.cforest2, file="models/york.cforest2.rda")

#-------------------------------------------------------------------
# the next line of code takes 8 hours on a fast machine, it is 
# therefore commented out
#-------------------------------------------------------------------
#
# york.cforest2.varimp = varimp(york.cforest2, conditional=TRUE)  
# save(york.cforest2.varimp, file="models/york.cforest2.varimp.rda")
#
#-------------------------------------------------------------------
# instead, we load the saved version
#-------------------------------------------------------------------
load("models/york.cforest2.varimp.rda")

load("models/york.cforest2.rda")
load("models/york.cforest2.varimp.rda")
sortedVarimps = sort(york.cforest2.varimp)
names(sortedVarimps)[8]="DP Constituency"

print(dotplot(sortedVarimps,
  panel=function(...) {
    panel.dotplot(...)
    panel.abline(v=c(min(sortedVarimps), abs(min(sortedVarimps))), col="gray")
  },
  xlab="variable importance"))

york.trp2 = treeresponse(york.cforest2) 
york$PredFOREST2 = sapply(york.trp2, FUN=function(v)return(v[2]))
ddd = somers2(york$PredFOREST2, york$FormBin)

#-------------------------------------------------------------------
# FIGURE 6
# Conditional inference recursive partitioning tree.
#-------------------------------------------------------------------

york.ctree = ctree(Form ~ Adjacency + Polarity + Age + Sex  +  Education + Proximate1.adj + DP.Constituency + Proximate1 + Proximate2 + Individual, data=york)

york.ctree2 = york.ctree
v = c(1,0)
attr(v, "levels")=c("XA ... (31)", "XC ... (52)")
york.ctree2@tree$psplit$splitpoint = v
york.ctree2@tree$psplit$table = rep(1,2)
plot(york.ctree2)
#--------------------------------------------------------------------
# FIGURE 7
# Conditional inference recursive partitioning trees for all 
# individuals and for variable individuals. 
#--------------------------------------------------------------------
source("plainvaRb.fnc.R")
load("datvar.rda")
varSubjects = plainvaRb.fnc(Form~Polarity+Adjacency+Sex, data=datvar, plot=FALSE)

quicky = lrm(Form~Polarity+Adjacency+Sex, data=datvar)
quickyC = round(as.numeric(quicky$stats["C"]),3)
datvar.cforest2 = cforest(Form ~ Adjacency + Polarity + Age + 
Sex  +  Education + Proximate1.adj + DP.Constituency + Proximate1 + Proximate2 + 
Individual, data=datvar)


save(datvar.cforest2, file="models/datvar.cforest2.rda")
load("models/datvar.cforest2.rda")
datvar.cforest2.varimp = varimp(datvar.cforest2, conditional=TRUE)
save(datvar.cforest2.varimp, file="models/datvar.cforest2.varimp.rda")


load("models/datvar.cforest2.varimp.rda")
datvarVarimps = sort(datvar.cforest2.varimp)
names(datvarVarimps)[1]="Proximate1.adj"
names(datvarVarimps)[7]="DP Constituency"

print(dotplot(datvarVarimps[names(sortedVarimps)], panel=function(...) { panel.dotplot(...) panel.abline(v=c(min(datvarVarimps), abs(min(datvarVarimps))), col="gray") }, xlab="variable importance"))
datvar.trp2 = treeresponse(datvar.cforest2) 
datvar$PredFOREST2 = sapply(datvar.trp2, FUN=function(v)return(v[2]))
datvar$FormBin = (datvar$Form=="S")+0
eee=somers2(datvar$PredFOREST2, datvar$FormBin)