# ANALYSES: # ARTICLE "Models, forests and trees of York English: # Was/were variation as a case study for statistical practice" # Sali A. Tagliamonte and R. Harald Baayen, February 2012 # to appear in Language Variation and Change # LIBRARIES library(party) library(lme4) library(rms) #load the data york = read.csv("data/york.csv", header=TRUE) #------------------------------------------------------------------- # FIGURE 1. Deviance residuals for a standard logistic model # and a logistic mixed model for individuals contributing at # least 10 utterances. Each boxplot should be centered around zero. #------------------------------------------------------------------- tab = table(york$Individual) obs10orMore = names(tab[tab>=10]) dat10plus = york[york$Individual %in% obs10orMore,] dat10plus$Individual=dat10plus$Individual[drop=TRUE] dat10plus.lmer0 = lmer(Form~Adjacency + Polarity + poly(Age, 2, raw=FALSE) + (1|Individual), data=dat10plus, family="binomial") dat10plus.lm = glm(Form~Adjacency + Polarity + poly(Age, 2, raw=FALSE), data=dat10plus, family="binomial") dat10plus$DevLmer = resid(dat10plus.lmer0) dat10plus$DevGlm = resid(dat10plus.lm) dfr1 = dat10plus[,c("DevLmer", "DevGlm", "Individual")] dfr2 = dat10plus[,c("DevLmer", "DevGlm", "Individual")] dfr1$DevResid = dfr1$DevLmer dfr1$Model=rep("glmm",nrow(dfr1)) dfr2$DevResid = dfr2$DevGlm dfr2$Model=rep("glm",nrow(dfr2)) dfr1m = tapply(dfr1$DevLmer, dfr1$Individual, median) dfr2m = tapply(dfr2$DevGlm, dfr2$Individual, median) dfr1$median = dfr1m[dfr1$Individual] dfr2$median = dfr2m[dfr2$Individual] dfr = rbind(dfr1,dfr2) sortedSubjects = names(sort(dfr2m)) newNames = unlist(strsplit("abcdefghijklmnopqrstuvwxyz",""))[1:length(sortedSubjects)] tmp = data.frame(Individual=sortedSubjects, Subject=newNames) dfr2 = merge(dfr, tmp, by="Individual") print(bwplot(DevResid~Subject|Model, data=dfr2, ylab="Deviance Residuals", strip = strip.custom(factor.levels=c('standard logistic model','mixed-effects logistic model')), panel = function(...) { panel.bwplot(...) panel.abline(h=0) })) #------------------------------------------------------------------- # FIGURE 2 # Effects of the predictors for a standard variable rule analysis # with sum coding, and partial effects of the same predictors in a # logistic model with treatment coding. #------------------------------------------------------------------- source("plainvaRb.fnc.R") york$AgeGroup = ordered(york$AgeGroup, levels=c("20-30","31-50","51-70", "70+")) allSubjects = plainvaRb.fnc(Form~Polarity+Adjacency+Sex+AgeGroup, data=york, plot=FALSE) york.dd = datadist(york) options(datadist="york.dd") options(contrasts=c(unordered="contr.treatment",ordered="contr.treatment")) york.lrm = lrm(Form ~ AgeGroup + Polarity + Adjacency + Sex, data=york) plot(Predict(york.lrm, fun=plogis)) #par(mfcol=c(3,2)) # lrm plots, intercept has youngest age group, Affirmative polarity, adjacent adjacency, and females coefs = coef(york.lrm) # adjacency x = (1:nlevels(york$Adjacency))+0.2-1 y = as.numeric(c(coefs["Intercept"], coefs["Intercept"]+coefs["Adjacency=Non-Adjacent"])) plot(x, plogis(y), ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability", xlim=c(0, 1.4), type="b", yaxp=c(0, 1, 4),bty="l") mtext(levels(york$Adjacency), side = 1, line=1, at = x, cex=0.8) mtext("Adjacency", side = 1, line=2.5, cex=0.8) x = (1:nlevels(york$Polarity))+0.2-1 y = as.numeric(c(coefs["Intercept"], coefs["Intercept"]+coefs["Polarity=Negative"])) plot(x, plogis(y), ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability", xlim=c(0, 1.4), type="b", yaxp=c(0, 1, 4),bty="l") mtext(levels(york$Polarity), side = 1, line=1, at = x, cex=0.8) mtext("Polarity", side = 1, line=2.5, cex=0.8) x = (1:nlevels(york$AgeGroup))+0.2-1 y = as.numeric(c(coefs["Intercept"], coefs["Intercept"]+coefs["AgeGroup=31-50"], coefs["Intercept"]+coefs["AgeGroup=51-70"], coefs["Intercept"]+coefs["AgeGroup=70+"])) plot(x, plogis(y), ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability", xlim=c(0, 3.4), type="b", yaxp=c(0, 1, 4),bty="l") mtext(levels(york$AgeGroup), side = 1, line=1, at = x, cex=0.8) mtext("Age Group", side = 1, line=2.5, cex=0.8) cex.adj=0.4) # varbrul plots dfr=allSubjects$varbrul f="Adjacency" d = dfr[dfr$Factor==f,] d$Levels=d$Levels[drop=TRUE] x=0.2 plot((1:nlevels(d$Levels))+x-1, d$Probs, ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability", xlim=c(0, length(as.numeric(d$Levels))+x*2-1), type="b", yaxp=c(0, 1, 4),bty="l") mtext(as.character(d$Levels), side = 1, line=1, at = (1:nlevels(d$Levels))+x-1, cex=0.8) mtext(f, side = 1, line=2.5, cex=0.8) f="Polarity" d = dfr[dfr$Factor==f,] d$Levels=d$Levels[drop=TRUE] plot((1:nlevels(d$Levels))+x-1, d$Probs, ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability", xlim=c(0, length(as.numeric(d$Levels))+x*2-1), type="b", yaxp=c(0, 1, 4),bty="l") mtext(as.character(d$Levels), side = 1, line=1, at = (1:nlevels(d$Levels))+x-1, cex=0.8) mtext(f, side = 1, line=2.5, cex=0.8) f="AgeGroup" x=0.2 d = dfr[dfr$Factor==f,] d$Levels=d$Levels[drop=TRUE] plot((1:nlevels(d$Levels))+x-1, d$Probs, ylim=c(0,1), xaxt="n",xlab=" ", ylab="probability", xlim=c(0, length(as.numeric(d$Levels))+x*2-1), type="b", yaxp=c(0, 1, 4),bty="l") mtext(as.character(d$Levels), side = 1, line=1, at = (1:nlevels(d$Levels))+x-1, cex=0.8) mtext("AgeGroup", side = 1, line=2.5, cex=0.8) #par(mfrow=c(1,1)) #------------------------------------------------------------------- # FIGURE 3 # Linear and Quadratic fits to a non-linear trend (proportion was as # a function of age group, compare the lower panels of Figure 2. #------------------------------------------------------------------- x=table(york$AgeGroup, york$Form) y=apply(x,1,function(v)return(v/sum(v))) was = y[2,] groups=1:4 plot(1:4, was,xaxt="n", xlab="x", ylab="y", ylim=c(0.45, 0.85)) mtext("(AgeGroups)", 1, 4, cex=0.8) mtext("(Proportion 'was')", 2, 2, cex=0.8) axis(side=1, at=groups, labels=groups) mtext(c("(20-30)", "(31-50)", "(51-70)", "(70+)"), side=1, line=2, at=groups) abline(lm(as.numeric(was)~groups)) lines(groups, fitted(lm(as.numeric(was)~poly(groups, 2, raw=TRUE))),lty=2) legend(2.2, 0.8, lty=c(1,2), c(expression(y == 0.85 - 0.10 * x), expression(y == 1.07 - 0.32 * x + 0.04 * x^2))) rsquaredQ = round(summary(lm(as.numeric(was)~poly(groups, 2, raw=TRUE)))$r.sq,2) rsquaredL = round(summary(lm(as.numeric(was)~groups))$r.sq,2) #------------------------------------------------------------------- # FIGURE 4 # Partial effect of Age for sentences with affirmative (black) and # negative (gray) polarity, with 95% confidence intervals. #------------------------------------------------------------------- york$FormBin = (york$Form=="S")+0 require(rms, quietly=TRUE) york.dd=datadist(york) options(datadist='york.dd') york.lrm = lrm (Form~Adjacency+pol(Age, 2) * Polarity, data=york,x=T,y=T) p1 = Predict(york.lrm, Age, Polarity, fun=plogis) plot(p1) #rug(york$Age) p1a = p1[p1$Polarity=="Affirmative",] p1n = p1[p1$Polarity=="Negative",] plot(p1a$Age, p1a$yhat, type="l", ylim=c(0,1), col="red", xlab="Age", ylab="probability of 'was'") lines(p1a$Age, p1a$lower, lty=2, col="red") lines(p1a$Age, p1a$upper, lty=2, col="red") lines(p1n$Age, p1n$yhat, lty=1, col="blue") lines(p1n$Age, p1n$lower, lty=2, col="blue") lines(p1n$Age, p1n$upper, lty=2, col="blue") #rug(unique(york$Age)) #------------------------------------------------------------------- # FIGURE 5 # Conditional permutation variable importance for the random forest # with all predictors. Predictors to the right of the rightmost # vertical gray line are significant. #------------------------------------------------------------------- york.cforest = cforest(Form ~ Adjacency + Polarity + Age + Individual, data=york, control = cforest_unbiased(mtry = 2)) #save(york.cforest, file="models/york.cforest.rda") york.cforest.varimp = varimp(york.cforest, conditional=TRUE) #save(york.cforest.varimp, file="york.cforest.varimp.rda") load("models/york.cforest.rda") york.trp = treeresponse(york.cforest) york$PredFOREST = sapply(york.trp, FUN=function(v)return(v[2])) aaa = somers2(york$PredFOREST, york$FormBin) load("models/york.cforest2.rda") #york.cforest2 = cforest(Form ~ Adjacency + Polarity + Age + #Sex + Education + Proximate1.adj + DP.Constituency + Proximate1 + Proximate2 + Individual, data=datall) #save(york.cforest2, file="models/york.cforest2.rda") #------------------------------------------------------------------- # the next line of code takes 8 hours on a fast machine, it is # therefore commented out #------------------------------------------------------------------- # # york.cforest2.varimp = varimp(york.cforest2, conditional=TRUE) # save(york.cforest2.varimp, file="models/york.cforest2.varimp.rda") # #------------------------------------------------------------------- # instead, we load the saved version #------------------------------------------------------------------- load("models/york.cforest2.varimp.rda") load("models/york.cforest2.rda") load("models/york.cforest2.varimp.rda") sortedVarimps = sort(york.cforest2.varimp) names(sortedVarimps)[8]="DP Constituency" print(dotplot(sortedVarimps, panel=function(...) { panel.dotplot(...) panel.abline(v=c(min(sortedVarimps), abs(min(sortedVarimps))), col="gray") }, xlab="variable importance")) york.trp2 = treeresponse(york.cforest2) york$PredFOREST2 = sapply(york.trp2, FUN=function(v)return(v[2])) ddd = somers2(york$PredFOREST2, york$FormBin) #------------------------------------------------------------------- # FIGURE 6 # Conditional inference recursive partitioning tree. #------------------------------------------------------------------- york.ctree = ctree(Form ~ Adjacency + Polarity + Age + Sex + Education + Proximate1.adj + DP.Constituency + Proximate1 + Proximate2 + Individual, data=york) york.ctree2 = york.ctree v = c(1,0) attr(v, "levels")=c("XA ... (31)", "XC ... (52)") york.ctree2@tree$psplit$splitpoint = v york.ctree2@tree$psplit$table = rep(1,2) plot(york.ctree2) #-------------------------------------------------------------------- # FIGURE 7 # Conditional inference recursive partitioning trees for all # individuals and for variable individuals. #-------------------------------------------------------------------- source("plainvaRb.fnc.R") load("datvar.rda") varSubjects = plainvaRb.fnc(Form~Polarity+Adjacency+Sex, data=datvar, plot=FALSE) quicky = lrm(Form~Polarity+Adjacency+Sex, data=datvar) quickyC = round(as.numeric(quicky$stats["C"]),3) datvar.cforest2 = cforest(Form ~ Adjacency + Polarity + Age + Sex + Education + Proximate1.adj + DP.Constituency + Proximate1 + Proximate2 + Individual, data=datvar) save(datvar.cforest2, file="models/datvar.cforest2.rda") load("models/datvar.cforest2.rda") datvar.cforest2.varimp = varimp(datvar.cforest2, conditional=TRUE) save(datvar.cforest2.varimp, file="models/datvar.cforest2.varimp.rda") load("models/datvar.cforest2.varimp.rda") datvarVarimps = sort(datvar.cforest2.varimp) names(datvarVarimps)[1]="Proximate1.adj" names(datvarVarimps)[7]="DP Constituency" print(dotplot(datvarVarimps[names(sortedVarimps)], panel=function(...) { panel.dotplot(...) panel.abline(v=c(min(datvarVarimps), abs(min(datvarVarimps))), col="gray") }, xlab="variable importance")) datvar.trp2 = treeresponse(datvar.cforest2) datvar$PredFOREST2 = sapply(datvar.trp2, FUN=function(v)return(v[2])) datvar$FormBin = (datvar$Form=="S")+0 eee=somers2(datvar$PredFOREST2, datvar$FormBin)