##### Potsdam Mind Research Repository ######

## This script is part of the publication package for the article:

# Yan, Zhou, Shu, Yusupu, Miao, Kruegel, & Kliegl, R. (2014).
# Eye Movements Guided by Morphological Structure: 
# Evidence from the Uighur Language Cognition, 132(2),  181-215. 
# URL: http://www.sciencedirect.com/science/article/pii/S001002771400047X

#---------------------------------------------------------------------------------------------------------------
# Revision History
#---------------------------------------------------------------------------------------------------------------
# 29 Apr 2012, r. kliegl
# 30 Apr 2012, m. yan
# 12 May 2012, r. kliegl
# 07 Jun 2012, m. yan
# 21 Jun 2012, r. kliegl -- add launch site
# 22 Jun 2012, r. kliegl -- start over
# 24 Jun 2012, r. kliegl -- version 2, constrain to launch site: 1 <= ls <= 10, take out root length from main analysis
# 24 Jun 2012, m. yan -- generate new data file "U1_data_0624.txt", correcting various errors
# 24-25 Jun 2012, r. kliegl -- version 3, new data corrected rl; no more zero-values
# 26 Jun 2012, r. kliegl -- version 4, corrected log(wf) to log10(wf)
# 27 Jun 2012, r. kliegl -- outlier removal; decided against removal
# 02 Jul 2012, r. kliegl -- version 5, use log2 wl; center all covariates at zero; decided for removal of outliers
# 10 Jul 2012, r. kliegl -- include fixations on space before words, "NEW STUFF"
# 21-24 Sep 2012, m. yan -- final check and reorganize, correcting various errors
# 04 May 2013, m. yan -- add unique word id and unique root morpheme id
# 12 Sep 2013, m. yan -- add length x suffix random slope
# 21 Feb 2014, m. yan -- add initial bigram frequency

# The following scripts are tested under Window XP SP3, R 3.02 and ggplot2 0.9.3.1
# 22 Mar 2014, m. yan -- Final check for package

# Start over with Uighur_package.v2.R
# 18 Apr 2014. r. kliegl -- adapt to new lme4 (1.1-7) and R 3.0.3; use only REML=FALSE


#---------------------------------------------------------------------------------------------------------------
# VARIABLES
#---------------------------------------------------------------------------------------------------------------
# nsub			subject id
# nsen			sentence id 
# wn			word number in a sentence
# nw			number of words in a sentence
# flp			first-fixation landing position
# ffd			first-fixation duration
# sfd			second-fixation duration
# gd			gaze duration
# ifre			if a word fixated 1 or not 0
# ifskip		if a word skipped 1 or not 0
# wl			word length in letters
# ls			launch site
# wf			log10 word frequency
# rl			the number of root letters/root length
# sn			the number of suffixes
# suftype1		type of 1st suffix: 0 no suffix, 1 derivational suffix, 2 inflectional suffix
# suftype2		type of 2nd suffix


#---------------------------------------------------------------------------------------------------------------
# Exp 1
#---------------------------------------------------------------------------------------------------------------

library(ggplot2)
library(reshape2)
library(plyr)
library(scales)
library(grid)
library(lme4)
#library(psych)

rm(list=ls())
user='rk'

# Local functions
source("remef.v0.6.10.r")

# Plot function
theme_set(theme_bw(base_size=10))

stat_sum_single <- function(fun, geom="point", ...) {
  stat_summary(fun.y=fun, colour="red", geom=geom, size = 3, ...)
}

vplayout <- function(x, y) {
viewport(layout.pos.row = x, layout.pos.col = y) 
}

datraw <- read.table("U1_data_0624.txt",head=TRUE)  # New data on 24 June 2012
# datraw: N=26442

#---------------------------------------------------------------------------------------------------------------
# SETUP
#---------------------------------------------------------------------------------------------------------------
datraw$ls     <- (datraw$sac_in - datraw$flp)/11.3
datraw$ls[which(datraw$wn==1)] = NA        # Ming 2012Sep21, first words should have no ls
datraw$flp    <- datraw$flp/11.3           # FLP change pixel into letters
datraw$flp.r  <- datraw$flp/datraw$wl-0.5  # relative FLP, centered on word center
datraw$slp    <- datraw$slp/11.3           # second fixation landing position in letters
datraw$wf10   <- log10(datraw$wf)          # keep original frequency, generate log10 frequency
datraw$skp    <- ifelse(datraw$nfix==0, 1, 0)
datraw$rfx    <- ifelse(datraw$nfix>=2, 1, 0)

# Load unique word id and unique root id, 04 May 2013
widinfo <- read.table("wid.txt",head=TRUE)
idx = 0
for (i in 1:nrow(datraw)) {
      idx[i] = which(widinfo$nsen == datraw$nsen[i] & widinfo$wn == datraw$wn[i])
}

datraw$nsub   <- factor(datraw$nsub)
datraw$nsen   <- factor(datraw$nsen)

# add unique word id and unique root id
datraw$wid    <- factor(widinfo$wid[idx])
datraw$rmid   <- factor(widinfo$rmid[idx])
datraw$bigram <- widinfo$bigram[idx]

#---------------------------------------------------------------------------------------------------------------
# SELECTION
#---------------------------------------------------------------------------------------------------------------
# Skipped words
idx0 <- which(datraw$skp == 1)                      # N=3758
N_fix <- nrow(datraw) - length(idx0)                # number of fixated words: N=22684
dat0 <- datraw[ -idx0, ]
sum(dat0$nfix)                                      # number of fiations: N=34661

# Fixations on first words / last word / hyphenated words
idx1 <- which(dat0$wn==1 | dat0$nw==dat0$wn | dat0$ifcon == 1 | dat0$fst_lst==1)   # N=5712, 25% of N_fix

# Fixations following negative launch site; that is a regression (also intraword)
idx2 <- which(dat0$ls<0 & dat0$fst_lst==0)                       # N=122;  0.5%

# Short and long fixations                  
idx3 <- which(dat0$ffd < 60 | dat0$ffd > 600 | dat0$gd > 800)    # N=1297; 6%

# Filter 1 
dat1   <- droplevels(dat0[-unique(c(idx1, idx2, idx3)), ])
nrow(dat1)/N_fix

# words with first-fixation landing positions < 0 (i.e., FLPs on the space before the word)
idx4 <- which(dat1$flp < 0)   # 1785

# words with launch site from the last letter of the last word
idx5 <- which(dat1$ls < 1)    # 304

# Fixations with ls from more than 10 letter before the word
idx6 <- which(dat1$ls > 10)   # 1105

# Filter 2
dat2 <- droplevels(dat1[-unique(c(idx4, idx5, idx6)), ])

# datraw: N=26442
# dat0:   N=22684
# dat1:   N=16345 -> 16330
# dat2:   N=13529 -> 13523 # relative to datraw: 51%, dat0: 60%; dat1: 83%

#---------------------------------------------------------------------------------------------------------------
# FLP Model: log2 Word length, launch site, morphological complexity, word frequency
#---------------------------------------------------------------------------------------------------------------
# Center covariates (use wl and sn specification motivated by Exp 2)
dat2$wl.c <- log2(dat2$wl)  - mean(log2(dat2$wl)) # use log2 2=1, 4=2, 8=3, 16=4
dat2$ls.c <- dat2$ls    - mean(dat2$ls)
dat2$sn.c <- dat2$sn    - mean(dat2$sn)
dat2$wf.c <- dat2$wf10  - mean(dat2$wf10) # use log10 of word frequency
dat2$bigram.c <- log10(dat2$bigram+1)  - mean(log10(dat2$bigram+1))

dat3 <- dat2

# Estimate main variance compenents and correlation parameters for subjects, intercept for sentences
print(summary(flp_4.a.wid <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + 
                                        (1+wl.c+ls.c+sn.c+wf.c+wl.c*sn.c|nsub) + (1|nsen) + (1|wid), 
                                        data=dat3, REML=FALSE)), corr=FALSE)

# Center log10-transformed initial bigram frequency
print(summary(flp_4.b.wid <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c * bigram.c + 
                                        (1+wl.c+ls.c+sn.c+wf.c+wl.c*sn.c|nsub) + (1|nsen) + (1|wid), 
                                        data=dat3, REML=FALSE)), corr=FALSE)
anova(flp_4.a.wid, flp_4.b.wid)


# START 18 Apr 2014 RK
# (A) Model flp_4.b.wid probably not optimal: 
#  (1) does not converge correctly w/ lme4 (1.1-7),
#  (2) no effects associated w/ word frequency (wf),
#  (3) smallest variance component for wf,
#  (4) model is degenerate; see:
VarCorr(flp_4.b.wid)
chf <- matrix(0, nrow=6, ncol=6)
(chf[lower.tri(chf, diag=TRUE)] <-  getME(flp_4.b.wid, "theta")[3:23])
sv <- svd(chf)
round(sv$d^2/sum(sv$d^2)*100)

# (B) Leave out word frequency
print(summary(flp_4.c.wid <- lmer(flp ~ wl.c * ls.c * sn.c * bigram.c + 
                                        (1+wl.c+ls.c+sn.c+wl.c:sn.c|nsub) + (1|nsen) + (1|wid), 
                                        data=dat3, REML=FALSE)), corr=FALSE)
anova(flp_4.a.wid, flp_4.b.wid, flp_4.c.wid)
#   (1) no warnings about convergence,
#   (2) no loss in goodness in fit,
#   (3) vc's are significant in drop1-LRTs, but model is still degenerate 
#       (at most three vc's are supported by data); not sure what do to. See:
VarCorr(flp_4.c.wid)
chf <- matrix(0, nrow=5, ncol=5)
(chf[lower.tri(chf, diag=TRUE)] <-  getME(flp_4.c.wid, "theta")[3:17])
sv <- svd(chf)
round(sv$d^2/sum(sv$d^2)*100)

# (C) Leave out word frequency and bigram frequency
print(summary(flp_4.d.wid <- lmer(flp ~ wl.c * ls.c * sn.c + 
                                        (1+wl.c+ls.c+sn.c+wl.c:sn.c|nsub) + (1|nsen) + (1|wid), 
                                        data=dat3, REML=FALSE)), corr=FALSE)
anova(flp_4.a.wid, flp_4.d.wid)
#   (1) no warnings about convergence,
#   (2) no loss in goodness in fit,
#   (3) vc's are significant in drop1-LRTs, but model is still degenerate 
#       (at most three vc's are supported by data); still not sure what do to. See:
VarCorr(flp_4.d.wid)
chf <- matrix(0, nrow=5, ncol=5)
(chf[lower.tri(chf, diag=TRUE)] <-  getME(flp_4.d.wid, "theta")[3:17])
sv <- svd(chf)
round(sv$d^2/sum(sv$d^2)*100)

# In the paper flp_4.a.wid was used for figures; now I would propose to use flp_4.d.wid. Keeping 
# variance components estimated as zero is probably no good idea, even if the vc is theoretically motivated.

# ANOTHER ISSUE
# Residuals of LMM; 4 outliers identified in WID model
lattice::qqmath(resid(flp_4.c.wid ))
iout <- which(resid(flp_4.c.wid ) > 6)   # min was -5.96
resid(flp_4.c.wid)[iout] 
dat3b <- dat3[-iout, ]

# Re-estimate last model w/o outliers and non-significant fixed-effect interaction terms
print(summary(temp <- lmer(flp ~ (wl.c + ls.c + sn.c + bigram.c)^2 + wl.c:sn.c:bigram.c + 
                                        (1+wl.c+ls.c+sn.c+wl.c:sn.c|nsub) + (1|nsen) + (1|wid), 
                                        data=dat3b, REML=FALSE)), corr=FALSE)
# Hm. Why do convergence problems depend on removing 2 to 6 outliers?

qplot(x=fitted(temp), y=resid(temp), geom="point",  shape=I("."), 
	xlab="Fitted values", ylab="Standardized residuals") + 
	geom_hline(yintercept=0) + theme_bw() # + geom_hex() + geom_density2d(size=1) 

# END 18 APRIL 2014

#---------------------------------------------------------------------------------------------------------------
# Visualization, FLP
#---------------------------------------------------------------------------------------------------------------

flp_2.a = flp_4.a.wid 

matrix(names(fixef(flp_2.a)))

#-------------------------------------------------
# Description, Table 2 in paper
dcast(dat2, value.var = "flp", wl ~ sn, length)

ddply(dat2, .(wl, sn), summarize, M=mean(flp), SE=sd(flp)/sqrt(length(flp)), N=length(flp))

# ... check frequencies
items <- unique(interaction(dat2$nsen, dat2$wn))
dat.items <- dat2[items, c("nsen", "wn", "wf",  "wl", "sn")]
dat.items <- dat.items[order(dat.items$wf), ]

table(cut(dat2$wf, breaks=c(0, 1, 10, 10^2, 10^3, 10^4, 10^5)))

#-------------------------------------------------
# Factor variations
# ... word length
dat2$wl.f  <- factor(dat2$wl)
dat2$wl.f.log2 <- factor(round(log2(dat2$wl)))

dat2$wl.f2 <- factor(ifelse(dat2$wl>=8, 'long', 'short'), levels=c("short", "long"), labels=c("short (2-7)", "long (8-17)"))
dat2$wl.f3 <- factor(ifelse(dat2$wl>=10, 'long', ifelse(dat2$wl>=6 & dat2$wl<=9, 'medium', 'short')))
dat2$wl.f3 <- factor(dat2$wl.f3, levels=c("short", "medium", "long"), labels=c("short (2-5)", "medium (6-9)", "long (10-17)"))
dat2$wl.f4 <- factor(ifelse(dat2$wl <=5, 'short', ifelse(dat2$wl >= 10, 'long', ifelse(dat2$wl >= 6 & dat2$wl <= 7, 'medium1', 'medium2'))))
dat2$wl.f4 <- factor(dat2$wl.f4, levels=c("short", "medium1", "medium2", "long"), labels=c("short (2-5)", "medium (6-7)", "medium (8-9)", "long (10-17)"))

# ... ... check
dcast(dat2, value.var="flp", wl ~ wl.f4, length)

#-------------------------------------------------
# ... launch site
dat2$ls.f  <- round(dat2$ls)
dat2$ls.f2 <- factor(ifelse(dat2$ls <= 4, "close", "far"), levels=c("close", "far"))

# ... ... check
dcast(dat2, value.var="flp", wl.f4 ~ ls.f2, mean)

#-------------------------------------------------
# ... mophological complexity
dat2$sn.f  <- factor(dat2$sn)
dat2$sn.f2 <- factor(ifelse(dat2$sn == 0, 0, 1), labels=c("-", "+")) # suffix no/yes
dat2$sn.f3 <- factor(ifelse(dat2$sn >= 3, 3, ifelse(dat2$sn == 0, 1, 2)), labels=c("0", "1-2", "3-5"))
dat2$sn.4  <- ifelse(dat2$sn >= 3, 3, dat2$sn)
dat2$sn.f4 <- factor(dat2$sn.4, labels=c("0", "1", "2", "3-5"))

# ... ... suffixes WITHIN short, medium, and long words (wl.f3)
fl3 <- levels(dat2$wl.f3)
dat2$sn_wl3 <- dat2$sn
dat2$sn_wl3[dat2$wl.f3 == fl3[2] &  dat2$sn >= 2] <- 2
dat2$sn_wl3[dat2$wl.f3 == fl3[3] &  dat2$sn == 0] <- 1
dat2$sn_wl3[dat2$wl.f3 == fl3[3] &  dat2$sn >= 3] <- 3
dat2$sn_wl3 <- factor(dat2$sn_wl3)

# ... ... ... check
table(dat2$wl.f3, dat2$sn)
table(dat2$wl.f3, dat2$sn_wl3)

# ... ... suffixes WITHIN short, medium1, medium2, and long words (wl.f4)
fl4 <- levels(dat2$wl.f4)
dat2$sn_wl4 <- dat2$sn
dat2$sn_wl4[dat2$wl.f4 == fl4[2] &  dat2$sn >= 2] <- 2
dat2$sn_wl4[dat2$wl.f4 == fl4[3] &  dat2$sn >= 2] <- 2
dat2$sn_wl4[dat2$wl.f4 == fl4[4] &  dat2$sn == 0] <- 1
dat2$sn_wl4[dat2$wl.f4 == fl4[4] &  dat2$sn >= 3] <- 3
dat2$sn_wl4 <- factor(dat2$sn_wl4)

# ... ... ... check
table(dat2$wl.f4, dat2$sn)
table(dat2$wl.f4, dat2$sn_wl4)

#-------------------------------------------------
# ... word frequency
dat2$wf.f  <- round(dat2$wf10)
dat2$wf.f2 <- factor(ifelse(dat2$wf.c <= 0, "low", "high"), levels=c("high", "low"))
dat2$wf.f3 <- factor(ifelse(dat2$wf.f <= 2, "low", ifelse(dat2$wf.f == 3, "medium", "high")), levels=c("low", "medium", "high"))

# ... suffix length (sl) = word length - root length
dat2$sl    <- dat2$wl - dat2$rl
dat2$sl.f2 <- factor(ifelse(dat2$sl <= 7, "simple", "complex"))
dat2$sl.f3 <- factor(ifelse(dat2$sl >= 8, 3, ifelse(dat2$sl == 0, 1, 2)), labels=c("0", "1-7", "8-10"))

#-------------------------------------------------
# Main effects

# WL
dat2$DV.wl <- remef(flp_2.a, keep=TRUE, grouping=TRUE, fix=c(1, "wl.c"), ran=NULL)
ddply(dat2, .(round(log2(wl))), summarise, M=mean(flp), SD=sd(flp), SE=sd(flp)/sqrt(length(flp)), N=length(flp))
ddply(dat2, .(round(log2(wl))), summarise, M=mean(DV.wl), SD=sd(DV.wl), SE=sd(DV.wl)/sqrt(length(DV.wl)), N=length(DV.wl))

# LS
dat2$DV.ls <- remef(flp_2.a, keep=TRUE, grouping=TRUE, fix=c(1, "ls.c"), ran=NULL)

# SN -- shows increase in observed means, decrease in estimated means
dat2$DV.sn <- remef(flp_2.a, keep=TRUE, grouping=TRUE, fix=c(1, "sn.c"), ran=NULL)
ddply(dat2, .(sn), summarise, M_obs=mean(flp), M_est=mean(DV.sn))

# WF -- shows decrease in observed means, increase in estimated means (for 2-4)
dat2$DV.wf <- remef(flp_2.a, keep=TRUE, grouping=TRUE, fix=c(1, "wf.c"), ran=NULL)
ddply(dat2, .(wf.f), summarise, M_obs=mean(flp), M_est=mean(DV.sn), N=length(flp))

# Significant interaction 

# WL x LS
dat2$DV.wl_ls <- remef(flp_2.a, keep=TRUE, grouping=TRUE, fix=c(1, "wl.c", "ls.c", "wl.c:ls.c"), ran=NULL)

# WL x SN
dat2$DV.wl_sn <- remef(flp_2.a, keep=TRUE, grouping=TRUE, fix=c(1, "wl.c", "sn.c", "wl.c:sn.c"), ran=NULL)

#-------------------------------------------------
# Plots
# add linetype and shape
ybreaks = seq(1, 7, 2)

#-------------------------------------------------
# WL x LS plot, Figure 2
ddply(dat2, .(wl.f4, ls.f), summarise, Obs = mean(flp), Est = mean(DV.wl_ls), N=length(flp) )
p1 <- qplot(data=dat2, x = ls, y = DV.wl_ls, xlab="Launch site", ylab="Fixation landing position", 
		group=wl.f4, linetype=wl.f4, shape=wl.f4, geom="smooth", method="lm") + 
		geom_smooth(method=lm,color="black") +
		scale_colour_manual(name = "Word length", values = c(1,2,3,4)) + 
		scale_linetype_manual(name = "Word length", values = c(1,2,3,4)) + 
		scale_shape_manual(name = "Word length", values = c(1,2,3,4)) + 
		stat_summary (fun.y = mean, geom="point", size=2, mapping = aes (y=flp, x=ls.f, group=wl.f4) ) +
		coord_cartesian(ylim=c(0, 7)) + theme_bw() + 
		theme(legend.justification= c(1,1), legend.position=c(1,1) )

pdf ("Fig2.pdf", paper="a4")
p1
dev.off ()

#-------------------------------------------------
# SN plot, Figure 3

ddply(dat2, .(sn), summarize, N=length(flp))     # not many fixations with sn > 3
ddply(dat2, .(sn.f4), summarize, N=length(flp))  # collaps sn 3 to 5

# ... aggregate to subject x sn table
sn_obs <- ddply(dat2, .(nsub, sn.f4), summarize, M=mean(flp))
sn_GM <- mean(sn_obs$M)

# ... remove between-subject variance, add GM back to data
sn_obs2 <- ddply(sn_obs, .(nsub), transform, M.w = M - mean(M) + sn_GM)  

# ... compute M and within-SEs after removal of between-subject variance
sn_obs3 <- ddply(sn_obs2, .(sn.f4), summarize, M = mean(M), SE=sd(M.w)/sqrt(length(M.w)), N=length(M.w))  

# ... LMM estimates
sn_est <- ddply(dat2, .(sn.f4), summarize, M=mean(DV.sn), SE=sd(DV.sn)/sqrt(length(DV.sn)), N=length(DV.sn))

# ... combine obs and est
sn_Means <- rbind(sn_obs3, sn_est)
sn_Means$Type <- factor(c(rep("observed", 4), rep("estimated", 4)), levels=c("observed", "estimated" ))

# ... plot
p2 <- qplot(data=sn_Means, x=sn.f4, y=M, xlab= "Number of suffixes", ylab="Fixation landing position",
		group=Type, linetype=Type, shape=Type, geom=c("point")) +
		geom_smooth(method=lm,color="black", se=FALSE) +
		geom_errorbar(aes(ymax=M+2*SE, ymin=M-2*SE), width=0.1, linetype=1) + 
		coord_cartesian(ylim=c(2, 5)) + theme_bw() + 
		theme(legend.title=element_blank(), legend.justification= c(0,1), legend.position=c(0,1))

pdf ("Fig3.pdf", paper="a4")
p2
dev.off ()


#---------------------------------------------------------------------------------------------------------------
# Exp 2
#---------------------------------------------------------------------------------------------------------------

library(ggplot2)
library(reshape2)
library(plyr)
library(scales)
library(grid)
library(lme4)

rm(list=ls()) 

source("remef.v0.6.10.r")

# --------------------------------------------------------------------------------------------------------
# Plot function
theme_set(theme_bw(base_size=10))

stat_sum_single <- function(fun, geom="point", ...) {
  stat_summary(fun.y=fun, colour="red", geom=geom, size = 3, ...)
}

vplayout <- function(x, y) {
viewport(layout.pos.row = x, layout.pos.col = y) 
}

datraw <- read.table("U2_data_0629.txt", head=TRUE, na.strings="-999")
datraw <- datraw[which(datraw$nsen!=37 & datraw$nsen!=84),]

# single-suffixed words

datraw$nsub   <- factor(datraw$nsub)
datraw$nsen   <- factor(datraw$nsen)
datraw$skp    <- ifelse(datraw$nfix==0,1,0)
datraw$rfx    <- ifelse(datraw$nfix>=2,1,0)

datraw$ls     <- (datraw$sac_in - datraw$flp)/10.35 # Needs to computed before conversion

datraw$flp    <- datraw$flp/10.35                   # change pixel into letters
datraw$slp    <- datraw$slp/10.35                   # change pixel into letters

datraw$wf10   <- log10(datraw$wf)                   # keep original frequency, generate log10 frequency
# --------------------------------------------------------------------------------------------------------

# Filters
twfix <- datraw[datraw$wn==3 & datraw$ffd >= 0, ]
length(which(twfix$ffd <  60))
length(which(twfix$ffd > 600))
length(which(twfix$gd  > 800))
length(which(twfix$flp <   0)) 
length(which(twfix$ls  <   1))
length(which(twfix$ls  >  10))

# Selection for target-word analysis
dat.N <- droplevels(subset(datraw, wn == 3 & ffd >= 60 & ffd <= 600 & gd <= 800 & flp >= 0 & ls >= 1 & ls <= 10))

# Centering
dat.N$wl.c <- dat.N$wl - 7.5
dat.N$sn.c <- dat.N$sn - 1.0
dat.N$ls.c <- dat.N$ls - mean(dat.N$ls)
dat.N$wf.c <- dat.N$wf10 - mean(dat.N$wf10)  # Use centered log10 frequency in LMM
dat.N$rl.c <- dat.N$rl - mean(dat.N$rl)    
dat.N$flp.c <- dat.N$flp - mean(dat.N$flp)   # ... when used as covariate
 
# Factor definitions (mostly for plotting)
dat.N$wl.f <- factor(dat.N$wl)
dat.N$wl.f2 <- factor(ifelse(dat.N$wl <= 7, "medium", "long"), levels=c("medium", "long"))
dat.N$wl.f3 <- factor(ifelse(dat.N$wl >=8, "8-9", dat.N$wl), levels=c("6", "7", "8-9"))

dat.N$ls.f <- round(dat.N$ls)
dat.N$ls.f2 <- factor(ifelse(dat.N$ls <= 4, "close", "far"), levels=c("close", "far"))

dat.N$sn.f  <- factor(dat.N$sn)

dat.N$wf.f <- round(dat.N$wf10)
dat.N$wf.f2 <- factor(ifelse(dat.N$wf.c <= 0, "low", "high"), levels=c("high", "low"))

# Descriptives
nrow(dat.N)
dcast(dat.N, value.var="flp", wl.f ~ sn.f, length)
dcast(dat.N, value.var="flp", wl.f ~ sn.f, function(x) c(M=round(mean(x),1)))

qplot(data=dat.N, x=ls, y=flp, geom="smooth", method="lm", formula=y~poly(x, 2)) +
   stat_summary (fun.y = mean, geom="point", colour="red", size=4, mapping = aes (x=ls.f))

qplot(data=dat.N, x=wl, y=flp, geom="smooth", method="lm", formula=y~poly(x, 1)) +
   stat_summary (fun.y = mean, geom="point", colour="red", size=4, mapping = aes (x=(wl)))

dat.N$wl_r <- -1/dat.N$wl
qplot(data=dat.N, x=wl_r, y=flp, geom="smooth", method="lm", formula=y~poly(x, 2)) +
   stat_summary (fun.y = mean, geom="point", colour="red", size=4, mapping = aes (x=(wl_r)))

qplot(data=dat.N, x=wl, y=flp, geom="smooth", method="lm", formula=y~poly(x, 2),
  group= sn.f, colour = sn.f) +  stat_summary (fun.y = mean, geom="point", mapping = aes (x=wl, group=sn.f, colour=sn.f) )

qplot(data=dat.N, x=wf10, y=flp, geom="smooth", method="lm", formula=y~poly(x, 2)) +
  stat_summary (fun.y = mean, geom="point", mapping = aes(x=round(wf.f)) ) 

# ...check frequencies
items <- unique(interaction(dat.N$nsen))
dat.items <- dat.N[items, c("nsen", "wf", "wf10", "wl", "sn")]
dat.items <- dat.items[order(dat.items$wf), ]

table(cut(dat.N$wf, breaks=c(0, 1, 10, 10^2, 10^3, 10^4, 10^5)))
sum(table(cut(dat.N$wf, breaks=c(0, 1, 10, 10^2, 10^3, 10^4, 10^5))))

# --------------------------------------------------------------------------------------------------------
# (1) FLP model: word length, launch site, suffixes, and word frequency
# --------------------------------------------------------------------------------------------------------

# ... VCs and CPs
print(summary(flp_69_02.a <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + (1+wl.c+ls.c+sn.c+wf.c|nsub) + (1|nsen), 
                                        data=dat.N, REML=FALSE)), corr=FALSE)

# START 18 APRIL 2014
VarCorr(flp_69_02.a)
chf <- matrix(0, nrow=5, ncol=5)
(chf[lower.tri(chf, diag=TRUE)] <-  getME(flp_69_02.a, "theta")[2:16])
sv <- svd(chf)
round(sv$d^2/sum(sv$d^2)*100)
# Model is degenerate

print(summary(flp_69_02.b    <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen) +
                                          (0+wl.c|nsub) + (0+ls.c|nsub) + (0+sn.c|nsub) + (0+wf.c|nsub), 
                                          data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.b, flp_69_02.a)
# Ensemble of correlation paramenters is not significant

# ... drop1 VC at a time
print(summary(flp_69_02.b.wl    <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen) +
                                              (0+ls.c|nsub) + (0+sn.c|nsub) + (0+wf.c|nsub), 
                                              data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.b.wl, flp_69_02.b)  # p = .062

print(summary(flp_69_02.b.ls    <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen) +
                                              (0+wl.c|nsub) + (0+sn.c|nsub) + (0+wf.c|nsub),
                                              data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.b.ls, flp_69_02.b)  # p = .005

print(summary(flp_69_02.b.sn    <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen) +
                                              (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub),
                                              data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.b.sn, flp_69_02.b)  # p = 1

print(summary(flp_69_02.b.wf    <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen) +
                                              (0+wl.c|nsub) + (0+ls.c|nsub) + (0+sn.c|nsub), 
                                               data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.b.wf, flp_69_02.b)  # p = .021

# ... remove VC for sn
print(summary(flp_69_02.b2    <- lmer(flp ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen) +
                                            (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub), 
                                             data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.b2, flp_69_02.a)

# ... fixed effect structure
print(summary(flp_69_02.c <- lmer(flp ~ wl.c + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen) +
                                        (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub), 
                                         data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.c, flp_69_02.b2)

# ... final model
print(summary(flp_69_02.c    <- lmer(flp ~ wl.c + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen) +
                                           (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub), 
                                            data=dat.N, REML=FALSE)), corr=FALSE)
# 18 APRIL 2014; check of svd looks ok
VarCorr(flp_69_02.c)
chf2 <-  diag(unname(getME(flp_69_02.c, "theta")[1:5]), 5)
sv2 <- svd(chf2)
round(sv2$d^2/sum(sv2$d^2)*100)

# 04 May 2013, add subject-related random effect for morphological complexity to the final model, 
# ... as reqested by reviewer, because of theoretical relevance of morph. complexity
print(summary(flp_69_02.c.x <- lmer(flp ~ wl.c + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen) +
                                          (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) +
                                          (0+sn.c|nsub),
                                    data=dat.N, REML=FALSE)), corr=FALSE)

print(summary(flp_69_02.c.y1 <- lmer(flp ~ wl.c + ls.c + sn.c + wf.c +  (1|nsub) + (1|nsen) + 
                                          (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) + 
                                          (0+sn.c|nsub) + (0+sn.c|nsen),
                                     data=dat.N, REML=FALSE)), corr=FALSE)
      
print(summary(flp_69_02.c.y2 <- lmer(flp ~ wl.c + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen) +
                                           sn.c:wl.c + sn.c:ls.c + sn.c:wf.c + sn.c:wl.c:ls.c + sn.c:wl.c:wf.c + sn.c:ls.c:wf.c + sn.c:ls.c:wf.c:wl.c +
                                           (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) + 
                                           (0+sn.c|nsub) + (0+sn.c|nsen),
                                     data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.c.y1, flp_69_02.c.y2)

print(summary(flp_69_02.c.y <- lmer(flp ~ wl.c + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen) +
                                          (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) + (0+sn.c|nsub) + (0+sn.c|nsen),
                                    data=dat.N, REML=FALSE)), corr=FALSE)

#mtable(flp_69_02.c, flp_69_02.c.x, flp_69_02.c.y)

# START: 18 APRIL 2014; check of svd does NOT look ok: No variance of (0+sn.c|nsub)
VarCorr(flp_69_02.c.y)
chf3 <-  diag(unname(getME(flp_69_02.c.y, "theta")[1:7]), 7)
sv3 <- svd(chf3)
round(sv3$d^2/sum(sv3$d^2)*100)

# END: 18 APRIL 2014; keeping non-significant vc in the model is a bad idea if it makes the
# model degenerate. I would choose the model w/o (0+sn.c|nsub) now; 6 vc's supported by svd

# 2014 MAR06, add bigram freuquency, 
# ... as requested by reviewer, because of theoretical relevance of bigram frequency; 
# ... bigram frequency is not significant and does not change the other effects

E2_bigram <- read.table("E2_bigram.txt")
dat.N$bigram = 0
for (i in 1:nrow(dat.N))
{
      dat.N$bigram[i] = ifelse(dat.N$sn[i]==0, E2_bigram[as.numeric(as.character(dat.N$nsen[i])),2], E2_bigram[as.numeric(as.character(dat.N$nsen[i])),1])
}

print(summary(flp_69_02.c.y.b1 <- lmer(flp ~ (wl.c + ls.c + sn.c + wf.c) * bigram + (1|nsub) + (1|nsen) +
                                             (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) + (0+sn.c|nsub) + (0+sn.c|nsen), 
                                       data=dat.N, REML=FALSE)), corr=FALSE)
print(summary(flp_69_02.c.y.b2 <- lmer(flp ~ wl.c + ls.c + sn.c + wf.c + bigram + (1|nsub) + (1|nsen) +
                                             (0+wl.c|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) + (0+sn.c|nsub) + (0+sn.c|nsen), 
                                       data=dat.N, REML=FALSE)), corr=FALSE)
anova(flp_69_02.c.y, flp_69_02.c.y.b2, flp_69_02.c.y.b1)


# ... using wl_r; 18 APRIL: added (0+sn.c | nsen); leads to a simpler RE model
print(summary(flp_69_02.d <- lmer(flp ~ wl_r * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen) +
                                        (0+wl_r|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) +
                                        (0+sn.c|nsub) + (0+sn.c|nsen),
                                  data=dat.N, REML=FALSE)), corr=FALSE)

print(summary(flp_69_02.e <- lmer(flp ~ wl_r + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen) +
                                        (0+wl_r|nsub) + (0+ls.c|nsub) + (0+wf.c|nsub) +
                                        (0+sn.c|nsub) + (0+sn.c|nsen),
                                  data=dat.N, REML=FALSE)), corr=FALSE)

print(summary(flp_69_02.f <- lmer(flp ~ wl_r + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen) +
                                        (0+ls.c|nsub) + (0+wf.c|nsub) +
                                        (0+sn.c|nsen),
                                  data=dat.N, REML=FALSE)), corr=FALSE)

anova(flp_69_02.f, flp_69_02.e, flp_69_02.d)

VarCorr(flp_69_02.f)
chf4 <-  diag(unname(getME(flp_69_02.f, "theta")[1:5]), 5)
sv4 <- svd(chf4)
round(sv4$d^2/sum(sv4$d^2)*100)

# END: 18 APRIL 2014

# rename the model to use the following scripts
flp_69_02.c <- flp_69_02.c.y

# Residuals of LMM: Looks ok
p0 <-qplot(x=fitted(flp_69_02.c), y=resid(flp_69_02.c), geom="point",  shape=I("."), size=8, 
      xlab="Fitted values", ylab="Standardized residuals") + 
      geom_hline(yintercept=0) + theme_bw() 
p0

lattice::qqmath(resid(flp_69_02.c))

# Partial effect: wl
ybreaks = seq(1, 7, 2)
matrix(names(fixef(flp_69_02.c)))
dat.N$DV.wl <- remef(flp_69_02.c, keep=TRUE, grouping=TRUE, fix=c(1, 2), ran=NULL)
p1 <- qplot(data=dat.N, x = wl, y = DV.wl, xlab="Word length", ylab="Landing position", group=1, geom="smooth", method="lm") +
      geom_smooth(method=lm,color="black") +
      stat_summary (fun.y = mean, geom="point", colour="black", size=2, mapping = aes (x=wl) ) +
      stat_summary (fun.y = mean, geom="point", shape=2,  size=3, mapping = aes (y=flp, x=wl) ) + 
      coord_cartesian(ylim=c(0, 5))
p1

# Partial effect: ls
dat.N$DV.ls <- remef(flp_69_02.c, keep=TRUE, grouping=TRUE, fix=c(1, 3), ran=NULL)
p2 <- qplot(data=dat.N, x = ls, y = DV.ls, xlab="Launch site", ylab="Landing position", group=1, geom="smooth", method="lm") +
      geom_smooth(method=lm,color="black") +
      stat_summary (fun.y = mean, geom="point", colour="black",  size=2, mapping = aes (x=ls.f) ) +
      stat_summary (fun.y = mean, geom="point", shape=2,   size=3, mapping = aes (y=flp, x=ls.f) ) +
      coord_cartesian(ylim=c(0, 5)) 
p2

# Partial effect: sn
dat.N$DV.sn <- remef(flp_69_02.c, keep=TRUE, grouping=TRUE, fix=c(1, 4), ran=NULL)
p3 <- qplot(data=dat.N, x = sn, y = DV.sn, xlab="Suffixes", ylab="Landing position", group=1, geom="smooth", method="lm") +
      geom_smooth(method=lm,color="black") +
      stat_summary (fun.y = mean, geom="point", colour="black", size=2, mapping = aes (x=sn) ) +       
      stat_summary (fun.y = mean, geom="point", shape=2, size=3, mapping = aes (y=flp, x=sn) ) + 
      coord_cartesian(ylim=c(0, 5)) 
p3

# Partial effect: wf
dat.N$DV.wf <- remef(flp_69_02.c, keep=TRUE, grouping=TRUE, fix=c(1, 5), ran=NULL)
p4 <- qplot(data=dat.N, x = wf10, y = DV.wf, xlab="Word frequency", ylab="Landing position", group=1, geom="smooth", method="lm") +
      geom_smooth(method=lm,color="black") +
      stat_summary (fun.y = mean, geom="point", colour="black",  size=2, mapping = aes (x=wf.f) ) +
      stat_summary (fun.y = mean, geom="point", shape=2,   size=3, mapping = aes (y=flp, x=wf.f) ) + 
      coord_cartesian(ylim=c(0, 5))  
p4

#Assemble plots in one figure
grid.newpage()  
pushViewport(viewport(layout = grid.layout(2, 2))) 
print(p1, vp=vplayout(1,1))
print(p2, vp=vplayout(1,2))
print(p3, vp=vplayout(2,1))
print(p4, vp=vplayout(2,2))

# ----------------------------------------------------------------------------------------------------------
# (2)  Other eye-movement measures: FFD, GD, RFX, SKP
# --------------------------------------------------------------------------------------------------------

# First-fixation duration (18 APRIL 2014, somewhat simplified)
# --------------------------------------------------------------------------------------------------------

# ... full model
print(summary(flp_69_02.a.ffd <- lmer(log(ffd) ~ wl.c * ls.c * sn.c * wf.c + (1+wl.c+ls.c+sn.c+wf.c|nsub) + (1|nsen),
                                      data=dat.N, REML=FALSE)), corr=FALSE)

# ... zcp model
print(summary(flp_69_02.z.ffd <- lmer(log(ffd) ~ wl.c * ls.c * sn.c * wf.c + (1+wl.c+ls.c+sn.c+wf.c||nsub) + (1|nsen),
                                      data=dat.N, REML=FALSE)), corr=FALSE)

VarCorr(flp_69_02.z.ffd)
chf_ffd <-  diag(unname(getME(flp_69_02.a.ffd, "theta")[1:6]), 6)
sv_ffd <- svd(chf_ffd)
round(sv_ffd$d^2/sum(sv_ffd$d^2)*100)

# ... reduce zcp model
print(summary(flp_69_02.z2.ffd <- lmer(log(ffd) ~ wl.c * ls.c * sn.c * wf.c + (1+ls.c||nsub) + (1|nsen),
                                       data=dat.N, REML=FALSE)), corr=FALSE)

print(summary(flp_69_02.b.ffd <- lmer(log(ffd) ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen),
                                      data=dat.N, REML=FALSE)), corr=FALSE)

anova(flp_69_02.b.ffd, flp_69_02.z2.ffd, flp_69_02.z.ffd, flp_69_02.a.ffd)

# ... fixed effect
print(summary(flp_69_02.c.ffd <- lmer(log(ffd) ~ wl.c + ls.c + sn.c + wf.c + (1+ls.c|nsub) + (1|nsen),
                                      data=dat.N, REML=FALSE)), corr=FALSE)

anova(flp_69_02.c.ffd, flp_69_02.z2.ffd)


# Gaze duration
# --------------------------------------------------------------------------------------------------------

# ... full model
print(summary(flp_69_02.a.gd <- lmer(log(gd) ~ wl.c * ls.c * sn.c * wf.c + (1+wl.c+ls.c+sn.c+wf.c|nsub) + (1|nsen),
                                     data=dat.N, REML=FALSE)), corr=FALSE)

# ... zcp model
print(summary(flp_69_02.z.gd <- lmer(log(gd) ~ wl.c * ls.c * sn.c * wf.c + (1+wl.c+ls.c+sn.c+wf.c||nsub) + (1|nsen),
                                      data=dat.N, REML=FALSE)), corr=FALSE)

VarCorr(flp_69_02.z.gd)
chf_gd <-  diag(unname(getME(flp_69_02.a.gd, "theta")[1:6]), 6)
sv_gd <- svd(chf_gd)
round(sv_gd$d^2/sum(sv_gd$d^2)*100)

print(summary(flp_69_02.b.gd <- lmer(log(gd) ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen),
                                     data=dat.N, REML=FALSE)), corr=FALSE)

anova(flp_69_02.b.gd, flp_69_02.z.gd, flp_69_02.a.gd)

# ... fixed effect
print(summary(flp_69_02.c.gd <- lmer(log(gd) ~ wl.c + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen), 
                                     data=dat.N, REML=FALSE)), corr=FALSE)

anova(flp_69_02.c.gd, flp_69_02.b.gd)


# Refixation
# --------------------------------------------------------------------------------------------------------

# ... full model
print(summary(flp_69_02.a.rfx <- glmer(rfx ~ wl.c * ls.c * sn.c * wf.c + (1+wl.c+ls.c+sn.c+wf.c|nsub) + (1|nsen),
                                       family=binomial(link="logit"), data=dat.N)), corr=FALSE)
# ... no acceptable convergence

# ... zcp model
print(summary(flp_69_02.z.rfx <- glmer(rfx ~ wl.c * ls.c * sn.c * wf.c + (1+wl.c+ls.c+sn.c+wf.c||nsub) + (1|nsen),
                                       family=binomial(link="logit"), data=dat.N)), corr=FALSE)
# ... no acceptable convergence

VarCorr(flp_69_02.z.rfx)
chf_rfx <-  diag(unname(getME(flp_69_02.a.rfx, "theta")[1:5]), 5)
sv_rfx <- svd(chf_rfx)
round(sv_rfx$d^2/sum(sv_rfx$d^2)*100)

print(summary(flp_69_02.b.rfx <- glmer(rfx ~ wl.c * ls.c * sn.c * wf.c + (1|nsub) + (1|nsen), 
                                       family=binomial(link="logit"), data=dat.N)), corr=FALSE)
# ... marginally acceptable convergence

anova(flp_69_02.b.rfx,  flp_69_02.z.rfx)

# ... fixed effect
print(summary(flp_69_02.c.rfx <- glmer(rfx ~ wl.c + ls.c + sn.c + wf.c + (1|nsub) + (1|nsen),
                                      family=binomial(link="logit"), data=dat.N)), corr=FALSE)
# ... marginally acceptable convergence

anova(flp_69_02.c.rfx, flp_69_02.b.rfx)


# Skipping (note: launchsite is not available for skipped targets)
# --------------------------------------------------------------------------------------------------------

dat.skp <- droplevels(subset(datraw, skp <= 1 & wn == 3))

# Center
dat.skp$wl.c <- dat.skp$wl - 7.5
dat.skp$sn.c <- as.numeric(as.character(dat.skp$sn)) - 1.0
dat.skp$wf.c <- dat.skp$wf10 - mean(dat.skp$wf10)
dat.skp$wf.f <- factor(ifelse(dat.skp$wf.c >= 0, "high", "low"))
dat.skp$rl.c <- dat.skp$rl - mean(dat.skp$rl)    

# ... full model
print(summary(flp_69_02.a.skp <- glmer(skp ~ wl.c * sn.c * wf.c + (1+wl.c+sn.c+wf.c|nsub) + (1|nsen),
                                      family=binomial(link="logit"), data=dat.skp)), corr=FALSE)
# ... no acceptable convergence

# ... zcp model
print(summary(flp_69_02.z.skp <- glmer(rfx ~ wl.c * sn.c * wf.c + (1+wl.c+sn.c+wf.c||nsub) + (1|nsen),
                                       family=binomial(link="logit"), data=dat.skp)), corr=FALSE)
# ... no acceptable convergence

VarCorr(flp_69_02.z.skp)
chf_skp <-  diag(unname(getME(flp_69_02.a.rfx, "theta")[1:5]), 5)
sv_skp <- svd(chf_skp)
round(sv_skp$d^2/sum(sv_skp$d^2)*100)

print(summary(flp_69_02.b.skp <- glmer(skp ~ wl.c*sn.c*wf.c + (1|nsub) + (1|nsen),
                                      family=binomial(link="logit"), data=dat.skp)), corr=FALSE)
# ... marginally acceptable convergence

anova(flp_69_02.b.skp, flp_69_02.z.skp)

# ... fixed effect
print(summary(flp_69_02.c.skp <- glmer(skp ~ wl.c + sn.c + wf.c + (1|nsub) + (1|nsen),
                                      family=binomial(link="logit"), data=dat.skp)), corr=FALSE)

anova(flp_69_02.c.skp, flp_69_02.b.skp)