# R code for the linear mixed-effects models published in:
#
# Dimigen, Sommer, Hohlfeld, Jacobs, & Kliegl (2011)
# Co-registration of eye movements and EEG in natural reading: Analyses and Review
# Journal of Experimental Psychology: General, 140 (4), 552-572
#
#
# These scripts compute linear mixed-effects models (LMEs) of N400 amplitude in natural, left-to-right sentence reading.
# In addition to word and sentence properties, oculomotor behavior (fixation duration & gaze duration) is used to predict 
# the amplitude of the N400 component in the fixation-related brain potential (FRP). This relationship is explored at the
# level of individual fixations (i.e. at the "single-trial" level).
#
# The dataset contains fixation times and N400 amplitudes for 7,113 target fixations made by 30 participants during first-pass reading
# of a total of 499 unique target words.
#
# Contact: Olaf Dimigen, dimigen@uni-potsdam.de

# Updated to accommodate changes to lme4 (version >= 1.1.5); i.e., replace print() with summary(), add argument refit=FALSE to anova()
# Some of the results may be slightly different due to changes in optimizer; use package lme4.0 for backward compatibility.
# Reinhold Kliegl, 19 March 2014

# NOTE: the former `rm(list=ls())` was removed on purpose. It silently deleted
# every object in the caller's workspace, and nothing below depends on an empty
# workspace: the data frame and all model objects are (re)assigned by this script.
library(lme4)

# The data file is looked up relative to the current working directory.
data_file <- "Dimigen.et.al.JEPGEN.2011.rda"
if (!file.exists(data_file)) {
  stop("Data file '", data_file, "' not found in working directory: ",
       getwd(), call. = FALSE)
}
load(data_file)

# Fail early if the file did not provide the data frame used by all models below.
stopifnot(exists("eyeeeg"), is.data.frame(eyeeeg))

# Loads data frame "eyeeeg" containing 7,113 observations and the following columns:

# 01 id           subject id (N=30, max. id = 40 because 4 subjects were excluded due to drifts, another 6 recordings were aborted)
# 02 sentid       unique id of sentences (sentence number in Potsdam Sentence Corpus, range: 1-144)
# 03 wordid       unique id of word (sentence number*100 + word position)
# 04 ffd          first fixation duration [in ms]
# 05 gd           gaze duration [in ms]
# 06 n4pz         mean amplitude of fixation-related EEG, 300-500 ms after fixation onset at centroparietal electrode Pz
# 07 l            word length (in letters)
# 08 f            word frequency (CELEX corpus)
# 09 p            word predictability (cloze probability, logit-transformed)
# 10 cp           word predictability (cloze probability, raw)
# 11 cat2         predictability level (split into 2 levels)
# 12 cat3         predictability level (split into 3 levels, used for most FRP plots)
# 13 cat5         predictability level (split into 5 levels)
# 14 constr_N     contextual constraint 1: number of different candidate words named at least once in cloze procedure
# 15 posi         sentence position of word
# 16 constr_maxcp contextual constraint 2: cloze probability of word with highest cloze-prob. ("classical" definition of CC)

# Note: Only the linguistic properties used for linear mixed modeling are included in this data frame.
# If you want to explore the influence of other lexical/sentential variables, you can link the published Potsdam Corpus to this
# dataset by means of the unique wordid (see column 3).


# Center predictors (subtract the column mean; no rescaling)
# -------------------------------------------------------------------
# scale() returns a one-column matrix carrying a "scaled:center" attribute.
# as.numeric() strips the matrix structure so that each centered predictor is
# stored as a plain numeric column (same values) instead of a matrix column.
eyeeeg$l.c            <- as.numeric(scale(eyeeeg$l,            center = TRUE, scale = FALSE)) # word length (letters)
eyeeeg$posi.c         <- as.numeric(scale(eyeeeg$posi,         center = TRUE, scale = FALSE)) # word position (in sentence)
eyeeeg$constr_N.c     <- as.numeric(scale(eyeeeg$constr_N,     center = TRUE, scale = FALSE)) # constraint (as number of different words predicted)
eyeeeg$constr_maxcp.c <- as.numeric(scale(eyeeeg$constr_maxcp, center = TRUE, scale = FALSE)) # constraint (cloze p of most expected word)



# Main LMEs, results summarized in Table 3 (p. 564)
# -------------------------------------------------------------------
# All models use crossed random intercepts for subject (id) and word (wordid)
# and are fit by maximum likelihood (REML = FALSE).
# Each model is fit first and then reported with an explicit print(), so the
# output also appears when the script is run via source(). The argument
# `correlation = FALSE` is spelled out in full (instead of the partially matched
# `cor`) to omit the correlation matrix of the fixed effects from the summary.

# N400 amplitude: Eye movement behavior as N400 predictor
m.N4.ffd <- lmer(n4pz ~ log(ffd) + (1 | id) + (1 | wordid),
                 data = eyeeeg, REML = FALSE)
print(summary(m.N4.ffd, correlation = FALSE)) # Model 1 for FFD in Table 3

m.N4.gd <- lmer(n4pz ~ log(gd) + (1 | id) + (1 | wordid),
                data = eyeeeg, REML = FALSE)
print(summary(m.N4.gd, correlation = FALSE)) # Model 1 for GD in Table 3

# N400 amplitude: Linguistic covariates as N400 predictors (pred*freq, word length, contextual constraint, word position)
m.N4.all <- lmer(n4pz ~ p * f + l.c + constr_N.c + posi.c + (1 | id) + (1 | wordid),
                 data = eyeeeg, REML = FALSE)
print(summary(m.N4.all, correlation = FALSE))

# N400 amplitude: Eye movement behavior AND linguistic properties as N400 predictors
m.N4.ffd.all <- lmer(n4pz ~ log(ffd) + p * f + l.c + constr_N.c + posi.c + (1 | id) + (1 | wordid),
                     data = eyeeeg, REML = FALSE)
print(summary(m.N4.ffd.all, correlation = FALSE)) # Model 2 for FFD in Table 3

m.N4.gd.all <- lmer(n4pz ~ log(gd) + p * f + l.c + constr_N.c + posi.c + (1 | id) + (1 | wordid),
                    data = eyeeeg, REML = FALSE)
print(summary(m.N4.gd.all, correlation = FALSE)) # Model 2 for GD in Table 3



# Model comparisons for the main LMEs, reported in Footnote 5 (p. 562)
# -------------------------------------------------------------------
# refit = FALSE: the compared models were already fit with REML = FALSE above,
# so anova() is told explicitly not to refit them before the likelihood-ratio test.
# Results are printed explicitly so they also appear when the script is source()d.

# Model comparisons: does adding linguistic properties to the model improve the fit?
print(anova(m.N4.ffd, m.N4.ffd.all, refit = FALSE)) # Yes
print(anova(m.N4.gd,  m.N4.gd.all,  refit = FALSE)) # Yes

# Model comparisons: drop fixation time (FFD or GD) from the full model, does the fit decrease?
print(anova(m.N4.ffd.all, m.N4.all, refit = FALSE)) # FFD: No
print(anova(m.N4.gd.all,  m.N4.all, refit = FALSE)) # GD:  Yes




# Control models I: same results with an alternative definition of contextual constraint? See Footnote 2 (p. 555)
# -------------------------------------------------------------------

# - Definition 1: CC = number of words produced during cloze procedure (constr_N) - used above
# - Definition 2: CC = cloze probability of most expected word during cloze procedure (constr_maxcp) - "classical definition of CC"

# Same structure as the full N400 models, with constr_maxcp.c in place of constr_N.c.
# `correlation = FALSE` is written out in full (no partial matching of `cor`), and
# the summaries are printed explicitly so they also appear under source().
m.N4.ffd.all.cc <- lmer(n4pz ~ log(ffd) + p * f + l.c + constr_maxcp.c + posi.c + (1 | id) + (1 | wordid),
                        data = eyeeeg, REML = FALSE)
print(summary(m.N4.ffd.all.cc, correlation = FALSE)) # Yes

m.N4.gd.all.cc <- lmer(n4pz ~ log(gd) + p * f + l.c + constr_maxcp.c + posi.c + (1 | id) + (1 | wordid),
                       data = eyeeeg, REML = FALSE)
print(summary(m.N4.gd.all.cc, correlation = FALSE)) # Yes



# Control models II: effect of predictability on fixation time measures, as reported in Footnote 4 (p. 559)
# -------------------------------------------------------------------

# - is predictability a significant predictor of FFD & GD when other linguistic covariates are controlled?
# Here the log-transformed fixation time is the dependent variable. As elsewhere,
# `correlation = FALSE` is written out in full (no partial matching of `cor`), and
# the summaries are printed explicitly so they also appear under source().
m.ffd.all <- lmer(log(ffd) ~ p * f + l.c + constr_N.c + posi.c + (1 | id) + (1 | wordid),
                  data = eyeeeg, REML = FALSE)
print(summary(m.ffd.all, correlation = FALSE)) # Yes

m.gd.all <- lmer(log(gd) ~ p * f + l.c + constr_N.c + posi.c + (1 | id) + (1 | wordid),
                 data = eyeeeg, REML = FALSE)
print(summary(m.gd.all, correlation = FALSE)) # Yes