######## GENERATE A DATAFRAME WITH SIMULATED DATA ###########################################
# Sven Hohenstein, Reinhold Kliegl, 2009-2015
# Version 0.6.3 (June 2015)
#
# changes:
# - removed all `cat`s
################
# INPUT:
#
# B: vector of between-subject factors levels (it's length equals the number of between-subject factors)
# (you can use B = c() or B = NULL if there are no between-subject factors)
# W: vector of within-subject factors levels (it's length equals the number of within-subject factors)
# (you can use W = c() or W = NULL if there are no within-subject factors)
# n: number of observations within each between-subjects factor level combination (natural number)
# (the total number of observations equals n * prod(B); the minimum n equals prod(W), but can not be lower than 2)
# M: Matrix specifying the cell means of crossing between- and within-subject factors (dimensions: prod(B)-by-prod(W) matrix)
# (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix)
# OR
# a scalar (single number) that will be the mean of all cells
# SD: Matrix specifying the cell standard deviations of crossing between- and within-subject factors (dimensions: prod(B)-by-prod(W) matrix)
# (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix)
# OR
# a scalar (single number) that will be the standard deviation of all cells
# R: List of matrices specifying the correlations of the witin-subject factor level combinations
# (list length: prod(B); dimensions of matrices: prod(W)-by-prod(W))
# OR
# a singe matrix that will be used for the correlation of all within-subject factor level combinations
# OR
# a scalar (single number) that will be the correlation of all within-subject factor level combinations (except the diagonal)
# OR
# a list of length prod(B) containing only single numbers or single numbers and matrices
# Note: R is is ignored if there are no within-subject factors
# miss: Matrix specifying the proportion of random data loss within each cell of crossing between- and within-subject factors
# (dimensions: prod(B)-by-prod(W) matrix)
# (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix)
# OR
# a scalar (single number), [0, 1], that will be the relative amount of data loss in each cell
# long: if TRUE each line of the resulting data frame contains of one value of the dependent variable
# if FALSE all (within-subject) measures of one subject are in one line of the resulting data frame
# family: string specifying the output distribution; "gaussian", "gamma", or "beta"
# empirical: if TRUE the random values will match the specified properties exactly (at least for gaussian data)
# if FALSE all values will be drawn randomly from a distribution with the specified properties
################
################
# OUTPUT:
#
# data frame containing the labels for the between- and-within factors, the subject (id) numbers, and the measurements obtained randomly from a normal distribution
# (the values of the dependent variable are random BUT they match the mean, standard deviation, and correlation values of the input)
################
################
# Please note:
# This works perfectly for guassian distributions.
# But note two suboptimal instances occuring with gamma and beta distributions:
# 1) Obtained means and standard deviations will slightly differ from the input. The deviation decreases with the number of subjects (observations).
# 2) Obtained correlations will slightly differ from the input. The deviation decreases with the number of subjects (observations).
# Whereas the classical Pearson correlation statistic is informative for gaussian distributions, it is hardly convenient for gamma and beta distributions.
# If a gamma or beta distribution is used, the Spearman rank correlation is the appropriate statistic. This method is used if cor() is called with the parameter method="spearman".
# The obtained Spearman correlations will be more close to the input values than Pearson correlations.
################
mixedDesign <- function(B = NULL, W = NULL, n = ifelse(is.null(W), 2, prod(W)), M = 0, SD = 1, R = 0, miss = 0, family = "gaussian", long = FALSE, empirical = TRUE) {
if (class(try(library(MASS), silent = FALSE)) == "try-error") stop("MASS package is necessary for this function. Download and install it.")
##### catch wrong input, e.g., B <- c(1, 2, 3)
if (any(B < 2)) { # remove illegal conditions (less than 2 conditions per factor)
warning("Removing illegal between-subject factors.")
ix <- which(B < 2)
B <- B[- ix]
}
nB <- length(B)
if (any(W < 2)) { # remove illegal conditions (less than 2 conditions per factor)
warning("Removing illegal within-subject factors.")
ix <- which(W < 2)
W <- W[- ix]
}
nW <- length(W) # number of within-subject factors
if (nB || nW) {
nBcomb <- 1 # number of within-subject factor level combinations
# this start value remains constant (1) for cases of pure within-subject designs since there is no "X by 0" design
if (nB) { # this segment is ignored if there are no between-subject factors
Blabel <- NULL # list with item labels
for (i in 1:nB) {
Blabel[[i]] <- paste(LETTERS[i], 1:B[i], sep = "")
nBcomb <- nBcomb * B[i]
}
}
nWcomb <- 1 # number of within-subject factor level combinations
# this start value remains constant (1) for cases of pure between-subject designs since there is no "X by 0" design
if (nW) { # this segment is ignored if there are no within-subject factors
Wlabel <- NULL # list with item labels
Wlabel_comb <- NULL # combination of factor level items
for (i in 1:nW) {
Wlabel_comb <- rep(Wlabel_comb, each = W[[i]])
Wlabel[[i]] <- paste(letters[i], 1:W[i], sep = "")
if (i == 1) Wlabel_comb <- Wlabel[[i]] else Wlabel_comb <- paste(Wlabel_comb, rep(Wlabel[[i]], nWcomb), sep = "_") # arrangement of combinations
nWcomb <- nWcomb * W[i]
}
# add "W" to the labels of the within factors
Wlabel_comb <- paste("W_", Wlabel_comb, sep ="")
}
if (n < 2) stop("Not enough subjects for the given design")
if (n < nWcomb) { # if subjects are not sufficient to produce data frame, additional subjects are added and later removed
warning("Not enough subjects for the given design:\n the resulting means, standard deviations, and correlations may differ from the specified ones")
n.dif <- nWcomb - n # subjects to remove
n <- nWcomb
} else n.dif <- 0 # no subjects to remove
# Means: allow vectors as an alternative to 1-by-X matrices
if (!nB && !is.matrix(M)) {
M <- as.matrix(M) # necessary for comparing dimensions
dim(M) <- rev(dim(M)) # change vector to 1-by-X matrix
}
if (!all(dim(as.matrix(M)) == c(nBcomb, nWcomb))) {
#generate matrix of mean values)
if (!all(dim(as.matrix(M)) == c(1, 1))) warning("Inanpropriate matrix of means. Using first value for all cell means.") # more than one element
CellMeans <- matrix(M[1], nBcomb, nWcomb)
} else CellMeans <- M
# SDs: allow vectors as an alternative to 1-by-X matrices
if (!nB && !is.matrix(SD)) {
SD <- as.matrix(SD) # necessary for comparing dimensions
dim(SD) <- rev(dim(SD)) # change vector to 1-by-X matrix
}
if (!all(dim(as.matrix(SD)) == c(nBcomb, nWcomb))) {
#generate matrix of sd values)
if (!all(dim(as.matrix(SD)) == c(1, 1))) warning("Inanpropriate dimensions of matrix of SDs. Using first value for all cell SDs.")
CellSDs <- matrix(SD[1], nBcomb, nWcomb)
} else CellSDs <- SD
if (any(CellSDs < 0)) stop("Negative values are not allowed for standard deviation parameter.")
if (!nB && !is.matrix(miss)) {
miss <- as.matrix(miss) # necessary for comparing dimensions
dim(miss) <- rev(dim(miss)) # change vector to 1-by-X matrix
}
if (!all(dim(as.matrix(miss)) == c(nBcomb, nWcomb))) {
#generate matrix of data loss proportions)
if (!all(dim(as.matrix(miss)) == c(1, 1))) warning("Inanpropriate dimensions of matrix of data loss proportions. Using first value for all cell data loss proportions.")
Cell.loss <- matrix(miss[1], nBcomb, nWcomb)
} else Cell.loss <- miss
# checking data loss values, must be between 0 (no data loss) and 1 (complete data loss)
if (any(Cell.loss < 0 | Cell.loss > 1)) {
stop("Inappropriate values for data loss proportions. All values must be between 0 and 1.") }
# check correlation matrices
if (any(unlist(R) < -1 | unlist(R) > 1) & nW) {
stop("Inappropriate values for correlation matrix. All values must be between -1 and 1.")
}
N <- n*nBcomb # total number of subjects
# ... generate dataframe compatible with means, standard deviation, and cors for w-s factor
dat.w <- matrix(rnorm(nWcomb * N), N)
# check correlation matrix
if (nW) {
# put single matrix in list (if there are no between-subject factors)
if (!is.list(R)) {
if (!nB) {
R <- list(R)
} else {
# use single matrix for all correlation matrices
R <- replicate(nBcomb, list(R))
warning("Using identical correlation matrix for all groups.")
oneCorMat <- TRUE # one correlation matrix (or one single number) for all correlations
}
} else { # test length of correlation list R
if (nB & length(R) != nBcomb) stop("Inappropriate matrix list 'R' specified. 'R' must be a list with a length according to the number of (between-subject) groups or a single matrix or a single number.")
oneCorMat <- FALSE # multiple correlation matrices (or single values)
}
for (group in 1 : nBcomb) {
if (!all(dim(as.matrix(R[[group]])) == c(nWcomb, nWcomb))) {
# does the inanpropriate matrix contain more than a single value?
if (!all(dim(as.matrix(R[[group]])) == c(1, 1))) {
warning(paste("Inanpropriate dimensions of ", group, ". correlation matrix. Using first value for all correlations (except the diagonal).", sep = ""))
} else if (nB) if (!oneCorMat) warning("Using ", R[[group]][1], " as value for all correlations in group ", group, ".\n", sep ="") # not if there is a single matrix/value
# create correlation matrix
R[[group]] <- matrix(R[[group]][1], nWcomb, nWcomb)
diag(R[[group]]) <- 1
} else {
# since only the values below the diagonal are relevant, the values above are replaced by them
R[[group]][upper.tri(R[[group]])] <- R[[group]][lower.tri(R[[group]])]
diag(R[[group]]) <- 1
}
}
} else R <- replicate(nBcomb, list(1)) # if there are no correlations (no within-subject factors)
for (group in 1 : nBcomb) {
# group index
ix <- (n * (group - 1) + 1) : (n * group)
# function mvrnorm() from MASS package
dat.w[ix, ] <- mvrnorm(n = n, mu = rep(0, nWcomb), Sigma = R[[group]], empirical = empirical)
}
# remove additional subjects (if n < prod(W))
if (n.dif != 0) {
ix.rem <- NULL #lines to be removed
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
ix <- ix[(n - n.dif + 1) : n] # lines with additional subjects (to be removed)
ix.rem <- c(ix.rem, ix)
}
dat.w <- dat.w[setdiff(1:N, ix.rem), ] # remove additional subject lines
n <- n - n.dif # give n its actual value back
N <- n * prod(B) # the actual value of N
}
# ... between-group effect
if (nB) {
#dat.b <- expand.grid(id=1:n, group=Blabel)
dat.b_small <- expand.grid(rev(Blabel))
dat.b <- data.frame(matrix(NA, N, nB))
colnames(dat.b) <- paste("B", LETTERS[1:nB], sep ="_")
for (i in nB:1) {
dat.b[ , nB + 1 - i] <- rep(dat.b_small[, i], each = n)
}
dat.b$id <- factor(1:N)
#dat.b$id <- factor(paste(dat.b$group, dat.b$id, sep=""))
} else {
dat.b <- data.frame(id = factor(1:N)) # a data frame containing the subject id
}
# do operations according distribution family
if (tolower(family) == "gaussian") {
# ... put mean and standard deviation in data
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
for (j in 1:n) {
dat.w[ix[j], ] <- dat.w[ix[j], ] * CellSDs[i, ]
dat.w[ix[j], ] <- dat.w[ix[j], ] + CellMeans[i, ]
}
}
} else if (tolower(family) == "gamma") {
if (any(CellMeans <= 0)) stop("Inappropriate mean for gamma distribution specified. Choose M > 0.")
if (any(CellSDs <= 0)) stop("Inappropriate standard deviation for gamma distribution specified. Choose SD > 0.")
# generate random beta distributed values
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
for (k in 1:nWcomb) {
# compute rank of gaussian distributed data of one factor level combination (between and within)
rg <- rank(dat.w[ix, k])
# calculate alpha (shape parameter) and theta (scale parameter) to obtain the desired mean and standard deviation
alpha <- CellMeans[i, k]^2 / CellSDs[i, k]^2
theta <- CellSDs[i, k]^2 / CellMeans[i, k]
# create random gamma data
dat.w[ix, k] <- rgamma(n, shape = alpha, scale = theta)
# apply rank of gaussian distributed values to ordered gamma distributed values
dat.w[ix, k] <- dat.w[ix, k][order(dat.w[ix, k])][rg]
}
}
} else if (tolower(family) == "beta") {
if (any(0 >= CellMeans | CellMeans >= 1)) stop("Mean of beta distribution must be in the closed interval (0, 1).")
if (any(0 >= CellSDs | CellSDs >= 0.5)) stop("Standard deviation of beta distribution must be in the closed interval (0, 0.5).")
# generate random beta distributed values
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines)
for (k in 1:nWcomb) {
# compute rank of gaussian distributed data of one factor level combination (between and within)
rg <- rank(dat.w[ix, k])
# calculate alpha1 (shape parameter) and alpha2 (shape parameter) to obtain the desired mean and standard deviation
alpha <- -((1 / CellSDs[i, k]^2) * CellMeans[i, k] * (CellMeans[i, k]^2 - CellMeans[i, k] + CellSDs[i, k]^2))
beta <- (1 / CellSDs[i, k]^2) * (CellMeans[i, k] - 1) * (CellMeans[i, k]^2 - CellMeans[i, k] + CellSDs[i, k]^2)
# create random beta data
dat.w[ix, k] <- rbeta(n, alpha, beta)
if (any(is.na(dat.w[ix, k]))) {
stop(paste("Failed to create beta distribution with mean ", CellMeans[i, k], " and standard deviation ", CellSDs[i, k], ". Choose different parameters.", sep = "")) }
# apply rank of gaussian distributed values to ordered beta distributed values
dat.w[ix, k] <- dat.w[ix, k][order(dat.w[ix, k])][rg]
}
}
} else stop("Unknown distribution family; possible are gaussian, gamma, and beta distributions.")
# apply data loss randomly
# 1) change proportion to absolute value
Cell.loss <- round(Cell.loss * n)
# 2) apply within each cell of n subjects
for (i in 1:nBcomb) {
ix <- (n * (i - 1) + 1) : (n * i)
for (j in 1:nWcomb) {
remove <- sample(1:n, Cell.loss[i, j], replace = FALSE)
dat.w[ix, j][remove] <- NA
}
}
# ... combine dat.b and dat.w (or dat.l)
if (long | !nW) { # there is no wide format for between-subjects designs since each subject is associated with exactly 1 value (DV)
dat.l <- as.vector(as.matrix(dat.w)) #long format
dat <- NULL # this will be the final data frame
for (i in 1:nWcomb) {
dat <- rbind(dat, dat.b)
}
if (nW) {
# ... combine with information of within-subject factor levels combinations
Wcond <- rev(expand.grid(rev(Wlabel)))
names(Wcond) <- paste("W_", letters[1:nW], sep = "")
Wcond.l <- NULL
for (i in 1:N) { # span a data frame
Wcond.l <- rbind(Wcond.l, Wcond)
}
for (i in 1:nWcomb) { # modify data frame (each row of Wcond is repeated N times)
ix <- (N * (i - 1) + 1) : (N * i)
Wcond.l[ix, ] <- Wcond[i, ]
}
dat <- cbind(dat, Wcond.l) # combine between-subject information and within-subject labels
}
dat <- cbind(dat, dat.l) # combine (between-subject information and within-subject labels) and (measures (DV))
names(dat)[2 + nB + nW] <- "DV"
# order data frame along id
dat <- data.frame(dat[order(dat$id), ], row.names = NULL)
} else {
colnames(dat.w) <- Wlabel_comb
dat <- cbind(dat.b, dat.w)
}
} else dat <- NULL # if there are neither between- nor within-subject factors
return(dat)
} # end of function