######## GENERATE A DATAFRAME WITH SIMULATED DATA ########################################### # Sven Hohenstein, Reinhold Kliegl, 2009-2015 # Version 0.6.3 (June 2015) # # changes: # - removed all `cat`s ################ # INPUT: # # B: vector of between-subject factors levels (it's length equals the number of between-subject factors) # (you can use B = c() or B = NULL if there are no between-subject factors) # W: vector of within-subject factors levels (it's length equals the number of within-subject factors) # (you can use W = c() or W = NULL if there are no within-subject factors) # n: number of observations within each between-subjects factor level combination (natural number) # (the total number of observations equals n * prod(B); the minimum n equals prod(W), but can not be lower than 2) # M: Matrix specifying the cell means of crossing between- and within-subject factors (dimensions: prod(B)-by-prod(W) matrix) # (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix) # OR # a scalar (single number) that will be the mean of all cells # SD: Matrix specifying the cell standard deviations of crossing between- and within-subject factors (dimensions: prod(B)-by-prod(W) matrix) # (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix) # OR # a scalar (single number) that will be the standard deviation of all cells # R: List of matrices specifying the correlations of the witin-subject factor level combinations # (list length: prod(B); dimensions of matrices: prod(W)-by-prod(W)) # OR # a singe matrix that will be used for the correlation of all within-subject factor level combinations # OR # a scalar (single number) that will be the correlation of all within-subject factor level combinations (except the diagonal) # OR # a list of length prod(B) containing only single numbers or single numbers and matrices # Note: R is is ignored if there are no within-subject factors # miss: Matrix specifying the proportion of random data loss within each cell of crossing between- and within-subject factors # (dimensions: prod(B)-by-prod(W) matrix) # (for pure within-subjects designs, it is possible to input a vector of size prod(W) as an alternative to a 1-by-prod(W) matrix) # OR # a scalar (single number), [0, 1], that will be the relative amount of data loss in each cell # long: if TRUE each line of the resulting data frame contains of one value of the dependent variable # if FALSE all (within-subject) measures of one subject are in one line of the resulting data frame # family: string specifying the output distribution; "gaussian", "gamma", or "beta" # empirical: if TRUE the random values will match the specified properties exactly (at least for gaussian data) # if FALSE all values will be drawn randomly from a distribution with the specified properties ################ ################ # OUTPUT: # # data frame containing the labels for the between- and-within factors, the subject (id) numbers, and the measurements obtained randomly from a normal distribution # (the values of the dependent variable are random BUT they match the mean, standard deviation, and correlation values of the input) ################ ################ # Please note: # This works perfectly for guassian distributions. # But note two suboptimal instances occuring with gamma and beta distributions: # 1) Obtained means and standard deviations will slightly differ from the input. The deviation decreases with the number of subjects (observations). # 2) Obtained correlations will slightly differ from the input. The deviation decreases with the number of subjects (observations). # Whereas the classical Pearson correlation statistic is informative for gaussian distributions, it is hardly convenient for gamma and beta distributions. # If a gamma or beta distribution is used, the Spearman rank correlation is the appropriate statistic. This method is used if cor() is called with the parameter method="spearman". # The obtained Spearman correlations will be more close to the input values than Pearson correlations. ################ mixedDesign <- function(B = NULL, W = NULL, n = ifelse(is.null(W), 2, prod(W)), M = 0, SD = 1, R = 0, miss = 0, family = "gaussian", long = FALSE, empirical = TRUE) { if (class(try(library(MASS), silent = FALSE)) == "try-error") stop("MASS package is necessary for this function. Download and install it.") ##### catch wrong input, e.g., B <- c(1, 2, 3) if (any(B < 2)) { # remove illegal conditions (less than 2 conditions per factor) warning("Removing illegal between-subject factors.") ix <- which(B < 2) B <- B[- ix] } nB <- length(B) if (any(W < 2)) { # remove illegal conditions (less than 2 conditions per factor) warning("Removing illegal within-subject factors.") ix <- which(W < 2) W <- W[- ix] } nW <- length(W) # number of within-subject factors if (nB || nW) { nBcomb <- 1 # number of within-subject factor level combinations # this start value remains constant (1) for cases of pure within-subject designs since there is no "X by 0" design if (nB) { # this segment is ignored if there are no between-subject factors Blabel <- NULL # list with item labels for (i in 1:nB) { Blabel[[i]] <- paste(LETTERS[i], 1:B[i], sep = "") nBcomb <- nBcomb * B[i] } } nWcomb <- 1 # number of within-subject factor level combinations # this start value remains constant (1) for cases of pure between-subject designs since there is no "X by 0" design if (nW) { # this segment is ignored if there are no within-subject factors Wlabel <- NULL # list with item labels Wlabel_comb <- NULL # combination of factor level items for (i in 1:nW) { Wlabel_comb <- rep(Wlabel_comb, each = W[[i]]) Wlabel[[i]] <- paste(letters[i], 1:W[i], sep = "") if (i == 1) Wlabel_comb <- Wlabel[[i]] else Wlabel_comb <- paste(Wlabel_comb, rep(Wlabel[[i]], nWcomb), sep = "_") # arrangement of combinations nWcomb <- nWcomb * W[i] } # add "W" to the labels of the within factors Wlabel_comb <- paste("W_", Wlabel_comb, sep ="") } if (n < 2) stop("Not enough subjects for the given design") if (n < nWcomb) { # if subjects are not sufficient to produce data frame, additional subjects are added and later removed warning("Not enough subjects for the given design:\n the resulting means, standard deviations, and correlations may differ from the specified ones") n.dif <- nWcomb - n # subjects to remove n <- nWcomb } else n.dif <- 0 # no subjects to remove # Means: allow vectors as an alternative to 1-by-X matrices if (!nB && !is.matrix(M)) { M <- as.matrix(M) # necessary for comparing dimensions dim(M) <- rev(dim(M)) # change vector to 1-by-X matrix } if (!all(dim(as.matrix(M)) == c(nBcomb, nWcomb))) { #generate matrix of mean values) if (!all(dim(as.matrix(M)) == c(1, 1))) warning("Inanpropriate matrix of means. Using first value for all cell means.") # more than one element CellMeans <- matrix(M[1], nBcomb, nWcomb) } else CellMeans <- M # SDs: allow vectors as an alternative to 1-by-X matrices if (!nB && !is.matrix(SD)) { SD <- as.matrix(SD) # necessary for comparing dimensions dim(SD) <- rev(dim(SD)) # change vector to 1-by-X matrix } if (!all(dim(as.matrix(SD)) == c(nBcomb, nWcomb))) { #generate matrix of sd values) if (!all(dim(as.matrix(SD)) == c(1, 1))) warning("Inanpropriate dimensions of matrix of SDs. Using first value for all cell SDs.") CellSDs <- matrix(SD[1], nBcomb, nWcomb) } else CellSDs <- SD if (any(CellSDs < 0)) stop("Negative values are not allowed for standard deviation parameter.") if (!nB && !is.matrix(miss)) { miss <- as.matrix(miss) # necessary for comparing dimensions dim(miss) <- rev(dim(miss)) # change vector to 1-by-X matrix } if (!all(dim(as.matrix(miss)) == c(nBcomb, nWcomb))) { #generate matrix of data loss proportions) if (!all(dim(as.matrix(miss)) == c(1, 1))) warning("Inanpropriate dimensions of matrix of data loss proportions. Using first value for all cell data loss proportions.") Cell.loss <- matrix(miss[1], nBcomb, nWcomb) } else Cell.loss <- miss # checking data loss values, must be between 0 (no data loss) and 1 (complete data loss) if (any(Cell.loss < 0 | Cell.loss > 1)) { stop("Inappropriate values for data loss proportions. All values must be between 0 and 1.") } # check correlation matrices if (any(unlist(R) < -1 | unlist(R) > 1) & nW) { stop("Inappropriate values for correlation matrix. All values must be between -1 and 1.") } N <- n*nBcomb # total number of subjects # ... generate dataframe compatible with means, standard deviation, and cors for w-s factor dat.w <- matrix(rnorm(nWcomb * N), N) # check correlation matrix if (nW) { # put single matrix in list (if there are no between-subject factors) if (!is.list(R)) { if (!nB) { R <- list(R) } else { # use single matrix for all correlation matrices R <- replicate(nBcomb, list(R)) warning("Using identical correlation matrix for all groups.") oneCorMat <- TRUE # one correlation matrix (or one single number) for all correlations } } else { # test length of correlation list R if (nB & length(R) != nBcomb) stop("Inappropriate matrix list 'R' specified. 'R' must be a list with a length according to the number of (between-subject) groups or a single matrix or a single number.") oneCorMat <- FALSE # multiple correlation matrices (or single values) } for (group in 1 : nBcomb) { if (!all(dim(as.matrix(R[[group]])) == c(nWcomb, nWcomb))) { # does the inanpropriate matrix contain more than a single value? if (!all(dim(as.matrix(R[[group]])) == c(1, 1))) { warning(paste("Inanpropriate dimensions of ", group, ". correlation matrix. Using first value for all correlations (except the diagonal).", sep = "")) } else if (nB) if (!oneCorMat) warning("Using ", R[[group]][1], " as value for all correlations in group ", group, ".\n", sep ="") # not if there is a single matrix/value # create correlation matrix R[[group]] <- matrix(R[[group]][1], nWcomb, nWcomb) diag(R[[group]]) <- 1 } else { # since only the values below the diagonal are relevant, the values above are replaced by them R[[group]][upper.tri(R[[group]])] <- R[[group]][lower.tri(R[[group]])] diag(R[[group]]) <- 1 } } } else R <- replicate(nBcomb, list(1)) # if there are no correlations (no within-subject factors) for (group in 1 : nBcomb) { # group index ix <- (n * (group - 1) + 1) : (n * group) # function mvrnorm() from MASS package dat.w[ix, ] <- mvrnorm(n = n, mu = rep(0, nWcomb), Sigma = R[[group]], empirical = empirical) } # remove additional subjects (if n < prod(W)) if (n.dif != 0) { ix.rem <- NULL #lines to be removed for (i in 1:nBcomb) { ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines) ix <- ix[(n - n.dif + 1) : n] # lines with additional subjects (to be removed) ix.rem <- c(ix.rem, ix) } dat.w <- dat.w[setdiff(1:N, ix.rem), ] # remove additional subject lines n <- n - n.dif # give n its actual value back N <- n * prod(B) # the actual value of N } # ... between-group effect if (nB) { #dat.b <- expand.grid(id=1:n, group=Blabel) dat.b_small <- expand.grid(rev(Blabel)) dat.b <- data.frame(matrix(NA, N, nB)) colnames(dat.b) <- paste("B", LETTERS[1:nB], sep ="_") for (i in nB:1) { dat.b[ , nB + 1 - i] <- rep(dat.b_small[, i], each = n) } dat.b$id <- factor(1:N) #dat.b$id <- factor(paste(dat.b$group, dat.b$id, sep="")) } else { dat.b <- data.frame(id = factor(1:N)) # a data frame containing the subject id } # do operations according distribution family if (tolower(family) == "gaussian") { # ... put mean and standard deviation in data for (i in 1:nBcomb) { ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines) for (j in 1:n) { dat.w[ix[j], ] <- dat.w[ix[j], ] * CellSDs[i, ] dat.w[ix[j], ] <- dat.w[ix[j], ] + CellMeans[i, ] } } } else if (tolower(family) == "gamma") { if (any(CellMeans <= 0)) stop("Inappropriate mean for gamma distribution specified. Choose M > 0.") if (any(CellSDs <= 0)) stop("Inappropriate standard deviation for gamma distribution specified. Choose SD > 0.") # generate random beta distributed values for (i in 1:nBcomb) { ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines) for (k in 1:nWcomb) { # compute rank of gaussian distributed data of one factor level combination (between and within) rg <- rank(dat.w[ix, k]) # calculate alpha (shape parameter) and theta (scale parameter) to obtain the desired mean and standard deviation alpha <- CellMeans[i, k]^2 / CellSDs[i, k]^2 theta <- CellSDs[i, k]^2 / CellMeans[i, k] # create random gamma data dat.w[ix, k] <- rgamma(n, shape = alpha, scale = theta) # apply rank of gaussian distributed values to ordered gamma distributed values dat.w[ix, k] <- dat.w[ix, k][order(dat.w[ix, k])][rg] } } } else if (tolower(family) == "beta") { if (any(0 >= CellMeans | CellMeans >= 1)) stop("Mean of beta distribution must be in the closed interval (0, 1).") if (any(0 >= CellSDs | CellSDs >= 0.5)) stop("Standard deviation of beta distribution must be in the closed interval (0, 0.5).") # generate random beta distributed values for (i in 1:nBcomb) { ix <- (n * (i - 1) + 1) : (n * i) # subjects within one between-subject factor level combination (lines) for (k in 1:nWcomb) { # compute rank of gaussian distributed data of one factor level combination (between and within) rg <- rank(dat.w[ix, k]) # calculate alpha1 (shape parameter) and alpha2 (shape parameter) to obtain the desired mean and standard deviation alpha <- -((1 / CellSDs[i, k]^2) * CellMeans[i, k] * (CellMeans[i, k]^2 - CellMeans[i, k] + CellSDs[i, k]^2)) beta <- (1 / CellSDs[i, k]^2) * (CellMeans[i, k] - 1) * (CellMeans[i, k]^2 - CellMeans[i, k] + CellSDs[i, k]^2) # create random beta data dat.w[ix, k] <- rbeta(n, alpha, beta) if (any(is.na(dat.w[ix, k]))) { stop(paste("Failed to create beta distribution with mean ", CellMeans[i, k], " and standard deviation ", CellSDs[i, k], ". Choose different parameters.", sep = "")) } # apply rank of gaussian distributed values to ordered beta distributed values dat.w[ix, k] <- dat.w[ix, k][order(dat.w[ix, k])][rg] } } } else stop("Unknown distribution family; possible are gaussian, gamma, and beta distributions.") # apply data loss randomly # 1) change proportion to absolute value Cell.loss <- round(Cell.loss * n) # 2) apply within each cell of n subjects for (i in 1:nBcomb) { ix <- (n * (i - 1) + 1) : (n * i) for (j in 1:nWcomb) { remove <- sample(1:n, Cell.loss[i, j], replace = FALSE) dat.w[ix, j][remove] <- NA } } # ... combine dat.b and dat.w (or dat.l) if (long | !nW) { # there is no wide format for between-subjects designs since each subject is associated with exactly 1 value (DV) dat.l <- as.vector(as.matrix(dat.w)) #long format dat <- NULL # this will be the final data frame for (i in 1:nWcomb) { dat <- rbind(dat, dat.b) } if (nW) { # ... combine with information of within-subject factor levels combinations Wcond <- rev(expand.grid(rev(Wlabel))) names(Wcond) <- paste("W_", letters[1:nW], sep = "") Wcond.l <- NULL for (i in 1:N) { # span a data frame Wcond.l <- rbind(Wcond.l, Wcond) } for (i in 1:nWcomb) { # modify data frame (each row of Wcond is repeated N times) ix <- (N * (i - 1) + 1) : (N * i) Wcond.l[ix, ] <- Wcond[i, ] } dat <- cbind(dat, Wcond.l) # combine between-subject information and within-subject labels } dat <- cbind(dat, dat.l) # combine (between-subject information and within-subject labels) and (measures (DV)) names(dat)[2 + nB + nW] <- "DV" # order data frame along id dat <- data.frame(dat[order(dat$id), ], row.names = NULL) } else { colnames(dat.w) <- Wlabel_comb dat <- cbind(dat.b, dat.w) } } else dat <- NULL # if there are neither between- nor within-subject factors return(dat) } # end of function