Simulation: How many covariates are too many?

# Define function
sim_covars <- function(n = 20, diff = 0, sd_y = 1, 
                       rho_covars = c(0.7, 0.3, 0)) {
  
  # Parameters
  # n <- 20   # observations per group
  # diff <- 0 # difference between groups
  # sd_y <- 3 # within-group sd of outcome (population-level)
  # rho_covars <- c(0.7, 0.3, 0) # correlation of control variables with outcome
                                 # in control group; add more coefficients to
                                 # include more covariates
  
  # In this simulation, the control variables are *not* intercorrelated.
  
  # Construct variance-covariance matrix
  k_covars <- length(rho_covars)
  sds <- c(sd_y, rep(1, k_covars))
  cor_matrix <- matrix(data = 0, nrow = 1 + k_covars, ncol = 1 + k_covars)
  diag(cor_matrix) <- 1
  cor_matrix[1, ] <- c(1, rho_covars)
  cor_matrix[, 1] <- c(1, rho_covars)
  cov_matrix <- cor_matrix * tcrossprod(sds)
  
  # Generate data
  d <- MASS::mvrnorm(n = 2*n, mu = rep(0, 1 + k_covars),
                     Sigma = cov_matrix)
  d <- as.data.frame(d)
  d$V1[1:n] <- d$V1[1:n] + diff
  d$Condition <- rep(c("experimental", "control"), each = n)
  
  # Fit model with all covariates
  m1 <- lm(V1 ~ ., data = d)
  
  # Fit model without covariates
  m2 <- lm(V1 ~ Condition, data = d)
  
  # Results
  results <- list(
    estimate_covar = unname(coef(m1)[2 + k_covars]),
    se_covar = summary(m1)$coefficients[2 + k_covars, 2],
    ci_lo_covar = confint(m1)[2 + k_covars, 1],
    ci_hi_covar = confint(m1)[2 + k_covars, 2],
    
    estimate_nocovar = unname(coef(m2)[2]),
    se_nocovar = summary(m2)$coefficients[2, 2],
    ci_lo_nocovar = confint(m2)[2, 1],
    ci_hi_nocovar = confint(m2)[2, 2]
  )
  
  results
}

# Run function 10k times
sim_results <- matrix(ncol = 8, nrow = 1e4)

for (i in 1:1e4) {
  sim_results[i, ] <- unlist(sim_covars(n = 8, rho_covars = c(0.5, 0.2, 0)))
}

# Let's compare the widths of the 95% CIs
ci_width_covar <- sim_results[, 4] - sim_results[, 3]
ci_width_nocovar <- sim_results[, 8] - sim_results[, 7]
difference_width <- ci_width_nocovar - ci_width_covar
hist(difference_width)

summary(difference_width)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -4.24478 -0.21508  0.03602  0.04967  0.31315  2.16452

# On average, slightly wider CI width when ignoring covariates,
# so not harmful.

# Now with two useless covariates and one with hardly any use
for (i in 1:1e4) {
  sim_results[i, ] <- unlist(sim_covars(n = 8, rho_covars = c(0.1, 0, 0)))
}
ci_width_covar <- sim_results[, 4] - sim_results[, 3]
ci_width_nocovar <- sim_results[, 8] - sim_results[, 7]
difference_width <- ci_width_nocovar - ci_width_covar
hist(difference_width)

summary(difference_width)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.8266 -0.4990 -0.2935 -0.3184 -0.1013  1.0229

# Now narrower CIs when ignoring covariates, so useless covariates are harmful.

# Upping n a bit.
for (i in 1:1e4) {
  sim_results[i, ] <- unlist(sim_covars(n = 15, rho_covars = c(0.1, 0, 0)))
}
ci_width_covar <- sim_results[, 4] - sim_results[, 3]
ci_width_nocovar <- sim_results[, 8] - sim_results[, 7]
difference_width <- ci_width_nocovar - ci_width_covar
hist(difference_width)

summary(difference_width)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.80265 -0.14614 -0.08467 -0.08732 -0.02564  0.43218

# Same tendency, if less pronounced.

# Upping rho a bit for one variable.
for (i in 1:1e4) {
  sim_results[i, ] <- unlist(sim_covars(n = 15, rho_covars = c(0.35, 0, 0)))
}
ci_width_covar <- sim_results[, 4] - sim_results[, 3]
ci_width_nocovar <- sim_results[, 8] - sim_results[, 7]
difference_width <- ci_width_nocovar - ci_width_covar
hist(difference_width)

summary(difference_width)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -0.713681 -0.079998 -0.002944  0.008030  0.090197  0.774731

# Now pretty much no harm, no benefit either.

Simulation: How many covariates are too many?

Jan Vanhove, https://janhove.github.io

15 October 2019