Author:
Yunbin Peng
Data set:
Simulated data
https://jbhender.github.io/Stats506/ps4_q4.RData
Description:
This script implement parallel comting to estimate out-of-sample prediction error using k-hold cross validation in R on logistic regression on a simulated data set.

Part 4bc

x = load("./ps4_q4.RData")
library(mgcv)
library(parallel)

xvalidate = function(x=sample_data, folds = 10, cores = 1L){
  start = proc.time()
  # number of observation
  n = nrow(x)
  
  # randomize index and determine test set index for each fold
  index = sample(n)
  dim(index) = c(n/folds, folds)
  
  # function to get root mean square error
  MSE = function(x,y) {
    return(mean((x-y)^2))
  }
  
  # parallel implementation of k-fold cross validation
  result = mclapply(1:folds, 
                    # function for each fold cross validation
                    function(i){
                      # index for test set
                      test.index = index[,i]
        
                      # get test set
                      testset = x[test.index,]
                      
                      # get training set
                      trainset = x[-test.index,]
                      
                      # run logistic regression on training set
                      model = gam(y~x, family = binomial(link = "logit"), 
                                  data = trainset)
                      
                      # use model from training set to predict y for test set
                      predict_y = predict.gam(model, testset)
                      
                      
                      # compare y and predicted y in test set
                      error = MSE(predict_y, testset$y)
                      
                      return(error)
                    }, 
                    # specify number of cores used
                    mc.cores = cores)
  end = proc.time()
  runtime = end - start
  meanerror = mean(unlist(result))
  return(list("cores" = cores,
              "fold" = folds,
              "mean error" = meanerror,
              "run time" = runtime)
         )
}

# get command on flux and read it 
argument = commandArgs(TRUE)
for (i in 1:length(argument)){
  eval(parse(text = argument[i]))
}

xvalidate(folds = n_fold, cores = n_core)

Running with 1 core on flux (output from Rout file on flux)
$cores
[1] 1
$fold
[1] 10

$mean error
[1] 0.2356253

$run time
user system elapsed
1.183 0.016 6.454

Running with 4 cores on flux (output from Rout file on flux)
$cores
[1] 4
$fold
[1] 10

$mean error
[1] 0.2358616

$run time
user system elapsed
0.964 0.240 2.904

Prediction error for 10-hold cross validation is about 0.236.

Part d

Prediction error: 0.2353646

runtime
user system elapsed
1.089 0.296 1.670

Part e

Number of folds does not have a big impact on prediction error, however more folds increase runtime (although not as much as percentage change in number of folds).