Author:
Yunbin Peng
Data set:
Simulated data
https://jbhender.github.io/Stats506/ps4_q4.RData
Description:
This script implement parallel comting to estimate out-of-sample prediction error using k-hold cross validation in R on logistic regression on a simulated data set.
x = load("./ps4_q4.RData")
library(mgcv)
library(parallel)
xvalidate = function(x=sample_data, folds = 10, cores = 1L){
start = proc.time()
# number of observation
n = nrow(x)
# randomize index and determine test set index for each fold
index = sample(n)
dim(index) = c(n/folds, folds)
# function to get root mean square error
MSE = function(x,y) {
return(mean((x-y)^2))
}
# parallel implementation of k-fold cross validation
result = mclapply(1:folds,
# function for each fold cross validation
function(i){
# index for test set
test.index = index[,i]
# get test set
testset = x[test.index,]
# get training set
trainset = x[-test.index,]
# run logistic regression on training set
model = gam(y~x, family = binomial(link = "logit"),
data = trainset)
# use model from training set to predict y for test set
predict_y = predict.gam(model, testset)
# compare y and predicted y in test set
error = MSE(predict_y, testset$y)
return(error)
},
# specify number of cores used
mc.cores = cores)
end = proc.time()
runtime = end - start
meanerror = mean(unlist(result))
return(list("cores" = cores,
"fold" = folds,
"mean error" = meanerror,
"run time" = runtime)
)
}
# get command on flux and read it
argument = commandArgs(TRUE)
for (i in 1:length(argument)){
eval(parse(text = argument[i]))
}
xvalidate(folds = n_fold, cores = n_core)
Running with 1 core on flux (output from Rout file on flux)
$cores
[1] 1
$fold
[1] 10
$mean error
[1] 0.2356253
$run time
user system elapsed
1.183 0.016 6.454
Running with 4 cores on flux (output from Rout file on flux)
$cores
[1] 4
$fold
[1] 10
$mean error
[1] 0.2358616
$run time
user system elapsed
0.964 0.240 2.904
Prediction error for 10-hold cross validation is about 0.236.
Prediction error: 0.2353646
runtime
user system elapsed
1.089 0.296 1.670
Number of folds does not have a big impact on prediction error, however more folds increase runtime (although not as much as percentage change in number of folds).