STATS 506 Assignment 2 Question 4

Description: This script compare two ways of generating confidence interval for median, one using non-parametric bootstrp with 1000 bootstrap samples, one by robust estimator. The two different confidence interval is used in Morte Carlo simulation for converage probablity and width, for different distributions and sample sizes.

bootmedian <- function(x, rep = 1000) {
  n = length(x) # sample size for each bootstrap sample
  bootsample = sample(x, n*rep, replace = TRUE) # generate bootstrap sample by sampling with replacement
  dim(bootsample) = c(rep, n) # each row is one bootstrap sample, there are 1000 rows
  boot.median = apply(bootsample, 1, median) # compute median for each bootstrap sample
  lowerci = quantile(boot.median, 0.025) # 95% upper confidence bound
  upperci = quantile(boot.median, 0.975) # 95% lower confidence bound 
  return(c(lowerci, upperci)) # return upper and lower confidence bound as a vector
}

bootmedianci <- function(xmat, target) {
  medianci = apply(xmat, 1, bootmedian) # get bootstrap 95% confidence interval for median
  cover_p = mean((medianci[1,] <= target) & (medianci[2,] >= target)) # coverage probability 
  width = mean(medianci[2, ] - mean(medianci[1, ])) # average width of 1000 bootstrap confidence interval
  return(c(cover_p, width)) # return coverage probability and width of confidence interval
}

robustmedianci <- function(xmat, target) {
  n = dim(xmat)[2] # sample size for each observation
  samplemedian = apply(xmat, 1, median) # median for each sample
  absdeviation = abs(xmat - samplemedian) # absolute deviation from median
  median_absdev = apply(absdeviation, 1, median) # median absolute deviation
  m = qnorm(1-(1-0.95)/2) # 2.5% quantile of standard normal distribution
  lowerci = samplemedian - m*1.49*median_absdev/sqrt(n) # lower bound of 95% confidence interval
  upperci = samplemedian + m*1.49*median_absdev/sqrt(n) # upper bound of 95% confidence interval
  cover_p = mean((lowerci < target) & (upperci > target)) # coverage probability
  width = mean(upperci-lowerci) # average width of 95% confidence interval for median
  return(c(cover_p, width))
}

The above 3 functions are used to generate bootstrap and robust confidence interval for median.

n = 200 # number of samples
samplesize = c(10, 25, 50, 100) #sample size
library(ggplot2)

# normal (0,5)
normalprob = matrix(0, 4, 2) # matrix to save coverage probability for normal
normalwidth = matrix(0, 4, 2) # matrix to save CI width for normal
for (i in 1:4) {
  k = samplesize[i] # sample size
  x = rnorm(n*k, 0, 3) # sample from normal distribution of mean 0 and sd 5
  dim(x) = c(n, samplesize[i]) # get sample matrix in correct dimension
  normalprob[i,1] = bootmedianci(x, 0)[1]
  normalwidth[i,1] = bootmedianci(x, 0)[2]
  normalprob[i,2] = robustmedianci(x, 0)[1]
  normalwidth[i,2] = robustmedianci(x, 0)[2]
}
normalprob <- data.frame(SampleSize = samplesize, Bootstrap = normalprob[,1], Robust = normalprob[,2])
normalwidth <- data.frame(SampleSize = samplesize, Bootstrap = normalwidth[,1], Robust = normalwidth[,2])

# Exponential(0.5)
expprob = matrix(0, 4, 2) # matrix to save coverage probability for normal
expwidth = matrix(0, 4, 2) # matrix to save CI width for normal
for (i in 1:4) {
  k = samplesize[i] # sample size
  x = rexp(n*k, 0.5) # sample from normal distribution of mean 0 and sd 5
  target = qexp(0.5, 0.5)
  dim(x) = c(n, samplesize[i]) # get sample matrix in correct dimension
  expprob[i,1] = bootmedianci(x, target)[1]
  expwidth[i,1] = bootmedianci(x, target)[2]
  expprob[i,2] = robustmedianci(x, target)[1]
  expwidth[i,2] = robustmedianci(x, target)[2]
}
expprob <- data.frame(SampleSize = samplesize, Bootstrap = expprob[,1], Robust = expprob[,2])
expwidth <- data.frame(SampleSize = samplesize, Bootstrap = expwidth[,1], Robust = expwidth[,2])

# uniform (-5,5)
unifprob <- matrix(0, 4, 2) # matrix to save coverage probability for normal
unifwidth <- matrix(0, 4, 2) # matrix to save CI width for normal
for (i in 1:4) {
  k = samplesize[i] # sample size
  x = runif(n*k, -5, 5) # sample from normal distribution of mean 0 and sd 5
  target = qunif(0.5, -5, 5)
  dim(x) = c(n, samplesize[i]) # get sample matrix in correct dimension
  unifprob[i,1] = bootmedianci(x, target)[1]
  unifwidth[i,1] = bootmedianci(x, target)[2]
  unifprob[i,2] = robustmedianci(x, target)[1]
  unifwidth[i,2] = robustmedianci(x, target)[2]
}
unifprob <- data.frame(SampleSize = samplesize, Bootstrap = unifprob[,1], Robust = unifprob[,2])
unifwidth <- data.frame(SampleSize = samplesize, Bootstrap = unifwidth[,1], Robust = unifwidth[,2])

# Poisson(2)
poisprob <- matrix(0, 4, 2) # matrix to save coverage probability for normal
poiswidth <- matrix(0, 4, 2) # matrix to save CI width for normal
for (i in 1:4) {
  k = samplesize[i] # sample size
  x = rpois(n*k, 2) # sample from normal distribution of mean 0 and sd 5
  target = qpois(0.5, 2)
  dim(x) = c(n, samplesize[i]) # get sample matrix in correct dimension
  poisprob[i,1] = bootmedianci(x, target)[1]
  poiswidth[i,1] = bootmedianci(x, target)[2]
  poisprob[i,2] = robustmedianci(x, target)[1]
  poiswidth[i,2] = robustmedianci(x, target)[2]
}
poisprob <- data.frame(SampleSize = samplesize, Bootstrap = poisprob[,1], Robust = poisprob[,2])
poiswidth <- data.frame(SampleSize = samplesize, Bootstrap = poiswidth[,1], Robust = poiswidth[,2])

The above code generate coverage probability and average width of confidence interval for each distribution and each sample size(10, 25, 50, 100)

require(gridExtra)

## Loading required package: gridExtra

p1 = ggplot(data = normalprob, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) +
  xlab("Sample Size") + ylab("Coverage Probability") + 
  ggtitle("Coverage Probability for Normal Distribution(0, 5)")
p2 = ggplot(data = normalwidth, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) +
  xlab("Sample Size") + ylab("Width of 95% Confidence Interval") + 
  ggtitle("Width of Confidence Interval for Normal Distribution(0, 5)")
grid.arrange(p1, p2, nrow=2)

p3 = ggplot(data = expprob, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) +
  xlab("Sample Size") + ylab("Coverage Probability") + 
  ggtitle("Coverage Probability for Exponential Distribution(0.5)")
p4 = ggplot(data = expwidth, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) +
  xlab("Sample Size") + ylab("Waidth of 95% Confidence Interval") + 
  ggtitle("Width of Confidence Interval for Exponential Distribution(0.5)")
grid.arrange(p3, p4, nrow=2)

p5 = ggplot(data = unifprob, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) + 
  xlab("Sample Size") + ylab("Coverage Probability") + 
  ggtitle("Coverage Probability for Uniform Distribution(-5,5)")
p6 = ggplot(data = unifwidth, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) +
  xlab("Sample Size") + ylab("Width of 95% Confidence Interval") + 
  ggtitle("Width of Confidence Interval for Uniform Distribution(-5,5)")
grid.arrange(p5, p6, nrow=2)

p7 = ggplot(data = poisprob, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) +
  xlab("Sample Size") + ylab("Coverage Probability") + 
  ggtitle("Coverage Probability for Poisson Distribution(2)")
p8 = ggplot(data = poiswidth, aes(x = SampleSize, y = value, color = variable)) +
  geom_line(aes(y = Bootstrap, col = "Bootstrap"), size = 1.5) +
  geom_line(aes(y = Robust, col = "Robust"), size = 1.5) +
  xlab("Sample Size") + ylab("Width of 95% Confidence Interval") + 
  ggtitle("Width of Confidence Interval for Poisson Distribution(2)")
grid.arrange(p7, p8, nrow=2)

Observing coverage probability and width of 95% confidence interval for median, bootstrap confidence interval usually has a higher coverage probability and greater width than robust confidence interval. Width for confidence interval also decreases when sample size increases across distribution and method to generate confidence interval.

STATS 506 Assignment 2 Question 4

Yunbin Peng