Dataset used:
NYCflights14
Data URL:
https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv
Description:
This script uses data.table package to analyse flight departure and arrival delay for flights leaving 3 New York City airports in year 2014.
Data Preparation
# load packages
library(data.table)
library(ggplot2)
library(dplyr)
library(tidyverse)
library(knitr)
# load data set as nyc14
nyc14 = fread(
'https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv')
a
# compute mean departure delay by carrier and month
avg.dep.delay = nyc14 %>%
.[, .(avg_dep_delay = mean(dep_delay)), by = .(carrier, month)]
avg.dep.delay
## carrier month avg_dep_delay
## 1: AA 1 12.724056
## 2: AS 1 4.734694
## 3: B6 1 30.802756
## 4: DL 1 29.568240
## 5: EV 1 30.469170
## ---
## 130: WN 10 12.118060
## 131: UA 10 11.619975
## 132: OO 10 14.135135
## 133: US 10 2.955224
## 134: HA 10 -1.727273
# spaghetti plot for every month for each carrier
ggplot(data = avg.dep.delay,
aes(x = month, y = avg_dep_delay, group = carrier, color = carrier)) +
geom_line(size = 1) +
scale_x_continuous(breaks = round(seq(min(avg.dep.delay$month), max(avg.dep.delay$month),
by = 1),1)) +
labs(x = "Month", y = "Average departure delay",
title = "Average departure delay by month and carrier")

b
# compute 90th arrival depay by carrier, origin and destination
arr.delay.90 = nyc14 %>%
.[, .(arr_delay_90 = quantile(arr_delay, 0.9)), by = .(carrier, origin, dest)]
# seperate data set into different tables for each origin
arr.delay.jfk = arr.delay.90[origin == "JFK"]
arr.delay.lga = arr.delay.90[origin == "LGA"]
arr.delay.ewr = arr.delay.90[origin == "EWR"]
# heat map for each carrier and destinations based on 90th quantile of arrival delay
ggplot(data = arr.delay.jfk, aes(x = carrier, y = dest, fill = arr_delay_90)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 150, name = "90th percentile\narrival delay") +
labs(y = "Destination", x = "Carrier",
title = "Heatmap for 90th percentile arrival delay in JFK")

ggplot(data = arr.delay.lga, aes(x = carrier, y = dest, fill = arr_delay_90)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 50, name = "90th percentile\narrival delay") +
labs(y = "Destination", x = "Carrier",
title = "Heatmap for 90th percentile arrival delay in LGA")

ggplot(data = arr.delay.ewr, aes(x = carrier, y = dest, fill = arr_delay_90)) +
geom_tile(color = "white") +
scale_fill_gradient2(low = "blue", high = "red", mid = "white",
midpoint = 100, name = "90th percentile\narrival delay") +
labs(y = "Destination", x = "Carrier",
title = "Heatmap for 90th percentile arrival delay in EWR")

c
# function to determine which period of day is the departure time
dept <- function(x) {
if (x <= 1159) return("0:00-11:59")
if (x >= 1200 & x <= 1759) return("12:00-17:59")
if (x >= 1800) return("18:00-23:59")
}
# compute average departure delay for each origin by time windows
time_dep_delay = nyc14 %>%
.[, "departure_time_window" := sapply(dep_time, dept)] %>%
.[, .("mean departure delay" = mean(dep_delay)),
by = .(origin, departure_time_window)] %>%
.[order(origin, departure_time_window)]
# Display result
knitr::kable(time_dep_delay, digits=2,
col.names = c('Origin', 'Departure Time Window', 'Average Departure Delay'))
EWR |
0:00-11:59 |
4.62 |
EWR |
12:00-17:59 |
13.91 |
EWR |
18:00-23:59 |
35.88 |
JFK |
0:00-11:59 |
4.61 |
JFK |
12:00-17:59 |
10.22 |
JFK |
18:00-23:59 |
24.49 |
LGA |
0:00-11:59 |
1.96 |
LGA |
12:00-17:59 |
10.37 |
LGA |
18:00-23:59 |
29.61 |
d
# function to determine delay category
delay.category = function(x) {
if (x <= 0) return("delay < 0 or = 0 min")
else if (x > 0 & x < 15) return("delay < 15 minutes")
else return("delay > 15 minutes")
}
# function to compute 95% confidence interval for mean
upper = function(x) {
return(mean(x) + 1.96 * sd(x)/sqrt(length(x)))
}
lower = function(x) {
return(mean(x) - 1.96 * sd(x)/sqrt(length(x)))
}
# Compute 95th confidence interval for mean relative air time
dep.delay = nyc14[, delay_category := sapply(dep_delay, delay.category)] %>%
.[, scale_air_time := (air_time - mean(air_time))/mean(air_time),
by = .(carrier, flight)] %>%
.[, .("lower confidence bound(mean air time)" = lower(scale_air_time),
"upper confidence bound" = upper(scale_air_time)),
by = delay_category] %>%
.[order(delay_category)]
# Display result
knitr::kable(dep.delay, digits=5,
col.names = c('Delay Category',
'Lower Confidence Bound(mean air time)',
'Upper Confidence Bound(mean air time)'))
delay < 0 or = 0 min |
-0.00413 |
-0.00237 |
delay < 15 minutes |
0.00353 |
0.00738 |
delay > 15 minutes |
0.00302 |
0.00638 |