STATS 506 Homework 3 Problem 4

Dataset used:
NYCflights14
Data URL:
https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv
Description:
This script has 2 main parts:
1. Display a network map of flights originating from the 3 NYC airports to their destinations, with the thickness of each edge (route) proportional to the weekly average frequency of flights, using the geom_curve function.
2. Compute pairwise distances between carriers based on the frequency of flights between airports and produce a 2-dimensional map of the carriers using multidimensional scaling. Then repeat using normalized frequencies and compare the results of the two approaches.

Data Preparation

# load packages
library(data.table)
library(ggplot2)
library(dplyr)
library(tidyverse)

# read the NYCflights14 CSV from its hosted location into a data.table
nyc14 = fread(
  input = "https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv"
)

a

# number of weeks: a 365-day year minus 30 + 31 days, expressed in weeks
# (the excluded days are presumably November and December, which the data
# set does not appear to cover -- confirm against the data)
weeks = (365 - (30 + 31)) / 7

# average weekly number of flights for every origin-destination pair:
# the row count of each pair (.N) divided by the number of weeks
weekly_flights = nyc14[, .(No_flights = .N / weeks), by = .(origin, dest)]

# show the first 10 rows as a formatted table
knitr::kable(
  weekly_flights[1:10],
  digits = 2,
  col.names = c("Origin", "Destination", "Number of flights per week ")
)

Origin	Destination	Number of flights per week
JFK	LAX	235.05
LGA	PBI	53.12
EWR	LAX	97.31
JFK	MIA	63.32
JFK	SEA	41.79
EWR	MIA	48.22
JFK	SFO	169.66
JFK	BOS	94.66
JFK	ORD	29.13
JFK	IAH	0.16

b

# load dataset from problem 3
# (restores the objects saved in airportCoord.RData into the workspace; the
# code below expects this to provide `airportCoord` with columns `airport`,
# `x` and `y` -- TODO confirm against problem 3)
load("airportCoord.RData")

# function to find mds coordinates for airport
# Look up the x coordinate of one airport.
#   target: airport code to look up (compared against the `airport` column)
#   data:   lookup table with columns `airport` and `x`; defaults to the
#           global `airportCoord`
# Returns the `x` value(s) of the rows whose `airport` equals `target`
# (a zero-length vector when nothing matches).
get_coord_x = function(target, data = airportCoord) {
  # Read from `data`, not the global table, so the argument is honoured.
  # which() drops NA comparisons, so rows with a missing airport never match.
  data[["x"]][which(data[["airport"]] == target)]
}
# Look up the y coordinate of one airport.
#   target: airport code to look up (compared against the `airport` column)
#   data:   lookup table with columns `airport` and `y`; defaults to the
#           global `airportCoord`
# Returns the `y` value(s) of the rows whose `airport` equals `target`
# (a zero-length vector when nothing matches).
get_coord_y = function(target, data = airportCoord) {
  # Read from `data`, not the global table, so the argument is honoured.
  # which() drops NA comparisons, so rows with a missing airport never match.
  data[["y"]][which(data[["airport"]] == target)]
}

# generate data set: weekly flights per route plus the coordinates of both
# endpoints and the frequency relative to the busiest route
#
# Work on a copy: `:=` adds columns by reference, so without copy() the new
# columns would also be added to weekly_flights itself and weekly_flights1
# would merely be a second name for the same table.
weekly_flights1 = copy(weekly_flights)

# vapply(..., numeric(1)) requires exactly one numeric coordinate per airport
# and stops with an error otherwise, where sapply() would silently produce a
# list column for an airport with no (or more than one) match.
weekly_flights1[, origin_x := vapply(origin, get_coord_x, numeric(1),
                                     USE.NAMES = FALSE)]
weekly_flights1[, origin_y := vapply(origin, get_coord_y, numeric(1),
                                     USE.NAMES = FALSE)]
weekly_flights1[, dest_x := vapply(dest, get_coord_x, numeric(1),
                                   USE.NAMES = FALSE)]
weekly_flights1[, dest_y := vapply(dest, get_coord_y, numeric(1),
                                   USE.NAMES = FALSE)]

# scale so that the most frequent route has relative frequency 1
weekly_flights1[, relative_flight_frequency := No_flights / max(No_flights)]

# display first 10 rows of result
knitr::kable(weekly_flights1[1:10], digits = 2)

origin	dest	No_flights	origin_x	origin_y	dest_x	dest_y	relative_flight_frequency
JFK	LAX	235.05	744.43	410.60	-1596.24	-377.34	1.00
LGA	PBI	53.12	736.21	417.56	718.42	-628.08	0.23
EWR	LAX	97.31	724.12	406.28	-1596.24	-377.34	0.41
JFK	MIA	63.32	744.43	410.60	723.36	-691.24	0.27
JFK	SEA	41.79	744.43	410.60	-1674.02	600.74	0.18
EWR	MIA	48.22	724.12	406.28	723.36	-691.24	0.21
JFK	SFO	169.66	744.43	410.60	-1787.37	-88.04	0.72
JFK	BOS	94.66	744.43	410.60	827.58	577.14	0.40
JFK	ORD	29.13	744.43	410.60	18.34	275.63	0.12
JFK	IAH	0.16	744.43	410.60	-237.61	-617.44	0.00

# network diagram using geom_curve:
# one point per destination and one curved arrow per origin -> destination
# route; arrow width follows the relative weekly frequency and arrow colour
# identifies the origin airport
ggplot(
  data = weekly_flights1,
  mapping = aes(x = dest_x, y = dest_y)
) +
  geom_point() +
  geom_curve(
    mapping = aes(
      x = origin_x, y = origin_y,
      xend = dest_x, yend = dest_y,
      size = relative_flight_frequency,
      color = origin
    ),
    arrow = arrow(length = unit(0.008, "npc")),
    alpha = 1,
    curvature = 0.4
  ) +
  scale_size(range = c(0, 2)) +
  labs(
    x = "West <<<<<--->>>>> East",
    y = "South <<<<<--->>>>> North",
    title = "Flights map with relative weekly frequency"
  ) +
  # the axes carry no meaningful units here, so hide the tick labels
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  )

c

# average weekly number of flights per (origin, destination, carrier)
carrier_flight = nyc14[, .(No_flights = .N / weeks),
                       by = .(origin, dest, carrier)]

# label each route as "<origin> to <dest>", then drop the two source columns
carrier_flight[, flight := paste(origin, dest, sep = " to ")]
carrier_flight[, c("origin", "dest") := NULL]

# long -> wide: one row per carrier, one column per route;
# routes a carrier does not fly are filled with 0
carrier_matrix = dcast(carrier_flight, carrier ~ flight,
                       value.var = "No_flights", fill = 0)

# carrier codes, kept aside to label the matrix rows
carriername = carrier_matrix[["carrier"]]

# numeric carrier-by-route matrix (carrier column removed), with the carrier
# codes as row names
carrier_matrix = as.matrix(carrier_matrix[, !"carrier"])
rownames(carrier_matrix) = carriername

# Euclidean distance between every pair of carriers (rows of carrier_matrix)
distance.carrier = dist(
  carrier_matrix,
  method = "euclidean",
  diag = TRUE,
  upper = TRUE
)

# classical multidimensional scaling of the carrier distances
carrierMDS = cmdscale(distance.carrier)

# empty canvas (no plotting symbols, no axes) ...
plot(
  carrierMDS,
  pch = "", xlab = " ", ylab = " ", bg = "grey", xaxt = "n", yaxt = "n",
  main = "2-D map of carrier based on frequency of flights between airports"
)
# ... on which each carrier is drawn as its label at its MDS position
text(carrierMDS, labels = attr(distance.carrier, "Labels"))

I observe that most carriers are clustered together, while 4 carriers (B6, AA, DL and UA) are relatively far away from most of the others, especially UA. This means those 4 carriers have relatively different frequencies of flights between airports compared to the others.

d

# average weekly number of flights per (origin, destination, carrier)
carrier_flight = nyc14[, .(No_flights = .N / weeks),
                       by = .(origin, dest, carrier)]

# label each route as "<origin> to <dest>"
carrier_flight[, flight := paste(origin, dest, sep = " to ")]

# within each carrier, turn the weekly counts into shares of that carrier's
# total, so every carrier's routes sum to 1
carrier_flight[, Scaled_No_flights := No_flights / sum(No_flights),
               by = .(carrier)]

# keep only carrier, route label and normalized frequency
carrier_flight[, c("origin", "dest", "No_flights") := NULL]

# long -> wide: one row per carrier, one column per route;
# routes a carrier does not fly are filled with 0
carrier_matrix = dcast(carrier_flight, carrier ~ flight,
                       value.var = "Scaled_No_flights", fill = 0)

# carrier codes, kept aside to label the matrix rows
carriername = carrier_matrix[["carrier"]]

# numeric carrier-by-route matrix (carrier column removed), with the carrier
# codes as row names
carrier_matrix = as.matrix(carrier_matrix[, !"carrier"])
rownames(carrier_matrix) = carriername

# compute pairwise Euclidean distance between carriers (rows of
# carrier_matrix); diag/upper only affect how the dist object prints
distance.carrier = dist(carrier_matrix, method = "euclidean",
                        diag = TRUE, upper = TRUE)

# classical multidimensional scaling on carriers
carrierMDS = cmdscale(distance.carrier)

# Empty canvas without symbols or axes; the carriers are drawn as text labels.
# The title names the normalized frequencies so that this plot can be told
# apart from the part c plot built on raw frequencies.
plot(carrierMDS, pch = "", xlab = " ", ylab = " ", bg = "grey",
     xaxt = "n", yaxt = "n",
     main = paste("2-D map of carrier based on normalized frequency",
                  "of flights between airports"))
text(carrierMDS, labels = attr(distance.carrier, "Labels"))

After normalizing the average weekly number of flights within each carrier (i.e. using the proportion of a carrier's flights on each specific route between origin and destination), I observe that most carriers are very close to one another, forming a cluster in the center of the map, while 3 carriers (AS, F9 and HA) are significantly far away from the cluster. This result is interesting, as those 3 carriers are not “outliers” in the 2-D map in part c. In fact, there is a bias due to the volume of flights for carriers in part c, which I will explain with a toy example later.

Normalization actually reduces that bias and gives a better comparison between carriers. Therefore I conclude that those 3 carriers (AS, HA, F9) have very different flight networks from the others. For example, they may have a much higher proportion of flights to a few destinations to which other carriers do not fly.

If we actually look at those 3 carriers, we can see they almost exclusively fly to 1 or 2 destinations.

# routes flown by the three outlying carriers, sorted by carrier code
carrier_flight[carrier %in% c("AS", "F9", "HA")][order(carrier)]

##    carrier     flight Scaled_No_flights
## 1:      AS EWR to SEA        1.00000000
## 2:      F9 LGA to DEN        0.98731501
## 3:      F9 LGA to CLE        0.01268499
## 4:      HA JFK to HNL        1.00000000

This is optional

I will use a toy example to explain differences in approaches in part c and d.
Suppose there are 3 carriers (A, B and C)
Carrier A has 2 weekly flights from NYC to LA, 1 flight from NYC to SF and 1 flight from NYC to SD.
Carrier B has 2 weekly flights from NYC to LA, 2 flights from NYC to SF and 1 flight from NYC to SD.
Carrier C has 200 weekly flights from NYC to LA, 100 flights from NYC to SF and 100 flights from NYC to SD.

It is easy to tell that carrier C is basically a scaled-up version of carrier A. However, in part c the pairwise distance between carriers is computed from the absolute frequencies, so the distance between Carrier A and C is greater than the distance between Carrier A and B.

In part d we compute pairwise distance based on normalized frequency, so if we divide the frequency data by the total number of flights for each carrier, we will get
Carrier A has 50% flights from NYC to LA, 25% flights from NYC to SF and 25% flight from NYC to SD. Carrier B has 40% flights from NYC to LA, 40% flights from NYC to SF and 20% flight from NYC to SD. Carrier C has 50% flights from NYC to LA, 25% flights from NYC to SF and 25% flight from NYC to SD.

After normalization to proportions, there is no distance between Carrier A and C, while there is some distance between Carrier A and B. Hence we conclude that pairwise distance based on normalized frequency is a better representation of the difference in network between carriers.

STATS 506 Homework 3 Problem 4

Yunbin Peng

November 12, 2017

Data Preparation

a

b

c

d