Dataset used:
NYCflights14
Distance between Destination Airports (AirportCodeDists)
Data URL:
https://raw.githubusercontent.com/wiki/arunsrinivasan/flights/NYCflights14/flights14.csv
https://jbhender.github.io/Stats506/AirportCodeDists.RData
Reference Script:
Web-scraping script by Dr James Henderson
https://jbhender.github.io/Stats506/AirportCodesWebScrape.R
Description:
This script has two main parts:
1. Use web scraping with the rvest package to get the distances between airports.
2. Build a pairwise distance matrix from those airport distances and use multidimensional scaling to produce a two-dimensional map of the 112 airports.
# load packages
library(rvest)
library(data.table)
library(tidyverse)
library(stringr)
library(parallel)
# Pull the mileage figure out of the scraped text.
#   txt: the string taken from the distance page.
# Returns the first space-delimited token that follows the first "(" in `txt`,
# converted with as.numeric(); e.g. "... (10.69 miles)" gives 10.69, and a
# string without "(" gives NA.
get_miles <- function(txt) {
  after_paren <- str_split(txt, '\\(')[[1]][2]
  leading_token <- str_split(after_paren, ' ')[[1]][1]
  as.numeric(leading_token)
}
## Look up the distance between two airport codes on world-airport-codes.com.
##   a1, a2: airport codes substituted into the query string of the URL.
## Returns the value get_miles() extracts from the text of the page's first
## <strong> node (that node was identified by viewing the page source in a
## browser).
scrape_dist <- function(a1, a2) {
  query_url <- sprintf(
    'https://www.world-airport-codes.com/distance/?a1=%s&a2=%s',
    a1, a2
  )
  page <- read_html(query_url)
  distance_text <- html_text(html_node(page, "strong"))
  get_miles(distance_text)
}
# Read the NYCflights14 flight records directly from GitHub.
nyc14 <- fread('https://github.com/arunsrinivasan/flights/wiki/NYCflights14/flights14.csv')
# Distinct airport codes seen as origins and as destinations; the combined
# vector lists the origins first, followed by the destinations.
orig_codes <- unique(nyc14[["origin"]])
dest_codes <- unique(nyc14[["dest"]])
airport_codes <- c(orig_codes, dest_codes)
# Scrape the distance from one fixed airport to each airport in `targets`.
#   fixed:   a single airport code.
#   targets: character vector of airport codes to pair with `fixed`.
# Returns a tibble with columns `from`, `to` and `dist`, one row per target;
# `dist` holds the value scrape_dist() returns for each pair.
get_dists = function(fixed, targets){
  # vapply() pins the result to exactly one number per target, so an empty
  # `targets` yields numeric(0) instead of the empty list sapply() returns
  # (which would turn `dist` into a list column).
  dists <- vapply(
    targets,
    function(target) scrape_dist(fixed, target),
    numeric(1)
  )
  tibble(from = fixed, to = targets, dist = dists)
}
# For the i-th origin, fetch the distances to every airport listed after
# position i in `airport_codes` (the remaining origins and all destinations),
# so that no origin pair is scraped twice.
inner_loop <- function(i) {
  later_positions <- seq(i + 1, length(airport_codes))
  get_dists(orig_codes[i], airport_codes[later_positions])
}
# Run inner_loop() once per origin airport. lapply() over seq_along() builds
# the result list in one step (no growing inside a loop) and does nothing when
# `orig_codes` is empty, whereas 1:length(orig_codes) would iterate over 1, 0.
df_dist <- lapply(seq_along(orig_codes), inner_loop)
# bind results of inner loop into a single data frame
df_dist <- bind_rows(df_dist)
# Persist the scraped distances so later steps can reload them without
# scraping again.
save(df_dist, file='./OriginsAirportDist.RData')
Here is a sample of the result:
# Restore the saved origin-to-airport distances (object `df_dist`) and keep
# them as a data.table under the name `df1`, then display the table.
load("OriginsAirportDist.RData")
df1 <- data.table(df_dist)
df1
## from to dist
## 1: JFK LGA 10.69
## 2: JFK EWR 20.75
## 3: JFK LAX 2469.33
## 4: JFK PBI 1029.65
## 5: JFK MIA 1091.77
## ---
## 326: EWR ANC 3360.90
## 327: EWR TVC 642.58
## 328: EWR HYA 213.13
## 329: EWR SBN 634.88
## 330: EWR DAL 1362.26
# Load the distances between destination airports.
# NOTE(review): the .RData file presumably stores its table under the name
# `df_dist`, replacing the object loaded earlier -- confirm against the file.
load("AirportCodeDists.RData")
df2 <- data.table(df_dist)

# Full outer merge of both tables into a single data.table.
df3 <- merge(df1, df2, all = TRUE)

# Mirror every pair (swap `from` and `to`) so that each distance is available
# in both directions.
df4 <- data.table(from = df3$to, to = df3$from, dist = df3$dist)

# Merge the original and mirrored pairs to get pairwise distances between all
# airports. `TRUE` is spelled out because `T` is an ordinary variable that can
# be reassigned.
Dist <- merge(df3, df4, all = TRUE)

# Convert from long to wide format: one row per `from` airport and one column
# per `to` airport.
Distance <- dcast(Dist, from ~ to, value.var = "dist")

# Delete the first column (airport names) so only distance columns remain.
Distance <- Distance[, c("from") := NULL]

# Convert to a data.frame, as data.table does not support row names, and label
# the rows with the column (airport) names.
Distance.df <- as.data.frame(Distance)
rownames(Distance.df) <- colnames(Distance.df)

# Cells with no distance in the wide table are NA; convert all NA to 0.
Distance.df[is.na(Distance.df)] <- 0

# check dimension of data.frame to be 112*112
dim(Distance.df)
## [1] 112 112
# Classical multidimensional scaling of the airports based on their pairwise
# distances.
airportMDS <- cmdscale(Distance.df)

# Store the first two MDS coordinates, labelled by airport, as a data.table.
# The first coordinate is negated before being stored as `x`.
airportCoord <- data.table(
  airport = rownames(airportMDS),
  x = -airportMDS[, 1],
  y = airportMDS[, 2]
)
save(airportCoord, file='./airportCoord.RData')
# Two-dimensional map of the 112 airports: each airport code is drawn as text
# at its MDS position, and the axis tick labels are hidden.
ggplot(airportCoord, aes(x = x, y = y, label = airport)) +
  geom_text(size = 2.5) +
  labs(
    title = "2-dimensional Map for 112 airports",
    x = "West <<<<<--->>>>> East",
    y = "South <<<<<--->>>>> North"
  ) +
  theme(
    axis.text.x = element_blank(),
    axis.text.y = element_blank()
  )