Dataset used:
Resedidential Energy Consumption Survey (RECS)
Data URL:
http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv
Description:
This script runs analysis of RECS dataset. The variables of interest is roof type of houses from different states and across various decades. The main package used is data.table.
# load package
library(data.table)
library(tidyverse)
# load data set as data.table format
fulldata = fread("recs2009_public.csv")
decode_state = function(x){
if(!is.numeric(x)) stop('decode_states expects numeric input indexed from 1!')
switch(x,
"CT, ME, NH, RI, VT", "MA", "NY", "NJ", "PA", "IL", "IN, OH", "MI", "WI",
"IA, MN, ND, SD", "KS, NE", "MO", "VA", "DE, DC, MD, WV", "GA",
"NC, SC" , "FL", "AL, KY, MS", "TN", "AR, LA, OK",
"TX", "CO", "ID, MT, UT, WY", "AZ", "NV, NM",
"CA", "AK, HI, OR, WA"
)
}
decode_roof_type = function(x) {
if(!is.numeric(x)) stop('decode_roof_type expects numeric input indexed from 1!')
switch(x,
'CeramicClay_Tiles',
'Wood_ShinglesShakes',
'Metal',
'SlateSynthetic_Slate',
'Composition_Shingles',
'Asphalt',
'Concrete_Tiles',
'Other')
}
decode_decade = function(x) {
if(!is.numeric(x)) stop('decode_decade expects numeric input indexed from 1!')
switch(x, 'Pre1950', '1950s', '1960s', '1970s', '1980s', '1990s','2000s', '2000s')
}
# Cleaning data and change labels for state and roof type
working_data = fulldata %>%
.[, .(REPORTABLE_DOMAIN, ROOFTYPE, NWEIGHT)] %>%
.[ROOFTYPE != -2] %>% # remove rooftype not application (-2)
.[, "State" := sapply(REPORTABLE_DOMAIN, decode_state)] %>%
.[, "Rooftype" := sapply(ROOFTYPE, decode_roof_type)] %>%
.[, .(State, Rooftype, NWEIGHT)]
# calculate the total weighted counts for each state save as "stateSum"
# calculate total weighted counts for each state and each rooftype, save as "weight_count"
rooftype_prop = working_data %>%
.[, stateSum := sum(NWEIGHT), by = .(State)] %>%
.[, weight_count := sum(NWEIGHT), by = .(State, Rooftype)] %>%
.[, "NWEIGHT" := NULL]
# remove duplicate rows
setkeyv(rooftype_prop, c("State", "Rooftype"))
rooftype_prop = subset(unique(rooftype_prop))
# calculate the proportion of each roof type in individual state
rooftype_prop = rooftype_prop %>%
.[, "Roof_proportion" := weight_count / stateSum * 100] %>%
.[, .(State, Rooftype, Roof_proportion)]
# keep the wood shingles roof type and order proportion
wood_proportion = rooftype_prop %>%
.[Rooftype == "Wood_ShinglesShakes"] %>%
.[, "Rooftype" := NULL] %>%
.[order(-Roof_proportion)]
# display result in table format
knitr::kable(wood_proportion, digits=1, col.names = c('State(s)','% of Wood Shingle Roofs'))
State(s) | % of Wood Shingle Roofs |
---|---|
NC, SC | 14.7 |
CA | 11.0 |
NV, NM | 10.9 |
CO | 10.6 |
ID, MT, UT, WY | 9.8 |
TX | 9.3 |
FL | 8.1 |
AK, HI, OR, WA | 7.5 |
IN, OH | 6.8 |
DE, DC, MD, WV | 6.8 |
PA | 6.7 |
GA | 6.6 |
AZ | 6.1 |
MA | 6.1 |
NY | 5.9 |
MO | 5.7 |
KS, NE | 5.2 |
CT, ME, NH, RI, VT | 4.8 |
IA, MN, ND, SD | 4.6 |
IL | 4.3 |
VA | 3.8 |
WI | 3.8 |
AR, LA, OK | 3.5 |
AL, KY, MS | 3.4 |
MI | 3.4 |
NJ | 2.9 |
TN | 1.8 |
From the table that North Carolina and South Carolina has the highest proportion of wood shingle roofs. Tennessee has the lowest proportion of wood roofs.
Key Steps
# Cleaning data and change labels for state and roof type
working_data2 = fulldata[, .(YEARMADERANGE, ROOFTYPE, NWEIGHT)] %>%
.[ROOFTYPE != -2] %>% # remove rooftype not application (-2)
.[, "decade" := sapply(YEARMADERANGE, decode_decade)] %>%
.[, "rooftype" := sapply(ROOFTYPE, decode_roof_type)] %>%
.[, .(decade, rooftype, NWEIGHT)]
# compute the weighted sum in each decade
decade_total = working_data2 %>%
.[, decade_total := sum(NWEIGHT), by = decade]
# weighted sum of different roof types in each decade
decade_proportion = decade_total %>%
.[, weight_count := sum(NWEIGHT), by = .(decade, rooftype)] %>%
.[, "NWEIGHT" := NULL]
# remove duplicate rows
setkeyv(decade_proportion, c("decade", "rooftype"))
decade_proportion = subset(unique(decade_proportion))
# calculate the proportion of each roof type in each decade
decade_proportion = decade_proportion %>%
.[, "Roof_proportion" := weight_count / decade_total * 100] %>%
.[, .(decade, rooftype, Roof_proportion)]
# convert data.table from long to wide
decade_proportion = dcast(decade_proportion, rooftype ~ decade, value.var = "Roof_proportion")
# compute relative change between 1950 and 2000
relative_change = decade_proportion$'2000s' / decade_proportion$'1950s'
# combine roof proportion data with relative change
decade_proportion = decade_proportion %>%
.[, relative_change := relative_change] %>%
.[, .(rooftype, Pre1950, `1950s`, `1960s`, `1970s`, `1980s`,
`1990s`, `2000s`, relative_change)] %>%
.[order(-relative_change)] # order by relative change in proportion
# display result
knitr::kable(decade_proportion, digits=2, caption="Propotion of roof types by decades")
rooftype | Pre1950 | 1950s | 1960s | 1970s | 1980s | 1990s | 2000s | relative_change |
---|---|---|---|---|---|---|---|---|
CeramicClay_Tiles | 1.16 | 1.05 | 2.02 | 3.17 | 5.01 | 6.63 | 5.75 | 5.45 |
Concrete_Tiles | 0.39 | 0.84 | 0.70 | 1.29 | 1.44 | 1.89 | 2.96 | 3.53 |
Composition_Shingles | 53.95 | 61.20 | 58.92 | 56.03 | 52.12 | 55.94 | 64.05 | 1.05 |
Metal | 6.86 | 4.54 | 6.28 | 13.06 | 13.78 | 11.56 | 4.50 | 0.99 |
Wood_ShinglesShakes | 5.85 | 8.03 | 8.23 | 6.31 | 8.39 | 6.12 | 6.42 | 0.80 |
Asphalt | 26.70 | 21.41 | 21.16 | 17.68 | 17.40 | 16.32 | 15.52 | 0.73 |
SlateSynthetic_Slate | 2.71 | 1.58 | 1.34 | 0.76 | 1.20 | 0.98 | 0.45 | 0.28 |
Other | 2.37 | 1.35 | 1.35 | 1.70 | 0.66 | 0.58 | 0.35 | 0.26 |
The roof type with the largest relative rise is Ceramic or Clay Tiles, with relative change 5.5 between 1950 and 2000.
Key Steps