Dataset used:
Resedidential Energy Consumption Survey (RECS)
Data URL:
http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv
Description:
This script runs analysis of RECS dataset. The variables of interest is roof type of houses from different states and across various decades. The main package used is data.table.

# load package
library(data.table)
library(tidyverse)
# load data set as data.table format
fulldata =  fread("recs2009_public.csv")

Decodeing function for state and rooftype

decode_state = function(x){
  if(!is.numeric(x)) stop('decode_states expects numeric input indexed from 1!')
  switch(x,
         "CT, ME, NH, RI, VT", "MA", "NY", "NJ", "PA", "IL", "IN, OH", "MI", "WI",
         "IA, MN, ND, SD", "KS, NE", "MO", "VA", "DE, DC, MD, WV", "GA",
         "NC, SC" , "FL", "AL, KY, MS", "TN", "AR, LA, OK",
         "TX", "CO", "ID, MT, UT, WY", "AZ", "NV, NM",
         "CA", "AK, HI, OR, WA"
  )
}

decode_roof_type = function(x) {
  if(!is.numeric(x)) stop('decode_roof_type expects numeric input indexed from 1!')
  switch(x,
         'CeramicClay_Tiles',
         'Wood_ShinglesShakes',
         'Metal',
         'SlateSynthetic_Slate',
         'Composition_Shingles', 
         'Asphalt',
         'Concrete_Tiles',
         'Other')
}

decode_decade = function(x) {
  if(!is.numeric(x)) stop('decode_decade expects numeric input indexed from 1!')
  switch(x, 'Pre1950', '1950s', '1960s', '1970s', '1980s', '1990s','2000s', '2000s')
}

a

# Cleaning data and change labels for state and roof type 
working_data = fulldata %>%
  .[, .(REPORTABLE_DOMAIN, ROOFTYPE, NWEIGHT)] %>% 
  .[ROOFTYPE != -2] %>% # remove rooftype not application (-2)
  .[, "State" := sapply(REPORTABLE_DOMAIN, decode_state)] %>%
  .[, "Rooftype" := sapply(ROOFTYPE, decode_roof_type)] %>%
  .[, .(State, Rooftype, NWEIGHT)]

# calculate the total weighted counts for each state save as "stateSum"
# calculate total weighted counts for each state and each rooftype, save as "weight_count"
rooftype_prop = working_data %>% 
  .[, stateSum := sum(NWEIGHT), by = .(State)] %>%
  .[, weight_count := sum(NWEIGHT), by = .(State, Rooftype)] %>%
  .[, "NWEIGHT" := NULL]

# remove duplicate rows
setkeyv(rooftype_prop, c("State", "Rooftype"))
rooftype_prop = subset(unique(rooftype_prop))

# calculate the proportion of each roof type in individual state
rooftype_prop = rooftype_prop %>% 
  .[, "Roof_proportion" := weight_count / stateSum * 100] %>%
  .[, .(State, Rooftype, Roof_proportion)]

# keep the wood shingles roof type and order proportion
wood_proportion = rooftype_prop %>%
  .[Rooftype == "Wood_ShinglesShakes"] %>%
  .[, "Rooftype" := NULL] %>%
  .[order(-Roof_proportion)]
# display result in table format 
knitr::kable(wood_proportion, digits=1, col.names = c('State(s)','% of Wood Shingle Roofs'))
State(s) % of Wood Shingle Roofs
NC, SC 14.7
CA 11.0
NV, NM 10.9
CO 10.6
ID, MT, UT, WY 9.8
TX 9.3
FL 8.1
AK, HI, OR, WA 7.5
IN, OH 6.8
DE, DC, MD, WV 6.8
PA 6.7
GA 6.6
AZ 6.1
MA 6.1
NY 5.9
MO 5.7
KS, NE 5.2
CT, ME, NH, RI, VT 4.8
IA, MN, ND, SD 4.6
IL 4.3
VA 3.8
WI 3.8
AR, LA, OK 3.5
AL, KY, MS 3.4
MI 3.4
NJ 2.9
TN 1.8

From the table that North Carolina and South Carolina has the highest proportion of wood shingle roofs. Tennessee has the lowest proportion of wood roofs.

Key Steps

b

# Cleaning data and change labels for state and roof type 
working_data2 = fulldata[, .(YEARMADERANGE, ROOFTYPE, NWEIGHT)] %>% 
  .[ROOFTYPE != -2] %>% # remove rooftype not application (-2)
  .[, "decade" := sapply(YEARMADERANGE, decode_decade)] %>%
  .[, "rooftype" := sapply(ROOFTYPE, decode_roof_type)] %>%
  .[, .(decade, rooftype, NWEIGHT)]

# compute the weighted sum in each decade
decade_total = working_data2 %>%
  .[, decade_total := sum(NWEIGHT), by = decade]

# weighted sum of different roof types in each decade
decade_proportion = decade_total %>%
  .[, weight_count := sum(NWEIGHT), by = .(decade, rooftype)] %>%
  .[, "NWEIGHT" := NULL]

# remove duplicate rows
setkeyv(decade_proportion, c("decade", "rooftype"))
decade_proportion = subset(unique(decade_proportion))

# calculate the proportion of each roof type in each decade
decade_proportion = decade_proportion %>% 
  .[, "Roof_proportion" := weight_count / decade_total * 100] %>%
  .[, .(decade, rooftype, Roof_proportion)] 


# convert data.table from long to wide
decade_proportion = dcast(decade_proportion, rooftype ~ decade, value.var = "Roof_proportion")

# compute relative change between 1950 and 2000
relative_change = decade_proportion$'2000s' / decade_proportion$'1950s'

# combine roof proportion data with relative change
decade_proportion =  decade_proportion %>%
  .[, relative_change := relative_change] %>%
  .[, .(rooftype, Pre1950, `1950s`, `1960s`, `1970s`, `1980s`, 
        `1990s`, `2000s`, relative_change)] %>%
  .[order(-relative_change)] # order by relative change in proportion
# display result 
knitr::kable(decade_proportion, digits=2, caption="Propotion of roof types by decades")
Propotion of roof types by decades
rooftype Pre1950 1950s 1960s 1970s 1980s 1990s 2000s relative_change
CeramicClay_Tiles 1.16 1.05 2.02 3.17 5.01 6.63 5.75 5.45
Concrete_Tiles 0.39 0.84 0.70 1.29 1.44 1.89 2.96 3.53
Composition_Shingles 53.95 61.20 58.92 56.03 52.12 55.94 64.05 1.05
Metal 6.86 4.54 6.28 13.06 13.78 11.56 4.50 0.99
Wood_ShinglesShakes 5.85 8.03 8.23 6.31 8.39 6.12 6.42 0.80
Asphalt 26.70 21.41 21.16 17.68 17.40 16.32 15.52 0.73
SlateSynthetic_Slate 2.71 1.58 1.34 0.76 1.20 0.98 0.45 0.28
Other 2.37 1.35 1.35 1.70 0.66 0.58 0.35 0.26

The roof type with the largest relative rise is Ceramic or Clay Tiles, with relative change 5.5 between 1950 and 2000.

Key Steps