Dataset used:
Resedidential Energy Consumption Survey (RECS)
Data URL:
http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv
Description:
This script runs analysis of RECS dataset. The variables of interest is roof type of houses from different states and across various years.
Note: You can move your mouse over visualization to see specific statistic interactively.

Data cleaning

library(tidyverse)
library(dplyr)
library(readr)
library(tidyr)
library(plotly)

recs_tib = read_delim('./recs2009_public.csv', delim=',',col_names=TRUE)
decode_state = function(x){
  if(!is.numeric(x)) stop('decode_states expects numeric input indexed from 1!')
  switch(x,
         "CT, ME, NH, RI, VT", "MA", "NY", "NJ", "PA", "IL", "IN, OH", "MI", "WI",
         "IA, MN, ND, SD", "KS, NE", "MO", "VA", "DE, DC, MD, WV", "GA",
         "NC, SC" , "FL", "AL, KY, MS", "TN", "AR, LA, OK",
         "TX", "CO", "ID, MT, UT, WY", "AZ", "NV, NM",
         "CA", "AK, HI, OR, WA"
  )
}
decode_all_states = function(x){
  sapply(x, decode_state)
}
decode_roof_type = function(x) {
  if(!is.numeric(x)) stop('decode_roof_type expects numeric input indexed from 1!')
  switch(x,
         'CeramicClay_Tiles',
         'Wood_ShinglesShakes',
         'Metal',
         'SlateSynthetic_Slate',
         'Composition_Shingles', 
         'Asphalt',
         'Concrete_Tiles',
         'Other')
}
decode_all_roof_type = function(x) {
  sapply(x, decode_roof_type)
}
decode_year_range = function(x) {
  if(!is.numeric(x)) stop('decode_year_range expects numeric input indexed from 1!')
  switch(x,
         '<1950',
         '1950s',
         '1960s',
         '1970s',
         '1980s',
         '1990s',
         '2000s',
         '2000s')
}
decode_all_year_range =  function(x) {
  sapply(x, decode_year_range)
}
percentagechange <- function(a,b) {
  (b-a)/a*100
}

Percentage of wood shingle roofs houses by State(s)

# (a)
home_roof_prop = recs_tib %>%
  transmute(State = REPORTABLE_DOMAIN, Type = ROOFTYPE, Weight = NWEIGHT) %>% # change label
  filter (Type != -2) %>% # remove NA for type label
  mutate(State = decode_all_states(State), Type = decode_all_roof_type(Type)) %>% # decode state and roof type
  group_by(State, Type) %>% # group by state, then roof type 
  summarise(Roof = sum(Weight)) %>% # sum of weight
  tidyr::spread(Type, Roof) %>% # convert to wide format
  mutate(Total = sum(CeramicClay_Tiles, Wood_ShinglesShakes, Metal, # create new column total 
                     SlateSynthetic_Slate, Composition_Shingles, Asphalt,
                     Concrete_Tiles, Other, na.rm = TRUE)) %>%
  mutate(CeramicClay_Tiles=CeramicClay_Tiles/Total*100, # convert each roof type to proportion
         Wood_ShinglesShakes=Wood_ShinglesShakes/Total*100,
         Metal=Metal/Total*100,
         SlateSynthetic_Slate=SlateSynthetic_Slate/Total*100,
         Composition_Shingles=Composition_Shingles/Total*100,
         Asphalt=Asphalt/Total*100,
         Concrete_Tiles=Concrete_Tiles/Total*100,
         Other = Other/Total*100) %>%
  select(-Total) %>% # remove total column
  arrange(desc(Wood_ShinglesShakes)) %>% # rearrange wood roof by proportion
  select(State, Wood_ShinglesShakes) %>% # select only state and wood roof column
  rename(Percentage_of_wood_shingle_roofs = Wood_ShinglesShakes) # rename wood roof column

knitr::kable(home_roof_prop,digits=1,caption='Percentage of wood shingle roofs houses by State(s).')
Percentage of wood shingle roofs houses by State(s).
State Percentage_of_wood_shingle_roofs
NC, SC 14.7
CA 11.0
NV, NM 10.9
CO 10.6
ID, MT, UT, WY 9.8
TX 9.3
FL 8.1
AK, HI, OR, WA 7.5
IN, OH 6.8
DE, DC, MD, WV 6.8
PA 6.7
GA 6.6
AZ 6.1
MA 6.1
NY 5.9
MO 5.7
KS, NE 5.2
CT, ME, NH, RI, VT 4.8
IA, MN, ND, SD 4.6
IL 4.3
VA 3.8
WI 3.8
AR, LA, OK 3.5
AL, KY, MS 3.4
MI 3.4
NJ 2.9
TN 1.8
figure1 = ggplot(data = home_roof_prop) +
  geom_bar(mapping = aes(x = reorder(State, -Percentage_of_wood_shingle_roofs), 
                         y = Percentage_of_wood_shingle_roofs), stat = "identity") +
  xlab('State') + ylab('Proportion of wood shingle roofs (%)') +
  ggtitle("Proportion of wood shingle roofs houses by state(s)") +
  coord_flip()
ggplotly(figure1)

North Carolina and South Carolina are the states with the highest proportion of wood shingle roof (14.7%), while Tennansee has the lower proportion of wood shingle roof(1.8%).

Change of popularity of different roof type over time

home_decade_prop = recs_tib %>%
  transmute(Type = ROOFTYPE, Weight = NWEIGHT, Year = YEARMADE, Decade = YEARMADERANGE) %>% # change label
  filter (Type != -2) %>% # remove missing value for rooftype
  mutate(Decade = decode_all_year_range(Decade), Type = decode_all_roof_type(Type)) %>% # decode decade and roof type
  group_by(Decade,Type) %>% # group by decade and then type
  summarise(Roof = sum(Weight)) %>%
  tidyr::spread(Type, Roof) %>% # convert to wide format
  mutate(Total = sum(CeramicClay_Tiles, Wood_ShinglesShakes, Metal,
                     SlateSynthetic_Slate, Composition_Shingles, Asphalt,
                     Concrete_Tiles, Other, na.rm = TRUE)) %>%
  mutate(CeramicClay_Tiles=CeramicClay_Tiles/Total*100, # compute proportion
         Wood_ShinglesShakes=Wood_ShinglesShakes/Total*100,
         Metal=Metal/Total*100,
         SlateSynthetic_Slate=SlateSynthetic_Slate/Total*100,
         Composition_Shingles=Composition_Shingles/Total*100,
         Asphalt=Asphalt/Total*100,
         Concrete_Tiles=Concrete_Tiles/Total*100,
         Other = Other/Total*100) %>%
  select(-Total) %>% # remove total column
  rename(Ceramic = CeramicClay_Tiles, Composition = Composition_Shingles, Concrete = Concrete_Tiles, 
         SlateSynthetic = SlateSynthetic_Slate, WoodShingles = Wood_ShinglesShakes)

knitr::kable(home_decade_prop,digits=1,caption='Percentage of roof types by Decade') # print out table
Percentage of roof types by Decade
Decade Asphalt Ceramic Composition Concrete Metal Other SlateSynthetic WoodShingles
<1950 26.7 1.2 54.0 0.4 6.9 2.4 2.7 5.8
1950s 21.4 1.1 61.2 0.8 4.5 1.4 1.6 8.0
1960s 21.2 2.0 58.9 0.7 6.3 1.3 1.3 8.2
1970s 17.7 3.2 56.0 1.3 13.1 1.7 0.8 6.3
1980s 17.4 5.0 52.1 1.4 13.8 0.7 1.2 8.4
1990s 16.3 6.6 55.9 1.9 11.6 0.6 1.0 6.1
2000s 15.5 5.7 64.1 3.0 4.5 0.4 0.4 6.4
home_decade_prop1 = home_decade_prop %>%
  gather(Type, value, Asphalt:WoodShingles) # convert to wide format
figure2 = 
  ggplot(data = home_decade_prop1, aes(x = Decade, y = value, group = Type, color = Type)) + 
  geom_line(size = 0.8) + # plot line chart for proportion of each roof type across decades
  xlab("Decade") + ylab("Proportion (%)") + 
  ggtitle("Proportion of different roof types by decade")
ggplotly(figure2)

The above are table and graph of proportion of each roof type by each decade. We can see composition roof remains as the most popular roof type over time and its popularity is still increasing, in contrast Asphalt roof has become less popular. We will now shift our focus to relative change of roof type proportion between 1950 and 2000.

for (i in 2:9) {
  home_decade_prop[8,i] = percentagechange(home_decade_prop[2,i], home_decade_prop[7,i]) # add new row of percentage change
}
home_decade_prop[8,1] = "PercentageChange"
home_decade_prop1 = home_decade_prop[8,]
home_decade_prop1 = home_decade_prop1 %>%
  rename(RoofType = Decade)
knitr::kable(home_decade_prop1,digits=1,caption='Percentage change of roof types between 1950 and 2000')
Percentage change of roof types between 1950 and 2000
RoofType Asphalt Ceramic Composition Concrete Metal Other SlateSynthetic WoodShingles
PercentageChange -27.5 445 4.7 253.2 -0.8 -74.1 -71.7 -20
home_decade_prop_df = as.numeric(home_decade_prop1[1,2:9])
roof = colnames(home_decade_prop1)[2:length(colnames(home_decade_prop1))]
figure3 = ggplot() +
  geom_bar(mapping = aes(x = reorder(roof, -home_decade_prop_df), 
                         y = home_decade_prop_df), stat = "identity") +
  xlab('Rooftype') + ylab('Relative Change(%)') +
  ggtitle("Relative Change in Proportion between 1950 and 2000")
ggplotly(figure3)

Both table and bar chart indicate that Ceramic roof type has the greatest relative rise in proportion between 1950 to 2000.