Dataset used:
Resedidential Energy Consumption Survey (RECS)
Data URL:
http://www.eia.gov/consumption/residential/data/2009/csv/recs2009_public.csv
Description:
This script runs analysis of RECS dataset. The variables of interest is roof type of houses from different states and across various years.
Note: You can move your mouse over visualization to see specific statistic interactively.
library(tidyverse)
library(dplyr)
library(readr)
library(tidyr)
library(plotly)
recs_tib = read_delim('./recs2009_public.csv', delim=',',col_names=TRUE)
decode_state = function(x){
if(!is.numeric(x)) stop('decode_states expects numeric input indexed from 1!')
switch(x,
"CT, ME, NH, RI, VT", "MA", "NY", "NJ", "PA", "IL", "IN, OH", "MI", "WI",
"IA, MN, ND, SD", "KS, NE", "MO", "VA", "DE, DC, MD, WV", "GA",
"NC, SC" , "FL", "AL, KY, MS", "TN", "AR, LA, OK",
"TX", "CO", "ID, MT, UT, WY", "AZ", "NV, NM",
"CA", "AK, HI, OR, WA"
)
}
decode_all_states = function(x){
sapply(x, decode_state)
}
decode_roof_type = function(x) {
if(!is.numeric(x)) stop('decode_roof_type expects numeric input indexed from 1!')
switch(x,
'CeramicClay_Tiles',
'Wood_ShinglesShakes',
'Metal',
'SlateSynthetic_Slate',
'Composition_Shingles',
'Asphalt',
'Concrete_Tiles',
'Other')
}
decode_all_roof_type = function(x) {
sapply(x, decode_roof_type)
}
decode_year_range = function(x) {
if(!is.numeric(x)) stop('decode_year_range expects numeric input indexed from 1!')
switch(x,
'<1950',
'1950s',
'1960s',
'1970s',
'1980s',
'1990s',
'2000s',
'2000s')
}
decode_all_year_range = function(x) {
sapply(x, decode_year_range)
}
percentagechange <- function(a,b) {
(b-a)/a*100
}
# (a)
home_roof_prop = recs_tib %>%
transmute(State = REPORTABLE_DOMAIN, Type = ROOFTYPE, Weight = NWEIGHT) %>% # change label
filter (Type != -2) %>% # remove NA for type label
mutate(State = decode_all_states(State), Type = decode_all_roof_type(Type)) %>% # decode state and roof type
group_by(State, Type) %>% # group by state, then roof type
summarise(Roof = sum(Weight)) %>% # sum of weight
tidyr::spread(Type, Roof) %>% # convert to wide format
mutate(Total = sum(CeramicClay_Tiles, Wood_ShinglesShakes, Metal, # create new column total
SlateSynthetic_Slate, Composition_Shingles, Asphalt,
Concrete_Tiles, Other, na.rm = TRUE)) %>%
mutate(CeramicClay_Tiles=CeramicClay_Tiles/Total*100, # convert each roof type to proportion
Wood_ShinglesShakes=Wood_ShinglesShakes/Total*100,
Metal=Metal/Total*100,
SlateSynthetic_Slate=SlateSynthetic_Slate/Total*100,
Composition_Shingles=Composition_Shingles/Total*100,
Asphalt=Asphalt/Total*100,
Concrete_Tiles=Concrete_Tiles/Total*100,
Other = Other/Total*100) %>%
select(-Total) %>% # remove total column
arrange(desc(Wood_ShinglesShakes)) %>% # rearrange wood roof by proportion
select(State, Wood_ShinglesShakes) %>% # select only state and wood roof column
rename(Percentage_of_wood_shingle_roofs = Wood_ShinglesShakes) # rename wood roof column
knitr::kable(home_roof_prop,digits=1,caption='Percentage of wood shingle roofs houses by State(s).')
State | Percentage_of_wood_shingle_roofs |
---|---|
NC, SC | 14.7 |
CA | 11.0 |
NV, NM | 10.9 |
CO | 10.6 |
ID, MT, UT, WY | 9.8 |
TX | 9.3 |
FL | 8.1 |
AK, HI, OR, WA | 7.5 |
IN, OH | 6.8 |
DE, DC, MD, WV | 6.8 |
PA | 6.7 |
GA | 6.6 |
AZ | 6.1 |
MA | 6.1 |
NY | 5.9 |
MO | 5.7 |
KS, NE | 5.2 |
CT, ME, NH, RI, VT | 4.8 |
IA, MN, ND, SD | 4.6 |
IL | 4.3 |
VA | 3.8 |
WI | 3.8 |
AR, LA, OK | 3.5 |
AL, KY, MS | 3.4 |
MI | 3.4 |
NJ | 2.9 |
TN | 1.8 |
figure1 = ggplot(data = home_roof_prop) +
geom_bar(mapping = aes(x = reorder(State, -Percentage_of_wood_shingle_roofs),
y = Percentage_of_wood_shingle_roofs), stat = "identity") +
xlab('State') + ylab('Proportion of wood shingle roofs (%)') +
ggtitle("Proportion of wood shingle roofs houses by state(s)") +
coord_flip()
ggplotly(figure1)
North Carolina and South Carolina are the states with the highest proportion of wood shingle roof (14.7%), while Tennansee has the lower proportion of wood shingle roof(1.8%).
home_decade_prop = recs_tib %>%
transmute(Type = ROOFTYPE, Weight = NWEIGHT, Year = YEARMADE, Decade = YEARMADERANGE) %>% # change label
filter (Type != -2) %>% # remove missing value for rooftype
mutate(Decade = decode_all_year_range(Decade), Type = decode_all_roof_type(Type)) %>% # decode decade and roof type
group_by(Decade,Type) %>% # group by decade and then type
summarise(Roof = sum(Weight)) %>%
tidyr::spread(Type, Roof) %>% # convert to wide format
mutate(Total = sum(CeramicClay_Tiles, Wood_ShinglesShakes, Metal,
SlateSynthetic_Slate, Composition_Shingles, Asphalt,
Concrete_Tiles, Other, na.rm = TRUE)) %>%
mutate(CeramicClay_Tiles=CeramicClay_Tiles/Total*100, # compute proportion
Wood_ShinglesShakes=Wood_ShinglesShakes/Total*100,
Metal=Metal/Total*100,
SlateSynthetic_Slate=SlateSynthetic_Slate/Total*100,
Composition_Shingles=Composition_Shingles/Total*100,
Asphalt=Asphalt/Total*100,
Concrete_Tiles=Concrete_Tiles/Total*100,
Other = Other/Total*100) %>%
select(-Total) %>% # remove total column
rename(Ceramic = CeramicClay_Tiles, Composition = Composition_Shingles, Concrete = Concrete_Tiles,
SlateSynthetic = SlateSynthetic_Slate, WoodShingles = Wood_ShinglesShakes)
knitr::kable(home_decade_prop,digits=1,caption='Percentage of roof types by Decade') # print out table
Decade | Asphalt | Ceramic | Composition | Concrete | Metal | Other | SlateSynthetic | WoodShingles |
---|---|---|---|---|---|---|---|---|
<1950 | 26.7 | 1.2 | 54.0 | 0.4 | 6.9 | 2.4 | 2.7 | 5.8 |
1950s | 21.4 | 1.1 | 61.2 | 0.8 | 4.5 | 1.4 | 1.6 | 8.0 |
1960s | 21.2 | 2.0 | 58.9 | 0.7 | 6.3 | 1.3 | 1.3 | 8.2 |
1970s | 17.7 | 3.2 | 56.0 | 1.3 | 13.1 | 1.7 | 0.8 | 6.3 |
1980s | 17.4 | 5.0 | 52.1 | 1.4 | 13.8 | 0.7 | 1.2 | 8.4 |
1990s | 16.3 | 6.6 | 55.9 | 1.9 | 11.6 | 0.6 | 1.0 | 6.1 |
2000s | 15.5 | 5.7 | 64.1 | 3.0 | 4.5 | 0.4 | 0.4 | 6.4 |
home_decade_prop1 = home_decade_prop %>%
gather(Type, value, Asphalt:WoodShingles) # convert to wide format
figure2 =
ggplot(data = home_decade_prop1, aes(x = Decade, y = value, group = Type, color = Type)) +
geom_line(size = 0.8) + # plot line chart for proportion of each roof type across decades
xlab("Decade") + ylab("Proportion (%)") +
ggtitle("Proportion of different roof types by decade")
ggplotly(figure2)
The above are table and graph of proportion of each roof type by each decade. We can see composition roof remains as the most popular roof type over time and its popularity is still increasing, in contrast Asphalt roof has become less popular. We will now shift our focus to relative change of roof type proportion between 1950 and 2000.
for (i in 2:9) {
home_decade_prop[8,i] = percentagechange(home_decade_prop[2,i], home_decade_prop[7,i]) # add new row of percentage change
}
home_decade_prop[8,1] = "PercentageChange"
home_decade_prop1 = home_decade_prop[8,]
home_decade_prop1 = home_decade_prop1 %>%
rename(RoofType = Decade)
knitr::kable(home_decade_prop1,digits=1,caption='Percentage change of roof types between 1950 and 2000')
RoofType | Asphalt | Ceramic | Composition | Concrete | Metal | Other | SlateSynthetic | WoodShingles |
---|---|---|---|---|---|---|---|---|
PercentageChange | -27.5 | 445 | 4.7 | 253.2 | -0.8 | -74.1 | -71.7 | -20 |
home_decade_prop_df = as.numeric(home_decade_prop1[1,2:9])
roof = colnames(home_decade_prop1)[2:length(colnames(home_decade_prop1))]
figure3 = ggplot() +
geom_bar(mapping = aes(x = reorder(roof, -home_decade_prop_df),
y = home_decade_prop_df), stat = "identity") +
xlab('Rooftype') + ylab('Relative Change(%)') +
ggtitle("Relative Change in Proportion between 1950 and 2000")
ggplotly(figure3)
Both table and bar chart indicate that Ceramic roof type has the greatest relative rise in proportion between 1950 to 2000.