├── .gitignore
├── README.md
├── building-permits.Rproj
├── dataPermit_full.csv
├── permits-actual.Rmd
├── permits-actual.md
├── permits-actual_files
└── figure-gfm
│ ├── unnamed-chunk-4-1.png
│ ├── unnamed-chunk-6-1.png
│ ├── unnamed-chunk-6-2.png
│ ├── unnamed-chunk-6-3.png
│ ├── unnamed-chunk-6-4.png
│ ├── unnamed-chunk-7-1.png
│ ├── unnamed-chunk-7-2.png
│ ├── unnamed-chunk-8-1.png
│ ├── unnamed-chunk-8-2.png
│ └── unnamed-chunk-9-1.png
├── permits-practice.Rmd
├── permits-practice.md
└── permits-practice_files
└── figure-gfm
├── unnamed-chunk-10-1.png
├── unnamed-chunk-10-2.png
├── unnamed-chunk-10-3.png
├── unnamed-chunk-4-1.png
├── unnamed-chunk-6-1.png
├── unnamed-chunk-7-1.png
├── unnamed-chunk-7-2.png
├── unnamed-chunk-7-3.png
├── unnamed-chunk-8-1.png
├── unnamed-chunk-8-2.png
├── unnamed-chunk-8-3.png
├── unnamed-chunk-9-1.png
└── unnamed-chunk-9-2.png
/.gitignore:
--------------------------------------------------------------------------------
1 | .Rproj.user
2 | .Rhistory
3 | .RData
4 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # building-permits
2 |
3 | This repository provides the code and data accompanying my "[whole game](https://www.youtube.com/watch?v=go5Au01Jrvs)" intro to data science video. The goal of this video is to quickly outline all the parts of a data science project so you get an overview of how everything hangs together, and see a little bit of my workflow.
4 |
5 | * [permits-actual.md](permits-actual.md): this is the actual code I typed
6 | during the video
7 |
8 | * [permits-practice.md](permits-practice.md): practice code that I wrote
9 | beforehand. The code is a little nicer and it has more comments.
10 |
11 | ## TODO
12 |
13 | * Figure out how to show keyboard shortcuts on screen
14 |
--------------------------------------------------------------------------------
/building-permits.Rproj:
--------------------------------------------------------------------------------
1 | Version: 1.0
2 |
3 | RestoreWorkspace: No
4 | SaveWorkspace: No
5 | AlwaysSaveHistory: Default
6 |
7 | EnableCodeIndexing: Yes
8 | UseSpacesForTab: Yes
9 | NumSpacesForTab: 2
10 | Encoding: UTF-8
11 |
12 | RnwWeave: knitr
13 | LaTeX: XeLaTeX
14 |
15 | AutoAppendNewline: Yes
16 | StripTrailingWhitespace: Yes
17 |
18 | BuildType: Package
19 | PackageUseDevtools: Yes
20 | PackageInstallArgs: --no-multiarch --with-keep.source
21 | PackageRoxygenize: rd,collate,namespace
22 |
--------------------------------------------------------------------------------
/permits-actual.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 | # US building permit
6 |
7 | ```{r setup}
8 | library(tidyverse)
9 | ```
10 |
11 | ## Data import
12 |
13 | Data downloaded from
14 |
15 | ```{r}
16 | permits_raw <- read_csv("dataPermit_full.csv", na = "null")
17 | permits_raw
18 | ```
19 |
20 | * area = metropolitan statistical area
21 | * date = month / year (character vector, or strings)
22 | * f1 = 1 family house, f24 = 2-4 families, f5 = 5+ family house
23 | * units = number of buildings; change = change in units; value = average value of building; valchange = change in value
24 |
25 | ```{r}
26 | permits <- permits_raw %>%
27 | separate(date, c("month", "year"), "/", convert = TRUE) %>%
28 | filter(year > 2007) %>%
29 | select(-ends_with("change"))
30 | permits
31 | ```
32 |
33 | ## Basic eda
34 |
35 | ```{r}
36 | permits %>% count(year)
37 | permits %>% count(area)
38 | permits %>% count(area) %>% count(n)
39 | ```
40 |
41 | I'm focusing on single family homes
42 |
43 | ```{r}
44 | permits <- permits %>%
45 | mutate(date = year + (month - 1) / 12)
46 |
47 | ggplot(permits, aes(date, f1units)) +
48 | geom_line(aes(group = area))
49 | ```
50 |
51 | ## Focus
52 |
53 | Focus on big cities. This could bias our results: are big cities representative of small cities?
54 | ```{r}
55 | f1units <- permits %>%
56 | group_by(area) %>%
57 | summarise(mean = mean(f1units)) %>%
58 | arrange(desc(mean))
59 | f1units
60 |
61 | f1units %>% filter(mean > 100)
62 |
63 | permits_big <- permits %>%  # keep only the rows of areas whose mean f1units exceeds 100
64 | semi_join(f1units %>% filter(mean > 100), by = "area")  # explicit key: do not rely on the implicit natural join over shared column names
65 | permits_big
66 | ```
67 |
68 |
69 | ```{r}
70 | ggplot(permits_big, aes(date, f1units)) +
71 | geom_line(aes(group = area))
72 |
73 | ggplot(permits_big, aes(date, f1units)) +
74 | geom_line(aes(group = area), alpha = 1/10)
75 |
76 | ggplot(permits_big, aes(date, f1units)) +
77 | geom_line(aes(group = area), alpha = 1/10) +
78 | scale_y_log10()
79 |
80 | ggplot(permits_big, aes(date, f1units)) +
81 | geom_line(aes(group = area), alpha = 1/10) +
82 | scale_y_log10() +
83 | geom_smooth(se = FALSE)
84 |
85 | ```
86 |
87 | ## Model
88 |
89 | ```{r}
90 | houston <- permits %>% filter(str_detect(area, "Houston"))
91 | houston %>% count(area)
92 |
93 | ggplot(houston, aes(date, f1units)) +
94 | geom_line(aes(group = area))
95 |
96 | ggplot(houston, aes(month, f1units)) +
97 | geom_line(aes(group = year)) +
98 | scale_y_log10()
99 |
100 | ```
101 |
102 | Question:
103 |
104 | * Is this pattern the same everywhere?
105 | * What drives it? Is it the weather?
106 | * Houston in July is less pleasant than Houston in December
107 |
108 | ```{r}
109 | library(modelr)
110 |
111 |
112 | houston_mod <- lm(log(f1units) ~ factor(month), data = houston)
113 |
114 | houston %>%
115 | add_predictions(houston_mod) %>%
116 | ggplot(aes(date, pred)) +
117 | geom_line()
118 |
119 | houston %>%
120 | add_residuals(houston_mod) %>%
121 | ggplot(aes(date, resid)) +
122 | geom_line()
123 |
124 | ```
125 |
126 | * What's driving this trend?
127 | * What happened around 2010?
128 |
129 | ## Extend the model to every city
130 |
131 | ```{r}
132 | by_area <- permits_big %>%
133 | group_by(area) %>%
134 | nest()
135 |
136 | area_model <- function(df) {  # Fit a linear model of log10 permits on month for one area's data frame `df`; returns the lm fit.
137 | lm(log10(f1units + 1) ~ factor(month), data = df)  # month enters as a factor (one level per month); the + 1 presumably keeps zero counts finite under log10
138 | }
139 |
140 | detrended <- by_area %>% mutate(
141 | model = map(data, area_model),
142 | resids = map2(data, model, add_residuals)
143 | ) %>% unnest(resids)
144 |
145 | ggplot(detrended, aes(date, resid)) +
146 | geom_line(aes(group = area), alpha = 1/10) +
147 | geom_smooth(se = FALSE)
148 |
149 | ```
150 |
151 |
--------------------------------------------------------------------------------
/permits-actual.md:
--------------------------------------------------------------------------------
1 |
2 | # US building permit
3 |
4 | ``` r
5 | library(tidyverse)
6 | ```
7 |
8 | ## ── Attaching packages ──────────────────────────────────────── tidyverse 1.2.0.9000 ──
9 |
10 | ## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
11 | ## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
12 | ## ✔ tidyr 0.7.2.9000 ✔ stringr 1.2.0
13 | ## ✔ readr 1.1.1 ✔ forcats 0.2.0
14 |
15 | ## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
16 | ## ✖ dplyr::filter() masks stats::filter()
17 | ## ✖ dplyr::lag() masks stats::lag()
18 |
19 | ## Data import
20 |
21 | Data downloaded from
22 |
23 |
24 | ``` r
25 | permits_raw <- read_csv("dataPermit_full.csv", na = "null")
26 | ```
27 |
28 | ## Parsed with column specification:
29 | ## cols(
30 | ## area = col_character(),
31 | ## date = col_character(),
32 | ## f1units = col_integer(),
33 | ## f1change = col_double(),
34 | ## f1value = col_integer(),
35 | ## f1valchange = col_double(),
36 | ## f24units = col_integer(),
37 | ## f24change = col_double(),
38 | ## f24value = col_integer(),
39 | ## f24valchange = col_double(),
40 | ## f5units = col_integer(),
41 | ## f5change = col_double(),
42 | ## f5value = col_integer(),
43 | ## f5valchange = col_double()
44 | ## )
45 |
46 | ``` r
47 | permits_raw
48 | ```
49 |
50 | ## # A tibble: 89,660 x 14
51 | ## area date f1un… f1ch… f1va… f1va… f24u… f24c… f24v… f24v… f5un… f5ch…
52 | ##
53 | ## 1 Abil… 01/1… 24 NA 67900 NA 4 NA 46200 NA 200 NA
54 | ## 2 Abil… 02/1… 39 NA 75900 NA 0 NA 0 NA 0 NA
55 | ## 3 Abil… 03/1… 38 NA 78000 NA 4 NA 37000 NA 0 NA
56 | ## 4 Abil… 04/1… 29 NA 66500 NA 0 NA 0 NA 0 NA
57 | ## 5 Abil… 05/1… 29 NA 77600 NA 0 NA 0 NA 0 NA
58 | ## 6 Abil… 06/1… 42 NA 66500 NA 0 NA 0 NA 0 NA
59 | ## 7 Abil… 07/1… 48 NA 67600 NA 18 NA 24400 NA 0 NA
60 | ## 8 Abil… 08/1… 67 NA 69000 NA 0 NA 0 NA 0 NA
61 | ## 9 Abil… 09/1… 53 NA 60800 NA 2 NA 31200 NA 0 NA
62 | ## 10 Abil… 10/1… 80 NA 73000 NA 2 NA 23800 NA 152 NA
63 | ## # ... with 89,650 more rows, and 2 more variables: f5value ,
64 | ## # f5valchange
65 |
66 | - area = metropolitan statistical area
67 | - date = month / year (character vector, or strings)
68 | - f1 = 1 family house, f24 = 2-4 families, f5 = 5+ family house
69 | - units = number of buildings; change = change in units; value = average value
70 | of building; valchange = change in value
71 |
72 |
73 |
74 | ``` r
75 | permits <- permits_raw %>%
76 | separate(date, c("month", "year"), "/", convert = TRUE) %>%
77 | filter(year > 2007) %>%
78 | select(-ends_with("change"))
79 | permits
80 | ```
81 |
82 | ## # A tibble: 44,707 x 9
83 | ## area f1units month year f1value f24units f24value f5units f5va…
84 | ##
85 | ## 1 Abilene, TX 10 1 2008 179100 4 111500 0 0
86 | ## 2 Abilene, TX 19 2 2008 158000 2 80000 0 0
87 | ## 3 Abilene, TX 29 3 2008 190300 0 0 0 0
88 | ## 4 Abilene, TX 21 4 2008 155600 0 0 0 0
89 | ## 5 Abilene, TX 26 5 2008 159500 0 0 0 0
90 | ## 6 Abilene, TX 14 6 2008 142100 0 0 0 0
91 | ## 7 Abilene, TX 20 7 2008 192600 0 0 0 0
92 | ## 8 Abilene, TX 17 8 2008 215000 0 0 0 0
93 | ## 9 Abilene, TX 9 9 2008 127500 0 0 0 0
94 | ## 10 Abilene, TX 8 10 2008 174700 0 0 0 0
95 | ## # ... with 44,697 more rows
96 |
97 | ## Basic eda
98 |
99 | ``` r
100 | permits %>% count(year)
101 | ```
102 |
103 | ## # A tibble: 10 x 2
104 | ## year n
105 | ##
106 | ## 1 2008 4560
107 | ## 2 2009 4560
108 | ## 3 2010 4560
109 | ## 4 2011 4560
110 | ## 5 2012 4560
111 | ## 6 2013 4560
112 | ## 7 2014 4572
113 | ## 8 2015 4380
114 | ## 9 2016 4380
115 | ## 10 2017 4015
116 |
117 | ``` r
118 | permits %>% count(area)
119 | ```
120 |
121 | ## # A tibble: 381 x 2
122 | ## area n
123 | ##
124 | ## 1 Abilene, TX 119
125 | ## 2 Akron, OH 119
126 | ## 3 Albany-Schenectady-Troy, NY 119
127 | ## 4 Albany, GA 119
128 | ## 5 Albany, OR 119
129 | ## 6 Albuquerque, NM 119
130 | ## 7 Alexandria, LA 119
131 | ## 8 Allentown-Bethlehem-Easton, PA-NJ 119
132 | ## 9 Altoona, PA 119
133 | ## 10 Amarillo, TX 119
134 | ## # ... with 371 more rows
135 |
136 | ``` r
137 | permits %>% count(area) %>% count(n)
138 | ```
139 |
140 | ## # A tibble: 3 x 2
141 | ## n nn
142 | ##
143 | ## 1 47 1
144 | ## 2 84 16
145 | ## 3 119 364
146 |
147 | I’m focusing on single family homes
148 |
149 | ``` r
150 | permits <- permits %>%
151 | mutate(date = year + (month - 1) / 12)
152 |
153 | ggplot(permits, aes(date, f1units)) +
154 | geom_line(aes(group = area))
155 | ```
156 |
157 | 
158 |
159 | ## Focus
160 |
161 | Focus on big cities. This could bias our results: are big cities
162 | representative of small cities?
163 |
164 | ``` r
165 | f1units <- permits %>%
166 | group_by(area) %>%
167 | summarise(mean = mean(f1units)) %>%
168 | arrange(desc(mean))
169 | f1units
170 | ```
171 |
172 | ## # A tibble: 381 x 2
173 | ## area mean
174 | ##
175 | ## 1 Houston-The Woodlands-Sugar Land, TX 2546
176 | ## 2 Dallas-Fort Worth-Arlington, TX 1802
177 | ## 3 Atlanta-Sandy Springs-Roswell, GA 1154
178 | ## 4 Phoenix-Mesa-Scottsdale, AZ 1057
179 | ## 5 Washington-Arlington-Alexandria, DC-VA-MD-WV 955
180 | ## 6 Austin-Round Rock, TX 795
181 | ## 7 Charlotte-Concord-Gastonia, NC-SC 786
182 | ## 8 New York-Newark-Jersey City, NY-NJ-PA 758
183 | ## 9 Orlando-Kissimmee-Sanford, FL 713
184 | ## 10 Nashville-Davidson--Murfreesboro--Franklin, TN 647
185 | ## # ... with 371 more rows
186 |
187 | ``` r
188 | f1units %>% filter(mean > 100)
189 | ```
190 |
191 | ## # A tibble: 96 x 2
192 | ## area mean
193 | ##
194 | ## 1 Houston-The Woodlands-Sugar Land, TX 2546
195 | ## 2 Dallas-Fort Worth-Arlington, TX 1802
196 | ## 3 Atlanta-Sandy Springs-Roswell, GA 1154
197 | ## 4 Phoenix-Mesa-Scottsdale, AZ 1057
198 | ## 5 Washington-Arlington-Alexandria, DC-VA-MD-WV 955
199 | ## 6 Austin-Round Rock, TX 795
200 | ## 7 Charlotte-Concord-Gastonia, NC-SC 786
201 | ## 8 New York-Newark-Jersey City, NY-NJ-PA 758
202 | ## 9 Orlando-Kissimmee-Sanford, FL 713
203 | ## 10 Nashville-Davidson--Murfreesboro--Franklin, TN 647
204 | ## # ... with 86 more rows
205 |
206 | ``` r
207 | permits_big <- permits %>%
208 | semi_join(f1units %>% filter(mean > 100))
209 | ```
210 |
211 | ## Joining, by = "area"
212 |
213 | ``` r
214 | permits_big
215 | ```
216 |
217 | ## # A tibble: 11,424 x 10
218 | ## area f1un… month year f1val… f24u… f24va… f5un… f5va… date
219 | ##
220 | ## 1 Albuquerque, NM 169 1 2008 168100 0 0 41 82700 2008
221 | ## 2 Albuquerque, NM 225 2 2008 163600 24 78900 61 82700 2008
222 | ## 3 Albuquerque, NM 244 3 2008 188300 4 85700 30 82700 2008
223 | ## 4 Albuquerque, NM 232 4 2008 181500 4 140800 33 76700 2008
224 | ## 5 Albuquerque, NM 212 5 2008 189000 9 103600 48 76700 2008
225 | ## 6 Albuquerque, NM 216 6 2008 195800 7 108900 52 76700 2008
226 | ## 7 Albuquerque, NM 172 7 2008 182000 5 131000 33 76700 2008
227 | ## 8 Albuquerque, NM 136 8 2008 193700 3 140800 37 76700 2009
228 | ## 9 Albuquerque, NM 127 9 2008 184100 3 140800 33 76700 2009
229 | ## 10 Albuquerque, NM 136 10 2008 181900 3 140800 36 76700 2009
230 | ## # ... with 11,414 more rows
231 |
232 | ``` r
233 | ggplot(permits_big, aes(date, f1units)) +
234 | geom_line(aes(group = area))
235 | ```
236 |
237 | 
238 |
239 | ``` r
240 | ggplot(permits_big, aes(date, f1units)) +
241 | geom_line(aes(group = area), alpha = 1/10)
242 | ```
243 |
244 | 
245 |
246 | ``` r
247 | ggplot(permits_big, aes(date, f1units)) +
248 | geom_line(aes(group = area), alpha = 1/10) +
249 | scale_y_log10()
250 | ```
251 |
252 | 
253 |
254 | ``` r
255 | ggplot(permits_big, aes(date, f1units)) +
256 | geom_line(aes(group = area), alpha = 1/10) +
257 | scale_y_log10() +
258 | geom_smooth(se = FALSE)
259 | ```
260 |
261 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
262 |
263 | 
264 |
265 | ## Model
266 |
267 | ``` r
268 | houston <- permits %>% filter(str_detect(area, "Houston"))
269 | houston %>% count(area)
270 | ```
271 |
272 | ## # A tibble: 1 x 2
273 | ## area n
274 | ##
275 | ## 1 Houston-The Woodlands-Sugar Land, TX 119
276 |
277 | ``` r
278 | ggplot(houston, aes(date, f1units)) +
279 | geom_line(aes(group = area))
280 | ```
281 |
282 | 
283 |
284 | ``` r
285 | ggplot(houston, aes(month, f1units)) +
286 | geom_line(aes(group = year)) +
287 | scale_y_log10()
288 | ```
289 |
290 | 
291 |
292 | Question:
293 |
294 | - Is this pattern the same everywhere?
295 | - What drives it? Is it the weather?
296 | - Houston in July is less pleasant than Houston in December
297 |
298 |
299 |
300 | ``` r
301 | library(modelr)
302 |
303 |
304 | houston_mod <- lm(log(f1units) ~ factor(month), data = houston)
305 |
306 | houston %>%
307 | add_predictions(houston_mod) %>%
308 | ggplot(aes(date, pred)) +
309 | geom_line()
310 | ```
311 |
312 | 
313 |
314 | ``` r
315 | houston %>%
316 | add_residuals(houston_mod) %>%
317 | ggplot(aes(date, resid)) +
318 | geom_line()
319 | ```
320 |
321 | 
322 |
323 | - What’s driving this trend?
324 | - What happened around 2010?
325 |
326 | ## Extend the model to every city
327 |
328 | ``` r
329 | by_area <- permits_big %>%
330 | group_by(area) %>%
331 | nest()
332 |
333 | area_model <- function(df) {  # Fit a linear model of log10 permits on month for one area's data frame `df`; returns the lm fit.
334 | lm(log10(f1units + 1) ~ factor(month), data = df)  # month enters as a factor (one level per month); the + 1 presumably keeps zero counts finite under log10
335 | }
336 |
337 | detrended <- by_area %>% mutate(
338 | model = map(data, area_model),
339 | resids = map2(data, model, add_residuals)
340 | ) %>% unnest(resids)
341 |
342 | ggplot(detrended, aes(date, resid)) +
343 | geom_line(aes(group = area), alpha = 1/10) +
344 | geom_smooth(se = FALSE)
345 | ```
346 |
347 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
348 |
349 | 
350 |
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-6-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-6-2.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-6-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-6-3.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-6-4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-6-4.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-7-1.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-7-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-7-2.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-8-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-8-2.png
--------------------------------------------------------------------------------
/permits-actual_files/figure-gfm/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-actual_files/figure-gfm/unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/permits-practice.Rmd:
--------------------------------------------------------------------------------
1 | ---
2 | output: github_document
3 | ---
4 |
5 | # US building permits
6 |
7 | ```{r setup}
8 | library(tidyverse)
9 | ```
10 |
11 | ## Data import
12 |
13 | Data downloaded from
14 |
15 | ```{r}
16 | permits_raw <- read_csv("dataPermit_full.csv", na = "null")
17 | permits_raw
18 | ```
19 |
20 | * f1 = 1 family home; f24 = 2-4 family home; f5 = 5+ unit home
21 | * units = number built; change = percent change from previous month
22 | * value = average value per unit; valchange = percent change from previous month
23 | Hire Director of Education
24 |
25 | Eliminate the change and valchange variables; we can easily recompute them if needed (and it's good practice to be sceptical about someone else's computations)
26 |
27 | ```{r}
28 | permits <- permits_raw %>%
29 | select(-ends_with("change")) %>%
30 | separate(date, c("month", "year"), "/", convert = TRUE) %>%
31 | filter(year > 2007)
32 | permits
33 | ```
34 |
35 |
36 | ## Very basic EDA
37 |
38 | ```{r}
39 | permits %>% count(year)
40 | permits %>% count(area)
41 | permits %>% count(area) %>% count(n)
42 | ```
43 |
44 | I'm going to arbitrarily focus on single family houses.
45 |
46 | ```{r}
47 | ggplot(permits, aes(year + (month - 1) / 12, f1units)) +  # January sits at the start of its year, matching the `date` formula used for permits_big
48 | geom_line(aes(group = area))  # one line per area
49 | ```
50 |
51 | ## Focus on big cities
52 |
53 | TOO MUCH DATA.
54 |
55 | Let's start by focusing on the bigger cities. This might give a misleading overview (maybe big cities are different fundamentally) but it's a good place to start because they'll have more permits, so there should hopefully be less variation and the long-term patterns should be clearer.
56 |
57 | ```{r}
58 | f1units <- permits %>%
59 | group_by(area) %>%
60 | summarise(mean = mean(f1units)) %>%
61 | arrange(desc(mean))
62 | f1units
63 |
64 | permits_big <- permits %>%  # keep only the rows of areas whose mean f1units exceeds 120
65 | semi_join(f1units %>% filter(mean > 120), by = "area") %>%  # explicit key: do not rely on the implicit natural join over shared column names
66 | mutate(date = year + (month - 1) / 12)  # fractional year; January falls exactly on the year value
67 | ```
68 |
69 | Now replot:
70 |
71 | ```{r}
72 | ggplot(permits_big, aes(date, f1units)) +
73 | geom_line(aes(group = area))
74 | ```
75 |
76 | ```{r}
77 | ggplot(permits_big, aes(date, f1units)) +
78 | geom_line(aes(group = area)) +
79 | scale_y_log10()
80 |
81 | ggplot(permits_big, aes(date, f1units)) +
82 | geom_line(aes(group = area), alpha = 1/10) +
83 | scale_y_log10()
84 |
85 | ggplot(permits_big, aes(date, f1units)) +
86 | geom_line(aes(group = area), alpha = 1/5) +
87 | scale_y_log10() +
88 | geom_smooth(se = FALSE)
89 | ```
90 |
91 | ## What's that regular pattern?
92 |
93 | ```{r}
94 | houston <- permits_big %>% filter(str_detect(area, "Houston"))
95 | ggplot(houston, aes(date, f1units + 1)) +
96 | geom_line()
97 |
98 | ggplot(houston, aes(month, f1units + 1)) +
99 | geom_line(aes(group = year)) +
100 | geom_smooth(se = FALSE)
101 |
102 | ggplot(houston, aes(month, f1units + 1)) +
103 | geom_line(aes(group = year)) +
104 | geom_smooth(se = FALSE) +
105 | scale_y_log10()
106 |
107 | ```
108 |
109 | Wonderings:
110 |
111 | * Is the seasonal pattern more pronounced in colder climates?
112 | * What's the lag between permitting and beginning construction?
113 | * What drives seasonal pattern? July usually less pleasant than Jan in Houston.
114 |
115 | ## Can we remove it?
116 |
117 | Let's use a model to partition the signal into monthly pattern + other
118 |
119 | ```{r}
120 | library(modelr)
121 |
122 | houston_mod <- lm(log(f1units) ~ factor(month), data = houston)
123 | houston %>%
124 | add_predictions(houston_mod) %>%
125 | ggplot(aes(date, pred)) +
126 | geom_line()
127 |
128 | houston %>%
129 | add_residuals(houston_mod) %>%
130 | ggplot(aes(date, resid)) +
131 | geom_hline(yintercept = 0, colour = "white", size = 3) +
132 | geom_line()
133 | ```
134 |
135 | ## Now for all cities
136 |
137 | ```{r}
138 | by_area <- permits_big %>%
139 | group_by(area) %>%
140 | nest()
141 |
142 | area_model <- function(df) {  # Fit a linear model of log10 permits on month for one area's data frame `df`; returns the lm fit.
143 | lm(log10(f1units + 1) ~ factor(month), data = df)  # month enters as a factor (one level per month); the + 1 presumably keeps zero counts finite under log10
144 | }
145 |
146 | detrended <- by_area %>% mutate(
147 | model = map(data, area_model),
148 | resids = map2(data, model, add_residuals)
149 | ) %>% unnest(resids)
150 |
151 |
152 | ggplot(detrended, aes(date, resid)) +
153 | geom_line(aes(group = area), alpha = 1/10) +
154 | geom_smooth(se = FALSE)
155 |
156 | ggplot(detrended, aes(date, 10 ^ resid)) +
157 | geom_line(aes(group = area), alpha = 1/10) +
158 | geom_smooth(se = FALSE)
159 |
160 | ggplot(detrended, aes(date, 10 ^ resid)) +
161 | geom_line(aes(group = area), alpha = 1/10) +
162 | stat_summary(geom = "line", fun.y = function(x) quantile(x, 0.25), colour = "blue") +
163 | stat_summary(geom = "line", fun.y = function(x) quantile(x, 0.75), colour = "blue")
164 | ```
165 |
166 | Wonderings:
167 |
168 | * Is the gap spreading?
169 | * Can we explain why some cities are growing more?
170 | * Is it population? Is it geography? (Probably more complex than either!)
171 | * Does the pattern follow for 2-4 and 5+ family units?
172 |
--------------------------------------------------------------------------------
/permits-practice.md:
--------------------------------------------------------------------------------
1 |
2 | # US building permits
3 |
4 | ``` r
5 | library(tidyverse)
6 | ```
7 |
8 | ## ── Attaching packages ──────────────────────────────────────── tidyverse 1.2.0.9000 ──
9 |
10 | ## ✔ ggplot2 2.2.1.9000 ✔ purrr 0.2.4
11 | ## ✔ tibble 1.4.1 ✔ dplyr 0.7.4
12 | ## ✔ tidyr 0.7.2.9000 ✔ stringr 1.2.0
13 | ## ✔ readr 1.1.1 ✔ forcats 0.2.0
14 |
15 | ## ── Conflicts ──────────────────────────────────────────────── tidyverse_conflicts() ──
16 | ## ✖ dplyr::filter() masks stats::filter()
17 | ## ✖ dplyr::lag() masks stats::lag()
18 |
19 | ## Data import
20 |
21 | Data downloaded from
22 |
23 |
24 | ``` r
25 | permits_raw <- read_csv("dataPermit_full.csv", na = "null")
26 | ```
27 |
28 | ## Parsed with column specification:
29 | ## cols(
30 | ## area = col_character(),
31 | ## date = col_character(),
32 | ## f1units = col_integer(),
33 | ## f1change = col_double(),
34 | ## f1value = col_integer(),
35 | ## f1valchange = col_double(),
36 | ## f24units = col_integer(),
37 | ## f24change = col_double(),
38 | ## f24value = col_integer(),
39 | ## f24valchange = col_double(),
40 | ## f5units = col_integer(),
41 | ## f5change = col_double(),
42 | ## f5value = col_integer(),
43 | ## f5valchange = col_double()
44 | ## )
45 |
46 | ``` r
47 | permits_raw
48 | ```
49 |
50 | ## # A tibble: 89,660 x 14
51 | ## area date f1un… f1ch… f1va… f1va… f24u… f24c… f24v… f24v… f5un… f5ch…
52 | ##
53 | ## 1 Abil… 01/1… 24 NA 67900 NA 4 NA 46200 NA 200 NA
54 | ## 2 Abil… 02/1… 39 NA 75900 NA 0 NA 0 NA 0 NA
55 | ## 3 Abil… 03/1… 38 NA 78000 NA 4 NA 37000 NA 0 NA
56 | ## 4 Abil… 04/1… 29 NA 66500 NA 0 NA 0 NA 0 NA
57 | ## 5 Abil… 05/1… 29 NA 77600 NA 0 NA 0 NA 0 NA
58 | ## 6 Abil… 06/1… 42 NA 66500 NA 0 NA 0 NA 0 NA
59 | ## 7 Abil… 07/1… 48 NA 67600 NA 18 NA 24400 NA 0 NA
60 | ## 8 Abil… 08/1… 67 NA 69000 NA 0 NA 0 NA 0 NA
61 | ## 9 Abil… 09/1… 53 NA 60800 NA 2 NA 31200 NA 0 NA
62 | ## 10 Abil… 10/1… 80 NA 73000 NA 2 NA 23800 NA 152 NA
63 | ## # ... with 89,650 more rows, and 2 more variables: f5value ,
64 | ## # f5valchange
65 |
66 | - f1 = 1 family home; f24 = 2-4 family home; f5 = 5+ unit home
67 | - units = number built; change = percent change from previous month
68 | - value = average value per unit; valchange = percent change from
69 | previous month Hire Director of Education
70 |
71 | Eliminate the change and valchange variables; we can easily recompute them if
72 | needed (and it’s good practice to be sceptical about someone else’s
73 | computations)
74 |
75 | ``` r
76 | permits <- permits_raw %>%
77 | select(-ends_with("change")) %>%
78 | separate(date, c("month", "year"), "/", convert = TRUE) %>%
79 | filter(year > 2007)
80 | permits
81 | ```
82 |
83 | ## # A tibble: 44,707 x 9
84 | ## area f1units month year f1value f24units f24value f5units f5va…
85 | ##
86 | ## 1 Abilene, TX 10 1 2008 179100 4 111500 0 0
87 | ## 2 Abilene, TX 19 2 2008 158000 2 80000 0 0
88 | ## 3 Abilene, TX 29 3 2008 190300 0 0 0 0
89 | ## 4 Abilene, TX 21 4 2008 155600 0 0 0 0
90 | ## 5 Abilene, TX 26 5 2008 159500 0 0 0 0
91 | ## 6 Abilene, TX 14 6 2008 142100 0 0 0 0
92 | ## 7 Abilene, TX 20 7 2008 192600 0 0 0 0
93 | ## 8 Abilene, TX 17 8 2008 215000 0 0 0 0
94 | ## 9 Abilene, TX 9 9 2008 127500 0 0 0 0
95 | ## 10 Abilene, TX 8 10 2008 174700 0 0 0 0
96 | ## # ... with 44,697 more rows
97 |
98 | ## Very basic EDA
99 |
100 | ``` r
101 | permits %>% count(year)
102 | ```
103 |
104 | ## # A tibble: 10 x 2
105 | ## year n
106 | ##
107 | ## 1 2008 4560
108 | ## 2 2009 4560
109 | ## 3 2010 4560
110 | ## 4 2011 4560
111 | ## 5 2012 4560
112 | ## 6 2013 4560
113 | ## 7 2014 4572
114 | ## 8 2015 4380
115 | ## 9 2016 4380
116 | ## 10 2017 4015
117 |
118 | ``` r
119 | permits %>% count(area)
120 | ```
121 |
122 | ## # A tibble: 381 x 2
123 | ## area n
124 | ##
125 | ## 1 Abilene, TX 119
126 | ## 2 Akron, OH 119
127 | ## 3 Albany-Schenectady-Troy, NY 119
128 | ## 4 Albany, GA 119
129 | ## 5 Albany, OR 119
130 | ## 6 Albuquerque, NM 119
131 | ## 7 Alexandria, LA 119
132 | ## 8 Allentown-Bethlehem-Easton, PA-NJ 119
133 | ## 9 Altoona, PA 119
134 | ## 10 Amarillo, TX 119
135 | ## # ... with 371 more rows
136 |
137 | ``` r
138 | permits %>% count(area) %>% count(n)
139 | ```
140 |
141 | ## # A tibble: 3 x 2
142 | ## n nn
143 | ##
144 | ## 1 47 1
145 | ## 2 84 16
146 | ## 3 119 364
147 |
148 | I’m going to arbitrarily focus on single family houses.
149 |
150 | ``` r
151 | ggplot(permits, aes(year + month / 12, f1units)) +
152 | geom_line(aes(group = area))
153 | ```
154 |
155 | 
156 |
157 | ## Focus on big cities
158 |
159 | TOO MUCH DATA.
160 |
161 | Let’s start by focusing on the bigger cities. This might give a
162 | misleading overview (maybe big cities are different fundamentally) but
163 | it’s a good place to start because they’ll have more permits, so there should
164 | hopefully be less variation and the long-term patterns should be clearer.
165 |
166 | ``` r
167 | f1units <- permits %>%
168 | group_by(area) %>%
169 | summarise(mean = mean(f1units)) %>%
170 | arrange(desc(mean))
171 | f1units
172 | ```
173 |
174 | ## # A tibble: 381 x 2
175 | ## area mean
176 | ##
177 | ## 1 Houston-The Woodlands-Sugar Land, TX 2546
178 | ## 2 Dallas-Fort Worth-Arlington, TX 1802
179 | ## 3 Atlanta-Sandy Springs-Roswell, GA 1154
180 | ## 4 Phoenix-Mesa-Scottsdale, AZ 1057
181 | ## 5 Washington-Arlington-Alexandria, DC-VA-MD-WV 955
182 | ## 6 Austin-Round Rock, TX 795
183 | ## 7 Charlotte-Concord-Gastonia, NC-SC 786
184 | ## 8 New York-Newark-Jersey City, NY-NJ-PA 758
185 | ## 9 Orlando-Kissimmee-Sanford, FL 713
186 | ## 10 Nashville-Davidson--Murfreesboro--Franklin, TN 647
187 | ## # ... with 371 more rows
188 |
189 | ``` r
190 | permits_big <- permits %>%
191 | semi_join(f1units %>% filter(mean > 120)) %>%
192 | mutate(date = year + (month - 1) / 12)
193 | ```
194 |
195 | ## Joining, by = "area"
196 |
197 | Now replot:
198 |
199 | ``` r
200 | ggplot(permits_big, aes(date, f1units)) +
201 | geom_line(aes(group = area))
202 | ```
203 |
204 | 
205 |
206 | ``` r
207 | ggplot(permits_big, aes(date, f1units)) +
208 | geom_line(aes(group = area)) +
209 | scale_y_log10()
210 | ```
211 |
212 | 
213 |
214 | ``` r
215 | ggplot(permits_big, aes(date, f1units)) +
216 | geom_line(aes(group = area), alpha = 1/10) +
217 | scale_y_log10()
218 | ```
219 |
220 | 
221 |
222 | ``` r
223 | ggplot(permits_big, aes(date, f1units)) +
224 | geom_line(aes(group = area), alpha = 1/5) +
225 | scale_y_log10() +
226 | geom_smooth(se = FALSE)
227 | ```
228 |
229 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
230 |
231 | 
232 |
233 | ## What’s that regular pattern?
234 |
235 | ``` r
236 | houston <- permits_big %>% filter(str_detect(area, "Houston"))
237 | ggplot(houston, aes(date, f1units + 1)) +
238 | geom_line()
239 | ```
240 |
241 | 
242 |
243 | ``` r
244 | ggplot(houston, aes(month, f1units + 1)) +
245 | geom_line(aes(group = year)) +
246 | geom_smooth(se = FALSE)
247 | ```
248 |
249 | ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
250 |
251 | 
252 |
253 | ``` r
254 | ggplot(data = houston, mapping = aes(x = month, y = f1units + 1)) +
255 |   scale_y_log10() +  # same month-by-year plot, on a log10 y axis
256 |   geom_line(mapping = aes(group = year)) +
257 |   geom_smooth(se = FALSE)
258 | ```
259 |
260 | ## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
261 |
262 | 
263 |
264 | Wonderings:
265 |
266 | - Is the seasonal pattern more pronounced in colder climates?
267 | - What’s the lag between permitting and beginning construction?
268 | - What drives the seasonal pattern? July is usually less pleasant than
269 | January in Houston.
270 |
271 | ## Can we remove it?
272 |
273 | Let’s use a model to partition the signal into a monthly pattern + everything else
274 |
275 | ``` r
276 | library(modelr)
277 |
278 | houston_mod <- lm(formula = log(f1units) ~ factor(month), data = houston)  # one coefficient per month
279 | houston %>%
280 |   add_predictions(model = houston_mod) %>%  # adds the fitted values as column `pred`
281 |   ggplot(mapping = aes(x = date, y = pred)) +
282 |   geom_line()
283 | ```
284 |
285 | 
286 |
287 | ``` r
288 | houston %>%
289 |   add_residuals(model = houston_mod) %>%  # adds the model residuals as column `resid`
290 |   ggplot(mapping = aes(x = date, y = resid)) +
291 |   geom_hline(yintercept = 0, size = 3, colour = "white") +  # thick white reference line at zero
292 |   geom_line()
293 | ```
294 |
295 | 
296 |
297 | ## Now for all cities
298 |
299 | ``` r
300 | by_area <- nest(  # one row per area, with that area's rows nested in a list-column
301 |   group_by(permits_big, area)
302 | )
303 |
304 | area_model <- function(df) {  # fit the month-factor model to one area's data frame `df`
305 |   lm(formula = log10(f1units + 1) ~ factor(month), data = df)  # + 1 keeps log10 finite at zero
306 | }
307 |
308 | detrended <- by_area %>%
309 |   mutate(model = map(data, area_model)) %>%  # one fitted model per area
310 |   mutate(resids = map2(data, model, add_residuals)) %>%  # each area's data plus a `resid` column
311 |   unnest(resids)
312 |
313 |
314 | ggplot(data = detrended, mapping = aes(x = date, y = resid)) +
315 |   geom_line(mapping = aes(group = area), alpha = 0.1) +  # residual series, one faint line per area
316 |   geom_smooth(se = FALSE)
317 | ```
318 |
319 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
320 |
321 | 
322 |
323 | ``` r
324 | ggplot(data = detrended, mapping = aes(x = date, y = 10 ^ resid)) +  # undo the log10 transform
325 |   geom_line(mapping = aes(group = area), alpha = 0.1) +
326 |   geom_smooth(se = FALSE)
327 | ```
328 |
329 | ## `geom_smooth()` using method = 'gam' and formula 'y ~ s(x, bs = "cs")'
330 |
331 | 
332 |
333 | ``` r
334 | ggplot(detrended, aes(date, 10 ^ resid)) +  # back-transformed residuals with quartile lines
335 |   geom_line(aes(group = area), alpha = 1/10) +
336 |   stat_summary(geom = "line", fun = function(x) quantile(x, 0.25), colour = "blue") +  # `fun` replaces `fun.y`, deprecated in ggplot2 3.3.0
337 |   stat_summary(geom = "line", fun = function(x) quantile(x, 0.75), colour = "blue")  # upper quartile at each date
338 | ```
339 |
340 | 
341 |
342 | Wonderings:
343 |
344 | - Is the gap spreading?
345 | - Can we explain why some cities are growing more?
346 | - Is it population? Is it geography? (Probably more complex than
347 | either\!)
348 | - Does the pattern follow for 2-4 and 5+ family units?
349 |
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-10-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-10-1.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-10-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-10-2.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-10-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-10-3.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-4-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-4-1.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-6-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-6-1.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-7-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-7-1.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-7-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-7-2.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-7-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-7-3.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-8-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-8-1.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-8-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-8-2.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-8-3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-8-3.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-9-1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-9-1.png
--------------------------------------------------------------------------------
/permits-practice_files/figure-gfm/unnamed-chunk-9-2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/hadley/building-permits/eb02603170863b6311f53bdce6830fc5fe53331b/permits-practice_files/figure-gfm/unnamed-chunk-9-2.png
--------------------------------------------------------------------------------