├── .gitignore
├── LICENSE
├── README.md
├── analysis
├── analysis.R
├── collisions_map.html
├── export_data.sql
├── helpers.R
└── regression_model.R
├── augment_data
├── Gemfile
├── Gemfile.lock
├── README.md
├── augment_collisions_with_geocoding.sql
├── augment_collisions_with_most_common_coordinates.sql
├── geocode.rb
└── populate_geocodings.sql
├── download_raw_data.sh
├── import_data.sh
├── initialize_database.sh
├── setup_files
├── create_indexes.sql
├── create_schema.sql
├── import_data.sql
└── populate_vehicles_and_factors.sql
└── shapefiles
├── nyct2010wi_18d
├── nyct2010wi.dbf
├── nyct2010wi.prj
├── nyct2010wi.shp
├── nyct2010wi.shp.xml
└── nyct2010wi.shx
└── taxi_zones
├── taxi_zones.dbf
├── taxi_zones.prj
├── taxi_zones.sbn
├── taxi_zones.sbx
├── taxi_zones.shp
├── taxi_zones.shp.xml
└── taxi_zones.shx
/.gitignore:
--------------------------------------------------------------------------------
1 | raw_data/*
2 | analysis/*.csv
3 | analysis/*.csv.gz
4 | analysis/graphs/*.png
5 | analysis/graphs/boroughs/*.png
6 | analysis/graphs/zones/*.png
7 | .Rapp.history
8 | .DS_Store
9 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Todd Schneider
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # NYC Motor Vehicle Collisions
2 |
3 | Code in support of this post: [Mapping Motor Vehicle Collisions in New York City](https://toddwschneider.com/posts/nyc-motor-vehicle-collisions-map/)
4 |
5 | Raw data comes from the NYPD: https://data.cityofnewyork.us/Public-Safety/NYPD-Motor-Vehicle-Collisions/h9gi-nx95
6 |
7 | ## Instructions
8 |
9 | 1. Download and install PostgreSQL and PostGIS (both are available via Homebrew)
10 | 2. `./download_raw_data.sh`
11 | 3. `./initialize_database.sh`
12 | 4. `./import_data.sh`
13 |
14 | Additional code to fill in missing coordinates for collisions that have cross streets or addresses but no lat/lon lives in the `augment_data/` subfolder
15 |
16 | Assorted SQL and R scripts to analyze data and draw maps are in the `analysis/` subfolder
17 |
--------------------------------------------------------------------------------
/analysis/analysis.R:
--------------------------------------------------------------------------------
1 | source("helpers.R")
2 |
3 | collisions = query("
4 | SELECT
5 | unique_key,
6 | collision_time::date AS date,
7 | date(date_trunc('month', collision_time)) AS month,
8 | extract(hour FROM collision_time) AS hour_of_day,
9 | extract(dow FROM collision_time) AS day_of_week,
10 | latitude,
11 | longitude,
12 | z.locationid,
13 | z.zone,
14 | c.borough,
15 | nyct2010_gid,
16 | number_of_motorists_injured + number_of_cyclists_injured + number_of_pedestrians_injured AS people_injured,
17 | number_of_motorists_injured AS motorists_injured,
18 | number_of_cyclists_injured AS cyclists_injured,
19 | number_of_pedestrians_injured AS pedestrians_injured,
20 | number_of_motorists_killed + number_of_cyclists_killed + number_of_pedestrians_killed AS people_killed,
21 | number_of_motorists_killed AS motorists_killed,
22 | number_of_cyclists_killed AS cyclists_killed,
23 | number_of_pedestrians_killed AS pedestrians_killed
24 | FROM collisions c
25 | LEFT JOIN taxi_zones z ON c.taxi_zone_gid = z.gid
26 | ORDER BY unique_key
27 | ")
28 |
29 | zb = query("SELECT DISTINCT zone, borough FROM taxi_zones ORDER BY zone")
30 |
31 | date_seq = seq(
32 | min(collisions$date),
33 | max(collisions$date),
34 | by = "1 day"
35 | )
36 |
37 | variable_factor_levels = c(
38 | "collisions",
39 | "people_injured",
40 | "people_killed",
41 | "motorists_injured",
42 | "motorists_killed",
43 | "cyclists_injured",
44 | "cyclists_killed",
45 | "pedestrians_injured",
46 | "pedestrians_killed"
47 | )
48 |
49 | variable_factor_labels = variable_factor_levels %>%
50 | gsub("_", " ", .) %>%
51 | capitalize_first_letter()
52 |
53 | aggregate_collisions = function(dimensions = quos()) {
54 | dimensions_excluded = purrr::map(dimensions, function(d) expr(-!!d))
55 |
56 | collisions %>%
57 | mutate(collisions = 1) %>%
58 | select(!!!dimensions, date, collisions, matches("_(injured|killed)$")) %>%
59 | group_by(!!!dimensions, date) %>%
60 | summarize_all(sum) %>%
61 | ungroup() %>%
62 | group_by(!!!dimensions) %>%
63 | complete(
64 | date = date_seq,
65 | fill = list(
66 | collisions = 0,
67 | people_injured = 0,
68 | motorists_injured = 0,
69 | cyclists_injured = 0,
70 | pedestrians_injured = 0,
71 | people_killed = 0,
72 | motorists_killed = 0,
73 | cyclists_killed = 0,
74 | pedestrians_killed = 0
75 | )
76 | ) %>%
77 | ungroup() %>%
78 | gather(variable, daily, -date, !!!dimensions_excluded) %>%
79 | group_by(!!!dimensions, variable) %>%
80 | arrange(!!!dimensions, variable, date) %>%
81 | mutate(
82 | rolling28 = rollsumr(daily, k = 28, na.pad = TRUE),
83 | rolling365 = rollsumr(daily, k = 365, na.pad = TRUE)
84 | ) %>%
85 | ungroup() %>%
86 | mutate(
87 | year = year(date),
88 | variable = factor(variable, levels = variable_factor_levels, labels = variable_factor_labels)
89 | )
90 | }
91 |
92 | aggregate_collisions_by_year = function(dimensions = quos()) {
93 | aggregate_collisions(dimensions = dimensions) %>%
94 | group_by(!!!dimensions, variable, year) %>%
95 | summarize(total = sum(daily)) %>%
96 | ungroup()
97 | }
98 |
99 | aggregated_data = aggregate_collisions_by_year() %>%
100 | filter(year %in% 2013:2018)
101 |
102 | aggregated_data_by_borough = aggregate_collisions_by_year(dimensions = quos(borough)) %>%
103 | filter(year %in% 2013:2018)
104 |
105 | aggregated_data_by_zone = aggregate_collisions_by_year(dimensions = quos(zone)) %>%
106 | filter(year %in% 2013:2018) %>%
107 | inner_join(zb, by = "zone")
108 |
109 | zones = aggregated_data_by_zone %>%
110 | distinct(zone) %>%
111 | filter(!is.na(zone)) %>%
112 | pull(zone)
113 |
114 | plot_bs = 32
115 | plot_width = 800
116 |
117 | p1 = aggregated_data %>%
118 | filter(variable == "Collisions") %>%
119 | ggplot(aes(x = year, y = total)) +
120 | geom_line(size = 1, color = nypd_blue) +
121 | geom_point(size = 3, color = nypd_blue) +
122 | geom_blank(aes(y = 0)) +
123 | geom_blank(aes(y = 1.3 * total)) +
124 | scale_y_continuous(labels = scales::comma) +
125 | ggtitle("New York City Motor Vehicle Collisions", "Annual total") +
126 | labs(caption = "Data via NYPD\ntoddwschneider.com") +
127 | theme_tws(base_size = plot_bs) +
128 | theme(
129 | axis.title = element_blank(),
130 | panel.grid.minor.x = element_blank()
131 | )
132 |
133 | p2 = aggregated_data %>%
134 | filter(variable != "Collisions") %>%
135 | ggplot(aes(x = year, y = total)) +
136 | geom_line(size = 1, color = nypd_blue) +
137 | geom_point(size = 3, color = nypd_blue) +
138 | geom_blank(aes(y = 0)) +
139 | geom_blank(aes(y = 1.4 * total)) +
140 | scale_x_continuous(breaks = c(2014, 2016, 2018)) +
141 | scale_y_continuous(labels = scales::comma) +
142 | facet_wrap(~variable, scales = "free_y", ncol = 2) +
143 | ggtitle("New York City", "Annual injuries from motor vehicle collisions") +
144 | labs(caption = "Data via NYPD\ntoddwschneider.com") +
145 | theme_tws(base_size = plot_bs) +
146 | theme(
147 | axis.title = element_blank(),
148 | panel.grid.minor.y = element_blank(),
149 | plot.subtitle = element_text(margin = unit(c(0, 0, 1.1, 0), "lines")),
150 | plot.margin = margin(plot_bs / 2, plot_bs * 0.75, plot_bs / 2, plot_bs / 2),
151 | axis.text = element_text(size = rel(0.7)),
152 | strip.text = element_text(size = rel(0.7))
153 | )
154 |
155 | png("graphs/nyc_collisions.png", height = plot_width * 0.75, width = plot_width)
156 | print(p1)
157 | dev.off()
158 |
159 | png("graphs/nyc_injuries.png", height = plot_width * 1.5, width = plot_width)
160 | print(p2)
161 | dev.off()
162 |
163 | for (b in c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island")) {
164 | p1 = aggregated_data_by_borough %>%
165 | filter(borough == b, variable == "Collisions") %>%
166 | ggplot(aes(x = year, y = total)) +
167 | geom_line(size = 1, color = nypd_blue) +
168 | geom_point(size = 3, color = nypd_blue) +
169 | geom_blank(aes(y = 0)) +
170 | geom_blank(aes(y = 1.4 * total)) +
171 | scale_x_continuous(breaks = c(2014, 2016, 2018)) +
172 | scale_y_continuous(labels = scales::comma) +
173 | ggtitle(paste(b, "Motor Vehicle Collisions"), "Annual total") +
174 | labs(caption = "Data via NYPD\ntoddwschneider.com") +
175 | theme_tws(base_size = plot_bs) +
176 | theme(
177 | axis.title = element_blank(),
178 | panel.grid.minor.y = element_blank()
179 | )
180 |
181 | p2 = aggregated_data_by_borough %>%
182 | filter(variable != "Collisions", borough == b) %>%
183 | ggplot(aes(x = year, y = total)) +
184 | geom_line(size = 1, color = nypd_blue) +
185 | geom_point(size = 3, color = nypd_blue) +
186 | geom_blank(aes(y = 0)) +
187 | geom_blank(aes(y = 1.4 * total)) +
188 | scale_x_continuous(breaks = c(2014, 2016, 2018)) +
189 | scale_y_continuous(labels = scales::comma, breaks = integer_breaks(n = 3)) +
190 | facet_wrap(~variable, scales = "free_y", ncol = 2) +
191 | ggtitle(b, "Annual injuries from motor vehicle collisions") +
192 | labs(caption = "Data via NYPD\ntoddwschneider.com") +
193 | theme_tws(base_size = plot_bs) +
194 | theme(
195 | axis.title = element_blank(),
196 | panel.grid.minor.y = element_blank(),
197 | plot.subtitle = element_text(margin = unit(c(0, 0, 1.1, 0), "lines")),
198 | plot.margin = margin(plot_bs / 2, plot_bs * 0.75, plot_bs / 2, plot_bs / 2),
199 | axis.text = element_text(size = rel(0.7)),
200 | strip.text = element_text(size = rel(0.7))
201 | )
202 |
203 | png(paste0("graphs/boroughs/", tolower(gsub(" ", "_", b)), "_collisions.png"), height = plot_width * 0.75, width = plot_width)
204 | print(p1)
205 | dev.off()
206 |
207 | png(paste0("graphs/boroughs/", tolower(gsub(" ", "_", b)), "_injuries.png"), height = plot_width * 1.5, width = plot_width)
208 | print(p2)
209 | dev.off()
210 | }
211 |
212 | for (z in zones) {
213 | zfile = z %>%
214 | str_replace_all("['()]", "") %>%
215 | str_replace_all("[\\s/]", "_") %>%
216 | tolower()
217 |
218 | borough = filter(zb, zone == z)$borough
219 |
220 | p1 = aggregated_data_by_zone %>%
221 | filter(zone == z, variable == "Collisions") %>%
222 | ggplot(aes(x = year, y = total)) +
223 | geom_line(size = 1, color = nypd_blue) +
224 | geom_point(size = 3, color = nypd_blue) +
225 | geom_blank(aes(y = 0)) +
226 | geom_blank(aes(y = 1.4 * total)) +
227 | scale_x_continuous(breaks = c(2014, 2016, 2018)) +
228 | scale_y_continuous(labels = scales::comma) +
229 | ggtitle(
230 | paste(z, borough, sep = ", "),
231 | "Annual motor vehicle collisions"
232 | ) +
233 | labs(caption = "Data via NYPD\ntoddwschneider.com") +
234 | theme_tws(base_size = plot_bs) +
235 | theme(
236 | axis.title = element_blank(),
237 | panel.grid.minor.y = element_blank()
238 | )
239 |
240 | p2 = aggregated_data_by_zone %>%
241 | filter(zone == z, variable != "Collisions") %>%
242 | ggplot(aes(x = year, y = total)) +
243 | geom_line(size = 1, color = nypd_blue) +
244 | geom_point(size = 3, color = nypd_blue) +
245 | geom_blank(aes(y = 0)) +
246 | geom_blank(aes(y = 1.4 * total)) +
247 | scale_x_continuous(breaks = c(2014, 2016, 2018)) +
248 | scale_y_continuous(labels = scales::comma, breaks = integer_breaks(n = 3)) +
249 | facet_wrap(~variable, scales = "free_y", ncol = 2) +
250 | ggtitle(
251 | paste(z, borough, sep = ", "),
252 | "Annual injuries from motor vehicle collision"
253 | ) +
254 | labs(caption = "Data via NYPD\ntoddwschneider.com") +
255 | theme_tws(base_size = plot_bs) +
256 | theme(
257 | axis.title = element_blank(),
258 | panel.grid.minor.y = element_blank(),
259 | plot.subtitle = element_text(margin = unit(c(0, 0, 1.1, 0), "lines")),
260 | plot.margin = margin(plot_bs / 2, plot_bs * 0.75, plot_bs / 2, plot_bs / 2),
261 | axis.text = element_text(size = rel(0.7)),
262 | strip.text = element_text(size = rel(0.7))
263 | )
264 |
265 | png(paste0("graphs/zones/", zfile, "_collisions.png"), height = plot_width * 0.75, width = plot_width)
266 | print(p1)
267 | dev.off()
268 |
269 | png(paste0("graphs/zones/", zfile, "_injuries.png"), height = plot_width * 1.5, width = plot_width)
270 | print(p2)
271 | dev.off()
272 | }
273 |
274 |
275 |
276 | # find zones with steepest (positive or negative) trends
277 | regressions_data = aggregated_data_by_zone %>%
278 | select(zone, borough, year, variable, total) %>%
279 | mutate(variable = gsub(" ", "_", tolower(variable))) %>%
280 | spread(variable, total)
281 |
282 | slopes = purrr::map(zones, function(z) {
283 | df = filter(regressions_data, zone == z)
284 |
285 | tibble(
286 | zone = z,
287 | people_injured = lm(people_injured ~ year, data = df)$coef["year"],
288 | motorists_injured = lm(motorists_injured ~ year, data = df)$coef["year"],
289 | cyclists_injured = lm(cyclists_injured ~ year, data = df)$coef["year"],
290 | pedestrians_injured = lm(pedestrians_injured ~ year, data = df)$coef["year"]
291 | )
292 | }) %>% bind_rows()
293 |
294 | arrange(slopes, people_injured)
295 | arrange(slopes, desc(people_injured))
296 |
297 |
298 |
299 | # injury rates by time of day and alcohol involvement
300 | alcohol_involved_unique_keys = query("
301 | SELECT DISTINCT collision_unique_key
302 | FROM collisions_contributing_factors
303 | WHERE contributing_factor LIKE '%alcohol%'
304 | ")$collision_unique_key
305 |
306 | injury_rates_hourly = collisions %>%
307 | mutate(alcohol_involved = unique_key %in% alcohol_involved_unique_keys) %>%
308 | group_by(hour_of_day) %>%
309 | summarize(
310 | collisions = n(),
311 | frac_with_injury = mean(people_injured > 0),
312 | frac_with_fatality = mean(people_killed > 0),
313 | frac_with_alcohol_involvement = mean(alcohol_involved)
314 | ) %>%
315 | ungroup()
316 |
317 | injury_rates_hourly = bind_rows(
318 | injury_rates_hourly,
319 | injury_rates_hourly %>%
320 | filter(hour_of_day == 0) %>%
321 | mutate(hour_of_day = 24)
322 | )
323 |
324 | p1 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = collisions)) +
325 | geom_line(size = 1, color = nypd_blue) +
326 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) +
327 | scale_y_continuous(labels = scales::comma) +
328 | expand_limits(y = c(0, 125e3)) +
329 | ggtitle(
330 | "NYC collisions by time of day",
331 | "Jul 2012–Jan 2019"
332 | ) +
333 | labs(caption = "Data via NYPD\ntoddwschneider.com") +
334 | theme_tws(base_size = plot_bs) +
335 | no_axis_titles()
336 |
337 | p2 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = frac_with_injury)) +
338 | geom_line(size = 1, color = nypd_blue) +
339 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) +
340 | scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
341 | expand_limits(y = 0) +
342 | ggtitle(
343 | "Injury rate by time of day",
344 | "% of NYC collisions that result in injury"
345 | ) +
346 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") +
347 | theme_tws(base_size = plot_bs) +
348 | no_axis_titles()
349 |
350 | p3 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = frac_with_fatality)) +
351 | geom_line(size = 1, color = nypd_blue) +
352 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) +
353 | scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
354 | expand_limits(y = 0) +
355 | ggtitle(
356 | "Fatality rate by time of day",
357 | "% of NYC collisions that result in fatality"
358 | ) +
359 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") +
360 | theme_tws(base_size = plot_bs) +
361 | no_axis_titles()
362 |
363 | p4 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = frac_with_alcohol_involvement)) +
364 | geom_line(size = 1, color = nypd_blue) +
365 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) +
366 | scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
367 | expand_limits(y = 0) +
368 | ggtitle(
369 | "Alcohol involvement by time of day",
370 | "% of NYC collisions with alcohol cited as contributing factor"
371 | ) +
372 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") +
373 | theme_tws(base_size = plot_bs) +
374 | theme(plot.subtitle = element_text(size = rel(0.8))) +
375 | no_axis_titles()
376 |
377 | png("graphs/collisions_by_hour.png", height = plot_width, width = plot_width)
378 | print(p1)
379 | dev.off()
380 |
381 | png("graphs/injury_rate_by_hour.png", height = plot_width, width = plot_width)
382 | print(p2)
383 | dev.off()
384 |
385 | png("graphs/fatality_rate_by_hour.png", height = plot_width, width = plot_width)
386 | print(p3)
387 | dev.off()
388 |
389 | png("graphs/alcohol_involvement_by_hour.png", height = plot_width, width = plot_width)
390 | print(p4)
391 | dev.off()
392 |
393 | alcohol_stats = collisions %>%
394 | mutate(alcohol_involved = unique_key %in% alcohol_involved_unique_keys) %>%
395 | group_by(alcohol_involved) %>%
396 | summarize(
397 | collisions = n(),
398 | frac_with_injury = mean(people_injured > 0),
399 | frac_with_fatality = mean(people_killed > 0)
400 | ) %>%
401 | ungroup()
402 |
403 | injury_rates_alcohol = collisions %>%
404 | mutate(alcohol_involved = unique_key %in% alcohol_involved_unique_keys) %>%
405 | group_by(hour_of_day, alcohol_involved) %>%
406 | summarize(
407 | collisions = n(),
408 | frac_with_injury = mean(people_injured > 0),
409 | frac_with_fatality = mean(people_killed > 0)
410 | ) %>%
411 | ungroup()
412 |
413 | injury_rates_alcohol = bind_rows(
414 | injury_rates_alcohol,
415 | injury_rates_alcohol %>%
416 | filter(hour_of_day == 0) %>%
417 | mutate(hour_of_day = 24)
418 | )
419 |
420 | p5 = ggplot(injury_rates_alcohol, aes(x = hour_of_day, y = frac_with_injury, color = alcohol_involved)) +
421 | geom_line(size = 1) +
422 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) +
423 | scale_y_continuous(labels = scales::percent_format(accuracy = 1)) +
424 | scale_color_manual(values = c(nypd_blue, "#49acd5"), guide = FALSE) +
425 | expand_limits(y = c(0, 0.4)) +
426 | annotate(
427 | "text", x = 17, y = 0.15, label = "Alcohol not cited",
428 | size = 8, color = nypd_blue, family = "Open Sans"
429 | ) +
430 | annotate(
431 | "text", x = 17, y = 0.34, label = "Alcohol cited",
432 | size = 8, color = "#49acd5", family = "Open Sans"
433 | ) +
434 | ggtitle(
435 | "Injury rate by alcohol involvement",
436 | "% of NYC collisions that result in injury"
437 | ) +
438 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") +
439 | theme_tws(base_size = plot_bs) +
440 | no_axis_titles()
441 |
442 | p6 = ggplot(injury_rates_alcohol, aes(x = hour_of_day, y = frac_with_fatality, color = alcohol_involved)) +
443 | geom_line(size = 1) +
444 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) +
445 | scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
446 | scale_color_manual(values = c(nypd_blue, "#49acd5"), guide = FALSE) +
447 | expand_limits(y = c(0, 0.01)) +
448 | annotate(
449 | "text", x = 5, y = 0, label = "Alcohol not cited",
450 | size = 8, color = nypd_blue, family = "Open Sans"
451 | ) +
452 | annotate(
453 | "text", x = 5, y = 0.009, label = "Alcohol cited",
454 | size = 8, color = "#49acd5", family = "Open Sans"
455 | ) +
456 | ggtitle(
457 | "Fatality rate by alcohol involvement",
458 | "% of NYC collisions that result in fatality"
459 | ) +
460 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") +
461 | theme_tws(base_size = plot_bs) +
462 | no_axis_titles()
463 |
464 | png("graphs/injury_rate_by_alcohol_involvement.png", height = plot_width, width = plot_width)
465 | print(p5)
466 | dev.off()
467 |
468 | png("graphs/fatality_rate_by_alcohol_involvement.png", height = plot_width, width = plot_width)
469 | print(p6)
470 | dev.off()
471 |
--------------------------------------------------------------------------------
/analysis/collisions_map.html:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 | NYC Motor Vehicle Collisions
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
143 |
144 |
145 |
146 |
153 |
154 |
155 |
156 |
157 | ×
158 |
159 |
160 |
161 |
162 |
173 |
174 |
175 |
176 |
184 |
185 |
186 |
187 |
197 |
198 |
199 |
200 |
208 |
209 |
210 |
211 |
217 |
218 |
219 |
220 |
226 |
227 |
228 |
229 |
230 |
231 |
247 |
248 |
249 |
250 |
251 |
252 | Fetching data
253 |

254 |
255 |
256 |
257 |
258 |
259 |
260 |
261 |
262 |
263 |
264 |
265 |
266 |
267 |
268 |
269 |
270 |
271 |
272 |
273 |
278 |
279 |
280 |
281 |
282 |
283 |
Collisions
284 |
285 |
286 |
287 |
288 |
289 |
290 |
291 |
292 |
865 |
866 |
--------------------------------------------------------------------------------
/analysis/export_data.sql:
--------------------------------------------------------------------------------
1 | DROP TABLE IF EXISTS export_data;
2 |
3 | CREATE TEMP TABLE export_data AS
4 | WITH nv AS (
5 | SELECT
6 | collision_unique_key,
7 | count(*) AS num_vehicles
8 | FROM collisions_vehicles
9 | GROUP BY collision_unique_key
10 | )
11 | SELECT
12 | extract(year FROM collision_time) AS year,
13 | CASE borough
14 | WHEN 'Bronx' THEN 'x'
15 | WHEN 'Brooklyn' THEN 'k'
16 | WHEN 'Manhattan' THEN 'm'
17 | WHEN 'Queens' THEN 'q'
18 | WHEN 'Staten Island' THEN 's'
19 | END AS borough,
20 | round(latitude, 5) AS lat,
21 | round(longitude, 5) AS lng,
22 | nv.num_vehicles,
23 | number_of_motorists_injured AS motorists_injured,
24 | number_of_motorists_killed AS motorists_killed,
25 | number_of_cyclists_injured AS cyclists_injured,
26 | number_of_cyclists_killed AS cyclists_killed,
27 | number_of_pedestrians_injured AS pedestrians_injured,
28 | number_of_pedestrians_killed AS pedestrians_killed,
29 | CASE
30 | WHEN extract(hour FROM collision_time) IN (8, 9, 10) THEN 'm'
31 | WHEN extract(hour FROM collision_time) IN (11, 12, 13, 14, 15) THEN 'i'
32 | WHEN extract(hour FROM collision_time) IN (16, 17, 18) THEN 'a'
33 | WHEN extract(hour FROM collision_time) IN (19, 20, 21) THEN 'e'
34 | ELSE 'o'
35 | END AS time_of_day
36 | FROM collisions c
37 | LEFT JOIN nv ON c.unique_key = nv.collision_unique_key
38 | WHERE latitude IS NOT NULL
39 | AND longitude IS NOT NULL
40 | AND latitude BETWEEN 40.4 AND 41
41 | AND longitude BETWEEN -74.4 AND -73.5
42 | ORDER BY unique_key;
43 |
44 | \copy (SELECT * FROM export_data) TO 'nyc_motor_vehicle_collisions.csv' CSV HEADER;
45 |
--------------------------------------------------------------------------------
/analysis/helpers.R:
--------------------------------------------------------------------------------
1 | required_packages = c("tidyverse", "scales", "lubridate", "RPostgres", "zoo")
2 | installed_packages = rownames(installed.packages())
3 | packages_to_install = required_packages[!(required_packages %in% installed_packages)]
4 |
5 | if (length(packages_to_install) > 0) {
6 | install.packages(
7 | packages_to_install,
8 | dependencies = TRUE,
9 | repos = "https://cloud.r-project.org",
10 | )
11 | }
12 |
13 | library(tidyverse)
14 | library(scales)
15 | library(lubridate)
16 | library(zoo)
17 | library(RPostgres)
18 |
19 | con = dbConnect(
20 | dbDriver("Postgres"),
21 | dbname = "nyc-motor-vehicle-collisions",
22 | host = "localhost"
23 | )
24 |
25 | query = function(sql) {
26 | res = dbSendQuery(con, sql)
27 | results = dbFetch(res) %>% as_tibble()
28 | dbClearResult(res)
29 | results
30 | }
31 |
32 | capitalize_first_letter = function(string) {
33 | paste0(toupper(substr(string, 1, 1)), substr(string, 2, nchar(string)))
34 | }
35 |
36 | theme_void_sf = function(base_size = 12) {
37 | theme_void(base_size = base_size) +
38 | theme(
39 | panel.grid = element_line(size = 0),
40 | text = element_text(family = "Open Sans")
41 | )
42 | }
43 |
44 | font_family = "Open Sans"
45 | title_font_family = "Fjalla One"
46 | nypd_blue = "#00003c"
47 |
48 | theme_tws = function(base_size = 12) {
49 | bg_color = "#f4f4f4"
50 | bg_rect = element_rect(fill = bg_color, color = bg_color)
51 |
52 | theme_bw(base_size) +
53 | theme(
54 | text = element_text(family = font_family),
55 | plot.title = element_text(family = title_font_family),
56 | plot.subtitle = element_text(size = rel(1)),
57 | plot.caption = element_text(size = rel(0.5), margin = unit(c(1, 0, 0, 0), "lines"), lineheight = 1.1, color = "#555555"),
58 | plot.background = bg_rect,
59 | axis.ticks = element_blank(),
60 | axis.text.x = element_text(size = rel(1)),
61 | axis.title.x = element_text(size = rel(1), margin = margin(1, 0, 0, 0, unit = "lines")),
62 | axis.text.y = element_text(size = rel(1)),
63 | axis.title.y = element_text(size = rel(1)),
64 | panel.background = bg_rect,
65 | panel.border = element_blank(),
66 | panel.grid.major = element_line(color = "grey80", size = 0.25),
67 | panel.grid.minor = element_line(color = "grey80", size = 0.25),
68 | panel.spacing = unit(1.5, "lines"),
69 | legend.background = bg_rect,
70 | legend.key.width = unit(1.5, "line"),
71 | legend.key = element_blank(),
72 | strip.background = element_blank()
73 | )
74 | }
75 |
76 | no_axis_titles = function() {
77 | theme(axis.title = element_blank())
78 | }
79 |
80 | # via https://stackoverflow.com/a/10559838
81 | integer_breaks = function(n = 3, ...) {
82 | breaker = pretty_breaks(n, ...)
83 | function(x) {
84 | breaks = breaker(x)
85 | breaks[breaks == floor(breaks)]
86 | }
87 | }
88 |
--------------------------------------------------------------------------------
/analysis/regression_model.R:
--------------------------------------------------------------------------------
1 | source("helpers.R")
2 | library(glmnet)
3 |
4 | # set up regression data
5 | regression_collisions = query("
6 | SELECT
7 | unique_key,
8 | extract(hour FROM collision_time) AS hour_of_day,
9 | extract(dow FROM collision_time) AS day_of_week,
10 | extract(year FROM collision_time) AS year,
11 | coalesce(on_street_name, off_street_name) AS street_name,
12 | borough,
13 | number_of_motorists_injured + number_of_cyclists_injured + number_of_pedestrians_injured > 0 AS has_injury,
14 | number_of_motorists_killed + number_of_cyclists_killed + number_of_pedestrians_killed > 0 AS has_fatality
15 | FROM collisions
16 | WHERE borough IS NOT NULL
17 | ORDER BY unique_key
18 | ")
19 |
20 | regression_contributing_factors = query("
21 | WITH candidates AS (
22 | SELECT contributing_factor
23 | FROM collisions_contributing_factors
24 | GROUP BY contributing_factor
25 | HAVING COUNT(*) >= 5000
26 | )
27 | SELECT *
28 | FROM collisions_contributing_factors
29 | WHERE contributing_factor IN (SELECT contributing_factor FROM candidates)
30 | ")
31 |
32 | regression_vehicle_types = query("
33 | WITH candidates AS (
34 | SELECT vehicle_type
35 | FROM collisions_vehicles
36 | GROUP BY vehicle_type
37 | HAVING COUNT(*) >= 5000
38 | )
39 | SELECT *
40 | FROM collisions_vehicles
41 | WHERE vehicle_type IN (SELECT vehicle_type FROM candidates)
42 | ")
43 |
44 | regression_vehicles_involved = query("
45 | SELECT collision_unique_key, count(*)::int AS num_vehicles
46 | FROM collisions_vehicles
47 | GROUP BY collision_unique_key
48 | ORDER BY collision_unique_key
49 | ")
50 |
51 | regression_collisions = regression_collisions %>%
52 | inner_join(regression_vehicles_involved, by = c("unique_key" = "collision_unique_key")) %>%
53 | mutate(
54 | hour_of_day = factor(hour_of_day, levels = c(12:23, 0:11)),
55 | day_of_week = factor(day_of_week),
56 | weekday = factor(day_of_week %in% 1:5),
57 | year = factor(year),
58 | borough = fct_relevel(factor(borough), "Manhattan"),
59 | num_vehicles = factor(num_vehicles, levels = c(2, 1, 3, 4, 5)),
60 | street_type = fct_relevel(factor(case_when(
61 | grepl("expressway|expy|expwy|parkway|pkwy|highway|bqe|turnpike|fdr|thruway", street_name) ~ "highway",
62 | grepl("street| st$", street_name) ~ "street",
63 | grepl("avenue|broadway|bowery| ave$", street_name) ~ "avenue",
64 | grepl(" road| rd$", street_name) ~ "road",
65 | grepl(" lane| ln$", street_name) ~ "lane",
66 | grepl(" drive| dr$", street_name) ~ "drive",
67 | grepl("boulevard|blvd", street_name) ~ "boulevard",
68 | grepl(" place| pl$", street_name) ~ "place",
69 | grepl("bridge", street_name) ~ "bridge",
70 | grepl("tunnel", street_name) ~ "tunnel",
71 | !is.na(street_name) ~ "other",
72 | TRUE ~ "unknown"
73 | )), "unknown")
74 | )
75 |
76 | for(f in sort(unique(regression_contributing_factors$contributing_factor))) {
77 | fname = paste0("cf_", gsub(".", "_", make.names(f), fixed = TRUE))
78 |
79 | factor_unique_keys = regression_contributing_factors %>%
80 | filter(contributing_factor == f) %>%
81 | pull(collision_unique_key) %>%
82 | unique()
83 |
84 | regression_collisions = regression_collisions %>%
85 | mutate(!!fname := as.numeric(unique_key %in% factor_unique_keys))
86 | }
87 |
88 | for(v in sort(unique(regression_vehicle_types$vehicle_type))) {
89 | vname = paste0("vt_", gsub(".", "_", make.names(v), fixed = TRUE))
90 |
91 | vehicle_unique_keys = regression_vehicle_types %>%
92 | filter(vehicle_type == v) %>%
93 | pull(collision_unique_key) %>%
94 | unique()
95 |
96 | regression_collisions = regression_collisions %>%
97 | mutate(!!vname := as.numeric(unique_key %in% vehicle_unique_keys))
98 | }
99 |
100 | # build model matrices
101 | injury_model_matrix = sparse.model.matrix(
102 | has_injury ~ . - 1,
103 | select(regression_collisions, -day_of_week, -street_name, -unique_key, -has_fatality)
104 | )
105 |
106 | fatality_model_matrix = sparse.model.matrix(
107 | has_fatality ~ . - 1,
108 | select(regression_collisions, -day_of_week, -street_name, -unique_key, -has_injury)
109 | )
110 |
111 | # run regularized regressions
112 | injury_cvfit = cv.glmnet(
113 | x = injury_model_matrix,
114 | y = regression_collisions$has_injury,
115 | family = "binomial"
116 | )
117 |
118 | fatality_cvfit = cv.glmnet(
119 | x = fatality_model_matrix,
120 | y = regression_collisions$has_fatality,
121 | family = "binomial"
122 | )
123 |
124 | # check on lambda values and coefficients
125 | plot(injury_cvfit)
126 | coef(injury_cvfit, s = "lambda.1se")
127 |
128 | plot(fatality_cvfit)
129 | coef(fatality_cvfit, s = "lambda.1se")
130 |
--------------------------------------------------------------------------------
/augment_data/Gemfile:
--------------------------------------------------------------------------------
1 | source 'https://rubygems.org'
2 |
3 | ruby '2.6.6'
4 |
5 | gem 'activerecord', '~> 5.2', require: 'active_record'
6 | gem 'pg', '~> 1.1'
7 | gem 'rest-client', '~> 2.0'
8 |
--------------------------------------------------------------------------------
/augment_data/Gemfile.lock:
--------------------------------------------------------------------------------
1 | GEM
2 | remote: https://rubygems.org/
3 | specs:
4 | activemodel (5.2.8.1)
5 | activesupport (= 5.2.8.1)
6 | activerecord (5.2.8.1)
7 | activemodel (= 5.2.8.1)
8 | activesupport (= 5.2.8.1)
9 | arel (>= 9.0)
10 | activesupport (5.2.8.1)
11 | concurrent-ruby (~> 1.0, >= 1.0.2)
12 | i18n (>= 0.7, < 2)
13 | minitest (~> 5.1)
14 | tzinfo (~> 1.1)
15 | arel (9.0.0)
16 | concurrent-ruby (1.1.10)
17 | domain_name (0.5.20190701)
18 | unf (>= 0.0.5, < 1.0.0)
19 | http-accept (1.7.0)
20 | http-cookie (1.0.3)
21 | domain_name (~> 0.5)
22 | i18n (1.11.0)
23 | concurrent-ruby (~> 1.0)
24 | mime-types (3.3.1)
25 | mime-types-data (~> 3.2015)
26 | mime-types-data (3.2020.0512)
27 | minitest (5.16.2)
28 | netrc (0.11.0)
29 | pg (1.2.3)
30 | rest-client (2.1.0)
31 | http-accept (>= 1.7.0, < 2.0)
32 | http-cookie (>= 1.0.2, < 2.0)
33 | mime-types (>= 1.16, < 4.0)
34 | netrc (~> 0.8)
35 | thread_safe (0.3.6)
36 | tzinfo (1.2.10)
37 | thread_safe (~> 0.1)
38 | unf (0.1.4)
39 | unf_ext
40 | unf_ext (0.0.7.7)
41 |
42 | PLATFORMS
43 | ruby
44 |
45 | DEPENDENCIES
46 | activerecord (~> 5.2)
47 | pg (~> 1.1)
48 | rest-client (~> 2.0)
49 |
50 | RUBY VERSION
51 | ruby 2.6.6p146
52 |
53 | BUNDLED WITH
54 | 1.17.2
55 |
--------------------------------------------------------------------------------
/augment_data/README.md:
--------------------------------------------------------------------------------
1 | # Fill in collision records that are missing coordinates
2 |
3 | ~13% of the collisions in the raw dataset are missing lat/lon coordinates. Many of those records have cross streets or addresses, which can be used to fill in coordinates with reasonable guesses. There are 2 strategies used to fill in missing coordinates: other collisions at the same cross streets, and geocoding. These processes add coordinates for ~8% of the full dataset, leaving ~5% of collisions without coordinates. The `collisions.coordinates_source` column keeps track of where each collision's coordinates came from. It's possible these augmentation processes might introduce some data errors, but an anecdotal manual review suggested that they are accurate more often than not.
4 |
5 | ## Coordinates from other collisions listed at the same cross streets
6 |
7 | Calculate the most common coordinates for each pair of cross streets, and use them to fill in coordinates for collisions at the same cross streets that are missing coordinates. E.g. there are collisions listed at Bruckner Boulevard & E 138 St that are missing coordinates, but there are also collisions at the same cross streets that have coordinates, so assume that the unknown Bruckner & E 138 collisions happened at the most common coordinates listed for the known Bruckner & E 138 collisions, subject there being at least 2 known collisions at the same lat/lon rounded to 4 digits.
8 |
9 | `psql nyc-motor-vehicle-collisions -f augment_collisions_with_most_common_coordinates.sql`
10 |
11 | ## Geocode missing coordinates
12 |
13 | 1. Get a Google Maps Geocoding API key: https://developers.google.com/maps/documentation/geocoding/get-api-key
14 | 2. `psql nyc-motor-vehicle-collisions -f populate_geocodings.sql`
15 | 3. `bundle install`
16 | 4. `ruby geocode.rb --google-api-key YOUR_API_KEY_HERE`
17 | 5. `psql nyc-motor-vehicle-collisions -f augment_collisions_with_geocoding.sql`
18 |
19 | Note that as of January 2019, Google Maps Geocoding API allows up to 40,000 requests per month for free in the United States, then charges $5 per 1,000 requests after that. See https://developers.google.com/maps/documentation/geocoding/usage-and-billing for the latest pricing. When I ran `geocode.rb` there fewer than 40,000 records to geocode, so I was able to do it for free.
20 |
21 | ## Augmentation results
22 |
23 | Coordinates sources with raw data through 12/31/2018:
24 |
25 | ```sql
26 | SELECT
27 | extract(year FROM collision_time) AS year,
28 | COUNT(*) AS n,
29 | ROUND(SUM(CASE WHEN coordinates_source = 'raw_data' THEN 1 END)::numeric / COUNT(*), 2) AS raw_data,
30 | ROUND(SUM(CASE WHEN coordinates_source = 'most_common_in_raw_data' THEN 1 END)::numeric / COUNT(*), 2) AS most_common_in_raw_data,
31 | ROUND(SUM(CASE WHEN coordinates_source = 'geocoding' THEN 1 END)::numeric / COUNT(*), 2) AS geocoding,
32 | ROUND(SUM(CASE WHEN coordinates_source IS NULL THEN 1 END)::numeric / COUNT(*), 2) AS unknown
33 | FROM collisions
34 | GROUP BY year
35 | ORDER BY year;
36 | ```
37 |
38 | ```sql
39 | year | n | raw_data | most_common_in_raw_data | geocoding | unknown
40 | ------+--------+----------+-------------------------+-----------+---------
41 | 2012 | 100541 | 0.85 | 0.06 | 0.03 | 0.06
42 | 2013 | 203727 | 0.84 | 0.07 | 0.03 | 0.06
43 | 2014 | 206028 | 0.84 | 0.07 | 0.03 | 0.06
44 | 2015 | 217693 | 0.84 | 0.07 | 0.03 | 0.06
45 | 2016 | 229780 | 0.84 | 0.08 | 0.03 | 0.05
46 | 2017 | 230991 | 0.94 | 0.01 | 0.03 | 0.02
47 | 2018 | 231016 | 0.93 | 0.01 | 0.03 | 0.03
48 | ```
49 |
--------------------------------------------------------------------------------
/augment_data/augment_collisions_with_geocoding.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE tmp_points AS
2 | SELECT
3 | id,
4 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS location
5 | FROM geocodings
6 | WHERE (taxi_zone_gid IS NULL OR nyct2010_gid IS NULL)
7 | AND latitude IS NOT NULL
8 | AND longitude IS NOT NULL;
9 |
10 | CREATE INDEX ON tmp_points USING gist (location);
11 |
12 | CREATE TABLE tmp_zones AS
13 | SELECT t.id, z.gid, z.borough
14 | FROM tmp_points t, taxi_zones z
15 | WHERE ST_Within(t.location, z.geom);
16 |
17 | CREATE UNIQUE INDEX ON tmp_zones (id);
18 |
19 | CREATE TABLE tmp_tracts AS
20 | SELECT t.id, n.gid, n.boroname
21 | FROM tmp_points t, nyct2010 n
22 | WHERE ST_Within(t.location, n.geom);
23 |
24 | CREATE UNIQUE INDEX ON tmp_tracts (id);
25 |
26 | UPDATE geocodings
27 | SET taxi_zone_gid = tmp_zones.gid,
28 | borough = tmp_zones.borough
29 | FROM tmp_zones
30 | WHERE geocodings.id = tmp_zones.id;
31 |
32 | UPDATE geocodings
33 | SET nyct2010_gid = tmp_tracts.gid,
34 | borough = tmp_tracts.boroname
35 | FROM tmp_tracts
36 | WHERE geocodings.id = tmp_tracts.id;
37 |
38 | WITH coords AS (
39 | SELECT
40 | c.unique_key,
41 | g.latitude,
42 | g.longitude,
43 | g.taxi_zone_gid,
44 | g.borough,
45 | g.nyct2010_gid
46 | FROM collisions c
47 | INNER JOIN geocodings g
48 | ON coalesce(c.on_street_name, '') = coalesce(g.on_street_name, '')
49 | AND coalesce(c.cross_street_name, '') = coalesce(g.cross_street_name, '')
50 | AND coalesce(c.off_street_name, '') = coalesce(g.off_street_name, '')
51 | AND coalesce(c.reported_borough, '') = coalesce(g.reported_borough, '')
52 | WHERE c.latitude IS NULL
53 | AND c.longitude IS NULL
54 | AND g.latitude IS NOT NULL
55 | AND g.longitude IS NOT NULL
56 | )
57 | UPDATE collisions
58 | SET latitude = coords.latitude,
59 | longitude = coords.longitude,
60 | taxi_zone_gid = coords.taxi_zone_gid,
61 | borough = coords.borough,
62 | nyct2010_gid = coords.nyct2010_gid,
63 | coordinates_source = 'geocoding'
64 | FROM coords
65 | WHERE collisions.unique_key = coords.unique_key;
66 |
67 | DROP TABLE tmp_points;
68 | DROP TABLE tmp_zones;
69 | DROP TABLE tmp_tracts;
70 |
--------------------------------------------------------------------------------
/augment_data/augment_collisions_with_most_common_coordinates.sql:
--------------------------------------------------------------------------------
1 | CREATE TEMP TABLE most_common_coords AS
2 | WITH candidates AS (
3 | SELECT
4 | on_street_name,
5 | cross_street_name,
6 | borough,
7 | taxi_zone_gid,
8 | nyct2010_gid,
9 | round(latitude, 4) AS lat,
10 | round(longitude, 4) AS lng,
11 | count(*) AS n
12 | FROM collisions
13 | WHERE coordinates_source = 'raw_data'
14 | AND latitude IS NOT NULL
15 | AND longitude IS NOT NULL
16 | AND on_street_name IS NOT NULL
17 | AND (
18 | cross_street_name IS NOT NULL
19 | OR on_street_name LIKE '% bridge'
20 | )
21 | GROUP BY on_street_name, cross_street_name, borough, taxi_zone_gid, nyct2010_gid, lat, lng
22 | )
23 | SELECT DISTINCT ON (on_street_name, cross_street_name) *
24 | FROM candidates
25 | WHERE n >= 2
26 | ORDER BY on_street_name, cross_street_name, n DESC;
27 |
28 | WITH most_common AS (
29 | SELECT
30 | c.unique_key,
31 | mcc.lat,
32 | mcc.lng,
33 | mcc.taxi_zone_gid,
34 | mcc.borough,
35 | mcc.nyct2010_gid
36 | FROM collisions c
37 | INNER JOIN most_common_coords mcc
38 | ON c.on_street_name IS NOT NULL
39 | AND c.on_street_name = mcc.on_street_name
40 | AND coalesce(c.cross_street_name, '') = coalesce(mcc.cross_street_name, '')
41 | WHERE c.latitude IS NULL
42 | AND c.longitude IS NULL
43 | )
44 | UPDATE collisions
45 | SET latitude = most_common.lat,
46 | longitude = most_common.lng,
47 | taxi_zone_gid = most_common.taxi_zone_gid,
48 | borough = most_common.borough,
49 | nyct2010_gid = most_common.nyct2010_gid,
50 | coordinates_source = 'most_common_in_raw_data'
51 | FROM most_common
52 | WHERE collisions.unique_key = most_common.unique_key;
53 |
54 | DROP TABLE most_common_coords;
55 |
--------------------------------------------------------------------------------
/augment_data/geocode.rb:
--------------------------------------------------------------------------------
1 | require 'bundler/setup'
2 | Bundler.require
3 |
4 | ActiveRecord::Base.establish_connection(
5 | adapter: 'postgresql',
6 | host: 'localhost',
7 | database: 'nyc-motor-vehicle-collisions'
8 | )
9 |
10 | class Geocoding < ActiveRecord::Base
11 | API_BASE_URL = 'https://maps.googleapis.com/maps/api/geocode/json'
12 | API_KEY = Hash[*ARGV]['--google-api-key']
13 | NYC_BOUNDS = '40.4,-74.3|41.0,-73.65'
14 |
15 | def geocode!
16 | raise 'no api key' if API_KEY.blank?
17 |
18 | begin
19 | request = RestClient.get(API_BASE_URL, params: {
20 | address: address_for_geocode,
21 | bounds: NYC_BOUNDS,
22 | key: API_KEY
23 | })
24 | rescue RestClient::BadRequest
25 | puts "Bad request for #{id}"
26 | return
27 | end
28 |
29 | json = JSON.parse(request.body)
30 | result = json.dig('results', 0)
31 |
32 | if result && (result['types'] & acceptable_google_types).present?
33 | self.latitude = result.dig('geometry', 'location', 'lat')
34 | self.longitude = result.dig('geometry', 'location', 'lng')
35 | end
36 |
37 | self.full_response = json
38 |
39 | save!
40 | end
41 | end
42 |
43 | class IntersectionGeocoding < Geocoding
44 | scope :for_geocoding, -> { where(full_response: nil) }
45 |
46 | def address_for_geocode
47 | raise "#{id} missing on_street_name" if on_street_name.blank?
48 | raise "#{id} missing cross_street_name" if cross_street_name.blank?
49 |
50 | [
51 | "#{on_street_name} and #{cross_street_name}",
52 | reported_borough,
53 | 'nyc'
54 | ].compact.join(', ').downcase
55 | end
56 |
57 | def acceptable_google_types
58 | %w(intersection)
59 | end
60 | end
61 |
62 | class StreetAddressGeocoding < Geocoding
63 | scope :for_geocoding, -> {
64 | where("
65 | full_response IS NULL
66 | AND off_street_name NOT LIKE '%parking lot%'
67 | AND off_street_name NOT LIKE '%p/l%'
68 | AND off_street_name NOT LIKE '%pl of%'
69 | AND off_street_name NOT LIKE '%muni lot%'
70 | AND off_street_name NOT LIKE '%driveway%'
71 | AND NOT (
72 | off_street_name ~* '^\\d+ east drive' AND reported_borough IS NULL
73 | )
74 | AND off_street_name ~* '\\d'
75 | ")
76 | }
77 |
78 | def address_for_geocode
79 | raise "#{id} missing off_street_name" if off_street_name.blank?
80 |
81 | [
82 | off_street_name.squish,
83 | reported_borough,
84 | 'nyc'
85 | ].compact.join(', ').downcase
86 | end
87 |
88 | def acceptable_google_types
89 | %w(street_address premise)
90 | end
91 | end
92 |
93 | def run
94 | if Geocoding::API_KEY.blank?
95 | puts [
96 | 'You have to specify a Google Maps Geocoding API key',
97 | 'Usage:',
98 | ' ruby geocode.rb --google-api-key YOUR_API_KEY_HERE'
99 | ].join("\n\n")
100 |
101 | return
102 | end
103 |
104 | [IntersectionGeocoding, StreetAddressGeocoding].each do |klass|
105 | scope = klass.for_geocoding
106 | puts "#{Time.now}: going to geocode #{scope.count} #{klass.name} addresses"
107 |
108 | scope.find_each.with_index do |g, i|
109 | puts "#{Time.now}: done #{i} addresses" if i > 0 && i % 50 == 0
110 | g.geocode!
111 | end
112 |
113 | puts "#{Time.now}: finished #{klass.name}"
114 | end
115 | end
116 |
117 | run
118 |
--------------------------------------------------------------------------------
/augment_data/populate_geocodings.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO geocodings (type, on_street_name, cross_street_name, reported_borough, n)
2 | SELECT
3 | 'IntersectionGeocoding'::text AS type,
4 | on_street_name,
5 | cross_street_name,
6 | reported_borough,
7 | COUNT(*) AS n
8 | FROM collisions
9 | WHERE latitude IS NULL
10 | AND longitude IS NULL
11 | AND on_street_name IS NOT NULL
12 | AND cross_street_name IS NOT NULL
13 | GROUP BY on_street_name, cross_street_name, reported_borough
14 | ORDER BY COUNT(*) DESC, on_street_name, cross_street_name, reported_borough;
15 |
16 | INSERT INTO geocodings (type, off_street_name, reported_borough, n)
17 | SELECT
18 | 'StreetAddressGeocoding'::text AS type,
19 | off_street_name,
20 | reported_borough,
21 | COUNT(*) AS n
22 | FROM collisions
23 | WHERE latitude IS NULL
24 | AND longitude IS NULL
25 | AND off_street_name IS NOT NULL
26 | GROUP BY off_street_name, reported_borough
27 | ORDER BY COUNT(*) DESC, off_street_name, reported_borough;
28 |
--------------------------------------------------------------------------------
/download_raw_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "`date`: downloading raw data"
4 | mkdir -p raw_data
5 | wget -c -O raw_data/collisions.csv https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD
6 | echo "`date`: done downloading raw data"
7 |
--------------------------------------------------------------------------------
/import_data.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | echo "`date`: importing raw data"
4 | psql nyc-motor-vehicle-collisions -f setup_files/import_data.sql
5 | echo "`date`: done importing raw data; populating vehicles and contributing factors"
6 | psql nyc-motor-vehicle-collisions -f setup_files/populate_vehicles_and_factors.sql
7 | echo "`date`: done populating; creating indexes"
8 | psql nyc-motor-vehicle-collisions -f setup_files/create_indexes.sql
9 | echo "`date`: done"
10 |
--------------------------------------------------------------------------------
/initialize_database.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | createdb nyc-motor-vehicle-collisions
4 |
5 | psql nyc-motor-vehicle-collisions -f setup_files/create_schema.sql
6 |
7 | shp2pgsql -I -s 2263:4326 shapefiles/taxi_zones/taxi_zones.shp | psql -d nyc-motor-vehicle-collisions
8 | psql nyc-motor-vehicle-collisions -c "CREATE INDEX ON taxi_zones (locationid);"
9 | psql nyc-motor-vehicle-collisions -c "VACUUM ANALYZE taxi_zones;"
10 |
11 | shp2pgsql -I -s 2263:4326 shapefiles/nyct2010wi_18d/nyct2010wi.shp nyct2010 | psql -d nyc-motor-vehicle-collisions
12 | psql nyc-motor-vehicle-collisions -c "VACUUM ANALYZE nyct2010;"
13 |
--------------------------------------------------------------------------------
/setup_files/create_indexes.sql:
--------------------------------------------------------------------------------
1 | CREATE INDEX ON collisions (taxi_zone_gid);
2 | CREATE INDEX ON collisions (borough);
3 | CREATE INDEX ON collisions USING brin (collision_time) WITH (pages_per_range = 32);
4 |
--------------------------------------------------------------------------------
/setup_files/create_schema.sql:
--------------------------------------------------------------------------------
1 | CREATE EXTENSION postgis;
2 |
3 | CREATE TABLE collisions_raw (
4 | date date,
5 | time time,
6 | borough text,
7 | zip_code text,
8 | latitude numeric,
9 | longitude numeric,
10 | location text,
11 | on_street_name text,
12 | cross_street_name text,
13 | off_street_name text,
14 | number_of_persons_injured integer,
15 | number_of_persons_killed integer,
16 | number_of_pedestrians_injured integer,
17 | number_of_pedestrians_killed integer,
18 | number_of_cyclists_injured integer,
19 | number_of_cyclists_killed integer,
20 | number_of_motorists_injured integer,
21 | number_of_motorists_killed integer,
22 | contributing_factor_vehicle_1 text,
23 | contributing_factor_vehicle_2 text,
24 | contributing_factor_vehicle_3 text,
25 | contributing_factor_vehicle_4 text,
26 | contributing_factor_vehicle_5 text,
27 | unique_key integer primary key,
28 | vehicle_type_code_1 text,
29 | vehicle_type_code_2 text,
30 | vehicle_type_code_3 text,
31 | vehicle_type_code_4 text,
32 | vehicle_type_code_5 text
33 | );
34 |
35 | CREATE TABLE collisions (
36 | unique_key integer primary key,
37 | collision_time timestamp without time zone,
38 | taxi_zone_gid integer,
39 | nyct2010_gid integer,
40 | borough text,
41 | reported_borough text,
42 | zip_code text,
43 | latitude numeric,
44 | longitude numeric,
45 | on_street_name text,
46 | cross_street_name text,
47 | off_street_name text,
48 | number_of_persons_injured integer,
49 | number_of_persons_killed integer,
50 | number_of_pedestrians_injured integer,
51 | number_of_pedestrians_killed integer,
52 | number_of_cyclists_injured integer,
53 | number_of_cyclists_killed integer,
54 | number_of_motorists_injured integer,
55 | number_of_motorists_killed integer,
56 | contributing_factor_vehicle_1 text,
57 | contributing_factor_vehicle_2 text,
58 | contributing_factor_vehicle_3 text,
59 | contributing_factor_vehicle_4 text,
60 | contributing_factor_vehicle_5 text,
61 | vehicle_type_code_1 text,
62 | vehicle_type_code_2 text,
63 | vehicle_type_code_3 text,
64 | vehicle_type_code_4 text,
65 | vehicle_type_code_5 text,
66 | coordinates_source text
67 | );
68 |
69 | CREATE TABLE collisions_vehicles (
70 | collision_unique_key integer not null,
71 | vehicle_number integer not null,
72 | vehicle_type text,
73 | primary key (collision_unique_key, vehicle_number)
74 | );
75 |
76 | CREATE TABLE collisions_contributing_factors (
77 | collision_unique_key integer not null,
78 | contributing_number integer not null,
79 | contributing_factor text,
80 | primary key (collision_unique_key, contributing_number)
81 | );
82 |
83 | CREATE TABLE geocodings (
84 | id serial primary key,
85 | type text not null,
86 | on_street_name text,
87 | cross_street_name text,
88 | off_street_name text,
89 | reported_borough text,
90 | n integer,
91 | latitude numeric,
92 | longitude numeric,
93 | full_response jsonb,
94 | taxi_zone_gid integer,
95 | nyct2010_gid integer,
96 | borough text
97 | );
98 |
99 | CREATE UNIQUE INDEX ON geocodings (on_street_name, cross_street_name, reported_borough) WHERE off_street_name IS NULL;
100 | CREATE UNIQUE INDEX ON geocodings (on_street_name, cross_street_name) WHERE off_street_name IS NULL AND reported_borough IS NULL;
101 | CREATE UNIQUE INDEX ON geocodings (off_street_name, reported_borough) WHERE on_street_name IS NULL;
102 | CREATE UNIQUE INDEX ON geocodings (off_street_name) WHERE on_street_name IS NULL AND reported_borough IS NULL;
103 |
--------------------------------------------------------------------------------
/setup_files/import_data.sql:
--------------------------------------------------------------------------------
1 | SET datestyle = 'ISO, MDY';
2 |
3 | \copy collisions_raw FROM 'raw_data/collisions.csv' CSV HEADER;
4 |
5 | CREATE TABLE tmp_points AS
6 | SELECT
7 | unique_key,
8 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS location
9 | FROM collisions_raw
10 | WHERE longitude IS NOT NULL AND latitude IS NOT NULL;
11 |
12 | CREATE INDEX ON tmp_points USING gist (location);
13 |
14 | CREATE TABLE tmp_zones AS
15 | SELECT t.unique_key, z.gid, z.borough
16 | FROM tmp_points t, taxi_zones z
17 | WHERE ST_Within(t.location, z.geom);
18 |
19 | CREATE UNIQUE INDEX ON tmp_zones (unique_key);
20 |
21 | CREATE TABLE tmp_tracts AS
22 | SELECT t.unique_key, n.gid, n.boroname
23 | FROM tmp_points t, nyct2010 n
24 | WHERE ST_Within(t.location, n.geom);
25 |
26 | CREATE UNIQUE INDEX ON tmp_tracts (unique_key);
27 |
28 | DELETE FROM collisions
29 | WHERE unique_key IN (SELECT unique_key FROM collisions_raw);
30 |
31 | INSERT INTO collisions (
32 | unique_key, collision_time, taxi_zone_gid, nyct2010_gid, borough,
33 | reported_borough, zip_code, latitude, longitude, on_street_name,
34 | cross_street_name, off_street_name, number_of_persons_injured,
35 | number_of_persons_killed, number_of_pedestrians_injured,
36 | number_of_pedestrians_killed, number_of_cyclists_injured,
37 | number_of_cyclists_killed, number_of_motorists_injured,
38 | number_of_motorists_killed, contributing_factor_vehicle_1,
39 | contributing_factor_vehicle_2, contributing_factor_vehicle_3,
40 | contributing_factor_vehicle_4, contributing_factor_vehicle_5,
41 | vehicle_type_code_1, vehicle_type_code_2, vehicle_type_code_3,
42 | vehicle_type_code_4, vehicle_type_code_5, coordinates_source
43 | )
44 | SELECT
45 | r.unique_key,
46 | (r.date || ' ' || r.time)::timestamp without time zone AS collision_time,
47 | z.gid AS taxi_zone_gid,
48 | t.gid AS nyct2010_gid,
49 | -- prefer borough from nyct2010 over taxi_zones because nyct2010
50 | -- includes water areas and therefore captures bridge accidents
51 | coalesce(t.boroname, z.borough) AS borough,
52 | nullif(lower(trim(r.borough)), '') AS reported_borough,
53 | nullif(trim(r.zip_code), '') AS zip_code,
54 | nullif(r.latitude, 0) AS latitude,
55 | nullif(r.longitude, 0) AS longitude,
56 | nullif(lower(trim(regexp_replace(r.on_street_name, '\s+', ' ', 'g'))), '') AS on_street_name,
57 | nullif(lower(trim(regexp_replace(r.cross_street_name, '\s+', ' ', 'g'))), '') AS cross_street_name,
58 | nullif(lower(trim(regexp_replace(r.off_street_name, '\s+', ' ', 'g'))), '') AS off_street_name,
59 | r.number_of_persons_injured,
60 | r.number_of_persons_killed,
61 | r.number_of_pedestrians_injured,
62 | r.number_of_pedestrians_killed,
63 | r.number_of_cyclists_injured,
64 | r.number_of_cyclists_killed,
65 | r.number_of_motorists_injured,
66 | r.number_of_motorists_killed,
67 | nullif(lower(trim(r.contributing_factor_vehicle_1)), '') AS contributing_factor_vehicle_1,
68 | nullif(lower(trim(r.contributing_factor_vehicle_2)), '') AS contributing_factor_vehicle_2,
69 | nullif(lower(trim(r.contributing_factor_vehicle_3)), '') AS contributing_factor_vehicle_3,
70 | nullif(lower(trim(r.contributing_factor_vehicle_4)), '') AS contributing_factor_vehicle_4,
71 | nullif(lower(trim(r.contributing_factor_vehicle_5)), '') AS contributing_factor_vehicle_5,
72 | nullif(lower(trim(r.vehicle_type_code_1)), '') AS vehicle_type_code_1,
73 | nullif(lower(trim(r.vehicle_type_code_2)), '') AS vehicle_type_code_2,
74 | nullif(lower(trim(r.vehicle_type_code_3)), '') AS vehicle_type_code_3,
75 | nullif(lower(trim(r.vehicle_type_code_4)), '') AS vehicle_type_code_4,
76 | nullif(lower(trim(r.vehicle_type_code_5)), '') AS vehicle_type_code_5,
77 | CASE
78 | WHEN nullif(r.latitude, 0) IS NOT NULL AND nullif(r.longitude, 0) IS NOT NULL
79 | THEN 'raw_data'
80 | END AS coordinates_source
81 | FROM collisions_raw r
82 | LEFT JOIN tmp_zones z ON r.unique_key = z.unique_key
83 | LEFT JOIN tmp_tracts t ON r.unique_key = t.unique_key
84 | ORDER BY r.date, r.time;
85 |
86 | TRUNCATE TABLE collisions_raw;
87 | DROP TABLE tmp_points;
88 | DROP TABLE tmp_zones;
89 | DROP TABLE tmp_tracts;
90 |
--------------------------------------------------------------------------------
/setup_files/populate_vehicles_and_factors.sql:
--------------------------------------------------------------------------------
1 | INSERT INTO collisions_vehicles
2 | (collision_unique_key, vehicle_number, vehicle_type)
3 | WITH vehicles AS (
4 | SELECT
5 | unique_key,
6 | ARRAY[1, 2, 3, 4, 5] AS vehicle_number_array,
7 | ARRAY[
8 | vehicle_type_code_1,
9 | vehicle_type_code_2,
10 | vehicle_type_code_3,
11 | vehicle_type_code_4,
12 | vehicle_type_code_5
13 | ] AS vehicle_type_array
14 | FROM collisions
15 | ),
16 | unnested AS (
17 | SELECT
18 | unique_key,
19 | unnest(vehicle_number_array) AS vehicle_number,
20 | unnest(vehicle_type_array) AS vehicle_type
21 | FROM vehicles
22 | )
23 | SELECT unique_key, vehicle_number, vehicle_type
24 | FROM unnested
25 | WHERE vehicle_type IS NOT NULL
26 | ORDER BY unique_key, vehicle_number
27 | ON CONFLICT (collision_unique_key, vehicle_number)
28 | DO UPDATE SET vehicle_type = EXCLUDED.vehicle_type;
29 |
30 | INSERT INTO collisions_contributing_factors
31 | (collision_unique_key, contributing_number, contributing_factor)
32 | WITH vehicles AS (
33 | SELECT
34 | unique_key,
35 | ARRAY[1, 2, 3, 4, 5] AS contributing_number_array,
36 | ARRAY[
37 | contributing_factor_vehicle_1,
38 | contributing_factor_vehicle_2,
39 | contributing_factor_vehicle_3,
40 | contributing_factor_vehicle_4,
41 | contributing_factor_vehicle_5
42 | ] AS contributing_factor_array
43 | FROM collisions
44 | ),
45 | unnested AS (
46 | SELECT
47 | unique_key,
48 | unnest(contributing_number_array) AS contributing_number,
49 | unnest(contributing_factor_array) AS contributing_factor
50 | FROM vehicles
51 | )
52 | SELECT unique_key, contributing_number, contributing_factor
53 | FROM unnested
54 | WHERE contributing_factor IS NOT NULL
55 | ORDER BY unique_key, contributing_number
56 | ON CONFLICT (collision_unique_key, contributing_number)
57 | DO UPDATE SET contributing_factor = EXCLUDED.contributing_factor;
58 |
--------------------------------------------------------------------------------
/shapefiles/nyct2010wi_18d/nyct2010wi.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/nyct2010wi_18d/nyct2010wi.dbf
--------------------------------------------------------------------------------
/shapefiles/nyct2010wi_18d/nyct2010wi.prj:
--------------------------------------------------------------------------------
1 | PROJCS["NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic"],PARAMETER["False_Easting",984250.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-74.0],PARAMETER["Standard_Parallel_1",40.66666666666666],PARAMETER["Standard_Parallel_2",41.03333333333333],PARAMETER["Latitude_Of_Origin",40.16666666666666],UNIT["Foot_US",0.3048006096012192]]
--------------------------------------------------------------------------------
/shapefiles/nyct2010wi_18d/nyct2010wi.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/nyct2010wi_18d/nyct2010wi.shp
--------------------------------------------------------------------------------
/shapefiles/nyct2010wi_18d/nyct2010wi.shp.xml:
--------------------------------------------------------------------------------
1 | 20181116085344001.0North American Profile of ISO19115 2003FALSEnyct2010wiwithheldLocal Area Network002912287.0687871067382.508423116411.371399273617.84320110.000ProjectedGCS_North_American_1983Linear Unit: Foot_US (0.304801)NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet<ProjectedCoordinateSystem xsi:type='typens:ProjectedCoordinateSystem' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xmlns:xs='http://www.w3.org/2001/XMLSchema' xmlns:typens='http://www.esri.com/schemas/ArcGIS/10.1'><WKT>PROJCS["NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic"],PARAMETER["False_Easting",984250.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-74.0],PARAMETER["Standard_Parallel_1",40.66666666666666],PARAMETER["Standard_Parallel_2",41.03333333333333],PARAMETER["Latitude_Of_Origin",40.16666666666666],UNIT["Foot_US",0.3048006096012192],AUTHORITY["EPSG",2263]]</WKT><XOrigin>-120039300</XOrigin><YOrigin>-96540300</YOrigin><XYScale>37212589.015695661</XYScale><ZOrigin>-100000</ZOrigin><ZScale>10000</ZScale><MOrigin>-100000</MOrigin><MScale>10000</MScale><XYTolerance>0.0032808333333333331</XYTolerance><ZTolerance>0.001</ZTolerance><MTolerance>0.001</MTolerance><HighPrecision>true</HighPrecision><WKID>102718</WKID><LatestWKID>2263</LatestWKID></ProjectedCoordinateSystem>Select c:\temp\BYTES_GP\Districts.gdb\nyct2010wi C:\temp\18D\GIS_OUTPUT\Districts\shp\nyct2010wi__Iteration.shp "ObjectID>=0 AND ObjectID<3000"Select C:\temp\18D\GIS_OUTPUT\Districts\shp\nyct2010wi__Iteration.shp C:\temp\18D\GIS_OUTPUT\Districts\shp\nyct2010wi.shp #201811160905390020181116090539001500000005000New York City Department of City Planning120 Broadway, 31st FloorNew YorkNew York10271USNew York City Department of City Planning120 Broadway, 31st FloorNew YorkNew York10271USAvailable at the following website: https://www1.nyc.gov/site/planning/data-maps/open-data.pageFreehttps://www1.nyc.gov/site/planning/data-maps/open-data.page0.000ESRI ShapefileOpen SpecificationNew York City Census Tracts for 2010 US Census Water Included18DNew York City Department of City PlanningNew York City Department of City PlanningBYTES of the BIG APPLE18D10/23/2018 8:49:29 AM11/19/2018 8:49:49 AM<DIV STYLE="text-align:Left;"><DIV><P><SPAN>The Census Tracts for the 2010 US Census. These boundary files are derived from the US Census Bureau's TIGER project and have been geographically modified to fit the New York City base map.</SPAN></P></DIV></DIV>These districts were created by the Department of City Planning to aid city agencies in administering public services.New York City Department of City Planning120 Broadway, 31st FloorNew YorkNY10271USNew YorkManhattanQueensBrooklynRichmondBronxKingsStaten IslandNew York CityboundariesboundaryboroughNew YorkManhattanQueensBrooklynRichmondboundariesBronxKingsboundaryboroughStaten IslandNew York CityThe data is freely available to all New York City agencies and the public.<DIV STYLE="text-align:Left;"><P><SPAN>This dataset is being provided by the Department of City Planning (DCP) on DCP’s website for informational purposes only. DCP does not warranty the completeness, accuracy, content, or fitness for any particular purpose or use of the dataset, nor are any such warranties to be implied or inferred with respect to the dataset as furnished on the website. DCP and the City are not liable for any deficiencies in the completeness, accuracy, content, or fitness for any particular purpose or use the dataset, or applications utilizing the dataset, provided by any third party.</SPAN></P></DIV>ground conditionDepartment of City Planning.Microsoft Windows 7 Version 6.1 (Build 7601) Service Pack 1; Esri ArcGIS 10.2.0.33481-74.260380-73.69920640.91769140.485808These data are accurate as of US Census 2010.The District files are created from the same release version of the Department of City Planning LION file. The LION file is spatially aligned with NYCMap aerial photography.nyct2010wiFeature Class0FIDFIDOID400Internal feature number.EsriSequential unique whole numbers that are automatically generated.CTLabelCTLabelString7The census tract identifier for the polygon. Each census tract number is unique to its borough.00BoroCodeBoroCodeString1Borough of census tract.1Manhattan2Bronx3Brooklyn4Queens5Staten Island00BoroNameBoroNameString32Borough Name.00CDEligibilCDEligibilString1Community Development Block Grant Eligibility.00CT2010CT2010String6String value of the census tract number.00BoroCT2010BoroCT2010String7Merged string of borough code and census tract number.00PUMAPUMAString4Public Use Microdata Areas00ShapeShapeGeometry000Feature geometry.ESRICoordinates defining the features.SHAPE_AreaShape_AreaDouble19Area of feature in internal units squared.ESRIPositive real numbers that are automatically generated.00NTACodeNTACodeString400Neighborhood Tabulation Area CodeShape_LengShape_LengDouble1900NTANameNTANameString7500Neighborhood Tabulation Area NamedatasetEPSG8.1.10SimpleFALSE0FALSEFALSE20181116
2 |
--------------------------------------------------------------------------------
/shapefiles/nyct2010wi_18d/nyct2010wi.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/nyct2010wi_18d/nyct2010wi.shx
--------------------------------------------------------------------------------
/shapefiles/taxi_zones/taxi_zones.dbf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.dbf
--------------------------------------------------------------------------------
/shapefiles/taxi_zones/taxi_zones.prj:
--------------------------------------------------------------------------------
1 | PROJCS["NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic"],PARAMETER["False_Easting",984250.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-74.0],PARAMETER["Standard_Parallel_1",40.66666666666666],PARAMETER["Standard_Parallel_2",41.03333333333333],PARAMETER["Latitude_Of_Origin",40.16666666666666],UNIT["Foot_US",0.3048006096012192]]
--------------------------------------------------------------------------------
/shapefiles/taxi_zones/taxi_zones.sbn:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.sbn
--------------------------------------------------------------------------------
/shapefiles/taxi_zones/taxi_zones.sbx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.sbx
--------------------------------------------------------------------------------
/shapefiles/taxi_zones/taxi_zones.shp:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.shp
--------------------------------------------------------------------------------
/shapefiles/taxi_zones/taxi_zones.shp.xml:
--------------------------------------------------------------------------------
1 |
2 | 20150921140426001.0TRUECalculateField taxi_zones zone [Taxi_zone] VB #CalculateField taxi_zones LocationID [taxi_zone_] VB #CalculateField taxi_zones borough [FIRST_Boro] VB #
3 |
--------------------------------------------------------------------------------
/shapefiles/taxi_zones/taxi_zones.shx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.shx
--------------------------------------------------------------------------------