├── .gitignore ├── LICENSE ├── README.md ├── analysis ├── analysis.R ├── collisions_map.html ├── export_data.sql ├── helpers.R └── regression_model.R ├── augment_data ├── Gemfile ├── Gemfile.lock ├── README.md ├── augment_collisions_with_geocoding.sql ├── augment_collisions_with_most_common_coordinates.sql ├── geocode.rb └── populate_geocodings.sql ├── download_raw_data.sh ├── import_data.sh ├── initialize_database.sh ├── setup_files ├── create_indexes.sql ├── create_schema.sql ├── import_data.sql └── populate_vehicles_and_factors.sql └── shapefiles ├── nyct2010wi_18d ├── nyct2010wi.dbf ├── nyct2010wi.prj ├── nyct2010wi.shp ├── nyct2010wi.shp.xml └── nyct2010wi.shx └── taxi_zones ├── taxi_zones.dbf ├── taxi_zones.prj ├── taxi_zones.sbn ├── taxi_zones.sbx ├── taxi_zones.shp ├── taxi_zones.shp.xml └── taxi_zones.shx /.gitignore: -------------------------------------------------------------------------------- 1 | raw_data/* 2 | analysis/*.csv 3 | analysis/*.csv.gz 4 | analysis/graphs/*.png 5 | analysis/graphs/boroughs/*.png 6 | analysis/graphs/zones/*.png 7 | .Rapp.history 8 | .DS_Store 9 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Todd Schneider 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # NYC Motor Vehicle Collisions 2 | 3 | Code in support of this post: [Mapping Motor Vehicle Collisions in New York City](https://toddwschneider.com/posts/nyc-motor-vehicle-collisions-map/) 4 | 5 | Raw data comes from the NYPD: https://data.cityofnewyork.us/Public-Safety/NYPD-Motor-Vehicle-Collisions/h9gi-nx95 6 | 7 | ## Instructions 8 | 9 | 1. Download and install PostgreSQL and PostGIS (both are available via Homebrew) 10 | 2. `./download_raw_data.sh` 11 | 3. `./initialize_database.sh` 12 | 4. `./import_data.sh` 13 | 14 | Additional code to fill in missing coordinates for collisions that have cross streets or addresses but no lat/lon lives in the `augment_data/` subfolder 15 | 16 | Assorted SQL and R scripts to analyze data and draw maps are in the `analysis/` subfolder 17 | -------------------------------------------------------------------------------- /analysis/analysis.R: -------------------------------------------------------------------------------- 1 | source("helpers.R") 2 | 3 | collisions = query(" 4 | SELECT 5 | unique_key, 6 | collision_time::date AS date, 7 | date(date_trunc('month', collision_time)) AS month, 8 | extract(hour FROM collision_time) AS hour_of_day, 9 | extract(dow FROM collision_time) AS day_of_week, 10 | latitude, 11 | longitude, 12 | z.locationid, 13 | z.zone, 14 | c.borough, 15 | nyct2010_gid, 16 | number_of_motorists_injured + number_of_cyclists_injured + number_of_pedestrians_injured AS people_injured, 17 | number_of_motorists_injured AS motorists_injured, 18 | number_of_cyclists_injured AS cyclists_injured, 19 | number_of_pedestrians_injured AS pedestrians_injured, 20 | number_of_motorists_killed + number_of_cyclists_killed + number_of_pedestrians_killed AS people_killed, 21 | number_of_motorists_killed AS motorists_killed, 22 | number_of_cyclists_killed AS cyclists_killed, 23 | number_of_pedestrians_killed AS pedestrians_killed 24 | FROM collisions c 25 | LEFT JOIN taxi_zones z ON c.taxi_zone_gid = z.gid 26 | ORDER BY unique_key 27 | ") 28 | 29 | zb = query("SELECT DISTINCT zone, borough FROM taxi_zones ORDER BY zone") 30 | 31 | date_seq = seq( 32 | min(collisions$date), 33 | max(collisions$date), 34 | by = "1 day" 35 | ) 36 | 37 | variable_factor_levels = c( 38 | "collisions", 39 | "people_injured", 40 | "people_killed", 41 | "motorists_injured", 42 | "motorists_killed", 43 | "cyclists_injured", 44 | "cyclists_killed", 45 | "pedestrians_injured", 46 | "pedestrians_killed" 47 | ) 48 | 49 | variable_factor_labels = variable_factor_levels %>% 50 | gsub("_", " ", .) %>% 51 | capitalize_first_letter() 52 | 53 | aggregate_collisions = function(dimensions = quos()) { 54 | dimensions_excluded = purrr::map(dimensions, function(d) expr(-!!d)) 55 | 56 | collisions %>% 57 | mutate(collisions = 1) %>% 58 | select(!!!dimensions, date, collisions, matches("_(injured|killed)$")) %>% 59 | group_by(!!!dimensions, date) %>% 60 | summarize_all(sum) %>% 61 | ungroup() %>% 62 | group_by(!!!dimensions) %>% 63 | complete( 64 | date = date_seq, 65 | fill = list( 66 | collisions = 0, 67 | people_injured = 0, 68 | motorists_injured = 0, 69 | cyclists_injured = 0, 70 | pedestrians_injured = 0, 71 | people_killed = 0, 72 | motorists_killed = 0, 73 | cyclists_killed = 0, 74 | pedestrians_killed = 0 75 | ) 76 | ) %>% 77 | ungroup() %>% 78 | gather(variable, daily, -date, !!!dimensions_excluded) %>% 79 | group_by(!!!dimensions, variable) %>% 80 | arrange(!!!dimensions, variable, date) %>% 81 | mutate( 82 | rolling28 = rollsumr(daily, k = 28, na.pad = TRUE), 83 | rolling365 = rollsumr(daily, k = 365, na.pad = TRUE) 84 | ) %>% 85 | ungroup() %>% 86 | mutate( 87 | year = year(date), 88 | variable = factor(variable, levels = variable_factor_levels, labels = variable_factor_labels) 89 | ) 90 | } 91 | 92 | aggregate_collisions_by_year = function(dimensions = quos()) { 93 | aggregate_collisions(dimensions = dimensions) %>% 94 | group_by(!!!dimensions, variable, year) %>% 95 | summarize(total = sum(daily)) %>% 96 | ungroup() 97 | } 98 | 99 | aggregated_data = aggregate_collisions_by_year() %>% 100 | filter(year %in% 2013:2018) 101 | 102 | aggregated_data_by_borough = aggregate_collisions_by_year(dimensions = quos(borough)) %>% 103 | filter(year %in% 2013:2018) 104 | 105 | aggregated_data_by_zone = aggregate_collisions_by_year(dimensions = quos(zone)) %>% 106 | filter(year %in% 2013:2018) %>% 107 | inner_join(zb, by = "zone") 108 | 109 | zones = aggregated_data_by_zone %>% 110 | distinct(zone) %>% 111 | filter(!is.na(zone)) %>% 112 | pull(zone) 113 | 114 | plot_bs = 32 115 | plot_width = 800 116 | 117 | p1 = aggregated_data %>% 118 | filter(variable == "Collisions") %>% 119 | ggplot(aes(x = year, y = total)) + 120 | geom_line(size = 1, color = nypd_blue) + 121 | geom_point(size = 3, color = nypd_blue) + 122 | geom_blank(aes(y = 0)) + 123 | geom_blank(aes(y = 1.3 * total)) + 124 | scale_y_continuous(labels = scales::comma) + 125 | ggtitle("New York City Motor Vehicle Collisions", "Annual total") + 126 | labs(caption = "Data via NYPD\ntoddwschneider.com") + 127 | theme_tws(base_size = plot_bs) + 128 | theme( 129 | axis.title = element_blank(), 130 | panel.grid.minor.x = element_blank() 131 | ) 132 | 133 | p2 = aggregated_data %>% 134 | filter(variable != "Collisions") %>% 135 | ggplot(aes(x = year, y = total)) + 136 | geom_line(size = 1, color = nypd_blue) + 137 | geom_point(size = 3, color = nypd_blue) + 138 | geom_blank(aes(y = 0)) + 139 | geom_blank(aes(y = 1.4 * total)) + 140 | scale_x_continuous(breaks = c(2014, 2016, 2018)) + 141 | scale_y_continuous(labels = scales::comma) + 142 | facet_wrap(~variable, scales = "free_y", ncol = 2) + 143 | ggtitle("New York City", "Annual injuries from motor vehicle collisions") + 144 | labs(caption = "Data via NYPD\ntoddwschneider.com") + 145 | theme_tws(base_size = plot_bs) + 146 | theme( 147 | axis.title = element_blank(), 148 | panel.grid.minor.y = element_blank(), 149 | plot.subtitle = element_text(margin = unit(c(0, 0, 1.1, 0), "lines")), 150 | plot.margin = margin(plot_bs / 2, plot_bs * 0.75, plot_bs / 2, plot_bs / 2), 151 | axis.text = element_text(size = rel(0.7)), 152 | strip.text = element_text(size = rel(0.7)) 153 | ) 154 | 155 | png("graphs/nyc_collisions.png", height = plot_width * 0.75, width = plot_width) 156 | print(p1) 157 | dev.off() 158 | 159 | png("graphs/nyc_injuries.png", height = plot_width * 1.5, width = plot_width) 160 | print(p2) 161 | dev.off() 162 | 163 | for (b in c("Bronx", "Brooklyn", "Manhattan", "Queens", "Staten Island")) { 164 | p1 = aggregated_data_by_borough %>% 165 | filter(borough == b, variable == "Collisions") %>% 166 | ggplot(aes(x = year, y = total)) + 167 | geom_line(size = 1, color = nypd_blue) + 168 | geom_point(size = 3, color = nypd_blue) + 169 | geom_blank(aes(y = 0)) + 170 | geom_blank(aes(y = 1.4 * total)) + 171 | scale_x_continuous(breaks = c(2014, 2016, 2018)) + 172 | scale_y_continuous(labels = scales::comma) + 173 | ggtitle(paste(b, "Motor Vehicle Collisions"), "Annual total") + 174 | labs(caption = "Data via NYPD\ntoddwschneider.com") + 175 | theme_tws(base_size = plot_bs) + 176 | theme( 177 | axis.title = element_blank(), 178 | panel.grid.minor.y = element_blank() 179 | ) 180 | 181 | p2 = aggregated_data_by_borough %>% 182 | filter(variable != "Collisions", borough == b) %>% 183 | ggplot(aes(x = year, y = total)) + 184 | geom_line(size = 1, color = nypd_blue) + 185 | geom_point(size = 3, color = nypd_blue) + 186 | geom_blank(aes(y = 0)) + 187 | geom_blank(aes(y = 1.4 * total)) + 188 | scale_x_continuous(breaks = c(2014, 2016, 2018)) + 189 | scale_y_continuous(labels = scales::comma, breaks = integer_breaks(n = 3)) + 190 | facet_wrap(~variable, scales = "free_y", ncol = 2) + 191 | ggtitle(b, "Annual injuries from motor vehicle collisions") + 192 | labs(caption = "Data via NYPD\ntoddwschneider.com") + 193 | theme_tws(base_size = plot_bs) + 194 | theme( 195 | axis.title = element_blank(), 196 | panel.grid.minor.y = element_blank(), 197 | plot.subtitle = element_text(margin = unit(c(0, 0, 1.1, 0), "lines")), 198 | plot.margin = margin(plot_bs / 2, plot_bs * 0.75, plot_bs / 2, plot_bs / 2), 199 | axis.text = element_text(size = rel(0.7)), 200 | strip.text = element_text(size = rel(0.7)) 201 | ) 202 | 203 | png(paste0("graphs/boroughs/", tolower(gsub(" ", "_", b)), "_collisions.png"), height = plot_width * 0.75, width = plot_width) 204 | print(p1) 205 | dev.off() 206 | 207 | png(paste0("graphs/boroughs/", tolower(gsub(" ", "_", b)), "_injuries.png"), height = plot_width * 1.5, width = plot_width) 208 | print(p2) 209 | dev.off() 210 | } 211 | 212 | for (z in zones) { 213 | zfile = z %>% 214 | str_replace_all("['()]", "") %>% 215 | str_replace_all("[\\s/]", "_") %>% 216 | tolower() 217 | 218 | borough = filter(zb, zone == z)$borough 219 | 220 | p1 = aggregated_data_by_zone %>% 221 | filter(zone == z, variable == "Collisions") %>% 222 | ggplot(aes(x = year, y = total)) + 223 | geom_line(size = 1, color = nypd_blue) + 224 | geom_point(size = 3, color = nypd_blue) + 225 | geom_blank(aes(y = 0)) + 226 | geom_blank(aes(y = 1.4 * total)) + 227 | scale_x_continuous(breaks = c(2014, 2016, 2018)) + 228 | scale_y_continuous(labels = scales::comma) + 229 | ggtitle( 230 | paste(z, borough, sep = ", "), 231 | "Annual motor vehicle collisions" 232 | ) + 233 | labs(caption = "Data via NYPD\ntoddwschneider.com") + 234 | theme_tws(base_size = plot_bs) + 235 | theme( 236 | axis.title = element_blank(), 237 | panel.grid.minor.y = element_blank() 238 | ) 239 | 240 | p2 = aggregated_data_by_zone %>% 241 | filter(zone == z, variable != "Collisions") %>% 242 | ggplot(aes(x = year, y = total)) + 243 | geom_line(size = 1, color = nypd_blue) + 244 | geom_point(size = 3, color = nypd_blue) + 245 | geom_blank(aes(y = 0)) + 246 | geom_blank(aes(y = 1.4 * total)) + 247 | scale_x_continuous(breaks = c(2014, 2016, 2018)) + 248 | scale_y_continuous(labels = scales::comma, breaks = integer_breaks(n = 3)) + 249 | facet_wrap(~variable, scales = "free_y", ncol = 2) + 250 | ggtitle( 251 | paste(z, borough, sep = ", "), 252 | "Annual injuries from motor vehicle collision" 253 | ) + 254 | labs(caption = "Data via NYPD\ntoddwschneider.com") + 255 | theme_tws(base_size = plot_bs) + 256 | theme( 257 | axis.title = element_blank(), 258 | panel.grid.minor.y = element_blank(), 259 | plot.subtitle = element_text(margin = unit(c(0, 0, 1.1, 0), "lines")), 260 | plot.margin = margin(plot_bs / 2, plot_bs * 0.75, plot_bs / 2, plot_bs / 2), 261 | axis.text = element_text(size = rel(0.7)), 262 | strip.text = element_text(size = rel(0.7)) 263 | ) 264 | 265 | png(paste0("graphs/zones/", zfile, "_collisions.png"), height = plot_width * 0.75, width = plot_width) 266 | print(p1) 267 | dev.off() 268 | 269 | png(paste0("graphs/zones/", zfile, "_injuries.png"), height = plot_width * 1.5, width = plot_width) 270 | print(p2) 271 | dev.off() 272 | } 273 | 274 | 275 | 276 | # find zones with steepest (positive or negative) trends 277 | regressions_data = aggregated_data_by_zone %>% 278 | select(zone, borough, year, variable, total) %>% 279 | mutate(variable = gsub(" ", "_", tolower(variable))) %>% 280 | spread(variable, total) 281 | 282 | slopes = purrr::map(zones, function(z) { 283 | df = filter(regressions_data, zone == z) 284 | 285 | tibble( 286 | zone = z, 287 | people_injured = lm(people_injured ~ year, data = df)$coef["year"], 288 | motorists_injured = lm(motorists_injured ~ year, data = df)$coef["year"], 289 | cyclists_injured = lm(cyclists_injured ~ year, data = df)$coef["year"], 290 | pedestrians_injured = lm(pedestrians_injured ~ year, data = df)$coef["year"] 291 | ) 292 | }) %>% bind_rows() 293 | 294 | arrange(slopes, people_injured) 295 | arrange(slopes, desc(people_injured)) 296 | 297 | 298 | 299 | # injury rates by time of day and alcohol involvement 300 | alcohol_involved_unique_keys = query(" 301 | SELECT DISTINCT collision_unique_key 302 | FROM collisions_contributing_factors 303 | WHERE contributing_factor LIKE '%alcohol%' 304 | ")$collision_unique_key 305 | 306 | injury_rates_hourly = collisions %>% 307 | mutate(alcohol_involved = unique_key %in% alcohol_involved_unique_keys) %>% 308 | group_by(hour_of_day) %>% 309 | summarize( 310 | collisions = n(), 311 | frac_with_injury = mean(people_injured > 0), 312 | frac_with_fatality = mean(people_killed > 0), 313 | frac_with_alcohol_involvement = mean(alcohol_involved) 314 | ) %>% 315 | ungroup() 316 | 317 | injury_rates_hourly = bind_rows( 318 | injury_rates_hourly, 319 | injury_rates_hourly %>% 320 | filter(hour_of_day == 0) %>% 321 | mutate(hour_of_day = 24) 322 | ) 323 | 324 | p1 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = collisions)) + 325 | geom_line(size = 1, color = nypd_blue) + 326 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) + 327 | scale_y_continuous(labels = scales::comma) + 328 | expand_limits(y = c(0, 125e3)) + 329 | ggtitle( 330 | "NYC collisions by time of day", 331 | "Jul 2012–Jan 2019" 332 | ) + 333 | labs(caption = "Data via NYPD\ntoddwschneider.com") + 334 | theme_tws(base_size = plot_bs) + 335 | no_axis_titles() 336 | 337 | p2 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = frac_with_injury)) + 338 | geom_line(size = 1, color = nypd_blue) + 339 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) + 340 | scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + 341 | expand_limits(y = 0) + 342 | ggtitle( 343 | "Injury rate by time of day", 344 | "% of NYC collisions that result in injury" 345 | ) + 346 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") + 347 | theme_tws(base_size = plot_bs) + 348 | no_axis_titles() 349 | 350 | p3 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = frac_with_fatality)) + 351 | geom_line(size = 1, color = nypd_blue) + 352 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) + 353 | scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) + 354 | expand_limits(y = 0) + 355 | ggtitle( 356 | "Fatality rate by time of day", 357 | "% of NYC collisions that result in fatality" 358 | ) + 359 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") + 360 | theme_tws(base_size = plot_bs) + 361 | no_axis_titles() 362 | 363 | p4 = ggplot(injury_rates_hourly, aes(x = hour_of_day, y = frac_with_alcohol_involvement)) + 364 | geom_line(size = 1, color = nypd_blue) + 365 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) + 366 | scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + 367 | expand_limits(y = 0) + 368 | ggtitle( 369 | "Alcohol involvement by time of day", 370 | "% of NYC collisions with alcohol cited as contributing factor" 371 | ) + 372 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") + 373 | theme_tws(base_size = plot_bs) + 374 | theme(plot.subtitle = element_text(size = rel(0.8))) + 375 | no_axis_titles() 376 | 377 | png("graphs/collisions_by_hour.png", height = plot_width, width = plot_width) 378 | print(p1) 379 | dev.off() 380 | 381 | png("graphs/injury_rate_by_hour.png", height = plot_width, width = plot_width) 382 | print(p2) 383 | dev.off() 384 | 385 | png("graphs/fatality_rate_by_hour.png", height = plot_width, width = plot_width) 386 | print(p3) 387 | dev.off() 388 | 389 | png("graphs/alcohol_involvement_by_hour.png", height = plot_width, width = plot_width) 390 | print(p4) 391 | dev.off() 392 | 393 | alcohol_stats = collisions %>% 394 | mutate(alcohol_involved = unique_key %in% alcohol_involved_unique_keys) %>% 395 | group_by(alcohol_involved) %>% 396 | summarize( 397 | collisions = n(), 398 | frac_with_injury = mean(people_injured > 0), 399 | frac_with_fatality = mean(people_killed > 0) 400 | ) %>% 401 | ungroup() 402 | 403 | injury_rates_alcohol = collisions %>% 404 | mutate(alcohol_involved = unique_key %in% alcohol_involved_unique_keys) %>% 405 | group_by(hour_of_day, alcohol_involved) %>% 406 | summarize( 407 | collisions = n(), 408 | frac_with_injury = mean(people_injured > 0), 409 | frac_with_fatality = mean(people_killed > 0) 410 | ) %>% 411 | ungroup() 412 | 413 | injury_rates_alcohol = bind_rows( 414 | injury_rates_alcohol, 415 | injury_rates_alcohol %>% 416 | filter(hour_of_day == 0) %>% 417 | mutate(hour_of_day = 24) 418 | ) 419 | 420 | p5 = ggplot(injury_rates_alcohol, aes(x = hour_of_day, y = frac_with_injury, color = alcohol_involved)) + 421 | geom_line(size = 1) + 422 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) + 423 | scale_y_continuous(labels = scales::percent_format(accuracy = 1)) + 424 | scale_color_manual(values = c(nypd_blue, "#49acd5"), guide = FALSE) + 425 | expand_limits(y = c(0, 0.4)) + 426 | annotate( 427 | "text", x = 17, y = 0.15, label = "Alcohol not cited", 428 | size = 8, color = nypd_blue, family = "Open Sans" 429 | ) + 430 | annotate( 431 | "text", x = 17, y = 0.34, label = "Alcohol cited", 432 | size = 8, color = "#49acd5", family = "Open Sans" 433 | ) + 434 | ggtitle( 435 | "Injury rate by alcohol involvement", 436 | "% of NYC collisions that result in injury" 437 | ) + 438 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") + 439 | theme_tws(base_size = plot_bs) + 440 | no_axis_titles() 441 | 442 | p6 = ggplot(injury_rates_alcohol, aes(x = hour_of_day, y = frac_with_fatality, color = alcohol_involved)) + 443 | geom_line(size = 1) + 444 | scale_x_continuous(breaks = c(0, 6, 12, 18, 24), labels = c("12 AM", "6 AM", "12 PM", "6 PM", "12 AM")) + 445 | scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) + 446 | scale_color_manual(values = c(nypd_blue, "#49acd5"), guide = FALSE) + 447 | expand_limits(y = c(0, 0.01)) + 448 | annotate( 449 | "text", x = 5, y = 0, label = "Alcohol not cited", 450 | size = 8, color = nypd_blue, family = "Open Sans" 451 | ) + 452 | annotate( 453 | "text", x = 5, y = 0.009, label = "Alcohol cited", 454 | size = 8, color = "#49acd5", family = "Open Sans" 455 | ) + 456 | ggtitle( 457 | "Fatality rate by alcohol involvement", 458 | "% of NYC collisions that result in fatality" 459 | ) + 460 | labs(caption = "Data via NYPD, Jul 2012–Jan 2019\ntoddwschneider.com") + 461 | theme_tws(base_size = plot_bs) + 462 | no_axis_titles() 463 | 464 | png("graphs/injury_rate_by_alcohol_involvement.png", height = plot_width, width = plot_width) 465 | print(p5) 466 | dev.off() 467 | 468 | png("graphs/fatality_rate_by_alcohol_involvement.png", height = plot_width, width = plot_width) 469 | print(p6) 470 | dev.off() 471 | -------------------------------------------------------------------------------- /analysis/collisions_map.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | NYC Motor Vehicle Collisions 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 143 | 144 | 145 | 146 | 153 | 154 |
155 | 156 |
157 | × 158 |
159 |
160 | 161 |
162 | 173 |
174 | 175 |
176 | 184 |
185 | 186 |
187 | 197 |
198 | 199 |
200 | 208 |
209 | 210 |
211 | 217 |
218 | 219 |
220 | 226 |
227 | 228 |
229 | 230 | 231 |
232 |
233 | 234 | 235 |
236 | 237 |
238 | 239 | 240 |
241 | 242 |
243 | 244 | 245 |
246 |
247 |
248 | 249 |
250 | 251 |
252 | Fetching data 253 | throbber 254 |
255 | 256 |
257 | 258 |
259 | 260 | 261 |
262 | 263 |
264 | 265 | 266 | 267 | 268 | 269 |
270 | 271 |
272 | 273 |
274 | 275 | GitHub 276 | 277 |
278 |
279 | 280 |
281 | 282 | 290 | 291 | 292 | 865 | 866 | -------------------------------------------------------------------------------- /analysis/export_data.sql: -------------------------------------------------------------------------------- 1 | DROP TABLE IF EXISTS export_data; 2 | 3 | CREATE TEMP TABLE export_data AS 4 | WITH nv AS ( 5 | SELECT 6 | collision_unique_key, 7 | count(*) AS num_vehicles 8 | FROM collisions_vehicles 9 | GROUP BY collision_unique_key 10 | ) 11 | SELECT 12 | extract(year FROM collision_time) AS year, 13 | CASE borough 14 | WHEN 'Bronx' THEN 'x' 15 | WHEN 'Brooklyn' THEN 'k' 16 | WHEN 'Manhattan' THEN 'm' 17 | WHEN 'Queens' THEN 'q' 18 | WHEN 'Staten Island' THEN 's' 19 | END AS borough, 20 | round(latitude, 5) AS lat, 21 | round(longitude, 5) AS lng, 22 | nv.num_vehicles, 23 | number_of_motorists_injured AS motorists_injured, 24 | number_of_motorists_killed AS motorists_killed, 25 | number_of_cyclists_injured AS cyclists_injured, 26 | number_of_cyclists_killed AS cyclists_killed, 27 | number_of_pedestrians_injured AS pedestrians_injured, 28 | number_of_pedestrians_killed AS pedestrians_killed, 29 | CASE 30 | WHEN extract(hour FROM collision_time) IN (8, 9, 10) THEN 'm' 31 | WHEN extract(hour FROM collision_time) IN (11, 12, 13, 14, 15) THEN 'i' 32 | WHEN extract(hour FROM collision_time) IN (16, 17, 18) THEN 'a' 33 | WHEN extract(hour FROM collision_time) IN (19, 20, 21) THEN 'e' 34 | ELSE 'o' 35 | END AS time_of_day 36 | FROM collisions c 37 | LEFT JOIN nv ON c.unique_key = nv.collision_unique_key 38 | WHERE latitude IS NOT NULL 39 | AND longitude IS NOT NULL 40 | AND latitude BETWEEN 40.4 AND 41 41 | AND longitude BETWEEN -74.4 AND -73.5 42 | ORDER BY unique_key; 43 | 44 | \copy (SELECT * FROM export_data) TO 'nyc_motor_vehicle_collisions.csv' CSV HEADER; 45 | -------------------------------------------------------------------------------- /analysis/helpers.R: -------------------------------------------------------------------------------- 1 | required_packages = c("tidyverse", "scales", "lubridate", "RPostgres", "zoo") 2 | installed_packages = rownames(installed.packages()) 3 | packages_to_install = required_packages[!(required_packages %in% installed_packages)] 4 | 5 | if (length(packages_to_install) > 0) { 6 | install.packages( 7 | packages_to_install, 8 | dependencies = TRUE, 9 | repos = "https://cloud.r-project.org", 10 | ) 11 | } 12 | 13 | library(tidyverse) 14 | library(scales) 15 | library(lubridate) 16 | library(zoo) 17 | library(RPostgres) 18 | 19 | con = dbConnect( 20 | dbDriver("Postgres"), 21 | dbname = "nyc-motor-vehicle-collisions", 22 | host = "localhost" 23 | ) 24 | 25 | query = function(sql) { 26 | res = dbSendQuery(con, sql) 27 | results = dbFetch(res) %>% as_tibble() 28 | dbClearResult(res) 29 | results 30 | } 31 | 32 | capitalize_first_letter = function(string) { 33 | paste0(toupper(substr(string, 1, 1)), substr(string, 2, nchar(string))) 34 | } 35 | 36 | theme_void_sf = function(base_size = 12) { 37 | theme_void(base_size = base_size) + 38 | theme( 39 | panel.grid = element_line(size = 0), 40 | text = element_text(family = "Open Sans") 41 | ) 42 | } 43 | 44 | font_family = "Open Sans" 45 | title_font_family = "Fjalla One" 46 | nypd_blue = "#00003c" 47 | 48 | theme_tws = function(base_size = 12) { 49 | bg_color = "#f4f4f4" 50 | bg_rect = element_rect(fill = bg_color, color = bg_color) 51 | 52 | theme_bw(base_size) + 53 | theme( 54 | text = element_text(family = font_family), 55 | plot.title = element_text(family = title_font_family), 56 | plot.subtitle = element_text(size = rel(1)), 57 | plot.caption = element_text(size = rel(0.5), margin = unit(c(1, 0, 0, 0), "lines"), lineheight = 1.1, color = "#555555"), 58 | plot.background = bg_rect, 59 | axis.ticks = element_blank(), 60 | axis.text.x = element_text(size = rel(1)), 61 | axis.title.x = element_text(size = rel(1), margin = margin(1, 0, 0, 0, unit = "lines")), 62 | axis.text.y = element_text(size = rel(1)), 63 | axis.title.y = element_text(size = rel(1)), 64 | panel.background = bg_rect, 65 | panel.border = element_blank(), 66 | panel.grid.major = element_line(color = "grey80", size = 0.25), 67 | panel.grid.minor = element_line(color = "grey80", size = 0.25), 68 | panel.spacing = unit(1.5, "lines"), 69 | legend.background = bg_rect, 70 | legend.key.width = unit(1.5, "line"), 71 | legend.key = element_blank(), 72 | strip.background = element_blank() 73 | ) 74 | } 75 | 76 | no_axis_titles = function() { 77 | theme(axis.title = element_blank()) 78 | } 79 | 80 | # via https://stackoverflow.com/a/10559838 81 | integer_breaks = function(n = 3, ...) { 82 | breaker = pretty_breaks(n, ...) 83 | function(x) { 84 | breaks = breaker(x) 85 | breaks[breaks == floor(breaks)] 86 | } 87 | } 88 | -------------------------------------------------------------------------------- /analysis/regression_model.R: -------------------------------------------------------------------------------- 1 | source("helpers.R") 2 | library(glmnet) 3 | 4 | # set up regression data 5 | regression_collisions = query(" 6 | SELECT 7 | unique_key, 8 | extract(hour FROM collision_time) AS hour_of_day, 9 | extract(dow FROM collision_time) AS day_of_week, 10 | extract(year FROM collision_time) AS year, 11 | coalesce(on_street_name, off_street_name) AS street_name, 12 | borough, 13 | number_of_motorists_injured + number_of_cyclists_injured + number_of_pedestrians_injured > 0 AS has_injury, 14 | number_of_motorists_killed + number_of_cyclists_killed + number_of_pedestrians_killed > 0 AS has_fatality 15 | FROM collisions 16 | WHERE borough IS NOT NULL 17 | ORDER BY unique_key 18 | ") 19 | 20 | regression_contributing_factors = query(" 21 | WITH candidates AS ( 22 | SELECT contributing_factor 23 | FROM collisions_contributing_factors 24 | GROUP BY contributing_factor 25 | HAVING COUNT(*) >= 5000 26 | ) 27 | SELECT * 28 | FROM collisions_contributing_factors 29 | WHERE contributing_factor IN (SELECT contributing_factor FROM candidates) 30 | ") 31 | 32 | regression_vehicle_types = query(" 33 | WITH candidates AS ( 34 | SELECT vehicle_type 35 | FROM collisions_vehicles 36 | GROUP BY vehicle_type 37 | HAVING COUNT(*) >= 5000 38 | ) 39 | SELECT * 40 | FROM collisions_vehicles 41 | WHERE vehicle_type IN (SELECT vehicle_type FROM candidates) 42 | ") 43 | 44 | regression_vehicles_involved = query(" 45 | SELECT collision_unique_key, count(*)::int AS num_vehicles 46 | FROM collisions_vehicles 47 | GROUP BY collision_unique_key 48 | ORDER BY collision_unique_key 49 | ") 50 | 51 | regression_collisions = regression_collisions %>% 52 | inner_join(regression_vehicles_involved, by = c("unique_key" = "collision_unique_key")) %>% 53 | mutate( 54 | hour_of_day = factor(hour_of_day, levels = c(12:23, 0:11)), 55 | day_of_week = factor(day_of_week), 56 | weekday = factor(day_of_week %in% 1:5), 57 | year = factor(year), 58 | borough = fct_relevel(factor(borough), "Manhattan"), 59 | num_vehicles = factor(num_vehicles, levels = c(2, 1, 3, 4, 5)), 60 | street_type = fct_relevel(factor(case_when( 61 | grepl("expressway|expy|expwy|parkway|pkwy|highway|bqe|turnpike|fdr|thruway", street_name) ~ "highway", 62 | grepl("street| st$", street_name) ~ "street", 63 | grepl("avenue|broadway|bowery| ave$", street_name) ~ "avenue", 64 | grepl(" road| rd$", street_name) ~ "road", 65 | grepl(" lane| ln$", street_name) ~ "lane", 66 | grepl(" drive| dr$", street_name) ~ "drive", 67 | grepl("boulevard|blvd", street_name) ~ "boulevard", 68 | grepl(" place| pl$", street_name) ~ "place", 69 | grepl("bridge", street_name) ~ "bridge", 70 | grepl("tunnel", street_name) ~ "tunnel", 71 | !is.na(street_name) ~ "other", 72 | TRUE ~ "unknown" 73 | )), "unknown") 74 | ) 75 | 76 | for(f in sort(unique(regression_contributing_factors$contributing_factor))) { 77 | fname = paste0("cf_", gsub(".", "_", make.names(f), fixed = TRUE)) 78 | 79 | factor_unique_keys = regression_contributing_factors %>% 80 | filter(contributing_factor == f) %>% 81 | pull(collision_unique_key) %>% 82 | unique() 83 | 84 | regression_collisions = regression_collisions %>% 85 | mutate(!!fname := as.numeric(unique_key %in% factor_unique_keys)) 86 | } 87 | 88 | for(v in sort(unique(regression_vehicle_types$vehicle_type))) { 89 | vname = paste0("vt_", gsub(".", "_", make.names(v), fixed = TRUE)) 90 | 91 | vehicle_unique_keys = regression_vehicle_types %>% 92 | filter(vehicle_type == v) %>% 93 | pull(collision_unique_key) %>% 94 | unique() 95 | 96 | regression_collisions = regression_collisions %>% 97 | mutate(!!vname := as.numeric(unique_key %in% vehicle_unique_keys)) 98 | } 99 | 100 | # build model matrices 101 | injury_model_matrix = sparse.model.matrix( 102 | has_injury ~ . - 1, 103 | select(regression_collisions, -day_of_week, -street_name, -unique_key, -has_fatality) 104 | ) 105 | 106 | fatality_model_matrix = sparse.model.matrix( 107 | has_fatality ~ . - 1, 108 | select(regression_collisions, -day_of_week, -street_name, -unique_key, -has_injury) 109 | ) 110 | 111 | # run regularized regressions 112 | injury_cvfit = cv.glmnet( 113 | x = injury_model_matrix, 114 | y = regression_collisions$has_injury, 115 | family = "binomial" 116 | ) 117 | 118 | fatality_cvfit = cv.glmnet( 119 | x = fatality_model_matrix, 120 | y = regression_collisions$has_fatality, 121 | family = "binomial" 122 | ) 123 | 124 | # check on lambda values and coefficients 125 | plot(injury_cvfit) 126 | coef(injury_cvfit, s = "lambda.1se") 127 | 128 | plot(fatality_cvfit) 129 | coef(fatality_cvfit, s = "lambda.1se") 130 | -------------------------------------------------------------------------------- /augment_data/Gemfile: -------------------------------------------------------------------------------- 1 | source 'https://rubygems.org' 2 | 3 | ruby '2.6.6' 4 | 5 | gem 'activerecord', '~> 5.2', require: 'active_record' 6 | gem 'pg', '~> 1.1' 7 | gem 'rest-client', '~> 2.0' 8 | -------------------------------------------------------------------------------- /augment_data/Gemfile.lock: -------------------------------------------------------------------------------- 1 | GEM 2 | remote: https://rubygems.org/ 3 | specs: 4 | activemodel (5.2.8.1) 5 | activesupport (= 5.2.8.1) 6 | activerecord (5.2.8.1) 7 | activemodel (= 5.2.8.1) 8 | activesupport (= 5.2.8.1) 9 | arel (>= 9.0) 10 | activesupport (5.2.8.1) 11 | concurrent-ruby (~> 1.0, >= 1.0.2) 12 | i18n (>= 0.7, < 2) 13 | minitest (~> 5.1) 14 | tzinfo (~> 1.1) 15 | arel (9.0.0) 16 | concurrent-ruby (1.1.10) 17 | domain_name (0.5.20190701) 18 | unf (>= 0.0.5, < 1.0.0) 19 | http-accept (1.7.0) 20 | http-cookie (1.0.3) 21 | domain_name (~> 0.5) 22 | i18n (1.11.0) 23 | concurrent-ruby (~> 1.0) 24 | mime-types (3.3.1) 25 | mime-types-data (~> 3.2015) 26 | mime-types-data (3.2020.0512) 27 | minitest (5.16.2) 28 | netrc (0.11.0) 29 | pg (1.2.3) 30 | rest-client (2.1.0) 31 | http-accept (>= 1.7.0, < 2.0) 32 | http-cookie (>= 1.0.2, < 2.0) 33 | mime-types (>= 1.16, < 4.0) 34 | netrc (~> 0.8) 35 | thread_safe (0.3.6) 36 | tzinfo (1.2.10) 37 | thread_safe (~> 0.1) 38 | unf (0.1.4) 39 | unf_ext 40 | unf_ext (0.0.7.7) 41 | 42 | PLATFORMS 43 | ruby 44 | 45 | DEPENDENCIES 46 | activerecord (~> 5.2) 47 | pg (~> 1.1) 48 | rest-client (~> 2.0) 49 | 50 | RUBY VERSION 51 | ruby 2.6.6p146 52 | 53 | BUNDLED WITH 54 | 1.17.2 55 | -------------------------------------------------------------------------------- /augment_data/README.md: -------------------------------------------------------------------------------- 1 | # Fill in collision records that are missing coordinates 2 | 3 | ~13% of the collisions in the raw dataset are missing lat/lon coordinates. Many of those records have cross streets or addresses, which can be used to fill in coordinates with reasonable guesses. There are 2 strategies used to fill in missing coordinates: other collisions at the same cross streets, and geocoding. These processes add coordinates for ~8% of the full dataset, leaving ~5% of collisions without coordinates. The `collisions.coordinates_source` column keeps track of where each collision's coordinates came from. It's possible these augmentation processes might introduce some data errors, but an anecdotal manual review suggested that they are accurate more often than not. 4 | 5 | ## Coordinates from other collisions listed at the same cross streets 6 | 7 | Calculate the most common coordinates for each pair of cross streets, and use them to fill in coordinates for collisions at the same cross streets that are missing coordinates. E.g. there are collisions listed at Bruckner Boulevard & E 138 St that are missing coordinates, but there are also collisions at the same cross streets that have coordinates, so assume that the unknown Bruckner & E 138 collisions happened at the most common coordinates listed for the known Bruckner & E 138 collisions, subject there being at least 2 known collisions at the same lat/lon rounded to 4 digits. 8 | 9 | `psql nyc-motor-vehicle-collisions -f augment_collisions_with_most_common_coordinates.sql` 10 | 11 | ## Geocode missing coordinates 12 | 13 | 1. Get a Google Maps Geocoding API key: https://developers.google.com/maps/documentation/geocoding/get-api-key 14 | 2. `psql nyc-motor-vehicle-collisions -f populate_geocodings.sql` 15 | 3. `bundle install` 16 | 4. `ruby geocode.rb --google-api-key YOUR_API_KEY_HERE` 17 | 5. `psql nyc-motor-vehicle-collisions -f augment_collisions_with_geocoding.sql` 18 | 19 | Note that as of January 2019, Google Maps Geocoding API allows up to 40,000 requests per month for free in the United States, then charges $5 per 1,000 requests after that. See https://developers.google.com/maps/documentation/geocoding/usage-and-billing for the latest pricing. When I ran `geocode.rb` there fewer than 40,000 records to geocode, so I was able to do it for free. 20 | 21 | ## Augmentation results 22 | 23 | Coordinates sources with raw data through 12/31/2018: 24 | 25 | ```sql 26 | SELECT 27 | extract(year FROM collision_time) AS year, 28 | COUNT(*) AS n, 29 | ROUND(SUM(CASE WHEN coordinates_source = 'raw_data' THEN 1 END)::numeric / COUNT(*), 2) AS raw_data, 30 | ROUND(SUM(CASE WHEN coordinates_source = 'most_common_in_raw_data' THEN 1 END)::numeric / COUNT(*), 2) AS most_common_in_raw_data, 31 | ROUND(SUM(CASE WHEN coordinates_source = 'geocoding' THEN 1 END)::numeric / COUNT(*), 2) AS geocoding, 32 | ROUND(SUM(CASE WHEN coordinates_source IS NULL THEN 1 END)::numeric / COUNT(*), 2) AS unknown 33 | FROM collisions 34 | GROUP BY year 35 | ORDER BY year; 36 | ``` 37 | 38 | ```sql 39 | year | n | raw_data | most_common_in_raw_data | geocoding | unknown 40 | ------+--------+----------+-------------------------+-----------+--------- 41 | 2012 | 100541 | 0.85 | 0.06 | 0.03 | 0.06 42 | 2013 | 203727 | 0.84 | 0.07 | 0.03 | 0.06 43 | 2014 | 206028 | 0.84 | 0.07 | 0.03 | 0.06 44 | 2015 | 217693 | 0.84 | 0.07 | 0.03 | 0.06 45 | 2016 | 229780 | 0.84 | 0.08 | 0.03 | 0.05 46 | 2017 | 230991 | 0.94 | 0.01 | 0.03 | 0.02 47 | 2018 | 231016 | 0.93 | 0.01 | 0.03 | 0.03 48 | ``` 49 | -------------------------------------------------------------------------------- /augment_data/augment_collisions_with_geocoding.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE tmp_points AS 2 | SELECT 3 | id, 4 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS location 5 | FROM geocodings 6 | WHERE (taxi_zone_gid IS NULL OR nyct2010_gid IS NULL) 7 | AND latitude IS NOT NULL 8 | AND longitude IS NOT NULL; 9 | 10 | CREATE INDEX ON tmp_points USING gist (location); 11 | 12 | CREATE TABLE tmp_zones AS 13 | SELECT t.id, z.gid, z.borough 14 | FROM tmp_points t, taxi_zones z 15 | WHERE ST_Within(t.location, z.geom); 16 | 17 | CREATE UNIQUE INDEX ON tmp_zones (id); 18 | 19 | CREATE TABLE tmp_tracts AS 20 | SELECT t.id, n.gid, n.boroname 21 | FROM tmp_points t, nyct2010 n 22 | WHERE ST_Within(t.location, n.geom); 23 | 24 | CREATE UNIQUE INDEX ON tmp_tracts (id); 25 | 26 | UPDATE geocodings 27 | SET taxi_zone_gid = tmp_zones.gid, 28 | borough = tmp_zones.borough 29 | FROM tmp_zones 30 | WHERE geocodings.id = tmp_zones.id; 31 | 32 | UPDATE geocodings 33 | SET nyct2010_gid = tmp_tracts.gid, 34 | borough = tmp_tracts.boroname 35 | FROM tmp_tracts 36 | WHERE geocodings.id = tmp_tracts.id; 37 | 38 | WITH coords AS ( 39 | SELECT 40 | c.unique_key, 41 | g.latitude, 42 | g.longitude, 43 | g.taxi_zone_gid, 44 | g.borough, 45 | g.nyct2010_gid 46 | FROM collisions c 47 | INNER JOIN geocodings g 48 | ON coalesce(c.on_street_name, '') = coalesce(g.on_street_name, '') 49 | AND coalesce(c.cross_street_name, '') = coalesce(g.cross_street_name, '') 50 | AND coalesce(c.off_street_name, '') = coalesce(g.off_street_name, '') 51 | AND coalesce(c.reported_borough, '') = coalesce(g.reported_borough, '') 52 | WHERE c.latitude IS NULL 53 | AND c.longitude IS NULL 54 | AND g.latitude IS NOT NULL 55 | AND g.longitude IS NOT NULL 56 | ) 57 | UPDATE collisions 58 | SET latitude = coords.latitude, 59 | longitude = coords.longitude, 60 | taxi_zone_gid = coords.taxi_zone_gid, 61 | borough = coords.borough, 62 | nyct2010_gid = coords.nyct2010_gid, 63 | coordinates_source = 'geocoding' 64 | FROM coords 65 | WHERE collisions.unique_key = coords.unique_key; 66 | 67 | DROP TABLE tmp_points; 68 | DROP TABLE tmp_zones; 69 | DROP TABLE tmp_tracts; 70 | -------------------------------------------------------------------------------- /augment_data/augment_collisions_with_most_common_coordinates.sql: -------------------------------------------------------------------------------- 1 | CREATE TEMP TABLE most_common_coords AS 2 | WITH candidates AS ( 3 | SELECT 4 | on_street_name, 5 | cross_street_name, 6 | borough, 7 | taxi_zone_gid, 8 | nyct2010_gid, 9 | round(latitude, 4) AS lat, 10 | round(longitude, 4) AS lng, 11 | count(*) AS n 12 | FROM collisions 13 | WHERE coordinates_source = 'raw_data' 14 | AND latitude IS NOT NULL 15 | AND longitude IS NOT NULL 16 | AND on_street_name IS NOT NULL 17 | AND ( 18 | cross_street_name IS NOT NULL 19 | OR on_street_name LIKE '% bridge' 20 | ) 21 | GROUP BY on_street_name, cross_street_name, borough, taxi_zone_gid, nyct2010_gid, lat, lng 22 | ) 23 | SELECT DISTINCT ON (on_street_name, cross_street_name) * 24 | FROM candidates 25 | WHERE n >= 2 26 | ORDER BY on_street_name, cross_street_name, n DESC; 27 | 28 | WITH most_common AS ( 29 | SELECT 30 | c.unique_key, 31 | mcc.lat, 32 | mcc.lng, 33 | mcc.taxi_zone_gid, 34 | mcc.borough, 35 | mcc.nyct2010_gid 36 | FROM collisions c 37 | INNER JOIN most_common_coords mcc 38 | ON c.on_street_name IS NOT NULL 39 | AND c.on_street_name = mcc.on_street_name 40 | AND coalesce(c.cross_street_name, '') = coalesce(mcc.cross_street_name, '') 41 | WHERE c.latitude IS NULL 42 | AND c.longitude IS NULL 43 | ) 44 | UPDATE collisions 45 | SET latitude = most_common.lat, 46 | longitude = most_common.lng, 47 | taxi_zone_gid = most_common.taxi_zone_gid, 48 | borough = most_common.borough, 49 | nyct2010_gid = most_common.nyct2010_gid, 50 | coordinates_source = 'most_common_in_raw_data' 51 | FROM most_common 52 | WHERE collisions.unique_key = most_common.unique_key; 53 | 54 | DROP TABLE most_common_coords; 55 | -------------------------------------------------------------------------------- /augment_data/geocode.rb: -------------------------------------------------------------------------------- 1 | require 'bundler/setup' 2 | Bundler.require 3 | 4 | ActiveRecord::Base.establish_connection( 5 | adapter: 'postgresql', 6 | host: 'localhost', 7 | database: 'nyc-motor-vehicle-collisions' 8 | ) 9 | 10 | class Geocoding < ActiveRecord::Base 11 | API_BASE_URL = 'https://maps.googleapis.com/maps/api/geocode/json' 12 | API_KEY = Hash[*ARGV]['--google-api-key'] 13 | NYC_BOUNDS = '40.4,-74.3|41.0,-73.65' 14 | 15 | def geocode! 16 | raise 'no api key' if API_KEY.blank? 17 | 18 | begin 19 | request = RestClient.get(API_BASE_URL, params: { 20 | address: address_for_geocode, 21 | bounds: NYC_BOUNDS, 22 | key: API_KEY 23 | }) 24 | rescue RestClient::BadRequest 25 | puts "Bad request for #{id}" 26 | return 27 | end 28 | 29 | json = JSON.parse(request.body) 30 | result = json.dig('results', 0) 31 | 32 | if result && (result['types'] & acceptable_google_types).present? 33 | self.latitude = result.dig('geometry', 'location', 'lat') 34 | self.longitude = result.dig('geometry', 'location', 'lng') 35 | end 36 | 37 | self.full_response = json 38 | 39 | save! 40 | end 41 | end 42 | 43 | class IntersectionGeocoding < Geocoding 44 | scope :for_geocoding, -> { where(full_response: nil) } 45 | 46 | def address_for_geocode 47 | raise "#{id} missing on_street_name" if on_street_name.blank? 48 | raise "#{id} missing cross_street_name" if cross_street_name.blank? 49 | 50 | [ 51 | "#{on_street_name} and #{cross_street_name}", 52 | reported_borough, 53 | 'nyc' 54 | ].compact.join(', ').downcase 55 | end 56 | 57 | def acceptable_google_types 58 | %w(intersection) 59 | end 60 | end 61 | 62 | class StreetAddressGeocoding < Geocoding 63 | scope :for_geocoding, -> { 64 | where(" 65 | full_response IS NULL 66 | AND off_street_name NOT LIKE '%parking lot%' 67 | AND off_street_name NOT LIKE '%p/l%' 68 | AND off_street_name NOT LIKE '%pl of%' 69 | AND off_street_name NOT LIKE '%muni lot%' 70 | AND off_street_name NOT LIKE '%driveway%' 71 | AND NOT ( 72 | off_street_name ~* '^\\d+ east drive' AND reported_borough IS NULL 73 | ) 74 | AND off_street_name ~* '\\d' 75 | ") 76 | } 77 | 78 | def address_for_geocode 79 | raise "#{id} missing off_street_name" if off_street_name.blank? 80 | 81 | [ 82 | off_street_name.squish, 83 | reported_borough, 84 | 'nyc' 85 | ].compact.join(', ').downcase 86 | end 87 | 88 | def acceptable_google_types 89 | %w(street_address premise) 90 | end 91 | end 92 | 93 | def run 94 | if Geocoding::API_KEY.blank? 95 | puts [ 96 | 'You have to specify a Google Maps Geocoding API key', 97 | 'Usage:', 98 | ' ruby geocode.rb --google-api-key YOUR_API_KEY_HERE' 99 | ].join("\n\n") 100 | 101 | return 102 | end 103 | 104 | [IntersectionGeocoding, StreetAddressGeocoding].each do |klass| 105 | scope = klass.for_geocoding 106 | puts "#{Time.now}: going to geocode #{scope.count} #{klass.name} addresses" 107 | 108 | scope.find_each.with_index do |g, i| 109 | puts "#{Time.now}: done #{i} addresses" if i > 0 && i % 50 == 0 110 | g.geocode! 111 | end 112 | 113 | puts "#{Time.now}: finished #{klass.name}" 114 | end 115 | end 116 | 117 | run 118 | -------------------------------------------------------------------------------- /augment_data/populate_geocodings.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO geocodings (type, on_street_name, cross_street_name, reported_borough, n) 2 | SELECT 3 | 'IntersectionGeocoding'::text AS type, 4 | on_street_name, 5 | cross_street_name, 6 | reported_borough, 7 | COUNT(*) AS n 8 | FROM collisions 9 | WHERE latitude IS NULL 10 | AND longitude IS NULL 11 | AND on_street_name IS NOT NULL 12 | AND cross_street_name IS NOT NULL 13 | GROUP BY on_street_name, cross_street_name, reported_borough 14 | ORDER BY COUNT(*) DESC, on_street_name, cross_street_name, reported_borough; 15 | 16 | INSERT INTO geocodings (type, off_street_name, reported_borough, n) 17 | SELECT 18 | 'StreetAddressGeocoding'::text AS type, 19 | off_street_name, 20 | reported_borough, 21 | COUNT(*) AS n 22 | FROM collisions 23 | WHERE latitude IS NULL 24 | AND longitude IS NULL 25 | AND off_street_name IS NOT NULL 26 | GROUP BY off_street_name, reported_borough 27 | ORDER BY COUNT(*) DESC, off_street_name, reported_borough; 28 | -------------------------------------------------------------------------------- /download_raw_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "`date`: downloading raw data" 4 | mkdir -p raw_data 5 | wget -c -O raw_data/collisions.csv https://data.cityofnewyork.us/api/views/h9gi-nx95/rows.csv?accessType=DOWNLOAD 6 | echo "`date`: done downloading raw data" 7 | -------------------------------------------------------------------------------- /import_data.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "`date`: importing raw data" 4 | psql nyc-motor-vehicle-collisions -f setup_files/import_data.sql 5 | echo "`date`: done importing raw data; populating vehicles and contributing factors" 6 | psql nyc-motor-vehicle-collisions -f setup_files/populate_vehicles_and_factors.sql 7 | echo "`date`: done populating; creating indexes" 8 | psql nyc-motor-vehicle-collisions -f setup_files/create_indexes.sql 9 | echo "`date`: done" 10 | -------------------------------------------------------------------------------- /initialize_database.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | createdb nyc-motor-vehicle-collisions 4 | 5 | psql nyc-motor-vehicle-collisions -f setup_files/create_schema.sql 6 | 7 | shp2pgsql -I -s 2263:4326 shapefiles/taxi_zones/taxi_zones.shp | psql -d nyc-motor-vehicle-collisions 8 | psql nyc-motor-vehicle-collisions -c "CREATE INDEX ON taxi_zones (locationid);" 9 | psql nyc-motor-vehicle-collisions -c "VACUUM ANALYZE taxi_zones;" 10 | 11 | shp2pgsql -I -s 2263:4326 shapefiles/nyct2010wi_18d/nyct2010wi.shp nyct2010 | psql -d nyc-motor-vehicle-collisions 12 | psql nyc-motor-vehicle-collisions -c "VACUUM ANALYZE nyct2010;" 13 | -------------------------------------------------------------------------------- /setup_files/create_indexes.sql: -------------------------------------------------------------------------------- 1 | CREATE INDEX ON collisions (taxi_zone_gid); 2 | CREATE INDEX ON collisions (borough); 3 | CREATE INDEX ON collisions USING brin (collision_time) WITH (pages_per_range = 32); 4 | -------------------------------------------------------------------------------- /setup_files/create_schema.sql: -------------------------------------------------------------------------------- 1 | CREATE EXTENSION postgis; 2 | 3 | CREATE TABLE collisions_raw ( 4 | date date, 5 | time time, 6 | borough text, 7 | zip_code text, 8 | latitude numeric, 9 | longitude numeric, 10 | location text, 11 | on_street_name text, 12 | cross_street_name text, 13 | off_street_name text, 14 | number_of_persons_injured integer, 15 | number_of_persons_killed integer, 16 | number_of_pedestrians_injured integer, 17 | number_of_pedestrians_killed integer, 18 | number_of_cyclists_injured integer, 19 | number_of_cyclists_killed integer, 20 | number_of_motorists_injured integer, 21 | number_of_motorists_killed integer, 22 | contributing_factor_vehicle_1 text, 23 | contributing_factor_vehicle_2 text, 24 | contributing_factor_vehicle_3 text, 25 | contributing_factor_vehicle_4 text, 26 | contributing_factor_vehicle_5 text, 27 | unique_key integer primary key, 28 | vehicle_type_code_1 text, 29 | vehicle_type_code_2 text, 30 | vehicle_type_code_3 text, 31 | vehicle_type_code_4 text, 32 | vehicle_type_code_5 text 33 | ); 34 | 35 | CREATE TABLE collisions ( 36 | unique_key integer primary key, 37 | collision_time timestamp without time zone, 38 | taxi_zone_gid integer, 39 | nyct2010_gid integer, 40 | borough text, 41 | reported_borough text, 42 | zip_code text, 43 | latitude numeric, 44 | longitude numeric, 45 | on_street_name text, 46 | cross_street_name text, 47 | off_street_name text, 48 | number_of_persons_injured integer, 49 | number_of_persons_killed integer, 50 | number_of_pedestrians_injured integer, 51 | number_of_pedestrians_killed integer, 52 | number_of_cyclists_injured integer, 53 | number_of_cyclists_killed integer, 54 | number_of_motorists_injured integer, 55 | number_of_motorists_killed integer, 56 | contributing_factor_vehicle_1 text, 57 | contributing_factor_vehicle_2 text, 58 | contributing_factor_vehicle_3 text, 59 | contributing_factor_vehicle_4 text, 60 | contributing_factor_vehicle_5 text, 61 | vehicle_type_code_1 text, 62 | vehicle_type_code_2 text, 63 | vehicle_type_code_3 text, 64 | vehicle_type_code_4 text, 65 | vehicle_type_code_5 text, 66 | coordinates_source text 67 | ); 68 | 69 | CREATE TABLE collisions_vehicles ( 70 | collision_unique_key integer not null, 71 | vehicle_number integer not null, 72 | vehicle_type text, 73 | primary key (collision_unique_key, vehicle_number) 74 | ); 75 | 76 | CREATE TABLE collisions_contributing_factors ( 77 | collision_unique_key integer not null, 78 | contributing_number integer not null, 79 | contributing_factor text, 80 | primary key (collision_unique_key, contributing_number) 81 | ); 82 | 83 | CREATE TABLE geocodings ( 84 | id serial primary key, 85 | type text not null, 86 | on_street_name text, 87 | cross_street_name text, 88 | off_street_name text, 89 | reported_borough text, 90 | n integer, 91 | latitude numeric, 92 | longitude numeric, 93 | full_response jsonb, 94 | taxi_zone_gid integer, 95 | nyct2010_gid integer, 96 | borough text 97 | ); 98 | 99 | CREATE UNIQUE INDEX ON geocodings (on_street_name, cross_street_name, reported_borough) WHERE off_street_name IS NULL; 100 | CREATE UNIQUE INDEX ON geocodings (on_street_name, cross_street_name) WHERE off_street_name IS NULL AND reported_borough IS NULL; 101 | CREATE UNIQUE INDEX ON geocodings (off_street_name, reported_borough) WHERE on_street_name IS NULL; 102 | CREATE UNIQUE INDEX ON geocodings (off_street_name) WHERE on_street_name IS NULL AND reported_borough IS NULL; 103 | -------------------------------------------------------------------------------- /setup_files/import_data.sql: -------------------------------------------------------------------------------- 1 | SET datestyle = 'ISO, MDY'; 2 | 3 | \copy collisions_raw FROM 'raw_data/collisions.csv' CSV HEADER; 4 | 5 | CREATE TABLE tmp_points AS 6 | SELECT 7 | unique_key, 8 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS location 9 | FROM collisions_raw 10 | WHERE longitude IS NOT NULL AND latitude IS NOT NULL; 11 | 12 | CREATE INDEX ON tmp_points USING gist (location); 13 | 14 | CREATE TABLE tmp_zones AS 15 | SELECT t.unique_key, z.gid, z.borough 16 | FROM tmp_points t, taxi_zones z 17 | WHERE ST_Within(t.location, z.geom); 18 | 19 | CREATE UNIQUE INDEX ON tmp_zones (unique_key); 20 | 21 | CREATE TABLE tmp_tracts AS 22 | SELECT t.unique_key, n.gid, n.boroname 23 | FROM tmp_points t, nyct2010 n 24 | WHERE ST_Within(t.location, n.geom); 25 | 26 | CREATE UNIQUE INDEX ON tmp_tracts (unique_key); 27 | 28 | DELETE FROM collisions 29 | WHERE unique_key IN (SELECT unique_key FROM collisions_raw); 30 | 31 | INSERT INTO collisions ( 32 | unique_key, collision_time, taxi_zone_gid, nyct2010_gid, borough, 33 | reported_borough, zip_code, latitude, longitude, on_street_name, 34 | cross_street_name, off_street_name, number_of_persons_injured, 35 | number_of_persons_killed, number_of_pedestrians_injured, 36 | number_of_pedestrians_killed, number_of_cyclists_injured, 37 | number_of_cyclists_killed, number_of_motorists_injured, 38 | number_of_motorists_killed, contributing_factor_vehicle_1, 39 | contributing_factor_vehicle_2, contributing_factor_vehicle_3, 40 | contributing_factor_vehicle_4, contributing_factor_vehicle_5, 41 | vehicle_type_code_1, vehicle_type_code_2, vehicle_type_code_3, 42 | vehicle_type_code_4, vehicle_type_code_5, coordinates_source 43 | ) 44 | SELECT 45 | r.unique_key, 46 | (r.date || ' ' || r.time)::timestamp without time zone AS collision_time, 47 | z.gid AS taxi_zone_gid, 48 | t.gid AS nyct2010_gid, 49 | -- prefer borough from nyct2010 over taxi_zones because nyct2010 50 | -- includes water areas and therefore captures bridge accidents 51 | coalesce(t.boroname, z.borough) AS borough, 52 | nullif(lower(trim(r.borough)), '') AS reported_borough, 53 | nullif(trim(r.zip_code), '') AS zip_code, 54 | nullif(r.latitude, 0) AS latitude, 55 | nullif(r.longitude, 0) AS longitude, 56 | nullif(lower(trim(regexp_replace(r.on_street_name, '\s+', ' ', 'g'))), '') AS on_street_name, 57 | nullif(lower(trim(regexp_replace(r.cross_street_name, '\s+', ' ', 'g'))), '') AS cross_street_name, 58 | nullif(lower(trim(regexp_replace(r.off_street_name, '\s+', ' ', 'g'))), '') AS off_street_name, 59 | r.number_of_persons_injured, 60 | r.number_of_persons_killed, 61 | r.number_of_pedestrians_injured, 62 | r.number_of_pedestrians_killed, 63 | r.number_of_cyclists_injured, 64 | r.number_of_cyclists_killed, 65 | r.number_of_motorists_injured, 66 | r.number_of_motorists_killed, 67 | nullif(lower(trim(r.contributing_factor_vehicle_1)), '') AS contributing_factor_vehicle_1, 68 | nullif(lower(trim(r.contributing_factor_vehicle_2)), '') AS contributing_factor_vehicle_2, 69 | nullif(lower(trim(r.contributing_factor_vehicle_3)), '') AS contributing_factor_vehicle_3, 70 | nullif(lower(trim(r.contributing_factor_vehicle_4)), '') AS contributing_factor_vehicle_4, 71 | nullif(lower(trim(r.contributing_factor_vehicle_5)), '') AS contributing_factor_vehicle_5, 72 | nullif(lower(trim(r.vehicle_type_code_1)), '') AS vehicle_type_code_1, 73 | nullif(lower(trim(r.vehicle_type_code_2)), '') AS vehicle_type_code_2, 74 | nullif(lower(trim(r.vehicle_type_code_3)), '') AS vehicle_type_code_3, 75 | nullif(lower(trim(r.vehicle_type_code_4)), '') AS vehicle_type_code_4, 76 | nullif(lower(trim(r.vehicle_type_code_5)), '') AS vehicle_type_code_5, 77 | CASE 78 | WHEN nullif(r.latitude, 0) IS NOT NULL AND nullif(r.longitude, 0) IS NOT NULL 79 | THEN 'raw_data' 80 | END AS coordinates_source 81 | FROM collisions_raw r 82 | LEFT JOIN tmp_zones z ON r.unique_key = z.unique_key 83 | LEFT JOIN tmp_tracts t ON r.unique_key = t.unique_key 84 | ORDER BY r.date, r.time; 85 | 86 | TRUNCATE TABLE collisions_raw; 87 | DROP TABLE tmp_points; 88 | DROP TABLE tmp_zones; 89 | DROP TABLE tmp_tracts; 90 | -------------------------------------------------------------------------------- /setup_files/populate_vehicles_and_factors.sql: -------------------------------------------------------------------------------- 1 | INSERT INTO collisions_vehicles 2 | (collision_unique_key, vehicle_number, vehicle_type) 3 | WITH vehicles AS ( 4 | SELECT 5 | unique_key, 6 | ARRAY[1, 2, 3, 4, 5] AS vehicle_number_array, 7 | ARRAY[ 8 | vehicle_type_code_1, 9 | vehicle_type_code_2, 10 | vehicle_type_code_3, 11 | vehicle_type_code_4, 12 | vehicle_type_code_5 13 | ] AS vehicle_type_array 14 | FROM collisions 15 | ), 16 | unnested AS ( 17 | SELECT 18 | unique_key, 19 | unnest(vehicle_number_array) AS vehicle_number, 20 | unnest(vehicle_type_array) AS vehicle_type 21 | FROM vehicles 22 | ) 23 | SELECT unique_key, vehicle_number, vehicle_type 24 | FROM unnested 25 | WHERE vehicle_type IS NOT NULL 26 | ORDER BY unique_key, vehicle_number 27 | ON CONFLICT (collision_unique_key, vehicle_number) 28 | DO UPDATE SET vehicle_type = EXCLUDED.vehicle_type; 29 | 30 | INSERT INTO collisions_contributing_factors 31 | (collision_unique_key, contributing_number, contributing_factor) 32 | WITH vehicles AS ( 33 | SELECT 34 | unique_key, 35 | ARRAY[1, 2, 3, 4, 5] AS contributing_number_array, 36 | ARRAY[ 37 | contributing_factor_vehicle_1, 38 | contributing_factor_vehicle_2, 39 | contributing_factor_vehicle_3, 40 | contributing_factor_vehicle_4, 41 | contributing_factor_vehicle_5 42 | ] AS contributing_factor_array 43 | FROM collisions 44 | ), 45 | unnested AS ( 46 | SELECT 47 | unique_key, 48 | unnest(contributing_number_array) AS contributing_number, 49 | unnest(contributing_factor_array) AS contributing_factor 50 | FROM vehicles 51 | ) 52 | SELECT unique_key, contributing_number, contributing_factor 53 | FROM unnested 54 | WHERE contributing_factor IS NOT NULL 55 | ORDER BY unique_key, contributing_number 56 | ON CONFLICT (collision_unique_key, contributing_number) 57 | DO UPDATE SET contributing_factor = EXCLUDED.contributing_factor; 58 | -------------------------------------------------------------------------------- /shapefiles/nyct2010wi_18d/nyct2010wi.dbf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/nyct2010wi_18d/nyct2010wi.dbf -------------------------------------------------------------------------------- /shapefiles/nyct2010wi_18d/nyct2010wi.prj: -------------------------------------------------------------------------------- 1 | PROJCS["NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic"],PARAMETER["False_Easting",984250.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-74.0],PARAMETER["Standard_Parallel_1",40.66666666666666],PARAMETER["Standard_Parallel_2",41.03333333333333],PARAMETER["Latitude_Of_Origin",40.16666666666666],UNIT["Foot_US",0.3048006096012192]] -------------------------------------------------------------------------------- /shapefiles/nyct2010wi_18d/nyct2010wi.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/nyct2010wi_18d/nyct2010wi.shp -------------------------------------------------------------------------------- /shapefiles/nyct2010wi_18d/nyct2010wi.shp.xml: -------------------------------------------------------------------------------- 1 | 20181116085344001.0North American Profile of ISO19115 2003FALSEnyct2010wiwithheldLocal Area Network002912287.0687871067382.508423116411.371399273617.84320110.000ProjectedGCS_North_American_1983Linear Unit: Foot_US (0.304801)NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet<ProjectedCoordinateSystem xsi:type='typens:ProjectedCoordinateSystem' xmlns:xsi='http://www.w3.org/2001/XMLSchema-instance' xmlns:xs='http://www.w3.org/2001/XMLSchema' xmlns:typens='http://www.esri.com/schemas/ArcGIS/10.1'><WKT>PROJCS[&quot;NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet&quot;,GEOGCS[&quot;GCS_North_American_1983&quot;,DATUM[&quot;D_North_American_1983&quot;,SPHEROID[&quot;GRS_1980&quot;,6378137.0,298.257222101]],PRIMEM[&quot;Greenwich&quot;,0.0],UNIT[&quot;Degree&quot;,0.0174532925199433]],PROJECTION[&quot;Lambert_Conformal_Conic&quot;],PARAMETER[&quot;False_Easting&quot;,984250.0],PARAMETER[&quot;False_Northing&quot;,0.0],PARAMETER[&quot;Central_Meridian&quot;,-74.0],PARAMETER[&quot;Standard_Parallel_1&quot;,40.66666666666666],PARAMETER[&quot;Standard_Parallel_2&quot;,41.03333333333333],PARAMETER[&quot;Latitude_Of_Origin&quot;,40.16666666666666],UNIT[&quot;Foot_US&quot;,0.3048006096012192],AUTHORITY[&quot;EPSG&quot;,2263]]</WKT><XOrigin>-120039300</XOrigin><YOrigin>-96540300</YOrigin><XYScale>37212589.015695661</XYScale><ZOrigin>-100000</ZOrigin><ZScale>10000</ZScale><MOrigin>-100000</MOrigin><MScale>10000</MScale><XYTolerance>0.0032808333333333331</XYTolerance><ZTolerance>0.001</ZTolerance><MTolerance>0.001</MTolerance><HighPrecision>true</HighPrecision><WKID>102718</WKID><LatestWKID>2263</LatestWKID></ProjectedCoordinateSystem>Select c:\temp\BYTES_GP\Districts.gdb\nyct2010wi C:\temp\18D\GIS_OUTPUT\Districts\shp\nyct2010wi__Iteration.shp "ObjectID>=0 AND ObjectID<3000"Select C:\temp\18D\GIS_OUTPUT\Districts\shp\nyct2010wi__Iteration.shp C:\temp\18D\GIS_OUTPUT\Districts\shp\nyct2010wi.shp #201811160905390020181116090539001500000005000New York City Department of City Planning120 Broadway, 31st FloorNew YorkNew York10271USNew York City Department of City Planning120 Broadway, 31st FloorNew YorkNew York10271USAvailable at the following website: https://www1.nyc.gov/site/planning/data-maps/open-data.pageFreehttps://www1.nyc.gov/site/planning/data-maps/open-data.page0.000ESRI ShapefileOpen SpecificationNew York City Census Tracts for 2010 US Census Water Included18DNew York City Department of City PlanningNew York City Department of City PlanningBYTES of the BIG APPLE18D10/23/2018 8:49:29 AM11/19/2018 8:49:49 AM<DIV STYLE="text-align:Left;"><DIV><P><SPAN>The Census Tracts for the 2010 US Census. These boundary files are derived from the US Census Bureau's TIGER project and have been geographically modified to fit the New York City base map.</SPAN></P></DIV></DIV>These districts were created by the Department of City Planning to aid city agencies in administering public services.New York City Department of City Planning120 Broadway, 31st FloorNew YorkNY10271USNew YorkManhattanQueensBrooklynRichmondBronxKingsStaten IslandNew York CityboundariesboundaryboroughNew YorkManhattanQueensBrooklynRichmondboundariesBronxKingsboundaryboroughStaten IslandNew York CityThe data is freely available to all New York City agencies and the public.<DIV STYLE="text-align:Left;"><P><SPAN>This dataset is being provided by the Department of City Planning (DCP) on DCP’s website for informational purposes only. DCP does not warranty the completeness, accuracy, content, or fitness for any particular purpose or use of the dataset, nor are any such warranties to be implied or inferred with respect to the dataset as furnished on the website. DCP and the City are not liable for any deficiencies in the completeness, accuracy, content, or fitness for any particular purpose or use the dataset, or applications utilizing the dataset, provided by any third party.</SPAN></P></DIV>ground conditionDepartment of City Planning.Microsoft Windows 7 Version 6.1 (Build 7601) Service Pack 1; Esri ArcGIS 10.2.0.33481-74.260380-73.69920640.91769140.485808These data are accurate as of US Census 2010.The District files are created from the same release version of the Department of City Planning LION file. The LION file is spatially aligned with NYCMap aerial photography.nyct2010wiFeature Class0FIDFIDOID400Internal feature number.EsriSequential unique whole numbers that are automatically generated.CTLabelCTLabelString7The census tract identifier for the polygon. Each census tract number is unique to its borough.00BoroCodeBoroCodeString1Borough of census tract.1Manhattan2Bronx3Brooklyn4Queens5Staten Island00BoroNameBoroNameString32Borough Name.00CDEligibilCDEligibilString1Community Development Block Grant Eligibility.00CT2010CT2010String6String value of the census tract number.00BoroCT2010BoroCT2010String7Merged string of borough code and census tract number.00PUMAPUMAString4Public Use Microdata Areas00ShapeShapeGeometry000Feature geometry.ESRICoordinates defining the features.SHAPE_AreaShape_AreaDouble19Area of feature in internal units squared.ESRIPositive real numbers that are automatically generated.00NTACodeNTACodeString400Neighborhood Tabulation Area CodeShape_LengShape_LengDouble1900NTANameNTANameString7500Neighborhood Tabulation Area NamedatasetEPSG8.1.10SimpleFALSE0FALSEFALSE20181116 2 | -------------------------------------------------------------------------------- /shapefiles/nyct2010wi_18d/nyct2010wi.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/nyct2010wi_18d/nyct2010wi.shx -------------------------------------------------------------------------------- /shapefiles/taxi_zones/taxi_zones.dbf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.dbf -------------------------------------------------------------------------------- /shapefiles/taxi_zones/taxi_zones.prj: -------------------------------------------------------------------------------- 1 | PROJCS["NAD_1983_StatePlane_New_York_Long_Island_FIPS_3104_Feet",GEOGCS["GCS_North_American_1983",DATUM["D_North_American_1983",SPHEROID["GRS_1980",6378137.0,298.257222101]],PRIMEM["Greenwich",0.0],UNIT["Degree",0.0174532925199433]],PROJECTION["Lambert_Conformal_Conic"],PARAMETER["False_Easting",984250.0],PARAMETER["False_Northing",0.0],PARAMETER["Central_Meridian",-74.0],PARAMETER["Standard_Parallel_1",40.66666666666666],PARAMETER["Standard_Parallel_2",41.03333333333333],PARAMETER["Latitude_Of_Origin",40.16666666666666],UNIT["Foot_US",0.3048006096012192]] -------------------------------------------------------------------------------- /shapefiles/taxi_zones/taxi_zones.sbn: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.sbn -------------------------------------------------------------------------------- /shapefiles/taxi_zones/taxi_zones.sbx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.sbx -------------------------------------------------------------------------------- /shapefiles/taxi_zones/taxi_zones.shp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.shp -------------------------------------------------------------------------------- /shapefiles/taxi_zones/taxi_zones.shp.xml: -------------------------------------------------------------------------------- 1 | 2 | 20150921140426001.0TRUECalculateField taxi_zones zone [Taxi_zone] VB #CalculateField taxi_zones LocationID [taxi_zone_] VB #CalculateField taxi_zones borough [FIRST_Boro] VB # 3 | -------------------------------------------------------------------------------- /shapefiles/taxi_zones/taxi_zones.shx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/toddwschneider/nyc-motor-vehicle-collisions/42ccff38b4d9cb5ae2b26e988bf36a18c23ed96d/shapefiles/taxi_zones/taxi_zones.shx --------------------------------------------------------------------------------