├── .gitignore ├── .gitmodules ├── CODE ├── 00_Startup.R ├── 11_business_download.R ├── 12_crime_download.R ├── 13_food_inspection_download.R ├── 14_garbage_download.R ├── 15_sanitation_download.R ├── 21_calculate_violation_matrix.R ├── 22_calculate_heat_map_values.R ├── 23_food_insp_features.R ├── 24_bus_features.R ├── 30_xgboost_model.R ├── 30a_glmnet_model.R ├── 30b_glmnet_model_evaluation.R ├── 31_xgboost_model_evaluation.R ├── 31a_random_forest_model.R ├── 31b_random_forest_evaluation.R ├── README.md ├── functions │ ├── GenerateOtherLicenseInfo.R │ ├── calculate_confusion_values.R │ ├── calculate_heat_values.R │ ├── calculate_violation_matrix.R │ ├── calculate_violation_types.R │ ├── categorize.R │ ├── filter_business.R │ ├── filter_crime.R │ ├── filter_foodInspect.R │ ├── filter_garbageCarts.R │ ├── filter_sanitationComplaints.R │ ├── find_bus_id_matches.R │ ├── gini.R │ ├── kde.R │ ├── logLik.R │ ├── nokey.R │ ├── simulated_bin_summary.R │ ├── simulated_date_diff_mean.R │ └── weather_3day_calc.R ├── not used │ ├── addTextMining.R │ ├── development_of_heat_merge.R │ ├── heat.R │ ├── kde_comparison │ │ ├── allstate_kde_new.R │ │ └── mass_kde_full.R │ ├── liveReadInAbandonedBuildings.R │ ├── liveReadInAbandonedVehicles.R │ ├── liveReadInBldgPermits.R │ ├── liveReadInBldgViolations.R │ ├── liveReadInGraffiti.R │ ├── liveReadInLightsOut.R │ ├── liveReadInPotHoles.R │ ├── liveReadInRodents.R │ └── liveReadInTreeTrims.R └── prep_inspectors_for_eval.R ├── CONTRIBUTING.md ├── DATA ├── 11_bus_license.Rds ├── 12_crime.Rds ├── 13_food_inspections.Rds ├── 14_garbage_carts.Rds ├── 15_sanitation_code.Rds ├── 17_mongo_weather_update.Rds ├── 19_inspector_assignments.Rds ├── 21_food_inspection_violation_matrix.Rds ├── 21_food_inspection_violation_matrix_nums.Rds ├── 22_burglary_heat.Rds ├── 22_garbageCarts_heat.Rds ├── 22_sanitationComplaints_heat.Rds ├── 23_food_insp_features.Rds ├── 24_bus_features.Rds ├── 30_xgboost_data.Rds ├── 30_xgboost_model.Rds ├── Inspection_Report_Form.pdf └── inspectors.Rds ├── LICENSE.md ├── README.md ├── REPORTS ├── .gitignore ├── CountComparison_aftersecondrefactor.Rmd ├── CountComparison_aftersecondrefactor.html ├── Metric_Development.Rmd ├── Metric_Development.html ├── Metric_Development.xlsx ├── Metric_Development_bargraphs.xlsx ├── Metric_Development_day_diff.xlsx ├── ModelSummary_20141204.Rmd ├── ModelSummary_20141204.html ├── forecasting-restaurants-with-critical-violations-in-Chicago.Rmd ├── forecasting-restaurants-with-critical-violations-in-Chicago.bib ├── forecasting-restaurants-with-critical-violations-in-Chicago.docx ├── forecasting-restaurants-with-critical-violations-in-Chicago.html ├── forecasting-restaurants-with-critical-violations-in-Chicago.pdf └── references.bib └── food-inspections-evaluation.Rproj /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | .Rproj.user 3 | .Rproj 4 | CODE/socrata_token.txt 5 | .Rhistory 6 | 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/.gitmodules -------------------------------------------------------------------------------- /CODE/00_Startup.R: -------------------------------------------------------------------------------- 1 | ##------------------------------------------------------------------------------ 2 | ## INSTALL 
DEPENDENCIES IF MISSING 3 | ##------------------------------------------------------------------------------ 4 | 5 | if(!"devtools" %in% rownames(installed.packages())){ 6 | install.packages("devtools", 7 | dependencies = TRUE, 8 | repos = "https://cloud.r-project.org/") 9 | } 10 | 11 | if(!"Rcpp" %in% rownames(installed.packages())){ 12 | install.packages("Rcpp", 13 | dependencies = TRUE, 14 | repos = "https://cloud.r-project.org/") 15 | } 16 | 17 | if(!"RSocrata" %in% rownames(installed.packages())){ 18 | install.packages("RSocrata", 19 | dependencies = TRUE, 20 | repos = "https://cloud.r-project.org/") 21 | } 22 | 23 | if(!"data.table" %in% rownames(installed.packages())){ 24 | install.packages("data.table", 25 | dependencies = TRUE, 26 | repos = "https://cloud.r-project.org/") 27 | } 28 | 29 | if(!"geneorama" %in% rownames(installed.packages())){ 30 | devtools::install_github('geneorama/geneorama') 31 | } 32 | 33 | if(!"printr" %in% rownames(installed.packages())){ 34 | devtools::install_github(repo = 'yihui/printr') 35 | } 36 | 37 | ##------------------------------------------------------------------------------ 38 | ## UPDATE DEPENDENCIES IF OUTDATED 39 | ##------------------------------------------------------------------------------ 40 | 41 | ## Update to RSocrata 1.7.2-2 (or later); packageVersion() compares version numbers numerically, whereas a plain string comparison would mis-order versions such as "1.9.8" vs "1.10.0" 42 | if(packageVersion("RSocrata") < "1.7.2-2"){ 43 | install.packages("RSocrata", 44 | repos = "https://cloud.r-project.org/") 45 | } 46 | 47 | ## Needs recent version for foverlaps 48 | if(packageVersion("data.table") < "1.10.0"){ 49 | install.packages("data.table", 50 | repos = "https://cloud.r-project.org/") 51 | } 52 | 53 | if(packageVersion("geneorama") < "1.5.0"){ 54 | devtools::install_github('geneorama/geneorama') 55 | } 56 | -------------------------------------------------------------------------------- /CODE/11_business_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/r5kz-chrr.csv" 18 | 19 | ## READ DATA 20 | business <- read.socrata(url, stringsAsFactors = FALSE) 21 | # str(business) 22 | 23 | ## CONVERT TO DATA TABLE 24 | business <- as.data.table(business) 25 | 26 | ## Replace .'s in column names 27 | setnames(business, gsub("\\.","_",colnames(business))) 28 | 29 | ## MODIFY DATA 30 | geneorama::convert_datatable_IntNum(business) 31 | geneorama::convert_datatable_DateIDate(business) 32 | 33 | ## FIX TWO DATE COLUMNS THAT MAY NOT DOWNLOAD PROPERLY 34 | business[ , LICENSE_TERM_START_DATE := as.IDate(LICENSE_TERM_START_DATE, "%m/%d/%Y")] 35 | business[ , LICENSE_TERM_EXPIRATION_DATE := as.IDate(LICENSE_TERM_EXPIRATION_DATE, "%m/%d/%Y")] 36 | 37 | ## SAVE RESULT 38 | saveRDS(business, "DATA/11_bus_license.Rds") 39 | 
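## (Added sanity check; an illustrative sketch, not part of the original
## pipeline: confirm the saved Rds re-loads and that the two manually parsed
## date columns came through as IDate rather than character.)
# bus <- readRDS("DATA/11_bus_license.Rds")
# stopifnot(inherits(bus$LICENSE_TERM_START_DATE, "IDate"),
#           inherits(bus$LICENSE_TERM_EXPIRATION_DATE, "IDate"))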
-------------------------------------------------------------------------------- /CODE/12_crime_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL AND QUERY 17 | url <- "https://data.cityofchicago.org/resource/ijzp-q8t2.csv" 18 | # q <- "?primary_type=BURGLARY&$where=date>'2016-02-01'" # date filter examp 19 | q <- "?primary_type=BURGLARY" 20 | 21 | ## READ DATA (BURGLARY ONLY) 22 | crime <- read.socrata(paste0(url, q), stringsAsFactors = FALSE) 23 | # str(crime) 24 | 25 | ## CONVERT TO DATA TABLE 26 | crime <- as.data.table(crime) 27 | 28 | ## Replace .'s in column names 29 | setnames(crime, gsub("\\.","_",colnames(crime))) 30 | 31 | ## MODIFY DATA 32 | geneorama::convert_datatable_IntNum(crime) 33 | geneorama::convert_datatable_DateIDate(crime) 34 | crime[ , Arrest := as.logical(Arrest)] 35 | crime[ , Domestic := as.logical(Domestic)] 36 | 37 | ## SAVE RESULT 38 | saveRDS(crime , "DATA/12_crime.Rds") 39 | -------------------------------------------------------------------------------- /CODE/13_food_inspection_download.R: -------------------------------------------------------------------------------- 1 | ##========================================================================== 2 | ## INITIALIZE 3 | ##========================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/4ijn-s7e5.csv" 18 | 19 | ## READ DATA 20 | foodInspect <- read.socrata(url, stringsAsFactors = FALSE) 21 | str(foodInspect) 22 | 23 | ## CONVERT TO DATA TABLE 24 | foodInspect <- as.data.table(foodInspect) 25 | 26 | ## Replace .'s in column names, and extra underscores 27 | setnames(foodInspect, gsub("\\.","_",colnames(foodInspect))) 28 | setnames(foodInspect, gsub("_+$","",colnames(foodInspect))) 29 | 30 | ## MODIFY DATA 31 | geneorama::convert_datatable_IntNum(foodInspect) 32 | geneorama::convert_datatable_DateIDate(foodInspect) 33 | 34 | ## SAVE RESULT 35 | saveRDS(foodInspect , "DATA/13_food_inspections.Rds") 36 | -------------------------------------------------------------------------------- /CODE/14_garbage_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | 
##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/9ksk-na4q.csv" 18 | 19 | ## READ DATA 20 | garbageCarts <- read.socrata(url, stringsAsFactors = FALSE) 21 | # str(garbageCarts) 22 | 23 | ## CONVERT TO DATA TABLE 24 | garbageCarts <- as.data.table(garbageCarts) 25 | 26 | ## Replace .'s in column names 27 | setnames(garbageCarts, gsub("\\.","_",colnames(garbageCarts))) 28 | 29 | ## MODIFY DATA 30 | geneorama::convert_datatable_IntNum(garbageCarts) 31 | geneorama::convert_datatable_DateIDate(garbageCarts) 32 | 33 | ## SAVE RESULT 34 | saveRDS(garbageCarts , "DATA/14_garbage_carts.Rds") 35 | 36 | -------------------------------------------------------------------------------- /CODE/15_sanitation_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/me59-5fac.csv" 18 | 19 | ## READ DATA 20 | sanitationComplaints <- read.socrata(url, stringsAsFactors = FALSE) 21 | # str(sanitationComplaints) 22 | 23 | ## CONVERT TO DATA TABLE 24 | sanitationComplaints <- as.data.table(sanitationComplaints) 25 | 26 | ## Replace .'s in column names 27 | setnames(sanitationComplaints, gsub("\\.","_",colnames(sanitationComplaints))) 28 | 29 | ## Remove one row where the header is (somewhat) repeated 30 | sanitationComplaints <- sanitationComplaints[Service_Request_Number!="SERVICE REQUEST NUMBER"] 31 | 32 | ## MODIFY DATA 33 | geneorama::convert_datatable_IntNum(sanitationComplaints) 34 | geneorama::convert_datatable_DateIDate(sanitationComplaints) 35 | 36 | ## SAVE RESULT 37 | saveRDS(sanitationComplaints , "DATA/15_sanitation_code.Rds") 38 | 39 | -------------------------------------------------------------------------------- /CODE/21_calculate_violation_matrix.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Because this step takes so long, it's pre-calculated here. 
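## (Added context: each inspection's Violations field concatenates entries
## that are assumed to begin with a violation number, e.g. "3. ... | 16. ...".
## calculate_violation_matrix presumably expands those numbers into one column
## per violation; a hedged sketch of that parsing idiom, variable names made up:
# v     <- strsplit(foodInspect$Violations, "|", fixed = TRUE)
# vnums <- lapply(v, function(s) as.integer(sub("^\\s*(\\d+)\\..*$", "\\1", s)))
## The actual implementation is in CODE/functions/calculate_violation_matrix.R.)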
3 | ## 4 | 5 | ##============================================================================== 6 | ## INITIALIZE 7 | ##============================================================================== 8 | ## Remove all objects; perform garbage collection 9 | rm(list=ls()) 10 | gc(reset=TRUE) 11 | 12 | ## Load libraries & project functions 13 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 14 | geneorama::sourceDir("CODE/functions/") 15 | 16 | ##============================================================================== 17 | ## LOAD CACHED RDS FILES 18 | ##============================================================================== 19 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 20 | foodInspect <- filter_foodInspect(foodInspect) 21 | 22 | ##============================================================================== 23 | ## CALCULATE FEATURES BASED ON FOOD INSPECTION DATA 24 | ##============================================================================== 25 | 26 | ## Calculate violation matrix and put into data.table with inspection id as key 27 | vio_mat <- calculate_violation_matrix(foodInspect[ , Violations]) 28 | 29 | ## Add key column to vio_mat 30 | vio_mat <- data.table(vio_mat, 31 | Inspection_ID = foodInspect[ , Inspection_ID], 32 | key = "Inspection_ID") 33 | 34 | ## calculate_violation_types calculates violations by categories: 35 | ## Critical, serious, and minor violations 36 | violation_dat <- calculate_violation_types(violation_mat =vio_mat) 37 | 38 | ## Save results 39 | saveRDS(vio_mat, "DATA/21_food_inspection_violation_matrix_nums.Rds") 40 | saveRDS(violation_dat, "DATA/21_food_inspection_violation_matrix.Rds") 41 | -------------------------------------------------------------------------------- /CODE/22_calculate_heat_map_values.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## LOAD CACHED RDS FILES 14 | ##============================================================================== 15 | 16 | ## Import the key data sets used for prediction 17 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 18 | crime <- readRDS("DATA/12_crime.Rds") 19 | garbageCarts <- readRDS("DATA/14_garbage_carts.Rds") 20 | sanitationComplaints <- readRDS("DATA/15_sanitation_code.Rds") 21 | 22 | ## Apply filters by omitting rows that are not used in the model 23 | foodInspect <- filter_foodInspect(foodInspect) 24 | crime <- filter_crime(crime) 25 | garbageCarts <- filter_garbageCarts(garbageCarts) 26 | sanitationComplaints <- filter_sanitationComplaints(sanitationComplaints) 27 | 28 | ##============================================================================== 29 | ## CALCULATE HEAT MAP VALUES 30 | ##============================================================================== 31 | burglary_heat <- 32 | calculate_heat_values(inspections = foodInspect, 33 | observed_values = crime, 34 | window = 90, 35 | page_limit = 500) 36 | garbageCarts_heat <- 37 | calculate_heat_values(inspections = foodInspect, 
38 | observed_values = garbageCarts[ 39 | i = TRUE, 40 | j = list(Date = Creation_Date, 41 | Latitude, 42 | Longitude)], 43 | window = 90, 44 | page_limit = 500) 45 | sanitationComplaints_heat <- 46 | calculate_heat_values(inspections = foodInspect, 47 | observed_values = sanitationComplaints[ 48 | i = TRUE, 49 | j = list(Date = Creation_Date, 50 | Latitude, 51 | Longitude)], 52 | window = 90, 53 | page_limit = 500) 54 | 55 | ##============================================================================== 56 | ## SAVE HEAT MAP VALUES 57 | ##============================================================================== 58 | saveRDS(burglary_heat, "DATA/22_burglary_heat.Rds") 59 | saveRDS(garbageCarts_heat, "DATA/22_garbageCarts_heat.Rds") 60 | saveRDS(sanitationComplaints_heat, "DATA/22_sanitationComplaints_heat.Rds") 61 | 62 | 63 | -------------------------------------------------------------------------------- /CODE/23_food_insp_features.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 10 | geneorama::sourceDir("CODE/functions/") 11 | ## Import shift function 12 | shift <- geneorama::shift 13 | 14 | ##============================================================================== 15 | ## LOAD CACHED RDS FILES 16 | ##============================================================================== 17 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 18 | 19 | ## Apply row filter to remove invalid data 20 | foodInspect <- filter_foodInspect(foodInspect) 21 | 22 | ## Remove violations from food inspection; violations are captured in the 23 | ## violation matrix data 24 | foodInspect$Violations <- NULL 25 | 26 | ## Import violation matrix which lists violations by categories: 27 | ## Critical, serious, and minor violations 28 | violation_dat <- readRDS("DATA/21_food_inspection_violation_matrix.Rds") 29 | 30 | ##============================================================================== 31 | ## CALCULATE FEATURES 32 | ##============================================================================== 33 | 34 | ## Facility_Type_Clean: Anything that is not "restaurant" or "grocery" is "other" 35 | foodInspect[ , Facility_Type_Clean := 36 | categorize(x = Facility_Type, 37 | primary = list(Restaurant = "restaurant", 38 | Grocery_Store = "grocery"), 39 | ignore.case = TRUE)] 40 | ## Join in the violation matrix 41 | foodInspect <- merge(x = foodInspect, 42 | y = violation_dat, 43 | by = "Inspection_ID") 44 | ## Create pass / fail flags 45 | foodInspect[ , pass_flag := ifelse(Results=="Pass",1, 0)] 46 | foodInspect[ , fail_flag := ifelse(Results=="Fail",1, 0)] 47 | ## Set key to ensure that records are treated CHRONOLOGICALLY... 
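## (Added note: geneorama::shift is assumed to act as a grouped lag, i.e.
## shift(x, -1, fill) returns each element's predecessor within the group and
## uses `fill` for the group's first row. For example, shift(c(1, 0, 1), -1, 0)
## would give c(0, 1, 0), so every inspection sees the prior inspection's flag.)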
48 | setkey(foodInspect, License, Inspection_Date) 49 | ## Then find previous info by "shifting" the columns (grouped by License) 50 | foodInspect[ , pastFail := shift(fail_flag, -1, 0), by = License] 51 | foodInspect[ , pastCritical := shift(criticalCount, -1, 0), by = License] 52 | foodInspect[ , pastSerious := shift(seriousCount, -1, 0), by = License] 53 | foodInspect[ , pastMinor := shift(minorCount, -1, 0), by = License] 54 | 55 | ## Calculate time since last inspection. 56 | ## If the time is NA, this means it's the first inspection; add an indicator 57 | ## variable to indicate that it's the first inspection. 58 | foodInspect[i = TRUE , 59 | j = timeSinceLast := as.numeric( 60 | Inspection_Date - shift(Inspection_Date, -1, NA)) / 365, 61 | by = License] 62 | foodInspect[ , firstRecord := 0] 63 | foodInspect[is.na(timeSinceLast), firstRecord := 1] 64 | foodInspect[is.na(timeSinceLast), timeSinceLast := 2] 65 | foodInspect[ , timeSinceLast := pmin(timeSinceLast, 2)] 66 | 67 | ##============================================================================== 68 | ## SAVE RDS 69 | ##============================================================================== 70 | setkey(foodInspect, Inspection_ID) 71 | saveRDS(foodInspect, file.path("DATA/23_food_insp_features.Rds")) 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /CODE/24_bus_features.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 10 | geneorama::sourceDir("CODE/functions/") 11 | ## Import shift function 12 | shift <- geneorama::shift 13 | 14 | ##============================================================================== 15 | ## LOAD CACHED RDS FILES 16 | ##============================================================================== 17 | business <- readRDS("DATA/11_bus_license.Rds") 18 | 19 | ## Apply filter to remove invalid / unused data 20 | business <- filter_business(business) 21 | 22 | ## Food inspection data is needed for some feature calculations, e.g. the inspection date 23 | foodInspect <- readRDS("DATA/23_food_insp_features.Rds") 24 | 25 | ##============================================================================== 26 | ## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA 27 | ##============================================================================== 28 | 29 | ## Calculate min and max dates (by license) 30 | business[ , minDate := min(LICENSE_TERM_START_DATE), LICENSE_NUMBER] 31 | business[ , maxDate := max(LICENSE_TERM_EXPIRATION_DATE), LICENSE_NUMBER] 32 | 33 | ##============================================================================== 34 | ## Use only the business data that pertains to food inspections 35 | ##============================================================================== 36 | ## Create a table of matches between the food inspection and business license 37 | ## data, based on where the Inspection_Date falls within the business 38 | ## license renewal 39 | id_table_food2business <- find_bus_id_matches(business, foodInspect) 40 | geneorama::NAsummary(id_table_food2business) 41 | 42 | ## Add food key to matched business data
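## (Added note: matching an Inspection_Date into a license's [start, end] term
## is an interval join, presumably why 00_Startup.R requires data.table >=
## 1.10.0 "for foverlaps". A minimal sketch of that idiom with hypothetical
## tables `ins` and `lic`; find_bus_id_matches' actual implementation may differ:
# ins <- data.table(Inspection_ID = 1L,
#                   start = as.IDate("2015-06-01"), end = as.IDate("2015-06-01"))
# lic <- data.table(LICENSE_NUMBER = 10L,
#                   start = as.IDate("2015-01-01"), end = as.IDate("2015-12-31"))
# setkey(lic, start, end)
# foverlaps(ins, lic, by.x = c("start", "end"), type = "within")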
43 | bus_matched <- merge(x = id_table_food2business, 44 | y = business, 45 | by = "ID", 46 | all.y = FALSE, 47 | all.x = TRUE) 48 | setkey(bus_matched, Inspection_ID) 49 | 50 | ## Add business key to food data 51 | foodInspect <- merge(x = id_table_food2business, 52 | y = foodInspect, 53 | by = "Inspection_ID") 54 | setkey(foodInspect, Inspection_ID) 55 | 56 | ## Use minDate and Inspection_Date to calculate age at inspection 57 | bus_matched <- bus_matched[foodInspect[,Inspection_Date,keyby=Inspection_ID]] 58 | bus_matched[ , ageAtInspection := as.numeric(Inspection_Date - minDate) / 365] 59 | 60 | ## Remove Inspection Date to avoid name conflicts when merging later 61 | bus_matched[ , Inspection_Date := NULL] 62 | 63 | 64 | ## CALCULATE AND MERGE IN OTHER CATEGORIES 65 | OtherCategories <- GenerateOtherLicenseInfo(foodInspect, business, max_cat = 12) 66 | geneorama::NAsummary(OtherCategories) 67 | 68 | ## Merge in results 69 | bus_matched <- merge(x = bus_matched, 70 | y = OtherCategories, 71 | by = "Inspection_ID", 72 | all.x = T) 73 | ## Remove NAs in category columns and set max value to 1 74 | for (j in match(colnames(OtherCategories)[-1], colnames(bus_matched))) { 75 | set(x = bus_matched, i = which(is.na(bus_matched[[j]])), j = j, value = 0) 76 | set(x = bus_matched, j = j, value = pmin(bus_matched[[j]], 1)) 77 | } 78 | 79 | bus_matched 80 | 81 | ##============================================================================== 82 | ## SAVE RDS 83 | ##============================================================================== 84 | ## Set the key for bus_matched 85 | setkey(bus_matched, Inspection_ID) 86 | saveRDS(bus_matched, file.path("DATA/24_bus_features.Rds")) 87 | 88 | -------------------------------------------------------------------------------- /CODE/30_xgboost_model.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "xgboost", "ggplot2")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## LOAD CACHED RDS FILES 14 | ##============================================================================== 15 | food <- readRDS("DATA/23_food_insp_features.Rds") 16 | bus <- readRDS("DATA/24_bus_features.Rds") 17 | sanitarians <- readRDS("DATA/19_inspector_assignments.Rds") 18 | weather <- readRDS("DATA/17_mongo_weather_update.Rds") 19 | heat_burglary <- readRDS("DATA/22_burglary_heat.Rds") 20 | heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds") 21 | heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds") 22 | 23 | ##============================================================================== 24 | ## MERGE IN FEATURES 25 | ##============================================================================== 26 | sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian] 27 | setnames(heat_burglary, "heat_values", "heat_burglary") 28 | setnames(heat_garbage, "heat_values", "heat_garbage") 29 | setnames(heat_sanitation, "heat_values", "heat_sanitation") 30 | 31 | dat <- copy(food) 32 | dat <- dat[bus] 33 | dat <- merge(x = dat, y = sanitarians, by = "Inspection_ID") 34 | dat 
<- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date") 35 | dat <- merge(dat, na.omit(heat_burglary), by = "Inspection_ID") 36 | dat <- merge(dat, na.omit(heat_garbage), by = "Inspection_ID") 37 | dat <- merge(dat, na.omit(heat_sanitation), by = "Inspection_ID") 38 | 39 | ## Set the key for dat 40 | setkey(dat, Inspection_ID) 41 | 42 | ## Remove unnecessary data 43 | rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation) 44 | 45 | ## Only the model data should be present 46 | geneorama::lll() 47 | 48 | ##============================================================================== 49 | ## FILTER ROWS 50 | ##============================================================================== 51 | dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"] 52 | dat 53 | 54 | ##============================================================================== 55 | ## DISPLAY AVAILABLE VARIABLES 56 | ##============================================================================== 57 | geneorama::NAsummary(dat) 58 | 59 | ##============================================================================== 60 | ## Add criticalFound variable to dat: 61 | ##============================================================================== 62 | dat[ , criticalFound := pmin(1, criticalCount)] 63 | 64 | ##============================================================================== 65 | ## Calculate index for training data (last three months) 66 | ##============================================================================== 67 | dat[ , Test := Inspection_Date >= (max(Inspection_Date) - 90)] 68 | 69 | ##============================================================================== 70 | ## CREATE MODEL DATA 71 | ##============================================================================== 72 | # sort(colnames(dat)) 73 | xmat <- dat[ , list(Inspector = as.character(sanitarian), 74 | pastSerious = pmin(pastSerious, 1), 75 | pastCritical = pmin(pastCritical, 1), 76 | timeSinceLast, 77 | ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), 78 | consumption_on_premises_incidental_activity, 79 | tobacco, 80 | temperatureMax, 81 | heat_burglary = pmin(heat_burglary, 70), 82 | heat_sanitation = pmin(heat_sanitation, 70), 83 | heat_garbage = pmin(heat_garbage, 50), 84 | criticalFound), 85 | keyby = list(Inspection_ID, Test)] 86 | 87 | ## View the structure of the final xmat 88 | str(xmat) 89 | 90 | ##============================================================================== 91 | ## XGBOOST MODEL 92 | ##============================================================================== 93 | 94 | mm <- model.matrix(criticalFound ~ . 
-1, 95 | data = xmat[ , .SD, .SDcol=-key(xmat)]) 96 | 97 | iiTest <- dat$Test 98 | iiTrain <- !dat$Test 99 | 100 | 101 | # Extract train data set 102 | train <- mm[iiTrain, ] 103 | test <- mm[iiTest, ] 104 | 105 | train.target <- xmat[iiTrain, criticalFound] 106 | test.target <- xmat[iiTest, criticalFound] 107 | 108 | #table(train.target) 109 | #table(test.target) 110 | 111 | set.seed(1) 112 | 113 | # 5% of the training data is held out for validation (733+120 = 853, about 5% of the 17075-row train set) 114 | h <- c( sample(which(train.target == 0), 733), sample(which(train.target == 1), 120)) 115 | 116 | # Create validation data set based on the above rows 117 | dval <- xgb.DMatrix(data = data.matrix(train[h, ]), label = train.target[h]) 118 | dtrain <- xgb.DMatrix(data = data.matrix(train), label = train.target) 119 | 120 | watchlist <- list(val = dval, train = dtrain) 121 | 122 | # Run xgbmodel 123 | set.seed(1) 124 | xgbmodel <- xgb.train(data = dtrain, 125 | nfold = 5, 126 | eta = 0.02, 127 | max_depth = 6, 128 | nround=500, 129 | subsample = 0.75, 130 | colsample_bytree = 0.75, 131 | eval_metric = "mlogloss", 132 | objective = "multi:softprob", 133 | num_class = 2, 134 | nthread = 4, 135 | num_parallel_trees = 500, # NOTE: xgboost's parameter is `num_parallel_tree` (singular); as spelled, this argument is not recognized 136 | early_stopping_rounds = 25, 137 | watchlist = watchlist, 138 | verbose = 1, 139 | gamma = 0 140 | ) 141 | 142 | # Predict Test data set score 143 | y_pred <- predict(xgbmodel, data.matrix(test), ntree=xgbmodel$bestInd) 144 | 145 | # Extract the second column's value, which corresponds to "criticalFound = 1" 146 | y_pred <- data.frame(matrix(y_pred, byrow = TRUE, ncol = 2))[, 2] 147 | 148 | # Calculate RMSE based on predicted and actual values 149 | RMSE <- sqrt(mean((test.target-y_pred)^2)) # test1637 150 | RMSE 151 | 152 | # RMSE: 0.3590026; 446 rounds; eta = 0.04; subsample = 0.75; colsample_bytree = 0.75 - reg:linear 153 | # RMSE: 0.3742767; 3000 rounds; eta = 0.04; subsample = 0.75; colsample_bytree = 0.75 - reg:linear 154 | # RMSE: 0.3678386; 2000 rounds; eta = 0.04; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 155 | # RMSE: 0.3586859; 1200 rounds; eta = 0.025; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 156 | # RMSE: 0.3596287; 1200 rounds; eta = 0.03; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 157 | # RMSE: 0.3578677; 1200 rounds; eta = 0.02; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 158 | # RMSE: 0.3576444; 1100 rounds; eta = 0.02; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 159 | # RMSE: 0.3559322; 1100 rounds; eta = 0.02; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 160 | # RMSE: 0.3563522; 500 rounds; eta = 0.02; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 161 | # RMSE: 0.3563522; 1000 rounds; eta = 0.02; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 162 | # RMSE: 0.3792566; 1200 rounds; eta = 0.002; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 163 | 164 | ## ATTACH PREDICTIONS TO DAT 165 | y_pred_mm <- predict(xgbmodel, data.matrix(mm), ntree = xgbmodel$bestInd) 166 | 167 | 168 | 169 | dat$score <- data.frame(matrix(y_pred_mm, byrow = TRUE, ncol = 2))[, 2] 170 | 171 | ## Identify each row as test / train 172 | dat$Test <- iiTest 173 | dat$Train <- iiTrain 174 | 175 | ##============================================================================== 176 | ## SAVE RESULTS 177 | ##============================================================================== 178 | 179 | saveRDS(dat, "DATA/30_xgboost_data.Rds") 180 | saveRDS(xgbmodel, 
"DATA/30_xgboost_model.Rds") 181 | 182 | 183 | -------------------------------------------------------------------------------- /CODE/30a_glmnet_model.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "glmnet")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## LOAD CACHED RDS FILES 14 | ##============================================================================== 15 | food <- readRDS("DATA/23_food_insp_features.Rds") 16 | bus <- readRDS("DATA/24_bus_features.Rds") 17 | sanitarians <- readRDS("DATA/19_inspector_assignments.Rds") 18 | weather <- readRDS("DATA/17_mongo_weather_update.Rds") 19 | heat_burglary <- readRDS("DATA/22_burglary_heat.Rds") 20 | heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds") 21 | heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds") 22 | 23 | ##============================================================================== 24 | ## MERGE IN FEATURES 25 | ##============================================================================== 26 | sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian] 27 | setnames(heat_burglary, "heat_values", "heat_burglary") 28 | setnames(heat_garbage, "heat_values", "heat_garbage") 29 | setnames(heat_sanitation, "heat_values", "heat_sanitation") 30 | 31 | dat <- copy(food) 32 | dat <- dat[bus] 33 | dat <- merge(x = dat, y = sanitarians, by = "Inspection_ID") 34 | dat <- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date") 35 | dat <- merge(dat, na.omit(heat_burglary), by = "Inspection_ID") 36 | dat <- merge(dat, na.omit(heat_garbage), by = "Inspection_ID") 37 | dat <- merge(dat, na.omit(heat_sanitation), by = "Inspection_ID") 38 | 39 | ## Set the key for dat 40 | setkey(dat, Inspection_ID) 41 | 42 | ## Remove unnecessary data 43 | rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation) 44 | 45 | ## Only the model data should be present 46 | geneorama::lll() 47 | 48 | ##============================================================================== 49 | ## FILTER ROWS 50 | ##============================================================================== 51 | dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"] 52 | dat 53 | 54 | ##============================================================================== 55 | ## DISPLAY AVAILABLE VARIABLES 56 | ##============================================================================== 57 | geneorama::NAsummary(dat) 58 | 59 | ##============================================================================== 60 | ## Add criticalFound variable to dat: 61 | ##============================================================================== 62 | dat[ , criticalFound := pmin(1, criticalCount)] 63 | 64 | ##============================================================================== 65 | ## Calculate index for training data (last three months) 66 | ##============================================================================== 67 | dat[ , Test := Inspection_Date >= (max(Inspection_Date) - 90)] 68 | 69 | 
##============================================================================== 70 | ## CREATE MODEL DATA 71 | ##============================================================================== 72 | # sort(colnames(dat)) 73 | xmat <- dat[ , list(Inspector = as.character(sanitarian), 74 | pastSerious = pmin(pastSerious, 1), 75 | pastCritical = pmin(pastCritical, 1), 76 | timeSinceLast, 77 | ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), 78 | consumption_on_premises_incidental_activity, 79 | tobacco, 80 | temperatureMax, 81 | heat_burglary = pmin(heat_burglary, 70), 82 | heat_sanitation = pmin(heat_sanitation, 70), 83 | heat_garbage = pmin(heat_garbage, 50), 84 | criticalFound), 85 | keyby = list(Inspection_ID, Test)] 86 | 87 | ## View the structure of the final xmat 88 | str(xmat) 89 | 90 | ##============================================================================== 91 | ## GLMNET MODEL 92 | ##============================================================================== 93 | ## Construct model matrix without the key values in xmat 94 | mm <- model.matrix(criticalFound ~ . -1, 95 | data = xmat[ , .SD, .SDcol=-key(xmat)]) 96 | str(mm) 97 | colnames(mm) 98 | 99 | # fit ridge regression, alpha = 0, only inspector coefficients penalized 100 | penalty <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0) 101 | 102 | ## PRODUCTION version of the model which is not split for test / train 103 | model <- cv.glmnet(x = mm, 104 | y = xmat[ , criticalFound], 105 | family = "binomial", 106 | alpha = 0, 107 | penalty.factor = penalty) 108 | 109 | ## EVALUATION version of the model which is only fit on the training data 110 | model_eval <- cv.glmnet(x = mm[xmat$Test==FALSE, ], 111 | y = xmat[Test == FALSE , criticalFound], 112 | family = "binomial", 113 | alpha = 0, 114 | penalty.factor = penalty) 115 | 116 | ## Lambda 117 | model$lambda 118 | model$lambda.min 119 | 120 | ## Attach predictions for top lambda choice to the data 121 | dat$glm_pred <- predict(model$glmnet.fit, 122 | newx = mm, 123 | s = model$lambda.min, 124 | type = "response")[,1] 125 | dat$glm_pred_test <- predict(model_eval$glmnet.fit, 126 | newx = mm, 127 | s = model_eval$lambda.min, 128 | type = "response")[,1] 129 | 130 | ## Coefficients 131 | coef <- coef(model)[,1] 132 | inspCoef <- coef[grepl("^Inspector",names(coef))] 133 | inspCoef <- inspCoef[order(-inspCoef)] 134 | 135 | ## Save Results 136 | saveRDS(dat, "DATA/30_dat.Rds") 137 | saveRDS(xmat, "DATA/30_xmat.Rds") 138 | saveRDS(mm, "DATA/30_modelmatrix.Rds") 139 | saveRDS(coef, "DATA/30_coef.Rds") 140 | saveRDS(inspCoef, "DATA/30_inspCoef.Rds") 141 | saveRDS(model, "DATA/30_model.Rds") 142 | saveRDS(model_eval, "DATA/30_model_eval.Rds") 143 | -------------------------------------------------------------------------------- /CODE/30b_glmnet_model_evaluation.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries that are used 9 | geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "ROCR")) 10 | ## Load custom functions 11 | geneorama::sourceDir("CODE/functions/") 12 | 13 | ##============================================================================== 14 | ## LOAD CACHED RDS FILES 15 | 
##============================================================================== 16 | dat <- readRDS("DATA/30_dat.Rds") 17 | cvfit <- readRDS("DATA/30_model_eval.Rds") 18 | mm <- readRDS("DATA/30_modelmatrix.Rds") 19 | 20 | str(cvfit) 21 | str(cvfit$glmnet.fit$beta) 22 | 23 | ## Calculate scores for all lambda values 24 | allscores <- predict(cvfit$glmnet.fit, 25 | newx = as.matrix(mm), 26 | s = cvfit$glmnet.fit$lambda, 27 | type = "response") 28 | allscores <- as.data.table(allscores) 29 | setnames(allscores, cvfit$glmnet.fit$beta@Dimnames[[2]]) 30 | 31 | ## Identify each row as test / train 32 | allscores$Test <- dat$Test 33 | allscores$Train <- !dat$Test 34 | 35 | ##============================================================================== 36 | ## GLMNET specific diagnostics 37 | ##============================================================================== 38 | 39 | ## Find the index of the lambda that is closest to the optimal lambda of cvfit 40 | ## Because of issues with numerical precision on different platforms, it's a 41 | ## good idea to find the index of the lambda, rather than rely on matching the 42 | ## actual values. 43 | ## This result should be zero, but it might not be exactly zero on all platforms 44 | ## cvfit$glmnet.fit$lambda[iLambda] - cvfit$lambda.min 45 | cat("The lambda that minimizes error in cvfit is:", cvfit$lambda.min, "\n") 46 | iLambda <- which.min((cvfit$glmnet.fit$lambda - cvfit$lambda.min)^2) 47 | 48 | ## Print the coefficients 49 | coef <- cvfit$glmnet.fit$beta[,iLambda] 50 | coef 51 | 52 | ## Performance for different values of lambda: 53 | plot(cvfit$cvm ~ log(cvfit$lambda)) 54 | 55 | ## Note, this is equivalent to the log likelihood results 56 | loglik_errors <- sapply(1:100, 57 | function(i) { 58 | logLik(p = allscores[Train==TRUE][[i]], 59 | y = dat[Test==FALSE, criticalFound]) 60 | }) 61 | plot(loglik_errors ~ log(cvfit$glmnet.fit$lambda)) 62 | lines(x = log(cvfit$glmnet.fit$lambda), 63 | y = (1 - cvfit$glmnet.fit$dev.ratio) * cvfit$glmnet.fit$nulldev / 2, 64 | col = "blue") 65 | 66 | ## Evolution of coefficients as penalty changes 67 | plot(cvfit$glmnet.fit, label = TRUE) 68 | plot(cvfit$glmnet.fit, xvar = "dev", label = TRUE) 69 | 70 | 71 | ##============================================================================== 72 | ## Gini and confusion matrix calculations 73 | ##============================================================================== 74 | 75 | # Show gini performance of inspector model on tune data set 76 | dat[Test==FALSE, gini(glm_pred, criticalFound, plot=TRUE)] 77 | dat[Test==TRUE, gini(glm_pred, criticalFound, plot=TRUE)] 78 | 79 | ## Calculate confusion matrix values for evaluation 80 | calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], 81 | expected = dat[Test==TRUE, glm_pred_test], 82 | r = .25) 83 | 84 | ## Calculate matrix of confusion matrix values for evaluation 85 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 86 | calculate_confusion_values, 87 | actual = dat[Test==TRUE, criticalFound], 88 | expected = dat[Test==TRUE, glm_pred_test])) 89 | confusion_values_test <- as.data.table(confusion_values_test) 90 | confusion_values_test 91 | 92 | 93 | ggplot(melt(confusion_values_test, id.vars="r")) + 94 | aes(x=r, y=value, colour=variable) + geom_line() + 95 | geom_hline(yintercept = c(0,1)) 96 | ##============================================================================== 97 | ## MODEL EVALUATION 98 | ## - TIME SAVINGS 99 | ## - PERIOD A vs PERIOD B 100 | 
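## (Added note: the Period A/B comparison below splits the simulated schedule
## into halves; if ordering inspections by model score front-loads risk, the
## first half (Period A) should capture a larger share of the critical
## violations than it did under the historical schedule.)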
##============================================================================== 101 | ## Subset of just observations during test period: 102 | datTest <- dat[Test == TRUE] 103 | 104 | ## Mean time savings: 105 | datTest[ , simulated_date_diff_mean(Inspection_Date, glm_pred_test, criticalFound)] 106 | 107 | ## Detailed time savings: 108 | bins <- datTest[ , simulated_bin_summary(Inspection_Date, glm_pred_test, criticalFound)] 109 | bins 110 | 111 | ## This calculation is the weighted average date difference, which should match 112 | ## the previous result from `simulated_bin_summary` 113 | bins[ , sum(as.integer(date) * POS) / sum(POS)] - 114 | bins[ , sum(as.integer(date) * POS_SIM) / sum(POS_SIM)] 115 | 116 | ## Find the midpoint of the inspections to divide bins into period A & B 117 | mid <- bins[ , sum(N)/2] 118 | 119 | ## Divide the bins into period A & B based on midpoint 120 | ## Note: GT and LT are strict, so ties would be excluded. Although there are no 121 | ## ties for now (as of 2015 publication). 122 | binsA <- bins[NTOT < mid] 123 | binsB <- bins[NTOT > mid] 124 | 125 | tot_crit <- sum(bins$POS) 126 | 127 | binsA[ , sum(POS)/tot_crit] ## [1] 0.5465116 128 | binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 129 | 130 | 131 | ##============================================================================== 132 | ## Metrics with ROCR Package 133 | ##============================================================================== 134 | 135 | ## Example with random values: 136 | # predTest <- prediction(datTest[ ,list(glm_pred_test, runif(.N) )], 137 | # datTest[ ,list(criticalFound, criticalFound)]) 138 | 139 | predTest <- prediction(datTest$glm_pred_test, datTest$criticalFound) 140 | 141 | ## precision / recall 142 | plot(performance(predTest, "prec", "rec"), main="precision recall") 143 | 144 | # ROC 145 | plot(performance(predTest, "tpr", "fpr"), main="ROC") 146 | abline(0, 1, lty=2) 147 | 148 | ## sensitivity / specificity 149 | plot(performance(predTest, "sens", "spec"), main="sensitivity vs specificity") 150 | abline(1, -1, lty=2) 151 | 152 | ## phi 153 | plot(performance(predTest, "phi"), main="phi scores") 154 | 155 | ## Fancy ROC curve: 156 | op <- par(bg="lightgray", mai=c(1.2,1.5,1,1)) 157 | plot(performance(predTest,"tpr","fpr"), 158 | main="ROC Curve", colorize=TRUE, lwd=10) 159 | par(op) 160 | 161 | ## Effect of using a cost function on cutoffs 162 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 1), 163 | main="Even costs (FP=1 FN=1)") 164 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 4), 165 | main="Higher cost for FN (FP=1 FN=4)") 166 | 167 | ## Accuracy 168 | plot(performance(predTest, measure = "acc")) 169 | 170 | ## AUC 171 | performance(predTest, measure = "auc")@y.values[[1]] 172 | 173 | -------------------------------------------------------------------------------- /CODE/31_xgboost_model_evaluation.R: -------------------------------------------------------------------------------- 1 | 2 | ##============================================================================== 3 | ## INITIALIZE 4 | ##============================================================================== 5 | if(interactive()){ 6 | ## Remove all objects; perform garbage collection 7 | rm(list=ls()) 8 | gc(reset=TRUE) 9 | ## Detach libraries that are not used 10 | geneorama::detach_nonstandard_packages() 11 | } 12 | ## Load libraries that are used 13 | geneorama::loadinstall_libraries(c("data.table", "ggplot2", "ROCR")) 14 | ## Load custom functions
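## (Added note: sourceDir is assumed to source each helper under
## CODE/functions/, e.g. gini, calculate_confusion_values,
## simulated_date_diff_mean, and simulated_bin_summary, all used below.)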
15 | geneorama::sourceDir("CODE/functions/") 16 | 17 | ##============================================================================== 18 | ## LOAD CACHED RDS FILES 19 | ##============================================================================== 20 | dat <- readRDS("DATA/30_xgboost_data.Rds") 21 | model <- readRDS("DATA/30_xgboost_model.Rds") 22 | 23 | ##============================================================================== 24 | ## XGBoost specific diagnostics 25 | ##============================================================================== 26 | 27 | 28 | ##============================================================================== 29 | ## Gini and confusion matrix calculations 30 | ##============================================================================== 31 | 32 | ## Show gini performance of the model on the train and test data sets 33 | 34 | dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)] 35 | dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)] 36 | 37 | ## Calculate confusion matrix values for evaluation 38 | calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], 39 | expected = dat[Test==TRUE, score], 40 | r = .25) 41 | 42 | ### XGBoost 43 | # r true_pos true_neg false_neg false_pos 44 | # 0.25000000 0.07819181 0.65180208 0.07941356 0.19059255 45 | 46 | 47 | ### Random Forest 48 | # r true_pos true_neg false_neg false_pos 49 | #0.25000000 0.05497862 0.72632865 0.10262676 0.11606597 50 | 51 | 52 | ## Calculate matrix of confusion matrix values for evaluation 53 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 54 | calculate_confusion_values, 55 | actual = dat[Test==TRUE, criticalFound], 56 | expected = dat[Test==TRUE, score])) 57 | confusion_values_test <- as.data.table(confusion_values_test) 58 | confusion_values_test 59 | 60 | 61 | ggplot(melt(confusion_values_test, id.vars="r")) + 62 | aes(x=r, y=value, colour=variable) + geom_line() + 63 | geom_hline(yintercept = c(0,1)) 64 | 65 | ##============================================================================== 66 | ## MODEL EVALUATION 67 | ## - TIME SAVINGS 68 | ## - PERIOD A vs PERIOD B 69 | ##============================================================================== 70 | ## Subset of just observations during test period: 71 | datTest <- dat[Test == TRUE] 72 | 73 | ## Mean time savings: 74 | datTest[ , simulated_date_diff_mean(Inspection_Date, score, criticalFound)] 75 | 76 | ## Detailed time savings: 77 | bins <- datTest[ , simulated_bin_summary(Inspection_Date, score, criticalFound)] 78 | bins 79 | 80 | ## This calculation is the weighted average date difference, which should match 81 | ## the previous result from `simulated_bin_summary` 82 | bins[ , sum(as.integer(date) * POS) / sum(POS)] - 83 | bins[ , sum(as.integer(date) * POS_SIM) / sum(POS_SIM)] 84 | 85 | ## Find the midpoint of the inspections to divide bins into period A & B 86 | mid <- bins[ , sum(N)/2] 87 | 88 | ## Divide the bins into period A & B based on midpoint 89 | ## Note: GT and LT are strict, so ties would be excluded. Although there are no 90 | ## ties for now (as of 2015 publication). 
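## (Added worked example: the difference of weighted averages above is the mean
## detection speed-up. If historical critical violations were found on
## simulated-calendar day 50 on average, and the re-ordered schedule finds them
## on day 42.2, the difference is about 7.8 days; cf. the 7.790698-day XGBoost
## figure noted at the end of this script.)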
91 | binsA <- bins[NTOT < mid] 92 | binsB <- bins[NTOT > mid] 93 | 94 | tot_crit <- sum(bins$POS) 95 | 96 | binsA[ , sum(POS)/tot_crit] ## [1] 0.5465116 97 | binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 98 | 99 | 100 | ##============================================================================== 101 | ## Metrics with ROCR Package 102 | ##============================================================================== 103 | 104 | ## Example with random values: 105 | # predTest <- prediction(datTest[ ,list(score, runif(.N) )], 106 | # datTest[ ,list(criticalFound, criticalFound)]) 107 | 108 | predTest <- prediction(datTest$score, datTest$criticalFound) 109 | 110 | ## precision / recall 111 | plot(performance(predTest, "prec", "rec"), main="precision recall") 112 | 113 | # ROC 114 | plot(performance(predTest, "tpr", "fpr"), main="ROC") 115 | abline(0, 1, lty=2) 116 | 117 | ## sensitivity / specificity 118 | plot(performance(predTest, "sens", "spec"), main="sensitivity vs specificity") 119 | abline(1, -1, lty=2) 120 | 121 | ## phi 122 | plot(performance(predTest, "phi"), main="phi scores") 123 | 124 | ## Fancy ROC curve: 125 | op <- par(bg="lightgray", mai=c(1.2,1.5,1,1)) 126 | plot(performance(predTest,"tpr","fpr"), 127 | main="ROC Curve", colorize=TRUE, lwd=3) 128 | par(op) 129 | 130 | ## Effect of using a cost function on cutoffs 131 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 1), 132 | main="Even costs (FP=1 FN=1)") 133 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 4), 134 | main="Higher cost for FN (FP=1 FN=4)") 135 | 136 | ## Accuracy 137 | plot(performance(predTest, measure = "acc")) 138 | 139 | ## AUC 140 | performance(predTest, measure = "auc")@y.values[[1]] 141 | # 0.6790282 - AUC: XGBoost 142 | # 0.6577989 - AUC: Random Forest 143 | 144 | # No. of days to discover restaurants with critical violations 145 | # Random Forest: Time difference of 6.554264 days 146 | # XGBoost: Time difference of 7.790698 days 147 | 148 | # Random Forest Output 149 | # Show gini performance of the model on the train and test data sets 150 | dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)] 151 | # 0.7674537 Gini Score: XGBoost 152 | # 0.9956746 Gini Score: RandomForest 153 | 154 | 155 | dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)] 156 | # 0.358062 Gini Score: XGBoost 157 | # 0.3159969 Gini Score: RandomForest 158 | 159 | 160 | -------------------------------------------------------------------------------- /CODE/31a_random_forest_model.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries that are used 9 | geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2")) 10 | ## Load custom functions 11 | geneorama::sourceDir("CODE/functions/") 12 | 13 | ##============================================================================== 14 | ## LOAD CACHED RDS FILES 15 | ##============================================================================== 16 | dat <- readRDS("DATA/30_dat.Rds") 17 | mm <- readRDS("DATA/30_modelmatrix.Rds") 18 | xmat <- readRDS("DATA/30_xmat.Rds") 19 | 20 | ##============================================================================== 21 | ## RANDOM FOREST MODEL
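## (Added note: y is passed as a factor, so randomForest fits a classification
## forest with its default of 500 trees; like the glmnet evaluation model, it
## is fit only on the Test == FALSE rows.)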
22 | ##============================================================================== 23 | model <- randomForest(x = mm[xmat$Test==FALSE, ], 24 | y = as.factor(xmat[xmat$Test==FALSE, criticalFound]), 25 | importance=TRUE) 26 | 27 | ## ATTACH PREDICTIONS TO DAT 28 | dat$rf_pred_test <- predict(model, 29 | as.matrix(mm), 30 | type="prob")[ , 2] 31 | 32 | ##============================================================================== 33 | ## SAVE RESULTS 34 | ##============================================================================== 35 | 36 | saveRDS(dat, "DATA/31_random_forest_data.Rds") 37 | saveRDS(model, "DATA/31_random_forest_model.Rds") 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /CODE/31b_random_forest_evaluation.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries that are used 9 | geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2", "ROCR")) 10 | ## Load custom functions 11 | geneorama::sourceDir("CODE/functions/") 12 | 13 | ##============================================================================== 14 | ## LOAD CACHED RDS FILES 15 | ##============================================================================== 16 | dat <- readRDS("DATA/31_random_forest_data.Rds") 17 | model <- readRDS("DATA/31_random_forest_model.Rds") 18 | 19 | ##============================================================================== 20 | ## Random Forest specific diagnostics 21 | ##============================================================================== 22 | varImpPlot(model) 23 | importance(model) 24 | 25 | ##============================================================================== 26 | ## Gini and confusion matrix calculations 27 | ##============================================================================== 28 | 29 | ## Show gini performance of the model on the train and test data sets 30 | 31 | dat[Test==FALSE, gini(rf_pred_test, criticalFound, plot=TRUE)] 32 | dat[Test==TRUE, gini(rf_pred_test, criticalFound, plot=TRUE)] 33 | 34 | ## Calculate confusion matrix values for evaluation 35 | calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], 36 | expected = dat[Test==TRUE, rf_pred_test], 37 | r = .25) 38 | 39 | ## Calculate matrix of confusion matrix values for evaluation 40 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 41 | calculate_confusion_values, 42 | actual = dat[Test==TRUE, criticalFound], 43 | expected = dat[Test==TRUE, rf_pred_test])) 44 | confusion_values_test <- as.data.table(confusion_values_test) 45 | confusion_values_test 46 | 47 | 48 | ggplot(melt(confusion_values_test, id.vars="r")) + 49 | aes(x=r, y=value, colour=variable) + geom_line() + 50 | geom_hline(yintercept = c(0,1)) 51 | 52 | ##============================================================================== 53 | ## MODEL EVALUATION 54 | ## - TIME SAVINGS 55 | ## - PERIOD A vs PERIOD B 56 | ##============================================================================== 57 | ## Subset of just observations during test period: 58 | datTest <- dat[Test == TRUE] 59 | 60 | ## Mean time savings: 61 | datTest[ , 
simulated_date_diff_mean(Inspection_Date, rf_pred_test, criticalFound)] 62 | 63 | ## Detailed time savings: 64 | bins <- datTest[ , simulated_bin_summary(Inspection_Date, rf_pred_test, criticalFound)] 65 | bins 66 | 67 | ## This calculation is the weighted average date difference, which should match 68 | ## the previous result from `simulated_bin_summary` 69 | bins[ , sum(as.integer(date) * POS) / sum(POS)] - 70 | bins[ , sum(as.integer(date) * POS_SIM) / sum(POS_SIM)] 71 | 72 | ## Find the midpoint of the inspections to divide bins into period A & B 73 | mid <- bins[ , sum(N)/2] 74 | 75 | ## Divide the bins into period A & B based on midpoint 76 | ## Note: GT and LT are strict, so ties would be excluded. There are no 77 | ## ties for now (as of the 2015 publication). 78 | binsA <- bins[NTOT < mid] 79 | binsB <- bins[NTOT > mid] 80 | 81 | tot_crit <- sum(bins$POS) 82 | 83 | ## THE COMMENT VALUES ARE FROM THE GLMNET RUN 84 | ## HOW CAN THESE BE THE SAME??? 85 | binsA[ , sum(POS)/tot_crit] ## [1] 0.5465116 86 | binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 87 | 88 | 89 | ##============================================================================== 90 | ## Metrics with ROCR Package 91 | ##============================================================================== 92 | 93 | ## Example with random values: 94 | # predTest <- prediction(datTest[ ,list(rf_pred_test, runif(.N) )], 95 | # datTest[ ,list(criticalFound, criticalFound)]) 96 | 97 | predTest <- prediction(datTest$rf_pred_test, datTest$criticalFound) 98 | 99 | ## precision / recall 100 | plot(performance(predTest, "prec", "rec"), main="precision recall") 101 | 102 | # ROC 103 | plot(performance(predTest, "tpr", "fpr"), main="ROC") 104 | abline(0, 1, lty=2) 105 | 106 | ## sensitivity / specificity 107 | plot(performance(predTest, "sens", "spec"), main="sensitivity vs specificity") 108 | abline(1, -1, lty=2) 109 | 110 | ## phi 111 | plot(performance(predTest, "phi"), main="phi scores") 112 | 113 | ## Fancy ROC curve: 114 | op <- par(bg="lightgray", mai=c(1.2,1.5,1,1)) 115 | plot(performance(predTest,"tpr","fpr"), 116 | main="ROC Curve", colorize=TRUE, lwd=10) 117 | par(op) 118 | 119 | ## Effect of using a cost function on cutoffs 120 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 1), 121 | main="Even costs (FP=1 FN=1)") 122 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 4), 123 | main="Higher cost for FN (FP=1 FN=4)") 124 | 125 | ## Accuracy 126 | plot(performance(predTest, measure = "acc")) 127 | 128 | ## AUC 129 | performance(predTest, measure = "auc")@y.values[[1]] 130 | -------------------------------------------------------------------------------- /CODE/README.md: -------------------------------------------------------------------------------- 1 | # Files 2 | 3 | - ```00_Startup.R``` Downloads the necessary packages required to step 4 | through the rest of the R scripts 5 | - ```21_calculate_violation_matrix.R``` Performs matrix calculations 6 | on types of violations in the inspections data. This step is 7 | performed in a separate script as it takes some time. 8 | - ```22_calculate_heat_map_values.R``` Calculates heat maps for 9 | garbage, crime (burglary) and sanitation complaints data.
10 | - ```23_generate_model_dat.R``` Filters primary datasets, creates a 11 | basis for the model, creates features based on various data 12 | sets, attaches heat maps, inspectors and weather, and performs 13 | requisite merges to the basis model 14 | - ```30_glmnet_model.R``` The pre-calculated output from previous 15 | scripts is imported and used in the model built, trained and tested 16 | in this script. The main data set is indexed by time, and past data 17 | is used to independently build the model. The model is then applied 18 | to test data. Finally, this script also includes the necessary code to 19 | evaluate the effectiveness of the City of Chicago’s data-driven food 20 | inspections pilot. 21 | -------------------------------------------------------------------------------- /CODE/functions/GenerateOtherLicenseInfo.R: -------------------------------------------------------------------------------- 1 | 2 | GenerateOtherLicenseInfo <- function(inspection_data, 3 | business_data, 4 | max_cat = 99){ 5 | 6 | ## For debugging: 7 | # inspection_data <- copy(foodInspect) 8 | # business_data <- copy(business) 9 | 10 | ## MAKE DATA COPIES AND SET KEY 11 | food_licenses <- inspection_data[i = TRUE, 12 | j = list(d_insp = Inspection_Date, 13 | id = Inspection_ID), 14 | keyby = list(license_food = License)] 15 | biz <- business_data[i = TRUE, 16 | j = list(dba = DOING_BUSINESS_AS_NAME, 17 | addr = ADDRESS, 18 | d_start = LICENSE_TERM_START_DATE, 19 | d_end = LICENSE_TERM_EXPIRATION_DATE, 20 | desc = LICENSE_DESCRIPTION), 21 | keyby = list(license = LICENSE_NUMBER)] 22 | 23 | ## JOIN BUSINESS NAMES TO FOOD DATA 24 | food_licenses_names <- biz[food_licenses, mult="first"] 25 | food_licenses_names <- food_licenses_names[i = TRUE, 26 | j = list(license_insp = license, id), 27 | keyby = list(dba, addr, d_insp)] 28 | 29 | ## SUBSET OF BUSINESSES THAT ARE NOT LICENSED WITHIN THE FOOD DATABASE 30 | biz_nomatch <- biz[!(license %in% food_licenses_names$license_insp)] 31 | 32 | ## SET KEY FOR biz_nomatch TO ENABLE MATCHING IN food_licenses_names 33 | setkey(biz_nomatch, dba, addr, d_end) 34 | 35 | ## MAKE A COPY OF THE INSPECTION DATE, WHICH GETS OVERWRITTEN BY THE END DATE IN THE ROLLING JOIN 36 | food_licenses_names[ , d_insp_copy := d_insp] 37 | 38 | ## CREATE TABLE OF LICENSE DESCRIPTIONS 39 | tab <- food_licenses_names[biz_nomatch, roll=Inf] 40 | tab <- tab[!is.na(id)] 41 | tab <- tab[ , .N, keyby = list(dba, addr, d_insp=d_insp_copy, desc)] 42 | tab <- dcast.data.table(data = tab, 43 | formula = dba + addr + d_insp ~ desc, 44 | value.var = "N", 45 | fill = 0L) 46 | 47 | ## SUMMARIZE TOTALS FOR EACH CATEGORY 48 | category_totals <- as.data.table(sapply(tab[,4:ncol(tab), with=F], sum), 49 | keep.rownames = TRUE)[order(-V2)] 50 | setnames(category_totals, c("cat", "N")) 51 | ## LIMIT CATEGORY COLUMNS 52 | categories_keep <- category_totals[1:max(min(max_cat, nrow(category_totals)-3), 1), 53 | cat] 54 | tab_final <- tab[,c("dba", "addr", "d_insp", categories_keep), with=F] 55 | 56 | setkey(food_licenses_names , dba, addr, d_insp) 57 | tab_final[food_licenses_names] 58 | 59 | ## MERGE RESULTS BACK AND ONLY KEEP INSPECTION ID AS KEY 60 | ret <- food_licenses_names[tab_final][,c("id", categories_keep), with=F] 61 | setnames(ret, "id", "Inspection_ID") 62 | setnames(ret, colnames(ret)[-1], tolower(colnames(ret)[-1])) 63 | setnames(ret, colnames(ret)[-1], gsub("[[:punct:]]+", "", colnames(ret)[-1])) 64 | setnames(ret, colnames(ret)[-1], gsub("[ ]+", "_", colnames(ret)[-1])) 65 | 66 | setkey(ret, Inspection_ID) 67
| return(ret) 68 | } 69 | 70 | 71 | -------------------------------------------------------------------------------- /CODE/functions/calculate_confusion_values.R: -------------------------------------------------------------------------------- 1 | 2 | calculate_confusion_values <- function(actual, expected, r){ 3 | res <- expected > r 4 | true_pos <- res & actual 5 | true_neg <- !res & !actual 6 | false_pos <- res & !actual 7 | false_neg <- !res & actual 8 | 9 | result <- c(r = r, 10 | true_pos = sum(true_pos) / length(res), 11 | true_neg = sum(true_neg) / length(res), 12 | false_neg = sum(false_neg) / length(res), 13 | false_pos = sum(false_pos) / length(res)) 14 | return(result) 15 | } 16 | 17 | -------------------------------------------------------------------------------- /CODE/functions/calculate_heat_values.R: -------------------------------------------------------------------------------- 1 | 2 | ## The following fields must be present: 3 | ## inspections$Inspection_ID 4 | ## inspections$Inspection_Date 5 | ## inspections$Latitude 6 | ## inspections$Longitude 7 | ## observed_values$Latitude 8 | ## observed_values$Longitude 9 | ## observed_values$Date 10 | 11 | calculate_heat_values <- function(inspections, 12 | observed_values, 13 | window = 90, 14 | page_limit = 500, 15 | verbose = TRUE){ 16 | require(data.table) 17 | 18 | obs_cols <- c("Date", "Latitude", "Longitude") 19 | insp_cols <- c("Inspection_ID", "Inspection_Date", "Latitude", "Longitude") 20 | 21 | ## Check for required columns 22 | if(!all(obs_cols %in% colnames(observed_values))) { 23 | stop(paste0("observed_values is missing one of these columns: \n", 24 | " 'Latitude', 'Longitude', 'Date'")) 25 | } 26 | if(!all(insp_cols %in% colnames(inspections))) { 27 | stop(paste0("inspections is missing one of these columns: \n", 28 | " 'Inspection_ID', 'Inspection_Date', 'Latitude', 'Longitude'")) 29 | } 30 | 31 | ## Subset 32 | inspections <- inspections[ , insp_cols, with=F] 33 | observed_values <- observed_values[ , obs_cols, with=F] 34 | 35 | ## Filter out NA values 36 | observed_values <- na.omit(observed_values) 37 | 38 | ## Create index values for pages 39 | N <- nrow(inspections) 40 | START_ROWS <- seq(1, N, page_limit) 41 | END_ROWS <- c(seq(1, N, page_limit)[-1] - 1, N) 42 | II <- mapply(`:`, START_ROWS, END_ROWS) 43 | 44 | ret <- rbindlist(lapply(II, function(ii) { 45 | if(verbose){ 46 | print(paste(sys.call()[2], "out of", length(II))) 47 | } 48 | foverlaps( 49 | x = inspections[i = ii, 50 | j = list(Inspection_ID, 51 | Latitude, 52 | Longitude), 53 | keyby = list(start = Inspection_Date - window, 54 | end = Inspection_Date)], 55 | y = observed_values[i = TRUE, 56 | j = list(Latitude, Longitude), 57 | keyby = list(start = Date, end = Date)], 58 | type = "any")[ , kde(new=c(i.Latitude[1], i.Longitude[1]), 59 | x = Latitude, 60 | y = Longitude, 61 | h = c(.01, .01)), 62 | keyby = Inspection_ID]})) 63 | setkey(ret, Inspection_ID) 64 | setnames(ret, "V1", "heat_values") 65 | return(ret) 66 | } 67 | 68 | -------------------------------------------------------------------------------- /CODE/functions/calculate_violation_matrix.R: -------------------------------------------------------------------------------- 1 | 2 | ## Requires a vector of violations in text format, 3 | ## where each element is a collapsed list of violations 4 | ## separated by | 5 | 6 | calculate_violation_matrix <- function(violation_text){ 7 | 8 | require(data.table) 9 | 10 | ## Tabulate violation types 11 | ## 1) Split violation description by "|" 12 | ## 2) use regex to
extract leading digits of code number 13 | ## 3) create indicator matrix of code violations 14 | vio <- strsplit(violation_text,"| ",fixed=T) 15 | vio_nums <- lapply(vio, 16 | function(item) regmatches(x = item, 17 | m = gregexpr(pattern = "^[0-9]+", 18 | text = item))) 19 | vio_mat <- geneorama::list2matrix(vio_nums, count = T) 20 | vio_mat <- vio_mat[ , order(as.numeric(colnames(vio_mat)))] 21 | # colnames(vio_mat) 22 | # range(vio_mat) 23 | 24 | return(vio_mat) 25 | } 26 | 27 | -------------------------------------------------------------------------------- /CODE/functions/calculate_violation_types.R: -------------------------------------------------------------------------------- 1 | 2 | ## Requirements for calculate_violation_types: 3 | ## 1 Requires a matrix (within a keyed data.table) of violation 4 | ## indicators where each row contains 0 or 1 to indicate the presence 5 | ## of a violation. There should be columns 1:44, named "1" to "44" 6 | ## (plus a key column) corresponding to the 44 violation types found in 7 | ## the data. 8 | ## 9 | ## Uses the function calculate_violation_matrix 10 | ## to calculate intermediate result (indicator matrix) 11 | ## 12 | 13 | calculate_violation_types <- function(violation_mat){ 14 | 15 | require(data.table) 16 | 17 | ## Check that violation_mat is a data.table 18 | if(!inherits(x = violation_mat, what = "data.table")){ 19 | stop("violation_mat should be a data.table, and have a defined key") 20 | } 21 | 22 | ## Check for the key 23 | if(length(key(violation_mat)) == 0) { 24 | stop("The violation matrix should have a defined key") 25 | } 26 | 27 | vio_mat <- as.matrix(nokey(violation_mat)) 28 | vio_key <- violation_mat[ , key(violation_mat), with = FALSE] 29 | 30 | ## Tabulate violation types 31 | ## use apply to total up each group of code violations 32 | criticalCount <- apply(vio_mat[ , colnames(vio_mat) %in% 1:14], 1, sum) 33 | seriousCount <- apply(vio_mat[ , colnames(vio_mat) %in% 15:29], 1, sum) 34 | minorCount <- apply(vio_mat[ , colnames(vio_mat) %in% 30:44], 1, sum) 35 | 36 | ## Construct return values 37 | ret <- data.table(criticalCount, 38 | seriousCount, 39 | minorCount, 40 | vio_key, 41 | key = key(violation_mat)) 42 | return(ret) 43 | } 44 | -------------------------------------------------------------------------------- /CODE/functions/categorize.R: -------------------------------------------------------------------------------- 1 | 2 | #' 3 | #' @name categorize 4 | #' @param primary: a named list of the categories to convert 5 | #' @param other: A single character name for "other", default value is "Other" 6 | #' @param ... optional arguments passed to grep 7 | #' 8 | #' @description 9 | #' Transforms a vector of categories to preferred names, and makes non-preferred 10 | #' categories into an "other" category. The "other" category can be named, but 11 | #' the default label is "Other". 12 | #' 13 | #' @details 14 | #' The labels are found using `grep`, and the user can supply additional 15 | #' arguments through the ... argument (e.g. ignore.case = TRUE). 16 | #' 17 | #' Warning: the matching gets complicated if one string is contained in another; 18 | #' the first pattern will prevail. See example. 19 | #' 20 | #' @note 21 | #' This might already be more efficiently implemented in base R (or some other 22 | #' package).
23 | #' 24 | #' @example 25 | #' categorize(x = c("restaurant", "Restaurant and bar", "grocery", 26 | #' "Grocery Store", "stadium", "school", "church", 27 | #' "high school", "school restaurant"), 28 | #' primary = list(Restaurant = "restaurant", 29 | #' Grocery_Store = "grocery", 30 | #' "School"), 31 | #' other = "None of the above", 32 | #' ignore.case = TRUE) 33 | #' 34 | 35 | 36 | categorize <- function(x, primary, other = "Other", ...){ 37 | primary <- rev(primary) 38 | ## initialize a new vector ret to be returned 39 | ret <- vector(mode=mode(x), length = length(x)) 40 | ret[] <- NA 41 | ## Get category names 42 | if(is.null(names(primary))){ 43 | ## If no names are specified use "primary" directly 44 | cat_names <- unlist(primary) 45 | } else { 46 | ## If names are specified use primary's names, unless missing 47 | cat_names <- names(primary) 48 | for(i in 1:length(primary)){ 49 | if(cat_names[i] == "") cat_names[i] <- primary[[i]] 50 | } 51 | } 52 | ## Set the new vector to the primary category when it matches a primary name 53 | for(i in 1:length(primary)){ 54 | ret[grep(paste0("^" , primary[i], "$"), x, ...)] = cat_names[i] 55 | } 56 | ## otherwise set it to the "other" label 57 | ret[which(is.na(ret))] <- other 58 | 59 | return(ret) 60 | } 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /CODE/functions/filter_business.R: -------------------------------------------------------------------------------- 1 | filter_business <- function(business){ 2 | 3 | ##============================================================================== 4 | ## Filter rows 5 | ##============================================================================== 6 | ## Remove rows with NA values 7 | business <- business[!is.na(LICENSE_TERM_START_DATE)] 8 | business <- business[!is.na(LICENSE_TERM_EXPIRATION_DATE)] 9 | ## Keep only certain application types 10 | business <- business[!(APPLICATION_TYPE %in% c("C_CAPA","C_SBA"))] 11 | ## Remove duplicate ids 12 | business <- business[!duplicated(ID)] 13 | 14 | ##============================================================================== 15 | ## Remove columns that are never used 16 | ##============================================================================== 17 | business$SITE_NUMBER <- NULL 18 | business$APPLICATION_CREATED_DATE <- NULL 19 | business$APPLICATION_REQUIREMENTS_COMPLETE <- NULL 20 | business$PAYMENT_DATE <- NULL 21 | business$CONDITIONAL_APPROVAL <- NULL 22 | business$LICENSE_APPROVED_FOR_ISSUANCE <- NULL 23 | business$DATE_ISSUED <- NULL 24 | business$LICENSE_STATUS_CHANGE_DATE <- NULL 25 | business$SSA <- NULL 26 | business$LOCATION <- NULL 27 | 28 | ## Return results 29 | return(business) 30 | } 31 | -------------------------------------------------------------------------------- /CODE/functions/filter_crime.R: -------------------------------------------------------------------------------- 1 | filter_crime <- function(crime){ 2 | crime <- crime[Date>as.IDate('2011-07-01')] 3 | crime <- crime[!(is.na(Latitude) | is.na(Longitude) | is.na(Date))] 4 | crime 5 | } 6 | -------------------------------------------------------------------------------- /CODE/functions/filter_foodInspect.R: -------------------------------------------------------------------------------- 1 | filter_foodInspect <- function(foodInspect){ 2 | foodInspect <- foodInspect[!is.na(Inspection_Date) & !is.na(License)] 3 | if(foodInspect[,any(duplicated(Inspection_ID))]){ 4 | warning(paste0("Removing ",
nrow(foodInspect[duplicated(Inspection_ID)]), 5 | " duplicated records from foodInspect ", 6 | "of ", nrow(foodInspect), " total records.\n", 7 | "Duplication is based on the Inspection_ID")) 8 | ## Remove any duplicated Inspection_ID values 9 | foodInspect <- foodInspect[!duplicated(Inspection_ID)] 10 | } 11 | foodInspect <- foodInspect[License != 0] 12 | foodInspect <- foodInspect[Inspection_Date > as.IDate("2011-09-01")] 13 | foodInspect <- foodInspect[Inspection_Type == "Canvass"] 14 | foodInspect <- foodInspect[!Results %in% c('Out of Business', 15 | 'Business Not Located', 16 | 'No Entry')] 17 | foodInspect 18 | } 19 | -------------------------------------------------------------------------------- /CODE/functions/filter_garbageCarts.R: -------------------------------------------------------------------------------- 1 | filter_garbageCarts <- function(garbageCarts){ 2 | garbageCarts <- garbageCarts[!is.na(Latitude) & !is.na(Longitude) & !is.na(Creation_Date)] 3 | garbageCarts <- garbageCarts[Status %in% c("Completed", "Open")] 4 | garbageCarts 5 | } 6 | -------------------------------------------------------------------------------- /CODE/functions/filter_sanitationComplaints.R: -------------------------------------------------------------------------------- 1 | filter_sanitationComplaints <- function(sanitationComplaints) { 2 | sanitationComplaints <- sanitationComplaints[!is.na(Latitude) & !is.na(Longitude) & !is.na(Creation_Date)] 3 | sanitationComplaints <- sanitationComplaints[Status %in% c("Completed", "Open")] 4 | sanitationComplaints 5 | } 6 | -------------------------------------------------------------------------------- /CODE/functions/find_bus_id_matches.R: -------------------------------------------------------------------------------- 1 | 2 | ## 3 | ## Match food inspection ids to business license ids 4 | ## based on when the inspection happened in relation 5 | ## to the business license renewal cycle. 6 | ## 7 | 8 | find_bus_id_matches <- function(business, foodInspect) { 9 | # browser() 10 | ## Many food businesses are inspected before they are issued a 11 | ## license, so we need to move back the "License term start date" for 12 | ## newly issued licenses. An adjustment of 365 days seems pretty 13 | ## reasonable, and creates lots of matches. 14 | ## Note also the `as.integer` call, which is needed to avoid a warning.
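## Illustrative example (hypothetical dates, not drawn from the data): with
## the 365-day adjustment below, a license issued with
## LICENSE_TERM_START_DATE = 2013-06-01 is treated as if its term began on
## 2012-06-01, so an early inspection on 2013-03-15 still falls inside the
## term and can match on LICENSE_NUMBER in the foverlaps() join.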
15 | business_copy <- copy(business) 16 | business_copy <- business_copy[APPLICATION_TYPE=="ISSUE", 17 | LICENSE_TERM_START_DATE := 18 | as.integer(LICENSE_TERM_START_DATE - 365)] 19 | 20 | ## Merge over time periods 21 | dat <- foverlaps(foodInspect[i = TRUE, 22 | j = Inspection_ID, 23 | keyby = list(License, 24 | Inspection_Date = Inspection_Date, 25 | Inspection_Date_end = Inspection_Date)], 26 | business_copy[i = LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE, 27 | j = list(ID), 28 | keyby = list(LICENSE_NUMBER, 29 | LICENSE_TERM_START_DATE, 30 | LICENSE_TERM_EXPIRATION_DATE)], 31 | mult = "first", 32 | type = "any", 33 | nomatch = NA) 34 | return(dat[ , list(Inspection_ID, ID)]) 35 | } 36 | -------------------------------------------------------------------------------- /CODE/functions/gini.R: -------------------------------------------------------------------------------- 1 | # function to calculate the normalized gini coefficient 2 | gini <- function(p,y, plot=FALSE){ 3 | sorted <- cbind(p,y) 4 | y.val <- sum(y) 5 | sorted <- cbind(sorted[order(-p),-1],c(1:y.val,rep(y.val,nrow(sorted)-y.val))) 6 | sorted <- cbind(sorted,cumsum(sorted[,1])) 7 | sorted <- cbind(sorted,cumsum(rep(y.val/length(y),length(y)))) 8 | csum <- colSums(sorted[,2:4]) 9 | gini <- (csum[2]-csum[3])/(csum[1]-csum[3]) 10 | 11 | if (plot){ 12 | plot( 13 | x=1:nrow(sorted)/nrow(sorted), 14 | y=sorted[,3]/y.val, 15 | main="Cumulative Captured", 16 | xlab="% Investigated", 17 | ylab="% Captured") 18 | lines(x=1:nrow(sorted)/nrow(sorted),y=sorted[,2]/y.val) 19 | text(x=0.8,y=0.2,labels=paste("Gini: ",round(gini*100,1),"%",sep="")) 20 | 21 | } 22 | gini 23 | } 24 | -------------------------------------------------------------------------------- /CODE/functions/kde.R: -------------------------------------------------------------------------------- 1 | #function to get temperature on heat map at a new point 2 | kde <- function (new, x, y, h) 3 | { 4 | nx <- length(x) 5 | if (length(y) != nx) 6 | stop("data vectors must be the same length") 7 | if (any(!is.finite(x)) || any(!is.finite(y))) 8 | stop("missing or infinite values in the data are not allowed") 9 | h <- if (missing(h)) 10 | c(bandwidth.nrd(x), bandwidth.nrd(y)) 11 | else rep(h, length.out = 2L) 12 | h <- h/4 13 | ax <- (new[1]-x)/h[1L] 14 | ay <- (new[2]-y)/h[2L] 15 | z <- tcrossprod(matrix(dnorm(ax), , nx), matrix(dnorm(ay), 16 | , nx))/(nx * h[1L] * h[2L]) 17 | z 18 | } 19 | -------------------------------------------------------------------------------- /CODE/functions/logLik.R: -------------------------------------------------------------------------------- 1 | # function to calculate the negative log-likelihood for binomial outcomes 2 | logLik <- function(p,y) { 3 | p <- pmin(pmax(p,0.0000000000001),0.999999999999) 4 | -sum(y*log(p) + (1-y)*log(1-p)) 5 | } 6 | 7 | 8 | -------------------------------------------------------------------------------- /CODE/functions/nokey.R: -------------------------------------------------------------------------------- 1 | 2 | nokey <- function(dt, usestrings){ 3 | require(data.table) 4 | 5 | if(!inherits(x = dt, what = "data.table")){ 6 | stop("dt should be a data.table") 7 | } 8 | 9 | if(haskey(dt)){ 10 | ret <- dt[, .SD, .SDcols = -key(dt)] 11 | } else { 12 | ret <- dt 13 | } 14 | return(ret) 15 | } 16 | 17 | -------------------------------------------------------------------------------- /CODE/functions/simulated_bin_summary.R: -------------------------------------------------------------------------------- 1 | 2 | simulated_bin_summary <-
function(dates, scores, positives) { 3 | require(data.table) 4 | 5 | ##-------------------------------------------------------------------------- 6 | ## Create base data 7 | ##-------------------------------------------------------------------------- 8 | ## Assemble components into a data.table for convenience 9 | dat <- data.table(date=dates, score=scores, positives) 10 | 11 | ##-------------------------------------------------------------------------- 12 | ## Add simulated result 13 | ##-------------------------------------------------------------------------- 14 | 15 | ## This assumes inspectors would perform the same number of inspections per 16 | ## day as they had in their original schedule. 17 | 18 | ## The simulated date is a vector of the original dates in sequential 19 | ## order, but matched to the original data ordered by score. 20 | dat <- dat[order(-score), simulated_date := dat[order(date), date]][] 21 | 22 | ##-------------------------------------------------------------------------- 23 | ## Simulate inspector assignment to bins according to score 24 | ##-------------------------------------------------------------------------- 25 | 26 | ## Create bin summary of results 27 | 28 | ## Calculate original counts / day 29 | ## Note: Using `keyby` forces the result to be sorted, which is necessary 30 | ## for the cumulative summation. 31 | bins <- dat[i = TRUE, 32 | j = list(POS = sum(positives), 33 | .N), 34 | keyby = date] 35 | bins <- bins[ , NTOT := cumsum(N)][] 36 | bins <- bins[ , POSTOT := cumsum(POS)][] 37 | 38 | ## Calculate simulated counts / day 39 | ## Note: Here we key by the simulated date 40 | bins_sim <- dat[i = TRUE, 41 | j = list(POS = sum(positives), 42 | .N), 43 | keyby = simulated_date] 44 | bins_sim <- bins_sim[ , NTOT := cumsum(N)][] 45 | bins_sim <- bins_sim[ , POSTOT := cumsum(POS)][] 46 | 47 | setcolorder(bins, c("date", "N", "NTOT", "POS", "POSTOT")) 48 | bins <- merge(bins, 49 | bins_sim[i = TRUE, 50 | j = list(POS_SIM = POS, 51 | POSTOT_SIM = POSTOT), 52 | keyby = list(date = simulated_date)], 53 | "date") 54 | return(bins) 55 | 56 | } 57 | -------------------------------------------------------------------------------- /CODE/functions/simulated_date_diff_mean.R: -------------------------------------------------------------------------------- 1 | 2 | simulated_date_diff_mean <- function(dates, scores, pos) { 3 | require(data.table) 4 | 5 | ##-------------------------------------------------------------------------- 6 | ## Create base data 7 | ##-------------------------------------------------------------------------- 8 | ## Assemble components into a data.table for convenience 9 | dat <- data.table(date=dates, score=scores, pos) 10 | 11 | ##-------------------------------------------------------------------------- 12 | ## Add simulated result 13 | ##-------------------------------------------------------------------------- 14 | 15 | ## This assumes inspectors would perform the same number of inspections per 16 | ## day as they had in their original schedule. 17 | 18 | ## The simulated date is a vector of the original dates in sequential 19 | ## order, but matched to the original data ordered by score.
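## Toy example of the line below (hypothetical values): with dates
## c(d1, d2, d3) where d1 < d2 < d3 and scores c(.2, .9, .5), ordering by
## descending score visits rows 2, 3, 1, which receive simulated dates
## d1, d2, d3 respectively -- i.e. the highest-scoring inspection is
## simulated to occur on the earliest available inspection date.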
20 | dat <- dat[order(-score), simulated_date := dat[order(date), date]][] 21 | 22 | ## The time difference is the difference between the original date and 23 | ## the simulated date, for occurrences that are "positive" 24 | mean_diff <- dat[pos==1, mean(date - simulated_date)] 25 | 26 | ##-------------------------------------------------------------------------- 27 | ## Return average date difference 28 | ##-------------------------------------------------------------------------- 29 | return(mean_diff) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /CODE/functions/weather_3day_calc.R: -------------------------------------------------------------------------------- 1 | 2 | weather_3day_calc <- function(weather){ 3 | 4 | require(data.table) 5 | # str(weather) 6 | weather <- as.data.frame(weather) 7 | nr <- nrow(weather) 8 | weather <- weather[order(-as.numeric(weather$date)), ] 9 | threeDay <- weather[2:(nr-2), colnames(weather) != "date"] + 10 | weather[3:(nr - 1), colnames(weather) != "date"] + 11 | weather[4:(nr - 0), colnames(weather) != "date"] 12 | threeDay <- threeDay / 3 13 | threeDay$date <- weather$date[1:(nr-3)] 14 | 15 | threeDay <- as.data.table(threeDay) 16 | threeDay[ , date := as.IDate(date, format="%m/%d/%y")] 17 | setnames(threeDay, 'date', "Inspection_Date") 18 | 19 | threeDay <- threeDay[, .SD, keyby=Inspection_Date] 20 | 21 | return(threeDay) 22 | } 23 | -------------------------------------------------------------------------------- /CODE/not used/addTextMining.R: -------------------------------------------------------------------------------- 1 | 2 | library(plyr) 3 | library(tm) 4 | library(SnowballC) 5 | library(lda) 6 | library(ggplot2) 7 | library(reshape2) 8 | 9 | 10 | addressCount <- ddply(foodInspect,c("doing_business_as_name"),summarize,addressCount=length(unique(address))) 11 | foodInspect <- merge( 12 | x=foodInspect, 13 | y=addressCount, 14 | by="doing_business_as_name", 15 | all.x=TRUE, 16 | all.y=FALSE) 17 | foodInspect$addressCount <- pmin(foodInspect$addressCount,5) 18 | 19 | wardCount <- ddply(foodInspect,c("doing_business_as_name"),summarize,wardCount=length(unique(ward))) 20 | foodInspect <- merge( 21 | x=foodInspect, 22 | y=wardCount, 23 | by="doing_business_as_name", 24 | all.x=TRUE, 25 | all.y=FALSE) 26 | foodInspect$wardCount <- pmin(foodInspect$wardCount,5) 27 | 28 | districtCount <- ddply(foodInspect,c("doing_business_as_name"),summarize,districtCount=length(unique(police_district))) 29 | foodInspect <- merge( 30 | x=foodInspect, 31 | y=districtCount, 32 | by="doing_business_as_name", 33 | all.x=TRUE, 34 | all.y=FALSE) 35 | foodInspect$districtCount <- pmin(foodInspect$districtCount,5) 36 | 37 | rm(districtCount,wardCount,addressCount); gc() 38 | 39 | bname <- foodInspect$doing_business_as_name 40 | bname <- tolower(bname) 41 | bname <- gsub("[[:punct:]]","",bname) 42 | bname <- gsub("[0-9]+","",bname) 43 | 44 | myStopWords <- c(stopwords("english"),"th") 45 | myStopWords <- myStopWords[!grepl("[[:punct:]]",myStopWords)] 46 | myStopRegEx <- paste("\\b",paste(myStopWords,collapse="\\b|\\b", sep=""),"\\b",sep="") 47 | 48 | bname <- gsub(myStopRegEx,"",bname) 49 | 50 | bname <- sapply(strsplit(bname," "), function(w) paste(wordStem(w),collapse=" ",sep="")) 51 | 52 | bname <- gsub("[[:space:]]+"," ",bname) 53 | bname <- gsub("^[[:space:]]+","",bname) 54 | bname <- gsub("[[:space:]]+$","",bname) 55 | 56 | 57 | words <- table(unlist(strsplit(bname, split=" "))) 58 | words <- words[order(-words)] 59 
| words <- words[words>5] 60 | 61 | name_split <- strsplit(bname, split=" ") 62 | 63 | # tf <- llply(name_split,function(x) { 64 | # n <- sum(x %in% names(words)) 65 | # if (n > 0){ 66 | # mat <- matrix(0L,nrow=2,ncol=n) 67 | # w <- factor(x[x %in% names(words)],levels=names(words)) 68 | # mat[1,] <- unclass(w) - 1L 69 | # mat[2,] <- as.integer(table(w)[x[x %in% names(words)]]) 70 | # } else { 71 | # mat <- NULL 72 | # } 73 | # return(mat) 74 | # }) 75 | 76 | 77 | 78 | #set.seed(8675309) 79 | # K <- 10 ## Num clusters 80 | # result <- lda.collapsed.gibbs.sampler(tf[!sapply(tf,is.null)], K, names(words), num.iterations=1000, alpha=0.1, eta=0.1, burnin=1000, compute.log.likelihood=TRUE) 81 | # plot(result$log.likelihoods[1,], type="l") 82 | ## Get the top words in the cluster 83 | #(top.words <- top.topic.words(result$topics, 20)) 84 | 85 | #sum(sapply(name_split, function(wds) sum(wds %in% c("eurest")))>0) 86 | #head(foodInspect$doing_business_as_name[sapply(name_split, function(wds) sum(wds %in% c("sodexo")))>0],20) 87 | 88 | 89 | foodInspect$name_length <- sapply(name_split,length) 90 | foodInspect$type_food <- "unknown" 91 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 92 | c("el", 93 | "taqueria", 94 | "lo", 95 | "de", 96 | "y", 97 | "mexican", 98 | "chipotl", 99 | "taco", 100 | "don", 101 | "pepe", 102 | "birrieria", 103 | "maria", 104 | "del", 105 | "taco", 106 | "burrito", 107 | "pollo", 108 | "feliz", 109 | "loco", 110 | "hermano" 111 | )))>0, "spanish", foodInspect$type_food) 112 | 113 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 114 | c("pizzeria", 115 | "pizza", 116 | "papa", 117 | "littl", 118 | "salad", 119 | "ristorant", 120 | "itali", 121 | "trattoria", 122 | "francesca", 123 | "di", 124 | "domino" 125 | )))>0, "italian", foodInspect$type_food) 126 | 127 | 128 | 129 | 130 | 131 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 132 | c("subwai", 133 | "sandwich", 134 | "submarin", 135 | "work", 136 | "john", 137 | "potbelli", 138 | "jimmi", 139 | "sub", 140 | "cosi", 141 | "pocket" 142 | )))>0, "sandwich", foodInspect$type_food) 143 | 144 | 145 | 146 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 147 | c("bakeri", 148 | "cafe", 149 | "corner", 150 | "dunkin", 151 | "donut", 152 | "caff", 153 | "bagel", 154 | "sweet", 155 | "cake", 156 | "pastri", 157 | "bon", 158 | "au", 159 | "coffe", 160 | "starbuck", 161 | "tea", 162 | "bread", 163 | "pancak", 164 | "juic", 165 | "fruit", 166 | "xsport" 167 | )))>0, "breakfast", foodInspect$type_food) 168 | 169 | 170 | 171 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 172 | c("burger", 173 | "hamburg", 174 | "chophous", 175 | "buffett", 176 | "kitchen", 177 | "wing", 178 | "wingstop", 179 | "wendi", 180 | "mcdonald", 181 | "fast", 182 | "donald", 183 | "castl", 184 | "box", 185 | "white", 186 | "popey", 187 | "king", 188 | "gyro", 189 | "fri", 190 | "beef", 191 | "bistro", 192 | "dog", 193 | "hot", 194 | "chicken", 195 | "fish", 196 | "restaur", 197 | "famili", 198 | "golden", 199 | "grill", 200 | "steak", 201 | "rib", 202 | "big", 203 | "pita", 204 | "indian", 205 | "india", 206 | "bombai", 207 | "kfc", 208 | "cuisin" 209 | )))>0, "dinner", foodInspect$type_food) 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 221 | c("bar", 222 | "club", 223 | "loung", 224 | "tavern", 225 | "sport", 
226 | "pub", 227 | "tap", 228 | "cantina", 229 | "taverna" 230 | )))>0, "bar", foodInspect$type_food) 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 240 | c("food", 241 | "mart", 242 | "liquor", 243 | "store", 244 | "groceri", 245 | "j", 246 | "eleven", 247 | "dollar", 248 | "walgreen", 249 | "store", 250 | "wine", 251 | "stop", 252 | "mini", 253 | "citgo", 254 | "bp", 255 | "shell", 256 | "pharmaci", 257 | "marathon", 258 | "snack" 259 | )))>0, "mart", foodInspect$type_food) 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 268 | c("hospit", 269 | "intern", 270 | "cater", 271 | "univers", 272 | "eurest", 273 | "servic", 274 | "levi", 275 | "center", 276 | "field", 277 | "hotel", 278 | "marriott", 279 | "sodexo", 280 | "aramark", 281 | "inn", 282 | "moodi", 283 | "arrang", 284 | "airlin", 285 | "associ", 286 | "institut" 287 | )))>0, "cater", foodInspect$type_food) 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 297 | c("new", 298 | "china", 299 | "thai", 300 | "chines", 301 | "cuisin", 302 | "wok", 303 | "express", 304 | "noodl", 305 | "sushi", 306 | "chines", 307 | "blue", 308 | "japanes", 309 | "rice", 310 | "express", 311 | "see", 312 | "afc", 313 | "chop", 314 | "suei", 315 | "panda", 316 | "see", 317 | "asian", 318 | "japan" 319 | )))>0, "asian", foodInspect$type_food) 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 330 | c("supermercado", 331 | "supermarket", 332 | "dominick", 333 | "jewel", 334 | "mariano", 335 | "trader", 336 | "aldi", 337 | "whole", 338 | "carniceria", 339 | "meat", 340 | "deli", 341 | "sausag", 342 | "produc", 343 | "market" 344 | )))>0, "grocery", foodInspect$type_food) 345 | 346 | 347 | 348 | 349 | #what about chocolate, or ben and jerri's 350 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 351 | c("robbin", 352 | "baskin", 353 | "ic", 354 | "icecream", 355 | "dairi", 356 | "oberwei", 357 | "shake", 358 | "dessert", 359 | "gelato", 360 | "candi", 361 | "frozen", 362 | "yogurt", 363 | "cold", 364 | "sugar", 365 | "custard", 366 | "theatr", 367 | "pastri", 368 | "cake", 369 | "pretzel" 370 | )))>0, "dessert", foodInspect$type_food) 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | foodInspect$type_food <- factor(foodInspect$type_food, 380 | levels=c("spanish","italian","sandwich", 381 | "breakfast","dinner","bar", 382 | "mart","cater","asian", 383 | "grocery","dessert","unknown")) 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | foodInspect$nondisp_ware <-ifelse(sapply(name_split, function(wds) sum(wds %in% 393 | c("ristorant", 394 | "itali", 395 | "trattoria", 396 | "francesca", 397 | "di", 398 | "cafe", 399 | "caff", 400 | "pancak", 401 | "chophous", 402 | "buffett", 403 | "kitchen", 404 | "bistro", 405 | "restaur", 406 | "famili", 407 | "golden", 408 | "steak", 409 | "rib", 410 | "indian", 411 | "india", 412 | "bombai", 413 | "cuisin", 414 | "bar", 415 | "club", 416 | "loung", 417 | "tavern", 418 | "sport", 419 | "pub", 420 | "tap", 421 | "cantina", 422 | "taverna", 423 | "hospit", 424 | "intern", 425 | "cater", 426 | "univers", 427 | "eurest", 428 | "servic", 429 | "center", 430 | "hotel", 431 | "marriott", 432 | "sodexo", 433 | "aramark", 434 | "inn", 435 | "moodi", 436 | "associ", 437 | 
"institut", 438 | "new", 439 | "china", 440 | "thai", 441 | "chines", 442 | "cuisin", 443 | "wok", 444 | "sushi", 445 | "chines", 446 | "blue", 447 | "japanes", 448 | "chop", 449 | "suei", 450 | "asian", 451 | "japan" 452 | )))>0, 1L, 0L) 453 | 454 | 455 | 456 | 457 | 458 | 459 | ## Number of documents to display 460 | # N <- 10 461 | # theme_set(theme_bw()) 462 | # 463 | # topic.proportions <- t(result$document_sums) / colSums(result$document_sums) 464 | # topic.proportions <- topic.proportions[sample(1:dim(topic.proportions)[1], N),] 465 | # 466 | # topic.proportions[is.na(topic.proportions)] <- 1 / K 467 | # 468 | # colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=".") 469 | # 470 | # topic.proportions.df <- melt(cbind(data.frame(topic.proportions), document=factor(1:N)), 471 | # variable.name="topic", 472 | # id.vars = "document") 473 | # head(topic.proportions.df) 474 | # 475 | # ggplot(data=topic.proportions.df) + geom_bar(aes(x=topic,y=value,fill=topic),stat="identity") + labs(ylab="proportion") + 476 | # theme(axis.text.x = element_text(angle=90, hjust=1)) + 477 | # coord_flip() + 478 | # facet_wrap(~ document, ncol=5) + scale_x_discrete("", breaks = NULL) 479 | 480 | # marginal <- result$topics / sum(result$topics) 481 | # marginal[marginal==0] <- 1e-15 482 | # marginal <- log(marginal/sum(marginal)) 483 | # #save(words, top.words, marginal, file="text_mine_result20140127v02.Rdata") 484 | # 485 | # counts <- sapply(tf, function(m) { 486 | # if (is.null(m)) { 487 | # dat <- data.frame(voc=names(words)[1], cnts=0) 488 | # } else { 489 | # dat <- data.frame(voc=names(words)[m[1,]+1], cnts = m[2,]) 490 | # } 491 | # dat$voc <- factor(dat$voc, levels=names(words)) 492 | # tapply(dat$cnts, dat$voc, sum, simplify=TRUE) 493 | # }) 494 | # counts[is.na(counts)] <- 0 495 | # 496 | # logLik <- marginal %*% counts 497 | # 498 | # foodInspect$topic <- apply(top.words, 2, paste, collapse=".")[apply(logLik,2,which.max)] 499 | # foodInspect$topic <- factor(foodInspect$topic, levels=apply(top.words, 2, paste, collapse=".")) 500 | # #head(foodInspect[,c("doing_business_as_name","topic")],50) 501 | # 502 | # # ratio <- t(logLik)/logLik[1,] - 1 503 | # # ratio <- ratio[,-1] 504 | # # colnames(ratio) <- paste("lr",apply(top.words, 2, paste, collapse=".")[-1], sep="_") 505 | # # foodInspect <- cbind(foodInspect,ratio) 506 | # 507 | # logLik <- t(logLik) 508 | # colnames(logLik) <- paste("lr",apply(top.words, 2, paste, collapse="."), sep="_") 509 | # foodInspect <- cbind(foodInspect,logLik) 510 | 511 | 512 | # tf_idf <- ldply(strsplit(bname, split=" "),function(x) { 513 | # table(factor(x[x %in% names(words)],levels=names(words))) 514 | # }) 515 | # ##idf <- sapply(tf_idf,function(col) sum(col!=0)) 516 | # ##idf <- log(nrow(tf_idf)/idf) 517 | # ##l_ply(1:length(idf), function(i) {tf_idf[,i] <<- tf_idf[,i]*idf[i] 518 | # ## invisible()}) 519 | # colnames(tf_idf) <- paste("name",colnames(tf_idf),sep="_") 520 | # foodInspect <- cbind(foodInspect,tf_idf) 521 | 522 | #rm(idf,bname,myStopRegEx,words,myStopWords,tf_idf); gc() 523 | #rm(bname,myStopRegEx,words,myStopWords,tf_idf); gc() 524 | #rm(bname,myStopRegEx,words,myStopWords,tf,counts,logLik,ratio,marginal,result,K); gc() 525 | rm(bname,myStopRegEx,words,myStopWords,name_split); gc() 526 | -------------------------------------------------------------------------------- /CODE/not used/development_of_heat_merge.R: -------------------------------------------------------------------------------- 1 | foodInspect$heat_burglary <- 
merge_heat(events=burglary, dateCol="date", window=90, nGroups=CPUs) 2 | crime 3 | 4 | dat 5 | CUR <- dat[1789]$Inspection_Date 6 | LAT <- dat[1789]$Latitude 7 | LON <- dat[1789]$Longitude 8 | 9 | WIN <- 90 10 | 11 | ## "Current" crime 12 | crime[Date >= (CUR - WIN) & Date <= CUR, 13 | list(Latitude, Longitude), 14 | keyby = Date] 15 | ## KDE based on "Current" crime and current lat/long 16 | crime[Date > (CUR - WIN) & Date < CUR, 17 | kde(new=c(LAT, LON), x=Latitude, y=Longitude, h = c(.01, .01))] 18 | 19 | mydat <- dat[1789:1810] 20 | 21 | ## Mydat (relevant fields only) 22 | mydat[ , Inspection_ID, keyby = list(Begin = Inspection_Date - WIN, 23 | End = Inspection_Date, 24 | Latitude, 25 | Longitude)] 26 | ## Merge 27 | foverlaps(x = mydat[i = 1, 28 | j = list(Inspection_ID, 29 | Latitude, 30 | Longitude), 31 | keyby = list(start = Inspection_Date - WIN, 32 | end = Inspection_Date)], 33 | y = crime[i = TRUE, 34 | list(Latitude, 35 | Longitude), 36 | keyby=list(start = Date, 37 | end = Date)], 38 | type = "any") 39 | 40 | 41 | foverlaps(x = mydat[, j = list(Inspection_ID, Latitude, Longitude), 42 | keyby = list(start = Inspection_Date - WIN, end = Inspection_Date)], 43 | y = crime[i = TRUE, list(Latitude, Longitude), 44 | keyby=list(start = Date, end = Date)], 45 | type = "any")[ , kde(new=c(i.Latitude[1], i.Longitude[1]), 46 | x = Latitude, y = Longitude, h = c(.01, .01)), 47 | keyby = Inspection_ID] 48 | crime[Date >= (CUR - WIN) & Date <= CUR, 49 | kde(new=c(LAT, LON), x=Latitude, y=Longitude, h = c(.01, .01))] 50 | 51 | -------------------------------------------------------------------------------- /CODE/not used/heat.R: -------------------------------------------------------------------------------- 1 | #filter to events within a date, and get heat 2 | heat <- function(date,lat,long, data, dateColumnName, days){ 3 | out <- tryCatch({ 4 | biz <- data[data[,dateColumnName]>(date - days*60*60*24) & 5 | data[,dateColumnName]<=date,] -------------------------------------------------------------------------------- /CODE/not used/liveReadInBldgPermits.R: -------------------------------------------------------------------------------- 143 | mySubset <- "&$where=issue_date>to_floating_timestamp('2011-01-01T01:00:01')" 144 | 145 | permits <- liveReadCSV(key="ydr8-5enu", header=myheader, filter=mySubset, dateSetting=setting) 146 | 147 | permits <- subset(permits, !is.na(latitude) & !is.na(longitude) & !is.na(issue_date)) 148 | permits$permit_type <- factor(permits$permit_type,levels=names(table(permits$permit_type)[order(-table(permits$permit_type))])) 149 | print(table(permits$permit_type)) 150 | 151 | rm(myheader, mySubset, setting); gc() 152 | 153 | 154 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInBldgViolations.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | myheader <- matrix(c( 4 | "id","NULL", 5 | "violation_last_modified_date","NULL", 6 | "violation_date","violation.date", 7 | "violation_code","NULL", 8 | "violation_status","NULL", 9 | "violation_status_date","NULL", 10 | "violation_description","character", 11 | "violation_location","NULL", 12 | "violation_inspector_comments","NULL", 13 | "violation_ordinance","character", 14 | "inspector_id","NULL", 15 | "inspection_number","NULL", 16 | "inspection_status","NULL", 17 | "inspection_waived","NULL", 18 | "inspection_category","character", 19 | "department_bureau","NULL", 20 | "address","NULL", 21 | "property_group","NULL", 22 | "latitude", "numeric", 23 | "longitude", "numeric", 24 | "location", "NULL" 25 | ), ncol=2, byrow=TRUE) 26 | 27 | 28 | setting <- list(name="violation.date", 29 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 30 | ) 31 | 32 | 33 |
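## Note on the "$where" string below: it appears to be a Socrata SoQL filter
## clause, passed to liveReadCSV() through the `filter` argument, so that
## only violations dated after 2011-01-01 are downloaded from the data
## portal rather than filtering the full data set locally.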
mySubset <- "&$where=violation_date>to_floating_timestamp('2011-01-01T01:00:01')" 34 | 35 | bldgViolations <- liveReadCSV(key="22u3-xenr", header=myheader, filter=mySubset, dateSetting=setting) 36 | 37 | bldgViolations <- subset(bldgViolations, !is.na(latitude) & !is.na(longitude) & !is.na(violation_date)) 38 | bldgViolations$violation_description<- factor(bldgViolations$violation_description,levels=names(table(bldgViolations$violation_description)[order(-table(bldgViolations$violation_description))])) 39 | bldgViolations$violation_ordinance<- factor(bldgViolations$violation_ordinance,levels=names(table(bldgViolations$violation_ordinance)[order(-table(bldgViolations$violation_ordinance))])) 40 | bldgViolations$inspection_category<- factor(bldgViolations$inspection_category,levels=names(table(bldgViolations$inspection_category)[order(-table(bldgViolations$inspection_category))])) 41 | #print(table(bldgViolations$violation_description)) 42 | #print(table(bldgViolations$violation_ordinance)) 43 | #print(table(bldgViolations$inspection_category)) 44 | 45 | 46 | 47 | rm(myheader, mySubset, setting); gc() 48 | 49 | 50 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInGraffiti.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | myheader <- matrix(c( 4 | "creation_date", "graffiti.date", 5 | "status", "character", 6 | "completion_date", "NULL", 7 | "service_request_number", "NULL", 8 | "type_of_service_request", "NULL", 9 | "what_type_of_suface_is_the_graffiti_on_", "NULL", 10 | "where_is_the_graffiti_located_", "NULL", 11 | "street_address", "NULL", 12 | "zip_code", "NULL", 13 | "x_coordinate", "NULL", 14 | "y_coordinate", "NULL", 15 | "ward", "NULL", 16 | "police_district", "NULL", 17 | "community_area", "NULL", 18 | "latitude", "numeric", 19 | "longitude", "numeric", 20 | "location", "NULL" 21 | ), ncol=2, byrow=TRUE) 22 | 23 | 24 | setting <- list(name="graffiti.date", 25 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 26 | ) 27 | 28 | 29 | mySubset <- "&$where=1=1" 30 | 31 | graffiti <- liveReadCSV(key="hec5-y4x5", header=myheader, filter=mySubset, dateSetting=setting) 32 | 33 | graffiti <- subset(graffiti, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 34 | graffiti <- subset(graffiti, status %in% c("Completed", "Open")) 35 | graffiti$status <- NULL 36 | 37 | rm(myheader, mySubset, setting); gc() 38 | 39 | 40 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInLightsOut.R: -------------------------------------------------------------------------------- 1 | 2 | myheader <- matrix(c( 3 | "creation_date", "lights.date", 4 | "status", "character", 5 | "completion_date", "NULL", 6 | "service_request_number", "NULL", 7 | "type_of_service_request", "character", 8 | "street_address", "NULL", 9 | "zip_code", "NULL", 10 | "x_coordinate", "NULL", 11 | "y_coordinate", "NULL", 12 | "ward", "NULL", 13 | "police_district", "NULL", 14 | "community_area", "NULL", 15 | "latitude", "numeric", 16 | "longitude", "numeric", 17 | "location", "NULL" 18 | ), ncol=2, byrow=TRUE) 19 | 20 | 21 | 22 | setting <- list(name="lights.date", 23 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 24 | ) 25 | 26 | 27 | mySubset <- "&$where=1=1" 28 | 29 | 30 | alleyLights <- liveReadCSV(key="t28b-ys7j", header=myheader, filter=mySubset, dateSetting=setting) 31 | streetLights <- liveReadCSV(key="3aav-uy2v", header=myheader, 
filter=mySubset, dateSetting=setting) 32 | 33 | 34 | alleyLights <- subset(alleyLights, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 35 | alleyLights <- subset(alleyLights, status %in% c("Completed", "Open")) 36 | alleyLights$status <- NULL 37 | 38 | streetLights <- subset(streetLights, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 39 | streetLights <- subset(streetLights, status %in% c("Completed", "Open")) 40 | streetLights$status <- NULL 41 | 42 | lightsOut <- rbind(alleyLights,streetLights) 43 | 44 | rm(myheader, mySubset, setting, alleyLights, streetLights); gc() 45 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInPotHoles.R: -------------------------------------------------------------------------------- 1 | 2 | myheader <- matrix(c( 3 | "creation_date", "pothole.date", 4 | "status", "character", 5 | "completion_date", "NULL", 6 | "service_request_number", "NULL", 7 | "type_of_service_request", "NULL", 8 | "currentActivity", "NULL", 9 | "most_recent_action", "NULL", 10 | "number_of_potholes_filled_on_block", "NULL", 11 | "street_address", "NULL", 12 | "zip", "NULL", 13 | "x_coordinate", "NULL", 14 | "y_coordinate", "NULL", 15 | "ward", "NULL", 16 | "police_district", "NULL", 17 | "community_area", "NULL", 18 | "latitude", "numeric", 19 | "longitude", "numeric", 20 | "location", "NULL" 21 | ), ncol=2, byrow=TRUE) 22 | 23 | 24 | 25 | setting <- list(name="pothole.date", 26 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 27 | ) 28 | 29 | 30 | mySubset <- "&$where=1=1" 31 | 32 | 33 | potHoles <- liveReadCSV(key="7as2-ds3y", header=myheader, filter=mySubset, dateSetting=setting) 34 | 35 | 36 | potHoles <- subset(potHoles, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 37 | potHoles <- subset(potHoles, status %in% c("Completed", "Open")) 38 | potHoles$status <- NULL 39 | 40 | rm(myheader, mySubset, setting); gc() 41 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInRodents.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | myheader <- matrix(c( 4 | "creation_date", "rodent.date", 5 | "status", "character", 6 | "completion_date", "NULL", 7 | "service_request_number", "NULL", 8 | "type_of_service_request", "NULL", 9 | "number_of_premises_baited", "NULL", 10 | "number_of_premises_with_garbage", "NULL", 11 | "number_of_premises_with_rats","NULL", 12 | "current_activity","NULL", 13 | "most_recent_action","NULL", 14 | "street_address", "NULL", 15 | "zip_code", "NULL", 16 | "x_coordinate", "NULL", 17 | "y_coordinate", "NULL", 18 | "ward", "NULL", 19 | "police_district", "NULL", 20 | "community_area", "NULL", 21 | "latitude", "numeric", 22 | "longitude", "numeric", 23 | "location", "NULL" 24 | ), ncol=2, byrow=TRUE) 25 | 26 | 27 | setting <- list(name="rodent.date", 28 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 29 | ) 30 | 31 | 32 | mySubset <- "&$where=1=1" 33 | 34 | rodents <- liveReadCSV(key="97t6-zrhs", header=myheader, filter=mySubset, dateSetting=setting) 35 | 36 | rodents <- subset(rodents, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 37 | rodents <- subset(rodents, status %in% c("Completed", "Open")) 38 | rodents$status <- NULL 39 | 40 | rm(myheader, mySubset, setting); gc() 41 | 42 | 43 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInTreeTrims.R: 
-------------------------------------------------------------------------------- 1 | 2 | myheader <- matrix(c( 3 | "creation_date", "treetrims.date", 4 | "status", "character", 5 | "completion_date", "NULL", 6 | "service_request_number", "NULL", 7 | "type_of_service_request", "NULL", 8 | "location_of_trees", "NULL", 9 | "street_address", "NULL", 10 | "zip_code", "NULL", 11 | "x_coordinate", "NULL", 12 | "y_coordinate", "NULL", 13 | "ward", "NULL", 14 | "police_district", "NULL", 15 | "community_area", "NULL", 16 | "latitude", "numeric", 17 | "longitude", "numeric", 18 | "location", "NULL" 19 | ), ncol=2, byrow=TRUE) 20 | 21 | 22 | 23 | setting <- list(name="treetrims.date", 24 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 25 | ) 26 | 27 | 28 | mySubset <- "&$where=1=1" 29 | 30 | 31 | treeTrims <- liveReadCSV(key="uxic-zsuj", header=myheader, filter=mySubset, dateSetting=setting) 32 | 33 | 34 | treeTrims <- subset(treeTrims, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 35 | treeTrims <- subset(treeTrims, status %in% c("Completed", "Open")) 36 | treeTrims$status <- NULL 37 | 38 | rm(myheader, mySubset, setting); gc() 39 | -------------------------------------------------------------------------------- /CODE/prep_inspectors_for_eval.R: -------------------------------------------------------------------------------- 1 | 2 | ## 3 | ## The sanitarian identities are not publicly available, so measures are taken 4 | ## to disguise their identities in this analysis. 5 | ## This script takes the original disguised data used in the evaluation and 6 | ## modifies it to match the format that became available in the city in early 7 | ## 2015 and is still available as of Dec 2017. 8 | ## 9 | 10 | ##========================================================================== 11 | ## INITIALIZE 12 | ##========================================================================== 13 | ## Remove all objects; perform garbage collection 14 | rm(list=ls()) 15 | gc(reset=TRUE) 16 | 17 | ## Load libraries & project functions 18 | geneorama::loadinstall_libraries(c("data.table")) 19 | geneorama::sourceDir("CODE/functions/") 20 | 21 | ##============================================================================== 22 | ## LOAD CACHED RDS FILES 23 | ##============================================================================== 24 | 25 | inspectors <- readRDS("DATA/inspectors.Rds") 26 | food <- readRDS("DATA/13_food_inspections.Rds") 27 | 28 | ##============================================================================== 29 | ## PROCESS INSPECTOR DATA 30 | ##============================================================================== 31 | 32 | ## Removing letters at the front; the numbers trailing the hyphen are not needed 33 | ## (e.g.
-1006 I believe is the code for retail food license) 34 | inspectors[ , License := gsub('[A-z]+|-.+$', "", License)] 35 | 36 | ## cleaning any leading zeros 37 | inspectors[ , License := gsub('^[0]+', "", License)] 38 | 39 | ## removing possibly invalid license numbers 40 | inspectors <- inspectors[nchar(License) > 3 & Inspector_Assigned != " "] 41 | 42 | ## If there are multiple inspections for the same license number, use the 43 | ## inspector from the first inspection 44 | inspectors <- inspectors[ , .N, by=list(License, Inspection_Date, Inspector_Assigned)] 45 | inspectors$N <- NULL 46 | 47 | ## Convert to integer to match other data 48 | inspectors[ , License := as.integer(License)] 49 | setkey(inspectors, License, Inspection_Date) 50 | 51 | ## Further deduplication 52 | inspectors_deduped <- inspectors[i = TRUE , 53 | j = list(Inspector_Assigned = Inspector_Assigned[1]), 54 | keyby = list(License, Inspection_Date)] 55 | 56 | ## Merge in the Inspection_ID from the food records 57 | inspectors_deduped <- merge(x = food[ , list(License, Inspection_Date, Inspection_ID)], 58 | y = inspectors_deduped, 59 | by = c("License", "Inspection_Date"), 60 | # all.x = TRUE, 61 | sort = FALSE) 62 | inspectors_deduped[duplicated(Inspection_ID)] 63 | inspectors_deduped <- inspectors_deduped[!duplicated(Inspection_ID)] 64 | 65 | ## Make the key columns of the inspector data match the output that the model 66 | ## would have gotten from COC internal systems 67 | inspectors_deduped_renamed <- inspectors_deduped[i = TRUE, 68 | j = list(sanitarian = Inspector_Assigned), 69 | keyby = list(inspectionID = Inspection_ID)] 70 | 71 | geneorama::NAsummary(inspectors_deduped_renamed) 72 | 73 | ##------------------------------------------------------------------------------ 74 | ## SAVE RESULT 75 | ##------------------------------------------------------------------------------ 76 | 77 | saveRDS(inspectors_deduped_renamed, "DATA/19_inspector_assignments.Rds") 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We welcome efforts to improve this project, and we are open to contributions for model improvements, process improvements, and general good ideas. Please use this guide to help structure your contributions and to make it easier for us to consider them and provide feedback. If we do use your work we will acknowledge your contributions to the best of our ability, and all contributions will be governed under the license terms specified in the LICENSE.md file in this project. To get started, sign the [Contributor License Agreement](https://www.clahub.com/agreements/Chicago/food-inspections-evaluation). 4 | 5 | In general we use the structure provided by GitHub for our workflow management, so if you're new to GitHub please see this guide first: https://guides.github.com/activities/contributing-to-open-source/#contributing 6 | 7 | Your contributions have the potential to make a positive impact not just on us, but on everyone who is impacted by anyone who uses this project. So, consider that a big thanks in advance. 8 | 9 | ## Reporting an Issue 10 | 11 | Food Inspections Evaluation uses [GitHub Issue Tracking](https://github.com/Chicago/food-inspections-evaluation/issues) to track issues. This is a good place to start and can be a helpful place to manage both technical and non-technical issues.
12 | 13 | ## Submitting Code Changes 14 | 15 | Please send a [GitHub Pull Request to City of Chicago](https://github.com/chicago/food-inspections-evaluation/pull/new/master) with a clear list of what you've done (read more about [pull requests](http://help.github.com/pull-requests/)). Always write a clear log message for your commits. 16 | 17 | ## Demonstrating Model Performance 18 | 19 | We welcome improvements to the analytic model that creates predictions for the Department of Public Health. The City may adopt a pull request that sufficiently improves prediction accuracy, thus allowing you to contribute to the City's inspection practice. 20 | 21 | If your pull request is to improve the model, please consider the following steps when submitting a pull request: 22 | * Identify how your model improves on prior results 23 | * Run a test using the benchmark data provided in the repository 24 | * Create a pull request which describes those improvements in the description 25 | * Work with the data science team to reproduce those results 26 | 27 | ### Training your model 28 | Train your food inspection model using data between January 2009 and 2012. Use these fits to generate a forecast of food inspections for the time period between September 2, 2014, and October 31, 2014. 29 | 30 | ### Measuring improvement 31 | The City sought to reduce the time to find critical violations. Thus, we are interested in a few key qualities in any improvements (a minimal sketch of the headline calculation appears at the end of this guide): 32 | * Your model reduces the average time to find critical violations (currently: 7.4 days) 33 | * Your model reduces the variance of the time to find critical violations (e.g., it also reduces the time by roughly 7.5 days, but with a lower standard deviation) 34 | * Similarly, your model finds all violating restaurants earlier, with none found later, even if the average time remains the same 35 | * Your model increases the proportion of violations found in the first half of the pilot (e.g., the percentage of critical violations found in September 2014). 36 | The team has calculated metrics for each one of these measures. You can investigate how these measures were calculated by referring to "Forecasting Restaurants with Critical Violations". Let us know if there are other metrics that should be considered for model improvement. 37 | 38 | ### Ability to adopt model 39 | If you would like to submit an improvement, please open a pull request that notes improvements to at least one of the aforementioned benchmarks. The data science team should be able to reproduce those results with your code. 40 | 41 | Model improvements that include new data must use data that is freely available (*gratis* or *libre*) to the City of Chicago. There must not be any terms that would prohibit the City from storing data on local servers. 42 | 43 | Likewise, by submitting a pull request, you agree that the City of Chicago will be allowed to use your code for analytic purposes and that your software will be licensed under the licensing found in LICENSE.md in this repository. 
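### A sketch of the headline metric

The snippet below is a minimal, illustrative sketch of the "average days saved" comparison described under "Measuring improvement"; the data and the column names `criticalFound`, `date_bau`, and `date_model` are hypothetical placeholders, not part of the repository. The authoritative calculations live in `REPORTS/Metric_Development.Rmd`.

```r
library(data.table)

## Hypothetical inspections that found critical violations:
## date_bau   = date the establishment was actually inspected (business as usual)
## date_model = date it would have been inspected under the model's ranking
date_comp <- data.table(
    criticalFound = c(1, 1, 1),
    date_bau      = as.IDate(c("2014-09-20", "2014-10-05", "2014-10-28")),
    date_model    = as.IDate(c("2014-09-05", "2014-09-25", "2014-10-20")))

## Average number of days sooner the critical violations would have been found
date_comp[criticalFound == 1, mean(as.numeric(date_bau - date_model))]
```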
44 | -------------------------------------------------------------------------------- /DATA/11_bus_license.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/11_bus_license.Rds -------------------------------------------------------------------------------- /DATA/12_crime.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/12_crime.Rds -------------------------------------------------------------------------------- /DATA/13_food_inspections.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/13_food_inspections.Rds -------------------------------------------------------------------------------- /DATA/14_garbage_carts.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/14_garbage_carts.Rds -------------------------------------------------------------------------------- /DATA/15_sanitation_code.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/15_sanitation_code.Rds -------------------------------------------------------------------------------- /DATA/17_mongo_weather_update.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/17_mongo_weather_update.Rds -------------------------------------------------------------------------------- /DATA/19_inspector_assignments.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/19_inspector_assignments.Rds -------------------------------------------------------------------------------- /DATA/21_food_inspection_violation_matrix.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/21_food_inspection_violation_matrix.Rds -------------------------------------------------------------------------------- /DATA/21_food_inspection_violation_matrix_nums.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/21_food_inspection_violation_matrix_nums.Rds -------------------------------------------------------------------------------- /DATA/22_burglary_heat.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/22_burglary_heat.Rds -------------------------------------------------------------------------------- /DATA/22_garbageCarts_heat.Rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/22_garbageCarts_heat.Rds -------------------------------------------------------------------------------- /DATA/22_sanitationComplaints_heat.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/22_sanitationComplaints_heat.Rds -------------------------------------------------------------------------------- /DATA/23_food_insp_features.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/23_food_insp_features.Rds -------------------------------------------------------------------------------- /DATA/24_bus_features.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/24_bus_features.Rds -------------------------------------------------------------------------------- /DATA/30_xgboost_data.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/30_xgboost_data.Rds -------------------------------------------------------------------------------- /DATA/30_xgboost_model.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/30_xgboost_model.Rds -------------------------------------------------------------------------------- /DATA/Inspection_Report_Form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/Inspection_Report_Form.pdf -------------------------------------------------------------------------------- /DATA/inspectors.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/inspectors.Rds -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright, 2014 City of Chicago 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | 9 | Libraries and other software utilized in this repository are copyrighted and distributed under their respective open source licenses. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Food Inspections Evaluation 2 | ============================ 3 | 4 | This is our model for predicting which food establishments are most at risk for the types of violations most likely to spread food-borne illness. Chicago Department of Public Health staff use these predictions to prioritize inspections. During a two-month pilot period, we found that using these predictions meant that inspectors found critical violations much faster. 5 | 6 | You can help improve the health of our city by improving this model. This repository contains a training and test set, along with the data used in the current model. 7 | 8 | Feel free to clone, fork, send pull requests and to file bugs. 9 | Please note that we will need you to agree to our Contributor License Agreement (CLA) in order to be able to use any pull requests. 10 | 11 | 12 | Original Analysis and Reports 13 | ----------------------------- 14 | In an effort to reduce the public’s exposure to foodborne illness, the [City of Chicago](https://github.com/Chicago) partnered with Allstate’s Quantitative Research & Analytics department to develop a predictive model to help prioritize the city's food inspection staff. This GitHub project is a complete working evaluation of the model, including the data that was used in the model, the code that was used to produce the statistical results, the evaluation of the validity of the results, and documentation of our methodology. 15 | 16 | The model evaluation calculates individualized risk scores for more than ten thousand Chicagoland food establishments using publicly available data, most of which is updated nightly on [Chicago’s data portal](https://data.cityofchicago.org/). The sole exception is information about the inspectors. 17 | 18 | The evaluation compares two months of Chicago’s Department of Public Health inspections to an alternative data-driven approach based on the model. The two-month evaluation period is a completely out-of-sample evaluation based on a model created using test and training data sets from prior time periods. 19 | 20 | The reports may be reproduced by compiling the knitr documents present in ``./REPORTS``. 21 | 22 | REQUIREMENTS 23 | ------------ 24 | 25 | All of the code in this project uses the open source statistical application, R. We advise that you use ```R version >= 3.1``` for best results. 26 | 27 | Ubuntu users may need to install `libssl-dev`, `libcurl4-gnutls-dev`, and `libxml2-dev`. This can be accomplished by typing the following command at the command line: 28 | `sudo apt-get install libssl-dev libcurl4-gnutls-dev libxml2-dev` 29 | 30 | The code makes extensive use of the ``data.table`` package. If you are not familiar with the package, you might want to consult the data.table [FAQ available on CRAN](http://cran.r-project.org/web/packages/data.table/vignettes/datatable-faq.pdf). 
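As a quick orientation, here is a tiny, self-contained example (with made-up data, not from this repository) of the ``DT[i, j, keyby]`` idiom that appears throughout the project code:

```r
library(data.table)

## Made-up inspection counts, purely for illustration
dt <- data.table(License       = c(10L, 10L, 20L),
                 criticalCount = c(2L, 0L, 1L))

## For each license: number of inspections and total critical violations,
## with the result keyed (and sorted) by License
dt[i = TRUE,
   j = list(N = .N, criticalTotal = sum(criticalCount)),
   keyby = list(License)]
```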
31 | 32 | 33 | FILE LAYOUT 34 | ------ 35 | 36 | The following directory structure is used: 37 | 38 | DIRECTORY | DESCRIPTION 39 | --------------------|---------------------- 40 | `.` | Project files such as README and LICENSE 41 | `./CODE/` | Sequential scripts used to develop the model 42 | `./CODE/functions/` | General function definitions, which could be used in any script 43 | `./DATA/` | Data files created by scripts in `./CODE/`, or static 44 | `./REPORTS/` | Reports and other output 45 | 46 | We have included all of the steps used to develop the model, evaluate the results, and document the results in the above directory structure. 47 | 48 | The scripts located in the `./CODE/` folder are organized sequentially, meaning that the numeric prefix indicates the order in which the script was / should be run in order to reproduce our results. 49 | 50 | Although we include all the necessary steps to download and transform the data used in the model, we also have stored a snapshot of the data in the repository. So, to run the model as it stands, it is only necessary to download the repository, install the dependencies, and step through the code in `CODE/30_glmnet_model.R`. If you do not already have them, the dependencies can be installed using the startup script `CODE/00_Startup.R`. 51 | 52 | DATA 53 | ------ 54 | 55 | Data used to develop the model is stored in the ``./DATA`` directory; most of it was originally obtained from [Chicago’s Open Data Portal](http://data.cityofchicago.org). The following datasets were used in building the analysis-ready dataset. 56 | 57 | ``` 58 | Business Licenses 59 | Food Inspections 60 | Crime 61 | Garbage Cart Complaints 62 | Sanitation Complaints 63 | Weather 64 | Sanitarian Information 65 | ``` 66 | 67 | The data sources are joined to create a tabular dataset that paints a statistical picture of a ‘business license’, the primary modelling unit (unit of observation) in this project. 68 | 69 | The data sources are joined (in an SQL-like manner) on appropriate composite keys (see the illustrative sketch at the end of this README). These keys include Inspection ID, Business License, and geography expressed as a latitude / longitude combination, among others. 70 | 71 | 72 | Acknowledgements 73 | ---------------- 74 | This research was conducted by the City of Chicago with support from the [Civic Consulting Alliance](http://www.ccachicago.org/), and [Allstate Insurance](https://www.allstate.com/). The City would especially like to thank Stephen Collins, Gavin Smart, Ben Albright, and David Crippin for their efforts in developing the predictive model. We also appreciate the help of Kelsey Burr, Christian Hines, and Kiran Pookote in coordinating this research project. We owe a special thanks to our volunteers from Allstate, who put in a tremendous effort to develop the predictive model, and to Allstate for allowing their team to volunteer for projects to change their city. This project was partially funded by an award from the Bloomberg Philanthropies' Mayors Challenge. 
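The following is a minimal, hypothetical sketch of the composite-key joins described in the DATA section above; the records and the `sanitarian` column are made up for illustration, and the real keys and columns live in the scripts under `./CODE/`.

```r
library(data.table)

## Made-up food inspection and inspector-assignment records
food <- data.table(License         = c(1L, 2L),
                   Inspection_Date = as.IDate(c("2014-09-02", "2014-09-03")),
                   Inspection_ID   = c(100L, 101L))
insp <- data.table(License         = c(1L, 2L),
                   Inspection_Date = as.IDate(c("2014-09-02", "2014-09-03")),
                   sanitarian      = c("A", "B"))

## SQL-like left join on the composite key (License, Inspection_Date)
merge(x = food, y = insp, by = c("License", "Inspection_Date"), all.x = TRUE)
```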
75 | -------------------------------------------------------------------------------- /REPORTS/.gitignore: -------------------------------------------------------------------------------- 1 | *.md 2 | *_cache/ 3 | *_files 4 | -------------------------------------------------------------------------------- /REPORTS/CountComparison_aftersecondrefactor.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Integrity Check" 3 | output: html_document 4 | --- 5 | 6 | ### Purpose 7 | 8 | This document compares monthly counts at various stages of the model process. We show how many food inspection records (and critical violations) there were 9 | 10 | 1. In the original download, 11 | 2. After filtering, and 12 | 3. After taking out any NAs (which indicate that a data source wasn't able to be merged into the food inspection data). 13 | 14 | ```{r, echo=FALSE, warning=FALSE, results='hide', message=FALSE} 15 | 16 | ## Initialize and load data... (hidden in report output) 17 | 18 | geneorama::set_project_dir("food-inspections-evaluation") 19 | ##============================================================================== 20 | ## INITIALIZE 21 | ##============================================================================== 22 | ## Remove all objects; perform garbage collection 23 | rm(list=ls()) 24 | gc(reset=TRUE) 25 | ## Detach libraries that are not used 26 | geneorama::detach_nonstandard_packages() 27 | ## Load libraries that are used 28 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 29 | ## Load custom functions 30 | geneorama::sourceDir("CODE/functions/") 31 | 32 | ##============================================================================== 33 | ## LOAD CACHED RDS FILES 34 | ##============================================================================== 35 | ## Import the key data sets used for prediction 36 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 37 | foodInspect$Violations <- NULL 38 | violation_dat <- readRDS("DATA/22_violation_dat.Rds") 39 | dat_model <- readRDS("DATA/23_dat_model.Rds") 40 | 41 | ##============================================================================== 42 | ## ADD FIELDS AND APPLY FILTER TO foodInspect 43 | ##============================================================================== 44 | ## Remove duplicate IDs first 45 | foodInspect <- foodInspect[!duplicated(Inspection_ID)] 46 | 47 | ## Limit dates 48 | foodInspect <- foodInspect[Inspection_Date > as.IDate("2011-09-01")] 49 | 50 | ## Join in the violation matrix 51 | foodInspect <- merge(x = foodInspect, 52 | y = violation_dat, 53 | by = "Inspection_ID", 54 | all.x = TRUE) 55 | 56 | foodInspect[ , criticalFound := pmin(1, criticalCount)] 57 | 58 | foodInspect[i = TRUE, 59 | j = Facility_Type_Clean := 60 | categorize(x = Facility_Type, 61 | primary = list(Restaurant = "restaurant", 62 | Grocery_Store = "grocery"), 63 | ignore.case = TRUE)] 64 | foodInspect_filtered <- filter_foodInspect(foodInspect) 65 | 66 | ##============================================================================== 67 | ## ADD FIELDS AND APPLY FILTER TO dat_model 68 | ##============================================================================== 69 | dat_model[ , criticalFound := pmin(1, criticalCount)] 70 | ## Only keep "Retail Food Establishment" 71 | dat_model <- dat_model[LICENSE_DESCRIPTION == "Retail Food Establishment"] 72 | ## Remove License Description 73 | dat_model[ , LICENSE_DESCRIPTION := NULL] 74 | dat_model <- na.omit(dat_model) 
75 | 76 | ``` 77 | 78 | ### Construct tables of counts, and critical violations found 79 | 80 | ```{r} 81 | tab_food <- foodInspect[ 82 | i = TRUE, 83 | j = list(N = .N, 84 | critFound = sum(criticalFound)), 85 | keyby = list(month = round(Inspection_Date, "month"))] 86 | 87 | tab_food_open_canvass <- foodInspect[ 88 | i = Inspection_Type == "Canvass" & 89 | !Results %in% c('Out of Business', 90 | 'Business Not Located', 91 | 'No Entry'), 92 | j = list(N = .N, 93 | critFound = sum(criticalFound)), 94 | keyby = list(month = round(Inspection_Date, "month"))] 95 | 96 | tab_food_filtered <- foodInspect_filtered[ 97 | i = TRUE, 98 | j = list(N = .N, 99 | critFound = sum(criticalFound)), 100 | keyby = list(month = round(Inspection_Date, "month"))] 101 | 102 | tab_modeldat <- dat_model[ 103 | i = TRUE, 104 | j = list(N = .N, 105 | critFound = sum(criticalFound)), 106 | keyby = list(month = round(Inspection_Date, "month"))] 107 | ``` 108 | 109 | ### Monthly counts at each step in "filtering" 110 | 111 | ```{r} 112 | tab_food 113 | tab_food_open_canvass 114 | tab_food_filtered 115 | tab_modeldat 116 | 117 | ``` 118 | 119 | ### Monthly counts at each stage in "filtering", merged into a single table 120 | 121 | ```{r} 122 | tab_all <- merge(tab_food, 123 | merge(tab_food_open_canvass, 124 | merge(tab_food_filtered, 125 | tab_modeldat, 126 | all = TRUE, 127 | suffixes = c(".Filtered", ".Model")), 128 | all=TRUE), 129 | all=TRUE, 130 | suffixes = c(".ALL", ".Open")) 131 | tab_all 132 | ``` 133 | 134 | 135 | ### Violations, tabulated by "is.na" 136 | 137 | Generally, it looks like records with incomplete data had lower violation counts. 138 | 139 | ```{r} 140 | geneorama::set_project_dir("food-inspections-evaluation") 141 | 142 | ## Read in the original dat model before any filtering is applied 143 | dat_model_orig <- readRDS("DATA/dat_model.Rds") 144 | 145 | ## Luckily the restaurants with missing business data mostly appear to have 146 | ## lower counts of critical and serious violations 147 | geneorama::NAsummary(dat_model_orig) 148 | dat_model_orig[ , table(is.na(Business_ID))] 149 | dat_model_orig[ 150 | i = TRUE, 151 | j = list(mean_critical = mean(criticalCount), sd_critical = sd(criticalCount), 152 | mean_serious = mean(seriousCount), sd_serious = sd(seriousCount), 153 | mean_minor = mean(minorCount), sd_minor = sd(minorCount)), 154 | is.na(Business_ID)] 155 | ``` 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /REPORTS/Metric_Development.Rmd: -------------------------------------------------------------------------------- 1 | ```{r, echo=FALSE, results='hide', message=FALSE} 2 | ##============================================================================== 3 | ## INITIALIZE 4 | ##============================================================================== 5 | ## Remove all objects; perform garbage collection 6 | rm(list=ls()) 7 | gc(reset=TRUE) 8 | ## Detach libraries that are not used 9 | geneorama::detach_nonstandard_packages() 10 | ## Load libraries that are used 11 | geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "splines", 12 | "knitr", "printr", "scales")) 13 | ## Load custom functions 14 | geneorama::sourceDir("CODE/functions/") 15 | 16 | ## Define melt to be the data.table melt 17 | melt <- data.table:::melt.data.table 18 | ``` 19 | 20 | ```{r, echo=FALSE, results='hide', fig.show='hide', message=FALSE} 21 | 
##============================================================================== 22 | ## RUN GLMNET MODEL 23 | ##============================================================================== 24 | 25 | ## NOTE: THE DIRECTORY NAME BELOW MUST MATCH YOUR PROJECT DIRECTORY NAME 26 | ## If you clone the project directly from github.com the project name will 27 | ## automatically be food-inspections-evaluation, but if you change the name 28 | ## you will need to update the name of the project in the set_project_dir 29 | ## call below. 30 | geneorama::set_project_dir("food-inspections-evaluation") 31 | geneorama::sourceDir("CODE/functions/") 32 | 33 | ## Overwrite the base "interactive" function to prevent complete initialization 34 | interactive <- function(){FALSE} 35 | ## Source the file that runs the glmnet model: 36 | source("CODE/30_glmnet_model.R", echo = FALSE) 37 | ## Remove the temporary local "interactive" function 38 | rm(interactive) 39 | ## Remove excess variables generated by "CODE/30_glmnet_model.R" 40 | rm(confusion_values_test, dat, errors, errorsTest, iiTest, iiTrain, 41 | lam, mm, pen, w.lam, xmat) 42 | 43 | ## These should be the only remaining objects (besides functions) imported: 44 | # coef 45 | # inspCoef 46 | # datTest 47 | # model 48 | 49 | ## List objects (besides functions) to check with comments above: 50 | geneorama::lll() 51 | ``` 52 | 53 | 54 | 55 | ```{r} 56 | opts_chunk$set(tidy = FALSE) 57 | opts_chunk$set(fig.width = 10) 58 | # opts_chunk$set(fig.height = 10) 59 | ``` 60 | 61 | 62 | ```{r, echo=FALSE, results='hide'} 63 | ##============================================================================== 64 | ## DEFINE CUSTOM GGPLOT FUNCTION 65 | ##============================================================================== 66 | 67 | # cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 68 | # "#0072B2", "#D55E00", "#CC79A7") 69 | 70 | ggplot <- function(...) { 71 | cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 72 | "#0072B2", "#D55E00", "#CC79A7") 73 | ggplot2::ggplot(...) 
+ 74 | theme_grey()+ 75 | scale_fill_manual(values=cbPalette) + 76 | scale_colour_manual(values=cbPalette) + 77 | theme(plot.title = element_text(size = 15)) 78 | 79 | } 80 | ``` 81 | 82 | ```{r, echo=FALSE, eval=FALSE} 83 | ## Testing palette 84 | # cbPalette <- c("#000000", "#E69F00", "#56B4E9", 85 | # "#009E73", "#F0E442", "#0072B2", 86 | # "#D55E00", "#CC79A7") 87 | # pie(rep(1,length(cbPalette)), col = cbPalette) 88 | ``` 89 | 90 | ## Bar plots 91 | 92 | ```{r bar_graph_comparing_BAU_and_model_in_first_half} 93 | # datTest[period==1, list(.N, sum(criticalFound))] 94 | # datTest[period_modeled==1, list(.N, sum(criticalFound))] 95 | 96 | crit_viol_rate <- data.table( 97 | Regime = c("Business As Usual", 98 | "Data Driven"), 99 | values = c(datTest[period==1, sum(criticalFound)/.N], 100 | datTest[period_modeled==1, sum(criticalFound)/.N])) 101 | crit_viol_cumulative <- data.table( 102 | Regime = c("Business As Usual", 103 | "Data Driven"), 104 | values = c(datTest[period==1, sum(criticalFound)] / datTest[ , sum(criticalFound)], 105 | datTest[period_modeled==1, sum(criticalFound)] / datTest[ , sum(criticalFound)])) 106 | 107 | ggplot(crit_viol_rate) + 108 | aes(x = Regime, y = values, fill=Regime) + 109 | labs(title=paste0('Percentage of inspections resulting in a critical violation\n', 110 | 'during Period 1\n') ) + 111 | geom_bar(stat = "identity") + 112 | # guides(fill=FALSE) + 113 | scale_y_continuous(labels = percent) + 114 | xlab("") + ylab("") 115 | 116 | ggplot(crit_viol_cumulative) + 117 | aes(x = factor(Regime), y = values, fill=Regime) + 118 | labs(title=paste0('Percentage of period 1 & period 2 critical violations\n', 119 | 'found in Period 1\n') ) + 120 | geom_bar(stat = "identity") + 121 | # guides(fill=FALSE) + 122 | scale_y_continuous(labels = percent) + 123 | xlab("") + ylab("") + expand_limits(y = 1) 124 | ``` 125 | 126 | 127 | 128 | ## Aggregate count comparisons 129 | 130 | ```{r} 131 | comp <- cbind( 132 | datTest[i = order(Inspection_Date), 133 | j = list(Inspection_ID_BAU = Inspection_ID, 134 | Inspection_Date = Inspection_Date, 135 | criticalFound_BAU = criticalFound)], 136 | datTest[i = order(-glm_pred), 137 | j = list(Inspection_ID_Model = Inspection_ID, 138 | criticalFound_Model = criticalFound)]) 139 | comp <- comp[ , Best_Possible := criticalFound_Model[order(-criticalFound_Model)]] 140 | comp <- comp[ , Worst_Possible := criticalFound_Model[order(criticalFound_Model)]] 141 | # comp 142 | 143 | comp_summary <- comp[ 144 | i = TRUE, 145 | j = list(Total_Inspections = .N, 146 | Crit_Violations_BAU = sum(criticalFound_BAU), 147 | Crit_Violations_Model = sum(criticalFound_Model), 148 | Best_Possible = sum(Best_Possible), 149 | Worst_Possible = sum(Worst_Possible)), 150 | keyby = list(Inspection_Date = Inspection_Date)] 151 | comp_summary 152 | ``` 153 | 154 | ## Calculation of average time saved with model 155 | 156 | ```{r} 157 | date_comp <- merge( 158 | x = comp[i = TRUE, 159 | j = list (criticalFound_BAU, 160 | date_bau = Inspection_Date, 161 | id = Inspection_ID_BAU)], 162 | y = comp[i = TRUE, 163 | j = list (criticalFound_Model, 164 | date_model = Inspection_Date, 165 | id = Inspection_ID_Model)], 166 | by = "id") 167 | ``` 168 | 169 | ### Average improvement based on date comparison 170 | 171 | ```{r} 172 | date_comp[i = criticalFound_Model==1, 173 | j = mean(date_model - date_bau)] 174 | ``` 175 | 176 | ### Distribution of "how many days sooner" for all critical violations 177 | 178 | ```{r} 179 | hist(as.numeric(date_comp[criticalFound_Model==1, 180 | 
-(date_model - date_bau)]), 181 | main = paste0("Days sooner that a critical \n", 182 | " violation would have been discovered"), 183 | breaks = 30, 184 | xlim = c(-60, 60), 185 | xlab = "days sooner", 186 | col = c(rep("#CC79A7", 10), rep("#009E73", 20))) 187 | ``` 188 | 189 | ```{r} 190 | hist(as.numeric(date_comp[criticalFound_Model==1, 191 | -(date_model - date_bau)]), 192 | main = paste0("Days sooner that a critical \n", 193 | " violation would have been discovered"), 194 | breaks = 5, 195 | xlab = "days sooner", 196 | col = c(rep("#CC79A7", 3), rep("#009E73", 10))) 197 | ``` 198 | 199 | 200 | 201 | ```{r} 202 | comp_summary_percent <- comp_summary[ 203 | i = TRUE, 204 | j = list(Pct_Crit_Violations_Found_BAU = Crit_Violations_BAU / Total_Inspections, 205 | Pct_Crit_Violations_Found_Model = Crit_Violations_Model / Total_Inspections), 206 | keyby = Inspection_Date] 207 | ``` 208 | 209 | ### Unadjusted plot 210 | 211 | ```{r} 212 | ggplot(melt(data = comp_summary_percent, id.vars = "Inspection_Date")) + 213 | aes(x=Inspection_Date, y=value, colour=variable) + 214 | labs(title=paste0("Percent of inspections resulting in a Critical Violation\n", 215 | "(Daily intervals shown)\n"))+ 216 | geom_line() 217 | ``` 218 | 219 | ### Plot smoothed by gam to show overall trend 220 | 221 | ```{r} 222 | ggplot(melt(data = comp_summary_percent, id.vars = "Inspection_Date")) + 223 | aes(x=Inspection_Date, y=value, colour=variable, fill=variable) + 224 | labs(title=paste0('Critical violations found on a daily basis\n', 225 | 'as a percent of total inspections performed\n', 226 | "(smoothed results)\n") ) + 227 | stat_smooth(method = "gam", 228 | formula = y ~ s(x, k=10, sp=2, bs="ps"), 229 | alpha=.3, 230 | level = .65) 231 | ``` 232 | 233 | 234 | ## Count comparisons (cumulative) 235 | 236 | 237 | ```{r} 238 | comp_summary_cumsum <- comp_summary[ 239 | i = TRUE, 240 | j = list(`\nInspection Date` = Inspection_Date, 241 | `Business As Usual` = cumsum(Crit_Violations_BAU), 242 | `Data Driven` = cumsum(Crit_Violations_Model), 243 | `Best Possible` = cumsum(Best_Possible), 244 | `Worst Possible` = cumsum(Worst_Possible))] 245 | 246 | 247 | ggplot(melt(data = comp_summary_cumsum, 248 | id.vars = "\nInspection Date")) + 249 | aes(x = `\nInspection Date`, 250 | y = value, 251 | colour = variable) + 252 | labs(title="Comparing cumulative violations discovered\n") + 253 | ylab("Cumulative critical violations to date\n") + 254 | geom_line(lwd=1.5) + 255 | geom_point(colour="black") 256 | 257 | ``` 258 | 259 | ```{r} 260 | comp_summary_cumsum_diff <- comp_summary_cumsum[ 261 | i = TRUE, 262 | j = list(Inspection_Date = `\nInspection Date`, 263 | diff = `Data Driven` - `Business As Usual`)] 264 | 265 | ggplot(comp_summary_cumsum_diff) + 266 | aes(x=Inspection_Date, y=diff) + 267 | labs(title=paste0("Difference Between BAU and Modeled\n", 268 | "Cumulative Critical Violations Found\n")) + 269 | ylab("Cumulative Difference of Critical Violations Found") + 270 | geom_line() 271 | # geneorama::clipper(comp_summary) 272 | ``` 273 | 274 | 275 | 276 | ```{r} 277 | comp_summary_cumsum_diff_multi <- comp_summary_cumsum[ 278 | i = TRUE, 279 | j = list(Inspection_Date = `\nInspection Date`, 280 | bau_diff = `Best Possible` - `Business As Usual`, 281 | model_diff = `Best Possible` - `Data Driven`)] 282 | 283 | ggplot(melt(comp_summary_cumsum_diff_multi, 284 | id.vars = "Inspection_Date")) + 285 | aes(x=Inspection_Date, y=value, colour=variable) + 286 | labs(title=paste0("Difference Between BAU and Modeled\n", 287 | "Cumulative Critical Violations Found\n")) + 288 | ylab("Cumulative Difference of Critical Violations 
Found") + 289 | geom_line() 290 | # geneorama::clipper(comp_summary) 291 | ``` 292 | 293 | 294 | ```{r} 295 | geneorama::set_project_dir("food-inspections-evaluation") 296 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 297 | foodInspect <- foodInspect[ , Violations := NULL] 298 | 299 | setkey(foodInspect, Inspection_ID) 300 | setkey(date_comp, id) 301 | 302 | date_comp_w_name <- foodInspect[,DBA_Name,keyby=Inspection_ID][date_comp] 303 | date_comp_w_name <- date_comp_w_name[ , criticalFound_Model:=NULL] 304 | setnames(date_comp_w_name, "Inspection_ID", "id") 305 | setnames(date_comp_w_name, "DBA_Name", "dba_name") 306 | setnames(date_comp_w_name, "criticalFound_BAU", "crit") 307 | 308 | # date_comp_w_name 309 | 310 | set.seed(27) 311 | ii <- c(sample(date_comp_w_name[crit==1,id], 5), 312 | sample(date_comp_w_name[crit==0,id], 20)) 313 | 314 | samp <- date_comp_w_name[id %in% ii] 315 | samp 316 | 317 | plot(data.frame(x = c(1, 1, 3, 3), 318 | y = c(1, nrow(samp), 1, nrow(samp))), 319 | type = "n", axes=F, xlab="", ylab="") 320 | samp_dba <- samp[order(samp$date_bau), ] 321 | samp_model <- samp[order(samp$date_model), ] 322 | 323 | text(x = 1.5, y = 1:nrow(samp_dba), labels = samp_dba$dba_name, 324 | col = ifelse(samp_dba$crit==1,"red", "black") ) 325 | text(x = 2.5, y = 1:nrow(samp_model), labels = samp_model$dba_name, 326 | col = ifelse(samp_model$crit==1,"red", "black") ) 327 | 328 | ``` 329 | 330 | 331 | 332 | -------------------------------------------------------------------------------- /REPORTS/Metric_Development.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/Metric_Development.xlsx -------------------------------------------------------------------------------- /REPORTS/Metric_Development_bargraphs.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/Metric_Development_bargraphs.xlsx -------------------------------------------------------------------------------- /REPORTS/Metric_Development_day_diff.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/Metric_Development_day_diff.xlsx -------------------------------------------------------------------------------- /REPORTS/ModelSummary_20141204.Rmd: -------------------------------------------------------------------------------- 1 | 2 | ## Purpose of this report: 3 | 4 | This report is (basically) just a hard copy of what you should see if you run the file "CODE/30\_glmnet\_model.R". It's only used for debugging / checking results. (Can be useful if you download new data or try a new model, and want to compare results to the "original"). 
5 | 6 | ## Initialization 7 | 8 | ```{r, echo=TRUE, warning=FALSE, results='hide', message=FALSE} 9 | geneorama::set_project_dir("food-inspections-evaluation") 10 | ##============================================================================== 11 | ## INITIALIZE 12 | ##============================================================================== 13 | ## Remove all objects; perform garbage collection 14 | rm(list=ls()) 15 | gc(reset=TRUE) 16 | ## Detach libraries that are not used 17 | geneorama::detach_nonstandard_packages() 18 | ## Load libraries that are used 19 | geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "knitr")) 20 | ## Load custom functions 21 | geneorama::sourceDir("CODE/functions/") 22 | 23 | ## Set knitr options 24 | opts_chunk$set(tidy = FALSE) 25 | ``` 26 | 27 | ## Load Data 28 | 29 | ```{r} 30 | geneorama::set_project_dir("food-inspections-evaluation") 31 | ##============================================================================== 32 | ## LOAD CACHED RDS FILES 33 | ##============================================================================== 34 | dat <- readRDS("DATA/23_dat_model.Rds") 35 | 36 | ## Only keep "Retail Food Establishment" 37 | dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"] 38 | ## Remove License Description 39 | dat[ , LICENSE_DESCRIPTION := NULL] 40 | dat <- na.omit(dat) 41 | 42 | ## Add criticalFound variable to dat: 43 | dat[ , criticalFound := pmin(1, criticalCount)] 44 | 45 | ## Set the key for dat 46 | setkey(dat, Inspection_ID) 47 | 48 | ##============================================================================== 49 | ## CREATE MODEL DATA 50 | ##============================================================================== 51 | # sort(colnames(dat)) 52 | xmat <- dat[ , list(criticalFound, 53 | # Inspector = Inspector_Grade, 54 | Inspector = Inspector_Assigned, 55 | pastSerious = pmin(pastSerious, 1), 56 | ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), 57 | pastCritical = pmin(pastCritical, 1), 58 | consumption_on_premises_incidental_activity, 59 | tobacco_retail_over_counter, 60 | temperatureMax, 61 | heat_burglary = pmin(heat_burglary, 70), 62 | heat_sanitation = pmin(heat_sanitation, 70), 63 | heat_garbage = pmin(heat_garbage, 50), 64 | # risk = as.factor(Risk), 65 | # facility_type = as.factor(Facility_Type), 66 | timeSinceLast), 67 | keyby = Inspection_ID] 68 | mm <- model.matrix(criticalFound ~ . 
-1, data=xmat[ , -1, with=F]) 69 | mm <- as.data.table(mm) 70 | str(mm) 71 | colnames(mm) 72 | ``` 73 | 74 | ## Set up, run, and evaluate model 75 | 76 | ```{r} 77 | ##============================================================================== 78 | ## CREATE TEST / TRAIN PARTITIONS 79 | ##============================================================================== 80 | ## 2014-07-01 is an easy separator 81 | dat[Inspection_Date < "2014-07-01", range(Inspection_Date)] 82 | dat[Inspection_Date > "2014-07-01", range(Inspection_Date)] 83 | 84 | iiTrain <- dat[ , which(Inspection_Date < "2014-07-01")] 85 | iiTest <- dat[ , which(Inspection_Date > "2014-07-01")] 86 | 87 | ## Check to see if any rows didn't make it through the model.matrix formula 88 | nrow(dat) 89 | nrow(xmat) 90 | nrow(mm) 91 | 92 | ##============================================================================== 93 | ## GLMNET MODEL 94 | ##============================================================================== 95 | # fit ridge regression, alpha = 0, only inspector coefficients penalized 96 | pen <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0) 97 | model <- glmnet(x = as.matrix(mm[iiTrain]), 98 | y = xmat[iiTrain, criticalFound], 99 | family = "binomial", 100 | alpha = 0, 101 | penalty.factor = pen) 102 | 103 | ## See what regularization parameter 'lambda' is optimal on tune set 104 | ## (We are essentially using the previous hardcoded value) 105 | errors <- sapply(model$lambda, 106 | function(lam) 107 | logLik(p = predict(model, 108 | newx = as.matrix(mm[iiTrain]), 109 | s=lam, 110 | type="response")[,1], 111 | y = xmat[iiTrain, criticalFound])) 112 | plot(x=log(model$lambda), y=errors, type="l") 113 | which.min(errors) 114 | model$lambda[which.min(errors)] 115 | ## manual lambda selection 116 | w.lam <- 100 117 | lam <- model$lambda[w.lam] 118 | coef <- model$beta[,w.lam] 119 | inspCoef <- coef[grepl("^Inspector",names(coef))] 120 | inspCoef <- inspCoef[order(-inspCoef)] 121 | ## coefficients for the inspectors, and for other variables 122 | inspCoef 123 | coef[!grepl("^Inspector",names(coef))] 124 | 125 | 126 | ## By the way, if we had knowledge of the future, we would have chosen a 127 | ## different lambda 128 | errorsTest <- sapply(model$lambda, 129 | function(lam) 130 | logLik(p = predict(model, 131 | newx = as.matrix(mm[iiTest]), 132 | s=lam, 133 | type="response")[,1], 134 | y = xmat[iiTest, criticalFound])) 135 | plot(x=log(model$lambda), y=errorsTest, type="l") 136 | which.min(errorsTest) 137 | model$lambda[which.min(errorsTest)] 138 | 139 | ## ATTACH PREDICTIONS TO DAT 140 | dat$glm_pred <- predict(model, newx=as.matrix(mm), 141 | s=lam, 142 | type="response")[,1] 143 | 144 | # show gini performance of inspector model on the test data set 145 | dat[iiTest, gini(glm_pred, criticalFound, plot=TRUE)] 146 | 147 | ## Calculate confusion matrix values for evaluation 148 | calculate_confusion_values(actual = xmat[iiTest, criticalFound], 149 | expected = dat[iiTest, glm_pred], 150 | r = .25) 151 | 152 | ## Calculate matrix of confusion matrix values for evaluation 153 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 154 | calculate_confusion_values, 155 | actual = xmat[iiTest, criticalFound], 156 | expected = dat[iiTest, glm_pred])) 157 | confusion_values_test 158 | ggplot(reshape2::melt(as.data.table(confusion_values_test), 159 | id.vars="r")) + 160 | aes(x=r, y=value, colour=variable) + geom_line() + 161 | geom_hline(yintercept = c(0,1)) 162 | 
##============================================================================== 164 | ## CALCULATION OF LIFT 165 | ##============================================================================== 166 | ## TEST PERIOD: Date range 167 | dat[iiTest, range(Inspection_Date)] 168 | ## TEST PERIOD: Total inspections 169 | dat[iiTest, .N] 170 | ## TEST PERIOD: Critical found 171 | dat[iiTest, sum(criticalCount)] 172 | ## TEST PERIOD: Inspections with any critical violations 173 | dat[iiTest, sum(criticalFound)] 174 | 175 | ## Subset test period 176 | datTest <- dat[iiTest] 177 | ## Identify first period 178 | datTest[ , period := ifelse(Inspection_Date < median(Inspection_Date),1,2)] 179 | datTest[, .N, keyby=list(period)] 180 | datTest[, .N, keyby=list(Inspection_Date, period)] 181 | ## Identify top half of scores (which would have been the first period) 182 | datTest[ , period_modeled := ifelse(glm_pred > median(glm_pred), 1, 2)] 183 | 184 | datTest[period == 1, sum(criticalFound)] 185 | datTest[period_modeled == 1, sum(criticalFound)] 186 | 187 | datTest[, list(.N, Violations = sum(criticalFound)), keyby=list(period)] 188 | datTest[, list(.N, Violations = sum(criticalFound)), keyby=list(period_modeled)] 189 | 190 | 141 / (141 + 117) ## share of all critical violations found in period 1, business as usual 191 | 178 / (178 + 80) ## share of all critical violations found in period 1, data driven model 192 | 0.6899225 - .5465116 ## difference between the two shares 193 | 194 | 0.65 - .55 ## rough, rounded version of the same comparison 195 | ``` 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.bib: -------------------------------------------------------------------------------- 1 | 2 | @techreport{liu_food_????, 3 | address = {Manhattan, {KS}}, 4 | title = {Food Safety Training Needed for Asian Restaurants: Longitudinal Review of Health Inspection Data in Kansas}, 5 | abstract = {The purpose of this study was to assess the frequencies and types of food code violations at Asian restaurants in Kansas using longitudinal review of health inspection data. A total of 326 restaurant inspection reports from 156 Asian restaurants in 10 Kansas counties were reviewed. Descriptive and inferential statistics were calculated using {SPSS}. The findings of this study suggested the focus areas for food safety training in Asian restaurants: temperature control of {PHF}; employee personal hygiene; and employee hand washing practices. Also, our results indicated that behavior-related violations, especially behavior-related critical violations occurred more during the routine health inspection than other inspection types. In the future, researchers could identify the effective ways to overcome barriers to food safety training in Asian restaurants. 
Through this investigation, Asian restaurant owners and managers may gain insights on what food handling practices related to code violations they should emphasize when training their employees.}, 6 | institution = {Kansas State University}, 7 | author = {Liu, Pei and Lee, Yee Ming and Choi, Young Gin and Kwon, Junehee}, 8 | keywords = {Asian restaurant, food code violation, food inspections, food safety training, foodborne illness, health inspection}, 9 | file = {Food and Safety Training Needed for Asian Restaurants- Longitudin.pdf:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/IVZ5DESU/Food and Safety Training Needed for Asian Restaurants- Longitudin.pdf:application/pdf} 10 | } 11 | 12 | @article{phillips_recurrent_????, 13 | title = {Recurrent Critical Violations of the Food Code in Retail Food Service Establishments}, 14 | volume = {68}, 15 | abstract = {Records of restaurant inspections by public health departments provide sequential "snapshots" of conditions in retail food service establishments that can be used to identify risk factors and evaluate the effectiveness 16 | of interventions. Data from a random 10 percent sample of restaurant inspection files from 31 counties in Oklahoma, including 4,044 inspections conducted during 1996-2000 in "medium-risk" and "high-risk" establishments, were analyzed to determine rates of critical violations and recurrent violations for different categories of establishments. Repeat violations accounted for about half of all violations. Establishments subjectively designated as high risk by health department personnel were in fact found to have higher violation rates than those described as medium-risk establishments. Outside Oklahoma County, regional chain restaurants were significantly more likely than other restaurants to have recurrent violations of critical items related to food-holding temperature, hygiene practices, sanitization, and hygiene facilities. Differences observed in violation rates among individual establishments were not primarily attributable to inconsistent enforcement by individual inspectors; rather, they appeared to be indicative of real differences in hygienic conditions and practices.}, 17 | number = {10}, 18 | journal = {Journal of Environmental Health}, 19 | author = {Phillips, Margaret L. and Elledge, Brenda L. and Basara, Heather G. and Lynch, Robert A. and Boatright, Daniel T.}, 20 | keywords = {food inspections, health}, 21 | pages = {24--30}, 22 | file = {RecurrentViolations.pdf:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/M27PH57F/RecurrentViolations.pdf:application/pdf} 23 | } 24 | 25 | @article{hoag_risk-based_2007, 26 | title = {A Risk-Based Food Inspection Program}, 27 | volume = {69}, 28 | abstract = {The inspection of food facilities is a crucial public service designed to prevent foodborne illnesses among retail food consumers. To enhance the existing food inspection process in San Bernardino County, California, a risk-based food inspection program and assessment instrument has been developed and proposed. A literature review and interviews with health professionals were conducted to establish a baseline understanding of various inspection procedures currently being employed throughout the nation. San Bernardino subsequently developed an assessment instrument and attendant inspection schedules that reflect best practices. 
The proposed inspection model categorizes food facilities as high, moderate, or low risk according to food properties, service population characteristics, facility history, and predefined operational risks. The San Bernardino model supports health department decision making with respect to inspection resource allocation and also makes possible sliding permit fees that reflect the relative risk associated with each facility.}, 29 | number = {7}, 30 | journal = {Journal of Environmental Health}, 31 | author = {Hoag, Michelle A. and Porter, Corwin and Uppala, Padma and Dyjack, David T.}, 32 | month = mar, 33 | year = {2007}, 34 | pages = {33--36} 35 | } 36 | 37 | @article{harris_health_2014, 38 | title = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014}, 39 | volume = {63}, 40 | url = {http://www.cdc.gov/mmwr/preview/mmwrhtml/mm6332a1.htm}, 41 | number = {32}, 42 | urldate = {2014-12-24}, 43 | journal = {Morbidity and Mortality Weekly Report}, 44 | author = {Harris, Jenine K. and Mansour, Raed and Choucair, Bechara and Olson, Joe and Nissen, Cory and Bhatt, Jay}, 45 | month = aug, 46 | year = {2014}, 47 | pages = {681--685}, 48 | file = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/WI73PGEZ/mm6332a1.html:text/html} 49 | } 50 | 51 | @article{friedman_regularization_2010, 52 | title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, 53 | volume = {33}, 54 | issn = {1548-7660}, 55 | url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2929880/}, 56 | abstract = {We develop fast algorithms for estimation of generalized linear models with convex penalties. The models include linear regression, two-class logistic regression, and multinomial regression problems while the penalties include ℓ1 (the lasso), ℓ2 (ridge regression) and mixtures of the two (the elastic net). The algorithms use cyclical coordinate descent, computed along a regularization path. The methods can handle large problems and can also deal efficiently with sparse features. In comparative timings we find that the new algorithms are considerably faster than competing methods.}, 57 | number = {1}, 58 | urldate = {2015-02-08}, 59 | journal = {Journal of Statistical Software}, 60 | author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob}, 61 | year = {2010}, 62 | pmid = {20808728}, 63 | pmcid = {PMC2929880}, 64 | pages = {1--22}, 65 | file = {PubMed Central Full Text PDF:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/NTTQQTW6/Friedman et al. 
- 2010 - Regularization Paths for Generalized Linear Models.pdf:application/pdf} 66 | } -------------------------------------------------------------------------------- /REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.docx -------------------------------------------------------------------------------- /REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.pdf -------------------------------------------------------------------------------- /REPORTS/references.bib: -------------------------------------------------------------------------------- 1 | @Article{Friedman_2010, 2 | title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, 3 | author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani}, 4 | journal = {Journal of Statistical Software}, 5 | year = {2010}, 6 | volume = {33}, 7 | number = {1}, 8 | pages = {1--22}, 9 | url = {http://www.jstatsoft.org/v33/i01/}, 10 | } 11 | 12 | @Book{Venables_2002, 13 | title = {Modern Applied Statistics with S}, 14 | author = {W. N. Venables and B. D. Ripley}, 15 | publisher = {Springer}, 16 | edition = {Fourth}, 17 | address = {New York}, 18 | year = {2002}, 19 | note = {ISBN 0-387-95457-0}, 20 | url = {http://www.stats.ox.ac.uk/pub/MASS4}, 21 | } 22 | 23 | @Article{harris_health_2014, 24 | title = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014}, 25 | volume = {63}, 26 | url = {http://www.cdc.gov/mmwr/preview/mmwrhtml/mm6332a1.htm}, 27 | number = {32}, 28 | urldate = {2014-12-24}, 29 | journal = {Morbidity and Mortality Weekly Report}, 30 | author = {Jenine K. Harris and Raed Mansour and Bechara Choucair and Joe Olson and Cory Nissen and Jay Bhatt}, 31 | month = {aug}, 32 | year = {2014}, 33 | pages = {681--685}, 34 | file = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/WI73PGEZ/mm6332a1.html:text/html}, 35 | } 36 | 37 | @Article{Simon_2011, 38 | title = {Regularization Paths for Cox's Proportional Hazards Model via Coordinate Descent}, 39 | author = {Noah Simon and Jerome Friedman and Trevor Hastie and Rob Tibshirani}, 40 | journal = {Journal of Statistical Software}, 41 | year = {2011}, 42 | volume = {39}, 43 | number = {5}, 44 | pages = {1--13}, 45 | url = {http://www.jstatsoft.org/v39/i05/}, 46 | } 47 | -------------------------------------------------------------------------------- /food-inspections-evaluation.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | --------------------------------------------------------------------------------