├── .gitignore ├── .gitmodules ├── CODE ├── 00_Startup.R ├── 11_business_download.R ├── 12_crime_download.R ├── 13_food_inspection_download.R ├── 14_garbage_download.R ├── 15_sanitation_download.R ├── 21_calculate_violation_matrix.R ├── 22_calculate_heat_map_values.R ├── 23_food_insp_features.R ├── 24_bus_features.R ├── 30_xgboost_model.R ├── 30a_glmnet_model.R ├── 30b_glmnet_model_evaluation.R ├── 31_xgboost_model_evaluation.R ├── 31a_random_forest_model.R ├── 31b_random_forest_evaluation.R ├── README.md ├── functions │ ├── GenerateOtherLicenseInfo.R │ ├── calculate_confusion_values.R │ ├── calculate_heat_values.R │ ├── calculate_violation_matrix.R │ ├── calculate_violation_types.R │ ├── categorize.R │ ├── filter_business.R │ ├── filter_crime.R │ ├── filter_foodInspect.R │ ├── filter_garbageCarts.R │ ├── filter_sanitationComplaints.R │ ├── find_bus_id_matches.R │ ├── gini.R │ ├── kde.R │ ├── logLik.R │ ├── nokey.R │ ├── simulated_bin_summary.R │ ├── simulated_date_diff_mean.R │ └── weather_3day_calc.R ├── not used │ ├── addTextMining.R │ ├── development_of_heat_merge.R │ ├── heat.R │ ├── kde_comparison │ │ ├── allstate_kde_new.R │ │ └── mass_kde_full.R │ ├── liveReadInAbandonedBuildings.R │ ├── liveReadInAbandonedVehicles.R │ ├── liveReadInBldgPermits.R │ ├── liveReadInBldgViolations.R │ ├── liveReadInGraffiti.R │ ├── liveReadInLightsOut.R │ ├── liveReadInPotHoles.R │ ├── liveReadInRodents.R │ └── liveReadInTreeTrims.R └── prep_inspectors_for_eval.R ├── CONTRIBUTING.md ├── DATA ├── 11_bus_license.Rds ├── 12_crime.Rds ├── 13_food_inspections.Rds ├── 14_garbage_carts.Rds ├── 15_sanitation_code.Rds ├── 17_mongo_weather_update.Rds ├── 19_inspector_assignments.Rds ├── 21_food_inspection_violation_matrix.Rds ├── 21_food_inspection_violation_matrix_nums.Rds ├── 22_burglary_heat.Rds ├── 22_garbageCarts_heat.Rds ├── 22_sanitationComplaints_heat.Rds ├── 23_food_insp_features.Rds ├── 24_bus_features.Rds ├── 30_xgboost_data.Rds ├── 30_xgboost_model.Rds ├── Inspection_Report_Form.pdf └── inspectors.Rds ├── LICENSE.md ├── README.md ├── REPORTS ├── .gitignore ├── CountComparison_aftersecondrefactor.Rmd ├── CountComparison_aftersecondrefactor.html ├── Metric_Development.Rmd ├── Metric_Development.html ├── Metric_Development.xlsx ├── Metric_Development_bargraphs.xlsx ├── Metric_Development_day_diff.xlsx ├── ModelSummary_20141204.Rmd ├── ModelSummary_20141204.html ├── forecasting-restaurants-with-critical-violations-in-Chicago.Rmd ├── forecasting-restaurants-with-critical-violations-in-Chicago.bib ├── forecasting-restaurants-with-critical-violations-in-Chicago.docx ├── forecasting-restaurants-with-critical-violations-in-Chicago.html ├── forecasting-restaurants-with-critical-violations-in-Chicago.pdf └── references.bib └── food-inspections-evaluation.Rproj /.gitignore: -------------------------------------------------------------------------------- 1 | /.project 2 | .Rproj.user 3 | .Rproj 4 | CODE/socrata_token.txt 5 | .Rhistory 6 | 7 | -------------------------------------------------------------------------------- /.gitmodules: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/.gitmodules -------------------------------------------------------------------------------- /CODE/00_Startup.R: -------------------------------------------------------------------------------- 1 | ##------------------------------------------------------------------------------ 2 | ## INSTALL 
DEPENDENCIES IF MISSING 3 | ##------------------------------------------------------------------------------ 4 | 5 | if(!"devtools" %in% rownames(installed.packages())){ 6 | install.packages("devtools", 7 | dependencies = TRUE, 8 | repos = "https://cloud.r-project.org/") 9 | } 10 | 11 | if(!"Rcpp" %in% rownames(installed.packages())){ 12 | install.packages("Rcpp", 13 | dependencies = TRUE, 14 | repos = "https://cloud.r-project.org/") 15 | } 16 | 17 | if(!"RSocrata" %in% rownames(installed.packages())){ 18 | install.packages("RSocrata", 19 | dependencies = TRUE, 20 | repos = "https://cloud.r-project.org/") 21 | } 22 | 23 | if(!"data.table" %in% rownames(installed.packages())){ 24 | install.packages("data.table", 25 | dependencies = TRUE, 26 | repos = "https://cloud.r-project.org/") 27 | } 28 | 29 | if(!"geneorama" %in% rownames(installed.packages())){ 30 | devtools::install_github('geneorama/geneorama') 31 | } 32 | 33 | if(!"printr" %in% rownames(installed.packages())){ 34 | devtools::install_github(repo = 'yihui/printr') 35 | } 36 | 37 | ##------------------------------------------------------------------------------ 38 | ## UPDATE DEPENDENCIES IF OUTDATED 39 | ##------------------------------------------------------------------------------ 40 | 41 | ## Update to RSocrata 1.7.2-2 (or later); packageVersion() compares version numbers numerically, whereas a plain string comparison would mis-order versions such as "1.9.8" vs "1.10.0" 42 | if(packageVersion("RSocrata") < "1.7.2-2"){ 43 | install.packages("RSocrata", 44 | repos = "https://cloud.r-project.org/") 45 | } 46 | 47 | ## Needs recent version for foverlaps 48 | if(packageVersion("data.table") < "1.10.0"){ 49 | install.packages("data.table", 50 | repos = "https://cloud.r-project.org/") 51 | } 52 | 53 | if(packageVersion("geneorama") < "1.5.0"){ 54 | devtools::install_github('geneorama/geneorama') 55 | } 56 | -------------------------------------------------------------------------------- /CODE/11_business_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/r5kz-chrr.csv" 18 | 19 | ## READ DATA 20 | business <- read.socrata(url, stringsAsFactors = FALSE) 21 | # str(business) 22 | 23 | ## CONVERT TO DATA TABLE 24 | business <- as.data.table(business) 25 | 26 | ## Replace .'s in column names 27 | setnames(business, gsub("\\.","_",colnames(business))) 28 | 29 | ## MODIFY DATA 30 | geneorama::convert_datatable_IntNum(business) 31 | geneorama::convert_datatable_DateIDate(business) 32 | 33 | ## FIX TWO DATE COLUMNS THAT MAY NOT DOWNLOAD PROPERLY 34 | business[ , LICENSE_TERM_START_DATE := as.IDate(LICENSE_TERM_START_DATE, "%m/%d/%Y")] 35 | business[ , LICENSE_TERM_EXPIRATION_DATE := as.IDate(LICENSE_TERM_EXPIRATION_DATE, "%m/%d/%Y")] 36 | 37 | ## SAVE RESULT 38 | saveRDS(business, "DATA/11_bus_license.Rds") 39 | 
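## (Added sanity check; an illustrative sketch, not part of the original
## pipeline: confirm the saved Rds re-loads and that the two manually parsed
## date columns came through as IDate rather than character.)
# bus <- readRDS("DATA/11_bus_license.Rds")
# stopifnot(inherits(bus$LICENSE_TERM_START_DATE, "IDate"),
#           inherits(bus$LICENSE_TERM_EXPIRATION_DATE, "IDate"))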
-------------------------------------------------------------------------------- /CODE/12_crime_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL AND QUERY 17 | url <- "https://data.cityofchicago.org/resource/ijzp-q8t2.csv" 18 | # q <- "?primary_type=BURGLARY&$where=date>'2016-02-01'" # date filter examp 19 | q <- "?primary_type=BURGLARY" 20 | 21 | ## READ DATA (BURGLARY ONLY) 22 | crime <- read.socrata(paste0(url, q), stringsAsFactors = FALSE) 23 | # str(crime) 24 | 25 | ## CONVERT TO DATA TABLE 26 | crime <- as.data.table(crime) 27 | 28 | ## Replace .'s in column names 29 | setnames(crime, gsub("\\.","_",colnames(crime))) 30 | 31 | ## MODIFY DATA 32 | geneorama::convert_datatable_IntNum(crime) 33 | geneorama::convert_datatable_DateIDate(crime) 34 | crime[ , Arrest := as.logical(Arrest)] 35 | crime[ , Domestic := as.logical(Domestic)] 36 | 37 | ## SAVE RESULT 38 | saveRDS(crime , "DATA/12_crime.Rds") 39 | -------------------------------------------------------------------------------- /CODE/13_food_inspection_download.R: -------------------------------------------------------------------------------- 1 | ##========================================================================== 2 | ## INITIALIZE 3 | ##========================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/4ijn-s7e5.csv" 18 | 19 | ## READ DATA 20 | foodInspect <- read.socrata(url, stringsAsFactors = FALSE) 21 | str(foodInspect) 22 | 23 | ## CONVERT TO DATA TABLE 24 | foodInspect <- as.data.table(foodInspect) 25 | 26 | ## Replace .'s in column names, and extra underscores 27 | setnames(foodInspect, gsub("\\.","_",colnames(foodInspect))) 28 | setnames(foodInspect, gsub("_+$","",colnames(foodInspect))) 29 | 30 | ## MODIFY DATA 31 | geneorama::convert_datatable_IntNum(foodInspect) 32 | geneorama::convert_datatable_DateIDate(foodInspect) 33 | 34 | ## SAVE RESULT 35 | saveRDS(foodInspect , "DATA/13_food_inspections.Rds") 36 | -------------------------------------------------------------------------------- /CODE/14_garbage_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | 
##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/9ksk-na4q.csv" 18 | 19 | ## READ DATA 20 | garbageCarts <- read.socrata(url, stringsAsFactors = FALSE) 21 | # str(garbageCarts) 22 | 23 | ## CONVERT TO DATA TABLE 24 | garbageCarts <- as.data.table(garbageCarts) 25 | 26 | ## Replace .'s in column names 27 | setnames(garbageCarts, gsub("\\.","_",colnames(garbageCarts))) 28 | 29 | ## MODIFY DATA 30 | geneorama::convert_datatable_IntNum(garbageCarts) 31 | geneorama::convert_datatable_DateIDate(garbageCarts) 32 | 33 | ## SAVE RESULT 34 | saveRDS(garbageCarts , "DATA/14_garbage_carts.Rds") 35 | 36 | -------------------------------------------------------------------------------- /CODE/15_sanitation_download.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "RSocrata")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## DOWNLOAD FILES FROM DATA PORTAL 14 | ##============================================================================== 15 | 16 | ## DEFINE URL 17 | url <- "https://data.cityofchicago.org/resource/me59-5fac.csv" 18 | 19 | ## READ DATA 20 | sanitationComplaints <- read.socrata(url, stringsAsFactors = FALSE) 21 | # str(sanitationComplaints) 22 | 23 | ## CONVERT TO DATA TABLE 24 | sanitationComplaints <- as.data.table(sanitationComplaints) 25 | 26 | ## Replace .'s in column names 27 | setnames(sanitationComplaints, gsub("\\.","_",colnames(sanitationComplaints))) 28 | 29 | ## Remove one row where the header is (somewhat) repeated 30 | sanitationComplaints <- sanitationComplaints[Service_Request_Number!="SERVICE REQUEST NUMBER"] 31 | 32 | ## MODIFY DATA 33 | geneorama::convert_datatable_IntNum(sanitationComplaints) 34 | geneorama::convert_datatable_DateIDate(sanitationComplaints) 35 | 36 | ## SAVE RESULT 37 | saveRDS(sanitationComplaints , "DATA/15_sanitation_code.Rds") 38 | 39 | -------------------------------------------------------------------------------- /CODE/21_calculate_violation_matrix.R: -------------------------------------------------------------------------------- 1 | ## 2 | ## Because this step takes so long, it's pre-calculated here. 
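## (Added context: each inspection's Violations field concatenates entries
## that are assumed to begin with a violation number, e.g. "3. ... | 16. ...".
## calculate_violation_matrix presumably expands those numbers into one column
## per violation; a hedged sketch of that parsing idiom, variable names made up:
# v     <- strsplit(foodInspect$Violations, "|", fixed = TRUE)
# vnums <- lapply(v, function(s) as.integer(sub("^\\s*(\\d+)\\..*$", "\\1", s)))
## The actual implementation is in CODE/functions/calculate_violation_matrix.R.)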
3 | ## 4 | 5 | ##============================================================================== 6 | ## INITIALIZE 7 | ##============================================================================== 8 | ## Remove all objects; perform garbage collection 9 | rm(list=ls()) 10 | gc(reset=TRUE) 11 | 12 | ## Load libraries & project functions 13 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 14 | geneorama::sourceDir("CODE/functions/") 15 | 16 | ##============================================================================== 17 | ## LOAD CACHED RDS FILES 18 | ##============================================================================== 19 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 20 | foodInspect <- filter_foodInspect(foodInspect) 21 | 22 | ##============================================================================== 23 | ## CALCULATE FEATURES BASED ON FOOD INSPECTION DATA 24 | ##============================================================================== 25 | 26 | ## Calculate violation matrix and put into data.table with inspection id as key 27 | vio_mat <- calculate_violation_matrix(foodInspect[ , Violations]) 28 | 29 | ## Add key column to vio_mat 30 | vio_mat <- data.table(vio_mat, 31 | Inspection_ID = foodInspect[ , Inspection_ID], 32 | key = "Inspection_ID") 33 | 34 | ## calculate_violation_types calculates violations by categories: 35 | ## Critical, serious, and minor violations 36 | violation_dat <- calculate_violation_types(violation_mat =vio_mat) 37 | 38 | ## Save results 39 | saveRDS(vio_mat, "DATA/21_food_inspection_violation_matrix_nums.Rds") 40 | saveRDS(violation_dat, "DATA/21_food_inspection_violation_matrix.Rds") 41 | -------------------------------------------------------------------------------- /CODE/22_calculate_heat_map_values.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## LOAD CACHED RDS FILES 14 | ##============================================================================== 15 | 16 | ## Import the key data sets used for prediction 17 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 18 | crime <- readRDS("DATA/12_crime.Rds") 19 | garbageCarts <- readRDS("DATA/14_garbage_carts.Rds") 20 | sanitationComplaints <- readRDS("DATA/15_sanitation_code.Rds") 21 | 22 | ## Apply filters by omitting rows that are not used in the model 23 | foodInspect <- filter_foodInspect(foodInspect) 24 | crime <- filter_crime(crime) 25 | garbageCarts <- filter_garbageCarts(garbageCarts) 26 | sanitationComplaints <- filter_sanitationComplaints(sanitationComplaints) 27 | 28 | ##============================================================================== 29 | ## CALCULATE HEAT MAP VALUES 30 | ##============================================================================== 31 | burglary_heat <- 32 | calculate_heat_values(inspections = foodInspect, 33 | observed_values = crime, 34 | window = 90, 35 | page_limit = 500) 36 | garbageCarts_heat <- 37 | calculate_heat_values(inspections = foodInspect, 
38 | observed_values = garbageCarts[ 39 | i = TRUE, 40 | j = list(Date = Creation_Date, 41 | Latitude, 42 | Longitude)], 43 | window = 90, 44 | page_limit = 500) 45 | sanitationComplaints_heat <- 46 | calculate_heat_values(inspections = foodInspect, 47 | observed_values = sanitationComplaints[ 48 | i = TRUE, 49 | j = list(Date = Creation_Date, 50 | Latitude, 51 | Longitude)], 52 | window = 90, 53 | page_limit = 500) 54 | 55 | ##============================================================================== 56 | ## SAVE HEAT MAP VALUES 57 | ##============================================================================== 58 | saveRDS(burglary_heat, "DATA/22_burglary_heat.Rds") 59 | saveRDS(garbageCarts_heat, "DATA/22_garbageCarts_heat.Rds") 60 | saveRDS(sanitationComplaints_heat, "DATA/22_sanitationComplaints_heat.Rds") 61 | 62 | 63 | -------------------------------------------------------------------------------- /CODE/23_food_insp_features.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 10 | geneorama::sourceDir("CODE/functions/") 11 | ## Import shift function 12 | shift <- geneorama::shift 13 | 14 | ##============================================================================== 15 | ## LOAD CACHED RDS FILES 16 | ##============================================================================== 17 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 18 | 19 | ## Apply row filter to remove invalid data 20 | foodInspect <- filter_foodInspect(foodInspect) 21 | 22 | ## Remove violations from food inspection; violations are captured in the 23 | ## violation matrix data 24 | foodInspect$Violations <- NULL 25 | 26 | ## Import violation matrix which lists violations by categories: 27 | ## Critical, serious, and minor violations 28 | violation_dat <- readRDS("DATA/21_food_inspection_violation_matrix.Rds") 29 | 30 | ##============================================================================== 31 | ## CALCULATE FEATURES 32 | ##============================================================================== 33 | 34 | ## Facility_Type_Clean: Anything that is not "restaurant" or "grocery" is "other" 35 | foodInspect[ , Facility_Type_Clean := 36 | categorize(x = Facility_Type, 37 | primary = list(Restaurant = "restaurant", 38 | Grocery_Store = "grocery"), 39 | ignore.case = TRUE)] 40 | ## Join in the violation matrix 41 | foodInspect <- merge(x = foodInspect, 42 | y = violation_dat, 43 | by = "Inspection_ID") 44 | ## Create pass / fail flags 45 | foodInspect[ , pass_flag := ifelse(Results=="Pass",1, 0)] 46 | foodInspect[ , fail_flag := ifelse(Results=="Fail",1, 0)] 47 | ## Set key to ensure that records are treated CHRONOLOGICALLY... 
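## (Added note: geneorama::shift is assumed to act as a grouped lag, i.e.
## shift(x, -1, fill) returns each element's predecessor within the group and
## uses `fill` for the group's first row. For example, shift(c(1, 0, 1), -1, 0)
## would give c(0, 1, 0), so every inspection sees the prior inspection's flag.)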
48 | setkey(foodInspect, License, Inspection_Date) 49 | ## Then find previous info by "shifting" the columns (grouped by License) 50 | foodInspect[ , pastFail := shift(fail_flag, -1, 0), by = License] 51 | foodInspect[ , pastCritical := shift(criticalCount, -1, 0), by = License] 52 | foodInspect[ , pastSerious := shift(seriousCount, -1, 0), by = License] 53 | foodInspect[ , pastMinor := shift(minorCount, -1, 0), by = License] 54 | 55 | ## Calculate time since last inspection. 56 | ## If the time is NA, this means it's the first inspection; add an indicator 57 | ## variable to indicate that it's the first inspection. 58 | foodInspect[i = TRUE , 59 | j = timeSinceLast := as.numeric( 60 | Inspection_Date - shift(Inspection_Date, -1, NA)) / 365, 61 | by = License] 62 | foodInspect[ , firstRecord := 0] 63 | foodInspect[is.na(timeSinceLast), firstRecord := 1] 64 | foodInspect[is.na(timeSinceLast), timeSinceLast := 2] 65 | foodInspect[ , timeSinceLast := pmin(timeSinceLast, 2)] 66 | 67 | ##============================================================================== 68 | ## SAVE RDS 69 | ##============================================================================== 70 | setkey(foodInspect, Inspection_ID) 71 | saveRDS(foodInspect, file.path("DATA/23_food_insp_features.Rds")) 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /CODE/24_bus_features.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 10 | geneorama::sourceDir("CODE/functions/") 11 | ## Import shift function 12 | shift <- geneorama::shift 13 | 14 | ##============================================================================== 15 | ## LOAD CACHED RDS FILES 16 | ##============================================================================== 17 | business <- readRDS("DATA/11_bus_license.Rds") 18 | 19 | ## Apply filter to remove invalid / unused data 20 | business <- filter_business(business) 21 | 22 | ## Food inspection data is needed for some feature calculations, e.g. the inspection date 23 | foodInspect <- readRDS("DATA/23_food_insp_features.Rds") 24 | 25 | ##============================================================================== 26 | ## CALCULATE FEATURES BASED ON BUSINESS LICENSE DATA 27 | ##============================================================================== 28 | 29 | ## Calculate min and max dates (by license) 30 | business[ , minDate := min(LICENSE_TERM_START_DATE), LICENSE_NUMBER] 31 | business[ , maxDate := max(LICENSE_TERM_EXPIRATION_DATE), LICENSE_NUMBER] 32 | 33 | ##============================================================================== 34 | ## Use only the business data that pertains to food inspections 35 | ##============================================================================== 36 | ## Create a table of matches between the food inspection and business license 37 | ## data, based on where the Inspection_Date falls within the business 38 | ## license renewal 39 | id_table_food2business <- find_bus_id_matches(business, foodInspect) 40 | geneorama::NAsummary(id_table_food2business) 41 | 42 | ## Add food key to matched business data
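## (Added note: matching an Inspection_Date into a license's [start, end] term
## is an interval join, presumably why 00_Startup.R requires data.table >=
## 1.10.0 "for foverlaps". A minimal sketch of that idiom with hypothetical
## tables `ins` and `lic`; find_bus_id_matches' actual implementation may differ:
# ins <- data.table(Inspection_ID = 1L,
#                   start = as.IDate("2015-06-01"), end = as.IDate("2015-06-01"))
# lic <- data.table(LICENSE_NUMBER = 10L,
#                   start = as.IDate("2015-01-01"), end = as.IDate("2015-12-31"))
# setkey(lic, start, end)
# foverlaps(ins, lic, by.x = c("start", "end"), type = "within")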
43 | bus_matched <- merge(x = id_table_food2business, 44 | y = business, 45 | by = "ID", 46 | all.y = FALSE, 47 | all.x = TRUE) 48 | setkey(bus_matched, Inspection_ID) 49 | 50 | ## Add business key to food data 51 | foodInspect <- merge(x = id_table_food2business, 52 | y = foodInspect, 53 | by = "Inspection_ID") 54 | setkey(foodInspect, Inspection_ID) 55 | 56 | ## Use minDate and Inspection_Date to calculate age at inspection 57 | bus_matched <- bus_matched[foodInspect[,Inspection_Date,keyby=Inspection_ID]] 58 | bus_matched[ , ageAtInspection := as.numeric(Inspection_Date - minDate) / 365] 59 | 60 | ## Remove Inspection Date to avoid name conflicts when merging later 61 | bus_matched[ , Inspection_Date := NULL] 62 | 63 | 64 | ## CALCULATE AND MERGE IN OTHER CATEGORIES 65 | OtherCategories <- GenerateOtherLicenseInfo(foodInspect, business, max_cat = 12) 66 | geneorama::NAsummary(OtherCategories) 67 | 68 | ## Merge in results 69 | bus_matched <- merge(x = bus_matched, 70 | y = OtherCategories, 71 | by = "Inspection_ID", 72 | all.x = T) 73 | ## Remove NAs in category columns and set max value to 1 74 | for (j in match(colnames(OtherCategories)[-1], colnames(bus_matched))) { 75 | set(x = bus_matched, i = which(is.na(bus_matched[[j]])), j = j, value = 0) 76 | set(x = bus_matched, j = j, value = pmin(bus_matched[[j]], 1)) 77 | } 78 | 79 | bus_matched 80 | 81 | ##============================================================================== 82 | ## SAVE RDS 83 | ##============================================================================== 84 | ## Set the key for bus_matched 85 | setkey(bus_matched, Inspection_ID) 86 | saveRDS(bus_matched, file.path("DATA/24_bus_features.Rds")) 87 | 88 | -------------------------------------------------------------------------------- /CODE/30_xgboost_model.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "xgboost", "ggplot2")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## LOAD CACHED RDS FILES 14 | ##============================================================================== 15 | food <- readRDS("DATA/23_food_insp_features.Rds") 16 | bus <- readRDS("DATA/24_bus_features.Rds") 17 | sanitarians <- readRDS("DATA/19_inspector_assignments.Rds") 18 | weather <- readRDS("DATA/17_mongo_weather_update.Rds") 19 | heat_burglary <- readRDS("DATA/22_burglary_heat.Rds") 20 | heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds") 21 | heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds") 22 | 23 | ##============================================================================== 24 | ## MERGE IN FEATURES 25 | ##============================================================================== 26 | sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian] 27 | setnames(heat_burglary, "heat_values", "heat_burglary") 28 | setnames(heat_garbage, "heat_values", "heat_garbage") 29 | setnames(heat_sanitation, "heat_values", "heat_sanitation") 30 | 31 | dat <- copy(food) 32 | dat <- dat[bus] 33 | dat <- merge(x = dat, y = sanitarians, by = "Inspection_ID") 34 | dat 
<- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date") 35 | dat <- merge(dat, na.omit(heat_burglary), by = "Inspection_ID") 36 | dat <- merge(dat, na.omit(heat_garbage), by = "Inspection_ID") 37 | dat <- merge(dat, na.omit(heat_sanitation), by = "Inspection_ID") 38 | 39 | ## Set the key for dat 40 | setkey(dat, Inspection_ID) 41 | 42 | ## Remove unnecessary data 43 | rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation) 44 | 45 | ## Only the model data should be present 46 | geneorama::lll() 47 | 48 | ##============================================================================== 49 | ## FILTER ROWS 50 | ##============================================================================== 51 | dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"] 52 | dat 53 | 54 | ##============================================================================== 55 | ## DISPLAY AVAILABLE VARIABLES 56 | ##============================================================================== 57 | geneorama::NAsummary(dat) 58 | 59 | ##============================================================================== 60 | ## Add criticalFound variable to dat: 61 | ##============================================================================== 62 | dat[ , criticalFound := pmin(1, criticalCount)] 63 | 64 | ##============================================================================== 65 | ## Calculate index for training data (last three months) 66 | ##============================================================================== 67 | dat[ , Test := Inspection_Date >= (max(Inspection_Date) - 90)] 68 | 69 | ##============================================================================== 70 | ## CREATE MODEL DATA 71 | ##============================================================================== 72 | # sort(colnames(dat)) 73 | xmat <- dat[ , list(Inspector = as.character(sanitarian), 74 | pastSerious = pmin(pastSerious, 1), 75 | pastCritical = pmin(pastCritical, 1), 76 | timeSinceLast, 77 | ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), 78 | consumption_on_premises_incidental_activity, 79 | tobacco, 80 | temperatureMax, 81 | heat_burglary = pmin(heat_burglary, 70), 82 | heat_sanitation = pmin(heat_sanitation, 70), 83 | heat_garbage = pmin(heat_garbage, 50), 84 | criticalFound), 85 | keyby = list(Inspection_ID, Test)] 86 | 87 | ## View the structure of the final xmat 88 | str(xmat) 89 | 90 | ##============================================================================== 91 | ## XGBOOST MODEL 92 | ##============================================================================== 93 | 94 | mm <- model.matrix(criticalFound ~ . 
-1, 95 | data = xmat[ , .SD, .SDcol=-key(xmat)]) 96 | 97 | iiTest <- dat$Test 98 | iiTrain <- !dat$Test 99 | 100 | 101 | # Extract train data set 102 | train <- mm[iiTrain, ] 103 | test <- mm[iiTest, ] 104 | 105 | train.target <- xmat[iiTrain, criticalFound] 106 | test.target <- xmat[iiTest, criticalFound] 107 | 108 | #table(train.target) 109 | #table(test.target) 110 | 111 | set.seed(1) 112 | 113 | # 5% of the training data is held out for validation (733+120 = 853, about 5% of the 17075-row train set) 114 | h <- c( sample(which(train.target == 0), 733), sample(which(train.target == 1), 120)) 115 | 116 | # Create validation data set based on the above rows 117 | dval <- xgb.DMatrix(data = data.matrix(train[h, ]), label = train.target[h]) 118 | dtrain <- xgb.DMatrix(data = data.matrix(train), label = train.target) 119 | 120 | watchlist <- list(val = dval, train = dtrain) 121 | 122 | # Run xgbmodel 123 | set.seed(1) 124 | xgbmodel <- xgb.train(data = dtrain, 125 | nfold = 5, 126 | eta = 0.02, 127 | max_depth = 6, 128 | nround=500, 129 | subsample = 0.75, 130 | colsample_bytree = 0.75, 131 | eval_metric = "mlogloss", 132 | objective = "multi:softprob", 133 | num_class = 2, 134 | nthread = 4, 135 | num_parallel_trees = 500, # NOTE: xgboost's parameter is `num_parallel_tree` (singular); as spelled, this argument is not recognized 136 | early_stopping_rounds = 25, 137 | watchlist = watchlist, 138 | verbose = 1, 139 | gamma = 0 140 | ) 141 | 142 | # Predict Test data set score 143 | y_pred <- predict(xgbmodel, data.matrix(test), ntree=xgbmodel$bestInd) 144 | 145 | # Extract the second column's value, which corresponds to "criticalFound = 1" 146 | y_pred <- data.frame(matrix(y_pred, byrow = TRUE, ncol = 2))[, 2] 147 | 148 | # Calculate RMSE based on predicted and actual values 149 | RMSE <- sqrt(mean((test.target-y_pred)^2)) # test1637 150 | RMSE 151 | 152 | # RMSE: 0.3590026; 446 rounds; eta = 0.04; subsample = 0.75; colsample_bytree = 0.75 - reg:linear 153 | # RMSE: 0.3742767; 3000 rounds; eta = 0.04; subsample = 0.75; colsample_bytree = 0.75 - reg:linear 154 | # RMSE: 0.3678386; 2000 rounds; eta = 0.04; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 155 | # RMSE: 0.3586859; 1200 rounds; eta = 0.025; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 156 | # RMSE: 0.3596287; 1200 rounds; eta = 0.03; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 157 | # RMSE: 0.3578677; 1200 rounds; eta = 0.02; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 158 | # RMSE: 0.3576444; 1100 rounds; eta = 0.02; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 159 | # RMSE: 0.3559322; 1100 rounds; eta = 0.02; subsample = 0.70; colsample_bytree = 0.70 - multi:softprob 160 | # RMSE: 0.3563522; 500 rounds; eta = 0.02; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 161 | # RMSE: 0.3563522; 1000 rounds; eta = 0.02; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 162 | # RMSE: 0.3792566; 1200 rounds; eta = 0.002; subsample = 0.75; colsample_bytree = 0.75 - multi:softprob 163 | 164 | ## ATTACH PREDICTIONS TO DAT 165 | y_pred_mm <- predict(xgbmodel, data.matrix(mm), ntree = xgbmodel$bestInd) 166 | 167 | 168 | 169 | dat$score <- data.frame(matrix(y_pred_mm, byrow = TRUE, ncol = 2))[, 2] 170 | 171 | ## Identify each row as test / train 172 | dat$Test <- iiTest 173 | dat$Train <- iiTrain 174 | 175 | ##============================================================================== 176 | ## SAVE RESULTS 177 | ##============================================================================== 178 | 179 | saveRDS(dat, "DATA/30_xgboost_data.Rds") 180 | saveRDS(xgbmodel, 
"DATA/30_xgboost_model.Rds") 181 | 182 | 183 | -------------------------------------------------------------------------------- /CODE/30a_glmnet_model.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries & project functions 9 | geneorama::loadinstall_libraries(c("data.table", "glmnet")) 10 | geneorama::sourceDir("CODE/functions/") 11 | 12 | ##============================================================================== 13 | ## LOAD CACHED RDS FILES 14 | ##============================================================================== 15 | food <- readRDS("DATA/23_food_insp_features.Rds") 16 | bus <- readRDS("DATA/24_bus_features.Rds") 17 | sanitarians <- readRDS("DATA/19_inspector_assignments.Rds") 18 | weather <- readRDS("DATA/17_mongo_weather_update.Rds") 19 | heat_burglary <- readRDS("DATA/22_burglary_heat.Rds") 20 | heat_garbage <- readRDS("DATA/22_garbageCarts_heat.Rds") 21 | heat_sanitation <- readRDS("DATA/22_sanitationComplaints_heat.Rds") 22 | 23 | ##============================================================================== 24 | ## MERGE IN FEATURES 25 | ##============================================================================== 26 | sanitarians <- sanitarians[,list(Inspection_ID=inspectionID), keyby=sanitarian] 27 | setnames(heat_burglary, "heat_values", "heat_burglary") 28 | setnames(heat_garbage, "heat_values", "heat_garbage") 29 | setnames(heat_sanitation, "heat_values", "heat_sanitation") 30 | 31 | dat <- copy(food) 32 | dat <- dat[bus] 33 | dat <- merge(x = dat, y = sanitarians, by = "Inspection_ID") 34 | dat <- merge(x = dat, y = weather_3day_calc(weather), by = "Inspection_Date") 35 | dat <- merge(dat, na.omit(heat_burglary), by = "Inspection_ID") 36 | dat <- merge(dat, na.omit(heat_garbage), by = "Inspection_ID") 37 | dat <- merge(dat, na.omit(heat_sanitation), by = "Inspection_ID") 38 | 39 | ## Set the key for dat 40 | setkey(dat, Inspection_ID) 41 | 42 | ## Remove unnecessary data 43 | rm(food, bus, sanitarians, weather, heat_burglary, heat_garbage, heat_sanitation) 44 | 45 | ## Only the model data should be present 46 | geneorama::lll() 47 | 48 | ##============================================================================== 49 | ## FILTER ROWS 50 | ##============================================================================== 51 | dat <- dat[LICENSE_DESCRIPTION=="Retail Food Establishment"] 52 | dat 53 | 54 | ##============================================================================== 55 | ## DISPLAY AVAILABLE VARIABLES 56 | ##============================================================================== 57 | geneorama::NAsummary(dat) 58 | 59 | ##============================================================================== 60 | ## Add criticalFound variable to dat: 61 | ##============================================================================== 62 | dat[ , criticalFound := pmin(1, criticalCount)] 63 | 64 | ##============================================================================== 65 | ## Calculate index for training data (last three months) 66 | ##============================================================================== 67 | dat[ , Test := Inspection_Date >= (max(Inspection_Date) - 90)] 68 | 69 | 
##============================================================================== 70 | ## CREATE MODEL DATA 71 | ##============================================================================== 72 | # sort(colnames(dat)) 73 | xmat <- dat[ , list(Inspector = as.character(sanitarian), 74 | pastSerious = pmin(pastSerious, 1), 75 | pastCritical = pmin(pastCritical, 1), 76 | timeSinceLast, 77 | ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), 78 | consumption_on_premises_incidental_activity, 79 | tobacco, 80 | temperatureMax, 81 | heat_burglary = pmin(heat_burglary, 70), 82 | heat_sanitation = pmin(heat_sanitation, 70), 83 | heat_garbage = pmin(heat_garbage, 50), 84 | criticalFound), 85 | keyby = list(Inspection_ID, Test)] 86 | 87 | ## View the structure of the final xmat 88 | str(xmat) 89 | 90 | ##============================================================================== 91 | ## GLMNET MODEL 92 | ##============================================================================== 93 | ## Construct model matrix without the key values in xmat 94 | mm <- model.matrix(criticalFound ~ . -1, 95 | data = xmat[ , .SD, .SDcol=-key(xmat)]) 96 | str(mm) 97 | colnames(mm) 98 | 99 | # fit ridge regression, alpha = 0, only inspector coefficients penalized 100 | penalty <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0) 101 | 102 | ## PRODUCTION version of the model which is not split for test / train 103 | model <- cv.glmnet(x = mm, 104 | y = xmat[ , criticalFound], 105 | family = "binomial", 106 | alpha = 0, 107 | penalty.factor = penalty) 108 | 109 | ## EVALUATION version of the model which is only fit on the training data 110 | model_eval <- cv.glmnet(x = mm[xmat$Test==FALSE, ], 111 | y = xmat[Test == FALSE , criticalFound], 112 | family = "binomial", 113 | alpha = 0, 114 | penalty.factor = penalty) 115 | 116 | ## Lambda 117 | model$lambda 118 | model$lambda.min 119 | 120 | ## Attach predictions for top lambda choice to the data 121 | dat$glm_pred <- predict(model$glmnet.fit, 122 | newx = mm, 123 | s = model$lambda.min, 124 | type = "response")[,1] 125 | dat$glm_pred_test <- predict(model_eval$glmnet.fit, 126 | newx = mm, 127 | s = model_eval$lambda.min, 128 | type = "response")[,1] 129 | 130 | ## Coefficients 131 | coef <- coef(model)[,1] 132 | inspCoef <- coef[grepl("^Inspector",names(coef))] 133 | inspCoef <- inspCoef[order(-inspCoef)] 134 | 135 | ## Save Results 136 | saveRDS(dat, "DATA/30_dat.Rds") 137 | saveRDS(xmat, "DATA/30_xmat.Rds") 138 | saveRDS(mm, "DATA/30_modelmatrix.Rds") 139 | saveRDS(coef, "DATA/30_coef.Rds") 140 | saveRDS(inspCoef, "DATA/30_inspCoef.Rds") 141 | saveRDS(model, "DATA/30_model.Rds") 142 | saveRDS(model_eval, "DATA/30_model_eval.Rds") 143 | -------------------------------------------------------------------------------- /CODE/30b_glmnet_model_evaluation.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries that are used 9 | geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "ROCR")) 10 | ## Load custom functions 11 | geneorama::sourceDir("CODE/functions/") 12 | 13 | ##============================================================================== 14 | ## LOAD CACHED RDS FILES 15 | 
##============================================================================== 16 | dat <- readRDS("DATA/30_dat.Rds") 17 | cvfit <- readRDS("DATA/30_model_eval.Rds") 18 | mm <- readRDS("DATA/30_modelmatrix.Rds") 19 | 20 | str(cvfit) 21 | str(cvfit$glmnet.fit$beta) 22 | 23 | ## Calculate scores for all lambda values 24 | allscores <- predict(cvfit$glmnet.fit, 25 | newx = as.matrix(mm), 26 | s = cvfit$glmnet.fit$lambda, 27 | type = "response") 28 | allscores <- as.data.table(allscores) 29 | setnames(allscores, cvfit$glmnet.fit$beta@Dimnames[[2]]) 30 | 31 | ## Identify each row as test / train 32 | allscores$Test <- dat$Test 33 | allscores$Train <- !dat$Test 34 | 35 | ##============================================================================== 36 | ## GLMNET specific diagnostics 37 | ##============================================================================== 38 | 39 | ## Find the index of the lambda that is closest to the optimal lambda of cvfit 40 | ## Because of issues with numerical precision on different platforms, it's a 41 | ## good idea to find the index of the lambda, rather than rely on matching the 42 | ## actual values. 43 | ## This result should be zero, but it might not be exactly zero on all platforms 44 | ## cvfit$glmnet.fit$lambda[iLambda] - cvfit$lambda.min 45 | cat("The lambda that minimizes error in cvfit is:", cvfit$lambda.min, "\n") 46 | iLambda <- which.min((cvfit$glmnet.fit$lambda - cvfit$lambda.min)^2) 47 | 48 | ## Print the coefficients 49 | coef <- cvfit$glmnet.fit$beta[,iLambda] 50 | coef 51 | 52 | ## Performance for different values of lambda: 53 | plot(cvfit$cvm ~ log(cvfit$lambda)) 54 | 55 | ## Note, this is equivalent to the log likelihood results 56 | loglik_errors <- sapply(1:100, 57 | function(i) { 58 | logLik(p = allscores[Train==TRUE][[i]], 59 | y = dat[Test==FALSE, criticalFound]) 60 | }) 61 | plot(loglik_errors ~ log(cvfit$glmnet.fit$lambda)) 62 | lines(x = log(cvfit$glmnet.fit$lambda), 63 | y = (1 - cvfit$glmnet.fit$dev.ratio) * cvfit$glmnet.fit$nulldev / 2, 64 | col = "blue") 65 | 66 | ## Evolution of coefficients as penalty changes 67 | plot(cvfit$glmnet.fit, label = TRUE) 68 | plot(cvfit$glmnet.fit, xvar = "dev", label = TRUE) 69 | 70 | 71 | ##============================================================================== 72 | ## Gini and confusion matrix calculations 73 | ##============================================================================== 74 | 75 | # Show gini performance of inspector model on tune data set 76 | dat[Test==FALSE, gini(glm_pred, criticalFound, plot=TRUE)] 77 | dat[Test==TRUE, gini(glm_pred, criticalFound, plot=TRUE)] 78 | 79 | ## Calculate confusion matrix values for evaluation 80 | calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], 81 | expected = dat[Test==TRUE, glm_pred_test], 82 | r = .25) 83 | 84 | ## Calculate matrix of confusion matrix values for evaluation 85 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 86 | calculate_confusion_values, 87 | actual = dat[Test==TRUE, criticalFound], 88 | expected = dat[Test==TRUE, glm_pred_test])) 89 | confusion_values_test <- as.data.table(confusion_values_test) 90 | confusion_values_test 91 | 92 | 93 | ggplot(melt(confusion_values_test, id.vars="r")) + 94 | aes(x=r, y=value, colour=variable) + geom_line() + 95 | geom_hline(yintercept = c(0,1)) 96 | ##============================================================================== 97 | ## MODEL EVALUATION 98 | ## - TIME SAVINGS 99 | ## - PERIOD A vs PERIOD B 100 | 
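## (Added note: the Period A/B comparison below splits the simulated schedule
## into halves; if ordering inspections by model score front-loads risk, the
## first half (Period A) should capture a larger share of the critical
## violations than it did under the historical schedule.)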
##============================================================================== 101 | ## Subset of just observations during test period: 102 | datTest <- dat[Test == TRUE] 103 | 104 | ## Mean time savings: 105 | datTest[ , simulated_date_diff_mean(Inspection_Date, glm_pred_test, criticalFound)] 106 | 107 | ## Detailed time savings: 108 | bins <- datTest[ , simulated_bin_summary(Inspection_Date, glm_pred_test, criticalFound)] 109 | bins 110 | 111 | ## This calculation is the weighted average date difference, which should match 112 | ## the previous result from `simulated_bin_summary` 113 | bins[ , sum(as.integer(date) * POS) / sum(POS)] - 114 | bins[ , sum(as.integer(date) * POS_SIM) / sum(POS_SIM)] 115 | 116 | ## Find the midpoint of the inspections to divide bins into period A & B 117 | mid <- bins[ , sum(N)/2] 118 | 119 | ## Divide the bins into period A & B based on midpoint 120 | ## Note: GT and LT are strict, so ties would be excluded. Although there are no 121 | ## ties for now (as of 2015 publication). 122 | binsA <- bins[NTOT < mid] 123 | binsB <- bins[NTOT > mid] 124 | 125 | tot_crit <- sum(bins$POS) 126 | 127 | binsA[ , sum(POS)/tot_crit] ## [1] 0.5465116 128 | binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 129 | 130 | 131 | ##============================================================================== 132 | ## Metrics with ROCR Package 133 | ##============================================================================== 134 | 135 | ## Example with random values: 136 | # predTest <- prediction(datTest[ ,list(glm_pred_test, runif(.N) )], 137 | # datTest[ ,list(criticalFound, criticalFound)]) 138 | 139 | predTest <- prediction(datTest$glm_pred_test, datTest$criticalFound) 140 | 141 | ## precision / recall 142 | plot(performance(predTest, "prec", "rec"), main="precision recall") 143 | 144 | # ROC 145 | plot(performance(predTest, "tpr", "fpr"), main="ROC") 146 | abline(0, 1, lty=2) 147 | 148 | ## sensitivity / specificity 149 | plot(performance(predTest, "sens", "spec"), main="sensitivity vs specificity") 150 | abline(1, -1, lty=2) 151 | 152 | ## phi 153 | plot(performance(predTest, "phi"), main="phi scores") 154 | 155 | ## Fancy ROC curve: 156 | op <- par(bg="lightgray", mai=c(1.2,1.5,1,1)) 157 | plot(performance(predTest,"tpr","fpr"), 158 | main="ROC Curve", colorize=TRUE, lwd=10) 159 | par(op) 160 | 161 | ## Effect of using a cost function on cutoffs 162 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 1), 163 | main="Even costs (FP=1 FN=1)") 164 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 4), 165 | main="Higher cost for FN (FP=1 FN=4)") 166 | 167 | ## Accuracy 168 | plot(performance(predTest, measure = "acc")) 169 | 170 | ## AUC 171 | performance(predTest, measure = "auc")@y.values[[1]] 172 | 173 | -------------------------------------------------------------------------------- /CODE/31_xgboost_model_evaluation.R: -------------------------------------------------------------------------------- 1 | 2 | ##============================================================================== 3 | ## INITIALIZE 4 | ##============================================================================== 5 | if(interactive()){ 6 | ## Remove all objects; perform garbage collection 7 | rm(list=ls()) 8 | gc(reset=TRUE) 9 | ## Detach libraries that are not used 10 | geneorama::detach_nonstandard_packages() 11 | } 12 | ## Load libraries that are used 13 | geneorama::loadinstall_libraries(c("data.table", "ggplot2", "ROCR")) 14 | ## Load custom functions
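## (Added note: sourceDir is assumed to source each helper under
## CODE/functions/, e.g. gini, calculate_confusion_values,
## simulated_date_diff_mean, and simulated_bin_summary, all used below.)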
15 | geneorama::sourceDir("CODE/functions/") 16 | 17 | ##============================================================================== 18 | ## LOAD CACHED RDS FILES 19 | ##============================================================================== 20 | dat <- readRDS("DATA/30_xgboost_data.Rds") 21 | model <- readRDS("DATA/30_xgboost_model.Rds") 22 | 23 | ##============================================================================== 24 | ## XGBoost specific diagnostics 25 | ##============================================================================== 26 | 27 | 28 | ##============================================================================== 29 | ## Gini and confusion matrix calculations 30 | ##============================================================================== 31 | 32 | ## Show gini performance of the model on the train and test data sets 33 | 34 | dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)] 35 | dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)] 36 | 37 | ## Calculate confusion matrix values for evaluation 38 | calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], 39 | expected = dat[Test==TRUE, score], 40 | r = .25) 41 | 42 | ### XGBoost 43 | # r true_pos true_neg false_neg false_pos 44 | # 0.25000000 0.07819181 0.65180208 0.07941356 0.19059255 45 | 46 | 47 | ### Random Forest 48 | # r true_pos true_neg false_neg false_pos 49 | #0.25000000 0.05497862 0.72632865 0.10262676 0.11606597 50 | 51 | 52 | ## Calculate matrix of confusion matrix values for evaluation 53 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 54 | calculate_confusion_values, 55 | actual = dat[Test==TRUE, criticalFound], 56 | expected = dat[Test==TRUE, score])) 57 | confusion_values_test <- as.data.table(confusion_values_test) 58 | confusion_values_test 59 | 60 | 61 | ggplot(melt(confusion_values_test, id.vars="r")) + 62 | aes(x=r, y=value, colour=variable) + geom_line() + 63 | geom_hline(yintercept = c(0,1)) 64 | 65 | ##============================================================================== 66 | ## MODEL EVALUATION 67 | ## - TIME SAVINGS 68 | ## - PERIOD A vs PERIOD B 69 | ##============================================================================== 70 | ## Subset of just observations during test period: 71 | datTest <- dat[Test == TRUE] 72 | 73 | ## Mean time savings: 74 | datTest[ , simulated_date_diff_mean(Inspection_Date, score, criticalFound)] 75 | 76 | ## Detailed time savings: 77 | bins <- datTest[ , simulated_bin_summary(Inspection_Date, score, criticalFound)] 78 | bins 79 | 80 | ## This calculation is the weighted average date difference, which should match 81 | ## the previous result from `simulated_bin_summary` 82 | bins[ , sum(as.integer(date) * POS) / sum(POS)] - 83 | bins[ , sum(as.integer(date) * POS_SIM) / sum(POS_SIM)] 84 | 85 | ## Find the midpoint of the inspections to divide bins into period A & B 86 | mid <- bins[ , sum(N)/2] 87 | 88 | ## Divide the bins into period A & B based on midpoint 89 | ## Note: GT and LT are strict, so ties would be excluded. Although there are no 90 | ## ties for now (as of 2015 publication). 
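## (Added worked example: the difference of weighted averages above is the mean
## detection speed-up. If historical critical violations were found on
## simulated-calendar day 50 on average, and the re-ordered schedule finds them
## on day 42.2, the difference is about 7.8 days; cf. the 7.790698-day XGBoost
## figure noted at the end of this script.)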
91 | binsA <- bins[NTOT < mid] 92 | binsB <- bins[NTOT > mid] 93 | 94 | tot_crit <- sum(bins$POS) 95 | 96 | binsA[ , sum(POS)/tot_crit] ## [1] 0.5465116 97 | binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 98 | 99 | 100 | ##============================================================================== 101 | ## Metrics with ROCR Package 102 | ##============================================================================== 103 | 104 | ## Example with random values: 105 | # predTest <- prediction(datTest[ ,list(score, runif(.N) )], 106 | # datTest[ ,list(criticalFound, criticalFound)]) 107 | 108 | predTest <- prediction(datTest$score, datTest$criticalFound) 109 | 110 | ## precision / recall 111 | plot(performance(predTest, "prec", "rec"), main="precision recall") 112 | 113 | # ROC 114 | plot(performance(predTest, "tpr", "fpr"), main="ROC") 115 | abline(0, 1, lty=2) 116 | 117 | ## sensitivity / specificity 118 | plot(performance(predTest, "sens", "spec"), main="sensitivity vs specificity") 119 | abline(1, -1, lty=2) 120 | 121 | ## phi 122 | plot(performance(predTest, "phi"), main="phi scores") 123 | 124 | ## Fancy ROC curve: 125 | op <- par(bg="lightgray", mai=c(1.2,1.5,1,1)) 126 | plot(performance(predTest,"tpr","fpr"), 127 | main="ROC Curve", colorize=TRUE, lwd=3) 128 | par(op) 129 | 130 | ## Effect of using a cost function on cutoffs 131 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 1), 132 | main="Even costs (FP=1 FN=1)") 133 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 4), 134 | main="Higher cost for FN (FP=1 FN=4)") 135 | 136 | ## Accuracy 137 | plot(performance(predTest, measure = "acc")) 138 | 139 | ## AUC 140 | performance(predTest, measure = "auc")@y.values[[1]] 141 | # 0.6790282 - AUC: XGBoost 142 | # 0.6577989 - AUC: Random Forest 143 | 144 | # No. of days to discover restaurants with critical violations 145 | # Random Forest: Time difference of 6.554264 days 146 | # XGBoost: Time difference of 7.790698 days 147 | 148 | # Random Forest Output 149 | # Show gini performance of the model on the train and test data sets 150 | dat[Train==TRUE, gini(score, criticalFound, plot=TRUE)] 151 | # 0.7674537 Gini Score: XGBoost 152 | # 0.9956746 Gini Score: RandomForest 153 | 154 | 155 | dat[Test==TRUE, gini(score, criticalFound, plot=TRUE)] 156 | # 0.358062 Gini Score: XGBoost 157 | # 0.3159969 Gini Score: RandomForest 158 | 159 | 160 | -------------------------------------------------------------------------------- /CODE/31a_random_forest_model.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries that are used 9 | geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2")) 10 | ## Load custom functions 11 | geneorama::sourceDir("CODE/functions/") 12 | 13 | ##============================================================================== 14 | ## LOAD CACHED RDS FILES 15 | ##============================================================================== 16 | dat <- readRDS("DATA/30_dat.Rds") 17 | mm <- readRDS("DATA/30_modelmatrix.Rds") 18 | xmat <- readRDS("DATA/30_xmat.Rds") 19 | 20 | ##============================================================================== 21 | ## RANDOM FOREST MODEL
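## (Added note: y is passed as a factor, so randomForest fits a classification
## forest with its default of 500 trees; like the glmnet evaluation model, it
## is fit only on the Test == FALSE rows.)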
22 | ##============================================================================== 23 | model <- randomForest(x = mm[xmat$Test==FALSE, ], 24 | y = as.factor(xmat[xmat$Test==FALSE, criticalFound]), 25 | importance=TRUE) 26 | 27 | ## ATTACH PREDICTIONS TO DAT 28 | dat$rf_pred_test <- predict(model, 29 | as.matrix(mm), 30 | type="prob")[ , 2] 31 | 32 | ##============================================================================== 33 | ## SAVE RESULTS 34 | ##============================================================================== 35 | 36 | saveRDS(dat, "DATA/31_random_forest_data.Rds") 37 | saveRDS(model, "DATA/31_random_forest_model.Rds") 38 | 39 | 40 | 41 | -------------------------------------------------------------------------------- /CODE/31b_random_forest_evaluation.R: -------------------------------------------------------------------------------- 1 | ##============================================================================== 2 | ## INITIALIZE 3 | ##============================================================================== 4 | ## Remove all objects; perform garbage collection 5 | rm(list=ls()) 6 | gc(reset=TRUE) 7 | 8 | ## Load libraries that are used 9 | geneorama::loadinstall_libraries(c("data.table", "randomForest", "ggplot2", "ROCR")) 10 | ## Load custom functions 11 | geneorama::sourceDir("CODE/functions/") 12 | 13 | ##============================================================================== 14 | ## LOAD CACHED RDS FILES 15 | ##============================================================================== 16 | dat <- readRDS("DATA/31_random_forest_data.Rds") 17 | model <- readRDS("DATA/31_random_forest_model.Rds") 18 | 19 | ##============================================================================== 20 | ## Random Forest specific diagnostics 21 | ##============================================================================== 22 | varImpPlot(model) 23 | importance(model) 24 | 25 | ##============================================================================== 26 | ## Gini and confusion matrix calculations 27 | ##============================================================================== 28 | 29 | ## Show gini performance of the model on the train and test data sets 30 | 31 | dat[Test==FALSE, gini(rf_pred_test, criticalFound, plot=TRUE)] 32 | dat[Test==TRUE, gini(rf_pred_test, criticalFound, plot=TRUE)] 33 | 34 | ## Calculate confusion matrix values for evaluation 35 | calculate_confusion_values(actual = dat[Test==TRUE, criticalFound], 36 | expected = dat[Test==TRUE, rf_pred_test], 37 | r = .25) 38 | 39 | ## Calculate matrix of confusion matrix values for evaluation 40 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 41 | calculate_confusion_values, 42 | actual = dat[Test==TRUE, criticalFound], 43 | expected = dat[Test==TRUE, rf_pred_test])) 44 | confusion_values_test <- as.data.table(confusion_values_test) 45 | confusion_values_test 46 | 47 | 48 | ggplot(melt(confusion_values_test, id.vars="r")) + 49 | aes(x=r, y=value, colour=variable) + geom_line() + 50 | geom_hline(yintercept = c(0,1)) 51 | 52 | ##============================================================================== 53 | ## MODEL EVALUATION 54 | ## - TIME SAVINGS 55 | ## - PERIOD A vs PERIOD B 56 | ##============================================================================== 57 | ## Subset of just observations during test period: 58 | datTest <- dat[Test == TRUE] 59 | 60 | ## Mean time savings: 61 | datTest[ , 
simulated_date_diff_mean(Inspection_Date, rf_pred_test, criticalFound)] 62 | 63 | ## Detailed time savings: 64 | bins <- datTest[ , simulated_bin_summary(Inspection_Date, rf_pred_test, criticalFound)] 65 | bins 66 | 67 | ## This calculation is the weighted average date difference, which should match 68 | ## the previous result from `simulated_bin_summary` 69 | bins[ , sum(as.integer(date) * POS) / sum(POS)] - 70 | bins[ , sum(as.integer(date) * POS_SIM) / sum(POS_SIM)] 71 | 72 | ## Find the midpoint of the inspections to divide bins into period A & B 73 | mid <- bins[ , sum(N)/2] 74 | 75 | ## Divide the bins into period A & B based on midpoint 76 | ## Note: GT and LT are strict, so ties would be excluded. There are no 77 | ## ties for now (as of the 2015 publication). 78 | binsA <- bins[NTOT < mid] 79 | binsB <- bins[NTOT > mid] 80 | 81 | tot_crit <- sum(bins$POS) 82 | 83 | ## THE COMMENT VALUES ARE FROM THE GLMNET RUN 84 | ## HOW CAN THESE BE THE SAME??? 85 | binsA[ , sum(POS)/tot_crit] ## [1] 0.5465116 86 | binsA[ , sum(POS_SIM)/tot_crit] ## [1] 0.6821705 87 | 88 | 89 | ##============================================================================== 90 | ## Metrics with ROCR Package 91 | ##============================================================================== 92 | 93 | ## Example with random values: 94 | # predTest <- prediction(datTest[ ,list(rf_pred_test, runif(.N) )], 95 | # datTest[ ,list(criticalFound, criticalFound)]) 96 | 97 | predTest <- prediction(datTest$rf_pred_test, datTest$criticalFound) 98 | 99 | ## precision / recall 100 | plot(performance(predTest, "prec", "rec"), main="precision recall") 101 | 102 | # ROC 103 | plot(performance(predTest, "tpr", "fpr"), main="ROC") 104 | abline(0, 1, lty=2) 105 | 106 | ## sensitivity / specificity 107 | plot(performance(predTest, "sens", "spec"), main="sensitivity vs specificity") 108 | abline(1, -1, lty=2) 109 | 110 | ## phi 111 | plot(performance(predTest, "phi"), main="phi scores") 112 | 113 | ## Fancy ROC curve: 114 | op <- par(bg="lightgray", mai=c(1.2,1.5,1,1)) 115 | plot(performance(predTest,"tpr","fpr"), 116 | main="ROC Curve", colorize=TRUE, lwd=10) 117 | par(op) 118 | 119 | ## Effect of using a cost function on cutoffs 120 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 1), 121 | main="Even costs (FP=1 FN=1)") 122 | plot(performance(predTest, "cost", cost.fp = 1, cost.fn = 4), 123 | main="Higher cost for FN (FP=1 FN=4)") 124 | 125 | ## Accuracy 126 | plot(performance(predTest, measure = "acc")) 127 | 128 | ## AUC 129 | performance(predTest, measure = "auc")@y.values[[1]] 130 | -------------------------------------------------------------------------------- /CODE/README.md: -------------------------------------------------------------------------------- 1 | # Files 2 | 3 | - ```00_Startup.R``` Downloads the necessary packages required to step 4 | through the rest of the R scripts 5 | - ```21_calculate_violation_matrix.R``` Performs matrix calculations 6 | on types of violations in the inspections data. This step is 7 | performed in a separate script as it takes some time. 8 | - ```22_calculate_heat_map_values.R``` Calculates heat maps for 9 | garbage, crime (burglary) and sanitation complaints data.
10 | - ```23_generate_model_dat.R``` Filters primary datasets, creates a 11 | basis for the model, creates features based on various data 12 | sets, attaches heat maps, inspectors and weather, and performs 13 | requisite merges to the basis model 14 | - ```30_glmnet_model.R``` The pre-calculated output from previous 15 | scripts is imported and used in the model built, trained and tested 16 | in this script. The main data set is indexed by time, and past data 17 | is used to independently build the model. The model is then applied 18 | to test data. Finally, this script also includes the necessary code to 19 | evaluate the effectiveness of the City of Chicago’s data-driven food 20 | inspections pilot. 21 | -------------------------------------------------------------------------------- /CODE/functions/GenerateOtherLicenseInfo.R: -------------------------------------------------------------------------------- 1 | 2 | GenerateOtherLicenseInfo <- function(inspection_data, 3 | business_data, 4 | max_cat = 99){ 5 | 6 | ## For debugging: 7 | # inspection_data <- copy(foodInspect) 8 | # business_data <- copy(business) 9 | 10 | ## MAKE DATA COPIES AND SET KEY 11 | food_licenses <- inspection_data[i = TRUE, 12 | j = list(d_insp = Inspection_Date, 13 | id = Inspection_ID), 14 | keyby = list(license_food = License)] 15 | biz <- business_data[i = TRUE, 16 | j = list(dba = DOING_BUSINESS_AS_NAME, 17 | addr = ADDRESS, 18 | d_start = LICENSE_TERM_START_DATE, 19 | d_end = LICENSE_TERM_EXPIRATION_DATE, 20 | desc = LICENSE_DESCRIPTION), 21 | keyby = list(license = LICENSE_NUMBER)] 22 | 23 | ## JOIN BUSINESS NAMES TO FOOD DATA 24 | food_licenses_names <- biz[food_licenses, mult="first"] 25 | food_licenses_names <- food_licenses_names[i = TRUE, 26 | j = list(license_insp = license, id), 27 | keyby = list(dba, addr, d_insp)] 28 | 29 | ## SUBSET OF BUSINESSES THAT ARE NOT LICENSED WITHIN THE FOOD DATABASE 30 | biz_nomatch <- biz[!(license %in% food_licenses_names$license_insp)] 31 | 32 | ## SET KEY FOR biz_nomatch TO ENABLE MATCHING IN food_licenses_names 33 | setkey(biz_nomatch, dba, addr, d_end) 34 | 35 | ## MAKE A COPY OF THE INSPECTION DATE, WHICH GETS OVERWRITTEN BY THE END DATE IN THE ROLLING JOIN 36 | food_licenses_names[ , d_insp_copy := d_insp] 37 | 38 | ## CREATE TABLE OF LICENSE DESCRIPTIONS 39 | tab <- food_licenses_names[biz_nomatch, roll=Inf] 40 | tab <- tab[!is.na(id)] 41 | tab <- tab[ , .N, keyby = list(dba, addr, d_insp=d_insp_copy, desc)] 42 | tab <- dcast.data.table(data = tab, 43 | formula = dba + addr + d_insp ~ desc, 44 | value.var = "N", 45 | fill = 0L) 46 | 47 | ## SUMMARIZE TOTALS FOR EACH CATEGORY 48 | category_totals <- as.data.table(sapply(tab[,4:ncol(tab), with=F], sum), 49 | keep.rownames = TRUE)[order(-V2)] 50 | setnames(category_totals, c("cat", "N")) 51 | ## LIMIT CATEGORY COLUMNS 52 | categories_keep <- category_totals[1:max(min(max_cat, nrow(category_totals)-3), 1), 53 | cat] 54 | tab_final <- tab[,c("dba", "addr", "d_insp", categories_keep), with=F] 55 | 56 | setkey(food_licenses_names , dba, addr, d_insp) 57 | tab_final[food_licenses_names] 58 | 59 | ## MERGE RESULTS BACK AND ONLY KEEP INSPECTION ID AS KEY 60 | ret <- food_licenses_names[tab_final][,c("id", categories_keep), with=F] 61 | setnames(ret, "id", "Inspection_ID") 62 | setnames(ret, colnames(ret)[-1], tolower(colnames(ret)[-1])) 63 | setnames(ret, colnames(ret)[-1], gsub("[[:punct:]]+", "", colnames(ret)[-1])) 64 | setnames(ret, colnames(ret)[-1], gsub("[ ]+", "_", colnames(ret)[-1])) 65 | 66 | setkey(ret, Inspection_ID) 67
| return(ret) 68 | } 69 | 70 | 71 | -------------------------------------------------------------------------------- /CODE/functions/calculate_confusion_values.R: -------------------------------------------------------------------------------- 1 | 2 | calculate_confusion_values <- function(actual, expected, r){ 3 | res <- expected > r 4 | true_pos <- res & actual 5 | true_neg <- !res & !actual 6 | false_pos <- res & !actual 7 | false_neg <- !res & actual 8 | 9 | result <- c(r = r, 10 | true_pos = sum(true_pos) / length(res), 11 | true_neg = sum(true_neg) / length(res), 12 | false_neg = sum(false_neg) / length(res), 13 | false_pos = sum(false_pos) / length(res)) 14 | return(result) 15 | } 16 | 17 | -------------------------------------------------------------------------------- /CODE/functions/calculate_heat_values.R: -------------------------------------------------------------------------------- 1 | 2 | ## The following fields must be present: 3 | ## inspections$Inspection_ID 4 | ## inspections$Inspection_Date 5 | ## inspections$Latitude 6 | ## inspections$Longitude 7 | ## observed_values$Latitude 8 | ## observed_values$Longitude 9 | ## observed_values$Date 10 | 11 | calculate_heat_values <- function(inspections, 12 | observed_values, 13 | window = 90, 14 | page_limit = 500, 15 | verbose = TRUE){ 16 | require(data.table) 17 | 18 | obs_cols <- c("Date", "Latitude", "Longitude") 19 | insp_cols <- c("Inspection_ID", "Inspection_Date", "Latitude", "Longitude") 20 | 21 | ## Check for required columns 22 | if(!all(obs_cols %in% colnames(observed_values))) { 23 | stop(paste0("observed_values is missing one of these columns: \n", 24 | " 'Latitude', 'Longitude', 'Date'")) 25 | } 26 | if(!all(insp_cols %in% colnames(inspections))) { 27 | stop(paste0("inspections is missing one of these columns: \n", 28 | " 'Inspection_ID', 'Inspection_Date', 'Latitude', 'Longitude'")) 29 | } 30 | 31 | ## Subset 32 | inspections <- inspections[ , insp_cols, with=F] 33 | observed_values <- observed_values[ , obs_cols, with=F] 34 | 35 | ## Filter out NA values 36 | observed_values <- na.omit(observed_values) 37 | 38 | ## Create index values for pages 39 | N <- nrow(inspections) 40 | START_ROWS <- seq(1, N, page_limit) 41 | END_ROWS <- c(seq(1, N, page_limit)[-1] - 1, N) 42 | II <- mapply(`:`, START_ROWS, END_ROWS) 43 | 44 | ret <- rbindlist(lapply(II, function(ii) { 45 | if(verbose){ 46 | print(paste(sys.call()[2], "out of", length(II))) 47 | } 48 | foverlaps( 49 | x = inspections[i = ii, 50 | j = list(Inspection_ID, 51 | Latitude, 52 | Longitude), 53 | keyby = list(start = Inspection_Date - window, 54 | end = Inspection_Date)], 55 | y = observed_values[i = TRUE, 56 | j = list(Latitude, Longitude), 57 | keyby = list(start = Date, end = Date)], 58 | type = "any")[ , kde(new=c(i.Latitude[1], i.Longitude[1]), 59 | x = Latitude, 60 | y = Longitude, 61 | h = c(.01, .01)), 62 | keyby = Inspection_ID]})) 63 | setkey(ret, Inspection_ID) 64 | setnames(ret, "V1", "heat_values") 65 | return(ret) 66 | } 67 | 68 | -------------------------------------------------------------------------------- /CODE/functions/calculate_violation_matrix.R: -------------------------------------------------------------------------------- 1 | 2 | ## Requires a vector of violations in text format, 3 | ## where each element is a collapsed list of violations 4 | ## separated by | 5 | 6 | calculate_violation_matrix <- function(violation_text){ 7 | 8 | require(data.table) 9 | 10 | ## Tabulate violation types 11 | ## 1) Split violation description by "|" 12 | ## 2) use regex to
extract leading digits of code number 13 | ## 3) create indicator matrix of code violations 14 | vio <- strsplit(violation_text,"| ",fixed=T) 15 | vio_nums <- lapply(vio, 16 | function(item) regmatches(x = item, 17 | m = gregexpr(pattern = "^[0-9]+", 18 | text = item))) 19 | vio_mat <- geneorama::list2matrix(vio_nums, count = T) 20 | vio_mat <- vio_mat[ , order(as.numeric(colnames(vio_mat)))] 21 | # colnames(vio_mat) 22 | # range(vio_mat) 23 | 24 | return(vio_mat) 25 | } 26 | 27 | -------------------------------------------------------------------------------- /CODE/functions/calculate_violation_types.R: -------------------------------------------------------------------------------- 1 | 2 | ## Requirements for calculate_violation_types: 3 | ## 1 Requires a matrix (within a keyed data.table) of violation 4 | ## indicators where each row contains 0 or 1 to indicate the presence 5 | ## of a violation. There should be columns 1:44, named "1" to "44" 6 | ## (plus a key column) corresponding to the 44 violation types found in 7 | ## the data. 8 | ## 9 | ## Uses the function calculate_violation_matrix 10 | ## to calculate intermediate result (indicator matrix) 11 | ## 12 | 13 | calculate_violation_types <- function(violation_mat){ 14 | 15 | require(data.table) 16 | 17 | ## Check that violation_mat is a data.table 18 | if(!inherits(x = violation_mat, what = "data.table")){ 19 | stop("violation_mat should be a data.table, and have a defined key") 20 | } 21 | 22 | ## Check for the key 23 | if(length(key(violation_mat)) == 0) { 24 | stop("The violation matrix should have a defined key") 25 | } 26 | 27 | vio_mat <- as.matrix(nokey(violation_mat)) 28 | vio_key <- violation_mat[ , key(violation_mat), with = FALSE] 29 | 30 | ## Tabulate violation types 31 | ## use apply to total up each group of code violations 32 | criticalCount <- apply(vio_mat[ , colnames(vio_mat) %in% 1:14], 1, sum) 33 | seriousCount <- apply(vio_mat[ , colnames(vio_mat) %in% 15:29], 1, sum) 34 | minorCount <- apply(vio_mat[ , colnames(vio_mat) %in% 30:44], 1, sum) 35 | 36 | ## Construct return values 37 | ret <- data.table(criticalCount, 38 | seriousCount, 39 | minorCount, 40 | vio_key, 41 | key = key(violation_mat)) 42 | return(ret) 43 | } 44 | -------------------------------------------------------------------------------- /CODE/functions/categorize.R: -------------------------------------------------------------------------------- 1 | 2 | #' 3 | #' @name categorize 4 | #' @param primary: a named list of the categories to convert 5 | #' @param other: A single character name for "other", default value is "Other" 6 | #' @param ... optional arguments passed to grep 7 | #' 8 | #' @description 9 | #' Transforms a vector of categories to preferred names, and makes non-preferred 10 | #' categories into an "other" category. The "other" category can be named, but 11 | #' the default label is "Other". 12 | #' 13 | #' @details 14 | #' The labels are found using `grep`, and the user can supply additional 15 | #' arguments through the ... argument (e.g. ignore.case = TRUE). 16 | #' 17 | #' Warning: the matching gets complicated if one string is contained in another; 18 | #' the first pattern will prevail. See example. 19 | #' 20 | #' @note 21 | #' This might already be more efficiently implemented in base R (or some other 22 | #' package).
23 | #' 24 | #' @example 25 | #' categorize(x = c("restaurant", "Restaurant and bar", "grocery", 26 | #' "Grocery Store", "stadium", "school", "church", 27 | #' "high school", "school restaurant"), 28 | #' primary = list(Restaurant = "restaurant", 29 | #' Grocery_Store = "grocery", 30 | #' "School"), 31 | #' other = "None of the above", 32 | #' ignore.case = TRUE) 33 | #' 34 | 35 | 36 | categorize <- function(x, primary, other = "Other", ...){ 37 | primary <- rev(primary) 38 | ## initialize a new vector ret to be returned 39 | ret <- vector(mode=mode(x), length = length(x)) 40 | ret[] <- NA 41 | ## Get category names 42 | if(is.null(names(primary))){ 43 | ## If no names are specified use "primary" directly 44 | cat_names <- unlist(primary) 45 | } else { 46 | ## If names are specified use primary's names, unless missing 47 | cat_names <- names(primary) 48 | for(i in 1:length(primary)){ 49 | if(cat_names[i] == "") cat_names[i] <- primary[[i]] 50 | } 51 | } 52 | ## Set the new vector to the primary category when it matches a primary name 53 | for(i in 1:length(primary)){ 54 | ret[grep(paste0("^" , primary[i], "$"), x, ...)] = cat_names[i] 55 | } 56 | ## otherwise set it to the "other" label 57 | ret[which(is.na(ret))] <- other 58 | 59 | return(ret) 60 | } 61 | 62 | 63 | 64 | -------------------------------------------------------------------------------- /CODE/functions/filter_business.R: -------------------------------------------------------------------------------- 1 | filter_business <- function(business){ 2 | 3 | ##============================================================================== 4 | ## Filter rows 5 | ##============================================================================== 6 | ## Remove rows with NA values 7 | business <- business[!is.na(LICENSE_TERM_START_DATE)] 8 | business <- business[!is.na(LICENSE_TERM_EXPIRATION_DATE)] 9 | ## Keep only certain application types 10 | business <- business[!(APPLICATION_TYPE %in% c("C_CAPA","C_SBA"))] 11 | ## Remove duplicate ids 12 | business <- business[!duplicated(ID)] 13 | 14 | ##============================================================================== 15 | ## Remove columns that are never used 16 | ##============================================================================== 17 | business$SITE_NUMBER <- NULL 18 | business$APPLICATION_CREATED_DATE <- NULL 19 | business$APPLICATION_REQUIREMENTS_COMPLETE <- NULL 20 | business$PAYMENT_DATE <- NULL 21 | business$CONDITIONAL_APPROVAL <- NULL 22 | business$LICENSE_APPROVED_FOR_ISSUANCE <- NULL 23 | business$DATE_ISSUED <- NULL 24 | business$LICENSE_STATUS_CHANGE_DATE <- NULL 25 | business$SSA <- NULL 26 | business$LOCATION <- NULL 27 | 28 | ## Return results 29 | return(business) 30 | } 31 | -------------------------------------------------------------------------------- /CODE/functions/filter_crime.R: -------------------------------------------------------------------------------- 1 | filter_crime <- function(crime){ 2 | crime <- crime[Date>as.IDate('2011-07-01')] 3 | crime <- crime[!(is.na(Latitude) | is.na(Longitude) | is.na(Date))] 4 | crime 5 | } 6 | -------------------------------------------------------------------------------- /CODE/functions/filter_foodInspect.R: -------------------------------------------------------------------------------- 1 | filter_foodInspect <- function(foodInspect){ 2 | foodInspect <- foodInspect[!is.na(Inspection_Date) & !is.na(License)] 3 | if(foodInspect[,any(duplicated(Inspection_ID))]){ 4 | warning(paste0("Removing ",
nrow(foodInspect[duplicated(Inspection_ID)]), 5 | " duplicated records from foodInspect ", 6 | "of ", nrow(foodInspect), " total records.\n", 7 | "Duplication is based on the Inspection_ID")) 8 | ## Remove any duplicated Inspection_ID values 9 | foodInspect <- foodInspect[!duplicated(Inspection_ID)] 10 | } 11 | foodInspect <- foodInspect[License != 0] 12 | foodInspect <- foodInspect[Inspection_Date > as.IDate("2011-09-01")] 13 | foodInspect <- foodInspect[Inspection_Type == "Canvass"] 14 | foodInspect <- foodInspect[!Results %in% c('Out of Business', 15 | 'Business Not Located', 16 | 'No Entry')] 17 | foodInspect 18 | } 19 | -------------------------------------------------------------------------------- /CODE/functions/filter_garbageCarts.R: -------------------------------------------------------------------------------- 1 | filter_garbageCarts <- function(garbageCarts){ 2 | garbageCarts <- garbageCarts[!is.na(Latitude) & !is.na(Longitude) & !is.na(Creation_Date)] 3 | garbageCarts <- garbageCarts[Status %in% c("Completed", "Open")] 4 | garbageCarts 5 | } 6 | -------------------------------------------------------------------------------- /CODE/functions/filter_sanitationComplaints.R: -------------------------------------------------------------------------------- 1 | filter_sanitationComplaints <- function(sanitationComplaints) { 2 | sanitationComplaints <- sanitationComplaints[!is.na(Latitude) & !is.na(Longitude) & !is.na(Creation_Date)] 3 | sanitationComplaints <- sanitationComplaints[Status %in% c("Completed", "Open")] 4 | sanitationComplaints 5 | } 6 | -------------------------------------------------------------------------------- /CODE/functions/find_bus_id_matches.R: -------------------------------------------------------------------------------- 1 | 2 | ## 3 | ## Match food inspection ids to business license ids 4 | ## based on when the inspection happened in relation 5 | ## to the business license renewal cycle. 6 | ## 7 | 8 | find_bus_id_matches <- function(business, foodInspect) { 9 | # browser() 10 | ## Many food businesses are inspected before they are issued a 11 | ## license, so we need to move back the "License term start date" for 12 | ## newly issued licenses. An adjustment of 365 days seems pretty 13 | ## reasonable, and creates lots of matches. 14 | ## Note also the `as.integer` call, which is needed to avoid a warning.
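## Illustrative example (hypothetical dates, not drawn from the data): with
## the 365-day adjustment below, a license issued with
## LICENSE_TERM_START_DATE = 2013-06-01 is treated as if its term began on
## 2012-06-01, so an early inspection on 2013-03-15 still falls inside the
## term and can match on LICENSE_NUMBER in the foverlaps() join.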
15 | business_copy <- copy(business) 16 | business_copy <- business_copy[APPLICATION_TYPE=="ISSUE", 17 | LICENSE_TERM_START_DATE := 18 | as.integer(LICENSE_TERM_START_DATE - 365)] 19 | 20 | ## Merge over time periods 21 | dat <- foverlaps(foodInspect[i = TRUE, 22 | j = Inspection_ID, 23 | keyby = list(License, 24 | Inspection_Date = Inspection_Date, 25 | Inspection_Date_end = Inspection_Date)], 26 | business_copy[i = LICENSE_TERM_START_DATE < LICENSE_TERM_EXPIRATION_DATE, 27 | j = list(ID), 28 | keyby = list(LICENSE_NUMBER, 29 | LICENSE_TERM_START_DATE, 30 | LICENSE_TERM_EXPIRATION_DATE)], 31 | mult = "first", 32 | type = "any", 33 | nomatch = NA) 34 | return(dat[ , list(Inspection_ID, ID)]) 35 | } 36 | -------------------------------------------------------------------------------- /CODE/functions/gini.R: -------------------------------------------------------------------------------- 1 | # function to calculate the normalized gini coefficient 2 | gini <- function(p,y, plot=FALSE){ 3 | sorted <- cbind(p,y) 4 | y.val <- sum(y) 5 | sorted <- cbind(sorted[order(-p),-1],c(1:y.val,rep(y.val,nrow(sorted)-y.val))) 6 | sorted <- cbind(sorted,cumsum(sorted[,1])) 7 | sorted <- cbind(sorted,cumsum(rep(y.val/length(y),length(y)))) 8 | csum <- colSums(sorted[,2:4]) 9 | gini <- (csum[2]-csum[3])/(csum[1]-csum[3]) 10 | 11 | if (plot){ 12 | plot( 13 | x=1:nrow(sorted)/nrow(sorted), 14 | y=sorted[,3]/y.val, 15 | main="Cumulative Captured", 16 | xlab="% Investigated", 17 | ylab="% Captured") 18 | lines(x=1:nrow(sorted)/nrow(sorted),y=sorted[,2]/y.val) 19 | text(x=0.8,y=0.2,labels=paste("Gini: ",round(gini*100,1),"%",sep="")) 20 | 21 | } 22 | gini 23 | } 24 | -------------------------------------------------------------------------------- /CODE/functions/kde.R: -------------------------------------------------------------------------------- 1 | #function to get temperature on heat map at a new point 2 | kde <- function (new, x, y, h) 3 | { 4 | nx <- length(x) 5 | if (length(y) != nx) 6 | stop("data vectors must be the same length") 7 | if (any(!is.finite(x)) || any(!is.finite(y))) 8 | stop("missing or infinite values in the data are not allowed") 9 | h <- if (missing(h)) 10 | c(bandwidth.nrd(x), bandwidth.nrd(y)) 11 | else rep(h, length.out = 2L) 12 | h <- h/4 13 | ax <- (new[1]-x)/h[1L] 14 | ay <- (new[2]-y)/h[2L] 15 | z <- tcrossprod(matrix(dnorm(ax), , nx), matrix(dnorm(ay), 16 | , nx))/(nx * h[1L] * h[2L]) 17 | z 18 | } 19 | -------------------------------------------------------------------------------- /CODE/functions/logLik.R: -------------------------------------------------------------------------------- 1 | # function to calculate the negative log-likelihood for binomial outcomes 2 | logLik <- function(p,y) { 3 | p <- pmin(pmax(p,0.0000000000001),0.999999999999) 4 | -sum(y*log(p) + (1-y)*log(1-p)) 5 | } 6 | 7 | 8 | -------------------------------------------------------------------------------- /CODE/functions/nokey.R: -------------------------------------------------------------------------------- 1 | 2 | nokey <- function(dt, usestrings){ 3 | require(data.table) 4 | 5 | if(!inherits(x = dt, what = "data.table")){ 6 | stop("dt should be a data.table") 7 | } 8 | 9 | if(haskey(dt)){ 10 | ret <- dt[, .SD, .SDcols = -key(dt)] 11 | } else { 12 | ret <- dt 13 | } 14 | return(ret) 15 | } 16 | 17 | -------------------------------------------------------------------------------- /CODE/functions/simulated_bin_summary.R: -------------------------------------------------------------------------------- 1 | 2 | simulated_bin_summary <-
function(dates, scores, positives) { 3 | require(data.table) 4 | 5 | ##-------------------------------------------------------------------------- 6 | ## Create base data 7 | ##-------------------------------------------------------------------------- 8 | ## Assemble components into a data.table for convenience 9 | dat <- data.table(date=dates, score=scores, positives) 10 | 11 | ##-------------------------------------------------------------------------- 12 | ## Add simulated result 13 | ##-------------------------------------------------------------------------- 14 | 15 | ## This assumes inspectors would perform the same number of inspections per 16 | ## day as they had in their original schedule. 17 | 18 | ## The simulated date is a vector of the original dates in sequential 19 | ## order, but matched to the original data ordered by score. 20 | dat <- dat[order(-score), simulated_date := dat[order(date), date]][] 21 | 22 | ##-------------------------------------------------------------------------- 23 | ## Simulate inspector assignment to bins according to score 24 | ##-------------------------------------------------------------------------- 25 | 26 | ## Create bin summary of results 27 | 28 | ## Calculate original counts / day 29 | ## Note: Using `keyby` forces the result to be sorted, which is necessary 30 | ## for the cumulative summation. 31 | bins <- dat[i = TRUE, 32 | j = list(POS = sum(positives), 33 | .N), 34 | keyby = date] 35 | bins <- bins[ , NTOT := cumsum(N)][] 36 | bins <- bins[ , POSTOT := cumsum(POS)][] 37 | 38 | ## Calculate simulated counts / day 39 | ## Note: Here we key by the simulated date 40 | bins_sim <- dat[i = TRUE, 41 | j = list(POS = sum(positives), 42 | .N), 43 | keyby = simulated_date] 44 | bins_sim <- bins_sim[ , NTOT := cumsum(N)][] 45 | bins_sim <- bins_sim[ , POSTOT := cumsum(POS)][] 46 | 47 | setcolorder(bins, c("date", "N", "NTOT", "POS", "POSTOT")) 48 | bins <- merge(bins, 49 | bins_sim[i = TRUE, 50 | j = list(POS_SIM = POS, 51 | POSTOT_SIM = POSTOT), 52 | keyby = list(date = simulated_date)], 53 | "date") 54 | return(bins) 55 | 56 | } 57 | -------------------------------------------------------------------------------- /CODE/functions/simulated_date_diff_mean.R: -------------------------------------------------------------------------------- 1 | 2 | simulated_date_diff_mean <- function(dates, scores, pos) { 3 | require(data.table) 4 | 5 | ##-------------------------------------------------------------------------- 6 | ## Create base data 7 | ##-------------------------------------------------------------------------- 8 | ## Assemble components into a data.table for convenience 9 | dat <- data.table(date=dates, score=scores, pos) 10 | 11 | ##-------------------------------------------------------------------------- 12 | ## Add simulated result 13 | ##-------------------------------------------------------------------------- 14 | 15 | ## This assumes inspectors would perform the same number of inspections per 16 | ## day as they had in their original schedule. 17 | 18 | ## The simulated date is a vector of the original dates in sequential 19 | ## order, but matched to the original data ordered by score.
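## Toy example of the line below (hypothetical values): with dates
## c(d1, d2, d3) where d1 < d2 < d3 and scores c(.2, .9, .5), ordering by
## descending score visits rows 2, 3, 1, which receive simulated dates
## d1, d2, d3 respectively -- i.e. the highest-scoring inspection is
## simulated to occur on the earliest available inspection date.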
20 | dat <- dat[order(-score), simulated_date := dat[order(date), date]][] 21 | 22 | ## The time difference is the difference between the original date and 23 | ## the simulated date, for occurrences that are "positive" 24 | mean_diff <- dat[pos==1, mean(date - simulated_date)] 25 | 26 | ##-------------------------------------------------------------------------- 27 | ## Return average date difference 28 | ##-------------------------------------------------------------------------- 29 | return(mean_diff) 30 | 31 | } 32 | -------------------------------------------------------------------------------- /CODE/functions/weather_3day_calc.R: -------------------------------------------------------------------------------- 1 | 2 | weather_3day_calc <- function(weather){ 3 | 4 | require(data.table) 5 | # str(weather) 6 | weather <- as.data.frame(weather) 7 | nr <- nrow(weather) 8 | weather <- weather[order(-as.numeric(weather$date)), ] 9 | threeDay <- weather[2:(nr-2), colnames(weather) != "date"] + 10 | weather[3:(nr - 1), colnames(weather) != "date"] + 11 | weather[4:(nr - 0), colnames(weather) != "date"] 12 | threeDay <- threeDay / 3 13 | threeDay$date <- weather$date[1:(nr-3)] 14 | 15 | threeDay <- as.data.table(threeDay) 16 | threeDay[ , date := as.IDate(date, format="%m/%d/%y")] 17 | setnames(threeDay, 'date', "Inspection_Date") 18 | 19 | threeDay <- threeDay[, .SD, keyby=Inspection_Date] 20 | 21 | return(threeDay) 22 | } 23 | -------------------------------------------------------------------------------- /CODE/not used/addTextMining.R: -------------------------------------------------------------------------------- 1 | 2 | library(plyr) 3 | library(tm) 4 | library(SnowballC) 5 | library(lda) 6 | library(ggplot2) 7 | library(reshape2) 8 | 9 | 10 | addressCount <- ddply(foodInspect,c("doing_business_as_name"),summarize,addressCount=length(unique(address))) 11 | foodInspect <- merge( 12 | x=foodInspect, 13 | y=addressCount, 14 | by="doing_business_as_name", 15 | all.x=TRUE, 16 | all.y=FALSE) 17 | foodInspect$addressCount <- pmin(foodInspect$addressCount,5) 18 | 19 | wardCount <- ddply(foodInspect,c("doing_business_as_name"),summarize,wardCount=length(unique(ward))) 20 | foodInspect <- merge( 21 | x=foodInspect, 22 | y=wardCount, 23 | by="doing_business_as_name", 24 | all.x=TRUE, 25 | all.y=FALSE) 26 | foodInspect$wardCount <- pmin(foodInspect$wardCount,5) 27 | 28 | districtCount <- ddply(foodInspect,c("doing_business_as_name"),summarize,districtCount=length(unique(police_district))) 29 | foodInspect <- merge( 30 | x=foodInspect, 31 | y=districtCount, 32 | by="doing_business_as_name", 33 | all.x=TRUE, 34 | all.y=FALSE) 35 | foodInspect$districtCount <- pmin(foodInspect$districtCount,5) 36 | 37 | rm(districtCount,wardCount,addressCount); gc() 38 | 39 | bname <- foodInspect$doing_business_as_name 40 | bname <- tolower(bname) 41 | bname <- gsub("[[:punct:]]","",bname) 42 | bname <- gsub("[0-9]+","",bname) 43 | 44 | myStopWords <- c(stopwords("english"),"th") 45 | myStopWords <- myStopWords[!grepl("[[:punct:]]",myStopWords)] 46 | myStopRegEx <- paste("\\b",paste(myStopWords,collapse="\\b|\\b", sep=""),"\\b",sep="") 47 | 48 | bname <- gsub(myStopRegEx,"",bname) 49 | 50 | bname <- sapply(strsplit(bname," "), function(w) paste(wordStem(w),collapse=" ",sep="")) 51 | 52 | bname <- gsub("[[:space:]]+"," ",bname) 53 | bname <- gsub("^[[:space:]]+","",bname) 54 | bname <- gsub("[[:space:]]+$","",bname) 55 | 56 | 57 | words <- table(unlist(strsplit(bname, split=" "))) 58 | words <- words[order(-words)] 59 
| words <- words[words>5] 60 | 61 | name_split <- strsplit(bname, split=" ") 62 | 63 | # tf <- llply(name_split,function(x) { 64 | # n <- sum(x %in% names(words)) 65 | # if (n > 0){ 66 | # mat <- matrix(0L,nrow=2,ncol=n) 67 | # w <- factor(x[x %in% names(words)],levels=names(words)) 68 | # mat[1,] <- unclass(w) - 1L 69 | # mat[2,] <- as.integer(table(w)[x[x %in% names(words)]]) 70 | # } else { 71 | # mat <- NULL 72 | # } 73 | # return(mat) 74 | # }) 75 | 76 | 77 | 78 | #set.seed(8675309) 79 | # K <- 10 ## Num clusters 80 | # result <- lda.collapsed.gibbs.sampler(tf[!sapply(tf,is.null)], K, names(words), num.iterations=1000, alpha=0.1, eta=0.1, burnin=1000, compute.log.likelihood=TRUE) 81 | # plot(result$log.likelihoods[1,], type="l") 82 | ## Get the top words in the cluster 83 | #(top.words <- top.topic.words(result$topics, 20)) 84 | 85 | #sum(sapply(name_split, function(wds) sum(wds %in% c("eurest")))>0) 86 | #head(foodInspect$doing_business_as_name[sapply(name_split, function(wds) sum(wds %in% c("sodexo")))>0],20) 87 | 88 | 89 | foodInspect$name_length <- sapply(name_split,length) 90 | foodInspect$type_food <- "unknown" 91 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 92 | c("el", 93 | "taqueria", 94 | "lo", 95 | "de", 96 | "y", 97 | "mexican", 98 | "chipotl", 99 | "taco", 100 | "don", 101 | "pepe", 102 | "birrieria", 103 | "maria", 104 | "del", 105 | "taco", 106 | "burrito", 107 | "pollo", 108 | "feliz", 109 | "loco", 110 | "hermano" 111 | )))>0, "spanish", foodInspect$type_food) 112 | 113 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 114 | c("pizzeria", 115 | "pizza", 116 | "papa", 117 | "littl", 118 | "salad", 119 | "ristorant", 120 | "itali", 121 | "trattoria", 122 | "francesca", 123 | "di", 124 | "domino" 125 | )))>0, "italian", foodInspect$type_food) 126 | 127 | 128 | 129 | 130 | 131 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 132 | c("subwai", 133 | "sandwich", 134 | "submarin", 135 | "work", 136 | "john", 137 | "potbelli", 138 | "jimmi", 139 | "sub", 140 | "cosi", 141 | "pocket" 142 | )))>0, "sandwich", foodInspect$type_food) 143 | 144 | 145 | 146 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 147 | c("bakeri", 148 | "cafe", 149 | "corner", 150 | "dunkin", 151 | "donut", 152 | "caff", 153 | "bagel", 154 | "sweet", 155 | "cake", 156 | "pastri", 157 | "bon", 158 | "au", 159 | "coffe", 160 | "starbuck", 161 | "tea", 162 | "bread", 163 | "pancak", 164 | "juic", 165 | "fruit", 166 | "xsport" 167 | )))>0, "breakfast", foodInspect$type_food) 168 | 169 | 170 | 171 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 172 | c("burger", 173 | "hamburg", 174 | "chophous", 175 | "buffett", 176 | "kitchen", 177 | "wing", 178 | "wingstop", 179 | "wendi", 180 | "mcdonald", 181 | "fast", 182 | "donald", 183 | "castl", 184 | "box", 185 | "white", 186 | "popey", 187 | "king", 188 | "gyro", 189 | "fri", 190 | "beef", 191 | "bistro", 192 | "dog", 193 | "hot", 194 | "chicken", 195 | "fish", 196 | "restaur", 197 | "famili", 198 | "golden", 199 | "grill", 200 | "steak", 201 | "rib", 202 | "big", 203 | "pita", 204 | "indian", 205 | "india", 206 | "bombai", 207 | "kfc", 208 | "cuisin" 209 | )))>0, "dinner", foodInspect$type_food) 210 | 211 | 212 | 213 | 214 | 215 | 216 | 217 | 218 | 219 | 220 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 221 | c("bar", 222 | "club", 223 | "loung", 224 | "tavern", 225 | "sport", 
226 | "pub", 227 | "tap", 228 | "cantina", 229 | "taverna" 230 | )))>0, "bar", foodInspect$type_food) 231 | 232 | 233 | 234 | 235 | 236 | 237 | 238 | 239 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 240 | c("food", 241 | "mart", 242 | "liquor", 243 | "store", 244 | "groceri", 245 | "j", 246 | "eleven", 247 | "dollar", 248 | "walgreen", 249 | "store", 250 | "wine", 251 | "stop", 252 | "mini", 253 | "citgo", 254 | "bp", 255 | "shell", 256 | "pharmaci", 257 | "marathon", 258 | "snack" 259 | )))>0, "mart", foodInspect$type_food) 260 | 261 | 262 | 263 | 264 | 265 | 266 | 267 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 268 | c("hospit", 269 | "intern", 270 | "cater", 271 | "univers", 272 | "eurest", 273 | "servic", 274 | "levi", 275 | "center", 276 | "field", 277 | "hotel", 278 | "marriott", 279 | "sodexo", 280 | "aramark", 281 | "inn", 282 | "moodi", 283 | "arrang", 284 | "airlin", 285 | "associ", 286 | "institut" 287 | )))>0, "cater", foodInspect$type_food) 288 | 289 | 290 | 291 | 292 | 293 | 294 | 295 | 296 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 297 | c("new", 298 | "china", 299 | "thai", 300 | "chines", 301 | "cuisin", 302 | "wok", 303 | "express", 304 | "noodl", 305 | "sushi", 306 | "chines", 307 | "blue", 308 | "japanes", 309 | "rice", 310 | "express", 311 | "see", 312 | "afc", 313 | "chop", 314 | "suei", 315 | "panda", 316 | "see", 317 | "asian", 318 | "japan" 319 | )))>0, "asian", foodInspect$type_food) 320 | 321 | 322 | 323 | 324 | 325 | 326 | 327 | 328 | 329 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 330 | c("supermercado", 331 | "supermarket", 332 | "dominick", 333 | "jewel", 334 | "mariano", 335 | "trader", 336 | "aldi", 337 | "whole", 338 | "carniceria", 339 | "meat", 340 | "deli", 341 | "sausag", 342 | "produc", 343 | "market" 344 | )))>0, "grocery", foodInspect$type_food) 345 | 346 | 347 | 348 | 349 | #what about chocolate, or ben and jerri's 350 | foodInspect$type_food <- ifelse(sapply(name_split, function(wds) sum(wds %in% 351 | c("robbin", 352 | "baskin", 353 | "ic", 354 | "icecream", 355 | "dairi", 356 | "oberwei", 357 | "shake", 358 | "dessert", 359 | "gelato", 360 | "candi", 361 | "frozen", 362 | "yogurt", 363 | "cold", 364 | "sugar", 365 | "custard", 366 | "theatr", 367 | "pastri", 368 | "cake", 369 | "pretzel" 370 | )))>0, "dessert", foodInspect$type_food) 371 | 372 | 373 | 374 | 375 | 376 | 377 | 378 | 379 | foodInspect$type_food <- factor(foodInspect$type_food, 380 | levels=c("spanish","italian","sandwich", 381 | "breakfast","dinner","bar", 382 | "mart","cater","asian", 383 | "grocery","dessert","unknown")) 384 | 385 | 386 | 387 | 388 | 389 | 390 | 391 | 392 | foodInspect$nondisp_ware <-ifelse(sapply(name_split, function(wds) sum(wds %in% 393 | c("ristorant", 394 | "itali", 395 | "trattoria", 396 | "francesca", 397 | "di", 398 | "cafe", 399 | "caff", 400 | "pancak", 401 | "chophous", 402 | "buffett", 403 | "kitchen", 404 | "bistro", 405 | "restaur", 406 | "famili", 407 | "golden", 408 | "steak", 409 | "rib", 410 | "indian", 411 | "india", 412 | "bombai", 413 | "cuisin", 414 | "bar", 415 | "club", 416 | "loung", 417 | "tavern", 418 | "sport", 419 | "pub", 420 | "tap", 421 | "cantina", 422 | "taverna", 423 | "hospit", 424 | "intern", 425 | "cater", 426 | "univers", 427 | "eurest", 428 | "servic", 429 | "center", 430 | "hotel", 431 | "marriott", 432 | "sodexo", 433 | "aramark", 434 | "inn", 435 | "moodi", 436 | "associ", 437 | 
"institut", 438 | "new", 439 | "china", 440 | "thai", 441 | "chines", 442 | "cuisin", 443 | "wok", 444 | "sushi", 445 | "chines", 446 | "blue", 447 | "japanes", 448 | "chop", 449 | "suei", 450 | "asian", 451 | "japan" 452 | )))>0, 1L, 0L) 453 | 454 | 455 | 456 | 457 | 458 | 459 | ## Number of documents to display 460 | # N <- 10 461 | # theme_set(theme_bw()) 462 | # 463 | # topic.proportions <- t(result$document_sums) / colSums(result$document_sums) 464 | # topic.proportions <- topic.proportions[sample(1:dim(topic.proportions)[1], N),] 465 | # 466 | # topic.proportions[is.na(topic.proportions)] <- 1 / K 467 | # 468 | # colnames(topic.proportions) <- apply(top.words, 2, paste, collapse=".") 469 | # 470 | # topic.proportions.df <- melt(cbind(data.frame(topic.proportions), document=factor(1:N)), 471 | # variable.name="topic", 472 | # id.vars = "document") 473 | # head(topic.proportions.df) 474 | # 475 | # ggplot(data=topic.proportions.df) + geom_bar(aes(x=topic,y=value,fill=topic),stat="identity") + labs(ylab="proportion") + 476 | # theme(axis.text.x = element_text(angle=90, hjust=1)) + 477 | # coord_flip() + 478 | # facet_wrap(~ document, ncol=5) + scale_x_discrete("", breaks = NULL) 479 | 480 | # marginal <- result$topics / sum(result$topics) 481 | # marginal[marginal==0] <- 1e-15 482 | # marginal <- log(marginal/sum(marginal)) 483 | # #save(words, top.words, marginal, file="text_mine_result20140127v02.Rdata") 484 | # 485 | # counts <- sapply(tf, function(m) { 486 | # if (is.null(m)) { 487 | # dat <- data.frame(voc=names(words)[1], cnts=0) 488 | # } else { 489 | # dat <- data.frame(voc=names(words)[m[1,]+1], cnts = m[2,]) 490 | # } 491 | # dat$voc <- factor(dat$voc, levels=names(words)) 492 | # tapply(dat$cnts, dat$voc, sum, simplify=TRUE) 493 | # }) 494 | # counts[is.na(counts)] <- 0 495 | # 496 | # logLik <- marginal %*% counts 497 | # 498 | # foodInspect$topic <- apply(top.words, 2, paste, collapse=".")[apply(logLik,2,which.max)] 499 | # foodInspect$topic <- factor(foodInspect$topic, levels=apply(top.words, 2, paste, collapse=".")) 500 | # #head(foodInspect[,c("doing_business_as_name","topic")],50) 501 | # 502 | # # ratio <- t(logLik)/logLik[1,] - 1 503 | # # ratio <- ratio[,-1] 504 | # # colnames(ratio) <- paste("lr",apply(top.words, 2, paste, collapse=".")[-1], sep="_") 505 | # # foodInspect <- cbind(foodInspect,ratio) 506 | # 507 | # logLik <- t(logLik) 508 | # colnames(logLik) <- paste("lr",apply(top.words, 2, paste, collapse="."), sep="_") 509 | # foodInspect <- cbind(foodInspect,logLik) 510 | 511 | 512 | # tf_idf <- ldply(strsplit(bname, split=" "),function(x) { 513 | # table(factor(x[x %in% names(words)],levels=names(words))) 514 | # }) 515 | # ##idf <- sapply(tf_idf,function(col) sum(col!=0)) 516 | # ##idf <- log(nrow(tf_idf)/idf) 517 | # ##l_ply(1:length(idf), function(i) {tf_idf[,i] <<- tf_idf[,i]*idf[i] 518 | # ## invisible()}) 519 | # colnames(tf_idf) <- paste("name",colnames(tf_idf),sep="_") 520 | # foodInspect <- cbind(foodInspect,tf_idf) 521 | 522 | #rm(idf,bname,myStopRegEx,words,myStopWords,tf_idf); gc() 523 | #rm(bname,myStopRegEx,words,myStopWords,tf_idf); gc() 524 | #rm(bname,myStopRegEx,words,myStopWords,tf,counts,logLik,ratio,marginal,result,K); gc() 525 | rm(bname,myStopRegEx,words,myStopWords,name_split); gc() 526 | -------------------------------------------------------------------------------- /CODE/not used/development_of_heat_merge.R: -------------------------------------------------------------------------------- 1 | foodInspect$heat_burglary <- 
merge_heat(events=burglary, dateCol="date", window=90, nGroups=CPUs) 2 | crime 3 | 4 | dat 5 | CUR <- dat[1789]$Inspection_Date 6 | LAT <- dat[1789]$Latitude 7 | LON <- dat[1789]$Longitude 8 | 9 | WIN <- 90 10 | 11 | ## "Current" crime 12 | crime[Date >= (CUR - WIN) & Date <= CUR, 13 | list(Latitude, Longitude), 14 | keyby = Date] 15 | ## KDE based on "Current" crime and current lat/long 16 | crime[Date > (CUR - WIN) & Date < CUR, 17 | kde(new=c(LAT, LON), x=Latitude, y=Longitude, h = c(.01, .01))] 18 | 19 | mydat <- dat[1789:1810] 20 | 21 | ## Mydat (relevant fields only) 22 | mydat[ , Inspection_ID, keyby = list(Begin = Inspection_Date - WIN, 23 | End = Inspection_Date, 24 | Latitude, 25 | Longitude)] 26 | ## Merge 27 | foverlaps(x = mydat[i = 1, 28 | j = list(Inspection_ID, 29 | Latitude, 30 | Longitude), 31 | keyby = list(start = Inspection_Date - WIN, 32 | end = Inspection_Date)], 33 | y = crime[i = TRUE, 34 | list(Latitude, 35 | Longitude), 36 | keyby=list(start = Date, 37 | end = Date)], 38 | type = "any") 39 | 40 | 41 | foverlaps(x = mydat[, j = list(Inspection_ID, Latitude, Longitude), 42 | keyby = list(start = Inspection_Date - WIN, end = Inspection_Date)], 43 | y = crime[i = TRUE, list(Latitude, Longitude), 44 | keyby=list(start = Date, end = Date)], 45 | type = "any")[ , kde(new=c(i.Latitude[1], i.Longitude[1]), 46 | x = Latitude, y = Longitude, h = c(.01, .01)), 47 | keyby = Inspection_ID] 48 | crime[Date >= (CUR - WIN) & Date <= CUR, 49 | kde(new=c(LAT, LON), x=Latitude, y=Longitude, h = c(.01, .01))] 50 | 51 | -------------------------------------------------------------------------------- /CODE/not used/heat.R: -------------------------------------------------------------------------------- 1 | #filter to events within a date, and get heat 2 | heat <- function(date,lat,long, data, dateColumnName, days){ 3 | out <- tryCatch({ 4 | biz <- data[data[,dateColumnName]>(date - days*60*60*24) & 5 | data[,dateColumnName]<=date,] -------------------------------------------------------------------------------- /CODE/not used/liveReadInBldgPermits.R: -------------------------------------------------------------------------------- 143 | mySubset <- "&$where=issue_date>to_floating_timestamp('2011-01-01T01:00:01')" 144 | 145 | permits <- liveReadCSV(key="ydr8-5enu", header=myheader, filter=mySubset, dateSetting=setting) 146 | 147 | permits <- subset(permits, !is.na(latitude) & !is.na(longitude) & !is.na(issue_date)) 148 | permits$permit_type <- factor(permits$permit_type,levels=names(table(permits$permit_type)[order(-table(permits$permit_type))])) 149 | print(table(permits$permit_type)) 150 | 151 | rm(myheader, mySubset, setting); gc() 152 | 153 | 154 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInBldgViolations.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | myheader <- matrix(c( 4 | "id","NULL", 5 | "violation_last_modified_date","NULL", 6 | "violation_date","violation.date", 7 | "violation_code","NULL", 8 | "violation_status","NULL", 9 | "violation_status_date","NULL", 10 | "violation_description","character", 11 | "violation_location","NULL", 12 | "violation_inspector_comments","NULL", 13 | "violation_ordinance","character", 14 | "inspector_id","NULL", 15 | "inspection_number","NULL", 16 | "inspection_status","NULL", 17 | "inspection_waived","NULL", 18 | "inspection_category","character", 19 | "department_bureau","NULL", 20 | "address","NULL", 21 | "property_group","NULL", 22 | "latitude", "numeric", 23 | "longitude", "numeric", 24 | "location", "NULL" 25 | ), ncol=2, byrow=TRUE) 26 | 27 | 28 | setting <- list(name="violation.date", 29 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 30 | ) 31 | 32 | 33 |
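## Note on the "$where" string below: it appears to be a Socrata SoQL filter
## clause, passed to liveReadCSV() through the `filter` argument, so that
## only violations dated after 2011-01-01 are downloaded from the data
## portal rather than filtering the full data set locally.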
mySubset <- "&$where=violation_date>to_floating_timestamp('2011-01-01T01:00:01')" 34 | 35 | bldgViolations <- liveReadCSV(key="22u3-xenr", header=myheader, filter=mySubset, dateSetting=setting) 36 | 37 | bldgViolations <- subset(bldgViolations, !is.na(latitude) & !is.na(longitude) & !is.na(violation_date)) 38 | bldgViolations$violation_description<- factor(bldgViolations$violation_description,levels=names(table(bldgViolations$violation_description)[order(-table(bldgViolations$violation_description))])) 39 | bldgViolations$violation_ordinance<- factor(bldgViolations$violation_ordinance,levels=names(table(bldgViolations$violation_ordinance)[order(-table(bldgViolations$violation_ordinance))])) 40 | bldgViolations$inspection_category<- factor(bldgViolations$inspection_category,levels=names(table(bldgViolations$inspection_category)[order(-table(bldgViolations$inspection_category))])) 41 | #print(table(bldgViolations$violation_description)) 42 | #print(table(bldgViolations$violation_ordinance)) 43 | #print(table(bldgViolations$inspection_category)) 44 | 45 | 46 | 47 | rm(myheader, mySubset, setting); gc() 48 | 49 | 50 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInGraffiti.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | myheader <- matrix(c( 4 | "creation_date", "graffiti.date", 5 | "status", "character", 6 | "completion_date", "NULL", 7 | "service_request_number", "NULL", 8 | "type_of_service_request", "NULL", 9 | "what_type_of_suface_is_the_graffiti_on_", "NULL", 10 | "where_is_the_graffiti_located_", "NULL", 11 | "street_address", "NULL", 12 | "zip_code", "NULL", 13 | "x_coordinate", "NULL", 14 | "y_coordinate", "NULL", 15 | "ward", "NULL", 16 | "police_district", "NULL", 17 | "community_area", "NULL", 18 | "latitude", "numeric", 19 | "longitude", "numeric", 20 | "location", "NULL" 21 | ), ncol=2, byrow=TRUE) 22 | 23 | 24 | setting <- list(name="graffiti.date", 25 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 26 | ) 27 | 28 | 29 | mySubset <- "&$where=1=1" 30 | 31 | graffiti <- liveReadCSV(key="hec5-y4x5", header=myheader, filter=mySubset, dateSetting=setting) 32 | 33 | graffiti <- subset(graffiti, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 34 | graffiti <- subset(graffiti, status %in% c("Completed", "Open")) 35 | graffiti$status <- NULL 36 | 37 | rm(myheader, mySubset, setting); gc() 38 | 39 | 40 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInLightsOut.R: -------------------------------------------------------------------------------- 1 | 2 | myheader <- matrix(c( 3 | "creation_date", "lights.date", 4 | "status", "character", 5 | "completion_date", "NULL", 6 | "service_request_number", "NULL", 7 | "type_of_service_request", "character", 8 | "street_address", "NULL", 9 | "zip_code", "NULL", 10 | "x_coordinate", "NULL", 11 | "y_coordinate", "NULL", 12 | "ward", "NULL", 13 | "police_district", "NULL", 14 | "community_area", "NULL", 15 | "latitude", "numeric", 16 | "longitude", "numeric", 17 | "location", "NULL" 18 | ), ncol=2, byrow=TRUE) 19 | 20 | 21 | 22 | setting <- list(name="lights.date", 23 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 24 | ) 25 | 26 | 27 | mySubset <- "&$where=1=1" 28 | 29 | 30 | alleyLights <- liveReadCSV(key="t28b-ys7j", header=myheader, filter=mySubset, dateSetting=setting) 31 | streetLights <- liveReadCSV(key="3aav-uy2v", header=myheader, 
filter=mySubset, dateSetting=setting) 32 | 33 | 34 | alleyLights <- subset(alleyLights, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 35 | alleyLights <- subset(alleyLights, status %in% c("Completed", "Open")) 36 | alleyLights$status <- NULL 37 | 38 | streetLights <- subset(streetLights, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 39 | streetLights <- subset(streetLights, status %in% c("Completed", "Open")) 40 | streetLights$status <- NULL 41 | 42 | lightsOut <- rbind(alleyLights,streetLights) 43 | 44 | rm(myheader, mySubset, setting, alleyLights, streetLights); gc() 45 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInPotHoles.R: -------------------------------------------------------------------------------- 1 | 2 | myheader <- matrix(c( 3 | "creation_date", "pothole.date", 4 | "status", "character", 5 | "completion_date", "NULL", 6 | "service_request_number", "NULL", 7 | "type_of_service_request", "NULL", 8 | "currentActivity", "NULL", 9 | "most_recent_action", "NULL", 10 | "number_of_potholes_filled_on_block", "NULL", 11 | "street_address", "NULL", 12 | "zip", "NULL", 13 | "x_coordinate", "NULL", 14 | "y_coordinate", "NULL", 15 | "ward", "NULL", 16 | "police_district", "NULL", 17 | "community_area", "NULL", 18 | "latitude", "numeric", 19 | "longitude", "numeric", 20 | "location", "NULL" 21 | ), ncol=2, byrow=TRUE) 22 | 23 | 24 | 25 | setting <- list(name="pothole.date", 26 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 27 | ) 28 | 29 | 30 | mySubset <- "&$where=1=1" 31 | 32 | 33 | potHoles <- liveReadCSV(key="7as2-ds3y", header=myheader, filter=mySubset, dateSetting=setting) 34 | 35 | 36 | potHoles <- subset(potHoles, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 37 | potHoles <- subset(potHoles, status %in% c("Completed", "Open")) 38 | potHoles$status <- NULL 39 | 40 | rm(myheader, mySubset, setting); gc() 41 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInRodents.R: -------------------------------------------------------------------------------- 1 | 2 | 3 | myheader <- matrix(c( 4 | "creation_date", "rodent.date", 5 | "status", "character", 6 | "completion_date", "NULL", 7 | "service_request_number", "NULL", 8 | "type_of_service_request", "NULL", 9 | "number_of_premises_baited", "NULL", 10 | "number_of_premises_with_garbage", "NULL", 11 | "number_of_premises_with_rats","NULL", 12 | "current_activity","NULL", 13 | "most_recent_action","NULL", 14 | "street_address", "NULL", 15 | "zip_code", "NULL", 16 | "x_coordinate", "NULL", 17 | "y_coordinate", "NULL", 18 | "ward", "NULL", 19 | "police_district", "NULL", 20 | "community_area", "NULL", 21 | "latitude", "numeric", 22 | "longitude", "numeric", 23 | "location", "NULL" 24 | ), ncol=2, byrow=TRUE) 25 | 26 | 27 | setting <- list(name="rodent.date", 28 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 29 | ) 30 | 31 | 32 | mySubset <- "&$where=1=1" 33 | 34 | rodents <- liveReadCSV(key="97t6-zrhs", header=myheader, filter=mySubset, dateSetting=setting) 35 | 36 | rodents <- subset(rodents, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 37 | rodents <- subset(rodents, status %in% c("Completed", "Open")) 38 | rodents$status <- NULL 39 | 40 | rm(myheader, mySubset, setting); gc() 41 | 42 | 43 | -------------------------------------------------------------------------------- /CODE/not used/liveReadInTreeTrims.R: 
-------------------------------------------------------------------------------- 1 | 2 | myheader <- matrix(c( 3 | "creation_date", "treetrims.date", 4 | "status", "character", 5 | "completion_date", "NULL", 6 | "service_request_number", "NULL", 7 | "type_of_service_request", "NULL", 8 | "location_of_trees", "NULL", 9 | "street_address", "NULL", 10 | "zip_code", "NULL", 11 | "x_coordinate", "NULL", 12 | "y_coordinate", "NULL", 13 | "ward", "NULL", 14 | "police_district", "NULL", 15 | "community_area", "NULL", 16 | "latitude", "numeric", 17 | "longitude", "numeric", 18 | "location", "NULL" 19 | ), ncol=2, byrow=TRUE) 20 | 21 | 22 | 23 | setting <- list(name="treetrims.date", 24 | func=function(from) as.POSIXct(strptime(from,format="%m/%d/%Y")) 25 | ) 26 | 27 | 28 | mySubset <- "&$where=1=1" 29 | 30 | 31 | treeTrims <- liveReadCSV(key="uxic-zsuj", header=myheader, filter=mySubset, dateSetting=setting) 32 | 33 | 34 | treeTrims <- subset(treeTrims, !is.na(latitude) & !is.na(longitude) & !is.na(creation_date)) 35 | treeTrims <- subset(treeTrims, status %in% c("Completed", "Open")) 36 | treeTrims$status <- NULL 37 | 38 | rm(myheader, mySubset, setting); gc() 39 | -------------------------------------------------------------------------------- /CODE/prep_inspectors_for_eval.R: -------------------------------------------------------------------------------- 1 | 2 | ## 3 | ## The sanitarian identities are not publicly available, so measures are taken 4 | ## to disguise their identities in this analysis. 5 | ## This script takes the original disguised data used in the evaluation and 6 | ## modifies it to match the format that became available in the city in early 7 | ## 2015 and is still available as of Dec 2017. 8 | ## 9 | 10 | ##========================================================================== 11 | ## INITIALIZE 12 | ##========================================================================== 13 | ## Remove all objects; perform garbage collection 14 | rm(list=ls()) 15 | gc(reset=TRUE) 16 | 17 | ## Load libraries & project functions 18 | geneorama::loadinstall_libraries(c("data.table")) 19 | geneorama::sourceDir("CODE/functions/") 20 | 21 | ##============================================================================== 22 | ## LOAD CACHED RDS FILES 23 | ##============================================================================== 24 | 25 | inspectors <- readRDS("DATA/inspectors.Rds") 26 | food <- readRDS("DATA/13_food_inspections.Rds") 27 | 28 | ##============================================================================== 29 | ## PROCESS INSPECTOR DATA 30 | ##============================================================================== 31 | 32 | ## Removing letters at the front; the numbers trailing the hyphen are not needed 33 | ## (e.g.
-1006 I believe is the code for retail food license) 34 | inspectors[ , License := gsub('[A-z]+|-.+$', "", License)] 35 | 36 | ## cleaning any leading zeros 37 | inspectors[ , License := gsub('^[0]+', "", License)] 38 | 39 | ## removing possibly invalid license numbers 40 | inspectors <- inspectors[nchar(License) > 3 & Inspector_Assigned != " "] 41 | 42 | ## If there are multiple inspections for the same license number, use the 43 | ## inspector from the first inspection 44 | inspectors <- inspectors[ , .N, by=list(License, Inspection_Date, Inspector_Assigned)] 45 | inspectors$N <- NULL 46 | 47 | ## Convert to integer to match other data 48 | inspectors[ , License := as.integer(License)] 49 | setkey(inspectors, License, Inspection_Date) 50 | 51 | ## Further deduplication 52 | inspectors_deduped <- inspectors[i = TRUE , 53 | j = list(Inspector_Assigned = Inspector_Assigned[1]), 54 | keyby = list(License, Inspection_Date)] 55 | 56 | ## Merge in the Inspection_ID from the food records 57 | inspectors_deduped <- merge(x = food[ , list(License, Inspection_Date, Inspection_ID)], 58 | y = inspectors_deduped, 59 | by = c("License", "Inspection_Date"), 60 | # all.x = TRUE, 61 | sort = FALSE) 62 | inspectors_deduped[duplicated(Inspection_ID)] 63 | inspectors_deduped <- inspectors_deduped[!duplicated(Inspection_ID)] 64 | 65 | ## Make the key columns of the inspector data match the output that the model 66 | ## would have gotten from COC internal systems 67 | inspectors_deduped_renamed <- inspectors_deduped[i = TRUE, 68 | j = list(sanitarian = Inspector_Assigned), 69 | keyby = list(inspectionID = Inspection_ID)] 70 | 71 | geneorama::NAsummary(inspectors_deduped_renamed) 72 | 73 | ##------------------------------------------------------------------------------ 74 | ## SAVE RESULT 75 | ##------------------------------------------------------------------------------ 76 | 77 | saveRDS(inspectors_deduped_renamed, "DATA/19_inspector_assignments.Rds") 78 | 79 | 80 | 81 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | # How to Contribute 2 | 3 | We welcome efforts to improve this project, and we are open to contributions for model improvements, process improvements, and general good ideas. Please use this guide to help structure your contributions and to make it easier for us to consider them and provide feedback. If we do use your work we will acknowledge your contributions to the best of our ability, and all contributions will be governed under the license terms specified in the LICENSE.md file in this project. To get started, sign the [Contributor License Agreement](https://www.clahub.com/agreements/Chicago/food-inspections-evaluation). 4 | 5 | In general we use the structure provided by GitHub for our workflow management, so if you're new to GitHub please see this guide first: https://guides.github.com/activities/contributing-to-open-source/#contributing 6 | 7 | Your contributions have the potential to make a positive impact not just on us, but on everyone who is impacted by anyone who uses this project. So, consider that a big thanks in advance. 8 | 9 | ## Reporting an Issue 10 | 11 | Food Inspections Evaluation uses [GitHub Issue Tracking](https://github.com/Chicago/food-inspections-evaluation/issues) to track issues. This is a good place to start and can be a helpful place to manage both technical and non-technical issues.
12 | 13 | ## Submitting Code Changes 14 | 15 | Please send a [GitHub Pull Request to City of Chicago](https://github.com/chicago/food-inspections-evaluation/pull/new/master) with a clear list of what you've done (read more about [pull requests](http://help.github.com/pull-requests/)). Always write a clear log message for your commits. 16 | 17 | ## Demonstrating Model Performance 18 | 19 | We welcome improvements to the analytic model that creates predictions for the Department of Public Health. The City may adopt a pull request that sufficiently improves prediction accuracy, thus allowing you to contribute to the City's inspection practice. 20 | 21 | If your pull request is to improve the model, please consider the following steps when submitting a pull request: 22 | * Identify how your model improves on prior results 23 | * Run a test using the benchmark data provided in the repository 24 | * Create a pull request which describes those improvements in the description 25 | * Work with the data science team to reproduce those results 26 | 27 | ### Training your model 28 | Train your food inspection model using data between January 2009 and 2012. Use these fits to generate a forecast of food inspections for the time period between September 2, 2014, and October 31, 2014. 29 | 30 | ### Measuring improvement 31 | The City sought to reduce the time to find critical violations. Thus, we are interested in a few key qualities in any improvements (a minimal sketch of the headline calculation appears at the end of this guide): 32 | * Your model reduces the average time to find critical violations (currently: 7.4 days) 33 | * Your model reduces the variance of the time to find critical violations (e.g., it also reduces the time by roughly 7.5 days, but with a lower standard deviation) 34 | * Similarly, your model finds all violating restaurants earlier, with none found later, even if the average time remains the same 35 | * Your model increases the proportion of violations found in the first half of the pilot (e.g., the percentage of critical violations found in September 2014). 36 | The team has calculated metrics for each one of these measures. You can investigate how these measures were calculated by referring to "Forecasting Restaurants with Critical Violations". Let us know if there are other metrics that should be considered for model improvement. 37 | 38 | ### Ability to adopt model 39 | If you would like to submit an improvement, please open a pull request that notes improvements to at least one of the aforementioned benchmarks. The data science team should be able to reproduce those results with your code. 40 | 41 | Model improvements that include new data must use data that is freely available (*gratis* or *libre*) to the City of Chicago. There must not be any terms that would prohibit the City from storing data on local servers. 42 | 43 | Likewise, by submitting a pull request, you agree that the City of Chicago will be allowed to use your code for analytic purposes and that your software will be licensed under the licensing found in LICENSE.md in this repository. 
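### A sketch of the headline metric

The snippet below is a minimal, illustrative sketch of the "average days saved" comparison described under "Measuring improvement"; the data and the column names `criticalFound`, `date_bau`, and `date_model` are hypothetical placeholders, not part of the repository. The authoritative calculations live in `REPORTS/Metric_Development.Rmd`.

```r
library(data.table)

## Hypothetical inspections that found critical violations:
## date_bau   = date the establishment was actually inspected (business as usual)
## date_model = date it would have been inspected under the model's ranking
date_comp <- data.table(
    criticalFound = c(1, 1, 1),
    date_bau      = as.IDate(c("2014-09-20", "2014-10-05", "2014-10-28")),
    date_model    = as.IDate(c("2014-09-05", "2014-09-25", "2014-10-20")))

## Average number of days sooner the critical violations would have been found
date_comp[criticalFound == 1, mean(as.numeric(date_bau - date_model))]
```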
44 | -------------------------------------------------------------------------------- /DATA/11_bus_license.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/11_bus_license.Rds -------------------------------------------------------------------------------- /DATA/12_crime.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/12_crime.Rds -------------------------------------------------------------------------------- /DATA/13_food_inspections.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/13_food_inspections.Rds -------------------------------------------------------------------------------- /DATA/14_garbage_carts.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/14_garbage_carts.Rds -------------------------------------------------------------------------------- /DATA/15_sanitation_code.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/15_sanitation_code.Rds -------------------------------------------------------------------------------- /DATA/17_mongo_weather_update.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/17_mongo_weather_update.Rds -------------------------------------------------------------------------------- /DATA/19_inspector_assignments.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/19_inspector_assignments.Rds -------------------------------------------------------------------------------- /DATA/21_food_inspection_violation_matrix.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/21_food_inspection_violation_matrix.Rds -------------------------------------------------------------------------------- /DATA/21_food_inspection_violation_matrix_nums.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/21_food_inspection_violation_matrix_nums.Rds -------------------------------------------------------------------------------- /DATA/22_burglary_heat.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/22_burglary_heat.Rds -------------------------------------------------------------------------------- /DATA/22_garbageCarts_heat.Rds: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/22_garbageCarts_heat.Rds -------------------------------------------------------------------------------- /DATA/22_sanitationComplaints_heat.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/22_sanitationComplaints_heat.Rds -------------------------------------------------------------------------------- /DATA/23_food_insp_features.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/23_food_insp_features.Rds -------------------------------------------------------------------------------- /DATA/24_bus_features.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/24_bus_features.Rds -------------------------------------------------------------------------------- /DATA/30_xgboost_data.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/30_xgboost_data.Rds -------------------------------------------------------------------------------- /DATA/30_xgboost_model.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/30_xgboost_model.Rds -------------------------------------------------------------------------------- /DATA/Inspection_Report_Form.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/Inspection_Report_Form.pdf -------------------------------------------------------------------------------- /DATA/inspectors.Rds: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/DATA/inspectors.Rds -------------------------------------------------------------------------------- /LICENSE.md: -------------------------------------------------------------------------------- 1 | Copyright, 2014 City of Chicago 2 | 3 | Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: 4 | 5 | The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 6 | 7 | THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 8 | 9 | Libraries and other software utilized in this repository are copyrighted and distributed under their respective open source licenses. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | Food Inspections Evaluation 2 | ============================ 3 | 4 | This is our model for predicting which food establishments are most at risk for the types of violations most likely to spread food-borne illness. Chicago Department of Public Health staff use these predictions to prioritize inspections. During a two-month pilot period, we found that using these predictions meant that inspectors found critical violations much faster. 5 | 6 | You can help improve the health of our city by improving this model. This repository contains a training and test set, along with the data used in the current model. 7 | 8 | Feel free to clone, fork, send pull requests and to file bugs. 9 | Please note that we will need you to agree to our Contributor License Agreement (CLA) in order to be able to use any pull requests. 10 | 11 | 12 | Original Analysis and Reports 13 | ----------------------------- 14 | In an effort to reduce the public’s exposure to foodborne illness, the [City of Chicago](https://github.com/Chicago) partnered with Allstate’s Quantitative Research & Analytics department to develop a predictive model to help prioritize the city's food inspection staff. This GitHub project is a complete working evaluation of the model, including the data that was used in the model, the code that was used to produce the statistical results, the evaluation of the validity of the results, and documentation of our methodology. 15 | 16 | The model evaluation calculates individualized risk scores for more than ten thousand Chicagoland food establishments using publicly available data, most of which is updated nightly on [Chicago’s data portal](https://data.cityofchicago.org/). The sole exception is information about the inspectors. 17 | 18 | The evaluation compares two months of Chicago’s Department of Public Health inspections to an alternative data-driven approach based on the model. The two-month evaluation period is a completely out-of-sample evaluation based on a model created using test and training data sets from prior time periods. 19 | 20 | The reports may be reproduced by compiling the knitr documents present in ``./REPORTS``. 21 | 22 | REQUIREMENTS 23 | ------------ 24 | 25 | All of the code in this project uses the open source statistical application, R. We advise that you use ```R version >= 3.1``` for best results. 26 | 27 | Ubuntu users may need to install `libssl-dev`, `libcurl4-gnutls-dev`, and `libxml2-dev`. This can be accomplished by typing the following command at the command line: 28 | `sudo apt-get install libssl-dev libcurl4-gnutls-dev libxml2-dev` 29 | 30 | The code makes extensive use of the ``data.table`` package. If you are not familiar with the package, you might want to consult the data.table [FAQ available on CRAN](http://cran.r-project.org/web/packages/data.table/vignettes/datatable-faq.pdf). 
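As a quick orientation, here is a tiny, self-contained example (with made-up data, not from this repository) of the ``DT[i, j, keyby]`` idiom that appears throughout the project code:

```r
library(data.table)

## Made-up inspection counts, purely for illustration
dt <- data.table(License       = c(10L, 10L, 20L),
                 criticalCount = c(2L, 0L, 1L))

## For each license: number of inspections and total critical violations,
## with the result keyed (and sorted) by License
dt[i = TRUE,
   j = list(N = .N, criticalTotal = sum(criticalCount)),
   keyby = list(License)]
```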
31 | 32 | 33 | FILE LAYOUT 34 | ------ 35 | 36 | The following directory structure is used: 37 | 38 | DIRECTORY | DESCRIPTION 39 | --------------------|---------------------- 40 | `.` | Project files such as README and LICENSE 41 | `./CODE/` | Sequential scripts used to develop the model 42 | `./CODE/functions/` | General function definitions, which could be used in any script 43 | `./DATA/` | Data files created by scripts in `./CODE/`, or static 44 | `./REPORTS/` | Reports and other output 45 | 46 | We have included all of the steps used to develop the model, evaluate the results, and document the results in the above directory structure. 47 | 48 | The scripts located in the `./CODE/` folder are organized sequentially, meaning that the numeric prefix indicates the order in which the script was / should be run in order to reproduce our results. 49 | 50 | Although we include all the necessary steps to download and transform the data used in the model, we also have stored a snapshot of the data in the repository. So, to run the model as it stands, it is only necessary to download the repository, install the dependencies, and step through the code in `CODE/30_glmnet_model.R`. If you do not already have them, the dependencies can be installed using the startup script `CODE/00_Startup.R`. 51 | 52 | DATA 53 | ------ 54 | 55 | Data used to develop the model is stored in the ``./DATA`` directory; most of it was originally obtained from [Chicago’s Open Data Portal](http://data.cityofchicago.org). The following datasets were used in building the analysis-ready dataset. 56 | 57 | ``` 58 | Business Licenses 59 | Food Inspections 60 | Crime 61 | Garbage Cart Complaints 62 | Sanitation Complaints 63 | Weather 64 | Sanitarian Information 65 | ``` 66 | 67 | The data sources are joined to create a tabular dataset that paints a statistical picture of a ‘business license’, the primary modelling unit (unit of observation) in this project. 68 | 69 | The data sources are joined (in an SQL-like manner) on appropriate composite keys (see the illustrative sketch at the end of this README). These keys include Inspection ID, Business License, and geography expressed as a latitude / longitude combination, among others. 70 | 71 | 72 | Acknowledgements 73 | ---------------- 74 | This research was conducted by the City of Chicago with support from the [Civic Consulting Alliance](http://www.ccachicago.org/), and [Allstate Insurance](https://www.allstate.com/). The City would especially like to thank Stephen Collins, Gavin Smart, Ben Albright, and David Crippin for their efforts in developing the predictive model. We also appreciate the help of Kelsey Burr, Christian Hines, and Kiran Pookote in coordinating this research project. We owe a special thanks to our volunteers from Allstate, who put in a tremendous effort to develop the predictive model, and to Allstate for allowing their team to volunteer for projects to change their city. This project was partially funded by an award from the Bloomberg Philanthropies' Mayors Challenge. 
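The following is a minimal, hypothetical sketch of the composite-key joins described in the DATA section above; the records and the `sanitarian` column are made up for illustration, and the real keys and columns live in the scripts under `./CODE/`.

```r
library(data.table)

## Made-up food inspection and inspector-assignment records
food <- data.table(License         = c(1L, 2L),
                   Inspection_Date = as.IDate(c("2014-09-02", "2014-09-03")),
                   Inspection_ID   = c(100L, 101L))
insp <- data.table(License         = c(1L, 2L),
                   Inspection_Date = as.IDate(c("2014-09-02", "2014-09-03")),
                   sanitarian      = c("A", "B"))

## SQL-like left join on the composite key (License, Inspection_Date)
merge(x = food, y = insp, by = c("License", "Inspection_Date"), all.x = TRUE)
```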
75 | -------------------------------------------------------------------------------- /REPORTS/.gitignore: -------------------------------------------------------------------------------- 1 | *.md 2 | *_cache/ 3 | *_files 4 | -------------------------------------------------------------------------------- /REPORTS/CountComparison_aftersecondrefactor.Rmd: -------------------------------------------------------------------------------- 1 | --- 2 | title: "Data Integrity Check" 3 | output: html_document 4 | --- 5 | 6 | ### Purpose 7 | 8 | This document compares monthly counts at various stages of the model process. We show how many food inspection records (and critical violations) there were 9 | 10 | 1. In the original download, 11 | 2. After filtering, and 12 | 3. After taking out any NAs (which indicate that a data source wasn't able to be merged into the food inspection data). 13 | 14 | ```{r, echo=FALSE, warning=FALSE, results='hide', message=FALSE} 15 | 16 | ## Initialize and load data... (hidden in report output) 17 | 18 | geneorama::set_project_dir("food-inspections-evaluation") 19 | ##============================================================================== 20 | ## INITIALIZE 21 | ##============================================================================== 22 | ## Remove all objects; perform garbage collection 23 | rm(list=ls()) 24 | gc(reset=TRUE) 25 | ## Detach libraries that are not used 26 | geneorama::detach_nonstandard_packages() 27 | ## Load libraries that are used 28 | geneorama::loadinstall_libraries(c("data.table", "MASS")) 29 | ## Load custom functions 30 | geneorama::sourceDir("CODE/functions/") 31 | 32 | ##============================================================================== 33 | ## LOAD CACHED RDS FILES 34 | ##============================================================================== 35 | ## Import the key data sets used for prediction 36 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 37 | foodInspect$Violations <- NULL 38 | violation_dat <- readRDS("DATA/22_violation_dat.Rds") 39 | dat_model <- readRDS("DATA/23_dat_model.Rds") 40 | 41 | ##============================================================================== 42 | ## ADD FIELDS AND APPLY FILTER TO foodInspect 43 | ##============================================================================== 44 | ## Remove duplicate IDs first 45 | foodInspect <- foodInspect[!duplicated(Inspection_ID)] 46 | 47 | ## Limit dates 48 | foodInspect <- foodInspect[Inspection_Date > as.IDate("2011-09-01")] 49 | 50 | ## Join in the violation matrix 51 | foodInspect <- merge(x = foodInspect, 52 | y = violation_dat, 53 | by = "Inspection_ID", 54 | all.x = TRUE) 55 | 56 | foodInspect[ , criticalFound := pmin(1, criticalCount)] 57 | 58 | foodInspect[i = TRUE, 59 | j = Facility_Type_Clean := 60 | categorize(x = Facility_Type, 61 | primary = list(Restaurant = "restaurant", 62 | Grocery_Store = "grocery"), 63 | ignore.case = TRUE)] 64 | foodInspect_filtered <- filter_foodInspect(foodInspect) 65 | 66 | ##============================================================================== 67 | ## ADD FIELDS AND APPLY FILTER TO dat_model 68 | ##============================================================================== 69 | dat_model[ , criticalFound := pmin(1, criticalCount)] 70 | ## Only keep "Retail Food Establishment" 71 | dat_model <- dat_model[LICENSE_DESCRIPTION == "Retail Food Establishment"] 72 | ## Remove License Description 73 | dat_model[ , LICENSE_DESCRIPTION := NULL] 74 | dat_model <- na.omit(dat_model) 
75 | 76 | ``` 77 | 78 | ### Construct tables of counts, and critical violations found 79 | 80 | ```{r} 81 | tab_food <- foodInspect[ 82 | i = TRUE, 83 | j = list(N = .N, 84 | critFound = sum(criticalFound)), 85 | keyby = list(month = round(Inspection_Date, "month"))] 86 | 87 | tab_food_open_canvass <- foodInspect[ 88 | i = Inspection_Type == "Canvass" & 89 | !Results %in% c('Out of Business', 90 | 'Business Not Located', 91 | 'No Entry'), 92 | j = list(N = .N, 93 | critFound = sum(criticalFound)), 94 | keyby = list(month = round(Inspection_Date, "month"))] 95 | 96 | tab_food_filtered <- foodInspect_filtered[ 97 | i = TRUE, 98 | j = list(N = .N, 99 | critFound = sum(criticalFound)), 100 | keyby = list(month = round(Inspection_Date, "month"))] 101 | 102 | tab_modeldat <- dat_model[ 103 | i = TRUE, 104 | j = list(N = .N, 105 | critFound = sum(criticalFound)), 106 | keyby = list(month = round(Inspection_Date, "month"))] 107 | ``` 108 | 109 | ### Monthly counts at each step in "filtering" 110 | 111 | ```{r} 112 | tab_food 113 | tab_food_open_canvass 114 | tab_food_filtered 115 | tab_modeldat 116 | 117 | ``` 118 | 119 | ### Monthly counts at each stage in "filtering", merged into a single table 120 | 121 | ```{r} 122 | tab_all <- merge(tab_food, 123 | merge(tab_food_open_canvass, 124 | merge(tab_food_filtered, 125 | tab_modeldat, 126 | all = TRUE, 127 | suffixes = c(".Filtered", ".Model")), 128 | all=TRUE), 129 | all=TRUE, 130 | suffixes = c(".ALL", ".Open")) 131 | tab_all 132 | ``` 133 | 134 | 135 | ### Violations, tabulated by "is.na" 136 | 137 | Generally, it looks like records with incomplete data had lower violation counts. 138 | 139 | ```{r} 140 | geneorama::set_project_dir("food-inspections-evaluation") 141 | 142 | ## Read in the original dat model before any filtering is applied 143 | dat_model_orig <- readRDS("DATA/dat_model.Rds") 144 | 145 | ## Luckily the restaurants with missing business data mostly appear to have 146 | ## lower counts of critical and serious violations 147 | geneorama::NAsummary(dat_model_orig) 148 | dat_model_orig[ , table(is.na(Business_ID))] 149 | dat_model_orig[ 150 | i = TRUE, 151 | j = list(mean_critical = mean(criticalCount), sd_critical = sd(criticalCount), 152 | mean_serious = mean(seriousCount), sd_serious = sd(seriousCount), 153 | mean_minor = mean(minorCount), sd_minor = sd(minorCount)), 154 | is.na(Business_ID)] 155 | ``` 156 | 157 | 158 | 159 | 160 | 161 | 162 | 163 | 164 | 165 | 166 | 167 | -------------------------------------------------------------------------------- /REPORTS/Metric_Development.Rmd: -------------------------------------------------------------------------------- 1 | ```{r, echo=FALSE, results='hide', message=FALSE} 2 | ##============================================================================== 3 | ## INITIALIZE 4 | ##============================================================================== 5 | ## Remove all objects; perform garbage collection 6 | rm(list=ls()) 7 | gc(reset=TRUE) 8 | ## Detach libraries that are not used 9 | geneorama::detach_nonstandard_packages() 10 | ## Load libraries that are used 11 | geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "splines", 12 | "knitr", "printr", "scales")) 13 | ## Load custom functions 14 | geneorama::sourceDir("CODE/functions/") 15 | 16 | ## Define melt to be the data.table melt 17 | melt <- data.table:::melt.data.table 18 | ``` 19 | 20 | ```{r, echo=FALSE, results='hide', fig.show='hide', message=FALSE} 21 | 
##============================================================================== 22 | ## RUN GLMNET MODEL 23 | ##============================================================================== 24 | 25 | ## NOTE: THE DIRECTORY NAME BELOW MUST MATCH YOUR PROJECT DIRECTORY NAME 26 | ## If you clone the project directly from github.com the project name will 27 | ## automatically be food-inspections-evaluation, but if you change the name 28 | ## you will need to update the name of the project in the set_project_dir 29 | ## call below. 30 | geneorama::set_project_dir("food-inspections-evaluation") 31 | geneorama::sourceDir("CODE/functions/") 32 | 33 | ## Overwrite the base "interactive" function to prevent complete initialization 34 | interactive <- function(){FALSE} 35 | ## Source the file that runs the glmnet model: 36 | source("CODE/30_glmnet_model.R", echo = FALSE) 37 | ## Remove the temporary local "interactive" function 38 | rm(interactive) 39 | ## Remove excess variables generated by "CODE/30_glmnet_model.R" 40 | rm(confusion_values_test, dat, errors, errorsTest, iiTest, iiTrain, 41 | lam, mm, pen, w.lam, xmat) 42 | 43 | ## These should be the only remaining objects (besides functions) imported: 44 | # coef 45 | # inspCoef 46 | # datTest 47 | # model 48 | 49 | ## List objects (besides functions) to check with comments above: 50 | geneorama::lll() 51 | ``` 52 | 53 | 54 | 55 | ```{r} 56 | opts_chunk$set(tidy = FALSE) 57 | opts_chunk$set(fig.width = 10) 58 | # opts_chunk$set(fig.height = 10) 59 | ``` 60 | 61 | 62 | ```{r, echo=FALSE, results='hide'} 63 | ##============================================================================== 64 | ## DEFINE CUSTOM GGPLOT FUNCTION 65 | ##============================================================================== 66 | 67 | # cbPalette <- c("#000000", "#E69F00", "#56B4E9", "#009E73", "#F0E442", 68 | # "#0072B2", "#D55E00", "#CC79A7") 69 | 70 | ggplot <- function(...) { 71 | cbPalette <- c("#E69F00", "#56B4E9", "#009E73", "#F0E442", 72 | "#0072B2", "#D55E00", "#CC79A7") 73 | ggplot2::ggplot(...) 
+ 74 | theme_grey()+ 75 | scale_fill_manual(values=cbPalette) + 76 | scale_colour_manual(values=cbPalette) + 77 | theme(plot.title = element_text(size = 15)) 78 | 79 | } 80 | ``` 81 | 82 | ```{r, echo=FALSE, eval=FALSE} 83 | ## Testing palette 84 | # cbPalette <- c("#000000", "#E69F00", "#56B4E9", 85 | # "#009E73", "#F0E442", "#0072B2", 86 | # "#D55E00", "#CC79A7") 87 | # pie(rep(1,length(cbPalette)), col = cbPalette) 88 | ``` 89 | 90 | ## Bar plots 91 | 92 | ```{r bar_graph_comparing_BAU_and_model_in_first_half} 93 | # datTest[period==1, list(.N, sum(criticalFound))] 94 | # datTest[period_modeled==1, list(.N, sum(criticalFound))] 95 | 96 | crit_viol_rate <- data.table( 97 | Regime = c("Business As Usual", 98 | "Data Driven"), 99 | values = c(datTest[period==1, sum(criticalFound)/.N], 100 | datTest[period_modeled==1, sum(criticalFound)/.N])) 101 | crit_viol_cumulative <- data.table( 102 | Regime = c("Business As Usual", 103 | "Data Driven"), 104 | values = c(datTest[period==1, sum(criticalFound)] / datTest[ , sum(criticalFound)], 105 | datTest[period_modeled==1, sum(criticalFound)] / datTest[ , sum(criticalFound)])) 106 | 107 | ggplot(crit_viol_rate) + 108 | aes(x = Regime, y = values, fill=Regime) + 109 | labs(title=paste0('Percentage of inspections resulting in a critical violation\n', 110 | 'during Period 1\n') ) + 111 | geom_bar(stat = "identity") + 112 | # guides(fill=FALSE) + 113 | scale_y_continuous(labels = percent) + 114 | xlab("") + ylab("") 115 | 116 | ggplot(crit_viol_cumulative) + 117 | aes(x = factor(Regime), y = values, fill=Regime) + 118 | labs(title=paste0('Percentage of period 1 & period 2 critical violations\n', 119 | 'found in Period 1\n') ) + 120 | geom_bar(stat = "identity") + 121 | # guides(fill=FALSE) + 122 | scale_y_continuous(labels = percent) + 123 | xlab("") + ylab("") + expand_limits(y = 1) 124 | ``` 125 | 126 | 127 | 128 | ## Aggregate count comparisons 129 | 130 | ```{r} 131 | comp <- cbind( 132 | datTest[i = order(Inspection_Date), 133 | j = list(Inspection_ID_BAU = Inspection_ID, 134 | Inspection_Date = Inspection_Date, 135 | criticalFound_BAU = criticalFound)], 136 | datTest[i = order(-glm_pred), 137 | j = list(Inspection_ID_Model = Inspection_ID, 138 | criticalFound_Model = criticalFound)]) 139 | comp <- comp[ , Best_Possible := criticalFound_Model[order(-criticalFound_Model)]] 140 | comp <- comp[ , Worst_Possible := criticalFound_Model[order(criticalFound_Model)]] 141 | # comp 142 | 143 | comp_summary <- comp[ 144 | i = TRUE, 145 | j = list(Total_Inspections = .N, 146 | Crit_Violations_BAU = sum(criticalFound_BAU), 147 | Crit_Violations_Model = sum(criticalFound_Model), 148 | Best_Possible = sum(Best_Possible), 149 | Worst_Possible = sum(Worst_Possible)), 150 | keyby = list(Inspection_Date = Inspection_Date)] 151 | comp_summary 152 | ``` 153 | 154 | ## Calculation of average time saved with model 155 | 156 | ```{r} 157 | date_comp <- merge( 158 | x = comp[i = TRUE, 159 | j = list (criticalFound_BAU, 160 | date_bau = Inspection_Date, 161 | id = Inspection_ID_BAU)], 162 | y = comp[i = TRUE, 163 | j = list (criticalFound_Model, 164 | date_model = Inspection_Date, 165 | id = Inspection_ID_Model)], 166 | by = "id") 167 | ``` 168 | 169 | ### Average improvement based on date comparison 170 | 171 | ```{r} 172 | date_comp[i = criticalFound_Model==1, 173 | j = mean(date_model - date_bau)] 174 | ``` 175 | 176 | ### Distribution of "how many days sooner" for all critical violations 177 | 178 | ```{r} 179 | hist(as.numeric(date_comp[criticalFound_Model==1, 180 | 
-(date_model - date_bau)]), 181 | main = paste0("Days sooner that a critical \n", 182 | " violation would have been discovered"), 183 | breaks = 30, 184 | xlim = c(-60, 60), 185 | xlab = "days sooner", 186 | col = c(rep("#CC79A7", 10), rep("#009E73", 20))) 187 | ``` 188 | 189 | ```{r} 190 | hist(as.numeric(date_comp[criticalFound_Model==1, 191 | -(date_model - date_bau)]), 192 | main = paste0("Days sooner that a critical \n", 193 | " violation would have been discovered"), 194 | breaks = 5, 195 | xlab = "days sooner", 196 | col = c(rep("#CC79A7", 3), rep("#009E73", 10))) 197 | ``` 198 | 199 | 200 | 201 | ```{r} 202 | comp_summary_percent <- comp_summary[ 203 | i = TRUE, 204 | j = list(Pct_Crit_Violations_Found_BAU = Crit_Violations_BAU / Total_Inspections, 205 | Pct_Crit_Violations_Found_Model = Crit_Violations_Model / Total_Inspections), 206 | keyby = Inspection_Date] 207 | ``` 208 | 209 | ### Unadjusted plot 210 | 211 | ```{r} 212 | ggplot(melt(data = comp_summary_percent, id.vars = "Inspection_Date")) + 213 | aes(x=Inspection_Date, y=value, colour=variable) + 214 | labs(title=paste0("Percent of inspections resulting in a Critical Violation\n", 215 | "(Daily intervals shown)\n"))+ 216 | geom_line() 217 | ``` 218 | 219 | ### Plot smoothed by gam to show overall trend 220 | 221 | ```{r} 222 | ggplot(melt(data = comp_summary_percent, id.vars = "Inspection_Date")) + 223 | aes(x=Inspection_Date, y=value, colour=variable, fill=variable) + 224 | labs(title=paste0('Critical violations found on a daily basis\n', 225 | 'as a percent of total inspections performed\n', 226 | "(smoothed results)\n") ) + 227 | stat_smooth(method = "gam", 228 | formula = y ~ s(x, k=10, sp=2, bs="ps"), 229 | alpha=.3, 230 | level = .65) 231 | ``` 232 | 233 | 234 | ## Count comparisons (cumulative) 235 | 236 | 237 | ```{r} 238 | comp_summary_cumsum <- comp_summary[ 239 | i = TRUE, 240 | j = list(`\nInspection Date` = Inspection_Date, 241 | `Business As Usual` = cumsum(Crit_Violations_BAU), 242 | `Data Driven` = cumsum(Crit_Violations_Model), 243 | `Best Possible` = cumsum(Best_Possible), 244 | `Worst Possible` = cumsum(Worst_Possible))] 245 | 246 | 247 | ggplot(melt(data = comp_summary_cumsum, 248 | id.vars = "\nInspection Date")) + 249 | aes(x = `\nInspection Date`, 250 | y = value, 251 | colour = variable) + 252 | labs(title="Comparing cumulative violations discovered\n") + 253 | ylab("Cumulative critical violations to date\n") + 254 | geom_line(lwd=1.5) + 255 | geom_point(colour="black") 256 | 257 | ``` 258 | 259 | ```{r} 260 | comp_summary_cumsum_diff <- comp_summary_cumsum[ 261 | i = TRUE, 262 | j = list(Inspection_Date = `\nInspection Date`, 263 | diff = `Data Driven` - `Business As Usual`)] 264 | 265 | ggplot(comp_summary_cumsum_diff) + 266 | aes(x=Inspection_Date, y=diff) + 267 | labs(title=paste0("Difference Between BAU and Modeled\n", 268 | "Cumulative Critical Violations Found\n")) + 269 | ylab("Cumulative Difference of Critical Violations Found") + 270 | geom_line() 271 | # geneorama::clipper(comp_summary) 272 | ``` 273 | 274 | 275 | 276 | ```{r} 277 | comp_summary_cumsum_diff_multi <- comp_summary_cumsum[ 278 | i = TRUE, 279 | j = list(Inspection_Date = `\nInspection Date`, 280 | bau_diff = `Best Possible` - `Business As Usual`, 281 | model_diff = `Best Possible` - `Data Driven`)] 282 | 283 | ggplot(melt(comp_summary_cumsum_diff_multi, 284 | id.vars = "Inspection_Date")) + 285 | aes(x=Inspection_Date, y=value, colour=variable) + 286 | labs(title=paste0("Difference Between BAU and Modeled\n", 287 | "Cumulative Critical Violations Found\n")) + 288 | ylab("Cumulative Difference of Critical Violations 
Found") + 289 | geom_line() 290 | # geneorama::clipper(comp_summary) 291 | ``` 292 | 293 | 294 | ```{r} 295 | geneorama::set_project_dir("food-inspections-evaluation") 296 | foodInspect <- readRDS("DATA/13_food_inspections.Rds") 297 | foodInspect <- foodInspect[ , Violations := NULL] 298 | 299 | setkey(foodInspect, Inspection_ID) 300 | setkey(date_comp, id) 301 | 302 | date_comp_w_name <- foodInspect[,DBA_Name,keyby=Inspection_ID][date_comp] 303 | date_comp_w_name <- date_comp_w_name[ , criticalFound_Model:=NULL] 304 | setnames(date_comp_w_name, "Inspection_ID", "id") 305 | setnames(date_comp_w_name, "DBA_Name", "dba_name") 306 | setnames(date_comp_w_name, "criticalFound_BAU", "crit") 307 | 308 | # date_comp_w_name 309 | 310 | set.seed(27) 311 | ii <- c(sample(date_comp_w_name[crit==1,id], 5), 312 | sample(date_comp_w_name[crit==0,id], 20)) 313 | 314 | samp <- date_comp_w_name[id %in% ii] 315 | samp 316 | 317 | plot(data.frame(x = c(1, 1, 3, 3), 318 | y = c(1, nrow(samp), 1, nrow(samp))), 319 | type = "n", axes=F, xlab="", ylab="") 320 | samp_dba <- samp[order(samp$date_bau), ] 321 | samp_model <- samp[order(samp$date_model), ] 322 | 323 | text(x = 1.5, y = 1:nrow(samp_dba), labels = samp_dba$dba_name, 324 | col = ifelse(samp_dba$crit==1,"red", "black") ) 325 | text(x = 2.5, y = 1:nrow(samp_model), labels = samp_model$dba_name, 326 | col = ifelse(samp_model$crit==1,"red", "black") ) 327 | 328 | ``` 329 | 330 | 331 | 332 | -------------------------------------------------------------------------------- /REPORTS/Metric_Development.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/Metric_Development.xlsx -------------------------------------------------------------------------------- /REPORTS/Metric_Development_bargraphs.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/Metric_Development_bargraphs.xlsx -------------------------------------------------------------------------------- /REPORTS/Metric_Development_day_diff.xlsx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/Metric_Development_day_diff.xlsx -------------------------------------------------------------------------------- /REPORTS/ModelSummary_20141204.Rmd: -------------------------------------------------------------------------------- 1 | 2 | ## Purpose of this report: 3 | 4 | This report is (basically) just a hard copy of what you should see if you run the file "CODE/30\_glmnet\_model.R". It's only used for debugging / checking results. (Can be useful if you download new data or try a new model, and want to compare results to the "original"). 
5 | 6 | ## Initialization 7 | 8 | ```{r, echo=TRUE, warning=FALSE, results='hide', message=FALSE} 9 | geneorama::set_project_dir("food-inspections-evaluation") 10 | ##============================================================================== 11 | ## INITIALIZE 12 | ##============================================================================== 13 | ## Remove all objects; perform garbage collection 14 | rm(list=ls()) 15 | gc(reset=TRUE) 16 | ## Detach libraries that are not used 17 | geneorama::detach_nonstandard_packages() 18 | ## Load libraries that are used 19 | geneorama::loadinstall_libraries(c("data.table", "glmnet", "ggplot2", "knitr")) 20 | ## Load custom functions 21 | geneorama::sourceDir("CODE/functions/") 22 | 23 | ## Set knitr options 24 | opts_chunk$set(tidy = FALSE) 25 | ``` 26 | 27 | ## Load Data 28 | 29 | ```{r} 30 | geneorama::set_project_dir("food-inspections-evaluation") 31 | ##============================================================================== 32 | ## LOAD CACHED RDS FILES 33 | ##============================================================================== 34 | dat <- readRDS("DATA/23_dat_model.Rds") 35 | 36 | ## Only keep "Retail Food Establishment" 37 | dat <- dat[LICENSE_DESCRIPTION == "Retail Food Establishment"] 38 | ## Remove License Description 39 | dat[ , LICENSE_DESCRIPTION := NULL] 40 | dat <- na.omit(dat) 41 | 42 | ## Add criticalFound variable to dat: 43 | dat[ , criticalFound := pmin(1, criticalCount)] 44 | 45 | ## Set the key for dat 46 | setkey(dat, Inspection_ID) 47 | 48 | ##============================================================================== 49 | ## CREATE MODEL DATA 50 | ##============================================================================== 51 | # sort(colnames(dat)) 52 | xmat <- dat[ , list(criticalFound, 53 | # Inspector = Inspector_Grade, 54 | Inspector = Inspector_Assigned, 55 | pastSerious = pmin(pastSerious, 1), 56 | ageAtInspection = ifelse(ageAtInspection > 4, 1L, 0L), 57 | pastCritical = pmin(pastCritical, 1), 58 | consumption_on_premises_incidental_activity, 59 | tobacco_retail_over_counter, 60 | temperatureMax, 61 | heat_burglary = pmin(heat_burglary, 70), 62 | heat_sanitation = pmin(heat_sanitation, 70), 63 | heat_garbage = pmin(heat_garbage, 50), 64 | # risk = as.factor(Risk), 65 | # facility_type = as.factor(Facility_Type), 66 | timeSinceLast), 67 | keyby = Inspection_ID] 68 | mm <- model.matrix(criticalFound ~ . 
-1, data=xmat[ , -1, with=F]) 69 | mm <- as.data.table(mm) 70 | str(mm) 71 | colnames(mm) 72 | ``` 73 | 74 | ## Set up, run, and evaluate model 75 | 76 | ```{r} 77 | ##============================================================================== 78 | ## CREATE TEST / TRAIN PARTITIONS 79 | ##============================================================================== 80 | ## 2014-07-01 is an easy separator 81 | dat[Inspection_Date < "2014-07-01", range(Inspection_Date)] 82 | dat[Inspection_Date > "2014-07-01", range(Inspection_Date)] 83 | 84 | iiTrain <- dat[ , which(Inspection_Date < "2014-07-01")] 85 | iiTest <- dat[ , which(Inspection_Date > "2014-07-01")] 86 | 87 | ## Check to see if any rows didn't make it through the model.matrix formula 88 | nrow(dat) 89 | nrow(xmat) 90 | nrow(mm) 91 | 92 | ##============================================================================== 93 | ## GLMNET MODEL 94 | ##============================================================================== 95 | # fit ridge regression, alpha = 0, only inspector coefficients penalized 96 | pen <- ifelse(grepl("^Inspector", colnames(mm)), 1, 0) 97 | model <- glmnet(x = as.matrix(mm[iiTrain]), 98 | y = xmat[iiTrain, criticalFound], 99 | family = "binomial", 100 | alpha = 0, 101 | penalty.factor = pen) 102 | 103 | ## See what regularization parameter 'lambda' is optimal on tune set 104 | ## (We are essentially using the previous hardcoded value) 105 | errors <- sapply(model$lambda, 106 | function(lam) 107 | logLik(p = predict(model, 108 | newx = as.matrix(mm[iiTrain]), 109 | s=lam, 110 | type="response")[,1], 111 | y = xmat[iiTrain, criticalFound])) 112 | plot(x=log(model$lambda), y=errors, type="l") 113 | which.min(errors) 114 | model$lambda[which.min(errors)] 115 | ## manual lambda selection 116 | w.lam <- 100 117 | lam <- model$lambda[w.lam] 118 | coef <- model$beta[,w.lam] 119 | inspCoef <- coef[grepl("^Inspector",names(coef))] 120 | inspCoef <- inspCoef[order(-inspCoef)] 121 | ## coefficients for the inspectors, and for other variables 122 | inspCoef 123 | coef[!grepl("^Inspector",names(coef))] 124 | 125 | 126 | ## By the way, if we had knowledge of the future, we would have chosen a 127 | ## different lambda 128 | errorsTest <- sapply(model$lambda, 129 | function(lam) 130 | logLik(p = predict(model, 131 | newx = as.matrix(mm[iiTest]), 132 | s=lam, 133 | type="response")[,1], 134 | y = xmat[iiTest, criticalFound])) 135 | plot(x=log(model$lambda), y=errorsTest, type="l") 136 | which.min(errorsTest) 137 | model$lambda[which.min(errorsTest)] 138 | 139 | ## ATTACH PREDICTIONS TO DAT 140 | dat$glm_pred <- predict(model, newx=as.matrix(mm), 141 | s=lam, 142 | type="response")[,1] 143 | 144 | # show gini performance of inspector model on the test data set 145 | dat[iiTest, gini(glm_pred, criticalFound, plot=TRUE)] 146 | 147 | ## Calculate confusion matrix values for evaluation 148 | calculate_confusion_values(actual = xmat[iiTest, criticalFound], 149 | expected = dat[iiTest, glm_pred], 150 | r = .25) 151 | 152 | ## Calculate matrix of confusion matrix values for evaluation 153 | confusion_values_test <- t(sapply(seq(0, 1 ,.01), 154 | calculate_confusion_values, 155 | actual = xmat[iiTest, criticalFound], 156 | expected = dat[iiTest, glm_pred])) 157 | confusion_values_test 158 | ggplot(reshape2::melt(as.data.table(confusion_values_test), 159 | id.vars="r")) + 160 | aes(x=r, y=value, colour=variable) + geom_line() + 161 | geom_hline(yintercept = c(0,1)) 162 | 
##============================================================================== 164 | ## CALCULATION OF LIFT 165 | ##============================================================================== 166 | ## TEST PERIOD: Date range 167 | dat[iiTest, range(Inspection_Date)] 168 | ## TEST PERIOD: Total inspections 169 | dat[iiTest, .N] 170 | ## TEST PERIOD: Critical found 171 | dat[iiTest, sum(criticalCount)] 172 | ## TEST PERIOD: Inspections with any critical violations 173 | dat[iiTest, sum(criticalFound)] 174 | 175 | ## Subset test period 176 | datTest <- dat[iiTest] 177 | ## Identify first period 178 | datTest[ , period := ifelse(Inspection_Date < median(Inspection_Date),1,2)] 179 | datTest[, .N, keyby=list(period)] 180 | datTest[, .N, keyby=list(Inspection_Date, period)] 181 | ## Identify top half of scores (which would have been the first period) 182 | datTest[ , period_modeled := ifelse(glm_pred > median(glm_pred), 1, 2)] 183 | 184 | datTest[period == 1, sum(criticalFound)] 185 | datTest[period_modeled == 1, sum(criticalFound)] 186 | 187 | datTest[, list(.N, Violations = sum(criticalFound)), keyby=list(period)] 188 | datTest[, list(.N, Violations = sum(criticalFound)), keyby=list(period_modeled)] 189 | 190 | 141 / (141 + 117) ## share of all critical violations found in period 1, business as usual 191 | 178 / (178 + 80) ## share of all critical violations found in period 1, data driven model 192 | 0.6899225 - .5465116 ## difference between the two shares 193 | 194 | 0.65 - .55 ## rough, rounded version of the same comparison 195 | ``` 196 | 197 | 198 | 199 | 200 | -------------------------------------------------------------------------------- /REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.bib: -------------------------------------------------------------------------------- 1 | 2 | @techreport{liu_food_????, 3 | address = {Manhattan, {KS}}, 4 | title = {Food Safety Training Needed for Asian Restaurants: Longitudinal Review of Health Inspection Data in Kansas}, 5 | abstract = {The purpose of this study was to assess the frequencies and types of food code violations at Asian restaurants in Kansas using longitudinal review of health inspection data. A total of 326 restaurant inspection reports from 156 Asian restaurants in 10 Kansas counties were reviewed. Descriptive and inferential statistics were calculated using {SPSS}. The findings of this study suggested the focus areas for food safety training in Asian restaurants: temperature control of {PHF}; employee personal hygiene; and employee hand washing practices. Also, our results indicated that behavior-related violations, especially behavior-related critical violations occurred more during the routine health inspection than other inspection types. In the future, researchers could identify the effective ways to overcome barriers to food safety training in Asian restaurants. 
Through this investigation, Asian restaurant owners and managers may gain insights on what food handling practices related to code violations they should emphasize when training their employees.}, 6 | institution = {Kansas State University}, 7 | author = {Liu, Pei and Lee, Yee Ming and Choi, Young Gin and Kwon, Junehee}, 8 | keywords = {Asian restaurant, food code violation, food inspections, food safety training, foodborne illness, health inspection}, 9 | file = {Food and Safety Training Needed for Asian Restaurants- Longitudin.pdf:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/IVZ5DESU/Food and Safety Training Needed for Asian Restaurants- Longitudin.pdf:application/pdf} 10 | } 11 | 12 | @article{phillips_recurrent_????, 13 | title = {Recurrent Critical Violations of the Food Code in Retail Food Service Establishments}, 14 | volume = {68}, 15 | abstract = {Records of restaurant inspections by public health departments provide sequential "snapshots" of conditions in retail food service establishments that can be used to identify risk factors and evaluate the effectiveness 16 | of interventions. Data from a random 10 percent sample of restaurant inspection files from 31 counties in Oklahoma, including 4,044 inspections conducted during 1996-2000 in "medium-risk" and "high-risk" establishments, were analyzed to determine rates of critical violations and recurrent violations for different categories of establishments. Repeat violations accounted for about half of all violations. Establishments subjectively designated as high risk by health department personnel were in fact found to have higher violation rates than those described as medium-risk establishments. Outside Oklahoma County, regional chain restaurants were significantly more likely than other restaurants to have recurrent violations of critical items related to food-holding temperature, hygiene practices, sanitization, and hygiene facilities. Differences observed in violation rates among individual establishments were not primarily attributable to inconsistent enforcement by individual inspectors; rather, they appeared to be indicative of real differences in hygienic conditions and practices.}, 17 | number = {10}, 18 | journal = {Journal of Environmental Health}, 19 | author = {Phillips, Margaret L. and Elledge, Brenda L. and Basara, Heather G. and Lynch, Robert A. and Boatright, Daniel T.}, 20 | keywords = {food inspections, health}, 21 | pages = {24--30}, 22 | file = {RecurrentViolations.pdf:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/M27PH57F/RecurrentViolations.pdf:application/pdf} 23 | } 24 | 25 | @article{hoag_risk-based_2007, 26 | title = {A Risk-Based Food Inspection Program}, 27 | volume = {69}, 28 | abstract = {The inspection of food facilities is a crucial public service designed to prevent foodborne illnesses among retail food consumers. To enhance the existing food inspection process in San Bernardino County, California, a risk-based food inspection program and assessment instrument has been developed and proposed. A literature review and interviews with health professionals were conducted to establish a baseline understanding of various inspection procedures currently being employed throughout the nation. San Bernardino subsequently developed an assessment instrument and attendant inspection schedules that reflect best practices. 
The proposed inspection model categorizes food facilities as high, moderate, or low risk according to food properties, service population characteristics, facility history, and predefined operational risks. The San Bernardino model supports health department decision making with respect to inspection resource allocation and also makes possible sliding permit fees that reflect the relative risk associated with each facility.}, 29 | number = {7}, 30 | journal = {Journal of Environmental Health}, 31 | author = {Hoag, Michelle A. and Porter, Corwin and Uppala, Padma and Dyjack, David T.}, 32 | month = mar, 33 | year = {2007}, 34 | pages = {33--36} 35 | } 36 | 37 | @article{harris_health_2014, 38 | title = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014}, 39 | volume = {63}, 40 | url = {http://www.cdc.gov/mmwr/preview/mmwrhtml/mm6332a1.htm}, 41 | number = {32}, 42 | urldate = {2014-12-24}, 43 | journal = {Morbidity and Mortality Weekly Report}, 44 | author = {Harris, Jenine K. and Mansour, Raed and Choucair, Bechara and Olson, Joe and Nissen, Cory and Bhatt, Jay}, 45 | month = aug, 46 | year = {2014}, 47 | pages = {681--685}, 48 | file = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/WI73PGEZ/mm6332a1.html:text/html} 49 | } 50 | 51 | @article{friedman_regularization_2010, 52 | title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, 53 | volume = {33}, 54 | issn = {1548-7660}, 55 | url = {http://www.ncbi.nlm.nih.gov/pmc/articles/PMC2929880/}, 56 | abstract = {We develop fast algorithms for estimation of generalized linear models with convex penalties. The models include linear regression, two-class logistic regression, and multinomial regression problems while the penalties include ℓ1 (the lasso), ℓ2 (ridge regression) and mixtures of the two (the elastic net). The algorithms use cyclical coordinate descent, computed along a regularization path. The methods can handle large problems and can also deal efficiently with sparse features. In comparative timings we find that the new algorithms are considerably faster than competing methods.}, 57 | number = {1}, 58 | urldate = {2015-02-08}, 59 | journal = {Journal of Statistical Software}, 60 | author = {Friedman, Jerome and Hastie, Trevor and Tibshirani, Rob}, 61 | year = {2010}, 62 | pmid = {20808728}, 63 | pmcid = {PMC2929880}, 64 | pages = {1--22}, 65 | file = {PubMed Central Full Text PDF:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/NTTQQTW6/Friedman et al. 
- 2010 - Regularization Paths for Generalized Linear Models.pdf:application/pdf} 66 | } -------------------------------------------------------------------------------- /REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.docx: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.docx -------------------------------------------------------------------------------- /REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Chicago/food-inspections-evaluation/603d8dba4d2ea266aced60ac49bdd6ee96704d05/REPORTS/forecasting-restaurants-with-critical-violations-in-Chicago.pdf -------------------------------------------------------------------------------- /REPORTS/references.bib: -------------------------------------------------------------------------------- 1 | @Article{Friedman_2010, 2 | title = {Regularization Paths for Generalized Linear Models via Coordinate Descent}, 3 | author = {Jerome Friedman and Trevor Hastie and Robert Tibshirani}, 4 | journal = {Journal of Statistical Software}, 5 | year = {2010}, 6 | volume = {33}, 7 | number = {1}, 8 | pages = {1--22}, 9 | url = {http://www.jstatsoft.org/v33/i01/}, 10 | } 11 | 12 | @Book{Venables_2002, 13 | title = {Modern Applied Statistics with S}, 14 | author = {W. N. Venables and B. D. Ripley}, 15 | publisher = {Springer}, 16 | edition = {Fourth}, 17 | address = {New York}, 18 | year = {2002}, 19 | note = {ISBN 0-387-95457-0}, 20 | url = {http://www.stats.ox.ac.uk/pub/MASS4}, 21 | } 22 | 23 | @Article{harris_health_2014, 24 | title = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014}, 25 | volume = {63}, 26 | url = {http://www.cdc.gov/mmwr/preview/mmwrhtml/mm6332a1.htm}, 27 | number = {32}, 28 | urldate = {2014-12-24}, 29 | journal = {Morbidity and Mortality Weekly Report}, 30 | author = {Jenine K. Harris and Raed Mansour and Bechara Choucair and Joe Olson and Cory Nissen and Jay Bhatt}, 31 | month = {aug}, 32 | year = {2014}, 33 | pages = {681--685}, 34 | file = {Health Department Use of Social Media to Identify Foodborne Illness — Chicago, Illinois, 2013–2014:/Users/tomschenk/Library/Application Support/Zotero/Profiles/2mv6rop3.default/zotero/storage/WI73PGEZ/mm6332a1.html:text/html}, 35 | } 36 | 37 | @Article{Simon_2011, 38 | title = {Regularization Paths for Cox's Proportional Hazards Model via Coordinate Descent}, 39 | author = {Noah Simon and Jerome Friedman and Trevor Hastie and Rob Tibshirani}, 40 | journal = {Journal of Statistical Software}, 41 | year = {2011}, 42 | volume = {39}, 43 | number = {5}, 44 | pages = {1--13}, 45 | url = {http://www.jstatsoft.org/v39/i05/}, 46 | } 47 | -------------------------------------------------------------------------------- /food-inspections-evaluation.Rproj: -------------------------------------------------------------------------------- 1 | Version: 1.0 2 | 3 | RestoreWorkspace: No 4 | SaveWorkspace: No 5 | AlwaysSaveHistory: Default 6 | 7 | EnableCodeIndexing: Yes 8 | UseSpacesForTab: Yes 9 | NumSpacesForTab: 4 10 | Encoding: UTF-8 11 | 12 | RnwWeave: Sweave 13 | LaTeX: pdfLaTeX 14 | --------------------------------------------------------------------------------