├── .gitignore ├── scripts ├── README.md └── scrape.js ├── data └── forces.csv └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | .*.swp 2 | cache/* 3 | node_modules/* 4 | -------------------------------------------------------------------------------- /scripts/README.md: -------------------------------------------------------------------------------- 1 | scrape.js is a node script. You'll need to install the relevant dependencies, which are: 2 | 3 | npm install jsdom request csv proj4js 4 | 5 | -------------------------------------------------------------------------------- /data/forces.csv: -------------------------------------------------------------------------------- 1 | id,Label,Population 2010 2 | avon-and-somerset,Avon & Somerset Constabulary,1623200 3 | bedfordshire,Bedfordshire Police,614800 4 | cambridgeshire,Cambridgeshire Constabulary,789700 5 | cheshire,Cheshire Constabulary,1009300 6 | city-of-london,City of London Police,563500 7 | cleveland,Cleveland Police,494400 8 | cumbria,Cumbria Constabulary,1010600 9 | derbyshire,Derbyshire Constabulary,1680400 10 | devon-and-cornwall,Devon & Cornwall Police,715000 11 | dorset,Dorset Police,611600 12 | durham,Durham Constabulary,506100 13 | dyfed-powys,Dyfed Powys Police,1738000 14 | essex,Essex Police,593500 15 | gloucestershire,Gloucestershire Constabulary,2629400 16 | greater-manchester,Greater Manchester Police,561400 17 | gwent,Gwent Police,1884200 18 | hampshire,Hampshire Constabulary,1107500 19 | hertfordshire,Hertfordshire Constabulary,921200 20 | humberside,Humberside Constabulary,1684100 21 | kent,Kent Police,1449300 22 | lancashire,Lancashire Constabulary,993900 23 | leicestershire,Leicestershire Police,703000 24 | lincolnshire,Lincolnshire Police,11700 25 | merseyside,Merseyside Police,1353400 26 | metropolitan,Metropolitan Police Service,7813500 27 | norfolk,Norfolk Constabulary,862300 28 | north-wales,North Wales Police,678500 29 | 
north-yorkshire,North Yorkshire Police,802200 30 | northamptonshire,Northamptonshire Police,687300 31 | northumbria,Northumbria Police,1431500 32 | nottinghamshire,Nottinghamshire Police,1086600 33 | south-wales,South Wales Police,1260500 34 | south-yorkshire,South Yorkshire Police,1328300 35 | staffordshire,Staffordshire Police,1071400 36 | suffolk,Suffolk Constabulary,719500 37 | surrey,Surrey Police,1127300 38 | sussex,Sussex Police,1574000 39 | thames-valley,Thames Valley Police,2253500 40 | warwickshire,Warwickshire Police,536000 41 | west-mercia,West Mercia Police,1192700 42 | west-midlands,West Midlands Police,2655100 43 | west-yorkshire,West Yorkshire Police,2249500 44 | wiltshire,Wiltshire Police,661600 45 | btp,British Transport Police, -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | badge 2 | 3 | UK Crime data. This is a consolidation of, pointers to, and scripts for, data 4 | from various sources, primarily the UK Government site at 5 | http://police.uk/data. 6 | 7 | ## Forces Info 8 | 9 | See data/forces.csv and a GDocs version: 10 | 11 | 12 | ### Population Data 13 | 14 | Police Force area population numbers from the Home Office: 15 | 16 | 17 | ## Stats format 18 | 19 |
20 | Date,Period,Area,Type,Count
21 | 2012-04-01,Month,Avon and Somerset Constabulary,Burglary,51
22 | 
23 | 24 | ## Street 25 | 26 |
27 | Month,Reported by,Falls within,Easting,Northing,Location,Crime type,Context
28 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,360667,169714,On or near Hengrove Lane,Burglary,
29 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,363360,174812,On or near Mayfield Park South,Burglary,
30 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,356417,116223,On or near Hillside Terrace,Burglary,
31 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,366011,173580,On or near Brompton Close,Burglary,
32 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,360070,169484,On or near Parsons Paddock,Burglary,
33 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,372396,164022,On or near East Way,Burglary,
34 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,323046,124501,On or near Nightclub,Burglary,
35 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,360099,171539,On or near Firfield Street,Burglary,
36 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,374604,164984,On or near Charlotte Street,Burglary,
37 | 
38 | 39 | ## Neighbourhood 40 | 41 | Month,Force,Neighbourhood,All crime and ASB,Burglary,Anti-social behaviour,Robbery,Vehicle crime,Violent crime,Public disorder and weapons,Shoplifting,Criminal damage and arson,Other theft,Drugs,Other crime 42 | 2012-04,Avon and Somerset Constabulary,JN310,1,1,0,0,0,0,0,0,0,0,0,0 43 | 2012-04,Avon and Somerset Constabulary,FW004,11,1,1,0,1,2,0,1,2,1,2,0 44 | 2012-04,Avon and Somerset Constabulary,JC205,8,0,7,0,0,0,0,1,0,0,0,0 45 | 2012-04,Avon and Somerset Constabulary,JW110,9,2,4,0,0,0,0,0,2,1,0,0 46 | 2012-04,Avon and Somerset Constabulary,JW112,2,0,1,0,0,0,0,0,0,1,0,0 47 | 2012-04,Avon and Somerset Constabulary,JW109,13,0,5,0,2,0,0,0,1,3,2,0 48 | 2012-04,Avon and Somerset Constabulary,JW108,51,4,17,0,1,3,0,5,10,7,2,2 49 | 2012-04,Avon and Somerset Constabulary,JW107,39,2,23,0,1,8,0,0,3,1,0,1 50 | 2012-04,Avon and Somerset Constabulary,JW106,41,5,13,0,1,6,0,2,7,3,1,3 51 | 52 | ## Outcomes 53 | 54 |
55 | Month,Reported by,Falls within,Easting,Northing,Location,Outcome type
56 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Suspect charged
57 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Local resolution
58 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Offender sentenced as part of another case
59 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Offender given a caution
60 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,358631,172503,On or near Prince Street,Suspect charged
61 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,362083,182740,On or near Honeysuckle Close,Local resolution
62 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Suspect charged
63 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Offender given a caution
64 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,375327,164974,On or near Johnstone Street,Offender given a caution
65 | 
// /scripts/scrape.js
//
// Scraper / consolidator for the police.uk open crime data.
//
// Usage: node scrape.js <command> [filter]
//
//   scrapelinks   scrape the download page and cache the list of zip links
//   scrapeforces  scrape the police force ids/labels into data/forces.csv
//   scrapezips    download every "street" zip listed in the cached link list
//   consolidate   unzip the street files into cleaned per-month CSV files
//   stats         aggregate the cleaned CSVs into cache/stats.csv
//                 (optional [filter]: only links containing this substring)

var fs = require('fs');
var path = require('path');
var spawn = require('child_process').spawn;

var jsdom = require('jsdom');
var request = require('request');
var csv = require('csv');
var proj4js = require('proj4js');


var linklist = 'http://police.uk/data';
var outlistfp = 'cache/linklist.json';
var zipsdir = 'cache/zip';
var csvdir = 'cache/csv';

// Create a directory if it does not exist yet.
// fs.existsSync replaces the long-deprecated path.existsSync.
function ensureDir(dir) {
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir);
  }
}

// 'cache' must be created first: the other two live inside it.
ensureDir('cache');
ensureDir(zipsdir);
ensureDir(csvdir);

// Derive a force id (e.g. "avon-and-somerset") from a neighbourhood zip href.
// Takes the 7th '/'-separated segment of the href, drops its first 8
// characters (presumably a "YYYY-MM-" prefix - TODO confirm against the live
// URLs) and strips the '-neighbourhood.zip' suffix.
// :return: the id string, or null if the href does not have the expected shape
function forceIdFromHref(href) {
  if (typeof href !== 'string') {
    return null;
  }
  var filename = href.split('/')[6];
  if (!filename) {
    return null;
  }
  return filename.slice(8).replace('-neighbourhood.zip', '');
}

// Build a {id, label} record from one row of the first downloads table.
// The id comes from the row's second link, the label from its first <th>.
// :return: the record, or null if the row has no usable link
function forceFromRow($, row) {
  var href = $($(row).find('td a')[1]).attr('href');
  var id = forceIdFromHref(href);
  if (id === null) {
    return null;
  }
  return {
    id: id,
    label: $(row).find('th').first().text()
  };
}

// Scrape the download page and write:
//   - cache/linklist.json : street / neighbourhood zip links plus forces
//   - data/forces.json    : the list of forces as {id, label} objects
function scrapeLinkList() {
  var out = {
    'streets': [],
    'neighbourhoods': [],
    'outcomes': [],
    // must exist before we push to it below
    'forces': []
  };
  jsdom.env({
    html: linklist,
    scripts: [
      'http://code.jquery.com/jquery.js'
    ],
    done: function(errors, window) {
      if (errors) {
        console.error(errors);
      }
      if (!window || !window.$) {
        console.error('Could not load ' + linklist);
        return;
      }
      var $ = window.$;
      // street files
      $('#downloads .months table tr td:nth-child(2) a').each(function(idx, elem) {
        out['streets'].push( $(elem).attr('href') );
      });
      // neighbourhoods files
      $('#downloads .months table tr td:nth-child(3) a').each(function(idx, elem) {
        out['neighbourhoods'].push( $(elem).attr('href') );
      });
      // not interested in outcomes atm (final column)

      // forces: one per row of the first table (skipping the header row)
      $('#downloads .months table').first().find('tr').slice(1).each(function(idx, elem) {
        var force = forceFromRow($, elem);
        if (force) {
          out['forces'].push(force);
        }
      });
      out['forces'].push({
        id: 'btp',
        label: 'British Transport Police'
      });

      // now save
      fs.writeFile(outlistfp, JSON.stringify(out, null, 2), function(err) {
        if (err) {
          console.error(err);
          return;
        }
        console.log('JSON saved to ' + outlistfp);
      });
      fs.writeFile('data/forces.json', JSON.stringify(out['forces'], null, 2), function(err) {
        if (err) {
          console.error(err);
          return;
        }
        console.log('Forces data written');
      });
    }
  });
}

// Scrape the force ids and labels from the download page into data/forces.csv.
// NOTE: this rewrites data/forces.csv with only the two columns id,label, so
// any extra hand-maintained columns in that file (e.g. population) are lost.
function scrapeForces() {
  jsdom.env({
    html: linklist,
    scripts: [
      'http://code.jquery.com/jquery.js'
    ],
    done: function(errors, window) {
      if (errors) {
        console.error(errors);
      }
      if (!window || !window.$) {
        console.error('Could not load ' + linklist);
        return;
      }
      var writer = csv().to.path('data/forces.csv');
      writer.write(['id', 'label']);

      var $ = window.$;
      $('#downloads .months table').first().find('tr').slice(1).each(function(idx, elem) {
        var force = forceFromRow($, elem);
        if (force) {
          writer.write([force.id, force.label]);
        }
      });
      writer.write([
        'btp',
        'British Transport Police'
      ]);
    }
  });
}

// Download every street zip listed in cache/linklist.json into cache/zip.
// All downloads are started at once; each file name is logged when its
// write stream closes.
function scrapeZip() {
  var links = JSON.parse(fs.readFileSync(outlistfp))['streets'];
  links.forEach(function(link) {
    var fn = path.join(zipsdir, link.split('/').pop());
    var stream = fs.createWriteStream(fn);
    stream.on('error', function(e) {
      console.error(e);
    });
    request(link)
      .on('error', function(e) {
        // network failure: report it rather than crash the other downloads
        console.error(e);
      })
      .pipe(stream)
      .on('close', function() {
        console.log(fn);
      });
  });
}

// Convert the downloaded street zips into cleaned CSV files in cache/csv,
// one zip at a time. For every row:
//   - rows without an easting are discarded
//   - easting/northing are converted to longitude/latitude
//   - the month gets a '-01' day suffix
//   - the 'Falls within' column is dropped
// Running totals are logged after each file and again at the end.
function consolidateZipToCsv() {
  var stats = {
    total: 0,
    total_with_location: 0
  };
  // Known oddity: node may warn "possible EventEmitter memory leak detected"
  // here - thought to be csv processing not unbinding its listeners.

  // "Falls within" is dropped as it is (almost) always a repetition of
  // "Reported by" - see the transform below.
  var headers = 'Month,Reported by,Longitude,Latitude,Location,Crime type,Context'.split(',');

  var links = JSON.parse(fs.readFileSync(outlistfp))['streets'];
  // only the first 45 files are processed
  links = links.slice(0,45);

  // Process a single link; cb is called once its CSV has been written.
  function processLink(link, cb) {
    var csvpath = _csvFilePathFromLink(link);
    var zipfp = path.join(zipsdir, link.split('/').pop());
    console.log('Processing: ' + zipfp + ' to ' + csvpath);
    var unzip = spawn('unzip', ['-p', zipfp]);
    unzip.on('error', function(error) {
      // e.g. the unzip binary is not installed
      console.log(error.message);
    });
    csv()
      .from.stream(unzip.stdout)
      .to.path(csvpath)
      .transform(function(data, idx) {
        if (idx === 0) {
          return headers;
        }
        stats.total += 1;
        // we will be a bit brutal - discard everything w/o a location
        // data[3] = Easting
        if (!data[3]) {
          return;
        }
        // fix up easting / northing to lon/lat
        var lonlat = convertEastingNorthingToLonLat(data.slice(3,5));
        if (!lonlat) {
          // unparseable easting: treat it like a missing location
          return;
        }
        stats.total_with_location += 1;
        data.splice(3, 2, lonlat[0], lonlat[1]);

        // fix month
        data[0] = data[0] + '-01';

        // let's drop 'Falls within' but note first if "Reported by" and
        // "Falls within" are different - so unusual it is worth noting!!
        if (data[1] !== data[2]) {
          console.log(data);
        }
        data.splice(1,1);
        return data;
      })
      .on('end', function() {
        console.log(stats);
        cb();
      })
      .on('error', function(error) {
        // NB: cb is not called here, so processing stops at the failing file
        console.log(error.message);
      });
  }

  var idx = 0;
  var looper = function() {
    if (idx >= links.length) {
      console.log(stats);
      return;
    }
    var link = links[idx];
    idx += 1;
    processLink(link, looper);
  };
  looper();
}

// Set up Projections for OSGB36 => WGS84 conversion
// hat-tip to Peter Hicks for providing the conversion spec
// http://blog.poggs.com/2010/09/converting-osgb36-eastingsnorthings-to-wgs84-longitudelatitude-in-ruby/
proj4js.defs["OSGB36"]="+proj=tmerc +lat_0=49 +lon_0=-2 +k=0.9996012717 +x_0=400000 +y_0=-100000 +ellps=airy +datum=OSGB36 +units=m +no_defs";
var srcproj = new proj4js.Proj('OSGB36');

// Convert an OSGB36 easting/northing pair to WGS84 longitude/latitude.
// eastingnorthing must be an array (easting, northing); values may be strings.
// :return: [lon, lat] rounded to 6 decimal places, or null if the easting is
//          missing, zero or not a number
var convertEastingNorthingToLonLat = function(eastingnorthing) {
  var parsed = [
    parseInt(eastingnorthing[0], 10),
    parseInt(eastingnorthing[1], 10)
  ];
  if (!parsed[0]) {
    return null;
  }
  var point = new proj4js.Point(parsed);
  var out = proj4js.transform(srcproj, proj4js.WGS84, point);
  // 5 decimal places is ~1m accuracy; we keep 6
  return [round(out.x,6), round(out.y,6)];
};

// Round num to decPlaces decimal places.
function round(num, decPlaces) {
  var scale = Math.pow(10,decPlaces);
  return Math.round(num*scale)/scale;
}

// Aggregate the cleaned street CSVs into cache/stats.csv: one row per distinct
// (month, reporting force, crime type) with the number of matching records.
// filterString (optional): only process links that contain this substring.
function computeStats(filterString) {
  var links = JSON.parse(fs.readFileSync(outlistfp))['streets'];
  links = links.filter(function(link) {
    return !filterString || link.indexOf(filterString) !== -1;
  });

  var stats = {};
  // Columns of the cleaned CSVs:
  // Month,Reported by,Longitude,Latitude,Location,Crime type,Context
  // so indices 0,1,5 = Month, Reported by, Crime type
  var distinctRows = [0,1,5];
  var processRow = function(row) {
    var key = distinctRows.map(function(idx) {
      return row[idx];
    }).join(':::');
    if (key in stats) {
      stats[key] = stats[key] + 1;
    } else {
      stats[key] = 1;
    }
  };
  var writeStats = function(theStats, stream) {
    for (var key in theStats) {
      // the period is always 'Month'
      var row = ['Month'].concat(key.split(':::'));
      row.push(theStats[key]);
      stream.write(row);
    }
  };

  var headers = 'Period,Date,Body,Type,Count'.split(',');
  var outcsv = csv().to.path('cache/stats.csv');
  outcsv.write(headers);

  if (links.length === 0) {
    console.log('No street files to process');
    return;
  }

  var current = 0;
  // Process one file, flush its counts, then move on to the next one.
  function processLink(link) {
    var csvpath = _csvFilePathFromLink(link);
    csv()
      .from.path(csvpath)
      .on('record', function(data, rowIdx) {
        // row 0 is the header
        if (rowIdx > 0) {
          processRow(data);
        }
      })
      .on('end', function() {
        console.log('Processed ' + csvpath);
        // counts are written (and reset) per file to keep memory bounded
        writeStats(stats, outcsv);
        stats = {};
        current += 1;
        if (current < links.length) {
          processLink(links[current]);
        }
      })
      .on('error', function(error) {
        console.log(error.message);
      });
  }
  processLink(links[0]);
}

// Path of the cleaned CSV file (inside cache/csv) for a given zip link.
var _csvFilePathFromLink = function(link) {
  var fn = link.split('/').pop();
  return path.join(csvdir, fn + '.csv');
};

// ---- command line dispatch ----

// drop 'node' and this script
var args = process.argv.slice(2);
// optional second argument: substring filter for the 'stats' command
var filter = args.length >= 2 ? args[1] : undefined;

var commands = {
  scrapelinks: scrapeLinkList,
  scrapeforces: scrapeForces,
  scrapezips: scrapeZip,
  consolidate: consolidateZipToCsv,
  stats: function() {
    computeStats(filter);
  }
};
var usage = 'Commands are: ' + Object.keys(commands).join(' | ');

if (args.length === 0) {
  console.log(usage);
} else if (Object.prototype.hasOwnProperty.call(commands, args[0])) {
  commands[args[0]]();
} else {
  console.log('Unknown command: ' + args[0]);
  console.log(usage);
}