16 |
17 | ## Stats format
18 |
19 |
20 | Date,Period,Area,Type,Count
21 | 2012-04-01,Month,Avon and Somerset Constabulary,Burglary,51
22 |
23 |
24 | ## Street
25 |
26 |
27 | Month,Reported by,Falls within,Easting,Northing,Location,Crime type,Context
28 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,360667,169714,On or near Hengrove Lane,Burglary,
29 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,363360,174812,On or near Mayfield Park South,Burglary,
30 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,356417,116223,On or near Hillside Terrace,Burglary,
31 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,366011,173580,On or near Brompton Close,Burglary,
32 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,360070,169484,On or near Parsons Paddock,Burglary,
33 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,372396,164022,On or near East Way,Burglary,
34 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,323046,124501,On or near Nightclub,Burglary,
35 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,360099,171539,On or near Firfield Street,Burglary,
36 | 2012-04,Avon and Somerset Constabulary,Avon and Somerset Constabulary,374604,164984,On or near Charlotte Street,Burglary,
37 |
38 |
39 | ## Neighbourhood
40 |
41 | Month,Force,Neighbourhood,All crime and ASB,Burglary,Anti-social behaviour,Robbery,Vehicle crime,Violent crime,Public disorder and weapons,Shoplifting,Criminal damage and arson,Other theft,Drugs,Other crime
42 | 2012-04,Avon and Somerset Constabulary,JN310,1,1,0,0,0,0,0,0,0,0,0,0
43 | 2012-04,Avon and Somerset Constabulary,FW004,11,1,1,0,1,2,0,1,2,1,2,0
44 | 2012-04,Avon and Somerset Constabulary,JC205,8,0,7,0,0,0,0,1,0,0,0,0
45 | 2012-04,Avon and Somerset Constabulary,JW110,9,2,4,0,0,0,0,0,2,1,0,0
46 | 2012-04,Avon and Somerset Constabulary,JW112,2,0,1,0,0,0,0,0,0,1,0,0
47 | 2012-04,Avon and Somerset Constabulary,JW109,13,0,5,0,2,0,0,0,1,3,2,0
48 | 2012-04,Avon and Somerset Constabulary,JW108,51,4,17,0,1,3,0,5,10,7,2,2
49 | 2012-04,Avon and Somerset Constabulary,JW107,39,2,23,0,1,8,0,0,3,1,0,1
50 | 2012-04,Avon and Somerset Constabulary,JW106,41,5,13,0,1,6,0,2,7,3,1,3
51 |
52 | ## Outcomes
53 |
54 |
55 | Month,Reported by,Falls within,Easting,Northing,Location,Outcome type
56 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Suspect charged
57 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Local resolution
58 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Offender sentenced as part of another case
59 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Offender given a caution
60 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,358631,172503,On or near Prince Street,Suspect charged
61 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,362083,182740,On or near Honeysuckle Close,Local resolution
62 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Suspect charged
63 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,,,No location,Offender given a caution
64 | 2012-08,Avon and Somerset Constabulary,Avon and Somerset Constabulary,375327,164974,On or near Johnstone Street,Offender given a caution
65 |
66 |
67 |
--------------------------------------------------------------------------------
/scripts/scrape.js:
--------------------------------------------------------------------------------
1 | var fs = require('fs');
2 | var path = require('path');
3 | var spawn = require('child_process').spawn;
4 |
5 | var jsdom = require('jsdom');
6 | var request = require('request');
7 | var csv = require('csv');
8 |
9 |
// Page that lists the downloadable archives to scrape.
var linklist = 'http://police.uk/data';
// JSON file holding the scraped download links.
var outlistfp = 'cache/linklist.json';
// Directory the downloaded zip archives are written to.
var zipsdir = 'cache/zip';
// Directory the cleaned-up CSV files are written to.
var csvdir = 'cache/csv';

// Make sure the cache directories exist before any command runs. The parent
// directory comes first because mkdirSync is not recursive here.
// fs.existsSync is used because path.existsSync has been removed from Node.
['cache', zipsdir, csvdir].forEach(function(dir) {
  if (!fs.existsSync(dir)) {
    fs.mkdirSync(dir);
  }
});
24 |
/**
 * Scrape the download page at `linklist` and record what it offers.
 *
 * Writes two files:
 *  - `outlistfp`: JSON object with `streets`, `neighbourhoods`, `outcomes`
 *    (always empty - outcomes are not collected) and `forces`.
 *  - `data/forces.json`: JSON array of `{id, label}` force descriptors.
 *
 * Takes no arguments and returns nothing; all work happens asynchronously.
 */
function scrapeLinkList() {
  var out = {
    'streets': [],
    'neighbourhoods': [],
    'outcomes': [],
    // must be initialised here, otherwise the pushes below throw
    'forces': []
  };
  jsdom.env({
    html: linklist,
    scripts: [
      'http://code.jquery.com/jquery.js'
    ],
    done: function(errors, window) {
      // Without a window (or without jQuery injected) nothing can be scraped.
      if (!window || !window.$) {
        console.error('Could not load ' + linklist, errors);
        return;
      }
      var $ = window.$;

      // street files (second column)
      $('#downloads .months table tr td:nth-child(2) a').each(function(idx, elem) {
        out['streets'].push( $(elem).attr('href') );
      });
      // neighbourhoods files (third column)
      $('#downloads .months table tr td:nth-child(3) a').each(function(idx, elem) {
        out['neighbourhoods'].push( $(elem).attr('href') );
      });
      // not interested in outcomes atm (final column)

      // One force per row of the first table (header row skipped).
      $('#downloads .months table').first().find('tr').slice(1).each(function(idx, elem) {
        var href = $($(elem).find('td a')[1]).attr('href');
        var segment = href ? href.split('/')[6] : undefined;
        if (!segment) {
          // row without the expected neighbourhood link - nothing to derive an id from
          return;
        }
        out['forces'].push({
          // Machine id: 7th path segment minus its first 8 characters
          // (presumably a "YYYY-MM-" prefix - confirm) and the file suffix.
          // Same derivation and id/label orientation as scrapeForces().
          id: segment.slice(8).replace('-neighbourhood.zip', ''),
          // Human readable name taken from the row header cell.
          label: $(elem).find('th').first().text()
        });
      });
      out['forces'].push({
        id: 'btp',
        label: 'British Transport Police'
      });

      // now save
      fs.writeFile(outlistfp, JSON.stringify(out, null, 2), function(err) {
        if (err) {
          console.error('Could not write ' + outlistfp + ': ' + err.message);
          return;
        }
        console.log('JSON saved to ' + outlistfp);
      });
      fs.writeFile('data/forces.json', JSON.stringify(out['forces'], null, 2), function(err) {
        if (err) {
          console.error('Could not write data/forces.json: ' + err.message);
          return;
        }
        console.log('Forces data written');
      });
    }
  });
}
70 |
/**
 * Scrape the list of police forces from the download page at `linklist`
 * and write it to `data/forces.csv` with columns `id,label`.
 *
 * Takes no arguments and returns nothing; all work happens asynchronously.
 */
function scrapeForces() {
  jsdom.env({
    html: linklist,
    scripts: [
      'http://code.jquery.com/jquery.js'
    ],
    done: function(errors, window) {
      // Without a window (or without jQuery injected) nothing can be scraped.
      if (!window || !window.$) {
        console.error('Could not load ' + linklist, errors);
        return;
      }
      var $ = window.$;

      var writer = csv().to.path('data/forces.csv');
      writer.write(['id', 'label']);

      // One force per row of the first table (header row skipped).
      $('#downloads .months table').first().find('tr').slice(1).each(function(idx, elem) {
        var href = $($(elem).find('td a')[1]).attr('href');
        var segment = href ? href.split('/')[6] : undefined;
        if (!segment) {
          // row without the expected neighbourhood link - skip it
          return;
        }
        writer.write([
          // id: 7th path segment minus its first 8 characters (presumably a
          // "YYYY-MM-" prefix - confirm) and the file suffix
          segment.slice(8).replace('-neighbourhood.zip', ''),
          // label: text of the row header cell
          $(elem).find('th').first().text()
        ]);
      });
      // Added by hand after the scraped rows.
      writer.write([
        'btp',
        'British Transport Police'
      ]);
    }
  });
}
100 |
/**
 * Download every "streets" archive listed in `outlistfp` into `zipsdir`.
 *
 * All downloads are started at once. Each file is named after the last path
 * segment of its link, and its path is logged once it has been fully written.
 */
function scrapeZip() {
  var links = JSON.parse(fs.readFileSync(outlistfp))['streets'];
  links.forEach(function(link) {
    var fn = path.join(zipsdir, link.split('/').pop());
    var stream = fs.createWriteStream(fn);
    var failed = false;

    stream.on('error', function(e) {
      console.error(e);
    });

    request(link)
      // An unhandled 'error' event would take the whole process down and
      // abort every other download, so report it and close this file only.
      .on('error', function(e) {
        failed = true;
        console.error('Download failed for ' + link + ': ' + e.message);
        stream.end();
      })
      .pipe(stream)
      .on('close', function() {
        // only report files that were downloaded completely
        if (!failed) {
          console.log(fn);
        }
      });
  });
}
117 |
/**
 * Convert the downloaded "streets" zip archives into cleaned-up CSV files.
 *
 * For each link in `outlistfp` the matching archive in `zipsdir` is piped
 * through `unzip -p` and rewritten into `csvdir` as `<zip name>.csv` with:
 *  - rows without a usable location dropped,
 *  - easting/northing replaced by longitude/latitude,
 *  - the month turned into a full date (`-01` appended),
 *  - the "Falls within" column removed.
 *
 * Archives are processed one after another. Running totals are logged after
 * each file and once more at the end.
 */
function consolidateZipToCsv() {
  var stats = {
    total: 0,
    total_with_location: 0
  };
  // "Falls within" is dropped because it appears to always repeat
  // "Reported by"; rows where the two differ are logged during processing.
  var headers = 'Month,Reported by,Longitude,Latitude,Location,Crime type,Context'.split(',');

  var links = JSON.parse(fs.readFileSync(outlistfp))['streets'];
  // NOTE(review): only the first 45 links are processed - presumably a
  // deliberate cap on the time range; confirm before changing.
  links = links.slice(0,45);

  // Convert a single archive, then call cb exactly once.
  // (Not named `process`, to avoid shadowing Node's global.)
  function processLink(link, cb) {
    var fn = link.split('/').pop();
    var csvpath = path.join(csvdir, fn + '.csv');
    var zipfp = path.join(zipsdir, fn);
    console.log('Processing: ' + zipfp + ' to ' + csvpath);

    // 'end' and 'error' may both fire; make sure the loop advances only once.
    var finished = false;
    function finish() {
      if (finished) {
        return;
      }
      finished = true;
      cb();
    }

    var unzip = spawn('unzip', ['-p', zipfp]);
    unzip.on('error', function(error) {
      console.log(error.message);
      finish();
    });

    csv()
      .from.stream(unzip.stdout)
      .to.path(csvpath)
      .transform(function(data, idx) {
        if (idx === 0) {
          return headers;
        }
        stats.total += 1;
        // we will be a bit brutal - discard everything w/o a location
        // data[3] = Easting
        if (!data[3]) {
          return;
        }
        // fix up easting / northing to lon/lat; the converter returns null
        // for values it cannot parse, which is treated as "no location"
        var lonlat = convertEastingNorthingToLonLat(data.slice(3,5));
        if (!lonlat) {
          return;
        }
        stats.total_with_location += 1;
        data.splice(3, 2, lonlat[0], lonlat[1]);

        // fix month
        data[0] = data[0] + '-01';

        // let's drop 'Falls within' but note first if "Reported by" and "Falls within" are different
        if (data[1] != data[2]) { // so unusual worth noting!!
          console.log(data);
        }
        data.splice(1,1);
        return data;
      })
      .on('end', function() {
        console.log(stats);
        finish();
      })
      .on('error', function(error) {
        // report and move on to the next archive instead of stalling
        console.log(error.message);
        finish();
      });
  }

  var next = 0;
  var looper = function() {
    if (next >= links.length) {
      console.log(stats);
      return;
    }
    var link = links[next];
    next += 1;
    processLink(link, looper);
  };
  looper();
}
193 |
// Projection setup used to convert OSGB36 eastings/northings to WGS84.
var proj4js = require('proj4js');
// Conversion spec courtesy of Peter Hicks:
// http://blog.poggs.com/2010/09/converting-osgb36-eastingsnorthings-to-wgs84-longitudelatitude-in-ruby/
proj4js.defs.OSGB36 = [
  '+proj=tmerc',
  '+lat_0=49',
  '+lon_0=-2',
  '+k=0.9996012717',
  '+x_0=400000',
  '+y_0=-100000',
  '+ellps=airy',
  '+datum=OSGB36',
  '+units=m',
  '+no_defs'
].join(' ');
// Source projection object handed to proj4js.transform().
var srcproj = new proj4js.Proj('OSGB36');
200 |
// Convert an OSGB36 easting/northing pair to WGS84 longitude/latitude.
// eastingnorthing must be an array [easting, northing]; entries may be
// numbers or numeric strings (they are parsed as base-10 integers).
// :return: [lon, lat] rounded to 6 decimal places, or null when the easting
//          is missing/zero/unparseable or the northing is unparseable
var convertEastingNorthingToLonLat = function(eastingnorthing) {
  var easting = parseInt(eastingnorthing[0], 10);
  var northing = parseInt(eastingnorthing[1], 10);
  // !easting covers both NaN and 0, matching the previous behaviour; the
  // northing check stops NaN coordinates from being emitted.
  if (!easting || isNaN(northing)) {
    return null;
  }
  var point = new proj4js.Point([easting, northing]);
  var out = proj4js.transform(srcproj, proj4js.WGS84, point);
  // 6 decimal places is roughly 0.1m accuracy
  return [round(out.x,6), round(out.y,6)];
};
214 |
// Round num to decPlaces decimal places.
// Scales up by a power of ten, rounds to the nearest integer with Math.round,
// then scales back down.
function round(num, decPlaces) {
  var factor = Math.pow(10, decPlaces);
  var scaled = Math.round(num * factor);
  return scaled / factor;
}
219 |
/**
 * Aggregate the consolidated CSV files into `cache/stats.csv`.
 *
 * Every "streets" link from `outlistfp` (optionally only those containing
 * `filterString`) is mapped to its CSV file, and the rows are counted per
 * distinct (Month, Reported by, Crime type) combination. Counts are written
 * per input file as rows of `Period,Date,Body,Type,Count`, where Period is
 * always the literal 'Month'.
 *
 * @param {string} [filterString] only process links containing this text
 */
function computeStats(filterString) {
  var links = JSON.parse(fs.readFileSync(outlistfp))['streets'];
  links = links.filter(function(link) {
    return !filterString || link.indexOf(filterString) != -1;
  });

  // key (joined column values) -> number of rows seen for the current file
  var stats = {};
  // Input columns: Month,Reported by,Longitude,Latitude,Location,Crime type,Context
  // Indexes of the columns that make up a counting key.
  var distinctRows = [0,1,5];

  // Count one input row under its key.
  var processRow = function(row) {
    var key = distinctRows.map(function(idx) {
      return row[idx];
    }).join(':::');
    if (key in stats) {
      stats[key] = stats[key] + 1;
    } else {
      stats[key] = 1;
    }
  };

  // Emit one output row per key: 'Month', the key parts, then the count.
  var writeStats = function(theStats, stream) {
    Object.keys(theStats).forEach(function(key) {
      var row = ['Month'].concat(key.split(':::'));
      row.push(theStats[key]);
      stream.write(row);
    });
  };

  // declared locally so no global `headers` is created
  var headers = 'Period,Date,Body,Type,Count'.split(',');
  var outcsv = csv().to.path('cache/stats.csv');
  outcsv.write(headers);

  if (links.length === 0) {
    // nothing matched: leave a header-only file rather than crashing
    console.log('No links to process');
    return;
  }

  var current = 0;
  // Count one file, flush its stats, then move on to the next link.
  // (Not named `process`, to avoid shadowing Node's global.)
  function processFile(link) {
    var csvpath = _csvFilePathFromLink(link);
    csv()
      .from.path(csvpath)
      .on('record', function(data, rowIdx) {
        // row 0 is the header
        if (rowIdx > 0) {
          processRow(data);
        }
      })
      .on('end', function() {
        console.log('Processed ' + csvpath);
        // stats are written and reset per file to keep memory bounded
        writeStats(stats, outcsv);
        stats = {};
        current += 1;
        if (current < links.length) {
          processFile(links[current]);
        }
      })
      .on('error', function(error) {
        console.log('Failed to process ' + csvpath + ': ' + error.message);
      });
  }
  processFile(links[0]);
}
282 |
// Map a download link to the path of its consolidated CSV file:
// the last '/'-separated segment of the link plus '.csv', inside csvdir.
var _csvFilePathFromLink = function(link) {
  var segments = link.split('/');
  var zipName = segments[segments.length - 1];
  return path.join(csvdir, zipName + '.csv');
};
288 |
// Command-line entry point: node scrape.js <command> [filter]
//
// Typical order of use: scrapelinks, scrapezip, consolidate, stats.

// drop 'node' and this script (slice, so process.argv itself is left intact)
var args = process.argv.slice(2);
// optional second argument: only used by `stats` to restrict the links
var filter = args.length >= 2 ? args[1] : undefined;

// command name -> action
var commands = {
  scrapelinks: function() { scrapeLinkList(); },
  scrapeforces: function() { scrapeForces(); },
  scrapezip: function() { scrapeZip(); },
  consolidate: function() { consolidateZipToCsv(); },
  stats: function() { computeStats(filter); }
};

if (args.length == 0 || !Object.prototype.hasOwnProperty.call(commands, args[0])) {
  // no command or an unknown one: show what is actually available
  console.log('Commands are: scrapelinks | scrapeforces | scrapezip | consolidate | stats [filter]');
} else {
  commands[args[0]]();
}
313 |
314 |
--------------------------------------------------------------------------------