├── README.md
└── SF.py

/README.md:
--------------------------------------------------------------------------------
# SF-Crime-analysis
San Francisco crime data analysis on the Databricks platform, using MapReduce to implement distributed data processing.

--------------------------------------------------------------------------------
/SF.py:
--------------------------------------------------------------------------------
# import the CSV reader
from csv import reader

# prepare data: crime_data_lines is an RDD of raw CSV lines
# (e.g. loaded earlier with sc.textFile on the SF crime CSV export)
df_crimes = crime_data_lines.map(lambda line: [x.strip('"') for x in next(reader([line]))])

# get the header row
header = df_crimes.first()

header
Out[159]:
['IncidntNum',
 'Category',
 'Descript',
 'DayOfWeek',
 'Date',
 'Time',
 'PdDistrict',
 'Resolution',
 'Address',
 'X',
 'Y',
 'Location',
 'PdId']

# remove the header line from the data
crimes = df_crimes.filter(lambda x: x != header)

# approach 1: use the RDD API
# approach 2: use a DataFrame, registering the RDD as a DataFrame, e.g. crimeDF = crimes.map(lambda p: Row(IncidntNum=p[0], Category=p[1]))
# approach 3: use SQL
# (a sketch of approaches 2 and 3 follows question 2 below)

# 1st question:
# Write a Spark program that counts the number of crimes for each category.
print(crimes.count())
category = crimes.map(lambda x: (x[1], 1))            # (Category, 1) pairs
category.countByKey().items()
rddformdata = sc.parallelize(category.countByKey().items())
categorysorted = rddformdata.sortBy(lambda a: a[1])   # sort by count, ascending
categorysorted.collect()

8977
Out[163]:
[('SEX OFFENSES, NON FORCIBLE', 1),
 ('PORNOGRAPHY/OBSCENE MAT', 1),
 ('TREA', 1),
 ('FAMILY OFFENSES', 1),
 ('EXTORTION', 2),
 ('SUICIDE', 3),
 ('LOITERING', 4),
 ('LIQUOR LAWS', 4),
 ('EMBEZZLEMENT', 5),
 ('BRIBERY', 7),
 ('KIDNAPPING', 10),
 ('DRIVING UNDER THE INFLUENCE', 16),
 ('DISORDERLY CONDUCT', 18),
 ('DRUNKENNESS', 19),
 ('PROSTITUTION', 20),
 ('FORGERY/COUNTERFEITING', 29),
 ('ARSON', 29),
 ('RUNAWAY', 39),
 ('SEX OFFENSES, FORCIBLE', 40),
 ('RECOVERED VEHICLE', 48),
 ('STOLEN PROPERTY', 66),
 ('WEAPON LAWS', 106),
 ('TRESPASS', 109),
 ('SECONDARY CODES', 118),
 ('FRAUD', 155),
 ('DRUG/NARCOTIC', 167),
 ('ROBBERY', 187),
 ('MISSING PERSON', 265),
 ('BURGLARY', 302),
 ('WARRANTS', 312),
 ('SUSPICIOUS OCC', 312),
 ('VEHICLE THEFT', 353),
 ('VANDALISM', 650),
 ('ASSAULT', 780),
 ('NON-CRIMINAL', 991),
 ('OTHER OFFENSES', 1002),
 ('LARCENY/THEFT', 2805)]

#### 2nd question
##### Write a program that counts the number of crimes for each district.
district = crimes.map(lambda x: (x[6], 1))            # (PdDistrict, 1) pairs
district.countByKey().items()
rddformdata2 = sc.parallelize(district.countByKey().items())
districtsorted = rddformdata2.sortBy(lambda a: a[1])
districtsorted.collect()
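
# The comments above mention a DataFrame (approach 2) and SQL (approach 3) alternative.
# The sketch below is one possible way to get the same category / district counts with
# those APIs; it assumes a SparkSession is available as `spark` (as on Databricks with
# Spark 2.x) and reuses the `crimes` RDD parsed above.
from pyspark.sql import Row

crimeDF = crimes.map(lambda p: Row(IncidntNum=p[0], Category=p[1], PdDistrict=p[6])).toDF()

# approach 2: DataFrame API
crimeDF.groupBy('Category').count().orderBy('count').show()
crimeDF.groupBy('PdDistrict').count().orderBy('count').show()

# approach 3: SQL over a temporary view
crimeDF.createOrReplaceTempView('crimes_view')
spark.sql("SELECT Category, COUNT(*) AS n FROM crimes_view GROUP BY Category ORDER BY n").show()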
#### 3rd question
##### Write a program to count the number of crimes each Sunday at SF downtown.
###### hints: define your spatial function for filtering data
crimesonsunday = crimes.filter(lambda x: x[3] == 'Sunday')     # DayOfWeek is column 3
# X (column 9) and Y (column 10) are strings in the parsed rows, so cast them to float;
# the sum X + Y grows towards the north-east, so thresholding it gives a crude downtown
# cutoff (the value used here is an approximation, not an official boundary)
crimesonsundayflt = crimesonsunday.filter(lambda x: float(x[9]) + float(x[10]) >= -84.65)
dis = crimesonsundayflt.map(lambda x: (x[4], 1))               # (Date, 1) pairs
sorted(dis.countByKey().items())
Out[165]: [('07/16/2017', 405), ('07/23/2017', 442), ('07/30/2017', 391)]
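
# The hint above asks for a dedicated spatial function. The helper below is a minimal
# sketch of that idea using an assumed bounding box for downtown San Francisco; the
# corner coordinates are rough placeholders, not values taken from the assignment.
DOWNTOWN_BOX = {'x_min': -122.425, 'x_max': -122.395, 'y_min': 37.780, 'y_max': 37.800}

def in_downtown(row):
    # X (longitude) is column 9 and Y (latitude) is column 10; both arrive as strings
    try:
        lon, lat = float(row[9]), float(row[10])
    except ValueError:
        return False
    return (DOWNTOWN_BOX['x_min'] <= lon <= DOWNTOWN_BOX['x_max'] and
            DOWNTOWN_BOX['y_min'] <= lat <= DOWNTOWN_BOX['y_max'])

# the same Sunday count as above, but using the named spatial filter
sundaydowntown = crimes.filter(lambda x: x[3] == 'Sunday').filter(in_downtown)
sorted(sundaydowntown.map(lambda x: (x[4], 1)).countByKey().items())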
##### Extra: visualize the spatial distribution of crimes and run a k-means clustering algorithm
import numpy as np
import matplotlib.pyplot as plt

# X / Y coordinates come in as strings, so cast them to floats for plotting
x = crimes.map(lambda r: float(r[9])).collect()
y = crimes.map(lambda r: float(r[10])).collect()

from pandas import DataFrame
from ggplot import ggplot, aes, geom_point
pydf = DataFrame({'x': x, 'y': y})
p = ggplot(pydf, aes('x', 'y')) + \
    geom_point(color='blue')
display(p)

from numpy import array
from math import sqrt

from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
datasplit1 = crimes.map(lambda r: (r[9], r[10]))
#print(datasplit1.first()[0])
datasplit = datasplit1.map(lambda line: array([float(line[0]), float(line[1])]))
#print(type(datasplit))

# Build the model (cluster the data into 4 groups)
clusters = KMeans.train(datasplit, 4, maxIterations=10, initializationMode="random")
clusters.clusterCenters
print(clusters.clusterCenters)
#print(type(clusters.clusterCenters))
#print(type(clusters.clusterCenters[0]))

# Evaluate the clustering by computing the Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([v**2 for v in (point - center)]))

WSSSE = datasplit.map(lambda point: error(point)).reduce(lambda a, b: a + b)
#center1 = clusters.centers[clusters.predict(array([-122.44383721, 37.77953436]))]
#print(center1)
print("Within Set Sum of Squared Error = " + str(WSSSE))
# Save and load model
#clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
#sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")

[array([-122.41187284, 37.79039618]), array([-122.41816633, 37.76877959]), array([-122.47403813, 37.75457395]), array([-122.40955324, 37.73028157])]
Within Set Sum of Squared Error = 148.31068216

mycenters = clusters.clusterCenters            # list of cluster centres (numpy arrays)
rddcenter = sc.parallelize(mycenters)
centerx = rddcenter.map(lambda c: c[0]).collect()
centery = rddcenter.map(lambda c: c[1]).collect()
print(centery)

# one scatter "layer" per cluster: keep only the points assigned to cluster `num`
class Scatter:
    def __init__(self, num):
        self.num = num
        self.scatter = datasplit.filter(lambda p: clusters.predict(p) == num)
        self.xx = self.scatter.map(lambda p: p[0]).collect()
        self.yy = self.scatter.map(lambda p: p[1]).collect()
    def graph(self):
        return DataFrame({'x': self.xx, 'y': self.yy})

scatter1 = Scatter(0)
scatter2 = Scatter(1)
scatter3 = Scatter(2)
scatter4 = Scatter(3)

graphcenter = DataFrame({'x': centerx, 'y': centery})

# plot the four clusters in different colours and mark the cluster centres in red
pp3 = ggplot(scatter1.graph(), aes('x', 'y')) + \
    geom_point(scatter1.graph(), color='blue') + \
    geom_point(scatter2.graph(), color='gray') + \
    geom_point(scatter3.graph(), color='yellow') + \
    geom_point(scatter4.graph(), color='green') + \
    geom_point(graphcenter, shape='*', color='red', size=300)
display(pp3)

#print(scatter1.graph())
--------------------------------------------------------------------------------