├── README.md
└── SF.py

/README.md:
--------------------------------------------------------------------------------
# SF-Crime-analysis
San Francisco crime data analysis on the Databricks platform, using MapReduce to implement distributed data processing.

--------------------------------------------------------------------------------
/SF.py:
--------------------------------------------------------------------------------
# import the CSV reader
from csv import reader

# prepare data: crime_data_lines is an RDD of raw CSV lines
# (e.g. loaded earlier with sc.textFile on the SF crime CSV export)
df_crimes = crime_data_lines.map(lambda line: [x.strip('"') for x in next(reader([line]))])

# get the header row
header = df_crimes.first()

header
Out[159]:
['IncidntNum',
 'Category',
 'Descript',
 'DayOfWeek',
 'Date',
 'Time',
 'PdDistrict',
 'Resolution',
 'Address',
 'X',
 'Y',
 'Location',
 'PdId']

# remove the header line from the data
crimes = df_crimes.filter(lambda x: x != header)

# approach 1: use the RDD API
# approach 2: use a DataFrame, registering the RDD as a DataFrame, e.g. crimeDF = crimes.map(lambda p: Row(IncidntNum=p[0], Category=p[1]))
# approach 3: use SQL
# (a sketch of approaches 2 and 3 follows question 2 below)

# 1st question:
# Write a Spark program that counts the number of crimes for each category.
print(crimes.count())
category = crimes.map(lambda x: (x[1], 1))            # (Category, 1) pairs
category.countByKey().items()
rddformdata = sc.parallelize(category.countByKey().items())
categorysorted = rddformdata.sortBy(lambda a: a[1])   # sort by count, ascending
categorysorted.collect()

8977
Out[163]:
[('SEX OFFENSES, NON FORCIBLE', 1),
 ('PORNOGRAPHY/OBSCENE MAT', 1),
 ('TREA', 1),
 ('FAMILY OFFENSES', 1),
 ('EXTORTION', 2),
 ('SUICIDE', 3),
 ('LOITERING', 4),
 ('LIQUOR LAWS', 4),
 ('EMBEZZLEMENT', 5),
 ('BRIBERY', 7),
 ('KIDNAPPING', 10),
 ('DRIVING UNDER THE INFLUENCE', 16),
 ('DISORDERLY CONDUCT', 18),
 ('DRUNKENNESS', 19),
 ('PROSTITUTION', 20),
 ('FORGERY/COUNTERFEITING', 29),
 ('ARSON', 29),
 ('RUNAWAY', 39),
 ('SEX OFFENSES, FORCIBLE', 40),
 ('RECOVERED VEHICLE', 48),
 ('STOLEN PROPERTY', 66),
 ('WEAPON LAWS', 106),
 ('TRESPASS', 109),
 ('SECONDARY CODES', 118),
 ('FRAUD', 155),
 ('DRUG/NARCOTIC', 167),
 ('ROBBERY', 187),
 ('MISSING PERSON', 265),
 ('BURGLARY', 302),
 ('WARRANTS', 312),
 ('SUSPICIOUS OCC', 312),
 ('VEHICLE THEFT', 353),
 ('VANDALISM', 650),
 ('ASSAULT', 780),
 ('NON-CRIMINAL', 991),
 ('OTHER OFFENSES', 1002),
 ('LARCENY/THEFT', 2805)]

#### 2nd question
##### Write a program that counts the number of crimes for each district.
district = crimes.map(lambda x: (x[6], 1))            # (PdDistrict, 1) pairs
district.countByKey().items()
rddformdata2 = sc.parallelize(district.countByKey().items())
districtsorted = rddformdata2.sortBy(lambda a: a[1])
districtsorted.collect()
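
# The comments above mention a DataFrame (approach 2) and SQL (approach 3) alternative.
# The sketch below is one possible way to get the same category / district counts with
# those APIs; it assumes a SparkSession is available as `spark` (as on Databricks with
# Spark 2.x) and reuses the `crimes` RDD parsed above.
from pyspark.sql import Row

crimeDF = crimes.map(lambda p: Row(IncidntNum=p[0], Category=p[1], PdDistrict=p[6])).toDF()

# approach 2: DataFrame API
crimeDF.groupBy('Category').count().orderBy('count').show()
crimeDF.groupBy('PdDistrict').count().orderBy('count').show()

# approach 3: SQL over a temporary view
crimeDF.createOrReplaceTempView('crimes_view')
spark.sql("SELECT Category, COUNT(*) AS n FROM crimes_view GROUP BY Category ORDER BY n").show()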
#### 3rd question
##### Write a program to count the number of crimes each Sunday at SF downtown.
###### hints: define your spatial function for filtering data
crimesonsunday = crimes.filter(lambda x: x[3] == 'Sunday')     # DayOfWeek is column 3
# X (column 9) and Y (column 10) are strings in the parsed rows, so cast them to float;
# the sum X + Y grows towards the north-east, so thresholding it gives a crude downtown
# cutoff (the value used here is an approximation, not an official boundary)
crimesonsundayflt = crimesonsunday.filter(lambda x: float(x[9]) + float(x[10]) >= -84.65)
dis = crimesonsundayflt.map(lambda x: (x[4], 1))               # (Date, 1) pairs
sorted(dis.countByKey().items())
Out[165]: [('07/16/2017', 405), ('07/23/2017', 442), ('07/30/2017', 391)]
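
# The hint above asks for a dedicated spatial function. The helper below is a minimal
# sketch of that idea using an assumed bounding box for downtown San Francisco; the
# corner coordinates are rough placeholders, not values taken from the assignment.
DOWNTOWN_BOX = {'x_min': -122.425, 'x_max': -122.395, 'y_min': 37.780, 'y_max': 37.800}

def in_downtown(row):
    # X (longitude) is column 9 and Y (latitude) is column 10; both arrive as strings
    try:
        lon, lat = float(row[9]), float(row[10])
    except ValueError:
        return False
    return (DOWNTOWN_BOX['x_min'] <= lon <= DOWNTOWN_BOX['x_max'] and
            DOWNTOWN_BOX['y_min'] <= lat <= DOWNTOWN_BOX['y_max'])

# the same Sunday count as above, but using the named spatial filter
sundaydowntown = crimes.filter(lambda x: x[3] == 'Sunday').filter(in_downtown)
sorted(sundaydowntown.map(lambda x: (x[4], 1)).countByKey().items())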
##### Extra: visualize the spatial distribution of crimes and run a k-means clustering algorithm
import numpy as np
import matplotlib.pyplot as plt

# X / Y coordinates come in as strings, so cast them to floats for plotting
x = crimes.map(lambda r: float(r[9])).collect()
y = crimes.map(lambda r: float(r[10])).collect()

from pandas import DataFrame
from ggplot import ggplot, aes, geom_point
pydf = DataFrame({'x': x, 'y': y})
p = ggplot(pydf, aes('x', 'y')) + \
    geom_point(color='blue')
display(p)

from numpy import array
from math import sqrt

from pyspark.mllib.clustering import KMeans, KMeansModel

# Load and parse the data
datasplit1 = crimes.map(lambda r: (r[9], r[10]))
#print(datasplit1.first()[0])
datasplit = datasplit1.map(lambda line: array([float(line[0]), float(line[1])]))
#print(type(datasplit))

# Build the model (cluster the data into 4 groups)
clusters = KMeans.train(datasplit, 4, maxIterations=10, initializationMode="random")
clusters.clusterCenters
print(clusters.clusterCenters)
#print(type(clusters.clusterCenters))
#print(type(clusters.clusterCenters[0]))

# Evaluate the clustering by computing the Within Set Sum of Squared Errors
def error(point):
    center = clusters.centers[clusters.predict(point)]
    return sqrt(sum([v**2 for v in (point - center)]))

WSSSE = datasplit.map(lambda point: error(point)).reduce(lambda a, b: a + b)
#center1 = clusters.centers[clusters.predict(array([-122.44383721, 37.77953436]))]
#print(center1)
print("Within Set Sum of Squared Error = " + str(WSSSE))
# Save and load model
#clusters.save(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")
#sameModel = KMeansModel.load(sc, "target/org/apache/spark/PythonKMeansExample/KMeansModel")

[array([-122.41187284, 37.79039618]), array([-122.41816633, 37.76877959]), array([-122.47403813, 37.75457395]), array([-122.40955324, 37.73028157])]
Within Set Sum of Squared Error = 148.31068216

mycenters = clusters.clusterCenters            # list of cluster centres (numpy arrays)
rddcenter = sc.parallelize(mycenters)
centerx = rddcenter.map(lambda c: c[0]).collect()
centery = rddcenter.map(lambda c: c[1]).collect()
print(centery)

# one scatter "layer" per cluster: keep only the points assigned to cluster `num`
class Scatter:
    def __init__(self, num):
        self.num = num
        self.scatter = datasplit.filter(lambda p: clusters.predict(p) == num)
        self.xx = self.scatter.map(lambda p: p[0]).collect()
        self.yy = self.scatter.map(lambda p: p[1]).collect()
    def graph(self):
        return DataFrame({'x': self.xx, 'y': self.yy})

scatter1 = Scatter(0)
scatter2 = Scatter(1)
scatter3 = Scatter(2)
scatter4 = Scatter(3)

graphcenter = DataFrame({'x': centerx, 'y': centery})

# plot the four clusters in different colours and mark the cluster centres in red
pp3 = ggplot(scatter1.graph(), aes('x', 'y')) + \
    geom_point(scatter1.graph(), color='blue') + \
    geom_point(scatter2.graph(), color='gray') + \
    geom_point(scatter3.graph(), color='yellow') + \
    geom_point(scatter4.graph(), color='green') + \
    geom_point(graphcenter, shape='*', color='red', size=300)
display(pp3)

#print(scatter1.graph())
--------------------------------------------------------------------------------