├── README.md
├── analysis.py
├── web
├── webapp.py
└── templates
│ └── hotregion.template.html
└── spy.py
/README.md:
--------------------------------------------------------------------------------
1 | # housePriceSpider
2 | 基于链家房产网的房价数据爬取及分析
3 |
--------------------------------------------------------------------------------
/analysis.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jan 20 13:29:47 2017
4 |
5 | @author: L
6 | """
7 | import random
8 | import numpy as np
9 | import cPickle
10 | from collections import defaultdict
11 | import sys, re
12 | import pandas as pd
13 | import numpy as np
14 | import matplotlib.pyplot as plt
15 |
# Load the scraped listing file; it has no header row, so the column names
# are assigned explicitly below.
city = "chengdu2"
fileName = "%s.txt" % (city)
df = pd.read_csv(fileName, header=None, sep=',')
state = ["city", "district", "name", "lng", "lat", "privice"]

df.columns = state

# Per-district statistics of the "privice" column (a SeriesGroupBy object).
df_stat = df['privice'].groupby(df['district'])
describe = df_stat.describe().to_dict()
means = df_stat.mean().to_dict()
# The loop below looks up counts[district] and std[district]; they were
# never defined before, which made the script fail with a NameError.
counts = df_stat.count().to_dict()
std = df_stat.std().to_dict()

describeAll = df['privice'].describe().to_dict()
describeAllNew = {}
for key in describeAll:
    describeAllNew[('全市', key)] = describeAll[key]

# Add the aggregate over the whole frame under the key '全市' so it is
# plotted next to the districts; every lookup table needs the same key.
means['全市'] = df['privice'].mean()
counts['全市'] = df['privice'].count()
std['全市'] = df['privice'].std()

# Districts ordered by mean value, highest first.
sorted_means = sorted(means.items(), key=lambda d: d[1], reverse=True)

# The plot arrays run in the opposite (ascending) order.
districtArray = []
stdArray = []
priviceArray = []
countArray = []
for district, privice in reversed(sorted_means):
    districtArray.append(district)
    priviceArray.append(privice)
    countArray.append(counts[district])
    stdArray.append(std[district])

N = len(priviceArray)
ind = np.arange(N)  # the x locations for the groups
width = 0.35  # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, priviceArray, width, color='darkorange')

# Second y axis on the right: number of rows per district as a line.
right_data = countArray
axf = ax.twinx()
print(len(ax.get_xticks()))
print(len(right_data))
rects2 = axf.plot(ind, right_data, color='forestgreen')
axf.set_ylabel('房屋数量/套')
axf.set_ylim(0, 4000)

ax.set_xlabel('地区')
ax.set_ylabel('均价/元')
ax.set_title('成都市各区2017年1月二手房房屋均价--房屋数量')
ax.set_xticks(ind + width)
ax.set_xticklabels(districtArray)
def autolabel(rects):
    """Write each bar's integer height as text just above the bar.

    Draws on the module-level ``ax``; *rects* is an iterable of objects
    providing ``get_height()``, ``get_x()`` and ``get_width()``.
    """
    for bar in rects:
        bar_height = bar.get_height()
        center_x = bar.get_x() + bar.get_width() / 2.
        # 5% above the bar top so the text does not touch the bar.
        ax.text(center_x, 1.05 * bar_height, '%d' % int(bar_height),
                ha='center', va='bottom')
def autolabel_line(rects):
    """Write the integer y value next to every point of each given line.

    Draws on the module-level ``axf``; *rects* is an iterable of objects
    providing ``get_xdata()`` and ``get_ydata()``.
    """
    for line in rects:
        print(line.get_ydata())
        xs = line.get_xdata()
        ys = line.get_ydata()
        for px, py in zip(xs, ys):
            # 1% above the point so the text does not sit on the line.
            axf.text(px, 1.01 * py, '%d' % int(py), color='forestgreen',
                     ha='center', va='bottom')
84 |
# Label the bars and the count line, then display the first figure.
autolabel(rects1)
autolabel_line(rects2)
plt.style.use('ggplot')
plt.show()


# Box plot of the "privice" values: one box per district plus one box for
# the whole frame ('全市'), in the same order as sorted_means.
boxData = []
boxLables = []
for item in sorted_means:
    district1 = item[0]
    if district1 != '全市':
        print(district1)
        # .loc replaces the deprecated (and since removed) .ix indexer.
        df_tmp = df.loc[df.district == district1]
        data = df_tmp['privice'].values
    else:
        data = df['privice'].values
    boxData.append(data)
    boxLables.append(district1)

fig, ax = plt.subplots()
ax.boxplot((boxData), labels=(boxLables))
ax.set_xlabel('地区')
ax.set_ylabel('价格(/元)')
ax.set_title('成都市各区2017年1月二手房房屋价格统计')
plt.style.use('ggplot')
# An Axes object has no show() method; the figure is displayed via pyplot.
plt.show()
113 |
114 |
115 |
116 |
117 |
118 |
--------------------------------------------------------------------------------
/web/webapp.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | @author: zbc
4 | """
5 | import os
6 | import tornado.ioloop
7 | import tornado.httpserver
8 | import tornado.web
9 | import json
10 | import math
11 | import sys
12 | reload(sys)
13 | sys.setdefaultencoding( "utf-8" )
14 |
'''
Build the hot-region (heat map) data.
'''
18 | def _mkF( sf ):
19 | vf = round( float(sf), 9 )
20 | return vf
21 |
22 | def _loadLocations(filename):
23 | if filename== None:return
24 |
25 | filen = filename
26 | hd = file( filen, "r" )
27 | if not hd: return [ [0,[] ], [] ]
28 | pts = []
29 | for line in hd:
30 | ay = line.strip().split(",")
31 | if len( ay ) < 6: continue
32 | pts.append( [ ay[4], ay[3], ay[5] ] )
33 | hd.close()
34 | hotRegion = _loadPts( pts )
35 | return hotRegion
36 |
37 | def _loadPts( fpts ):
38 | d = {}
39 | #print len(fpts)
40 | for ay in fpts:
41 | if len( ay ) != 3: continue
42 | try:
43 | latf = _mkF( ay[0] )
44 | lotf = _mkF( ay[1] )
45 | r = [ latf, lotf ]
46 | k = "%f %f"%( r[0], r[1] )
47 | d[k] = [ r, int(ay[2]) ]
48 | except:pass
49 |
50 | #print len(d)
51 | if len( d ) == 0: return [ 0, [] ]
52 |
53 | dd = []
54 | for k in d: dd.append( d[k] )
55 |
56 | len1 = len(dd)
57 | dd = sorted( dd, key=lambda x:x[1], reverse = True )
58 | sz = min( 50000, len1 )
59 | dd = dd[0: sz ]
60 |
61 | ret = []
62 | maxv = 0
63 | minv = 10000
64 | array = []
65 | i = 0
66 | for k in dd:
67 | r = k[0]
68 | c = k[1]
69 | c = int( round( math.log( c, 4 ) + 4, 0 ) )
70 | array.append({"lng": r[1],"lat":r[0], "count": c*10})
71 | i = i + 1
72 |
73 | return array
74 | """
75 | def _loadUserPts( fpts ):
76 | d = {}
77 | #print len(fpts)
78 | for ay in fpts:
79 | if len( ay ) != 2: continue
80 | try:
81 | latf = _mkF( ay[0] )
82 | lotf = _mkF( ay[1] )
83 |
84 | r = [ latf, lotf ]
85 | k = "%f %f"%( r[0], r[1] )
86 | #print k
87 | if k not in d:
88 | d[k] = [ r, 1 ]
89 | else:
90 | d[k][1] = d[k][1] + 1
91 | except:pass
92 |
93 | #print len(d)
94 | if len( d ) == 0: return [ 0, [] ]
95 |
96 | dd = []
97 | for k in d: dd.append( d[k] )
98 |
99 | len1 = len(dd)
100 | dd = sorted( dd, key=lambda x:x[1], reverse = True )
101 | sz = min( 50000, len1 )
102 | dd = dd[0: sz ]
103 |
104 | ret = []
105 | maxv = 0
106 | minv = 10000
107 | array = []
108 | i = 0
109 | for k in dd:
110 | r = k[0]
111 | c = k[1]
112 | c = int( round( math.log( c, 4 ) + 4, 0 ) )
113 | array.append({"lng": r[1],"lat":r[0], "count": c*10})
114 | i = i + 1
115 |
116 | return array
117 | """
# Per-city map centre, stored as a two-item list.
# NOTE(review): the order looks like [lat, lng] - confirm against the template.
map_center = {
    "chengdu": [30.67, 104.064],
    "beijing": [39.9772370000, 116.3959960000],
    "shanghai": [31.236305, 121.480237],
}
# Per-city integer map level.
map_level = {
    "chengdu": 13,
    "beijing": 12,
    "shanghai": 12,
}
120 |
class orderClusterHandler(tornado.web.RequestHandler):
    """Serve the heat-map page rendered from ``hotregion.template.html``."""

    def get(self):
        """Load the Chengdu listing file and render the template with it."""
        # Use the shared table instead of repeating the coordinates here.
        center_lat, center_lon = map_center["chengdu"]
        # Resolve the data file against this module, as make_app() does for
        # its paths, so the handler does not depend on the working directory.
        filename = os.path.join(os.path.dirname(__file__), "..", "chengdu.txt")
        hot_regin = _loadLocations(filename)
        self.render("hotregion.template.html",
                    center_lon=center_lon, center_lat=center_lat,
                    hotregin=hot_regin)
132 |
def make_app():
    """Create the Tornado application that serves the handler at "/"."""
    base_dir = os.path.dirname(__file__)
    routes = [(r"/", orderClusterHandler)]
    return tornado.web.Application(
        handlers=routes,
        template_path=os.path.join(base_dir, "templates"),
        static_path=os.path.join(base_dir, "src"),
        # NOTE(review): this secret is committed in the source file.
        cookie_secret="bZJc2sWbQLKos6GkHn/VB9oXwQt8S0R0kRvJ5/xJ89E=",
        xsrf_cookies=False,
    )
143 |
if __name__ == "__main__":
    # Script entry point: listen on port 8110 and run the IO loop.
    application = make_app()
    application.listen(8110)
    tornado.ioloop.IOLoop.current().start()
148 |
--------------------------------------------------------------------------------
/web/templates/hotregion.template.html:
--------------------------------------------------------------------------------
1 |
2 | {% autoescape None %}
3 |
4 | 房价热力图
5 |
6 |
7 |
8 |
9 |
10 |
11 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
127 |
128 |
129 |
--------------------------------------------------------------------------------
/spy.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import json
3 | import re
4 | import io
5 | import datetime
6 | import random
7 | import time
8 | import urllib
9 | import urllib2
10 | import pymongo
11 | import datetime
12 | import time
13 | import sys
14 | import demjson
15 | from bs4 import BeautifulSoup
16 | from math import *
17 | reload(sys)
18 | sys.setdefaultencoding( "utf-8" )
19 |
def calcLatLngDist(Lat_A, Lng_A, Lat_B, Lng_B, Debug = False ):
    """Return the distance in kilometres between points A and B.

    Args:
        Lat_A: latitude of point A in degrees (anything ``float()`` accepts).
        Lng_A: longitude of point A in degrees.
        Lat_B: latitude of point B in degrees.
        Lng_B: longitude of point B in degrees.
        Debug: when true, print why 0 was returned.

    Returns:
        The distance in km, or 0 when both points are identical or the
        computation raises (for example a division by zero).
    """
    lat_a = float(Lat_A)
    lng_a = float(Lng_A)
    lat_b = float(Lat_B)
    lng_b = float(Lng_B)
    if lat_a == lat_b and lng_a == lng_b:
        if Debug:
            print("_calcLatLngDist return as equal")
        return 0

    equator_r = 6378.140  # equatorial radius (km)
    polar_r = 6356.755  # polar radius (km)
    flattening = (equator_r - polar_r) / equator_r
    # Latitudes scaled by the axis ratio before the spherical formula.
    red_a = atan(polar_r / equator_r * tan(radians(lat_a)))
    red_b = atan(polar_r / equator_r * tan(radians(lat_b)))
    delta_lng = radians(lng_a) - radians(lng_b)
    try:
        sin_a = sin(red_a)
        sin_b = sin(red_b)
        arc = acos(sin_a * sin_b + cos(red_a) * cos(red_b) * cos(delta_lng))
        # Flattening correction applied to the spherical arc.
        c1 = (sin(arc) - arc) * (sin_a + sin_b) ** 2 / cos(arc / 2) ** 2
        c2 = (sin(arc) + arc) * (sin_a - sin_b) ** 2 / sin(arc / 2) ** 2
        correction = flattening / 8 * (c1 - c2)
        return equator_r * (arc + correction)
    except Exception as e:
        if Debug:
            print("_calcLatLngDist return as exception:")
            print("%s %s %s %s" % (lat_a, lng_a, lat_b, lng_b))
            print(e)
        return 0
58 | calcLatLngDist(104.131182,30.655364, 104.06, 30.67)
59 |
def getGdLocation(address):
    """Geocode *address* through the restapi.amap.com geocode endpoint.

    The request is restricted to city 成都 and uses one randomly chosen key
    from the built-in list.

    Args:
        address: text to geocode (byte string or unicode).

    Returns:
        ``(location, city, district)`` taken from the first geocode result,
        or ``(None, "", "")`` when the service reports no usable result or
        the request fails.
    """
    # NOTE(review): API keys are committed in the source file.
    appKeys = ['7ba215f6db353d2aa66c55fa4dfe99ad','b80ccd147375ed37bfbdd9025ed9e384','5a9a676d40a02bbcf3cbe2bc581f0e26','7c046c20b0e2c70020d39b4988116127','74182609e8238c9822282985734ed495','595db8b17bed1ad220b79ff08e3fedaa','701dc8acd6d7aec627c678623f40f861','af2dd94fee5153f44d4e34160a4fdbe7','f51c07f0b4cecf5a773a63c87099645c','9f566db560bd026c506e1efaa75c3952','44379632a8984fb7d5d88f7d9bf6e5b2']
    try:
        if isinstance(address, unicode):
            # urlencode() works on byte strings; encode explicitly instead
            # of relying on the process default encoding.
            address = address.encode('utf-8')
        # Percent-encode the parameters so spaces, '&', '#' or non-ASCII
        # characters in the address cannot corrupt the query string.
        query = urllib.urlencode([
            ('key', random.choice(appKeys)),
            ('address', address),
            ('city', '成都'),
        ])
        gdUrl = 'http://restapi.amap.com/v3/geocode/geo?' + query
        data = urllib2.urlopen(urllib2.Request(gdUrl))
        res = json.loads(data.read())
        if res and res['info'] == 'OK' and 'geocodes' in res and len(res['geocodes']) > 0 and 'location' in res['geocodes'][0]:
            first = res['geocodes'][0]
            return first['location'], first['city'], first['district']
        print(res)
        return None, "", ""
    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it. A
        # bare "except" would also swallow KeyboardInterrupt.
        print("getGdLocation failed: %s" % (e,))
        return None, "", ""
82 |
# Keys for the api.map.baidu.com geocoder; one is picked at random per request.
# NOTE(review): API keys are committed in the source file.
appKeys = [ "4032f6db1085b0c63683ef3917e40428","IkSvwkWPwCuICyAjnS0QGBzw","6WDQYk8GK6CbusVvepkSQKST","CvbddAko7nt1layAy2IPYuZe", "0ufxKGZM4j0dyzwK7FF6fS5L", "Ni07CGCmkAiRCtCTcq1rql4B","tnPseOMpG3G02Rk5pWN2NXBt","Weefu0Q7Lj6BTdVLkHYEonQo" ]


def getBdLocation(address):
    """Geocode *address* through the api.map.baidu.com geocoder.

    Args:
        address: text to geocode (byte string or unicode).

    Returns:
        The position as the string ``"<lng>,<lat>"``, or None when the
        service reports no usable result or the request fails.
    """
    try:
        if isinstance(address, unicode):
            # urlencode() works on byte strings; encode explicitly.
            address = address.encode('utf-8')
        # The query is built from the *address* argument. It used to read
        # the unrelated module-level variable ``name``.
        query = urllib.urlencode([
            ('address', address),
            ('output', 'json'),
            ('key', random.choice(appKeys)),
        ])
        urlAddress = "http://api.map.baidu.com/geocoder?" + query
        data = urllib2.urlopen(urllib2.Request(urlAddress))
        res = json.loads(data.read())
        if res and res['status'] == 'OK' and ('result' in res and 'location' in res['result'] and 'lng' in res['result']['location']):
            lng = res['result']['location']['lng']
            lat = res['result']['location']['lat']
            # Plain "lng,lat" so the value fills the numeric lng and lat
            # columns of the output line; the old "lng:..,lat:.." text
            # could not be parsed as numbers by the scripts that read it.
            return '%s,%s' % (lng, lat)
        print(res)
        return None
    except Exception as e:
        # Best-effort lookup: report the failure instead of hiding it.
        print("getBdLocation failed: %s" % (e,))
        return None
cityList = ["成都"]

# Fetch pages 1..32 of the loupan listing and append one line
# "city,district,name,location,price" per listing to ./chengdu.txt.
index = [i + 1 for i in range(32)]
# "with" closes the output file even if the loop is interrupted.
with open("./chengdu.txt", "a+") as fw:
    for pa in index:
        if pa == 1:
            url = "http://cd.fang.lianjia.com/loupan/"
        else:
            url = "http://cd.fang.lianjia.com/loupan/pg%d/" % (pa)
        print("request:" + url)
        try:
            req = urllib2.Request(url)
            req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36")
            req.add_header("Accept", "*/*")
            req.add_header("Accept-Language", "zh-CN,zh;q=0.8")
            res = urllib2.urlopen(req).read()
            soup = BeautifulSoup(res)
            resp = soup.findAll('div', attrs={'class': 'info-panel'})
        except Exception as e:
            # A failed page is reported and skipped, not silently dropped.
            print("page failed: %s (%s)" % (url, e))
            continue

        for panel in resp:
            # One malformed listing must not discard the rest of the page.
            try:
                name = panel.findAll('a', attrs={'target': '_blank'})[0].text

                privice = panel.findAll('span', attrs={'class': 'num'})
                privice = privice[0].text

                region = panel.findAll('span', attrs={'class': 'region'})
                address = region[0].text.split('(')[0]
                # Resolve coordinates: by name first, then by address.
                location, city, district = getGdLocation(name)
                if not location:
                    location = getBdLocation(address)
                if not location:
                    continue
                formatStr = "%s,%s,%s,%s,%s\n" % (city, district, name, location, privice)
                print(formatStr)
                fw.write(formatStr)
            except Exception as e:
                print("listing skipped on %s: %s" % (url, e))
152 |
153 |
# Second-hand listings: fetch pages 1..100 and append one line
# "city,district,name,location,unitPrice" per listing to ./chengdu2.txt.
index = [i + 1 for i in range(100)]
# Reference point for picking between two geocoding results; the same
# numbers the script already used, named by their role.
CENTER_LAT = 30.67
CENTER_LNG = 104.06
# "with" closes the output file even if the loop is interrupted.
with open("./chengdu2.txt", "a+") as fw:
    for pa in index:
        if pa == 1:
            url = "http://cd.lianjia.com/ershoufang/"
        else:
            url = "http://cd.lianjia.com/ershoufang/pg%d/" % (pa)
        print("request:" + url)
        try:
            req = urllib2.Request(url)
            req.add_header("User-Agent", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36")
            req.add_header("Accept", "*/*")
            req.add_header("Accept-Language", "zh-CN,zh;q=0.8")
            res = urllib2.urlopen(req).read()

            soup = BeautifulSoup(res)
            resp = soup.findAll('div', attrs={'class': 'content'})
            resp = resp[0].findAll('ul', attrs={'class': 'sellListContent'})
            resp = resp[0].findAll('li', attrs={'class': 'clear'})
        except Exception as e:
            # A failed page is reported and skipped, not silently dropped.
            print("page failed: %s (%s)" % (url, e))
            continue

        for entry in resp:
            # One malformed listing must not discard the rest of the page.
            try:
                address = entry.findAll('div', attrs={'class': 'address'})
                address = address[0].findAll('a', attrs={'target': '_blank'})[0].text
                address = address.replace(" ", "")

                addressAddInfo = entry.findAll('div', attrs={'class': 'positionInfo'})[0].findAll('a', attrs={'target': '_blank'})[0].text
                name = address
                address = address + "_" + addressAddInfo

                print(address)
                unitPrice = entry.findAll('div', attrs={'class': 'unitPrice'})[0].text
                unitPrice = unitPrice.replace("单价", "").replace("元/平米", "")

                # The name and the area text entered by the seller may be
                # wrong, so both are geocoded and cross-checked.
                location1, city1, district1 = getGdLocation(name)
                location2, city2, district2 = getGdLocation(address)
                location = None
                city, district = city2, district2
                if location1 and location2:
                    # The location strings are split as "lng,lat" while
                    # calcLatLngDist takes latitude first, so swap the
                    # parts when passing them on.
                    parts1 = location1.split(',')
                    parts2 = location2.split(',')
                    lng1, lat1 = parts1[0], parts1[1]
                    lng2, lat2 = parts2[0], parts2[1]
                    distince = calcLatLngDist(lat1, lng1, lat2, lng2)
                    print(distince)
                    if distince > 6.0:
                        # The two lookups disagree by more than 6 km.
                        continue
                    print("distince验证通过:%s" % (distince))
                    distince1 = calcLatLngDist(lat1, lng1, CENTER_LAT, CENTER_LNG)
                    distince2 = calcLatLngDist(lat2, lng2, CENTER_LAT, CENTER_LNG)
                    if distince1 < distince2:
                        # Keep city/district from the same lookup as the
                        # location that is written out.
                        location, city, district = location1, city1, district1
                    else:
                        location = location2
                if not location:
                    location = getBdLocation(address)
                if not location:
                    print("地址解析失败")
                    continue
                formatStr = "%s,%s,%s,%s,%s\n" % (city, district, name, location, unitPrice)
                print(formatStr)
                fw.write(formatStr)
            except Exception as e:
                print("listing skipped on %s: %s" % (url, e))
216 |
217 |
218 |
219 |
220 |
221 |
222 |
223 |
224 |
225 |
226 |
227 |
228 |
229 |
230 |
231 |
232 |
233 |
234 |
--------------------------------------------------------------------------------