├── README.md ├── analysis.py ├── spy.py └── web ├── templates └── hotregion.template.html └── webapp.py /README.md: -------------------------------------------------------------------------------- 1 | # housePriceSpider 2 | 基于链家房产网的房价数据爬取及分析 3 | -------------------------------------------------------------------------------- /analysis.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jan 20 13:29:47 2017 4 | 5 | @author: L 6 | """ 7 | import random 8 | import numpy as np 9 | import cPickle 10 | from collections import defaultdict 11 | import sys, re 12 | import pandas as pd 13 | import numpy as np 14 | import matplotlib.pyplot as plt 15 | 16 | city = "chengdu2" 17 | fileName = "%s.txt"%(city) 18 | df = pd.read_csv(fileName, header=None, sep=',') 19 | state = ["city","district","name","lng","lat","privice"] 20 | 21 | df.columns = state 22 | 23 | #district stsat describe 24 | #this will return a Series Object 25 | df_stat = df['privice'].groupby(df['district']) 26 | describe = df_stat.describe().to_dict() 27 | means = df_stat.mean().to_dict() 28 | 29 | describeAll = df['privice'].describe().to_dict() 30 | describeAllNew = {} 31 | for key in describeAll: 32 | describeAllNew[('全市',key)] = describeAll[key] 33 | 34 | means['全市'] = df['privice'].mean() 35 | #plot means 36 | sorted_means = sorted(means.iteritems(), key=lambda d:d[1], reverse = True ) 37 | 38 | districtArray = [] 39 | stdArray = [] 40 | priviceArray = [] 41 | countArray = [] 42 | for item in sorted_means: 43 | district = item[0] 44 | privice = item[1] 45 | districtArray.insert(0,district) 46 | priviceArray.insert(0,privice) 47 | countArray.insert(0,counts[district]) 48 | stdArray.insert(0,std[district]) 49 | N = len(priviceArray) 50 | ind = np.arange(N) # the x locations for the groups 51 | width = 0.35 # the width of the bars 52 | fig, ax = plt.subplots() 53 | rects1 = ax.bar(ind, priviceArray, width, color='darkorange') 54 | 55 | right_data = countArray 56 | axf = ax.twinx() 57 | print len(ax.get_xticks()) 58 | print len(right_data) 59 | rects2 = axf.plot(ind, right_data, color='forestgreen') 60 | #axf.set_ylim((0, 20)) 61 | axf.set_ylabel('房屋数量/套') 62 | axf.set_ylim(0,4000) 63 | 64 | ax.set_xlabel('地区') 65 | ax.set_ylabel('均价/元') 66 | ax.set_title('成都市各区2017年1月二手房房屋均价--房屋数量') 67 | ax.set_xticks(ind+width) 68 | ax.set_xticklabels( districtArray ) 69 | def autolabel(rects): 70 | # attach some text labels 71 | for rect in rects: 72 | height = rect.get_height() 73 | ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%d'%int(height), 74 | ha='center', va='bottom') 75 | def autolabel_line(rects): 76 | # attach some text labels 77 | for rect in rects: 78 | print rect.get_ydata() 79 | X = rect.get_xdata() 80 | Y = rect.get_ydata() 81 | for x,y in zip(X,Y): 82 | axf.text(x, 1.01*y, '%d'%int(y),color='forestgreen', 83 | ha='center', va='bottom') 84 | 85 | autolabel(rects1) 86 | autolabel_line(rects2) 87 | plt.style.use('ggplot') 88 | plt.show() 89 | 90 | 91 | boxData = [] 92 | boxLables = [] 93 | for item in sorted_means: 94 | district1 = item[0] 95 | if district1 != '全市' : 96 | print district1 97 | df_tmp = df.ix[df.district==district1] 98 | data = df_tmp['privice'].values 99 | boxData.append(data) 100 | boxLables.append(district1) 101 | else: 102 | data = df['privice'].values 103 | boxData.append(data) 104 | boxLables.append('全市') 105 | 106 | fig, ax = plt.subplots() 107 | ax.boxplot((boxData),labels=(boxLables)) 108 | ax.set_xlabel('地区') 109 | ax.set_ylabel('价格(/元)') 110 | ax.set_title('成都市各区2017年1月二手房房屋价格统计') 111 | plt.style.use('ggplot') 112 | ax.show() 113 | 114 | 115 | 116 | 117 | 118 | -------------------------------------------------------------------------------- /spy.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import json 3 | import re 4 | import io 5 | import datetime 6 | import random 7 | import time 8 | import urllib 9 | import urllib2 10 | import pymongo 11 | import datetime 12 | import time 13 | import sys 14 | import demjson 15 | from bs4 import BeautifulSoup 16 | from math import * 17 | reload(sys) 18 | sys.setdefaultencoding( "utf-8" ) 19 | 20 | # input Lat_A 纬度A 21 | # input Lng_A 经度A 22 | # input Lat_B 纬度B 23 | # input Lng_B 经度B 24 | # output distance 距离(km) 25 | def calcLatLngDist(Lat_A, Lng_A, Lat_B, Lng_B, Debug = False ): 26 | Lat_A = float(Lat_A) 27 | Lng_A = float(Lng_A) 28 | Lat_B = float(Lat_B) 29 | Lng_B = float(Lng_B) 30 | #print Lat_A, Lng_A, Lat_B, Lng_B 31 | if Lat_A == Lat_B and Lng_A == Lng_B: 32 | if Debug: print "_calcLatLngDist return as equal" 33 | return 0 34 | 35 | ra = 6378.140 # 赤道半径 (km) 36 | rb = 6356.755 # 极半径 (km) 37 | flatten = (ra - rb) / ra # 地球扁率 38 | rad_lat_A = radians(Lat_A) 39 | rad_lng_A = radians(Lng_A) 40 | rad_lat_B = radians(Lat_B) 41 | rad_lng_B = radians(Lng_B) 42 | pA = atan(rb / ra * tan(rad_lat_A)) 43 | pB = atan(rb / ra * tan(rad_lat_B)) 44 | distance = 0 45 | try: 46 | xx = acos(sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(rad_lng_A - rad_lng_B)) 47 | c1 = (sin(xx) - xx) * (sin(pA) + sin(pB)) ** 2 / cos(xx / 2) ** 2 48 | c2 = (sin(xx) + xx) * (sin(pA) - sin(pB)) ** 2 / sin(xx / 2) ** 2 49 | dr = flatten / 8 * (c1 - c2) 50 | distance = ra * (xx + dr) 51 | except Exception,e: 52 | if Debug: 53 | print "_calcLatLngDist return as exception:" 54 | print Lat_A, Lng_A, Lat_B, Lng_B 55 | print e 56 | return 0 57 | return distance 58 | calcLatLngDist(104.131182,30.655364, 104.06, 30.67) 59 | 60 | def getGdLocation(address): 61 | try: 62 | #Key名称:adrDev249091488 63 | Key = '48bdf4f06248826b22e51538dccb5f4d' 64 | appKeys = ['7ba215f6db353d2aa66c55fa4dfe99ad','b80ccd147375ed37bfbdd9025ed9e384','5a9a676d40a02bbcf3cbe2bc581f0e26','7c046c20b0e2c70020d39b4988116127','74182609e8238c9822282985734ed495','595db8b17bed1ad220b79ff08e3fedaa','701dc8acd6d7aec627c678623f40f861','af2dd94fee5153f44d4e34160a4fdbe7','f51c07f0b4cecf5a773a63c87099645c','9f566db560bd026c506e1efaa75c3952','44379632a8984fb7d5d88f7d9bf6e5b2'] 65 | rdOff = random.randint( 0, len( appKeys ) - 1 ) 66 | appK = appKeys[rdOff] 67 | gdUrl = 'http://restapi.amap.com/v3/geocode/geo?key=%s&address=%s&city=成都'%(appK,address) 68 | req = urllib2.Request( gdUrl ) 69 | data = urllib2.urlopen( req ) 70 | res = data.read() 71 | res = json.loads( res ) 72 | if res and res['info'] == 'OK' and 'geocodes' in res and len(res['geocodes'])>0 and 'location' in res['geocodes'][0]: 73 | location = res['geocodes'][0]['location'] 74 | city = res['geocodes'][0]['city'] 75 | district = res['geocodes'][0]['district'] 76 | return location,city,district 77 | else: 78 | print res 79 | return None,"","" 80 | except: 81 | return None,"","" 82 | 83 | appKeys = [ "4032f6db1085b0c63683ef3917e40428","IkSvwkWPwCuICyAjnS0QGBzw","6WDQYk8GK6CbusVvepkSQKST","CvbddAko7nt1layAy2IPYuZe", "0ufxKGZM4j0dyzwK7FF6fS5L", "Ni07CGCmkAiRCtCTcq1rql4B","tnPseOMpG3G02Rk5pWN2NXBt","Weefu0Q7Lj6BTdVLkHYEonQo" ] 84 | def getBdLocation(address): 85 | try: 86 | rdOff = random.randint( 0, len( appKeys ) - 1 ) 87 | appK = appKeys[rdOff] 88 | urlAddress = "http://api.map.baidu.com/geocoder?address=%s&output=json&key=%s"%( name, appK) 89 | req = urllib2.Request( urlAddress ) 90 | data = urllib2.urlopen( req ) 91 | res = data.read() 92 | res = json.loads( res ) 93 | lat = -1 94 | lng = -1 95 | if res and res['status'] == 'OK' and ('result' in res and 'location' in res['result'] and 'lng' in res['result']['location']): 96 | lng = res['result']['location']['lng'] 97 | lat = res['result']['location']['lat'] 98 | location = 'lng:%s,lat:%s'%(lng,lat) 99 | return location 100 | else: 101 | print res 102 | return None 103 | except: 104 | return None 105 | cityList = ["成都"] 106 | 107 | fw = open("./chengdu.txt","a+") 108 | index = [i+1 for i in range(32)] 109 | for pa in index: 110 | try: 111 | if pa==1: 112 | url = "http://cd.fang.lianjia.com/loupan/" 113 | else: 114 | url = "http://cd.fang.lianjia.com/loupan/pg%d/"%(pa) 115 | print "request:"+url 116 | req = urllib2.Request( url ) 117 | req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") 118 | req.add_header("Accept","*/*") 119 | req.add_header("Accept-Language","zh-CN,zh;q=0.8") 120 | 121 | data = urllib2.urlopen( req ) 122 | res = data.read() 123 | #print res 124 | #res = res.replace(" ","") 125 | #print res 126 | #objects = demjson.decode(res) 127 | 128 | soup = BeautifulSoup(res) 129 | houseLst = soup.findAll(id='house-lst') 130 | resp = soup.findAll('div', attrs = {'class': 'info-panel'}) 131 | 132 | for i in range(len(resp)): 133 | name = resp[i].findAll('a', attrs = {'target': '_blank'})[0].text 134 | 135 | privice = resp[i].findAll('span', attrs = {'class': 'num'}) 136 | privice = privice[0].text 137 | 138 | region = resp[i].findAll('span', attrs = {'class': 'region'}) 139 | address = region[0].text.split('(')[0] 140 | ##解析获得经纬度 141 | location,city,district = getGdLocation(name) 142 | if not location: 143 | location = getBdLocation(address)#自定义函数 144 | if not location: 145 | continue 146 | formatStr = "%s,%s,%s,%s,%s\n"%(city,district,name,location,privice) 147 | print formatStr 148 | fw.write(formatStr) 149 | except: 150 | pass 151 | fw.close() 152 | 153 | 154 | #二手房数据获取 155 | fw = open("./chengdu2.txt","a+") 156 | index = [i+1 for i in range(100)] 157 | for pa in index: 158 | try: 159 | if pa==1: 160 | url = "http://cd.lianjia.com/ershoufang/" 161 | else: 162 | url = "http://cd.lianjia.com/ershoufang/pg%d/"%(pa) 163 | print "request:"+url 164 | req = urllib2.Request( url ) 165 | req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36") 166 | req.add_header("Accept","*/*") 167 | req.add_header("Accept-Language","zh-CN,zh;q=0.8") 168 | 169 | data = urllib2.urlopen( req ) 170 | res = data.read() 171 | 172 | soup = BeautifulSoup(res) 173 | resp = soup.findAll('div', attrs = {'class': 'content'}) 174 | resp = resp[0].findAll('ul', attrs = {'class': 'sellListContent'}) 175 | resp = resp[0].findAll('li', attrs = {'class': 'clear'}) 176 | for i in range(len(resp)): 177 | address = resp[i].findAll('div', attrs = {'class': 'address'}) 178 | address = address[0].findAll('a', attrs = {'target': '_blank'})[0].text 179 | address = address.replace(" ","") 180 | 181 | addressAddInfo = resp[i].findAll('div', attrs = {'class': 'positionInfo'})[0].findAll('a', attrs = {'target': '_blank'})[0].text 182 | name = address 183 | address = address +"_" + addressAddInfo 184 | 185 | print address 186 | unitPrice = resp[i].findAll('div', attrs = {'class': 'unitPrice'})[0].text 187 | unitPrice = unitPrice.replace("单价","").replace("元/平米","") 188 | ##解析获得经纬度 189 | ##售房者填写的地址和楼盘名字可能有误,通过楼盘和区域名来获取经纬度的同时,验证其可靠性 190 | location = None 191 | location1,city,district = getGdLocation(name) 192 | location2,city,district = getGdLocation(address) 193 | if location1 and location2: 194 | distince = calcLatLngDist(location1.split(',')[0], location1.split(',')[1], location2.split(',')[0], location2.split(',')[1]) 195 | print distince 196 | if distince > 6.0: 197 | continue 198 | print "distince验证通过:%s"%(distince) 199 | distince1 = calcLatLngDist(location1.split(',')[0], location1.split(',')[1], 104.06, 30.67) 200 | distince2 = calcLatLngDist(location2.split(',')[0], location2.split(',')[1], 104.06, 30.67) 201 | if distince1 < distince2: 202 | location = location1 203 | else: 204 | location = location2 205 | if not location: 206 | location = getBdLocation(address) 207 | if not location: 208 | print "地址解析失败" 209 | continue 210 | formatStr = "%s,%s,%s,%s,%s\n"%(city,district,name,location,unitPrice) 211 | print formatStr 212 | fw.write(formatStr) 213 | except: 214 | pass 215 | fw.close() 216 | 217 | 218 | 219 | 220 | 221 | 222 | 223 | 224 | 225 | 226 | 227 | 228 | 229 | 230 | 231 | 232 | 233 | 234 | -------------------------------------------------------------------------------- /web/templates/hotregion.template.html: -------------------------------------------------------------------------------- 1 | 2 | {% autoescape None %} 3 | 4 | 房价热力图 5 | 6 | 7 | 8 | 9 | 10 | 11 | 14 | 15 | 16 |
17 |
18 | 19 | 20 | 21 | 22 | 127 | 128 | 129 | -------------------------------------------------------------------------------- /web/webapp.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | @author: zbc 4 | """ 5 | import os 6 | import tornado.ioloop 7 | import tornado.httpserver 8 | import tornado.web 9 | import json 10 | import math 11 | import sys 12 | reload(sys) 13 | sys.setdefaultencoding( "utf-8" ) 14 | 15 | '''' 16 | 生成hotregin 17 | ''' 18 | def _mkF( sf ): 19 | vf = round( float(sf), 9 ) 20 | return vf 21 | 22 | def _loadLocations(filename): 23 | if filename== None:return 24 | 25 | filen = filename 26 | hd = file( filen, "r" ) 27 | if not hd: return [ [0,[] ], [] ] 28 | pts = [] 29 | for line in hd: 30 | ay = line.strip().split(",") 31 | if len( ay ) < 6: continue 32 | pts.append( [ ay[4], ay[3], ay[5] ] ) 33 | hd.close() 34 | hotRegion = _loadPts( pts ) 35 | return hotRegion 36 | 37 | def _loadPts( fpts ): 38 | d = {} 39 | #print len(fpts) 40 | for ay in fpts: 41 | if len( ay ) != 3: continue 42 | try: 43 | latf = _mkF( ay[0] ) 44 | lotf = _mkF( ay[1] ) 45 | r = [ latf, lotf ] 46 | k = "%f %f"%( r[0], r[1] ) 47 | d[k] = [ r, int(ay[2]) ] 48 | except:pass 49 | 50 | #print len(d) 51 | if len( d ) == 0: return [ 0, [] ] 52 | 53 | dd = [] 54 | for k in d: dd.append( d[k] ) 55 | 56 | len1 = len(dd) 57 | dd = sorted( dd, key=lambda x:x[1], reverse = True ) 58 | sz = min( 50000, len1 ) 59 | dd = dd[0: sz ] 60 | 61 | ret = [] 62 | maxv = 0 63 | minv = 10000 64 | array = [] 65 | i = 0 66 | for k in dd: 67 | r = k[0] 68 | c = k[1] 69 | c = int( round( math.log( c, 4 ) + 4, 0 ) ) 70 | array.append({"lng": r[1],"lat":r[0], "count": c*10}) 71 | i = i + 1 72 | 73 | return array 74 | """ 75 | def _loadUserPts( fpts ): 76 | d = {} 77 | #print len(fpts) 78 | for ay in fpts: 79 | if len( ay ) != 2: continue 80 | try: 81 | latf = _mkF( ay[0] ) 82 | lotf = _mkF( ay[1] ) 83 | 84 | r = [ latf, lotf ] 85 | k = "%f %f"%( r[0], r[1] ) 86 | #print k 87 | if k not in d: 88 | d[k] = [ r, 1 ] 89 | else: 90 | d[k][1] = d[k][1] + 1 91 | except:pass 92 | 93 | #print len(d) 94 | if len( d ) == 0: return [ 0, [] ] 95 | 96 | dd = [] 97 | for k in d: dd.append( d[k] ) 98 | 99 | len1 = len(dd) 100 | dd = sorted( dd, key=lambda x:x[1], reverse = True ) 101 | sz = min( 50000, len1 ) 102 | dd = dd[0: sz ] 103 | 104 | ret = [] 105 | maxv = 0 106 | minv = 10000 107 | array = [] 108 | i = 0 109 | for k in dd: 110 | r = k[0] 111 | c = k[1] 112 | c = int( round( math.log( c, 4 ) + 4, 0 ) ) 113 | array.append({"lng": r[1],"lat":r[0], "count": c*10}) 114 | i = i + 1 115 | 116 | return array 117 | """ 118 | map_center = {"chengdu":[30.67, 104.064],"beijing":[39.9772370000,116.3959960000],"shanghai":[31.236305,121.480237]} 119 | map_level = {"chengdu":13,"beijing":12,"shanghai":12} 120 | 121 | class orderClusterHandler(tornado.web.RequestHandler): 122 | def get(self): 123 | if True : 124 | mapCentor = [30.67, 104.064] 125 | mapLevel = 14 126 | filename = "../chengdu.txt" 127 | #热点数据 128 | hot_regin = _loadLocations(filename) 129 | self.render( "hotregion.template.html", 130 | center_lon=mapCentor[1], center_lat=mapCentor[0], 131 | hotregin=hot_regin) 132 | 133 | def make_app(): 134 | settings = { 135 | "template_path": os.path.join(os.path.dirname(__file__), "templates"), 136 | "static_path":os.path.join(os.path.dirname(__file__), "src"), 137 | "cookie_secret": "bZJc2sWbQLKos6GkHn/VB9oXwQt8S0R0kRvJ5/xJ89E=", 138 | "xsrf_cookies": False, 139 | } 140 | 141 | return tornado.web.Application(handlers=[ (r"/", orderClusterHandler), 142 | ], **settings) 143 | 144 | if __name__ == "__main__": 145 | app = make_app() 146 | app.listen(8110) 147 | tornado.ioloop.IOLoop.current().start() 148 | --------------------------------------------------------------------------------