# -*- coding: utf-8 -*-
"""
Created on Fri Jan 20 13:29:47 2017

@author: L

Per-district price statistics and charts for the scraped listings.

Reads ``<city>.txt`` as a header-less CSV whose columns are named
city, district, name, lng, lat, privice (the price column keeps the
original spelling), then draws:

1. a bar chart of the mean price per district with the listing count
   overlaid as a line on a secondary axis;
2. a box plot of the price distribution per district.

A synthetic city-wide entry ('全市') is added next to the real districts.
"""
import random
import numpy as np
import cPickle
from collections import defaultdict
import sys, re
import pandas as pd
import matplotlib.pyplot as plt

city = "chengdu2"
fileName = "%s.txt"%(city)
df = pd.read_csv(fileName, header=None, sep=',')
state = ["city","district","name","lng","lat","privice"]

df.columns = state

# Per-district statistics of the price column (a SeriesGroupBy object).
df_stat = df['privice'].groupby(df['district'])
describe = df_stat.describe().to_dict()
means = df_stat.mean().to_dict()
# The plotting loop below needs the listing count and the standard deviation
# of every district; they were referenced but never computed before.
counts = df_stat.count().to_dict()
std = df_stat.std().to_dict()

describeAll = df['privice'].describe().to_dict()
describeAllNew = {('全市', key): value for key, value in describeAll.items()}

# City-wide aggregates sit next to the per-district ones under one label.
means['全市'] = df['privice'].mean()
counts['全市'] = df['privice'].count()
std['全市'] = df['privice'].std()

# Districts ordered by mean price, highest first.
sorted_means = sorted(means.items(), key=lambda d:d[1], reverse = True )

# The bar chart is drawn in ascending order, i.e. the reverse of sorted_means.
ascending = sorted_means[::-1]
districtArray = [district for district, _ in ascending]
priviceArray = [privice for _, privice in ascending]
countArray = [counts[district] for district in districtArray]
stdArray = [std[district] for district in districtArray]

# Select the style before any figure exists; applying it after drawing does
# not restyle artists that were already created.
plt.style.use('ggplot')

N = len(priviceArray)
ind = np.arange(N)  # the x locations for the groups
width = 0.35        # the width of the bars
fig, ax = plt.subplots()
rects1 = ax.bar(ind, priviceArray, width, color='darkorange')

# Secondary y axis carrying the number of listings per district.
right_data = countArray
axf = ax.twinx()
print(len(ax.get_xticks()))
print(len(right_data))
rects2 = axf.plot(ind, right_data, color='forestgreen')
axf.set_ylabel('房屋数量/套')
axf.set_ylim(0,4000)

ax.set_xlabel('地区')
ax.set_ylabel('均价/元')
ax.set_title('成都市各区2017年1月二手房房屋均价--房屋数量')
ax.set_xticks(ind+width)
ax.set_xticklabels( districtArray )


def autolabel(rects):
    """Write the integer height above each bar of *rects* on ``ax``."""
    for rect in rects:
        height = rect.get_height()
        ax.text(rect.get_x()+rect.get_width()/2., 1.05*height, '%d'%int(height),
                ha='center', va='bottom')


def autolabel_line(rects):
    """Write the integer y value next to every point of each line in *rects* on ``axf``."""
    for rect in rects:
        print(rect.get_ydata())
        X = rect.get_xdata()
        Y = rect.get_ydata()
        for x,y in zip(X,Y):
            axf.text(x, 1.01*y, '%d'%int(y),color='forestgreen',
                     ha='center', va='bottom')


autolabel(rects1)
autolabel_line(rects2)
plt.show()


# Box plot: one box per district, in descending order of mean price.
boxData = []
boxLables = []
for district1, _ in sorted_means:
    if district1 != '全市':
        print(district1)
        # Boolean-mask selection; the old ``.ix`` indexer no longer exists in pandas.
        data = df[df.district == district1]['privice'].values
    else:
        data = df['privice'].values
    boxData.append(data)
    boxLables.append(district1)

fig, ax = plt.subplots()
ax.boxplot((boxData),labels=(boxLables))
ax.set_xlabel('地区')
ax.set_ylabel('价格(/元)')
ax.set_title('成都市各区2017年1月二手房房屋价格统计')
# An Axes has no ``show`` method; figures are displayed through pyplot.
plt.show()
def calcLatLngDist(Lat_A, Lng_A, Lat_B, Lng_B, Debug = False ):
    """Return the distance in kilometres between two points on the Earth.

    The great-circle angle between the points is computed on reduced
    latitudes and then corrected with a first-order flattening term, using
    an equatorial radius of 6378.140 km and a polar radius of 6356.755 km.

    Args:
        Lat_A: latitude of point A in degrees (number or numeric string).
        Lng_A: longitude of point A in degrees.
        Lat_B: latitude of point B in degrees.
        Lng_B: longitude of point B in degrees.
        Debug: when true, print a diagnostic for the short-circuit and
            error paths.

    Returns:
        The distance in km, or 0 when the points are identical or when the
        formula cannot be evaluated for them (e.g. points so close that the
        correction term degenerates to 0/0).

    Raises:
        ValueError: if an argument cannot be converted to ``float``.

    Note that latitude comes first: a "lng,lat" pair must be swapped before
    it is passed in, e.g.
    ``calcLatLngDist(30.655364, 104.131182, 30.67, 104.06)`` is about 7 km.
    """
    # Imported locally so the function does not depend on a star import
    # elsewhere in the file.
    from math import acos, atan, cos, radians, sin, tan

    Lat_A = float(Lat_A)
    Lng_A = float(Lng_A)
    Lat_B = float(Lat_B)
    Lng_B = float(Lng_B)
    if Lat_A == Lat_B and Lng_A == Lng_B:
        if Debug:
            print("_calcLatLngDist return as equal")
        return 0

    ra = 6378.140  # equatorial radius (km)
    rb = 6356.755  # polar radius (km)
    flatten = (ra - rb) / ra  # flattening of the Earth
    # Reduced (parametric) latitudes of both points.
    pA = atan(rb / ra * tan(radians(Lat_A)))
    pB = atan(rb / ra * tan(radians(Lat_B)))
    try:
        cosAngle = sin(pA) * sin(pB) + cos(pA) * cos(pB) * cos(radians(Lng_A) - radians(Lng_B))
        # Rounding can push the cosine a hair outside [-1, 1], which acos rejects.
        xx = acos(max(-1.0, min(1.0, cosAngle)))
        c1 = (sin(xx) - xx) * (sin(pA) + sin(pB)) ** 2 / cos(xx / 2) ** 2
        c2 = (sin(xx) + xx) * (sin(pA) - sin(pB)) ** 2 / sin(xx / 2) ** 2
        dr = flatten / 8 * (c1 - c2)
        return ra * (xx + dr)
    except (ValueError, ZeroDivisionError, OverflowError) as e:
        # Only arithmetic failures of the formula are mapped to 0; anything
        # else is a programming error and should surface.
        if Debug:
            print("_calcLatLngDist return as exception:")
            print("%s %s %s %s" % (Lat_A, Lng_A, Lat_B, Lng_B))
            print(e)
        return 0
import json
import re
import io
import datetime
import random
import time
import urllib
import urllib2
import pymongo
import sys
import demjson
from bs4 import BeautifulSoup
from math import *
reload(sys)
sys.setdefaultencoding( "utf-8" )

# NOTE(review): the API keys below are hard-coded credentials committed to the
# repository; they should be moved to configuration and rotated.


def getGdLocation(address):
    """Geocode *address* (searched within 成都) through the Amap REST API.

    Args:
        address: free-text place name or address.

    Returns:
        A ``(location, city, district)`` tuple taken from the first geocode
        hit. ``location`` is the API's "location" string; the callers below
        split it on ',' as longitude then latitude. On any failure the
        result is ``(None, "", "")`` -- this function never raises for
        network or parsing errors.
    """
    appKeys = ['7ba215f6db353d2aa66c55fa4dfe99ad','b80ccd147375ed37bfbdd9025ed9e384','5a9a676d40a02bbcf3cbe2bc581f0e26','7c046c20b0e2c70020d39b4988116127','74182609e8238c9822282985734ed495','595db8b17bed1ad220b79ff08e3fedaa','701dc8acd6d7aec627c678623f40f861','af2dd94fee5153f44d4e34160a4fdbe7','f51c07f0b4cecf5a773a63c87099645c','9f566db560bd026c506e1efaa75c3952','44379632a8984fb7d5d88f7d9bf6e5b2']
    try:
        # Spread the requests over the available keys.
        appK = random.choice(appKeys)
        # NOTE(review): address is interpolated without URL-escaping.
        gdUrl = 'http://restapi.amap.com/v3/geocode/geo?key=%s&address=%s&city=成都'%(appK,address)
        res = json.loads(urllib2.urlopen(urllib2.Request(gdUrl)).read())
        if res and res['info'] == 'OK' and 'geocodes' in res and len(res['geocodes'])>0 and 'location' in res['geocodes'][0]:
            first = res['geocodes'][0]
            return first['location'], first['city'], first['district']
        print(res)
        return None,"",""
    except Exception as e:
        # Best effort by design: report the reason instead of hiding it.
        print("getGdLocation failed: %s" % e)
        return None,"",""


appKeys = [ "4032f6db1085b0c63683ef3917e40428","IkSvwkWPwCuICyAjnS0QGBzw","6WDQYk8GK6CbusVvepkSQKST","CvbddAko7nt1layAy2IPYuZe", "0ufxKGZM4j0dyzwK7FF6fS5L", "Ni07CGCmkAiRCtCTcq1rql4B","tnPseOMpG3G02Rk5pWN2NXBt","Weefu0Q7Lj6BTdVLkHYEonQo" ]


def getBdLocation(address):
    """Geocode *address* through the Baidu geocoder (fallback for Amap).

    Args:
        address: free-text place name or address.

    Returns:
        The string ``'lng:<lng>,lat:<lat>'`` on success, otherwise ``None``.
        Network and parsing errors are reported and mapped to ``None``.

    NOTE(review): the 'lng:..,lat:..' format differs from the plain
    "lng,lat" string of getGdLocation, so rows written with this fallback
    do not carry numeric lng/lat fields -- confirm this is intended.
    """
    try:
        appK = random.choice(appKeys)
        # The query must use the address passed in; it used to read the
        # unrelated global ``name`` left over from the scraping loops.
        urlAddress = "http://api.map.baidu.com/geocoder?address=%s&output=json&key=%s"%( address, appK)
        res = json.loads(urllib2.urlopen(urllib2.Request(urlAddress)).read())
        if res and res['status'] == 'OK' and ('result' in res and 'location' in res['result'] and 'lng' in res['result']['location']):
            point = res['result']['location']
            return 'lng:%s,lat:%s'%(point['lng'],point['lat'])
        print(res)
        return None
    except Exception as e:
        print("getBdLocation failed: %s" % e)
        return None


cityList = ["成都"]

# Reference point used to pick between two candidate locations, and the
# largest tolerated disagreement (km) between the two geocoding results.
_CENTER_LAT = 30.67
_CENTER_LNG = 104.06
_MAX_MISMATCH_KM = 6.0


def _fetchSoup(url):
    """Download *url* with browser-like headers and return the parsed page."""
    print("request:"+url)
    req = urllib2.Request( url )
    req.add_header("User-Agent","Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/45.0.2454.101 Safari/537.36")
    req.add_header("Accept","*/*")
    req.add_header("Accept-Language","zh-CN,zh;q=0.8")
    return BeautifulSoup(urllib2.urlopen( req ).read())


def _splitLngLat(location):
    """Split a "lng,lat" string into its ``(lat, lng)`` parts."""
    parts = location.split(',')
    return parts[1], parts[0]


def _resolveLocation(name, address):
    """Geocode a second-hand listing and cross-check the two results.

    The seller-entered estate name and address may be wrong, so both
    *name* and *address* are geocoded and compared.

    Returns:
        ``(location, city, district)``, or ``None`` when the listing should
        be skipped (results disagree by more than the threshold, or no
        geocoder produced a location).
    """
    location1,city1,district1 = getGdLocation(name)
    location2,city2,district2 = getGdLocation(address)
    location,city,district = None,city2,district2
    if location1 and location2:
        lat1,lng1 = _splitLngLat(location1)
        lat2,lng2 = _splitLngLat(location2)
        # calcLatLngDist takes latitude first; the values used to be passed
        # in (lng, lat) order, which distorted every distance.
        distince = calcLatLngDist(lat1, lng1, lat2, lng2)
        print(distince)
        if distince > _MAX_MISMATCH_KM:
            return None
        print("distince验证通过:%s"%(distince))
        distince1 = calcLatLngDist(lat1, lng1, _CENTER_LAT, _CENTER_LNG)
        distince2 = calcLatLngDist(lat2, lng2, _CENTER_LAT, _CENTER_LNG)
        if distince1 < distince2:
            # Keep city/district consistent with the location that is used.
            location,city,district = location1,city1,district1
        else:
            location = location2
    if not location:
        location = getBdLocation(address)
    if not location:
        print("地址解析失败")
        return None
    return location,city,district


def _scrapeNewHouses(fw):
    """Append one CSV row per new-build listing (pages 1-32) to *fw*."""
    for pa in range(1, 33):
        if pa==1:
            url = "http://cd.fang.lianjia.com/loupan/"
        else:
            url = "http://cd.fang.lianjia.com/loupan/pg%d/"%(pa)
        try:
            soup = _fetchSoup(url)
            for panel in soup.findAll('div', attrs = {'class': 'info-panel'}):
                name = panel.findAll('a', attrs = {'target': '_blank'})[0].text
                privice = panel.findAll('span', attrs = {'class': 'num'})[0].text
                region = panel.findAll('span', attrs = {'class': 'region'})
                address = region[0].text.split('(')[0]
                # Resolve coordinates: Amap by name first, Baidu by address as fallback.
                location,city,district = getGdLocation(name)
                if not location:
                    location = getBdLocation(address)
                if not location:
                    continue
                formatStr = "%s,%s,%s,%s,%s\n"%(city,district,name,location,privice)
                print(formatStr)
                fw.write(formatStr)
        except Exception as e:
            # A broken page must not stop the crawl, but say why it was skipped.
            print("skip %s: %s" % (url, e))


def _scrapeSecondHand(fw):
    """Append one CSV row per second-hand listing (pages 1-100) to *fw*."""
    for pa in range(1, 101):
        if pa==1:
            url = "http://cd.lianjia.com/ershoufang/"
        else:
            url = "http://cd.lianjia.com/ershoufang/pg%d/"%(pa)
        try:
            soup = _fetchSoup(url)
            content = soup.findAll('div', attrs = {'class': 'content'})
            listing = content[0].findAll('ul', attrs = {'class': 'sellListContent'})
            for item in listing[0].findAll('li', attrs = {'class': 'clear'}):
                address = item.findAll('div', attrs = {'class': 'address'})
                address = address[0].findAll('a', attrs = {'target': '_blank'})[0].text
                address = address.replace(" ","")

                addressAddInfo = item.findAll('div', attrs = {'class': 'positionInfo'})[0].findAll('a', attrs = {'target': '_blank'})[0].text
                name = address
                address = address +"_" + addressAddInfo

                print(address)
                unitPrice = item.findAll('div', attrs = {'class': 'unitPrice'})[0].text
                unitPrice = unitPrice.replace("单价","").replace("元/平米","")

                resolved = _resolveLocation(name, address)
                if resolved is None:
                    continue
                location,city,district = resolved
                formatStr = "%s,%s,%s,%s,%s\n"%(city,district,name,location,unitPrice)
                print(formatStr)
                fw.write(formatStr)
        except Exception as e:
            print("skip %s: %s" % (url, e))


# New-build listings.
with open("./chengdu.txt","a+") as fw:
    _scrapeNewHouses(fw)

# Second-hand listings.
with open("./chengdu2.txt","a+") as fw:
    _scrapeSecondHand(fw)
4 |