├── .gitattributes ├── .gitignore ├── README.md ├── config.ini ├── house.py ├── reports └── 房屋价格情况统计20190124.html ├── requirements.txt └── source ├── __init__.py ├── anjuke.py ├── beike.py ├── common.py ├── ganji.py ├── lianjia.py ├── read.py ├── report.py ├── save.py ├── template.py └── tongcheng.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.html linguist-language=python 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | *.iml 3 | tmp.py 4 | __pycache__/* 5 | .idea/* 6 | test/* 7 | *.html 8 | temp.py 9 | source/__pycache__/* 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # house 2 | 此为二手房数据,需要新房数据的移步[new_house](https://github.com/tree-branch/new_house) 3 | 4 | 爬取贝壳找房、链家的房源信息,便于广大未买房子的朋友们尽快成为房奴!!!Crawls second-hand housing listings from ke.com and lianjia.com (support for anjuke.com, 58.com and ganji.com will follow in a later update), so that everyone who has not bought a home yet can become a mortgage slave as soon as possible!!! 
# encoding:utf-8
# python3.0
"""Crawl second-hand house listings, store them and open the comparison report.

Run as a script: ``python house.py``.  The search URLs below are examples;
replace them with result pages whose filters are encoded in the URL.
"""

from source.save import saveData
from source.common import getHtml
from source.report import reportData
import configparser
import webbrowser
import os
from pathlib import Path


# Beike / ke.com (example: Beijing, Tongzhou, 2.51-4.99M CNY, 80-100 m2).
BEIKE_URLS = (
    '''https://bj.ke.com/ershoufang/tongzhou/co32ba80ea100bp251ep499/''',
    '''https://bj.ke.com/ershoufang/tongzhou/pg2co32ba80ea100bp251ep499/''',
    '''https://bj.ke.com/ershoufang/tongzhou/pg3co32ba80ea100bp251ep499/''',
)

# Lianjia (example: Beijing, 0-6M CNY, 60-100 m2).
LIANJIA_URLS = (
    '''https://bj.lianjia.com/ershoufang/bp0ep600ba60ea100l3/rs/''',
    '''https://bj.lianjia.com/ershoufang/pg2l3ba60ea100ep600/''',
    '''https://bj.lianjia.com/ershoufang/pg3l3ba60ea100ep600/''',
)

# 58.com.  NOTE(review): the original comment described a different search
# (Gaoxin district, 0.8-1.2M CNY, 3 rooms); these URLs query Beijing.
TONGCHENG_URLS = (
    '''http://bj.58.com/ershoufang/?PGTID=0d00000c-0000-099e-5f9d-eb7cd9b2d735&ClickID=1&huansuanyue=200_600&bunengdaikuan=0&area=60_100''',
    '''http://bj.58.com/ershoufang/pn2/?huansuanyue=200_600&bunengdaikuan=0&area=60_100&PGTID=0d300000-0000-0b90-1e0b-bf894f74b13a&ClickID=1''',
    '''http://bj.58.com/ershoufang/pn3/?huansuanyue=200_600&bunengdaikuan=0&area=60_100&PGTID=0d300000-0000-08f9-ba56-6673c850e2b8&ClickID=1''',
)

# Anjuke (example: Beijing, 2-6M CNY, 60-100 m2, newest first).
ANJUKE_URLS = (
    '''https://beijing.anjuke.com/sale/o5/?from_area=60&to_area=100&from_price=200&to_price=600''',
    '''https://beijing.anjuke.com/sale/o5-p2/?from_area=60&to_area=100&from_price=200&to_price=600#filtersort''',
    '''https://beijing.anjuke.com/sale/o5-p3/?from_area=60&to_area=100&from_price=200&to_price=600#filtersort''',
)

# Ganji crawling is disabled.  The URLs that used to be crawled were:
#   http://dl.ganji.com/fang5/gaoxinyuanqu/b80e120h3q2/
#   http://dl.ganji.com/fang5/gaoxinyuanqu/b80e120h3o2q2/
#   http://dl.ganji.com/fang5/gaoxinyuanqu/b80e120h3o3q2/


def _crawl(urls, store):
    """Download every page in *urls*, then hand each page to *store*.

    All pages of a site are fetched before the first one is stored, which is
    the order the script has always used.
    """
    pages = [getHtml(url) for url in urls]
    for page in pages:
        store(page)


def main():
    """Clear old data, crawl every configured site and open the report."""
    # Load the storage settings.
    config = configparser.ConfigParser()
    config.read("config.ini")

    # Remove outdated data before crawling.
    save = saveData(config)
    save.deleteOldData()

    _crawl(BEIKE_URLS, save.beike_save)
    _crawl(LIANJIA_URLS, save.lianjia_save)
    _crawl(TONGCHENG_URLS, save.tongcheng_save)
    _crawl(ANJUKE_URLS, save.anjuke_save)

    print("生成报告中...")
    rep = reportData()
    reportFileName = rep.get_report()
    # dirname(__file__) is '' when the script is started as `python house.py`,
    # which used to produce the invalid URL file:////reports/<name>.  Resolve
    # the script location to an absolute path and let pathlib build the URL.
    report_path = Path(os.path.abspath(__file__)).parent / "reports" / reportFileName
    webbrowser.open(report_path.as_uri())

    print("OVER!!!")


# ------ entry point ------
if __name__ == '__main__':
    main()
| self.span = "" 13 | # 存储中间数据(安居客为总价) 14 | self.strong = "" 15 | # 房屋名称 16 | self.houseName = [] 17 | # 小区名称 18 | self.villageName = [] 19 | # 房子介绍 20 | self.houseNote = [] 21 | # 总价 22 | self.houseTotlePrice = [] 23 | # 单价 24 | self.houseUnitPrice = [] 25 | # 房屋链接 26 | self.houseLink = [] 27 | # 第一张图片 28 | self.houseImg = [] 29 | # 用于标记数据类型 30 | self.flag = [] 31 | # aa 32 | self.aa = () 33 | 34 | def feed(self, data): 35 | super().feed(data) 36 | # 校验数据个数是否统一 37 | size = len(self.houseName) 38 | if len(self.houseName) != size or len(self.villageName) != size or len(self.houseNote) != size \ 39 | or len(self.houseTotlePrice) != size or len(self.houseUnitPrice) != size or len(self.houseLink) != size \ 40 | or len(self.houseImg) != size: 41 | raise ValueError("数据个数不一致:houseName-" + str(len(self.houseName)) + ",villageName-" + str(len(self.villageName)) + 42 | ",houseNote-" + str(len(self.houseNote)) + ",houseTotlePrice-" + str(len(self.houseTotlePrice)) + 43 | ",houseUnitPrice-" + str(len(self.houseUnitPrice)) + ",houseLink-" + str(len(self.houseLink)) + 44 | ",houseImg-" + str(len(self.houseImg))) 45 | return self.houseName, self.villageName, self.houseNote, self.houseTotlePrice, self.houseUnitPrice, self.houseLink, self.houseImg, [0]*len(self.houseImg) 46 | 47 | def handle_starttag(self, tag, attrs): 48 | if tag == "span" and ("class", "comm-address") in attrs: 49 | self.flag.append("villageName") 50 | elif tag == "span" and ("class", "price-det") in attrs: 51 | self.flag.append("houseTotlePrice_2") 52 | elif tag == "span" and ("class", "unit-price") in attrs: 53 | self.flag.append("houseUnitPrice") 54 | elif tag == "span": 55 | self.flag.append("span") 56 | elif tag == "strong": 57 | self.flag.append("strong") 58 | elif tag == "a" and ("class", "houseListTitle ") in attrs: 59 | self.flag.append("houseName") 60 | for attr in attrs: 61 | if attr[0] == "href": 62 | self.houseLink.append(attr[1]) 63 | elif tag == "div" and ("class", "details-item") in attrs: 64 | 
class BeikeParser(HTMLParser):
    """Collect second-hand house listings from a Beike (ke.com) result page.

    ``self.flag`` is a stack of markers.  A start tag pushes the marker of the
    field whose text is expected next; ``handle_data`` stores the text and pops
    the marker.  A ``<span>`` pushes the generic ``"span"`` marker, whose text
    is then routed to whichever field marker lies beneath it.

    Call ``feed(html)`` to parse a page; it returns the collected columns.
    """

    # Result columns, in the order feed() returns and reports them.
    _COLUMNS = ("houseName", "villageName", "houseNote", "houseTotlePrice",
                "houseUnitPrice", "houseLink", "houseImg", "followNum")

    def __init__(self):
        super().__init__()
        # Most recent text captured for a "span" marker.
        self.span = ""
        # Listing titles.
        self.houseName = []
        # Residential community names.
        self.villageName = []
        # Listing descriptions.
        self.houseNote = []
        # Total prices, plus the buffer used to assemble the current one.
        self.houseTotlePrice = []
        self.houseTotlePrice_tmp = ""
        # Unit prices.
        self.houseUnitPrice = []
        # Listing URLs.
        self.houseLink = []
        # First image of every listing.
        self.houseImg = []
        # Number of followers of every listing.
        self.followNum = []
        # Stack of markers describing the text that is expected next.
        self.flag = []
        # Not read anywhere in this class; kept so the attribute still exists.
        self.sign = 0

    def feed(self, data):
        """Parse *data* and return the collected columns.

        Returns:
            A tuple ``(houseName, villageName, houseNote, houseTotlePrice,
            houseUnitPrice, houseLink, houseImg, followNum)`` of lists.

        Raises:
            ValueError: if the columns do not all have the same length, i.e.
                the page layout did not match what the handlers expect.
        """
        super().feed(data)
        sizes = [(name, len(getattr(self, name))) for name in self._COLUMNS]
        if len({size for _, size in sizes}) > 1:
            detail = ",".join("{}-{}".format(name, size) for name, size in sizes)
            raise ValueError("数据个数不一致:" + detail)
        return tuple(getattr(self, name) for name in self._COLUMNS)

    def handle_starttag(self, tag, attrs):
        """Push the marker matching *tag*, or store attribute-borne fields."""
        if tag == "span":
            if ("class", "houseIcon") in attrs:
                # The text following this icon is the listing description.
                self.flag.append("houseNote")
            self.flag.append("span")
        elif tag == "a" and ("class", "img VIEWDATA CLICKDATA maidian-detail") in attrs:
            # Title and link are carried by the attributes of the image link.
            for key, value in attrs:
                if key == "title":
                    self.houseName.append(value)
                elif key == "href":
                    self.houseLink.append(value)
        elif tag == "div" and ("class", "totalPrice totalPrice2") in attrs:
            self.flag.append("houseTotlePrice_2")
        elif tag == "div" and ("class", "unitPrice") in attrs:
            self.flag.append("houseUnitPrice_2")
        elif tag == "img" and ("class", "lj-lazy") in attrs:
            # Only images that carry an alt text are listing pictures; take
            # the first lazy-load source of such an image.
            if any(key == "alt" for key, _ in attrs):
                for key, value in attrs:
                    if key == "data-original":
                        self.houseImg.append(value)
                        break
        elif tag == "div" and ("class", "positionInfo") in attrs:
            self.flag.append("villageName_1")
        elif tag == "a" and self.flag and self.flag[-1] == "villageName_1":
            # First link inside the position block: its text is the community.
            self.flag[-1] = "villageName_2"
        elif tag == "div" and ("class", "followInfo") in attrs:
            self.flag.append("followNum")

    def handle_data(self, data):
        """Store *data* (with spaces removed) in the field the top marker names."""
        data = data.replace(' ', '')
        if not self.flag:
            return
        marker = self.flag[-1]
        if marker == "span":
            self.span = data
            self.flag.pop()
            self._store_span_text()
        elif marker == "houseTotlePrice_2":
            # Text directly inside the price block (e.g. the unit).  The number
            # captured from the <span> is already in the buffer, so only the
            # new text is appended; appending self.span again duplicated the
            # number ("350350万") or prepended stale text of an earlier span.
            if data != "":
                self.houseTotlePrice_tmp += data
            self.span = ""
        elif marker == "villageName_2":
            self.villageName.append(data)
            self.flag.pop()

    def _store_span_text(self):
        """Route the text just captured in ``self.span`` to its owning field."""
        if not self.flag:
            return
        owner = self.flag[-1]
        if owner == "houseUnitPrice_2":
            self.houseUnitPrice.append(self.span)
            self.flag.pop()
        elif owner == "houseNote":
            self.houseNote.append(self.span)
            self.flag.pop()
        elif owner == "followNum":
            # Text looks like "12人关注..."; keep the number before "人".
            self.followNum.append(int(self.span.split('人')[0]))
            self.flag.pop()
        elif owner == "houseTotlePrice_2":
            # Start the price buffer; the marker stays until </div>.
            self.houseTotlePrice_tmp = self.span

    def handle_endtag(self, tag):
        """Finish the total price when its enclosing ``<div>`` closes."""
        if tag == "div" and self.flag and self.flag[-1] == "houseTotlePrice_2":
            self.houseTotlePrice.append(self.houseTotlePrice_tmp)
            self.houseTotlePrice_tmp = ""
            self.flag.pop()
class GanjiParser(HTMLParser):
    """Placeholder parser for ganji.com result pages.

    No tag or data handlers are defined, so parsing a page collects nothing
    and ``feed`` returns empty columns.  The attributes mirror those of the
    other site parsers in this package.
    """

    def __init__(self):
        super().__init__()
        # Scratch text for intermediate data.
        self.span = ""
        # Listing titles.
        self.houseName = []
        # Residential community names.
        self.villageName = []
        # Listing descriptions.
        self.houseNote = []
        # Total prices.
        self.houseTotlePrice = []
        # Unit prices.
        self.houseUnitPrice = []
        # Listing URLs.
        self.houseLink = []
        # First image of every listing.
        self.houseImg = []
        # Stack of markers describing the data expected next.
        self.flag = []

    def feed(self, data):
        """Parse *data* and return the collected columns.

        Returns:
            A tuple of the seven column lists followed by a list of zeros (one
            per image) standing in for the follower count.

        Raises:
            ValueError: if the seven columns differ in length.
        """
        super().feed(data)
        columns = (
            ("houseName", self.houseName),
            ("villageName", self.villageName),
            ("houseNote", self.houseNote),
            ("houseTotlePrice", self.houseTotlePrice),
            ("houseUnitPrice", self.houseUnitPrice),
            ("houseLink", self.houseLink),
            ("houseImg", self.houseImg),
        )
        expected = len(self.houseName)
        if any(len(values) != expected for _, values in columns):
            detail = ",".join(name + "-" + str(len(values)) for name, values in columns)
            raise ValueError("数据个数不一致:" + detail)
        follow_placeholder = [0] * len(self.houseImg)
        return tuple(values for _, values in columns) + (follow_placeholder,)
str(len(self.houseUnitPrice)) + ",houseLink-" + str(len(self.houseLink)) + 40 | ",houseImg-" + str(len(self.houseImg)) + ",followNum-" + str(len(self.followNum))) 41 | return self.houseName, self.villageName, self.houseNote, self.houseTotlePrice, self.houseUnitPrice, self.houseLink, self.houseImg, self.followNum 42 | 43 | def handle_starttag(self, tag, attrs): 44 | if tag == "span": 45 | self.flag.append("span") 46 | elif tag == "a" and ("data-el", "ershoufang") in attrs and ("class", "") in attrs: 47 | self.flag.append("houseName") 48 | for attr in attrs: 49 | if attr[0] == "href": 50 | self.houseLink.append(attr[1]) 51 | elif tag == "a" and ("data-el", "region") in attrs: 52 | self.flag.append("villageName") 53 | elif tag == "a" and ("class", "no_resblock_a") in attrs: 54 | self.flag.append("villageName") 55 | elif tag == "div" and ("class", "houseInfo") in attrs: 56 | self.flag.append("houseNote") 57 | elif tag == "div" and ("class", "totalPrice totalPrice2") in attrs: 58 | self.flag.append("houseTotlePrice_2") 59 | elif tag == "div" and ("class", "unitPrice") in attrs: 60 | self.flag.append("houseUnitPrice_2") 61 | elif tag == "img" and ("class", "lj-lazy") in attrs: 62 | for attr in attrs: 63 | if attr[0] == "alt": 64 | for attr2 in attrs: 65 | if attr2[0] == "data-original": 66 | self.houseImg.append(attr2[1]) 67 | break 68 | break 69 | elif tag == "div" and ("class", "followInfo") in attrs: 70 | self.flag.append("followNum") 71 | 72 | def handle_data(self, data): 73 | data = data.replace(' ', '') 74 | if len(self.flag) > 0: 75 | # print(self.flag) 76 | if self.flag[-1] == "span": 77 | # print(str(data)) 78 | self.span = data 79 | self.flag.pop() 80 | if len(self.flag) > 0 and self.flag[-1] == "houseUnitPrice_2": 81 | self.houseUnitPrice.append(self.span) 82 | self.flag.pop() 83 | elif len(self.flag) > 0 and self.flag[-1] == "followNum": 84 | self.followNum.append(int(self.span.replace(' ', '').split('人')[0])) 85 | self.flag.pop() 86 | elif self.flag[-1] == 
class readData():
    """Read previously saved house data from the configured storage backend.

    The backend is selected by ``config['savetype']['type']`` and is either
    ``'mysql'`` or ``'leancloud'``.  *config* is expected to behave like a
    ``configparser.ConfigParser`` with ``mysql``, ``leancloud`` and
    ``savetype`` sections.
    """

    # Base URL of the LeanCloud REST API used by this project.
    _LEANCLOUD_API = 'https://tpra4qll.api.lncld.net/1.1'
    # Rows requested per LeanCloud page.
    _LEANCLOUD_PAGE_SIZE = 200
    # Seconds to wait for a LeanCloud response before giving up.
    _REQUEST_TIMEOUT = 30

    def __init__(self, config):
        self._config = config

    def _read_leancloud_tablenames(self):
        """Return the LeanCloud class names, sorted in descending order."""
        import requests

        headers = {
            "X-LC-Id": self._config['leancloud']['appid'],
            # Listing schemas requires the master key.
            "X-LC-Key": self._config['leancloud']['masterkey'] + ',master'
        }
        response = requests.get(self._LEANCLOUD_API + '/schemas', headers=headers,
                                timeout=self._REQUEST_TIMEOUT)
        response.raise_for_status()
        return sorted(response.json().keys(), reverse=True)

    def _read_leancloud_data(self, tablename):
        """Return all rows of LeanCloud class *tablename* as a DataFrame.

        Rows are fetched page by page until an empty page is returned, then
        de-duplicated on ``houseLink``.  An empty class yields an empty
        DataFrame.
        """
        import requests
        import pandas as pd

        url = self._LEANCLOUD_API + '/classes/' + str(tablename)
        limit = self._LEANCLOUD_PAGE_SIZE
        headers = {
            "X-LC-Id": self._config['leancloud']['appid'],
            "X-LC-Key": self._config['leancloud']['appkey'],
            "Content-Type": "application/json"
        }
        pages = []
        skip = 0
        while True:
            response = requests.get(url, params={'limit': limit, 'skip': skip},
                                    headers=headers, timeout=self._REQUEST_TIMEOUT)
            response.raise_for_status()
            rows = response.json()["results"]
            if not rows:
                break
            pages.append(pd.DataFrame(rows))
            skip += limit
        if not pages:
            # Nothing stored: there is no houseLink column to de-duplicate on.
            return pd.DataFrame()
        # One concat instead of repeated DataFrame.append (removed in pandas 2).
        return pd.concat(pages).drop_duplicates(['houseLink'])

    def _mysql_connect(self):
        """Open a MySQL connection from the ``[mysql]`` config section."""
        import mysql.connector

        return mysql.connector.connect(
            host=self._config.get('mysql', 'host'),
            user=self._config.get('mysql', 'user'),
            password=self._config.get('mysql', 'passwd'),
            database=self._config.get('mysql', 'db'),
            port=self._config.getint('mysql', 'port'),
            use_unicode=True)

    def _mysql_query(self, sql, params=None):
        """Run *sql* and return the result as a DataFrame, closing the connection."""
        import pandas as pd

        conn = self._mysql_connect()
        try:
            return pd.read_sql(sql, conn, params=params)
        finally:
            conn.close()

    def _read_mysql_tablenames(self):
        """Return the table names of the configured database, descending."""
        # Restrict the listing to the configured schema; without the filter
        # every table visible to the user (including system schemas) is
        # returned, and those cannot be read by _read_mysql_data anyway.
        sql = ("select table_name from information_schema.tables "
               "where table_schema = %s order by table_name DESC")
        frame = self._mysql_query(sql, params=(self._config.get('mysql', 'db'),))
        return frame.iloc[:, 0].tolist()

    def _read_mysql_data(self, tablename):
        """Return every row of MySQL table *tablename* as a DataFrame."""
        # NOTE(review): identifiers cannot be bound as parameters, so the name
        # is interpolated.  Only pass names obtained from read_tablenames().
        return self._mysql_query("""select * from %s""" % tablename)

    def _backend(self):
        """Return the configured savetype, rejecting unsupported values."""
        kind = self._config['savetype']['type']
        if kind not in ('mysql', 'leancloud'):
            raise ValueError(
                "unsupported savetype %r (expected 'mysql' or 'leancloud')" % kind)
        return kind

    def read_tablenames(self):
        """Return the stored table names, sorted in descending order.

        Raises:
            ValueError: if the configured savetype is not supported.
        """
        if self._backend() == 'mysql':
            return self._read_mysql_tablenames()
        return self._read_leancloud_tablenames()

    def read_data(self, tablename):
        """Return the rows of *tablename* as a pandas DataFrame.

        Raises:
            ValueError: if the configured savetype is not supported.
        """
        if self._backend() == 'mysql':
            return self._read_mysql_data(tablename)
        return self._read_leancloud_data(tablename)
row.houseLink].houseTotlePrice.iloc[0]})), ignore_index=True) 40 | else: 41 | other = other.append(row.append(pd.Series({'old_houseTotlePrice': olddata[olddata.houseLink == row.houseLink].houseTotlePrice.iloc[0]})), ignore_index=True) 42 | else: 43 | new = new.append(row.append(pd.Series({'old_houseTotlePrice': '-'})), ignore_index=True) 44 | new['sign'] = '新增' 45 | down['sign'] = '下降' 46 | up['sign'] = '上升' 47 | other['sign'] = '不变' 48 | result = ''' 49 |
升降标志 | 54 |房屋名 | 55 |房屋备注 | 56 |房屋总价 | 57 |房屋历史总价 | 58 |房屋单价 | 59 |关注人数 | 60 |小区名 | 61 |房屋链接 | 62 |来源网站 | 63 |
---|---|---|---|---|---|---|---|---|---|
升降标志 | 75 |房屋名 | 76 |房屋备注 | 77 |房屋总价 | 78 |房屋历史总价 | 79 |房屋单价 | 80 |关注人数 | 81 |小区名 | 82 |房屋链接 | 83 |来源网站 | 84 |
统计时间: %s
108 |报告耗时: %s
109 |