├── .gitattributes ├── .gitignore ├── README.md ├── config.ini ├── house.py ├── reports └── 房屋价格情况统计20190124.html ├── requirements.txt └── source ├── __init__.py ├── anjuke.py ├── beike.py ├── common.py ├── ganji.py ├── lianjia.py ├── read.py ├── report.py ├── save.py ├── template.py └── tongcheng.py /.gitattributes: -------------------------------------------------------------------------------- 1 | *.html linguist-language=python 2 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | *.bak 2 | *.iml 3 | tmp.py 4 | __pycache__/* 5 | .idea/* 6 | test/* 7 | *.html 8 | temp.py 9 | source/__pycache__/* 10 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # house 2 | 此为二手房数据,需要新房数据的移步[new_house](https://github.com/tree-branch/new_house) 3 | 4 | 爬取贝壳找房、链家的房源信息,便于广大未买房子的朋友们尽快成为房奴!!!Crawls second-hand housing listings from ke.com and lianjia.com (anjuke.com, 58.com and ganji.com will follow in a later update), to help everyone who has not yet bought a home become a mortgage slave as soon as possible!!!
5 | 6 | ## 直接运行 7 | 修改config.ini内的mysql链接地址 8 | 9 | python3.0及以上版本 10 | 11 | python house.py 12 | 13 | 缺什么包就 pip install *** 14 | 15 | 大概率缺少(pymysql, leancloud, cryptography) 16 | 17 | ## 个性化运行 18 | 此程序是把leancloud作为云数据库使用;在 https://leancloud.cn/ 内建立账号;修改config.ini为自己的App ID App KEY 19 | 20 | python house.py 21 | 22 | 修改house.py内贝壳找房等网站的网址,查询的限定条件需要能够保存在URL内,例如链家的排序也是可以保存在URL内的,一看例子你也应该就懂了,不懂的话就再看一遍,直接给我发邮件当然是最快的办法 :-)。 23 | 24 | ## 联系方式 25 | 有想说的联系:lm521299@sina.com 26 | 27 | ## 20190215log 28 | * 修正使用leancloud时,生成报告时读取数据不全的问题。此bug只影响报告的生成,不影响数据爬取 29 | 30 | ## 20190123log 31 | * 增加简单的数据比较功能 32 | * 使用leancloud的需要添加masterkey参数到config.ini中 33 | 34 | ## 20200605log 35 | * 增加报告上的房屋连接可以直接跳转 36 | 37 | ## 20201117log 38 | * 修复链家数据错误的问题 39 | 40 | ## 20210508log 41 | * 修复链家、贝壳数据错误的问题 42 | 43 | ## 20210906log 44 | * 修复链家、贝壳数据错误的问题 45 | * 增加关注人数字段 46 | 47 | ## 20211214log 48 | * 修复链家数据错误的问题 49 | 50 | 51 | ![](https://img-blog.csdnimg.cn/20200715103658153.png) 52 | 53 | # 希望发现不好用的时候邮件通知我一下,方便我尽快修改,谢谢 :-) 54 | ![](https://starchart.cc/tree-branch/house.svg) 55 | ↑看一下大家什么时候喜欢关注房源信息↑ 56 | -------------------------------------------------------------------------------- /config.ini: -------------------------------------------------------------------------------- 1 | ;MySQL parameters 2 | [mysql] 3 | host = localhost 4 | port = 3306 5 | user = lv 6 | passwd = lv 7 | db = lvdb 8 | 9 | ;leancloud parameters 10 | [leancloud] 11 | appid = tprA4QlLY29nvh5QmiWsNl0s-gzGzoHsz 12 | appkey = idYvbwv28UfweEIJJ01E8bBb 13 | masterkey = I_can_not_tell_you 14 | 15 | ;default mysql 16 | [savetype] 17 | type = mysql 18 | ;type = leancloud 19 | 20 | -------------------------------------------------------------------------------- /house.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | # python3.0 3 | 4 | from source.save import saveData 5 | from source.common import getHtml 6 | from source.report import reportData 7 | import configparser 8 | 
# ------ main entry point ------
if __name__ == '__main__':
    # Load the storage settings from config.ini in the working directory.
    config = configparser.ConfigParser()
    config.read("config.ini")

    # Clear out old data before crawling again.
    save = saveData(config)
    save.deleteOldData()

    # One entry per site: the saver to call and the result pages to crawl.
    # Add or change the links to match your own search; every filter has to
    # be expressible in the URL itself.
    crawl_plan = [
        # Beike (example: Beijing, Tongzhou, 2.51-4.99M CNY, 80-100 m2)
        (save.beike_save, [
            '''https://bj.ke.com/ershoufang/tongzhou/co32ba80ea100bp251ep499/''',
            '''https://bj.ke.com/ershoufang/tongzhou/pg2co32ba80ea100bp251ep499/''',
            '''https://bj.ke.com/ershoufang/tongzhou/pg3co32ba80ea100bp251ep499/''',
        ]),
        # Lianjia (example: Beijing, 0-6M CNY, 60-100 m2)
        (save.lianjia_save, [
            '''https://bj.lianjia.com/ershoufang/bp0ep600ba60ea100l3/rs/''',
            '''https://bj.lianjia.com/ershoufang/pg2l3ba60ea100ep600/''',
            '''https://bj.lianjia.com/ershoufang/pg3l3ba60ea100ep600/''',
        ]),
        # 58.com
        (save.tongcheng_save, [
            '''http://bj.58.com/ershoufang/?PGTID=0d00000c-0000-099e-5f9d-eb7cd9b2d735&ClickID=1&huansuanyue=200_600&bunengdaikuan=0&area=60_100''',
            '''http://bj.58.com/ershoufang/pn2/?huansuanyue=200_600&bunengdaikuan=0&area=60_100&PGTID=0d300000-0000-0b90-1e0b-bf894f74b13a&ClickID=1''',
            '''http://bj.58.com/ershoufang/pn3/?huansuanyue=200_600&bunengdaikuan=0&area=60_100&PGTID=0d300000-0000-08f9-ba56-6673c850e2b8&ClickID=1''',
        ]),
        # Anjuke (example: Beijing, 2-6M CNY, 60-100 m2, newest first)
        (save.anjuke_save, [
            '''https://beijing.anjuke.com/sale/o5/?from_area=60&to_area=100&from_price=200&to_price=600''',
            '''https://beijing.anjuke.com/sale/o5-p2/?from_area=60&to_area=100&from_price=200&to_price=600#filtersort''',
            '''https://beijing.anjuke.com/sale/o5-p3/?from_area=60&to_area=100&from_price=200&to_price=600#filtersort''',
        ]),
        # Ganji is currently disabled (its parser has no tag handlers).
    ]

    for save_page, urls in crawl_plan:
        # Download all pages of a site first and store them afterwards,
        # which keeps the original fetch-then-save order.
        pages = [getHtml(url) for url in urls]
        for page in pages:
            save_page(page)

    print("生成报告中...")
    rep = reportData()
    reportFileName = rep.get_report()
    # __file__ may be a bare relative name (e.g. "house.py"); without
    # abspath() the directory part is empty and the file:// URL is broken.
    base_dir = os.path.dirname(os.path.abspath(__file__))
    webbrowser.open('''file:///''' + base_dir + '''/reports/''' + reportFileName)

    print("OVER!!!")
class AnjukeParser(HTMLParser):
    """Collect listing fields from an Anjuke search-result page.

    The parser keeps a stack of markers (``self.flag``) describing which
    field the next piece of text belongs to.  ``handle_starttag`` pushes a
    marker, ``handle_data`` consumes the text and pops it again.
    """

    def __init__(self):
        super().__init__()
        # Scratch buffer: concatenated <span> texts (the house description).
        self.span = ""
        # Scratch buffer: text of the last <strong> (the total price number).
        self.strong = ""
        # Listing title
        self.houseName = []
        # Residential community name
        self.villageName = []
        # House description
        self.houseNote = []
        # Total price
        self.houseTotlePrice = []
        # Unit price
        self.houseUnitPrice = []
        # Link to the listing
        self.houseLink = []
        # First picture of the listing
        self.houseImg = []
        # Stack of markers naming the field the next text belongs to
        self.flag = []
        # Unused placeholder, kept so the attribute set stays unchanged
        self.aa = ()

    def feed(self, data):
        """Parse *data* and return the collected columns.

        Returns:
            An 8-tuple of lists: names, communities, notes, total prices,
            unit prices, links, images and follower counts.  Anjuke has no
            follower count, so that column is all zeros.

        Raises:
            ValueError: if the columns do not all have the same length,
                which means the page layout no longer matches the parser.
        """
        super().feed(data)
        columns = (
            ("houseName", self.houseName),
            ("villageName", self.villageName),
            ("houseNote", self.houseNote),
            ("houseTotlePrice", self.houseTotlePrice),
            ("houseUnitPrice", self.houseUnitPrice),
            ("houseLink", self.houseLink),
            ("houseImg", self.houseImg),
        )
        if len({len(values) for _, values in columns}) > 1:
            detail = ",".join("%s-%d" % (label, len(values)) for label, values in columns)
            raise ValueError("数据个数不一致:" + detail)
        return (self.houseName, self.villageName, self.houseNote, self.houseTotlePrice,
                self.houseUnitPrice, self.houseLink, self.houseImg, [0] * len(self.houseImg))

    def handle_starttag(self, tag, attrs):
        """Push the marker for the field that the opened tag introduces."""
        if tag == "span":
            if ("class", "comm-address") in attrs:
                self.flag.append("villageName")
            elif ("class", "price-det") in attrs:
                self.flag.append("houseTotlePrice_2")
            elif ("class", "unit-price") in attrs:
                self.flag.append("houseUnitPrice")
            else:
                self.flag.append("span")
        elif tag == "strong":
            self.flag.append("strong")
        # The trailing space in the class value is intentional: it is matched
        # literally against the attribute value.
        elif tag == "a" and ("class", "houseListTitle ") in attrs:
            self.flag.append("houseName")
            for name, value in attrs:
                if name == "href":
                    self.houseLink.append(value)
        elif tag == "div" and ("class", "details-item") in attrs:
            self.flag.append("houseNote_2")
            self.span = ""
        elif tag == "img" and ("width", "180") in attrs:
            for name, value in attrs:
                if name == "src":
                    self.houseImg.append(value)

    def handle_endtag(self, tag):
        """Close a description block and store the span texts gathered in it."""
        if tag == "div" and self.flag and self.flag[-1] == "houseNote_2":
            # An empty buffer means the block held no span text; nothing is stored.
            if self.span != "":
                self.houseNote.append(self.span)
                self.span = ""
            self.flag.pop()

    def handle_data(self, data):
        """Route a piece of text to the field named by the top marker."""
        if not self.flag:
            return
        current = self.flag[-1]
        if current == "span":
            self.span += data
        elif current == "strong":
            self.strong = data
        elif current == "houseName":
            # str.strip replaces the former numpy char-array strip; the
            # result for a plain string is the same.
            self.houseName.append(data.strip())
        elif current == "villageName":
            self.villageName.append(data.strip())
        elif current == "houseTotlePrice_2":
            # The number came from the preceding <strong>, the unit is this text.
            self.houseTotlePrice.append(self.strong + data)
            self.strong = ""
        elif current == "houseUnitPrice":
            self.houseUnitPrice.append(data)
        else:
            # e.g. "houseNote_2": stays on the stack until its </div>.
            return
        self.flag.pop()
class BeikeParser(HTMLParser):
    """Collect listing fields from a Beike (ke.com) search-result page.

    ``self.flag`` is a stack of markers naming the field the next text
    belongs to.  Every ``<span>`` pushes a ``"span"`` marker; when its text
    arrives, the marker underneath decides where that text is stored.
    """

    def __init__(self):
        super().__init__()
        # Scratch buffer: text of the most recent <span>
        self.span = ""
        # Listing title
        self.houseName = []
        # Residential community name
        self.villageName = []
        # House description
        self.houseNote = []
        # Total price
        self.houseTotlePrice = []
        # Buffer used to assemble one total price from several text pieces
        self.houseTotlePrice_tmp = ""
        # Unit price
        self.houseUnitPrice = []
        # Link to the listing
        self.houseLink = []
        # First picture of the listing
        self.houseImg = []
        # Number of followers
        self.followNum = []
        # Stack of markers naming the field the next text belongs to
        self.flag = []
        # Unused, kept so the attribute set stays unchanged
        self.sign = 0

    @staticmethod
    def _follow_count(text):
        """Return the number in front of '人' in *text*, or 0 if there is none."""
        head = text.replace(' ', '').split('人')[0]
        try:
            return int(head)
        except ValueError:
            # A non-numeric follow text must not abort the whole page; 0 keeps
            # the columns aligned.
            return 0

    def feed(self, data):
        """Parse *data* and return the collected columns.

        Returns:
            An 8-tuple of lists: names, communities, notes, total prices,
            unit prices, links, images and follower counts.

        Raises:
            ValueError: if the columns do not all have the same length.
        """
        super().feed(data)
        columns = (
            ("houseName", self.houseName),
            ("villageName", self.villageName),
            ("houseNote", self.houseNote),
            ("houseTotlePrice", self.houseTotlePrice),
            ("houseUnitPrice", self.houseUnitPrice),
            ("houseLink", self.houseLink),
            ("houseImg", self.houseImg),
            ("followNum", self.followNum),
        )
        if len({len(values) for _, values in columns}) > 1:
            detail = ",".join("%s-%d" % (label, len(values)) for label, values in columns)
            raise ValueError("数据个数不一致:" + detail)
        return (self.houseName, self.villageName, self.houseNote, self.houseTotlePrice,
                self.houseUnitPrice, self.houseLink, self.houseImg, self.followNum)

    def handle_starttag(self, tag, attrs):
        """Push markers for, or read attributes of, the opened tag."""
        if tag == "span":
            if ("class", "houseIcon") in attrs:
                # The description text follows this icon span.
                self.flag.append("houseNote")
            self.flag.append("span")
        elif tag == "a":
            if ("class", "img VIEWDATA CLICKDATA maidian-detail") in attrs:
                for name, value in attrs:
                    if name == "title":
                        self.houseName.append(value)
                    elif name == "href":
                        self.houseLink.append(value)
            elif self.flag and self.flag[-1] == "villageName_1":
                # First link inside the position block holds the community name.
                self.flag[-1] = "villageName_2"
        elif tag == "div":
            if ("class", "totalPrice totalPrice2") in attrs:
                self.flag.append("houseTotlePrice_2")
            elif ("class", "unitPrice") in attrs:
                self.flag.append("houseUnitPrice_2")
            elif ("class", "positionInfo") in attrs:
                self.flag.append("villageName_1")
            elif ("class", "followInfo") in attrs:
                self.flag.append("followNum")
        elif tag == "img" and ("class", "lj-lazy") in attrs:
            # Only images that carry an alt attribute are listing pictures;
            # take their first lazy-load source.
            if any(name == "alt" for name, _ in attrs):
                for name, value in attrs:
                    if name == "data-original":
                        self.houseImg.append(value)
                        break

    def handle_data(self, data):
        """Route a piece of text (spaces removed) to the current field."""
        data = data.replace(' ', '')
        if not self.flag:
            return
        current = self.flag[-1]
        if current == "span":
            self.span = data
            self.flag.pop()
            owner = self.flag[-1] if self.flag else None
            if owner == "houseUnitPrice_2":
                self.houseUnitPrice.append(self.span)
                self.flag.pop()
            elif owner == "houseNote":
                self.houseNote.append(self.span)
                self.flag.pop()
            elif owner == "followNum":
                self.followNum.append(self._follow_count(self.span))
                self.flag.pop()
            elif owner == "houseTotlePrice_2":
                # The price number; the marker stays until the closing </div>.
                self.houseTotlePrice_tmp = self.span
        elif current == "houseTotlePrice_2":
            if data != "":
                # The span text is already in the buffer, so only the new text
                # is added.  Adding the span again duplicated the number
                # ("300" + "300" + "万").
                self.houseTotlePrice_tmp += data
                self.span = ""
        elif current == "villageName_2":
            self.villageName.append(data)
            self.flag.pop()

    def handle_endtag(self, tag):
        """Store the assembled total price when its <div> closes."""
        if tag == "div" and self.flag and self.flag[-1] == "houseTotlePrice_2":
            self.houseTotlePrice.append(self.houseTotlePrice_tmp)
            self.houseTotlePrice_tmp = ""
            self.flag.pop()
def getHtml(url):
    """Download *url* and return its text, cleaned for the site parsers.

    The body is decoded as UTF-8, then the copyright sign, single quotes
    and all line breaks are removed, because the downstream matching
    cannot cope with line breaks or single quotes.

    Args:
        url: address of the page to fetch.

    Returns:
        The cleaned page content as a ``str``.

    Raises:
        urllib.error.URLError: if the page cannot be retrieved.
        UnicodeDecodeError: if the body is not valid UTF-8.
    """
    # NOTE(review): the header value itself starts with "User-Agent:". It is
    # kept as-is because changing what is sent may change how the sites answer.
    headers = {'User-Agent': 'User-Agent:Mozilla/5.0'}
    request = urllib.request.Request(url, headers=headers)
    # The context manager closes the connection even if read() fails.
    with urllib.request.urlopen(request) as page:
        raw = page.read()
    text = raw.decode('UTF-8')
    for unwanted in (u'\xa9', "'", "\r\n", "\n"):
        text = text.replace(unwanted, "")
    return text
class GanjiParser(HTMLParser):
    """Placeholder parser for Ganji search-result pages.

    No tag handlers are implemented, so nothing is ever collected and
    ``feed`` returns empty columns for any input.
    """

    # Names of the collected columns, in the order ``feed`` returns them.
    _COLUMNS = ("houseName", "villageName", "houseNote", "houseTotlePrice",
                "houseUnitPrice", "houseLink", "houseImg")

    def __init__(self):
        super().__init__()
        # Scratch buffer for span text
        self.span = ""
        # Listing title
        self.houseName = []
        # Residential community name
        self.villageName = []
        # House description
        self.houseNote = []
        # Total price
        self.houseTotlePrice = []
        # Unit price
        self.houseUnitPrice = []
        # Link to the listing
        self.houseLink = []
        # First picture of the listing
        self.houseImg = []
        # Stack of markers naming the field the next text belongs to
        self.flag = []

    def feed(self, data):
        """Parse *data* and return the collected columns.

        Returns:
            An 8-tuple of lists: the seven columns plus a follower-count
            column of zeros.

        Raises:
            ValueError: if the columns do not all have the same length.
        """
        super().feed(data)
        collected = [getattr(self, label) for label in self._COLUMNS]
        expected = len(self.houseName)
        if any(len(values) != expected for values in collected):
            detail = ",".join(
                "%s-%d" % (label, len(values))
                for label, values in zip(self._COLUMNS, collected)
            )
            raise ValueError("数据个数不一致:" + detail)
        return tuple(collected) + ([0] * len(self.houseImg),)
class LianjiaParser(HTMLParser):
    """Collect listing fields from a Lianjia search-result page.

    ``self.flag`` is a stack of markers naming the field the next text
    belongs to.  Every ``<span>`` pushes a ``"span"`` marker; when its text
    arrives, the marker underneath decides where that text is stored.
    """

    def __init__(self):
        super().__init__()
        # Scratch buffer: text of the most recent <span> (total/unit price)
        self.span = ""
        # Listing title
        self.houseName = []
        # Residential community name
        self.villageName = []
        # House description
        self.houseNote = []
        # Buffer used to assemble one description from several text pieces
        self.houseNote_tmp = ""
        # Total price
        self.houseTotlePrice = []
        # Unit price
        self.houseUnitPrice = []
        # Link to the listing
        self.houseLink = []
        # First picture of the listing
        self.houseImg = []
        # Number of followers
        self.followNum = []
        # Stack of markers naming the field the next text belongs to
        self.flag = []

    @staticmethod
    def _follow_count(text):
        """Return the number in front of '人' in *text*, or 0 if there is none."""
        head = text.replace(' ', '').split('人')[0]
        try:
            return int(head)
        except ValueError:
            # A non-numeric follow text must not abort the whole page; 0 keeps
            # the columns aligned.
            return 0

    def feed(self, data):
        """Parse *data* and return the collected columns.

        Returns:
            An 8-tuple of lists: names, communities, notes, total prices,
            unit prices, links, images and follower counts.

        Raises:
            ValueError: if the columns do not all have the same length.
        """
        super().feed(data)
        columns = (
            ("houseName", self.houseName),
            ("villageName", self.villageName),
            ("houseNote", self.houseNote),
            ("houseTotlePrice", self.houseTotlePrice),
            ("houseUnitPrice", self.houseUnitPrice),
            ("houseLink", self.houseLink),
            ("houseImg", self.houseImg),
            ("followNum", self.followNum),
        )
        if len({len(values) for _, values in columns}) > 1:
            detail = ",".join("%s-%d" % (label, len(values)) for label, values in columns)
            raise ValueError("数据个数不一致:" + detail)
        return (self.houseName, self.villageName, self.houseNote, self.houseTotlePrice,
                self.houseUnitPrice, self.houseLink, self.houseImg, self.followNum)

    def handle_starttag(self, tag, attrs):
        """Push markers for, or read attributes of, the opened tag."""
        if tag == "span":
            self.flag.append("span")
        elif tag == "a":
            if ("data-el", "ershoufang") in attrs and ("class", "") in attrs:
                self.flag.append("houseName")
                for name, value in attrs:
                    if name == "href":
                        self.houseLink.append(value)
            elif ("data-el", "region") in attrs or ("class", "no_resblock_a") in attrs:
                self.flag.append("villageName")
        elif tag == "div":
            if ("class", "houseInfo") in attrs:
                self.flag.append("houseNote")
            elif ("class", "totalPrice totalPrice2") in attrs:
                self.flag.append("houseTotlePrice_2")
            elif ("class", "unitPrice") in attrs:
                self.flag.append("houseUnitPrice_2")
            elif ("class", "followInfo") in attrs:
                self.flag.append("followNum")
        elif tag == "img" and ("class", "lj-lazy") in attrs:
            # Only images that carry an alt attribute are listing pictures;
            # take their first lazy-load source.
            if any(name == "alt" for name, _ in attrs):
                for name, value in attrs:
                    if name == "data-original":
                        self.houseImg.append(value)
                        break

    def handle_data(self, data):
        """Route a piece of text (spaces removed) to the current field."""
        data = data.replace(' ', '')
        if not self.flag:
            return
        current = self.flag[-1]
        if current == "span":
            self.span = data
            self.flag.pop()
            owner = self.flag[-1] if self.flag else None
            if owner == "houseUnitPrice_2":
                self.houseUnitPrice.append(self.span)
                self.flag.pop()
            elif owner == "followNum":
                self.followNum.append(self._follow_count(self.span))
                self.flag.pop()
        elif current == "houseName":
            self.houseName.append(data)
            self.flag.pop()
        elif current == "villageName":
            self.villageName.append(data)
            self.flag.pop()
        elif current == "houseTotlePrice_2" and data == "万":
            # The number was captured from the preceding span; the unit text
            # completes the price.
            self.houseTotlePrice.append(self.span + data)
            self.span = ""
            self.flag.pop()
        # Checked after the dispatch on purpose: span text inside a
        # description block is part of the description as well.
        if self.flag and self.flag[-1] == "houseNote":
            self.houseNote_tmp += data

    def handle_endtag(self, tag):
        """Store the assembled description when its <div> closes."""
        if tag == "div" and self.flag and self.flag[-1] == "houseNote":
            self.houseNote.append(self.houseNote_tmp)
            self.houseNote_tmp = ""
            self.flag.pop()
class readData():
    '''
    Read stored listing data back from the configured backend.

    The backend is chosen by ``config['savetype']['type']``: ``'mysql'`` or
    ``'leancloud'``.  Any other value makes the public readers return None.
    '''

    def __init__(self, config):
        # Parsed config.ini (or any mapping with the same sections/keys).
        self._config = config

    # Read the list of leancloud class (table) names, newest name first.
    def _read_leancloud_tablenames(self):
        import requests
        url = 'https://tpra4qll.api.lncld.net/1.1/schemas'
        head = {
            "X-LC-Id": self._config['leancloud']['appid'],
            "X-LC-Key": self._config['leancloud']['masterkey'] + ',master'
        }
        response = requests.get(url, headers=head)
        return sorted(response.json().keys(), reverse=True)

    # Read all rows of one leancloud class, page by page.
    def _read_leancloud_data(self, tablename):
        import requests
        import pandas as pd

        url = 'https://tpra4qll.api.lncld.net/1.1/classes/'
        limit = 200
        skip = 0
        head = {
            "X-LC-Id": self._config['leancloud']['appid'],
            "X-LC-Key": self._config['leancloud']['appkey'],
            "Content-Type": "application/json"
        }
        pages = []
        while True:
            response = requests.get(url + str(tablename) + '?limit=' + str(limit) + '&skip=' + str(skip), headers=head)
            # Decode the body once per page instead of once per use.
            results = response.json()["results"]
            if len(results) == 0:
                # An empty page marks the end of the data.
                break
            pages.append(pd.DataFrame(results))
            skip = skip + limit
        # pd.concat replaces DataFrame.append, which no longer exists in
        # pandas 2; it needs at least one frame.
        data = pd.concat(pages) if pages else pd.DataFrame()
        data = data.drop_duplicates(['houseLink'])
        return data

    # Open a MySQL connection from the [mysql] section of the config.
    def _mysql_connect(self):
        import mysql.connector

        return mysql.connector.connect(
            host=self._config.get('mysql', 'host'),
            user=self._config.get('mysql', 'user'),
            password=self._config.get('mysql', 'passwd'),
            database=self._config.get('mysql', 'db'),
            port=self._config.getint('mysql', 'port'),
            use_unicode=True)

    # Run one query and return its result as a DataFrame; the connection is
    # always closed afterwards.
    def _mysql_query(self, sql):
        import pandas as pd

        conn = self._mysql_connect()
        try:
            return pd.read_sql(sql, conn)
        finally:
            conn.close()

    # Read the list of MySQL table names, in descending name order.
    def _read_mysql_tablenames(self):
        get_tableNames_sql = """select table_name from information_schema.tables order by table_name DESC """
        return self._mysql_query(get_tableNames_sql).iloc[:, 0].tolist()

    # Read all rows of one MySQL table.
    def _read_mysql_data(self, tablename):
        # The table name comes from the table listing above, not from user
        # input; identifiers cannot be bound as query parameters.
        get_data_sql = """select * from %s""" % tablename
        return self._mysql_query(get_data_sql)

    def read_tablenames(self):
        """Return the table names of the configured backend (None if unknown)."""
        if self._config['savetype']['type'] == 'mysql':
            return self._read_mysql_tablenames()
        elif self._config['savetype']['type'] == 'leancloud':
            return self._read_leancloud_tablenames()

    def read_data(self, tablename):
        """Return the rows of *tablename* as a DataFrame (None if backend unknown)."""
        if self._config['savetype']['type'] == 'mysql':
            return self._read_mysql_data(tablename)
        elif self._config['savetype']['type'] == 'leancloud':
            return self._read_leancloud_data(tablename)
row.houseLink].houseTotlePrice.iloc[0]})), ignore_index=True) 40 | else: 41 | other = other.append(row.append(pd.Series({'old_houseTotlePrice': olddata[olddata.houseLink == row.houseLink].houseTotlePrice.iloc[0]})), ignore_index=True) 42 | else: 43 | new = new.append(row.append(pd.Series({'old_houseTotlePrice': '-'})), ignore_index=True) 44 | new['sign'] = '新增' 45 | down['sign'] = '下降' 46 | up['sign'] = '上升' 47 | other['sign'] = '不变' 48 | result = ''' 49 |

较上%s天

50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | ''' % (day, id) 66 | result += self._get_tbody_label(new) 67 | result += self._get_tbody_label(down) 68 | result += self._get_tbody_label(up) 69 | result += self._get_tbody_label(other) 70 | result += ''' 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 |
升降标志房屋名房屋备注房屋总价房屋历史总价房屋单价关注人数小区名房屋链接来源网站
升降标志房屋名房屋备注房屋总价房屋历史总价房屋单价关注人数小区名房屋链接来源网站
''' 87 | return result 88 | 89 | def _get_tbody_label(self, data): 90 | result = '' 91 | for index, row in data.iterrows(): 92 | result += ''' 93 | 94 | %s 95 | %s 96 | %s 97 | %s 98 | %s 99 | %s 100 | %s 101 | %s 102 | %s 103 | %s 104 | '''% (row.sign, row.houseName, row.houseNote, row.houseTotlePrice, row.old_houseTotlePrice, row.houseUnitPrice, row.followNum, row.villageName, row.houseLink if row.houseLink[:4]=="http" else "http://" + row.houseLink, row.houseLink, row.webName) 105 | return result 106 | 107 | # 生成报告文件 108 | def get_report(self): 109 | import datetime 110 | import configparser 111 | import os 112 | 113 | start = time.perf_counter() 114 | # 获取参数 115 | config = configparser.ConfigParser() 116 | config.read("config.ini") 117 | 118 | # 时间差 119 | day1 = None 120 | day7 = None 121 | day30 = None 122 | 123 | # 表名 124 | tn0 = None 125 | tn1 = None 126 | tn7 = None 127 | tn30 = None 128 | 129 | # 数据 130 | data0 = None 131 | data1 = None 132 | data7 = None 133 | data30 = None 134 | 135 | # 获得所有表名 136 | read = readData(config) 137 | tablenames = read.read_tablenames() 138 | 139 | # 得到当天,一天前,一周前,一月前的表名 140 | day = 1 141 | while True: 142 | day = day - 1 143 | if day == -60: 144 | break 145 | res = None 146 | timestring = (datetime.datetime.now() + datetime.timedelta(days=day)).strftime('%Y%m%d') 147 | for tablename in tablenames: 148 | if tablename[1:9] == timestring: 149 | res = tablename 150 | break 151 | if res is not None: 152 | if day == 0: 153 | tn0 = res 154 | elif day > -7: 155 | tn1 = res 156 | day1 = day 157 | day = -6 158 | elif day > -15: 159 | tn7 = res 160 | day7 = day 161 | day = -29 162 | elif day > -60: 163 | tn30 = res 164 | day30 = day 165 | else: 166 | break 167 | elif day == 0: 168 | raise ValueError("当天数据不存在。") 169 | if tn0 is not None: 170 | # 得到当天的数据 171 | data0 = read.read_data(tn0) 172 | if tn1 is not None: 173 | # 得到day1天前的数据 174 | data1 = read.read_data(tn1) 175 | if tn7 is not None: 176 | # 得到day7天的数据 177 | data7 = 
class saveData():
    """Store parsed house listings in the backend selected by the config.

    ``config`` is expected to behave like a ``configparser.ConfigParser``:
    it is read both by subscription (``config['savetype']['type']``) and
    through ``get`` / ``getint``. The ``[savetype] type`` value selects the
    backend (``mysql`` or ``leancloud``); with any other value the public
    methods do nothing.

    Every row written during the same local clock hour goes to one table
    whose name is derived from the current date and hour.
    """

    # Column order shared by the MySQL table, the LeanCloud object and the
    # positional arguments of the ``_save_*`` methods.
    _COLUMNS = ('webName', 'houseName', 'villageName', 'houseNote', 'houseTotlePrice',
                'houseUnitPrice', 'houseLink', 'houseImg', 'followNum')

    def __init__(self, config):
        self._config = config

    @staticmethod
    def _table_name():
        """Return the table name for the current local date and hour."""
        import time
        return 'T' + time.strftime('%Y%m%d%H', time.localtime(time.time())) + 'TheFutureOfHome'

    @staticmethod
    def _rows(webName, houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice, houseLink,
              houseImg, followNum):
        """Turn the per-column lists into one tuple per listing.

        ``houseName`` decides how many rows there are; a shorter sibling
        list raises ``IndexError``, as the index-based loops always did.
        """
        return [
            (webName, name, villageName[i], houseNote[i], houseTotlePrice[i], houseUnitPrice[i],
             houseLink[i], houseImg[i], followNum[i])
            for i, name in enumerate(houseName)
        ]

    def _connect_mysql(self):
        """Open a MySQL connection from the ``[mysql]`` config section."""
        import pymysql
        # Silences the "table already exists" warning (and, as a side
        # effect, every other warning in the process).
        import warnings
        warnings.filterwarnings("ignore")
        return pymysql.connect(
            host=self._config.get('mysql', 'host'),
            port=self._config.getint('mysql', 'port'),
            user=self._config.get('mysql', 'user'),
            passwd=self._config.get('mysql', 'passwd'),
            db=self._config.get('mysql', 'db'),
            charset='utf8')

    # Clear the LeanCloud data
    def _delete_leancloud(self):
        """Call ``destroy()`` on a new object of the current table's class."""
        import leancloud
        # Initialise LeanCloud
        leancloud.init(self._config['leancloud']['appid'], self._config['leancloud']['appkey'])
        # NOTE(review): the object destroyed here was never saved, so this
        # presumably removes nothing from the cloud table - confirm against
        # the LeanCloud SDK before relying on it to clear old data.
        HouseRecord = leancloud.Object.extend(self._table_name())
        HouseRecord().destroy()

    # Save to LeanCloud
    def _save_leancloud(self, webName, houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice, houseLink,
                        houseImg, followNum):
        """Save one LeanCloud object per listing, continuing past failures.

        A row that fails to save is printed together with the error and the
        remaining rows are still attempted. The final message reports the
        number of rows that were actually saved.
        """
        import leancloud
        rows = self._rows(webName, houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice,
                          houseLink, houseImg, followNum)
        # Initialise LeanCloud
        leancloud.init(self._config['leancloud']['appid'], self._config['leancloud']['appkey'])
        HouseRecord = leancloud.Object.extend(self._table_name())
        saved_rows = 0
        for row in rows:
            record = HouseRecord()
            for column, value in zip(self._COLUMNS, row):
                record.set(column, value)
            try:
                record.save()
            except Exception as e:
                # Best effort: report the failing row and keep going.
                print(e)
                print(
                    "webName:%s\nhouseName:%s\nvillageName:%s\nhouseNote:%s\nhouseTotlePrice:%s\nhouseUnitPrice:%s\nhouseLink:%s\nhouseImg:%s\nfollowNum:%s\n" % row)
            else:
                saved_rows += 1
        print(webName + ' saved ' + str(saved_rows) + ' rows.')

    # Clear the MySQL data
    def _delete_mysql(self):
        """Drop the current hour's table if it exists."""
        conn = self._connect_mysql()
        try:
            cursor = conn.cursor()
            try:
                drop_sql = """drop table IF EXISTS %s""" % (self._table_name())
                drop_rows = cursor.execute(drop_sql)
                print('delete ' + str(drop_rows) + ' rows.')
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()

    # Save to MySQL
    def _save_mysql(self, webName, houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice, houseLink,
                    houseImg, followNum):
        """Create the current hour's table if needed and insert all rows.

        Values are sent as query parameters, so quotes or backslashes in
        scraped text cannot break or alter the statement. A failed insert
        is reported and leaves the saved-row count at 0; it does not raise.
        """
        rows = self._rows(webName, houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice,
                          houseLink, houseImg, followNum)
        # The table name is generated locally from the clock, never from
        # scraped input, so interpolating it into the SQL text is safe.
        tablename = self._table_name()
        create_table_sql = """CREATE TABLE IF NOT EXISTS %s (
                            Id int auto_increment,
                            webName varchar(255),
                            houseName varchar(255),
                            villageName varchar(255),
                            houseNote varchar(255),
                            houseTotlePrice varchar(255),
                            houseUnitPrice varchar(255),
                            houseLink varchar(255),
                            houseImg varchar(255),
                            followNum varchar(255),
                            primary key(Id)
                            )
                            ENGINE=InnoDB DEFAULT CHARSET=utf8;""" % (tablename)
        # "%%s" survives the table-name interpolation as the driver's "%s"
        # parameter placeholder.
        insert_sql = """insert into %s (webName, houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice, houseLink, houseImg, followNum) values (%%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s, %%s)""" % (
            tablename)

        conn = self._connect_mysql()
        try:
            cursor = conn.cursor()
            try:
                cursor.execute(create_table_sql)
                saved_rows = 0
                if rows:
                    try:
                        saved_rows = cursor.executemany(insert_sql, rows) or 0
                    except Exception as e:
                        # Best effort per site: report and carry on so one
                        # bad page does not abort the whole crawl.
                        print(e)
                        print(insert_sql)
                print(webName + ' saved ' + str(saved_rows) + ' rows.')
                conn.commit()
            finally:
                cursor.close()
        finally:
            conn.close()

    def deleteOldData(self):
        """Clear the current hour's data in the configured backend."""
        if self._config['savetype']['type'] == 'mysql':
            self._delete_mysql()
        elif self._config['savetype']['type'] == 'leancloud':
            self._delete_leancloud()

    def _saveData(self, *args):
        """Forward ``(webName, <eight column lists>)`` to the configured backend."""
        if self._config['savetype']['type'] == 'mysql':
            self._save_mysql(*args)
        elif self._config['savetype']['type'] == 'leancloud':
            self._save_leancloud(*args)

    def _parse_and_save(self, parser, webName, html):
        """Feed ``html`` to ``parser`` and store the eight lists it returns."""
        houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice, houseLink, houseImg, followNum = parser.feed(html)
        self._saveData(webName, houseName, villageName, houseNote, houseTotlePrice, houseUnitPrice, houseLink, houseImg, followNum)

    # Beike (ke.com)
    def beike_save(self, html):
        """Parse a Beike result page and store its listings."""
        from .beike import BeikeParser
        self._parse_and_save(BeikeParser(), '贝壳', html)

    # Lianjia
    def lianjia_save(self, html):
        """Parse a Lianjia result page and store its listings."""
        from .lianjia import LianjiaParser
        self._parse_and_save(LianjiaParser(), '链家', html)

    # 58.com
    def tongcheng_save(self, html):
        """Parse a 58.com result page and store its listings."""
        from .tongcheng import TongchengParser
        self._parse_and_save(TongchengParser(), '58同城', html)

    # Anjuke
    def anjuke_save(self, html):
        """Parse an Anjuke result page and store its listings."""
        from .anjuke import AnjukeParser
        self._parse_and_save(AnjukeParser(), '安居客', html)

    # Ganji
    def ganji_save(self, html):
        """Parse a Ganji result page and store its listings."""
        from .ganji import GanjiParser
        self._parse_and_save(GanjiParser(), '赶集', html)
104 | 105 | 110 | %s
111 | %s
112 | %s
113 |
114 | 115 | 116 | ''' 117 | -------------------------------------------------------------------------------- /source/tongcheng.py: -------------------------------------------------------------------------------- 1 | # encoding:utf-8 2 | 3 | from html.parser import HTMLParser 4 | 5 | 6 | class TongchengParser(HTMLParser): 7 | def __init__(self): 8 | super().__init__() 9 | # 存储中间数据(58同城为小区名,房屋介绍) 10 | self.span_a = "" 11 | # 存储中间数据(58同城为总价) 12 | self.b = "" 13 | # 房屋名称 14 | self.houseName = [] 15 | # 小区名称 16 | self.villageName = [] 17 | # 房子介绍 18 | self.houseNote = [] 19 | # 总价 20 | self.houseTotlePrice = [] 21 | # 单价 22 | self.houseUnitPrice = [] 23 | # 房屋链接 24 | self.houseLink = [] 25 | # 第一张图片 26 | self.houseImg = [] 27 | # 用于标记数据类型 28 | self.flag = [] 29 | 30 | def feed(self, data): 31 | super().feed(data) 32 | # 校验数据个数是否统一 33 | size = len(self.houseName) 34 | if len(self.houseName) != size or len(self.villageName) != size or len(self.houseNote) != size \ 35 | or len(self.houseTotlePrice) != size or len(self.houseUnitPrice) != size or len(self.houseLink) != size \ 36 | or len(self.houseImg) != size: 37 | raise ValueError("数据个数不一致:houseName-" + str(len(self.houseName)) + ",villageName-" + str(len(self.villageName)) + 38 | ",houseNote-" + str(len(self.houseNote)) + ",houseTotlePrice-" + str(len(self.houseTotlePrice)) + 39 | ",houseUnitPrice-" + str(len(self.houseUnitPrice)) + ",houseLink-" + str(len(self.houseLink)) + 40 | ",houseImg-" + str(len(self.houseImg))) 41 | return self.houseName, self.villageName, self.houseNote, self.houseTotlePrice, self.houseUnitPrice, self.houseLink, self.houseImg, [0]*len(self.houseImg) 42 | 43 | def handle_starttag(self, tag, attrs): 44 | if tag == "span": 45 | self.flag.append("span") 46 | elif tag == "a" and ("tongji_label", "listclick") in attrs: 47 | self.flag.append("houseName") 48 | for attr in attrs: 49 | if attr[0] == "href": 50 | self.houseLink.append(attr[1]) 51 | break 52 | elif tag == "a" and len(self.flag) >= 1 and 
self.flag[-1] == "houseNote_2": 53 | self.flag.append("a") 54 | self.flag[-2] = 'villageName_2' 55 | elif tag == "a": 56 | self.flag.append("a") 57 | elif tag == "b": 58 | self.flag.append("b") 59 | elif tag == "p" and ("class", "baseinfo") in attrs: 60 | self.flag.append("houseNote_2") 61 | self.span_a = '' 62 | elif tag == "p" and ("class", "sum") in attrs: 63 | self.flag.append("houseTotlePrice_2") 64 | elif tag == "p" and ("class", "unit") in attrs: 65 | self.flag.append("houseUnitPrice") 66 | elif tag == "img": 67 | for attr in attrs: 68 | if attr[0] == "data-src": 69 | self.houseImg.append(attr[1]) 70 | break 71 | 72 | def handle_endtag(self, tag): 73 | if len(self.flag) != 0: 74 | if tag == "p" and self.flag[-1] == "villageName_2": 75 | # 此时为villageName的结束 76 | # print(self.span.encode('GB18030')) 77 | self.villageName.append(self.span_a.replace(' ', '')) 78 | self.flag.pop() 79 | self.span_a = "" 80 | elif tag == "p" and self.flag[-1] == "houseNote_2": 81 | # 此时为houseNote的结束 82 | # print(self.span.encode('GB18030')) 83 | self.houseNote.append(self.span_a.replace(' ', '')) 84 | self.flag.pop() 85 | self.span_a = "" 86 | 87 | def handle_data(self, data): 88 | if len(self.flag) != 0: 89 | if self.flag[-1] == "span": 90 | # print(str(data)) 91 | self.span_a += data.strip() 92 | self.flag.pop() 93 | elif self.flag[-1] == "a": 94 | # print(str(data)) 95 | self.span_a += data.strip() 96 | self.flag.pop() 97 | elif self.flag[-1] == "b": 98 | # print(str(data)) 99 | self.b += data 100 | self.flag.pop() 101 | elif self.flag[-1] == "houseName": 102 | # print(str(data)) 103 | self.houseName.append(data) 104 | self.flag.pop() 105 | elif self.flag[-1] == "houseTotlePrice_2" and data.replace(' ', '') != '': 106 | # print(str(data)) 107 | self.houseTotlePrice.append(self.b + data.replace(' ', '')) 108 | self.b = "" 109 | self.flag.pop() 110 | elif self.flag[-1] == "houseUnitPrice": 111 | self.houseUnitPrice.append(data) 112 | self.flag.pop() 113 | 114 | 
--------------------------------------------------------------------------------