├── ErShouFang ├── __init__.py ├── crawl.py ├── crawl_test.py ├── dataSight.py ├── extensions.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders │ ├── FangTianXia.py │ ├── LianJia.py │ ├── Wiwj.py │ ├── ZhongYuan.py │ └── __init__.py ├── 二手房数据.png ├── 北京二手房在售数量.png ├── 北京二手房平均价格.png ├── 北京二手房平均面积.png ├── 北京二手房数据分布.png └── 北京最贵小区表.xls ├── README.md └── scrapy.cfg /ErShouFang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/__init__.py -------------------------------------------------------------------------------- /ErShouFang/crawl.py: -------------------------------------------------------------------------------- 1 | # Create time:2018-12-25 17:44 2 | # Author:Chen 3 | 4 | from scrapy.crawler import CrawlerProcess 5 | from scrapy.utils.project import get_project_settings 6 | 7 | if __name__ == "__main__": 8 | p = CrawlerProcess(get_project_settings()) 9 | p.crawl("ZhongYuan") 10 | # p.crawl("Wiwj") 11 | # p.crawl("LianJia") 12 | # p.crawl("FangTianXia") 13 | p.start() 14 | -------------------------------------------------------------------------------- /ErShouFang/crawl_test.py: -------------------------------------------------------------------------------- 1 | # Create time:2018-12-25 19:01 2 | # Author:Chen 3 | 4 | import requests 5 | import random 6 | import redis 7 | 8 | 9 | url_list = [f'https://bj.lianjia.com/ershoufang/{region}' for region in \ 10 | ["dongcheng", "xicheng", "chaoyang", "haidian", "fengtai", "shijingshan", 11 | "tongzhou", "changping", "daxing", "yizhuangkaifaqu", "yizhuangkaifaqu", 12 | "shunyi", "fangshan", "mentougou", "pinggu", "huairou", "miyun", "yanqing"]] 13 | 14 | 15 | ua = ["Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1", 16 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0", 17 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", 18 | 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0', 19 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 20 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13', 21 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/533.3', 22 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8', 23 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3', 24 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 25 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 26 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 27 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 28 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', 29 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/17.0.940.0 Safari/535.8', 30 | 'Mozilla/5.0 (Windows NT 6.1) 
AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7ad-imcjapan-syosyaman-xkgi3lqg03!wgz'
31 | ]
32 | 
33 | con = redis.Redis()
34 | 
35 | for url in url_list:
36 |     r = requests.get(url, headers={"User-Agent": random.choice(ua)}, proxies={'https': 'https://' + con.srandmember("qingting_https_proxies", 1)[0].decode("utf-8")})
37 |     r.encoding = r.apparent_encoding
38 |     print(r.text)
--------------------------------------------------------------------------------
/ErShouFang/dataSight.py:
--------------------------------------------------------------------------------
1 | # Create time:2018-12-26 15:49
2 | # Author:Chen
3 | # Data visualization
4 | 
5 | from pyecharts import Bar, Pie
6 | import pymongo
7 | import datetime
8 | import xlwt
9 | 
10 | 
11 | class DataSight:
12 | 
13 |     def __init__(self):
14 |         self.mongo_cli = pymongo.MongoClient('127.0.0.1', 27017).ErShouFang.houses
15 |         self.sub_title = "作者:Chen"
16 | 
17 |     def avg_area(self):
18 |         """
19 |         Render the average-floor-area-per-district chart.
20 |         :return:
21 |         """
22 |         self.render_data("北京二手房平均面积", "平方米", "region", "avg", "$area", "柱状图")
23 | 
24 |     def avg_unit_price(self):
25 |         """
26 |         Render the average-price-per-district chart.
27 |         :return:
28 |         """
29 |         self.render_data("北京二手房平均价格", "元/平方米", "region", "avg", "$unit_price", "柱状图")
30 | 
31 |     def count(self):
32 |         """
33 |         Render the listings-on-sale-per-district chart.
34 |         :return:
35 |         """
36 |         self.render_data("北京二手房在售数量", "套", "region", "sum", 1, "柱状图")
37 | 
38 |     def agency_count(self):
39 |         """
40 |         Render a pie chart of listings scraped from each of the four agencies.
41 |         :return:
42 |         """
43 |         self.render_data("北京二手房数据分布", "套", "agency", "sum", 1, "饼状图")
44 | 
45 |     def community_avg_price(self):
46 |         excel = xlwt.Workbook()
47 |         for region in ["西城", "东城", "朝阳", "海淀"]:
48 |             arg = [
49 |                 {'$match': {"region": region}},
50 |                 {"$group": {"_id": "$community", "avg_price": {"$avg": "$unit_price"}, "count": {"$sum": 1}}},
51 |                 {"$match": {"count": {"$gt": 4}}},
52 |                 {"$sort": {"avg_price": -1}},
53 |                 {"$limit": 10}
54 |             ]
55 |             result = self.mongo_cli.aggregate(arg)
56 |             sheet = excel.add_sheet(region)
57 |             sheet.write(0, 0, "小区名称")
58 |             sheet.write(0, 1, "平均价格")
59 |             for i, data in enumerate(result):
60 |                 sheet.write(i + 1, 0, data['_id'])
61 |                 sheet.write(i + 1, 1, int(data['avg_price']))
62 |         excel.save("北京最贵小区表.xls")
63 | 
64 |     def render_data(self, title_name, unit_name, group_name, accumulate_type, accumulate_name, image_type):
65 |         """
66 |         Render one chart and save it to disk.
67 |         :param title_name: chart title, also used as the output file name (str)
68 |         :param unit_name: unit label shown in the legend (str)
69 |         :param group_name: key in the database used to group the documents (str)
70 |         :param accumulate_type: aggregation operator, e.g. sum or avg (str)
71 |         :param accumulate_name: key in the database to aggregate, prefixed with $
72 |         :param image_type: chart type, 柱状图 (bar) or 饼状图 (pie)
73 |         :return:
74 |         """
75 |         arg = [{"$group": {"_id": f"${group_name}", "result": {f"${accumulate_type}": accumulate_name}}}]
76 |         result = list(self.mongo_cli.aggregate(pipeline=arg))
77 |         # sort in descending order
78 |         result.sort(key=lambda x: x['result'], reverse=True)
79 |         for i in result:
80 |             if i['_id'] == "亦庄开发区":
81 |                 i['_id'] = "亦庄"
82 |         if image_type == "柱状图":
83 |             bar = Bar(f"{title_name}{datetime.datetime.now().strftime('%m%d')}", self.sub_title)
84 |         elif image_type == "饼状图":
85 |             bar = Pie(f"{title_name}{datetime.datetime.now().strftime('%m%d')}", self.sub_title)
86 |         else:
87 |             raise Exception("Undefined chart type")
88 |         bar.add(
89 |             f"单位:{unit_name}",
90 |             [r['_id'] for r in result],
91 |             [int(r['result']) for r in result],
92 |             xaxis_interval=0,
93 |             is_label_show=True
94 |         )
95 |         bar.render(path=f'{title_name}.png')
96 | 
97 | 
98 | if __name__ == "__main__":
99 |     DataSight().community_avg_price()
100 | 
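# --- Added usage sketch (not part of the original module). It assumes only what
# __init__ above already hard-codes: a local MongoDB at 127.0.0.1:27017 whose
# ErShouFang.houses collection has been filled by the spiders.
def render_all_reports():
    """Generate every chart and the Excel sheet produced by this project."""
    sight = DataSight()
    sight.avg_area()             # -> 北京二手房平均面积.png
    sight.avg_unit_price()       # -> 北京二手房平均价格.png
    sight.count()                # -> 北京二手房在售数量.png
    sight.agency_count()         # -> 北京二手房数据分布.png
    sight.community_avg_price()  # -> 北京最贵小区表.xls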
--------------------------------------------------------------------------------
/ErShouFang/extensions.py:
--------------------------------------------------------------------------------
1 | # Create time:2018-12-24 22:05
2 | # Author:Chen
3 | from scrapy import signals
4 | from scrapy.exceptions import NotConfigured
5 | import requests
6 | import time
7 | import threading
8 | import json
9 | import logging
10 | import redis
11 | 
12 | 
13 | class QingtingProxy:
14 | 
15 |     def __init__(self, proxy_key, redis_setting):
16 |         self.url = "https://proxy.horocn.com/api/proxies?order_id=U7G61620840936700598&num=10&format=json&line_separator=win"
17 |         self.spider_running = True
18 |         self.proxy_key = proxy_key
19 |         self.server = redis.Redis(*redis_setting)
20 | 
21 |     @classmethod
22 |     def from_crawler(cls, crawler):
23 |         if crawler.settings.get("EXTENSIONS_DO"):
24 |             spider = cls(crawler.settings.get("PROXY_KEY"), crawler.settings.getlist("REDIS"))
25 |             crawler.signals.connect(spider.engine_started, signal=signals.engine_started)
26 |             crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
27 |             return spider
28 |         else:
29 |             raise NotConfigured
30 | 
31 |     def proxy(self, server):
32 |         while self.spider_running:
33 |             r = requests.get(self.url)
34 |             logging.debug(r.text)
35 |             if "不足" in r.text:
36 |                 raise Exception("Proxy account balance is insufficient")
37 |             # list of proxy host:port strings returned by the API
38 |             ip_list = [i['host'] + ":" + str(i["port"]) for i in json.loads(r.text)]
39 |             for ip in ip_list:
40 |                 server.sadd(self.proxy_key + "_pr", ip)
41 |             server.rename(self.proxy_key + "_pr", self.proxy_key)
42 |             # the proxy API allows at most one call every 10 seconds
43 |             time.sleep(10)
44 | 
45 |     def engine_started(self):
46 |         # start maintaining the proxy pool as soon as the engine starts
47 |         threading.Thread(target=self.proxy, args=(self.server,)).start()
48 | 
49 |     def spider_closed(self, spider):
50 |         # stop maintaining the proxy pool when the spider closes
51 |         self.spider_running = False
52 | 
--------------------------------------------------------------------------------
/ErShouFang/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class ErshoufangItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     bedroom = scrapy.Field()
15 |     living_room = scrapy.Field()
16 |     area = scrapy.Field()
17 |     total_price = scrapy.Field()
18 |     unit_price = scrapy.Field()
19 |     region = scrapy.Field()
20 |     community = scrapy.Field()
21 |     agency = scrapy.Field()
22 |     date = scrapy.Field()
23 | 
24 | 
--------------------------------------------------------------------------------
/ErShouFang/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals, Request
9 | import random
10 | import time
11 | import logging
12 | import redis
13 | 
14 | 
15 | class ErshoufangSpiderMiddleware(object):
16 |     # Not all methods need to be defined. If a method is not defined,
17 |     # scrapy acts as if the spider middleware does not modify the
18 |     # passed objects.
19 | 
20 |     @classmethod
21 |     def from_crawler(cls, crawler):
22 |         # This method is used by Scrapy to create your spiders.
23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(self, response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(self, response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(self, response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Response, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(self, start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class ErshoufangDownloaderMiddleware(object): 64 | # Not all methods need to be defined. If a method is not defined, 65 | # scrapy acts as if the downloader middleware does not modify the 66 | # passed objects. 67 | 68 | def __init__(self, ua): 69 | self.ua = ua 70 | 71 | @classmethod 72 | def from_crawler(cls, crawler): 73 | # This method is used by Scrapy to create your spiders. 74 | s = cls(crawler.settings.getlist("USER_AGENT")) 75 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 76 | return s 77 | 78 | def process_request(self, request, spider): 79 | # Called for each request that goes through the downloader 80 | # middleware. 
81 | 82 | # Must either: 83 | # - return None: continue processing this request 84 | # - or return a Response object 85 | # - or return a Request object 86 | # - or raise IgnoreRequest: process_exception() methods of 87 | # installed downloader middleware will be called 88 | # 我爱我家反爬,需要cookie 89 | if "5i5j" in request.url: 90 | request.cookies = { 91 | 'PHPSESSID': '1p63h7he47e4jb21uf3qpqa31h', 92 | ' domain': 'bj', 93 | ' yfx_c_g_u_id_10000001': '_ck18122702385612291856717911557', 94 | ' yfx_f_l_v_t_10000001': 'f_t_1545849536214__r_t_1545849536214__v_t_1545849536214__r_c_0', 95 | ' yfx_mr_n_10000001': 'baidu::market_type_ppzq::::::::::%E6%A0%87%E9%A2%98::bj.5i5j.com::::::%E5%B7%A6%E4%BE%A7%E6%A0%87%E9%A2%98::%E6%A0%87%E9%A2%98::160::pmf_from_adv::bj.5i5j.com/', 96 | ' yfx_mr_f_n_10000001': 'baidu::market_type_ppzq::::::::::%E6%A0%87%E9%A2%98::bj.5i5j.com::::::%E5%B7%A6%E4%BE%A7%E6%A0%87%E9%A2%98::%E6%A0%87%E9%A2%98::160::pmf_from_adv::bj.5i5j.com/', 97 | ' _ga': 'GA1.2.1709742213.1545849536', 98 | ' _gid': 'GA1.2.1682278583.1545849536', 99 | ' _gat': '1', 100 | ' Hm_lvt_94ed3d23572054a86ed341d64b267ec6': '1545849537', 101 | ' Hm_lpvt_94ed3d23572054a86ed341d64b267ec6': '1545849539', 102 | ' _Jo0OQK': '50ADB2E7DEF3A95BCBAA04D52F5414775A30C0C5815693EB62460770A7531C510DCEFD1A42F959FF2E12A16D4BFD17D9C7D374A1376BEE8ED0AACA8F800B32B3CE3C57212F12283777C840763663251ADEB840763663251ADEBC6107B635DA7B8669547054735CB82A9GJ1Z1XA=='} 103 | request.headers["User-Agent"] = random.choice(self.ua) 104 | return None 105 | 106 | def process_response(self, request, response, spider): 107 | # Called with the response returned from the downloader. 108 | 109 | # Must either; 110 | # - return a Response object 111 | # - return a Request object 112 | # - or raise IgnoreRequest 113 | return response 114 | 115 | def process_exception(self, request, exception, spider): 116 | # Called when a download handler or a process_request() 117 | # (from other downloader middleware) raises an exception. 
118 | 
119 |         # Must either:
120 |         # - return None: continue processing this exception
121 |         # - return a Response object: stops process_exception() chain
122 |         # - return a Request object: stops process_exception() chain
123 |         pass
124 | 
125 |     def spider_opened(self, spider):
126 |         spider.logger.info('Spider opened: %s' % spider.name)
127 | 
128 | 
129 | class RandomProxy:
130 |     """
131 |     Attach a proxy IP to every outgoing request.
132 |     """
133 |     def __init__(self, proxy_key, redis_setting):
134 |         self.proxy_key = proxy_key
135 |         self.server = redis.Redis(*redis_setting)
136 | 
137 |     @classmethod
138 |     def from_crawler(cls, crawler):
139 |         return cls(crawler.settings.get("PROXY_KEY"), crawler.settings.getlist("REDIS"))
140 | 
141 |     def set_proxy(self, server):
142 |         while server.scard(self.proxy_key) == 0:
143 |             # wait for the proxy pool to be initialised
144 |             time.sleep(10)
145 |         return server.srandmember(self.proxy_key, 1)[0].decode("utf-8")
146 | 
147 |     def process_request(self, request, spider):
148 |         request.meta["proxy"] = "https://" + self.set_proxy(self.server)
149 | 
150 |     def process_exception(self, request, exception, spider):
151 |         if exception:
152 |             logging.error("Connection error, sending the request back to the queue")
153 |             return Request(request.url, callback=request.callback, dont_filter=True)
154 | 
155 |     def process_response(self, request, response, spider):
156 |         if response.status != 200:
157 |             logging.error("Unexpected status code, sending the request back to the queue")
158 |             return Request(request.url, callback=request.callback, dont_filter=True)
159 |         return response
--------------------------------------------------------------------------------
/ErShouFang/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | import pymongo
9 | 
10 | class ErshoufangPipeline(object):
11 | 
12 |     def __init__(self, mongo):
13 |         cli = pymongo.MongoClient(*mongo)
14 |         self.collection = cli.ErShouFang.houses
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         return cls(crawler.settings.getlist("MONGO"))
19 | 
20 |     def process_item(self, item, spider):
21 |         self.collection.insert_one(dict(item))
22 |         return item
23 | 
--------------------------------------------------------------------------------
/ErShouFang/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for ErShouFang project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ErShouFang' 13 | 14 | SPIDER_MODULES = ['ErShouFang.spiders'] 15 | NEWSPIDER_MODULE = 'ErShouFang.spiders' 16 | 17 | # redis代理池 18 | REDIS = ['127.0.0.1', 6379] 19 | PROXY_KEY = 'https_proxies' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | USER_AGENT = ["Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1", 23 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0", 24 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", 25 | 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0', 26 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 27 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13', 28 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/533.3', 29 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8', 30 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3', 31 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 32 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 33 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 34 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 35 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', 36 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/17.0.940.0 Safari/535.8', 37 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7ad-imcjapan-syosyaman-xkgi3lqg03!wgz' 38 | ] 39 | 40 | 41 | # Obey robots.txt rules 42 | ROBOTSTXT_OBEY = False 43 | 44 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 45 | #CONCURRENT_REQUESTS = 32 46 | 47 | # Configure a delay for requests for the same website (default: 0) 48 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 49 | # See also autothrottle settings and docs 50 | DOWNLOAD_DELAY = 0 51 | # The download delay setting will honor only one of: 52 | CONCURRENT_REQUESTS_PER_DOMAIN = 48 53 | CONCURRENT_REQUESTS_PER_IP = 48 54 | 55 | # Disable cookies (enabled by default) 56 | COOKIES_ENABLED = True 57 | 58 | # Disable Telnet Console (enabled by default) 59 | #TELNETCONSOLE_ENABLED = False 60 | 61 | # Override the default request headers: 62 | #DEFAULT_REQUEST_HEADERS = { 63 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 64 | # 'Accept-Language': 'en', 65 | #} 66 | 67 | # Enable or disable spider middlewares 68 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 69 | #SPIDER_MIDDLEWARES = { 70 | # 
'ErShouFang.middlewares.ErshoufangSpiderMiddleware': 543, 71 | #} 72 | 73 | # Enable or disable downloader middlewares 74 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 75 | DOWNLOADER_MIDDLEWARES = { 76 | 'ErShouFang.middlewares.ErshoufangDownloaderMiddleware': 543, 77 | 'ErShouFang.middlewares.RandomProxy': 591, 78 | } 79 | 80 | # Enable or disable extensions 81 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 82 | EXTENSIONS = { 83 | 'ErShouFang.extensions.QingtingProxy': 2, 84 | } 85 | # 在true的情况下才运行扩展 86 | EXTENSIONS_DO = False 87 | 88 | # Configure item pipelines 89 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 90 | ITEM_PIPELINES = { 91 | 'ErShouFang.pipelines.ErshoufangPipeline': 300, 92 | } 93 | 94 | # Enable and configure the AutoThrottle extension (disabled by default) 95 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 96 | #AUTOTHROTTLE_ENABLED = True 97 | # The initial download delay 98 | #AUTOTHROTTLE_START_DELAY = 5 99 | # The maximum download delay to be set in case of high latencies 100 | #AUTOTHROTTLE_MAX_DELAY = 60 101 | # The average number of requests Scrapy should be sending in parallel to 102 | # each remote server 103 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 104 | # Enable showing throttling stats for every response received: 105 | #AUTOTHROTTLE_DEBUG = False 106 | 107 | # Enable and configure HTTP caching (disabled by default) 108 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 109 | #HTTPCACHE_ENABLED = True 110 | #HTTPCACHE_EXPIRATION_SECS = 0 111 | #HTTPCACHE_DIR = 'httpcache' 112 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 113 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 114 | -------------------------------------------------------------------------------- /ErShouFang/spiders/FangTianXia.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import datetime 5 | from ErShouFang.items import ErshoufangItem 6 | 7 | 8 | class FangtianxiaSpider(scrapy.Spider): 9 | name = 'FangTianXia' 10 | start_urls = [f'https://esf.fang.com{region}' for region in\ 11 | ["/house-a01/", "/house-a00/", "/house-a06/", "/house-a03/", "/house-a02/", 12 | "/house-a012/", "/house-a0585/", "/house-a010/", "/house-a08/", "/house-a011/", 13 | "/house-a07/", "/house-a013/", "/house-a09/", "/house-a014/", "/house-a015/", "/house-a016/"]] 14 | 15 | def parse(self, response): 16 | max_page = response.xpath("//div[@class='page_al']/p[last()]/text()").extract_first() 17 | max_page_num = re.findall('共(\d+)页', max_page)[0] 18 | for page in range(1, int(max_page_num) + 1): 19 | yield scrapy.Request(response.url + f"i3{page}/", callback=self.page_parse) 20 | 21 | def page_parse(self, response): 22 | region = response.xpath("//div[@class='term_screen2 clearfix']/ul/li[1]/a/text()").extract_first() 23 | date = datetime.datetime.now().strftime("%Y-%m-%d") 24 | dls = response.xpath("//div[@class='shop_list shop_list_4']/dl") 25 | for dl in dls: 26 | room = dl.xpath(".//p[@class='tel_shop']/text()[1]").extract_first() 27 | if room: 28 | area = dl.xpath(".//p[@class='tel_shop']/text()[2]").extract_first() 29 | total_price = dl.xpath(".//dd[@class='price_right']/span/b/text()").extract_first() 30 | unit_price = dl.xpath(".//dd[@class='price_right']/span[2]/text()").extract_first() 31 | community = dl.xpath(".//p[@class='add_shop']/a/@title").extract_first() 
32 |                 yield ErshoufangItem(
33 |                     bedroom=int(re.findall("(\d)室", room)[0]),
34 |                     living_room=int(re.findall("(\d)厅", room)[0]),
35 |                     area=float(re.findall("(.+)㎡", area)[0]),
36 |                     total_price=int(total_price),
37 |                     unit_price=int(re.findall('(\d+)元/㎡', unit_price)[0]),
38 |                     region=region,
39 |                     community=community,
40 |                     agency="房天下",
41 |                     date=date
42 |                 )
--------------------------------------------------------------------------------
/ErShouFang/spiders/LianJia.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | import datetime
5 | from ErShouFang.items import ErshoufangItem
6 | 
7 | 
8 | class LianjiaSpider(scrapy.Spider):
9 |     name = 'LianJia'
10 |     custom_settings = {
11 |         'EXTENSIONS_DO': True
12 |     }
13 |     start_urls = [f'https://bj.lianjia.com/ershoufang/{region}/' for region in\
14 |                   ["dongcheng", "xicheng", "chaoyang", "haidian", "fengtai", "shijingshan",
15 |                    "tongzhou", "changping", "daxing", "yizhuangkaifaqu",
16 |                    "shunyi", "fangshan", "mentougou", "pinggu", "huairou", "miyun", "yanqing"]]
17 | 
18 |     def parse(self, response):
19 |         """
20 |         Listing index page for one district.
21 |         :param response:
22 |         :return:
23 |         """
24 |         # total number of listings in this district
25 |         total_num = response.xpath("//h2[@class='total fl']/span/text()").extract_first().strip()
26 |         # number of result pages
27 |         max_page_num = int(int(total_num) / 30) + 1
28 |         # the site serves at most 100 pages
29 |         if max_page_num > 100:
30 |             max_page_num = 100
31 |         for page_num in range(1, max_page_num+1):
32 |             # url of each result page
33 |             url = response.url + f"pg{page_num}/"
34 |             yield scrapy.Request(url, callback=self.page_parse)
35 | 
36 |     def page_parse(self, response):
37 |         lis = response.xpath("//li[@class='clear LOGCLICKDATA']")
38 |         region = response.xpath("//div[@class='sub_nav section_sub_nav']/a[@class='selected']/text()").extract_first()
39 |         date = datetime.datetime.now().strftime("%Y-%m-%d")
40 |         for li in lis:
41 |             room = li.xpath(".//div[@class='houseInfo']/text()[1]").extract_first()
42 |             if "室" not in room:
43 |                 # skip commercial / irregular listings
44 |                 continue
45 |             area = li.xpath(".//div[@class='houseInfo']/text()[2]").extract_first()
46 |             total_price = li.xpath(".//div[@class='totalPrice']/span/text()").extract_first()
47 |             unit_price = li.xpath(".//div[@class='unitPrice']/span/text()").extract_first()
48 |             community = li.xpath(".//div[@class='houseInfo']/a/text()").extract_first()
49 |             yield ErshoufangItem(
50 |                 bedroom=int(re.findall("(\d)室", room)[0]),
51 |                 living_room=int(re.findall("(\d)厅", room)[0]),
52 |                 area=float(re.findall("(.+)平米", area)[0]),
53 |                 total_price=int(total_price),
54 |                 unit_price=int(re.findall('单价(.+)元/平米',unit_price)[0]),
55 |                 region=region,
56 |                 community=community,
57 |                 agency="链家",
58 |                 date=date
59 |             )
60 | 
--------------------------------------------------------------------------------
/ErShouFang/spiders/Wiwj.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | import datetime
5 | from ErShouFang.items import ErshoufangItem
6 | 
7 | 
8 | class WiwjSpider(scrapy.Spider):
9 |     name = 'Wiwj'
10 |     start_urls = [f'https://bj.5i5j.com/ershoufang/{region}/' for region in\
11 |                   ["haidianqu", "dongchengqu", "xichengqu", "fengtaiqu", "shijingshanqu", "tongzhouqu",
12 |                    "changpingqu", "daxingqu", "yizhuang", "shunyiqu", "fangshanqu", "mentougou",
13 |                    "pinggu", "huairou", "miyun", "yanqing", "chaoyangqu"]]
14 | 
15 |     def parse(self, response):
16 |         redirect = self.rediection(response, self.parse)
17 |         if not redirect:
18 |             total_num = response.xpath("//div[@class='total-box noBor']/span/text()").extract_first()
19 |             max_page_num = int(int(total_num)/30) + 1
20 |             for page in range(1, max_page_num + 1):
21 |                 yield scrapy.Request(response.url + f"n{page}/", callback=self.page_parse)
22 |         else:
23 |             return redirect
24 | 
25 |     def page_parse(self, response):
26 |         redirect = self.rediection(response, self.parse)
27 |         if not redirect:
28 |             region = response.xpath("//li[@class='new_di_tab_cur']/text()").extract_first()
29 |             date = datetime.datetime.now().strftime("%Y-%m-%d")
30 |             lis = response.xpath("//ul[@class='pList']/li")
31 |             for li in lis:
32 |                 room = li.xpath(".//div[@class='listCon']/div[@class='listX']/p[1]/text()").extract_first()
33 |                 # skip commercial / irregular listings
34 |                 if '多' in room:
35 |                     continue
36 |                 total_price = li.xpath(".//div[@class='listCon']/div[@class='listX']/div[@class='jia']/p[1]/strong/text()").extract_first()
37 |                 unit_price = li.xpath(".//div[@class='listCon']/div[@class='listX']/div[@class='jia']/p[2]/text()").extract_first()
38 |                 community = li.xpath(".//div[@class='listCon']/div[@class='listX']/p[2]/a/text()").extract_first()
39 |                 yield ErshoufangItem(
40 |                     bedroom=int(re.findall("(\d)室", room)[0]),
41 |                     living_room=int(re.findall("(\d)厅", room)[0]),
42 |                     area=float(re.findall(r"\·(.+)\s\s平米", room)[0]),
43 |                     total_price=int(total_price),
44 |                     unit_price=int(re.findall('单价(.+)元/', unit_price)[0]),
45 |                     region=region,
46 |                     community=community.split(' ')[1],
47 |                     agency="我爱我家",
48 |                     date=date
49 |                 )
50 |         else:
51 |             return redirect
52 | 
53 |     def rediection(self, response, func):
54 |         body = response.body.decode("utf-8")
55 |         # if the page is an anti-crawl JS redirect, re-issue the request to the real URL
56 |         if "window.location.href" in body:
57 |             return scrapy.Request(re.findall(r"window.location.href='(.+?)';", body)[0], callback=func)
--------------------------------------------------------------------------------
/ErShouFang/spiders/ZhongYuan.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | import datetime
5 | from ErShouFang.items import ErshoufangItem
6 | 
7 | 
8 | class ZhongyuanSpider(scrapy.Spider):
9 |     name = 'ZhongYuan'
10 |     start_urls = [f'https://bj.centanet.com/ershoufang/{region}/' for region in\
11 |                   ["dongchengqu", "haidianqu", "xichengqu", "shijingshanqu", "fengtaiqu", "shunyiqu",
12 |                    "mentougouqu", "tongzhouqu", "daxingqu", "yizhuangkaifaqu"]]
13 | 
14 |     def parse(self, response):
15 |         max_page = response.xpath("//a[text()='>>']/@href").extract_first()
16 |         # check whether there is more than one result page; if not, parse this page directly
17 |         if max_page:
18 |             max_page_num = re.findall(r'g(\d+)\/', max_page)[0]
19 |             for page in range(1, int(max_page_num) + 1):
20 |                 yield scrapy.Request(response.url + f"g{page}/", callback=self.page_parse)
21 |         else:
22 |             yield from self.page_parse(response)
23 | 
24 |     def page_parse(self, response):
25 |         region = response.xpath("//p[@class='termcon fl ']/span[@class='curr']/text()").extract_first()
26 |         date = datetime.datetime.now().strftime("%Y-%m-%d")
27 |         divs = response.xpath("//div[@class='house-item clearfix']")
28 |         for div in divs:
29 |             room = div.xpath(".//p[@class='house-name']/span[2]/text()").extract_first()
30 |             area = div.xpath(".//p[@class='house-name']/span[4]/text()").extract_first()
31 |             total_price = div.xpath(".//p[@class='price-nub cRed tc']/span/text()").extract_first()
32 |             unit_price = div.xpath(".//p[@class='price-txt tc']/text()").extract_first()
33 |             community = div.xpath(".//p[@class='house-name']/a/text()").extract_first()
34 |             yield ErshoufangItem(
35 |                 bedroom=int(re.findall("(\d)室", room)[0]),
36 |                 living_room=int(re.findall("(\d)厅", room)[0] if '厅' in room else '0'),
37 |                 area=float(re.findall("(.+)平", area)[0]),
38 |                 total_price=int(float(total_price)),
39 |                 unit_price=int(float(re.findall('(\d+)元/平', unit_price)[0])),
40 |                 region=region,
41 |                 community=community,
42 |                 agency="中原地产",
43 |                 date=date
44 |             )
--------------------------------------------------------------------------------
/ErShouFang/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/ErShouFang/二手房数据.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/二手房数据.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房在售数量.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房在售数量.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房平均价格.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房平均价格.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房平均面积.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房平均面积.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房数据分布.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房数据分布.png
--------------------------------------------------------------------------------
/ErShouFang/北京最贵小区表.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京最贵小区表.xls
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ErShouFang
2 | Scrapes the second-hand homes currently listed by four Beijing real-estate agencies — Lianjia (链家), 5i5j (我爱我家), Fang.com (房天下) and Centaline (中原地产) — renders the data as charts, and runs a short analysis.
3 | 
4 | ## Overview
5 | To gauge the state of Beijing's second-hand housing market, agency listings are about as persuasive as data gets. This crawler collected more than 140,000 listings from the four agency sites, covering every district of Beijing; a brief analysis of the resulting charts follows.
6 | On the technical side the project is built on Scrapy: two downloader middlewares handle proxies, cookies and request headers, a custom extension keeps the proxy pool refreshed automatically, items are written to MongoDB, MongoDB aggregation groups the data and computes counts, sums and averages, and pyecharts renders the results as bar and pie charts (a condensed sketch of the aggregation step is shown after the first chart below).
7 | 
8 | ## Data analysis
9 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房平均价格.png?raw=true)
10 | Of all listings in Beijing, Xicheng has the highest average price — a startling 110,000 yuan per square metre — followed by Dongcheng at close to 100,000, then Haidian and Chaoyang. The overall ranking matches the figures published by Fang.com, but the Haidian average differs noticeably: this crawler measures roughly 90,000 yuan/m² against Fang.com's 84,000, a gap of 6,000. That gap is plausibly tied to the school-district policy change at the end of the year: according to a notice from Shangdi Experimental School shared by the Weibo user 雪球, the Haidian education commission's 2018 compulsory-education enrolment plan states that "from 1 January 2019, a home newly registered in Haidian and used to apply for school admission will no longer be tied to a single school; multi-school catchment allocation will apply." This may be one reason Haidian's average is unstable — some sellers adjust their asking prices promptly as policy shifts.
11 | 
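As mentioned in the overview, each chart is produced by one MongoDB aggregation fed into pyecharts. The block below is a condensed, self-contained sketch of that step for the average-price chart above; it mirrors `render_data` in `ErShouFang/dataSight.py` and assumes only a local MongoDB instance whose `ErShouFang.houses` collection has been filled by the spiders.

```python
import pymongo
from pyecharts import Bar  # pyecharts 0.x API, as used in dataSight.py

houses = pymongo.MongoClient('127.0.0.1', 27017).ErShouFang.houses

# Group listings by district and average the per-square-metre price.
pipeline = [{"$group": {"_id": "$region", "result": {"$avg": "$unit_price"}}}]
rows = sorted(houses.aggregate(pipeline), key=lambda r: r['result'], reverse=True)

bar = Bar("北京二手房平均价格", "作者:Chen")
bar.add("单位:元/平方米",
        [r['_id'] for r in rows],          # district names on the x axis
        [int(r['result']) for r in rows],  # average unit price per district
        xaxis_interval=0, is_label_show=True)
bar.render(path='北京二手房平均价格.png')
```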
12 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房在售数量.png?raw=true)
13 | Where are the most second-hand homes on sale? Chaoyang and Haidian. The data also show that the market in Beijing's near suburbs is busier than in Dongcheng and Xicheng. As the subway keeps extending outward, the near suburbs will only become better connected and their housing market hotter — prices there average around 50,000 yuan/m², a full tier below the central districts.
14 | 
15 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房平均面积.png?raw=true)
16 | Homes in the near suburbs are, on average, larger than those in the central districts.
17 | 
18 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房数据分布.png?raw=true)
19 | Compared with the two agencies that list prices openly, Lianjia and 5i5j, Fang.com contributes the most listings and Centaline the fewest.
20 | 
21 | Where are Beijing's most expensive communities?
22 | 
23 | Xicheng (西城区)
24 | 
25 | | Community | Average price (yuan/m²) |
26 | | :------: | :------: |
27 | | 丰融园 | 186127 |
28 | | 中海凯旋 | 177202 |
29 | | 丰侨公寓 | 176112 |
30 | | 丰汇园 | 175823 |
31 | | 北京尊府 | 172882 |
32 | 
33 | Dongcheng (东城区)
34 | 
35 | | Community | Average price (yuan/m²) |
36 | | :------: | :------: |
37 | | 北下洼子胡同 | 165664 |
38 | | 府学胡同 | 161980 |
39 | | 八宝坑胡同 | 160336 |
40 | | 普渡寺小区 | 151491 |
41 | | 长安太和 | 140198 |
42 | 
43 | Chaoyang (朝阳区)
44 | 
45 | | Community | Average price (yuan/m²) |
46 | | :------: | :------: |
47 | | 泛海世家 | 178752 |
48 | | 红玺台 | 151880 |
49 | | 北京壹号院 | 149892 |
50 | | 翰林阁 | 149176 |
51 | | 太阳公元 | 140682 |
52 | 
53 | Haidian (海淀区)
54 | 
55 | | Community | Average price (yuan/m²) |
56 | | :------: | :------: |
57 | | 万柳书院 | 192737 |
58 | | 万城华府 | 182129 |
59 | | 万城华府海园 | 179488 |
60 | | 保利海德公园 | 166471 |
61 | | 光大水墨风景 | 147323 |
62 | 
63 | 
64 | 
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = ErShouFang.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ErShouFang
12 | 
--------------------------------------------------------------------------------