├── ErShouFang ├── __init__.py ├── crawl.py ├── crawl_test.py ├── dataSight.py ├── extensions.py ├── items.py ├── middlewares.py ├── pipelines.py ├── settings.py ├── spiders │ ├── FangTianXia.py │ ├── LianJia.py │ ├── Wiwj.py │ ├── ZhongYuan.py │ └── __init__.py ├── 二手房数据.png ├── 北京二手房在售数量.png ├── 北京二手房平均价格.png ├── 北京二手房平均面积.png ├── 北京二手房数据分布.png └── 北京最贵小区表.xls ├── README.md └── scrapy.cfg /ErShouFang/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/__init__.py -------------------------------------------------------------------------------- /ErShouFang/crawl.py: -------------------------------------------------------------------------------- 1 | # Create time:2018-12-25 17:44 2 | # Author:Chen 3 | 4 | from scrapy.crawler import CrawlerProcess 5 | from scrapy.utils.project import get_project_settings 6 | 7 | if __name__ == "__main__": 8 | p = CrawlerProcess(get_project_settings()) 9 | p.crawl("ZhongYuan") 10 | # p.crawl("Wiwj") 11 | # p.crawl("LianJia") 12 | # p.crawl("FangTianXia") 13 | p.start() 14 | -------------------------------------------------------------------------------- /ErShouFang/crawl_test.py: -------------------------------------------------------------------------------- 1 | # Create time:2018-12-25 19:01 2 | # Author:Chen 3 | 4 | import requests 5 | import random 6 | import redis 7 | 8 | 9 | url_list = [f'https://bj.lianjia.com/ershoufang/{region}' for region in \ 10 | ["dongcheng", "xicheng", "chaoyang", "haidian", "fengtai", "shijingshan", 11 | "tongzhou", "changping", "daxing", "yizhuangkaifaqu", "yizhuangkaifaqu", 12 | "shunyi", "fangshan", "mentougou", "pinggu", "huairou", "miyun", "yanqing"]] 13 | 14 | 15 | ua = ["Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1", 16 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0", 17 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", 18 | 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0', 19 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 20 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13', 21 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/533.3', 22 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8', 23 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3', 24 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 25 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 26 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 27 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 28 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', 29 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/17.0.940.0 Safari/535.8', 30 | 'Mozilla/5.0 (Windows NT 6.1) 
AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7ad-imcjapan-syosyaman-xkgi3lqg03!wgz'
31 | ]
32 | 
33 | con = redis.Redis()
34 | 
35 | for url in url_list:
36 |     r = requests.get(url, headers={"User-Agent": random.choice(ua)}, proxies={'https': 'https://' + con.srandmember("qingting_https_proxies", 1)[0].decode("utf-8")})
37 |     r.encoding = r.apparent_encoding
38 |     print(r.text)
--------------------------------------------------------------------------------
/ErShouFang/dataSight.py:
--------------------------------------------------------------------------------
1 | # Create time:2018-12-26 15:49
2 | # Author:Chen
3 | # Data visualization
4 | 
5 | from pyecharts import Bar, Pie
6 | import pymongo
7 | import datetime
8 | import xlwt
9 | 
10 | 
11 | class DataSight:
12 | 
13 |     def __init__(self):
14 |         self.mongo_cli = pymongo.MongoClient('127.0.0.1', 27017).ErShouFang.houses
15 |         self.sub_title = "作者:Chen"
16 | 
17 |     def avg_area(self):
18 |         """
19 |         Render the average-floor-area-per-district chart.
20 |         :return:
21 |         """
22 |         self.render_data("北京二手房平均面积", "平方米", "region", "avg", "$area", "柱状图")
23 | 
24 |     def avg_unit_price(self):
25 |         """
26 |         Render the average-price-per-district chart.
27 |         :return:
28 |         """
29 |         self.render_data("北京二手房平均价格", "元/平方米", "region", "avg", "$unit_price", "柱状图")
30 | 
31 |     def count(self):
32 |         """
33 |         Render the listings-on-sale-per-district chart.
34 |         :return:
35 |         """
36 |         self.render_data("北京二手房在售数量", "套", "region", "sum", 1, "柱状图")
37 | 
38 |     def agency_count(self):
39 |         """
40 |         Render a pie chart of listings scraped from each of the four agencies.
41 |         :return:
42 |         """
43 |         self.render_data("北京二手房数据分布", "套", "agency", "sum", 1, "饼状图")
44 | 
45 |     def community_avg_price(self):
46 |         excel = xlwt.Workbook()
47 |         for region in ["西城", "东城", "朝阳", "海淀"]:
48 |             arg = [
49 |                 {'$match': {"region": region}},
50 |                 {"$group": {"_id": "$community", "avg_price": {"$avg": "$unit_price"}, "count": {"$sum": 1}}},
51 |                 {"$match": {"count": {"$gt": 4}}},
52 |                 {"$sort": {"avg_price": -1}},
53 |                 {"$limit": 10}
54 |             ]
55 |             result = self.mongo_cli.aggregate(arg)
56 |             sheet = excel.add_sheet(region)
57 |             sheet.write(0, 0, "小区名称")
58 |             sheet.write(0, 1, "平均价格")
59 |             for i, data in enumerate(result):
60 |                 sheet.write(i + 1, 0, data['_id'])
61 |                 sheet.write(i + 1, 1, int(data['avg_price']))
62 |         excel.save("北京最贵小区表.xls")
63 | 
64 |     def render_data(self, title_name, unit_name, group_name, accumulate_type, accumulate_name, image_type):
65 |         """
66 |         Render one chart and save it to disk.
67 |         :param title_name: chart title, also used as the output file name (str)
68 |         :param unit_name: unit label shown in the legend (str)
69 |         :param group_name: key in the database used to group the documents (str)
70 |         :param accumulate_type: aggregation operator, e.g. sum or avg (str)
71 |         :param accumulate_name: key in the database to aggregate, prefixed with $
72 |         :param image_type: chart type, 柱状图 (bar) or 饼状图 (pie)
73 |         :return:
74 |         """
75 |         arg = [{"$group": {"_id": f"${group_name}", "result": {f"${accumulate_type}": accumulate_name}}}]
76 |         result = list(self.mongo_cli.aggregate(pipeline=arg))
77 |         # sort in descending order
78 |         result.sort(key=lambda x: x['result'], reverse=True)
79 |         for i in result:
80 |             if i['_id'] == "亦庄开发区":
81 |                 i['_id'] = "亦庄"
82 |         if image_type == "柱状图":
83 |             bar = Bar(f"{title_name}{datetime.datetime.now().strftime('%m%d')}", self.sub_title)
84 |         elif image_type == "饼状图":
85 |             bar = Pie(f"{title_name}{datetime.datetime.now().strftime('%m%d')}", self.sub_title)
86 |         else:
87 |             raise Exception("Undefined chart type")
88 |         bar.add(
89 |             f"单位:{unit_name}",
90 |             [r['_id'] for r in result],
91 |             [int(r['result']) for r in result],
92 |             xaxis_interval=0,
93 |             is_label_show=True
94 |         )
95 |         bar.render(path=f'{title_name}.png')
96 | 
97 | 
98 | if __name__ == "__main__":
99 |     DataSight().community_avg_price()
100 | 
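# --- Added usage sketch (not part of the original module). It assumes only what
# __init__ above already hard-codes: a local MongoDB at 127.0.0.1:27017 whose
# ErShouFang.houses collection has been filled by the spiders.
def render_all_reports():
    """Generate every chart and the Excel sheet produced by this project."""
    sight = DataSight()
    sight.avg_area()             # -> 北京二手房平均面积.png
    sight.avg_unit_price()       # -> 北京二手房平均价格.png
    sight.count()                # -> 北京二手房在售数量.png
    sight.agency_count()         # -> 北京二手房数据分布.png
    sight.community_avg_price()  # -> 北京最贵小区表.xls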
--------------------------------------------------------------------------------
/ErShouFang/extensions.py:
--------------------------------------------------------------------------------
1 | # Create time:2018-12-24 22:05
2 | # Author:Chen
3 | from scrapy import signals
4 | from scrapy.exceptions import NotConfigured
5 | import requests
6 | import time
7 | import threading
8 | import json
9 | import logging
10 | import redis
11 | 
12 | 
13 | class QingtingProxy:
14 | 
15 |     def __init__(self, proxy_key, redis_setting):
16 |         self.url = "https://proxy.horocn.com/api/proxies?order_id=U7G61620840936700598&num=10&format=json&line_separator=win"
17 |         self.spider_running = True
18 |         self.proxy_key = proxy_key
19 |         self.server = redis.Redis(*redis_setting)
20 | 
21 |     @classmethod
22 |     def from_crawler(cls, crawler):
23 |         if crawler.settings.get("EXTENSIONS_DO"):
24 |             spider = cls(crawler.settings.get("PROXY_KEY"), crawler.settings.getlist("REDIS"))
25 |             crawler.signals.connect(spider.engine_started, signal=signals.engine_started)
26 |             crawler.signals.connect(spider.spider_closed, signal=signals.spider_closed)
27 |             return spider
28 |         else:
29 |             raise NotConfigured
30 | 
31 |     def proxy(self, server):
32 |         while self.spider_running:
33 |             r = requests.get(self.url)
34 |             logging.debug(r.text)
35 |             if "不足" in r.text:
36 |                 raise Exception("Proxy account balance is insufficient")
37 |             # list of proxy host:port strings returned by the API
38 |             ip_list = [i['host'] + ":" + str(i["port"]) for i in json.loads(r.text)]
39 |             for ip in ip_list:
40 |                 server.sadd(self.proxy_key + "_pr", ip)
41 |             server.rename(self.proxy_key + "_pr", self.proxy_key)
42 |             # the proxy API allows at most one call every 10 seconds
43 |             time.sleep(10)
44 | 
45 |     def engine_started(self):
46 |         # start maintaining the proxy pool as soon as the engine starts
47 |         threading.Thread(target=self.proxy, args=(self.server,)).start()
48 | 
49 |     def spider_closed(self, spider):
50 |         # stop maintaining the proxy pool when the spider closes
51 |         self.spider_running = False
52 | 
--------------------------------------------------------------------------------
/ErShouFang/items.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your scraped items
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/items.html
7 | 
8 | import scrapy
9 | 
10 | 
11 | class ErshoufangItem(scrapy.Item):
12 |     # define the fields for your item here like:
13 |     # name = scrapy.Field()
14 |     bedroom = scrapy.Field()
15 |     living_room = scrapy.Field()
16 |     area = scrapy.Field()
17 |     total_price = scrapy.Field()
18 |     unit_price = scrapy.Field()
19 |     region = scrapy.Field()
20 |     community = scrapy.Field()
21 |     agency = scrapy.Field()
22 |     date = scrapy.Field()
23 | 
24 | 
--------------------------------------------------------------------------------
/ErShouFang/middlewares.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define here the models for your spider middleware
4 | #
5 | # See documentation in:
6 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html
7 | 
8 | from scrapy import signals, Request
9 | import random
10 | import time
11 | import logging
12 | import redis
13 | 
14 | 
15 | class ErshoufangSpiderMiddleware(object):
16 |     # Not all methods need to be defined. If a method is not defined,
17 |     # scrapy acts as if the spider middleware does not modify the
18 |     # passed objects.
19 | 
20 |     @classmethod
21 |     def from_crawler(cls, crawler):
22 |         # This method is used by Scrapy to create your spiders.
23 | s = cls() 24 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 25 | return s 26 | 27 | def process_spider_input(self, response, spider): 28 | # Called for each response that goes through the spider 29 | # middleware and into the spider. 30 | 31 | # Should return None or raise an exception. 32 | return None 33 | 34 | def process_spider_output(self, response, result, spider): 35 | # Called with the results returned from the Spider, after 36 | # it has processed the response. 37 | 38 | # Must return an iterable of Request, dict or Item objects. 39 | for i in result: 40 | yield i 41 | 42 | def process_spider_exception(self, response, exception, spider): 43 | # Called when a spider or process_spider_input() method 44 | # (from other spider middleware) raises an exception. 45 | 46 | # Should return either None or an iterable of Response, dict 47 | # or Item objects. 48 | pass 49 | 50 | def process_start_requests(self, start_requests, spider): 51 | # Called with the start requests of the spider, and works 52 | # similarly to the process_spider_output() method, except 53 | # that it doesn’t have a response associated. 54 | 55 | # Must return only requests (not items). 56 | for r in start_requests: 57 | yield r 58 | 59 | def spider_opened(self, spider): 60 | spider.logger.info('Spider opened: %s' % spider.name) 61 | 62 | 63 | class ErshoufangDownloaderMiddleware(object): 64 | # Not all methods need to be defined. If a method is not defined, 65 | # scrapy acts as if the downloader middleware does not modify the 66 | # passed objects. 67 | 68 | def __init__(self, ua): 69 | self.ua = ua 70 | 71 | @classmethod 72 | def from_crawler(cls, crawler): 73 | # This method is used by Scrapy to create your spiders. 74 | s = cls(crawler.settings.getlist("USER_AGENT")) 75 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 76 | return s 77 | 78 | def process_request(self, request, spider): 79 | # Called for each request that goes through the downloader 80 | # middleware. 
81 | 82 | # Must either: 83 | # - return None: continue processing this request 84 | # - or return a Response object 85 | # - or return a Request object 86 | # - or raise IgnoreRequest: process_exception() methods of 87 | # installed downloader middleware will be called 88 | # 我爱我家反爬,需要cookie 89 | if "5i5j" in request.url: 90 | request.cookies = { 91 | 'PHPSESSID': '1p63h7he47e4jb21uf3qpqa31h', 92 | ' domain': 'bj', 93 | ' yfx_c_g_u_id_10000001': '_ck18122702385612291856717911557', 94 | ' yfx_f_l_v_t_10000001': 'f_t_1545849536214__r_t_1545849536214__v_t_1545849536214__r_c_0', 95 | ' yfx_mr_n_10000001': 'baidu::market_type_ppzq::::::::::%E6%A0%87%E9%A2%98::bj.5i5j.com::::::%E5%B7%A6%E4%BE%A7%E6%A0%87%E9%A2%98::%E6%A0%87%E9%A2%98::160::pmf_from_adv::bj.5i5j.com/', 96 | ' yfx_mr_f_n_10000001': 'baidu::market_type_ppzq::::::::::%E6%A0%87%E9%A2%98::bj.5i5j.com::::::%E5%B7%A6%E4%BE%A7%E6%A0%87%E9%A2%98::%E6%A0%87%E9%A2%98::160::pmf_from_adv::bj.5i5j.com/', 97 | ' _ga': 'GA1.2.1709742213.1545849536', 98 | ' _gid': 'GA1.2.1682278583.1545849536', 99 | ' _gat': '1', 100 | ' Hm_lvt_94ed3d23572054a86ed341d64b267ec6': '1545849537', 101 | ' Hm_lpvt_94ed3d23572054a86ed341d64b267ec6': '1545849539', 102 | ' _Jo0OQK': '50ADB2E7DEF3A95BCBAA04D52F5414775A30C0C5815693EB62460770A7531C510DCEFD1A42F959FF2E12A16D4BFD17D9C7D374A1376BEE8ED0AACA8F800B32B3CE3C57212F12283777C840763663251ADEB840763663251ADEBC6107B635DA7B8669547054735CB82A9GJ1Z1XA=='} 103 | request.headers["User-Agent"] = random.choice(self.ua) 104 | return None 105 | 106 | def process_response(self, request, response, spider): 107 | # Called with the response returned from the downloader. 108 | 109 | # Must either; 110 | # - return a Response object 111 | # - return a Request object 112 | # - or raise IgnoreRequest 113 | return response 114 | 115 | def process_exception(self, request, exception, spider): 116 | # Called when a download handler or a process_request() 117 | # (from other downloader middleware) raises an exception. 
118 | 
119 |         # Must either:
120 |         # - return None: continue processing this exception
121 |         # - return a Response object: stops process_exception() chain
122 |         # - return a Request object: stops process_exception() chain
123 |         pass
124 | 
125 |     def spider_opened(self, spider):
126 |         spider.logger.info('Spider opened: %s' % spider.name)
127 | 
128 | 
129 | class RandomProxy:
130 |     """
131 |     Attach a proxy IP to every outgoing request.
132 |     """
133 |     def __init__(self, proxy_key, redis_setting):
134 |         self.proxy_key = proxy_key
135 |         self.server = redis.Redis(*redis_setting)
136 | 
137 |     @classmethod
138 |     def from_crawler(cls, crawler):
139 |         return cls(crawler.settings.get("PROXY_KEY"), crawler.settings.getlist("REDIS"))
140 | 
141 |     def set_proxy(self, server):
142 |         while server.scard(self.proxy_key) == 0:
143 |             # wait for the proxy pool to be initialised
144 |             time.sleep(10)
145 |         return server.srandmember(self.proxy_key, 1)[0].decode("utf-8")
146 | 
147 |     def process_request(self, request, spider):
148 |         request.meta["proxy"] = "https://" + self.set_proxy(self.server)
149 | 
150 |     def process_exception(self, request, exception, spider):
151 |         if exception:
152 |             logging.error("Connection error, sending the request back to the queue")
153 |             return Request(request.url, callback=request.callback, dont_filter=True)
154 | 
155 |     def process_response(self, request, response, spider):
156 |         if response.status != 200:
157 |             logging.error("Unexpected status code, sending the request back to the queue")
158 |             return Request(request.url, callback=request.callback, dont_filter=True)
159 |         return response
--------------------------------------------------------------------------------
/ErShouFang/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your item pipelines here
4 | #
5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting
6 | # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html
7 | 
8 | import pymongo
9 | 
10 | class ErshoufangPipeline(object):
11 | 
12 |     def __init__(self, mongo):
13 |         cli = pymongo.MongoClient(*mongo)
14 |         self.collection = cli.ErShouFang.houses
15 | 
16 |     @classmethod
17 |     def from_crawler(cls, crawler):
18 |         return cls(crawler.settings.getlist("MONGO"))
19 | 
20 |     def process_item(self, item, spider):
21 |         self.collection.insert_one(dict(item))
22 |         return item
23 | 
--------------------------------------------------------------------------------
/ErShouFang/settings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Scrapy settings for ErShouFang project
4 | #
5 | # For simplicity, this file contains only settings considered important or
6 | # commonly used.
You can find more settings consulting the documentation: 7 | # 8 | # https://doc.scrapy.org/en/latest/topics/settings.html 9 | # https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 10 | # https://doc.scrapy.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'ErShouFang' 13 | 14 | SPIDER_MODULES = ['ErShouFang.spiders'] 15 | NEWSPIDER_MODULE = 'ErShouFang.spiders' 16 | 17 | # redis代理池 18 | REDIS = ['127.0.0.1', 6379] 19 | PROXY_KEY = 'https_proxies' 20 | 21 | # Crawl responsibly by identifying yourself (and your website) on the user-agent 22 | USER_AGENT = ["Mozilla/5.0 (Windows; U; Windows NT 6.1; en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1", 23 | "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/49.0.2623.221 Safari/537.36 SE 2.X MetaSr 1.0", 24 | "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0", 25 | 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0', 26 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.16 (KHTML, like Gecko) Chrome/10.0.648.133 Safari/534.16', 27 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.13 (KHTML, like Gecko) Chrome/9.0.597.0 Safari/534.13', 28 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/533.3 (KHTML, like Gecko) Chrome/8.0.552.224 Safari/533.3', 29 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.8 (KHTML, like Gecko) Chrome/7.0.521.0 Safari/534.8', 30 | 'Mozilla/5.0 (Windows; U; Windows NT 6.0; en-US) AppleWebKit/534.3 (KHTML, like Gecko) Chrome/6.0.458.1 Safari/534.3', 31 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 32 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3', 33 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 34 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.66 Safari/535.11', 35 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11', 36 | 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.8 (KHTML, like Gecko) Chrome/17.0.940.0 Safari/535.8', 37 | 'Mozilla/5.0 (Windows NT 6.1) AppleWebKit/535.7 (KHTML, like Gecko) Chrome/16.0.912.77 Safari/535.7ad-imcjapan-syosyaman-xkgi3lqg03!wgz' 38 | ] 39 | 40 | 41 | # Obey robots.txt rules 42 | ROBOTSTXT_OBEY = False 43 | 44 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 45 | #CONCURRENT_REQUESTS = 32 46 | 47 | # Configure a delay for requests for the same website (default: 0) 48 | # See https://doc.scrapy.org/en/latest/topics/settings.html#download-delay 49 | # See also autothrottle settings and docs 50 | DOWNLOAD_DELAY = 0 51 | # The download delay setting will honor only one of: 52 | CONCURRENT_REQUESTS_PER_DOMAIN = 48 53 | CONCURRENT_REQUESTS_PER_IP = 48 54 | 55 | # Disable cookies (enabled by default) 56 | COOKIES_ENABLED = True 57 | 58 | # Disable Telnet Console (enabled by default) 59 | #TELNETCONSOLE_ENABLED = False 60 | 61 | # Override the default request headers: 62 | #DEFAULT_REQUEST_HEADERS = { 63 | # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 64 | # 'Accept-Language': 'en', 65 | #} 66 | 67 | # Enable or disable spider middlewares 68 | # See https://doc.scrapy.org/en/latest/topics/spider-middleware.html 69 | #SPIDER_MIDDLEWARES = { 70 | # 
'ErShouFang.middlewares.ErshoufangSpiderMiddleware': 543, 71 | #} 72 | 73 | # Enable or disable downloader middlewares 74 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html 75 | DOWNLOADER_MIDDLEWARES = { 76 | 'ErShouFang.middlewares.ErshoufangDownloaderMiddleware': 543, 77 | 'ErShouFang.middlewares.RandomProxy': 591, 78 | } 79 | 80 | # Enable or disable extensions 81 | # See https://doc.scrapy.org/en/latest/topics/extensions.html 82 | EXTENSIONS = { 83 | 'ErShouFang.extensions.QingtingProxy': 2, 84 | } 85 | # 在true的情况下才运行扩展 86 | EXTENSIONS_DO = False 87 | 88 | # Configure item pipelines 89 | # See https://doc.scrapy.org/en/latest/topics/item-pipeline.html 90 | ITEM_PIPELINES = { 91 | 'ErShouFang.pipelines.ErshoufangPipeline': 300, 92 | } 93 | 94 | # Enable and configure the AutoThrottle extension (disabled by default) 95 | # See https://doc.scrapy.org/en/latest/topics/autothrottle.html 96 | #AUTOTHROTTLE_ENABLED = True 97 | # The initial download delay 98 | #AUTOTHROTTLE_START_DELAY = 5 99 | # The maximum download delay to be set in case of high latencies 100 | #AUTOTHROTTLE_MAX_DELAY = 60 101 | # The average number of requests Scrapy should be sending in parallel to 102 | # each remote server 103 | #AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0 104 | # Enable showing throttling stats for every response received: 105 | #AUTOTHROTTLE_DEBUG = False 106 | 107 | # Enable and configure HTTP caching (disabled by default) 108 | # See https://doc.scrapy.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 109 | #HTTPCACHE_ENABLED = True 110 | #HTTPCACHE_EXPIRATION_SECS = 0 111 | #HTTPCACHE_DIR = 'httpcache' 112 | #HTTPCACHE_IGNORE_HTTP_CODES = [] 113 | #HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 114 | -------------------------------------------------------------------------------- /ErShouFang/spiders/FangTianXia.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | import scrapy 3 | import re 4 | import datetime 5 | from ErShouFang.items import ErshoufangItem 6 | 7 | 8 | class FangtianxiaSpider(scrapy.Spider): 9 | name = 'FangTianXia' 10 | start_urls = [f'https://esf.fang.com{region}' for region in\ 11 | ["/house-a01/", "/house-a00/", "/house-a06/", "/house-a03/", "/house-a02/", 12 | "/house-a012/", "/house-a0585/", "/house-a010/", "/house-a08/", "/house-a011/", 13 | "/house-a07/", "/house-a013/", "/house-a09/", "/house-a014/", "/house-a015/", "/house-a016/"]] 14 | 15 | def parse(self, response): 16 | max_page = response.xpath("//div[@class='page_al']/p[last()]/text()").extract_first() 17 | max_page_num = re.findall('共(\d+)页', max_page)[0] 18 | for page in range(1, int(max_page_num) + 1): 19 | yield scrapy.Request(response.url + f"i3{page}/", callback=self.page_parse) 20 | 21 | def page_parse(self, response): 22 | region = response.xpath("//div[@class='term_screen2 clearfix']/ul/li[1]/a/text()").extract_first() 23 | date = datetime.datetime.now().strftime("%Y-%m-%d") 24 | dls = response.xpath("//div[@class='shop_list shop_list_4']/dl") 25 | for dl in dls: 26 | room = dl.xpath(".//p[@class='tel_shop']/text()[1]").extract_first() 27 | if room: 28 | area = dl.xpath(".//p[@class='tel_shop']/text()[2]").extract_first() 29 | total_price = dl.xpath(".//dd[@class='price_right']/span/b/text()").extract_first() 30 | unit_price = dl.xpath(".//dd[@class='price_right']/span[2]/text()").extract_first() 31 | community = dl.xpath(".//p[@class='add_shop']/a/@title").extract_first() 
32 |                 yield ErshoufangItem(
33 |                     bedroom=int(re.findall("(\d)室", room)[0]),
34 |                     living_room=int(re.findall("(\d)厅", room)[0]),
35 |                     area=float(re.findall("(.+)㎡", area)[0]),
36 |                     total_price=int(total_price),
37 |                     unit_price=int(re.findall('(\d+)元/㎡', unit_price)[0]),
38 |                     region=region,
39 |                     community=community,
40 |                     agency="房天下",
41 |                     date=date
42 |                 )
--------------------------------------------------------------------------------
/ErShouFang/spiders/LianJia.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | import datetime
5 | from ErShouFang.items import ErshoufangItem
6 | 
7 | 
8 | class LianjiaSpider(scrapy.Spider):
9 |     name = 'LianJia'
10 |     custom_settings = {
11 |         'EXTENSIONS_DO': True
12 |     }
13 |     start_urls = [f'https://bj.lianjia.com/ershoufang/{region}/' for region in\
14 |                   ["dongcheng", "xicheng", "chaoyang", "haidian", "fengtai", "shijingshan",
15 |                    "tongzhou", "changping", "daxing", "yizhuangkaifaqu",
16 |                    "shunyi", "fangshan", "mentougou", "pinggu", "huairou", "miyun", "yanqing"]]
17 | 
18 |     def parse(self, response):
19 |         """
20 |         Listing index page for one district.
21 |         :param response:
22 |         :return:
23 |         """
24 |         # total number of listings in this district
25 |         total_num = response.xpath("//h2[@class='total fl']/span/text()").extract_first().strip()
26 |         # number of result pages
27 |         max_page_num = int(int(total_num) / 30) + 1
28 |         # the site serves at most 100 pages
29 |         if max_page_num > 100:
30 |             max_page_num = 100
31 |         for page_num in range(1, max_page_num+1):
32 |             # url of each result page
33 |             url = response.url + f"pg{page_num}/"
34 |             yield scrapy.Request(url, callback=self.page_parse)
35 | 
36 |     def page_parse(self, response):
37 |         lis = response.xpath("//li[@class='clear LOGCLICKDATA']")
38 |         region = response.xpath("//div[@class='sub_nav section_sub_nav']/a[@class='selected']/text()").extract_first()
39 |         date = datetime.datetime.now().strftime("%Y-%m-%d")
40 |         for li in lis:
41 |             room = li.xpath(".//div[@class='houseInfo']/text()[1]").extract_first()
42 |             if "室" not in room:
43 |                 # skip commercial / irregular listings
44 |                 continue
45 |             area = li.xpath(".//div[@class='houseInfo']/text()[2]").extract_first()
46 |             total_price = li.xpath(".//div[@class='totalPrice']/span/text()").extract_first()
47 |             unit_price = li.xpath(".//div[@class='unitPrice']/span/text()").extract_first()
48 |             community = li.xpath(".//div[@class='houseInfo']/a/text()").extract_first()
49 |             yield ErshoufangItem(
50 |                 bedroom=int(re.findall("(\d)室", room)[0]),
51 |                 living_room=int(re.findall("(\d)厅", room)[0]),
52 |                 area=float(re.findall("(.+)平米", area)[0]),
53 |                 total_price=int(total_price),
54 |                 unit_price=int(re.findall('单价(.+)元/平米',unit_price)[0]),
55 |                 region=region,
56 |                 community=community,
57 |                 agency="链家",
58 |                 date=date
59 |             )
60 | 
--------------------------------------------------------------------------------
/ErShouFang/spiders/Wiwj.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | import datetime
5 | from ErShouFang.items import ErshoufangItem
6 | 
7 | 
8 | class WiwjSpider(scrapy.Spider):
9 |     name = 'Wiwj'
10 |     start_urls = [f'https://bj.5i5j.com/ershoufang/{region}/' for region in\
11 |                   ["haidianqu", "dongchengqu", "xichengqu", "fengtaiqu", "shijingshanqu", "tongzhouqu",
12 |                    "changpingqu", "daxingqu", "yizhuang", "shunyiqu", "fangshanqu", "mentougou",
13 |                    "pinggu", "huairou", "miyun", "yanqing", "chaoyangqu"]]
14 | 
15 |     def parse(self, response):
16 |         redirect = self.rediection(response, self.parse)
17 |         if not redirect:
18 |             total_num = response.xpath("//div[@class='total-box noBor']/span/text()").extract_first()
19 |             max_page_num = int(int(total_num)/30) + 1
20 |             for page in range(1, max_page_num + 1):
21 |                 yield scrapy.Request(response.url + f"n{page}/", callback=self.page_parse)
22 |         else:
23 |             return redirect
24 | 
25 |     def page_parse(self, response):
26 |         redirect = self.rediection(response, self.parse)
27 |         if not redirect:
28 |             region = response.xpath("//li[@class='new_di_tab_cur']/text()").extract_first()
29 |             date = datetime.datetime.now().strftime("%Y-%m-%d")
30 |             lis = response.xpath("//ul[@class='pList']/li")
31 |             for li in lis:
32 |                 room = li.xpath(".//div[@class='listCon']/div[@class='listX']/p[1]/text()").extract_first()
33 |                 # skip commercial / irregular listings
34 |                 if '多' in room:
35 |                     continue
36 |                 total_price = li.xpath(".//div[@class='listCon']/div[@class='listX']/div[@class='jia']/p[1]/strong/text()").extract_first()
37 |                 unit_price = li.xpath(".//div[@class='listCon']/div[@class='listX']/div[@class='jia']/p[2]/text()").extract_first()
38 |                 community = li.xpath(".//div[@class='listCon']/div[@class='listX']/p[2]/a/text()").extract_first()
39 |                 yield ErshoufangItem(
40 |                     bedroom=int(re.findall("(\d)室", room)[0]),
41 |                     living_room=int(re.findall("(\d)厅", room)[0]),
42 |                     area=float(re.findall(r"\·(.+)\s\s平米", room)[0]),
43 |                     total_price=int(total_price),
44 |                     unit_price=int(re.findall('单价(.+)元/', unit_price)[0]),
45 |                     region=region,
46 |                     community=community.split(' ')[1],
47 |                     agency="我爱我家",
48 |                     date=date
49 |                 )
50 |         else:
51 |             return redirect
52 | 
53 |     def rediection(self, response, func):
54 |         body = response.body.decode("utf-8")
55 |         # if the page is an anti-crawl JS redirect, re-issue the request to the real URL
56 |         if "window.location.href" in body:
57 |             return scrapy.Request(re.findall(r"window.location.href='(.+?)';", body)[0], callback=func)
--------------------------------------------------------------------------------
/ErShouFang/spiders/ZhongYuan.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | import scrapy
3 | import re
4 | import datetime
5 | from ErShouFang.items import ErshoufangItem
6 | 
7 | 
8 | class ZhongyuanSpider(scrapy.Spider):
9 |     name = 'ZhongYuan'
10 |     start_urls = [f'https://bj.centanet.com/ershoufang/{region}/' for region in\
11 |                   ["dongchengqu", "haidianqu", "xichengqu", "shijingshanqu", "fengtaiqu", "shunyiqu",
12 |                    "mentougouqu", "tongzhouqu", "daxingqu", "yizhuangkaifaqu"]]
13 | 
14 |     def parse(self, response):
15 |         max_page = response.xpath("//a[text()='>>']/@href").extract_first()
16 |         # check whether there is more than one result page; if not, parse this page directly
17 |         if max_page:
18 |             max_page_num = re.findall(r'g(\d+)\/', max_page)[0]
19 |             for page in range(1, int(max_page_num) + 1):
20 |                 yield scrapy.Request(response.url + f"g{page}/", callback=self.page_parse)
21 |         else:
22 |             yield from self.page_parse(response)
23 | 
24 |     def page_parse(self, response):
25 |         region = response.xpath("//p[@class='termcon fl ']/span[@class='curr']/text()").extract_first()
26 |         date = datetime.datetime.now().strftime("%Y-%m-%d")
27 |         divs = response.xpath("//div[@class='house-item clearfix']")
28 |         for div in divs:
29 |             room = div.xpath(".//p[@class='house-name']/span[2]/text()").extract_first()
30 |             area = div.xpath(".//p[@class='house-name']/span[4]/text()").extract_first()
31 |             total_price = div.xpath(".//p[@class='price-nub cRed tc']/span/text()").extract_first()
32 |             unit_price = div.xpath(".//p[@class='price-txt tc']/text()").extract_first()
33 |             community = div.xpath(".//p[@class='house-name']/a/text()").extract_first()
34 |             yield ErshoufangItem(
35 |                 bedroom=int(re.findall("(\d)室", room)[0]),
36 |                 living_room=int(re.findall("(\d)厅", room)[0] if '厅' in room else '0'),
37 |                 area=float(re.findall("(.+)平", area)[0]),
38 |                 total_price=int(float(total_price)),
39 |                 unit_price=int(float(re.findall('(\d+)元/平', unit_price)[0])),
40 |                 region=region,
41 |                 community=community,
42 |                 agency="中原地产",
43 |                 date=date
44 |             )
--------------------------------------------------------------------------------
/ErShouFang/spiders/__init__.py:
--------------------------------------------------------------------------------
1 | # This package will contain the spiders of your Scrapy project
2 | #
3 | # Please refer to the documentation for information on how to create and manage
4 | # your spiders.
5 | 
--------------------------------------------------------------------------------
/ErShouFang/二手房数据.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/二手房数据.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房在售数量.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房在售数量.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房平均价格.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房平均价格.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房平均面积.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房平均面积.png
--------------------------------------------------------------------------------
/ErShouFang/北京二手房数据分布.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京二手房数据分布.png
--------------------------------------------------------------------------------
/ErShouFang/北京最贵小区表.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/LemonBottom/ErShouFang/a956514ecbe0fa4eed418c56286298fd69658d46/ErShouFang/北京最贵小区表.xls
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ErShouFang
2 | Scrapes the second-hand homes currently listed by four Beijing real-estate agencies — Lianjia (链家), 5i5j (我爱我家), Fang.com (房天下) and Centaline (中原地产) — renders the data as charts, and runs a short analysis.
3 | 
4 | ## Overview
5 | To gauge the state of Beijing's second-hand housing market, agency listings are about as persuasive as data gets. This crawler collected more than 140,000 listings from the four agency sites, covering every district of Beijing; a brief analysis of the resulting charts follows.
6 | On the technical side the project is built on Scrapy: two downloader middlewares handle proxies, cookies and request headers, a custom extension keeps the proxy pool refreshed automatically, items are written to MongoDB, MongoDB aggregation groups the data and computes counts, sums and averages, and pyecharts renders the results as bar and pie charts (a condensed sketch of the aggregation step is shown after the first chart below).
7 | 
8 | ## Data analysis
9 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房平均价格.png?raw=true)
10 | Of all listings in Beijing, Xicheng has the highest average price — a startling 110,000 yuan per square metre — followed by Dongcheng at close to 100,000, then Haidian and Chaoyang. The overall ranking matches the figures published by Fang.com, but the Haidian average differs noticeably: this crawler measures roughly 90,000 yuan/m² against Fang.com's 84,000, a gap of 6,000. That gap is plausibly tied to the school-district policy change at the end of the year: according to a notice from Shangdi Experimental School shared by the Weibo user 雪球, the Haidian education commission's 2018 compulsory-education enrolment plan states that "from 1 January 2019, a home newly registered in Haidian and used to apply for school admission will no longer be tied to a single school; multi-school catchment allocation will apply." This may be one reason Haidian's average is unstable — some sellers adjust their asking prices promptly as policy shifts.
11 | 
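As mentioned in the overview, each chart is produced by one MongoDB aggregation fed into pyecharts. The block below is a condensed, self-contained sketch of that step for the average-price chart above; it mirrors `render_data` in `ErShouFang/dataSight.py` and assumes only a local MongoDB instance whose `ErShouFang.houses` collection has been filled by the spiders.

```python
import pymongo
from pyecharts import Bar  # pyecharts 0.x API, as used in dataSight.py

houses = pymongo.MongoClient('127.0.0.1', 27017).ErShouFang.houses

# Group listings by district and average the per-square-metre price.
pipeline = [{"$group": {"_id": "$region", "result": {"$avg": "$unit_price"}}}]
rows = sorted(houses.aggregate(pipeline), key=lambda r: r['result'], reverse=True)

bar = Bar("北京二手房平均价格", "作者:Chen")
bar.add("单位:元/平方米",
        [r['_id'] for r in rows],          # district names on the x axis
        [int(r['result']) for r in rows],  # average unit price per district
        xaxis_interval=0, is_label_show=True)
bar.render(path='北京二手房平均价格.png')
```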
12 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房在售数量.png?raw=true)
13 | Where are the most second-hand homes on sale? Chaoyang and Haidian. The data also show that the market in Beijing's near suburbs is busier than in Dongcheng and Xicheng. As the subway keeps extending outward, the near suburbs will only become better connected and their housing market hotter — prices there average around 50,000 yuan/m², a full tier below the central districts.
14 | 
15 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房平均面积.png?raw=true)
16 | Homes in the near suburbs are, on average, larger than those in the central districts.
17 | 
18 | ![image](https://github.com/LemonBottom/ErShouFang/blob/master/ErShouFang/北京二手房数据分布.png?raw=true)
19 | Compared with the two agencies that list prices openly, Lianjia and 5i5j, Fang.com contributes the most listings and Centaline the fewest.
20 | 
21 | Where are Beijing's most expensive communities?
22 | 
23 | Xicheng (西城区)
24 | 
25 | | Community | Average price (yuan/m²) |
26 | | :------: | :------: |
27 | | 丰融园 | 186127 |
28 | | 中海凯旋 | 177202 |
29 | | 丰侨公寓 | 176112 |
30 | | 丰汇园 | 175823 |
31 | | 北京尊府 | 172882 |
32 | 
33 | Dongcheng (东城区)
34 | 
35 | | Community | Average price (yuan/m²) |
36 | | :------: | :------: |
37 | | 北下洼子胡同 | 165664 |
38 | | 府学胡同 | 161980 |
39 | | 八宝坑胡同 | 160336 |
40 | | 普渡寺小区 | 151491 |
41 | | 长安太和 | 140198 |
42 | 
43 | Chaoyang (朝阳区)
44 | 
45 | | Community | Average price (yuan/m²) |
46 | | :------: | :------: |
47 | | 泛海世家 | 178752 |
48 | | 红玺台 | 151880 |
49 | | 北京壹号院 | 149892 |
50 | | 翰林阁 | 149176 |
51 | | 太阳公元 | 140682 |
52 | 
53 | Haidian (海淀区)
54 | 
55 | | Community | Average price (yuan/m²) |
56 | | :------: | :------: |
57 | | 万柳书院 | 192737 |
58 | | 万城华府 | 182129 |
59 | | 万城华府海园 | 179488 |
60 | | 保利海德公园 | 166471 |
61 | | 光大水墨风景 | 147323 |
62 | 
63 | 
64 | 
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
1 | # Automatically created by: scrapy startproject
2 | #
3 | # For more information about the [deploy] section see:
4 | # https://scrapyd.readthedocs.io/en/latest/deploy.html
5 | 
6 | [settings]
7 | default = ErShouFang.settings
8 | 
9 | [deploy]
10 | #url = http://localhost:6800/
11 | project = ErShouFang
12 | 
--------------------------------------------------------------------------------