├── lib
    ├── __init__.py
    ├── zone
    │   ├── __init__.py
    │   ├── area.py
    │   ├── district.py
    │   └── city.py
    ├── const
    │   ├── __init__.py
    │   └── xpath.py
    ├── spider
    │   ├── __init__.py
    │   ├── base_spider.py
    │   ├── loupan_spider.py
    │   ├── xiaoqu_spider.py
    │   ├── ershou_spider.py
    │   └── zufang_spider.py
    ├── utility
    │   ├── __init__.py
    │   ├── version.py
    │   ├── writer.py
    │   ├── log.py
    │   ├── date.py
    │   └── path.py
    ├── item
    │   ├── __init__.py
    │   ├── loupan.py
    │   ├── xiaoqu.py
    │   ├── ershou.py
    │   └── zufang.py
    └── request
    │   ├── __init__.py
    │   ├── proxy.py
    │   └── headers.py
├── .gitignore
├── pic
    ├── xiaoqu_top.png
    └── district_top.png
├── log
    └── __init__.py
├── requirements.txt
├── ershou.py
├── loupan.py
├── zufang.py
├── xiaoqu.py
├── tool
    ├── clean.py
    ├── ershou_number.py
    ├── lianjia_xiaoqu.sql
    ├── ershou_image_with_threads.py
    └── ershou_image_with_coroutine.py
├── test
    └── date_test.py
├── xiaoqu_to_chart.py
├── README.md
├── xiaoqu_to_db.py
└── LICENSE


/lib/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lib/zone/__init__.py:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/lib/const/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # 


--------------------------------------------------------------------------------
/lib/spider/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # 


--------------------------------------------------------------------------------
/lib/utility/__init__.py:
--------------------------------------------------------------------------------
1 | # coding=utf-8
2 | # 


--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | *.pyc
2 | .idea/
3 | data/
4 | *.html
5 | *.csv
6 | log/log.txt
7 | 


--------------------------------------------------------------------------------
/pic/xiaoqu_top.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crush0nyou/beike-lianjia/HEAD/pic/xiaoqu_top.png


--------------------------------------------------------------------------------
/pic/district_top.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Crush0nyou/beike-lianjia/HEAD/pic/district_top.png


--------------------------------------------------------------------------------
/lib/item/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # author: zengyuetian
4 | 
5 | 
6 | if __name__ == '__main__':
7 |     pass


--------------------------------------------------------------------------------
/lib/request/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # author: zengyuetian
4 | 
5 | 
6 | if __name__ == '__main__':
7 |     pass


--------------------------------------------------------------------------------
/log/__init__.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # coding=utf-8
3 | # author: zengyuetian
4 | # 此代码仅供学习与交流，请勿用于商业用途。
5 | 
6 | 
7 | if __name__ == '__main__':
8 |     pass


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
 1 | threadpool
 2 | bs4
 3 | pymysql
 4 | requests
 5 | lxml
 6 | records
 7 | pymongo
 8 | xlsxwriter
 9 | numpy
10 | pandas
11 | pyecharts
12 | pyecharts-snapshot
13 | 


--------------------------------------------------------------------------------
/ershou.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: Zeng YueTian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 获得指定城市的二手房数据
 6 | 
 7 | from lib.spider.ershou_spider import *
 8 | 
 9 | if __name__ == "__main__":
10 |     spider = ErShouSpider(SPIDER_NAME)
11 |     spider.start()
12 | 
13 | 


--------------------------------------------------------------------------------
/loupan.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: Zeng YueTian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 获得指定城市的所有新房楼盘数据
 6 | 
 7 | from lib.spider.loupan_spider import *
 8 | 
 9 | if __name__ == "__main__":
10 |     spider = LouPanBaseSpider(SPIDER_NAME)
11 |     spider.start()
12 | 
13 | 


--------------------------------------------------------------------------------
/zufang.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: Zeng YueTian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 获得指定城市的出租房数据
 6 | 
 7 | 
 8 | from lib.spider.zufang_spider import *
 9 | 
if __name__ == "__main__":
    # Build the rental listing spider for the configured site and run it.
    ZuFangBaseSpider(SPIDER_NAME).start()
13 | 
14 | 


--------------------------------------------------------------------------------
/xiaoqu.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: Zeng YueTian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 获得指定城市的小区数据
 6 | # 这些数据包括:
 7 | # 日期,所属区县,板块名,小区名,挂牌均价,挂牌数
 8 | # 20180221,浦东,川沙,恒纬家苑,32176元/m2,3套在售二手房
 9 | 
10 | from lib.spider.xiaoqu_spider import *
11 | 
if __name__ == "__main__":
    # Build the residential-community spider for the configured site and run it.
    XiaoQuBaseSpider(SPIDER_NAME).start()
15 | 


--------------------------------------------------------------------------------
/lib/utility/version.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 判断Python版本环境
 6 | 
 7 | 
 8 | import sys
 9 | 
# True when the interpreter is Python 3 or newer, False on Python 2.
PYTHON_3 = sys.version_info >= (3, 0)

if not PYTHON_3:
    # Python 2 only: re-expose setdefaultencoding and make utf-8 the default
    # so implicit str/unicode conversions do not fail on Chinese text.
    reload(sys)
    sys.setdefaultencoding("utf-8")
18 | 


--------------------------------------------------------------------------------
/lib/utility/writer.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 保存结果到文件
 6 | 
 7 | from lib.utility.path import *
 8 | 
 9 | 
def write_urls_to_file(file_name, urls):
    """
    Write each URL on its own line to a file under DATA_PATH.

    The target file is opened in 'w' mode, so existing content is replaced.
    :param file_name: file name relative to DATA_PATH
    :param urls: iterable of URL strings
    :return: None
    """
    file_name = DATA_PATH + "/" + file_name
    # The context manager closes the handle even if a write raises.
    with open(file_name, 'w') as txt_file:
        for url in urls:
            txt_file.write(url + "\n")
16 | 


--------------------------------------------------------------------------------
/lib/utility/log.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | 
 6 | import logging
 7 | from lib.utility.path import LOG_PATH
 8 | 
 9 | logger = logging.getLogger(__name__)
10 | logger.setLevel(level=logging.INFO)
11 | handler = logging.FileHandler(LOG_PATH + "/log.txt")
12 | handler.setLevel(logging.INFO)
13 | formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')
14 | handler.setFormatter(formatter)
15 | logger.addHandler(handler)
16 | 
17 | if __name__ == '__main__':
18 |     pass


--------------------------------------------------------------------------------
/tool/clean.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 清理结果文件
 6 | 
 7 | from lib.utility.path import *
 8 | 
 9 | if __name__ == '__main__':
10 |     # 删除日志
11 |     os.system("rm -rf {0}/*.txt".format(LOG_PATH))
12 | 
13 |     # 删除爬取的数据
14 |     os.system("rm -rf {0}/*".format(DATA_PATH))
15 | 
16 |     # 删除HTML
17 |     os.system("rm -rf {0}/*.html".format(ROOT_PATH))
18 | 
19 |     # 删除csv
20 |     os.system("rm -rf {0}/*.csv".format(ROOT_PATH))
21 | 
22 |     # 删除json
23 |     os.system("rm -rf {0}/*.json".format(ROOT_PATH))


--------------------------------------------------------------------------------
/lib/item/loupan.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 新房楼盘的数据结构
 6 | 
 7 | 
 8 | class LouPan(object):
 9 |     def __init__(self, xiaoqu, price, total):
10 |         # self.district = district
11 |         # self.area = area
12 |         self.xiaoqu = xiaoqu
13 |         # self.address = address
14 |         # self.size = size
15 |         self.price = price
16 |         self.total = total
17 | 
18 |     def text(self):
19 |         return self.xiaoqu + "," + \
20 |                 self.price + "," + \
21 |                 self.total
22 | 


--------------------------------------------------------------------------------
/lib/item/xiaoqu.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 小区信息的数据结构
 6 | 
 7 | 
 8 | class XiaoQu(object):
 9 |     def __init__(self, district, area, name, price, on_sale):
10 |         self.district = district
11 |         self.area = area
12 |         self.price = price
13 |         self.name = name
14 |         self.on_sale = on_sale
15 | 
16 |     def text(self):
17 |         return self.district + "," + \
18 |                 self.area + "," + \
19 |                 self.name + "," + \
20 |                 self.price + "," + \
21 |                 self.on_sale
22 | 


--------------------------------------------------------------------------------
/lib/item/ershou.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 二手房信息的数据结构
 6 | 
 7 | 
 8 | class ErShou(object):
 9 |     def __init__(self, district, area, name, price, desc, pic):
10 |         self.district = district
11 |         self.area = area
12 |         self.price = price
13 |         self.name = name
14 |         self.desc = desc
15 |         self.pic = pic
16 | 
17 |     def text(self):
18 |         return self.district + "," + \
19 |                 self.area + "," + \
20 |                 self.name + "," + \
21 |                 self.price + "," + \
22 |                 self.desc + "," + \
23 |                 self.pic
24 | 


--------------------------------------------------------------------------------
/test/date_test.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | 
 6 | import unittest
 7 | from lib.utility.date import *
 8 | 
 9 | 
class DateTest(unittest.TestCase):
    """Length checks for the string helpers imported from lib.utility.date."""

    def test_time_string(self):
        """The full timestamp (date plus time of day) has 14 characters."""
        length = len(get_time_string())
        self.assertEqual(length, 14)

    def test_date_string(self):
        """The date string has 8 characters."""
        length = len(get_date_string())
        self.assertEqual(length, 8)

    def test_year_string(self):
        """The year-month string has 6 characters."""
        length = len(get_year_month_string())
        self.assertEqual(length, 6)
25 | 
26 | 
27 | if __name__ == '__main__':
28 |     unittest.main()
29 | 


--------------------------------------------------------------------------------
/lib/item/zufang.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 租房信息的数据结构
 6 | 
 7 | 
 8 | class ZuFang(object):
 9 |     def __init__(self, district, area, xiaoqu, layout, size, price):
10 |         self.district = district
11 |         self.area = area
12 |         self.xiaoqu = xiaoqu
13 |         self.layout = layout
14 |         self.size = size
15 |         self.price = price
16 | 
17 |     def text(self):
18 |         return self.district + "," + \
19 |                 self.area + "," + \
20 |                 self.xiaoqu + "," + \
21 |                 self.layout + "," + \
22 |                 self.size + "," + \
23 |                 self.price
24 | 


--------------------------------------------------------------------------------
/lib/utility/date.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 日期和时间的字符串辅助函数
 6 | 
 7 | import time
 8 | 
 9 | 
def get_time_string():
    """
    Return the current local time as a year-month-day-hour-minute-second
    string such as 20161010120000.
    :return: timestamp string formatted with %Y%m%d%H%M%S
    """
    # strftime falls back to the current local time when no time tuple is given.
    return time.strftime("%Y%m%d%H%M%S")
17 | 
18 | 
def get_date_string():
    """
    Return the current local date as a year-month-day string such as 20161010.
    :return: date string formatted with %Y%m%d
    """
    # strftime falls back to the current local time when no time tuple is given.
    return time.strftime("%Y%m%d")
26 | 
27 | 
def get_year_month_string():
    """
    Return the current local year and month as a string such as 201610.
    :return: string formatted with %Y%m
    """
    # strftime falls back to the current local time when no time tuple is given.
    return time.strftime("%Y%m")
35 | 
36 | 
37 | if __name__ == "__main__":
38 |     print(get_date_string())
39 | 


--------------------------------------------------------------------------------
/tool/ershou_number.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 获取城市挂牌二手房数量
 6 | 
 7 | import time
 8 | from lib.spider.base_spider import SPIDER_NAME
 9 | from bs4 import BeautifulSoup
10 | from lib.zone.city import cities
11 | import requests
12 | 
13 | numbers = dict()
14 | 
15 | 
def get_ershou_number(city):
    """
    Fetch the second-hand listing page of a city and record the listing count.

    The count is parsed from the text of the <h2 class="total"> element and
    stored in the module-level ``numbers`` dict under the city code.  When the
    element is missing, a message is printed and nothing is stored.
    :param city: city code used as the sub-domain, e.g. "sh"
    :return: None
    """
    url = "https://{0}.{1}.com/ershoufang/".format(city, SPIDER_NAME)
    print(url)
    # A timeout keeps one unresponsive city from hanging the whole run.
    response = requests.get(url, timeout=10)
    soup = BeautifulSoup(response.content, "lxml")
    element = soup.find('h2', class_='total')
    if element is None:
        # No total element on the page; skip this city instead of crashing.
        print("no listing total found for {0}".format(city))
        return
    # The count is the second space-separated token of the element text.
    number = int(element.text.split(" ")[1].strip())
    numbers[city] = number
25 | 
26 | 
if __name__ == '__main__':
    began = time.time()
    # Collect the listing count of every configured city.
    for city_code in cities:
        get_ershou_number(city_code)
    # Print the Chinese city name next to its count.
    for city_code, count in numbers.items():
        print(cities[city_code], count)
    print("cost {0} seconds".format(time.time() - began))
35 | 


--------------------------------------------------------------------------------
/lib/const/xpath.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 页面元素的XPATH
 6 | 
 7 | from lib.spider.base_spider import SPIDER_NAME, LIANJIA_SPIDER, BEIKE_SPIDER
 8 | 
# XPath expressions are selected per target site.  The two branches currently
# define identical values (presumably kept separate so each site can diverge
# later - confirm).  If SPIDER_NAME matches neither constant, none of the
# names below are defined.
# NOTE(review): CITY_DISTRICT_XPATH starts with "///" while the other
# expressions start with "//" - confirm this is intentional.
if SPIDER_NAME == LIANJIA_SPIDER:
    ERSHOUFANG_QU_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div/a'
    ERSHOUFANG_BANKUAI_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div[2]/a'
    XIAOQU_QU_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div/a'
    XIAOQU_BANKUAI_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div[2]/a'
    DISTRICT_AREA_XPATH = '//div[3]/div[1]/dl[2]/dd/div/div[2]/a'
    CITY_DISTRICT_XPATH = '///div[3]/div[1]/dl[2]/dd/div/div/a'
elif SPIDER_NAME == BEIKE_SPIDER:
    ERSHOUFANG_QU_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div/a'
    ERSHOUFANG_BANKUAI_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div[2]/a'
    XIAOQU_QU_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div/a'
    XIAOQU_BANKUAI_XPATH = '//*[@id="filter-options"]/dl[1]/dd/div[2]/a'
    DISTRICT_AREA_XPATH = '//div[3]/div[1]/dl[2]/dd/div/div[2]/a'
    CITY_DISTRICT_XPATH = '///div[3]/div[1]/dl[2]/dd/div/div/a'
23 | 


--------------------------------------------------------------------------------
/lib/request/proxy.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 用于获取代理
 6 | 
 7 | from bs4 import BeautifulSoup
 8 | import requests
 9 | from lib.request.headers import create_headers
10 | 
11 | proxys_src = []
12 | proxys = []
13 | 
14 | 
def spider_proxyip(num=10):
    """
    Scrape proxy entries from the first xicidaili listing page and append
    them to the module-level ``proxys_src`` list.

    Each entry is a one-item dict ``{scheme: "scheme://host:port"}`` built
    from the 6th, 2nd and 3rd cells of a table row.  Any failure is printed
    and swallowed (best effort).
    :param num: stop once this many proxies have been collected in this call
                (at least one is collected when a usable row exists)
    :return: None
    """
    try:
        url = 'http://www.xicidaili.com/nt/1'
        # Bound the request so an unreachable proxy site cannot hang the caller.
        req = requests.get(url, headers=create_headers(), timeout=10)
        soup = BeautifulSoup(req.content, 'lxml')
        rows = soup.findAll('tr')

        collected = 0
        # rows[0] is skipped (presumably the table header - confirm).
        for row in rows[1:]:
            tds = row.findAll("td")
            if len(tds) < 6:
                # Row lacks the cells read below; skip it rather than abort.
                continue
            scheme = tds[5].contents[0]
            proxy_host = "{0}://".format(scheme) + tds[1].contents[0] + ":" + tds[2].contents[0]
            # NOTE(review): the key is the scheme text exactly as shown on the
            # page; verify the consumer does not need it lower-cased.
            proxys_src.append({scheme: proxy_host})
            collected += 1
            if collected >= num:
                break
    except Exception as e:
        print("spider_proxyip exception:")
        print(e)
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     spider_proxyip(10)
39 |     print(proxys_src)
40 | 


--------------------------------------------------------------------------------
/lib/zone/area.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 板块信息相关函数
 6 | 
 7 | from lib.zone.district import *
 8 | from lib.const.xpath import *
 9 | from lib.request.headers import *
10 | from lib.spider.base_spider import SPIDER_NAME
11 | 
12 | 
def get_district_url(city, district):
    """
    Build the community-list URL of a district in the given city.

    Uses https, matching the other URL builders for the same site.
    :param city: city code used as the sub-domain
    :param district: district code
    :return: URL string
    """
    return "https://{0}.{1}.com/xiaoqu/{2}".format(city, SPIDER_NAME, district)
21 | 
22 | 
def get_areas(city, district):
    """
    Get the area (sub-district) codes below a district.

    Each area's Chinese name is also recorded in ``chinese_area_dict``.  On
    any failure the exception is printed and the areas collected so far
    (possibly an empty list) are returned, so the result is always a list.
    :param city: city code
    :param district: district code
    :return: list of area codes
    """
    page = get_district_url(city, district)
    areas = list()
    try:
        headers = create_headers()
        response = requests.get(page, timeout=10, headers=headers)
        root = etree.HTML(response.content)
        links = root.xpath(DISTRICT_AREA_XPATH)

        # Process the list of <a> elements
        for link in links:
            # Drop a trailing "/" (only if present) and keep the last segment
            area = link.attrib['href'].rstrip("/").split("/")[-1]
            # Skip the district itself to avoid duplicates
            if area != district:
                chinese_area_dict[area] = link.text
                areas.append(area)
    except Exception as e:
        print(e)
    return areas
55 | 
56 | 
57 | if __name__ == "__main__":
58 |     print(get_areas("sh", "huangpu"))
59 | 
60 | 


--------------------------------------------------------------------------------
/lib/utility/path.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 获得当前目录结构,创建目录结构
 6 | 
 7 | 
 8 | import inspect
 9 | import os
10 | import sys
11 | 
12 | 
def get_root_path():
    """
    Return the directory three levels above this module's file.
    :return: absolute path string
    """
    path = os.path.abspath(inspect.getfile(sys.modules[__name__]))
    # Strip the file name, then two enclosing directories.
    for _ in range(3):
        path = os.path.dirname(path)
    return path
19 | 
20 | 
def _ensure_dir(path):
    """Create path (with parents) if missing; tolerate concurrent creation."""
    try:
        os.makedirs(path)
    except OSError:
        # Already exists (possibly created by another thread between calls);
        # anything else, such as a permission error, is re-raised.
        if not os.path.isdir(path):
            raise
    return path


def create_data_path():
    """
    Ensure <root>/data exists.
    :return: path of the data directory
    """
    return _ensure_dir(get_root_path() + "/data")


def create_site_path(site):
    """
    Ensure <root>/data/<site> exists.
    :param site: site name
    :return: path of the site directory
    """
    return _ensure_dir(create_data_path() + "/" + site)


def create_city_path(site, city):
    """
    Ensure <root>/data/<site>/<city> exists.
    :param site: site name
    :param city: city code
    :return: path of the city directory
    """
    return _ensure_dir(create_site_path(site) + "/" + city)


def create_date_path(site, city, date):
    """
    Ensure <root>/data/<site>/<city>/<date> exists.
    :param site: site name
    :param city: city code
    :param date: date string
    :return: path of the date directory
    """
    return _ensure_dir(create_city_path(site, city) + "/" + date)
51 | 
52 | 
# Path constants, computed once at import time from the project root.
# Defining them does not create the directories.
ROOT_PATH = get_root_path()
DATA_PATH = ROOT_PATH + "/data"
SAMPLE_PATH = ROOT_PATH + "/sample"
LOG_PATH = ROOT_PATH + "/log"
58 | 
59 | if __name__ == "__main__":
60 |     create_date_path("lianjia", "sh", "20160912")
61 |     create_date_path("anjuke", "bj", "20160912")
62 | 


--------------------------------------------------------------------------------
/lib/zone/district.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 获得各城市的区县相关信息
 6 | 
 7 | import requests
 8 | from lxml import etree
 9 | from lib.zone.city import cities
10 | from lib.const.xpath import *
11 | from lib.request.headers import *
12 | from lib.spider.base_spider import SPIDER_NAME
13 | 
14 | chinese_city_district_dict = dict()     # 城市代码和中文名映射
15 | chinese_area_dict = dict()              # 版块代码和中文名映射
16 | area_dict = dict()
17 | 
18 | 
def get_chinese_district(en):
    """
    Translate a district code into its Chinese name.
    :param en: district code
    :return: Chinese name, or None when the code has not been recorded
    """
    # dict.get already yields None for a missing key.
    return chinese_city_district_dict.get(en)
26 | 
27 | 
def get_districts(city):
    """
    Fetch the district codes of a city and record their Chinese names.

    The code/name pairs are stored in ``chinese_city_district_dict``.
    :param city: city code
    :return: list of district codes
    """
    url = 'https://{0}.{1}.com/xiaoqu/'.format(city, SPIDER_NAME)
    response = requests.get(url, timeout=10, headers=create_headers())
    root = etree.HTML(response.content)

    # (code, Chinese name) per link; the code is the second-to-last
    # "/"-separated piece of the href.
    pairs = [
        (element.attrib['href'].split('/')[-2], element.text)
        for element in root.xpath(CITY_DISTRICT_XPATH)
    ]

    # Record the mapping only after every link has been parsed.
    for code, chinese_name in pairs:
        chinese_city_district_dict[code] = chinese_name
    return [code for code, _ in pairs]
52 | 
53 | 
if __name__ == '__main__':
    for key in cities:
        # Find the cities whose page yields no districts:
        # start from an empty mapping for each city.
        chinese_city_district_dict = {}
        get_districts(key)
        if not chinese_city_district_dict:
            print(key)
61 | 


--------------------------------------------------------------------------------
/tool/lianjia_xiaoqu.sql:
--------------------------------------------------------------------------------
 1 | # ************************************************************
 2 | # Sequel Pro SQL dump
 3 | # Version 4541
 4 | #
 5 | # http://www.sequelpro.com/
 6 | # https://github.com/sequelpro/sequelpro
 7 | #
 8 | # Host: 127.0.0.1 (MySQL 5.7.21)
 9 | # Database: lianjia
10 | # Generation Time: 2018-03-31 12:54:33 +0000
11 | # ************************************************************
12 | 
13 | 
14 | /*!40101 SET @OLD_CHARACTER_SET_CLIENT=@@CHARACTER_SET_CLIENT */;
15 | /*!40101 SET @OLD_CHARACTER_SET_RESULTS=@@CHARACTER_SET_RESULTS */;
16 | /*!40101 SET @OLD_COLLATION_CONNECTION=@@COLLATION_CONNECTION */;
17 | /*!40101 SET NAMES utf8 */;
18 | /*!40014 SET @OLD_FOREIGN_KEY_CHECKS=@@FOREIGN_KEY_CHECKS, FOREIGN_KEY_CHECKS=0 */;
19 | /*!40101 SET @OLD_SQL_MODE=@@SQL_MODE, SQL_MODE='NO_AUTO_VALUE_ON_ZERO' */;
20 | /*!40111 SET @OLD_SQL_NOTES=@@SQL_NOTES, SQL_NOTES=0 */;
21 | 
22 | 
23 | # Dump of table xiaoqu
24 | # ------------------------------------------------------------
25 | 
26 | DROP TABLE IF EXISTS `xiaoqu`;
27 | 
28 | CREATE TABLE `xiaoqu` (
29 |   `id` int(11) unsigned NOT NULL AUTO_INCREMENT,
30 |   `city` varchar(10) DEFAULT NULL,
31 |   `date` varchar(8) DEFAULT NULL,
32 |   `district` varchar(50) DEFAULT NULL,
33 |   `area` varchar(50) DEFAULT NULL,
34 |   `xiaoqu` varchar(100) DEFAULT NULL,
35 |   `price` int(11) DEFAULT NULL,
36 |   `sale` int(11) DEFAULT NULL,
37 |   PRIMARY KEY (`id`)
38 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8;
39 | 
40 | 
41 | 
42 | 
43 | /*!40111 SET SQL_NOTES=@OLD_SQL_NOTES */;
44 | /*!40101 SET SQL_MODE=@OLD_SQL_MODE */;
45 | /*!40014 SET FOREIGN_KEY_CHECKS=@OLD_FOREIGN_KEY_CHECKS */;
46 | /*!40101 SET CHARACTER_SET_CLIENT=@OLD_CHARACTER_SET_CLIENT */;
47 | /*!40101 SET CHARACTER_SET_RESULTS=@OLD_CHARACTER_SET_RESULTS */;
48 | /*!40101 SET COLLATION_CONNECTION=@OLD_COLLATION_CONNECTION */;
49 | 


--------------------------------------------------------------------------------
/lib/spider/base_spider.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 爬虫基类
 6 | # 爬虫名常量，用来设置爬取哪个站点
 7 | 
 8 | import threading
 9 | from lib.zone.city import lianjia_cities, beike_cities
10 | from lib.utility.date import *
11 | import lib.utility.version
12 | import random
13 | 
# Thread pool size (presumably read by the concrete spiders - confirm).
thread_pool_size = 50

# Random delay switch, meant to keep the crawler from being banned.
# Set it to False to disable the delay.
# The delay range can be changed in random_delay(); because crawling is
# multi-threaded, a value above 10 is recommended.
RANDOM_DELAY = False
LIANJIA_SPIDER = "lianjia"
BEIKE_SPIDER = "ke"
# Selects which site is crawled; switch by changing the assignment below.
# SPIDER_NAME = LIANJIA_SPIDER
SPIDER_NAME = BEIKE_SPIDER
24 | 
25 | 
class BaseSpider(object):
    """Base class for the spiders: city table, crawl date, counter and lock."""

    @staticmethod
    def random_delay():
        """Sleep a random 0-16 seconds (inclusive) when RANDOM_DELAY is on."""
        if RANDOM_DELAY:
            time.sleep(random.randint(0, 16))

    def __init__(self, name):
        """
        :param name: site name; LIANJIA_SPIDER and BEIKE_SPIDER select their
                     city tables, any other value leaves ``cities`` as None
        """
        self.name = name
        if self.name == LIANJIA_SPIDER:
            self.cities = lianjia_cities
        elif self.name == BEIKE_SPIDER:
            self.cities = beike_cities
        else:
            self.cities = None
        # Date string of the crawl, as returned by get_date_string()
        self.date_string = get_date_string()
        print('Today date is: %s' % self.date_string)

        self.total_num = 0  # running total, used for statistics
        # Report the site this instance was created for, not the module default.
        print("Target site is {0}.com".format(self.name))
        self.mutex = threading.Lock()  # lock created for the spider's threads

    def create_prompt_text(self):
        """
        Build the city-selection prompt from this spider's city table.

        Entries are rendered as "code: name"; every fourth entry ends its
        line and the others are followed by ", ".
        :return: prompt string
        """
        pieces = []
        for position, (en_name, ch_name) in enumerate(self.cities.items(), 1):
            separator = "\n" if position % 4 == 0 else ", "
            pieces.append(en_name + ": " + ch_name + separator)
        return 'Which city do you want to crawl?\n' + ''.join(pieces)

    def get_chinese_city(self, en):
        """
        Translate a city code into its Chinese name.
        :param en: city code
        :return: Chinese name, or None when the code is unknown
        """
        return self.cities.get(en)
73 | 


--------------------------------------------------------------------------------
/lib/request/headers.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # USER AGENTS 可以自己添加
 6 | 
 7 | import random
 8 | from lib.spider.base_spider import SPIDER_NAME
 9 | 
10 | USER_AGENTS = [
11 |     "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
12 |     "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)",
13 |     "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)",
14 |     "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)",
15 |     "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)",
16 |     "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)",
17 |     "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)",
18 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)",
19 |     "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6",
20 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1",
21 |     "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0",
22 |     "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5",
23 |     "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6",
24 |     "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
25 |     "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20",
26 |     "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52",
27 | ]
28 | 
29 | 
def create_headers():
    """
    Build request headers with a randomly chosen User-Agent and a Referer
    pointing at the configured site.
    :return: dict with "User-Agent" and "Referer"
    """
    return {
        "User-Agent": random.choice(USER_AGENTS),
        "Referer": "http://www.{0}.com".format(SPIDER_NAME),
    }
35 | 
36 | 
37 | if __name__ == '__main__':
38 |     pass
39 | 


--------------------------------------------------------------------------------
/lib/zone/city.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # author: zengyuetian
  4 | # 此代码仅供学习与交流，请勿用于商业用途。
  5 | # 城市缩写和城市名的映射
  6 | # 想抓取其他已有城市的话，需要把相关城市信息放入下面的字典中
  7 | # 不过暂时只有下面这些城市在链家上是统一样式
  8 | 
  9 | import sys
 10 | from lib.utility.version import PYTHON_3
 11 | from lib.utility.log import *
 12 | 
# Maps a city abbreviation (the value the user types or passes on the command
# line) to the Chinese city name. Add an entry here to make a city selectable.
cities = {
    'bj': '北京',
    'cd': '成都',
    'cq': '重庆',
    'cs': '长沙',
    'dg': '东莞',
    'dl': '大连',
    'fs': '佛山',
    'gz': '广州',
    'hz': '杭州',
    'hf': '合肥',
    'jn': '济南',
    'nj': '南京',
    'qd': '青岛',
    'sh': '上海',
    'sz': '深圳',
    'su': '苏州',
    'sy': '沈阳',
    'tj': '天津',
    'wh': '武汉',
    'xm': '厦门',
    'yt': '烟台',
}


# Both names are aliases of the very same dict object, not copies.
lianjia_cities = cities
beike_cities = cities
 40 | 
 41 | 
 42 | def create_prompt_text():
 43 |     """
 44 |     根据已有城市中英文对照表拼接选择提示信息
 45 |     :return: 拼接好的字串
 46 |     """
 47 |     city_info = list()
 48 |     count = 0
 49 |     for en_name, ch_name in cities.items():
 50 |         count += 1
 51 |         city_info.append(en_name)
 52 |         city_info.append(": ")
 53 |         city_info.append(ch_name)
 54 |         if count % 4 == 0:
 55 |             city_info.append("\n")
 56 |         else:
 57 |             city_info.append(", ")
 58 |     return 'Which city do you want to crawl?\n' + ''.join(city_info)
 59 | 
 60 | 
 61 | def get_chinese_city(en):
 62 |     """
 63 |     拼音拼音名转中文城市名
 64 |     :param en: 拼音
 65 |     :return: 中文
 66 |     """
 67 |     return cities.get(en, None)
 68 | 
 69 | 
 70 | def get_city():
 71 |     city = None
 72 |     # 允许用户通过命令直接指定
 73 |     if len(sys.argv) < 2:
 74 |         print("Wait for your choice.")
 75 |         # 让用户选择爬取哪个城市的二手房小区价格数据
 76 |         prompt = create_prompt_text()
 77 |         # 判断Python版本
 78 |         if not PYTHON_3:  # 如果小于Python3
 79 |             city = raw_input(prompt)
 80 |         else:
 81 |             city = input(prompt)
 82 |     elif len(sys.argv) == 2:
 83 |         city = str(sys.argv[1])
 84 |         print("City is: {0}".format(city))
 85 |     else:
 86 |         print("At most accept one parameter.")
 87 |         exit(1)
 88 | 
 89 |     chinese_city = get_chinese_city(city)
 90 |     if chinese_city is not None:
 91 |         message = 'OK, start to crawl ' + get_chinese_city(city)
 92 |         print(message)
 93 |         logger.info(message)
 94 |     else:
 95 |         print("No such city, please check your input.")
 96 |         exit(1)
 97 |     return city
 98 | 
 99 | 
if __name__ == '__main__':
    # Manual smoke check: print the Chinese name stored for "sh".
    print(get_chinese_city("sh"))
102 | 


--------------------------------------------------------------------------------
/xiaoqu_to_chart.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 展示小区图表信息（仅仅支持MAC）
 5 | # 1. 杀死之前启动的http服务器
 6 | # 2. 启动一个新的http服务器
 7 | # 3. 用浏览器打开生成的数据html文件
 8 | 
 9 | import pandas as pd
10 | from pyecharts import Bar
11 | 
12 | import os
13 | import time
14 | from lib.utility.version import PYTHON_3
15 | 
if __name__ == '__main__':
    # Opening the charts in a browser is optional: skip it when the
    # webbrowser module cannot be imported.
    try:
        import webbrowser as web
        auto_browse = True
    except Exception:
        auto_browse = False

    if auto_browse:
        # The built-in static file server has a different module name on
        # Python 2 and Python 3.
        server_module = "http.server" if PYTHON_3 else "SimpleHTTPServer"
        try:
            # 1. Kill a server left over from a previous run.
            os.system("ps aux | grep python | grep {0} | grep -v grep "
                      "| awk '{{print $2}}' | xargs kill".format(server_module))
            # 2. Start a new server in the background. The redirections must
            #    come BEFORE '&'; placed after it they apply to an empty
            #    command and the server output is not silenced.
            os.system("python -m {0} 8080 > /dev/null 2>&1 &".format(server_module))
        except Exception as e:
            print(e)

    # The separator is a semicolon because some xiaoqu names contain commas.
    df = pd.read_csv("xiaoqu.csv", encoding="utf-8", sep=";")

    # Total number of rows
    print("row number is {0}".format(len(df.index)))

    # Drop invalid rows whose price is 0
    df = df[df.price > 0]
    print("row number is {0}".format(len(df.index)))

    if df.empty:
        print("No xiaoqu with a positive price in xiaoqu.csv, nothing to chart.")
        raise SystemExit(1)

    ####################################################
    # Most expensive xiaoqu ranking
    ####################################################
    # Re-assign instead of sorting in place: df is a filtered selection and an
    # in-place sort on it triggers pandas' SettingWithCopyWarning.
    df = df.sort_values("price", ascending=False)
    num = 5
    print(df.head(num))
    # Positional access: after filtering and sorting, the LABEL 0 may be gone
    # (KeyError) and is not the first row anyway.
    city = df["city_ch"].iloc[0]
    xqs = df["xiaoqu"].iloc[0:num]
    prices = df["price"].iloc[0:num]
    bar = Bar("{0}小区均价".format(city))
    bar.add("小区均价前{0}名".format(num), xqs, prices, is_stack=True, is_label_show=True, xaxis_interval=0, xaxis_rotate=45)
    bar.render(path="xiaoqu.html")

    ####################################################
    # Average price per district ranking
    ####################################################
    # numeric_only=True: pandas >= 2.0 no longer drops text columns silently
    # and raises when asked for the mean of e.g. the xiaoqu names.
    district_df = df.groupby('district').mean(numeric_only=True)
    district_df = district_df.round(0)
    district_df = district_df.sort_values("price", ascending=False)
    print(district_df)
    districts = district_df.index.tolist()
    prices = district_df["price"]
    bar = Bar("{0}区县均价".format(city))
    bar.add("区县均价排名", districts, prices, is_stack=True, is_label_show=True, xaxis_interval=0, xaxis_rotate=45)
    bar.render(path="district.html")

    # 3. Show both generated pages through the local server.
    if auto_browse:
        web.open("http://localhost:8080/xiaoqu.html", new=0, autoraise=True)
        web.open("http://localhost:8080/district.html", new=0, autoraise=True)
        # Give the browser time to load the pages
        time.sleep(15)
77 | 
78 | 
79 | 


--------------------------------------------------------------------------------
/tool/ershou_image_with_threads.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 多线程下载图片，Python 3.6.1
 6 | 
 7 | 
 8 | import os
 9 | import time
10 | from lib.zone.city import get_chinese_city
11 | from lib.request.headers import create_headers
12 | from lib.utility.date import get_date_string
13 | from lib.spider.base_spider import SPIDER_NAME
14 | from lib.utility.path import DATA_PATH
15 | from tomorrow import threads
16 | import requests
17 | 
18 | 
def get_ershou_img_urls(city):
    """
    Collect the image URLs stored in today's ershou CSV files of a city.

    Every line of every "*csv" file in the city's directory for today is
    scanned; when a line contains "https://", the text from its last
    occurrence to the end of the line is taken as the image URL. Each URL is
    printed, followed by the total count. Exits with status 0 when the
    directory does not exist.

    :param city: city abbreviation, e.g. "sh"
    :return: list of image URLs
    """
    today = get_date_string()
    csv_dir = "{0}/{1}/ershou/{2}/{3}".format(DATA_PATH, SPIDER_NAME, city, today)

    if not os.path.exists(csv_dir):
        print("{0} does not exist.".format(csv_dir))
        print("Please run 'python ershou.py' firstly.")
        print("Bye.")
        exit(0)
    print('OK, start to process ' + get_chinese_city(city))

    csv_paths = [csv_dir + "/" + entry
                 for entry in os.listdir(csv_dir)
                 if entry[-3:] == "csv"]

    urls = []
    for csv_path in csv_paths:
        with open(csv_path, 'r') as handle:
            for raw_line in handle:
                # tail is whatever follows the LAST "https://" on the line
                _, marker, tail = raw_line.strip().rpartition("https://")
                if marker:
                    link = "https://" + tail
                    urls.append(link)
                    print(link)
    print(len(urls))
    return urls
62 | 
63 | 
@threads(50)
def download_images(save_path: str, image_url: str):
    """
    Download one image and store it on disk.

    :param save_path: path of the file the image is written to
    :param image_url: URL the image is downloaded from
    :return: None
    """
    # Send the same crawl headers as the coroutine variant of this tool, and
    # bound the request so a stalled connection cannot block a worker forever.
    resp = requests.get(image_url, headers=create_headers(), timeout=10)
    # The context manager closes the file even when the write fails
    with open(save_path, 'wb') as fp:
        fp.write(resp.content)
75 | 
if __name__ == '__main__':
    # City whose ershou images are downloaded
    start = time.time()
    city = "yt"
    urls = get_ershou_img_urls(city)

    date = get_date_string()
    csv_dir = "{0}/{1}/ershou/{2}/{3}".format(DATA_PATH, SPIDER_NAME, city, date)
    # Images are stored next to the CSV files as <index>.jpg
    to_do = [download_images("{0}/{1}.jpg".format(csv_dir, index), url)
             for index, url in enumerate(urls)]
    print("Start to download, please wait...")
90 | 


--------------------------------------------------------------------------------
/tool/ershou_image_with_coroutine.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python
 2 | # coding=utf-8
 3 | # author: zengyuetian
 4 | # 此代码仅供学习与交流，请勿用于商业用途。
 5 | # 协程下载图片，仅支持Python 3.6.1
 6 | 
 7 | 
 8 | 
 9 | import aiohttp
10 | import aiofiles
11 | import asyncio
12 | import os
13 | import time
14 | from lib.zone.city import get_chinese_city
15 | from lib.request.headers import create_headers
16 | from lib.utility.date import get_date_string
17 | from lib.spider.base_spider import SPIDER_NAME
18 | from lib.utility.path import DATA_PATH
19 | 
20 | 
def get_ershou_img_urls(city):
    """
    Gather the image URLs found in today's ershou CSV files for a city.

    All files whose name ends in "csv" inside the city's directory for today
    are read line by line. A line contributes one URL: the part starting at
    its last "https://". URLs are printed as they are found and the total is
    printed at the end. Exits with status 0 when the directory is missing.

    :param city: city abbreviation, e.g. "sh"
    :return: list of image URLs
    """
    prefix = "https://"
    found = list()
    csv_dir = "{0}/{1}/ershou/{2}/{3}".format(DATA_PATH, SPIDER_NAME, city, get_date_string())

    if os.path.exists(csv_dir):
        print('OK, start to process ' + get_chinese_city(city))
    else:
        print("{0} does not exist.".format(csv_dir))
        print("Please run 'python ershou.py' firstly.")
        print("Bye.")
        exit(0)

    data_files = list()
    for file_name in os.listdir(csv_dir):
        if file_name.endswith("csv"):
            data_files.append(csv_dir + "/" + file_name)

    for data_file in data_files:
        with open(data_file, 'r') as reader:
            for record in reader:
                record = record.strip()
                # rfind gives the start of the last URL on the line, if any
                at = record.rfind(prefix)
                if at == -1:
                    continue
                image_url = record[at:]
                found.append(image_url)
                print(image_url)
    print(len(found))
    return found
64 | 
65 | 
async def download_images(save_path: str, image_url: str):
    """
    Download one image asynchronously and store it on disk.

    :param save_path: path of the file the image is written to
    :param image_url: URL the image is downloaded from
    :return: None
    """
    async with aiohttp.ClientSession() as session:
        async with session.get(image_url, headers=create_headers()) as req:
            image = await req.read()
            # 'async with' closes the file; the handle was previously left
            # open, leaking a descriptor per image and leaving buffered data
            # unflushed
            async with aiofiles.open(save_path, 'wb') as fp:
                await fp.write(image)
77 | 
if __name__ == '__main__':
    # City whose ershou images are downloaded
    start = time.time()
    city = "yt"
    urls = get_ershou_img_urls(city)
    date = get_date_string()
    csv_dir = "{0}/{1}/ershou/{2}/{3}".format(DATA_PATH, SPIDER_NAME, city, date)
    # Images are stored next to the CSV files as <index>.jpg
    to_do = [download_images("{0}/{1}.jpg".format(csv_dir, i), url) for i, url in enumerate(urls)]
    print("Start to download, please wait.")

    async def _run_all(jobs):
        # gather() accepts bare coroutines and an empty list; asyncio.wait()
        # rejects coroutines on Python 3.11+ and raises ValueError on an
        # empty set. return_exceptions=True keeps the run best-effort: one
        # failed image does not abort the others.
        return await asyncio.gather(*jobs, return_exceptions=True)

    # Create the loop explicitly: get_event_loop() without a current loop is
    # deprecated on recent Python versions.
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    try:
        resp = loop.run_until_complete(_run_all(to_do))
    finally:
        loop.close()

    failed = sum(1 for outcome in resp if isinstance(outcome, Exception))
    if failed:
        print("{0} downloads failed.".format(failed))
    print("Download {0} images, cost {1} seconds.".format(len(urls), time.time() - start))
96 | 


--------------------------------------------------------------------------------
/lib/spider/loupan_spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # author: zengyuetian
  4 | # 此代码仅供学习与交流，请勿用于商业用途。
  5 | # 爬取楼盘数据的爬虫派生类
  6 | 
  7 | import re
  8 | import math
  9 | import requests
 10 | from bs4 import BeautifulSoup
 11 | from lib.item.loupan import *
 12 | from lib.spider.base_spider import *
 13 | from lib.request.headers import *
 14 | from lib.utility.date import *
 15 | from lib.utility.path import *
 16 | from lib.zone.city import get_city
 17 | from lib.utility.log import *
 18 | import lib.utility.version
 19 | 
 20 | 
 21 | class LouPanBaseSpider(BaseSpider):
 22 |     def collect_city_loupan_data(self, city_name, fmt="csv"):
 23 |         """
 24 |         将指定城市的新房楼盘数据存储下来，默认存为csv文件
 25 |         :param city_name: 城市
 26 |         :param fmt: 保存文件格式
 27 |         :return: None
 28 |         """
 29 |         csv_file = self.today_path + "/{0}.csv".format(city_name)
 30 |         with open(csv_file, "w") as f:
 31 |             # 开始获得需要的板块数据
 32 |             loupans = self.get_loupan_info(city_name)
 33 |             self.total_num = len(loupans)
 34 |             if fmt == "csv":
 35 |                 for loupan in loupans:
 36 |                     f.write(self.date_string + "," + loupan.text() + "\n")
 37 |         print("Finish crawl: " + city_name + ", save data to : " + csv_file)
 38 | 
 39 |     @staticmethod
 40 |     def get_loupan_info(city_name):
 41 |         """
 42 |         爬取页面获取城市新房楼盘信息
 43 |         :param city_name: 城市
 44 |         :return: 新房楼盘信息列表
 45 |         """
 46 |         total_page = 1
 47 |         loupan_list = list()
 48 |         page = 'http://{0}.fang.{1}.com/loupan/'.format(city_name, SPIDER_NAME)
 49 |         print(page)
 50 |         headers = create_headers()
 51 |         response = requests.get(page, timeout=10, headers=headers)
 52 |         html = response.content
 53 |         soup = BeautifulSoup(html, "lxml")
 54 | 
 55 |         # 获得总的页数
 56 |         try:
 57 |             page_box = soup.find_all('div', class_='page-box')[0]
 58 |             matches = re.search('.*data-total-count="(\d+)".*', str(page_box))
 59 |             total_page = int(math.ceil(int(matches.group(1)) / 10))
 60 |         except Exception as e:
 61 |             print("\tWarning: only find one page for {0}".format(city_name))
 62 |             print(e)
 63 | 
 64 |         print(total_page)
 65 |         # 从第一页开始,一直遍历到最后一页
 66 |         headers = create_headers()
 67 |         for i in range(1, total_page + 1):
 68 |             page = 'http://{0}.fang.{1}.com/loupan/pg{2}'.format(city_name, SPIDER_NAME, i)
 69 |             print(page)
 70 |             BaseSpider.random_delay()
 71 |             response = requests.get(page, timeout=10, headers=headers)
 72 |             html = response.content
 73 |             soup = BeautifulSoup(html, "lxml")
 74 | 
 75 |             # 获得有小区信息的panel
 76 |             house_elements = soup.find_all('li', class_="resblock-list")
 77 |             for house_elem in house_elements:
 78 |                 price = house_elem.find('span', class_="number")
 79 |                 total = house_elem.find('div', class_="second")
 80 |                 loupan = house_elem.find('a', class_='name')
 81 | 
 82 |                 # 继续清理数据
 83 |                 try:
 84 |                     price = price.text.strip()
 85 |                 except Exception as e:
 86 |                     price = '0'
 87 | 
 88 |                 loupan = loupan.text.replace("\n", "")
 89 | 
 90 |                 try:
 91 |                     total = total.text.strip().replace(u'总价', '')
 92 |                     total = total.replace(u'/套起', '')
 93 |                 except Exception as e:
 94 |                     total = '0'
 95 | 
 96 |                 print("{0} {1} {2} ".format(
 97 |                     loupan, price, total))
 98 | 
 99 |                 # 作为对象保存
100 |                 loupan = LouPan(loupan, price, total)
101 |                 loupan_list.append(loupan)
102 |         return loupan_list
103 | 
104 |     def start(self):
105 |         city = get_city()
106 |         print('Today date is: %s' % self.date_string)
107 |         self.today_path = create_date_path("{0}/loupan".format(SPIDER_NAME), city, self.date_string)
108 | 
109 |         t1 = time.time()  # 开始计时
110 |         self.collect_city_loupan_data(city)
111 |         t2 = time.time()  # 计时结束，统计结果
112 | 
113 |         print("Total crawl {0} loupan.".format(self.total_num))
114 |         print("Total cost {0} second ".format(t2 - t1))
115 | 
116 | 
if __name__ == '__main__':
    # Intentionally empty: running this module directly does nothing.
    pass
119 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # 链家网(lianjia.com)和贝壳网(ke.com)爬虫
  2 | - 爬取链家网、贝壳网的各类房价数据（小区数据，挂牌二手房, 出租房，新房）。
  3 | - **如果好用，请点星支持 ！**
  4 | - 支持北京上海广州深圳等国内21个主要城市；支持Python2和Python3; 基于页面的数据爬取，稳定可靠; 丰富的代码注释，帮助理解代码并且方便扩展功能。
  5 | - 数据含义：城市-city, 区县-district, 板块-area, 小区-xiaoqu, 二手房-ershou, 租房-zufang， 新房-loupan。
  6 | - 每个版块存储为一个csv文件，该文件可以作为原始数据进行进一步的处理和分析。
  7 | - 支持图表展示。
  8 | ![alt text](https://github.com/jumper2014/lianjia-spider/blob/master/pic/xiaoqu_top.png)
  9 | ![alt text](https://github.com/jumper2014/lianjia-spider/blob/master/pic/district_top.png)
 10 | - 如果链家和贝壳页面结构有调整，欢迎反馈，我将尽力保持更新。
 11 | - 此代码仅供学习与交流，请勿用于商业用途，后果自负。
 12 | 
 13 | ## 安装依赖
 14 | - pip install -r requirements.txt
 15 | - 运行前，请将当前目录加入到系统环境变量PYTHONPATH中。
 16 | - 运行前，请指定要爬取的网站，见lib/spider/base_spider.py里面的SPIDER_NAME变量。
 17 | - 清理数据，运行 python tool/clean.py
 18 | 
 19 | ## 快速问答
 20 | - Q: 如何降低爬取速度，避免被封IP？A:见base_spider.py里面的RANDOM_DELAY
 21 | - Q: 如何减少并发的爬虫数？ A: 见base_spider.py的thread_pool_size
 22 | - Q: 为何无法使用xiaoqu_to_chart.py? A: 该脚本现仅支持mac系统
 23 | - Q: 有其他问题反馈途径么？ A: 问题反馈QQ群号635276285。
 24 | 
 25 | ## 小区房价数据爬取
 26 | - 内容格式：采集日期,所属区县,板块名,小区名,挂牌均价,挂牌数
 27 | - 内容如下：20180221,浦东,川沙,恒纬家苑,32176元/m2,3套在售二手房
 28 | - 数据可以存入MySQL/MongoDB数据库，用于进一步数据分析，比如排序，计算区县和版块均价。
 29 | - MySQL数据库结构可以通过导入tool/lianjia_xiaoqu.sql建立。
 30 | - MySQL数据格式: 城市 日期 所属区县 版块名 小区名 挂牌均价 挂牌数
 31 | - MySQL数据内容：上海 20180331 徐汇 衡山路 永嘉路621号 333333 0
 32 | - MongoDB数据内容: { "_id" : ObjectId("5ac0309332e3885598b3b751"), "city" : "上海", "district" : "黄浦", "area" : "五里桥", "date" : "20180331", "price" : 81805, "sale" : 11, "xiaoqu" : "桥一小区" }
 33 | - Excel数据内容：上海 20180331 徐汇 衡山路 永嘉路621号 333333 0
 34 | - 运行, python xiaoqu.py 根据提示输入城市代码，回车确认，开始采集数据到csv文件
 35 | - 运行, python xiaoqu.py city, 自动开始采集数据到csv文件
 36 | ```
 37 | hz: 杭州, sz: 深圳, dl: 大连, fs: 佛山
 38 | xm: 厦门, dg: 东莞, gz: 广州, bj: 北京
 39 | cd: 成都, sy: 沈阳, jn: 济南, sh: 上海
 40 | tj: 天津, qd: 青岛, cs: 长沙, su: 苏州
 41 | cq: 重庆, wh: 武汉, hf: 合肥, yt: 烟台
 42 | nj: 南京, 
 43 | ```
 44 | - 修改 xiaoqu_to_db.py 中的database变量，设置数据最终存入mysql/mongodb/Excel/json
 45 | - python xiaoqu_to_db.py 根据提示将今天采集到的csv数据存入数据库。(默认导出为单一csv文件)
 46 | - python xiaoqu_to_chart.py 将单一csv文件数据通过图表展示。
 47 | 
 48 | ## 挂牌二手房数据爬取
 49 | - 获取链家网挂牌二手房价数据，数据格式如下：
 50 | - 20180405,浦东,万祥镇,祥安菊苑 3室2厅 258万,258万,祥安菊苑  | 3室2厅 | 126.58平米 | 南 | 毛坯
 51 | - 运行，python ershou.py 根据提示输入城市代码，回车确认，开始采集数据到csv文件
 52 | - 运行，python ershou.py city，自动开始采集数据到csv文件
 53 | 
 54 | 
 55 | ## 出租房数据爬取
 56 | - 获取链家网挂牌出租房数据，数据格式如下：
 57 | - 20180407,浦东,御桥,仁和都市花园  ,3室2厅,100平米,8000
 58 | - 运行，python zufang.py 根据提示输入城市代码，回车确认，开始采集数据到csv文件
 59 | - 运行，python zufang.py city，自动开始采集数据到csv文件
 60 | 
 61 | ## 新房数据爬取
 62 | - 获取链家网新房数据，数据格式如下：
 63 | - 20180407,上海星河湾,76000,1672万
 64 | - 运行，python loupan.py 根据提示输入城市代码，回车确认，开始采集数据到csv文件
 65 | - 运行，python loupan.py city，自动开始采集数据到csv文件
 66 | 
 67 | ## 结果存储
 68 | - 根目录下建立data目录存放结果数据文件
 69 | - 小区房价数据存储目录为 data/site/xiaoqu/city/date
 70 | - 二手房房价数据存储目录为 data/site/ershou/city/date
 71 | - 出租房房价数据存储目录为 data/site/zufang/city/date
 72 | - 新房房价数据存储目录为 data/site/loupan/city/date
 73 | 
 74 | ## 性能
 75 | - 300秒爬取上海市207个版块的2.7万条小区数据，平均每秒90条数据。
 76 | ```
 77 | Total crawl 207 areas.
 78 | Total cost 294.048109055 second to crawl 27256 data items.
 79 | ```
 80 | - 1000秒爬取上海215个版块的7.5万条挂牌二手房数据，平均每秒75条数据。
 81 | ```
 82 | Total crawl 215 areas.
 83 | Total cost 1028.3090899 second to crawl 75448 data items.
 84 | ```
 85 | - 300秒爬取上海215个版块的3.2万条出租房数据, 平均每秒150条数据。
 86 | ```
 87 | Total crawl 215 areas.
 88 | Total cost 299.7534770965576 second to crawl 32735 data items.
 89 | ```
 90 | - 30秒爬取上海400个新盘数据。
 91 | ```
 92 | Total crawl 400 loupan.
 93 | Total cost 29.757128953933716 second
 94 | ```
 95 | 
 96 | 
 97 | 
 98 | ### 更新记录
 99 | - 2019/06/21 去除requirements.txt中的webbrowser
100 | - 2018/11/05 增加工具下载二手房缩略图tool/download_ershou_image.py
101 | - 2018/11/01 增加二手房缩略图地址
102 | - 2018/10/28 xiaoqu_to_db.py改造成支持命令行参数自动运行。
103 | - 2018/10/25 将主要爬取代码抽取到spider类中。
104 | - 2018/10/22 文件名，目录，代码重构。
105 | - 2018/10/20 增加中间文件清理功能，能够爬取贝壳网的小区，新房，二手房和租房数据。
106 | - 2018/10/19 支持贝壳网小区数据爬取
107 | - 2018/10/15 增加Spider类，优化异常处理，功能无变动
108 | - 2018/10/14 允许用户通过命令行指定要爬取的城市，而不仅仅通过交互模式选择，用于支持自动爬取。
109 | - 2018/10/11 增加初步log功能。
110 | - 2018/10/09 图表展示区县均价排名。
111 | - 2018/10/07 小区房价导出到json文件, csv文件。图表展示最贵的小区。
112 | - 2018/10/05 增加Referer。增加透明代理服务器获取(未使用)
113 | - 2018/06/01 支持User-Agent
114 | - 2018/04/07 支持采集新房的基本房价信息
115 | - 2018/04/07 支持采集出租房的相关信息
116 | - 2018/04/05 支持采集挂牌二手房信息
117 | - 2018/04/02 支持将采集到的csv数据导入Excel
118 | - 2018/04/01 同时支持Python2和Python3
119 | - 2018/04/01 支持将采集到的csv数据导入MongoDB数据库
120 | - 2018/03/31 支持将采集到的csv数据导入MySQL数据库
121 | - 2018/03/27 修复bug: 版块下只有一页小区数据时未能正确爬取 
122 | - 2018/03/27 增加5个城市，现在支持21个城市的小区数据爬取
123 | - 2018/03/10 自动获取城市的区县列表，现在支持16个城市小区数据爬取
124 | - 2018/03/06 支持北京二手房小区数据采集
125 | - 2018/02/21 应对链家前端页面更新，使用内置urllib2代替第三方requests库,提升性能，减少依赖
126 | - 2018/02/01 支持上海二手房小区数据采集


--------------------------------------------------------------------------------
/lib/spider/xiaoqu_spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # author: zengyuetian
  4 | # 此代码仅供学习与交流，请勿用于商业用途。
  5 | # 爬取小区数据的爬虫派生类
  6 | 
  7 | import re
  8 | import threadpool
  9 | from bs4 import BeautifulSoup
 10 | from lib.item.xiaoqu import *
 11 | from lib.zone.city import get_city
 12 | from lib.spider.base_spider import *
 13 | from lib.utility.date import *
 14 | from lib.utility.path import *
 15 | from lib.zone.area import *
 16 | from lib.utility.log import *
 17 | import lib.utility.version
 18 | 
 19 | 
 20 | class XiaoQuBaseSpider(BaseSpider):
 21 |     def collect_area_xiaoqu_data(self, city_name, area_name, fmt="csv"):
 22 |         """
 23 |         对于每个板块,获得这个板块下所有小区的信息
 24 |         并且将这些信息写入文件保存
 25 |         :param city_name: 城市
 26 |         :param area_name: 板块
 27 |         :param fmt: 保存文件格式
 28 |         :return: None
 29 |         """
 30 |         district_name = area_dict.get(area_name, "")
 31 |         csv_file = self.today_path + "/{0}_{1}.csv".format(district_name, area_name)
 32 |         with open(csv_file, "w") as f:
 33 |             # 开始获得需要的板块数据
 34 |             xqs = self.get_xiaoqu_info(city_name, area_name)
 35 |             # 锁定
 36 |             if self.mutex.acquire(1):
 37 |                 self.total_num += len(xqs)
 38 |                 # 释放
 39 |                 self.mutex.release()
 40 |             if fmt == "csv":
 41 |                 for xiaoqu in xqs:
 42 |                     f.write(self.date_string + "," + xiaoqu.text() + "\n")
 43 |         print("Finish crawl area: " + area_name + ", save data to : " + csv_file)
 44 |         logger.info("Finish crawl area: " + area_name + ", save data to : " + csv_file)
 45 | 
 46 |     @staticmethod
 47 |     def get_xiaoqu_info(city, area):
 48 |         total_page = 1
 49 |         district = area_dict.get(area, "")
 50 |         chinese_district = get_chinese_district(district)
 51 |         chinese_area = chinese_area_dict.get(area, "")
 52 |         xiaoqu_list = list()
 53 |         page = 'http://{0}.{1}.com/xiaoqu/{2}/'.format(city, SPIDER_NAME, area)
 54 |         print(page)
 55 |         logger.info(page)
 56 | 
 57 |         headers = create_headers()
 58 |         response = requests.get(page, timeout=10, headers=headers)
 59 |         html = response.content
 60 |         soup = BeautifulSoup(html, "lxml")
 61 | 
 62 |         # 获得总的页数
 63 |         try:
 64 |             page_box = soup.find_all('div', class_='page-box')[0]
 65 |             matches = re.search('.*"totalPage":(\d+),.*', str(page_box))
 66 |             total_page = int(matches.group(1))
 67 |         except Exception as e:
 68 |             print("\tWarning: only find one page for {0}".format(area))
 69 |             print(e)
 70 | 
 71 |         # 从第一页开始,一直遍历到最后一页
 72 |         for i in range(1, total_page + 1):
 73 |             headers = create_headers()
 74 |             page = 'http://{0}.{1}.com/xiaoqu/{2}/pg{3}'.format(city, SPIDER_NAME, area, i)
 75 |             print(page)  # 打印版块页面地址
 76 |             BaseSpider.random_delay()
 77 |             response = requests.get(page, timeout=10, headers=headers)
 78 |             html = response.content
 79 |             soup = BeautifulSoup(html, "lxml")
 80 | 
 81 |             # 获得有小区信息的panel
 82 |             house_elems = soup.find_all('li', class_="xiaoquListItem")
 83 |             for house_elem in house_elems:
 84 |                 price = house_elem.find('div', class_="totalPrice")
 85 |                 name = house_elem.find('div', class_='title')
 86 |                 on_sale = house_elem.find('div', class_="xiaoquListItemSellCount")
 87 | 
 88 |                 # 继续清理数据
 89 |                 price = price.text.strip()
 90 |                 name = name.text.replace("\n", "")
 91 |                 on_sale = on_sale.text.replace("\n", "").strip()
 92 | 
 93 |                 # 作为对象保存
 94 |                 xiaoqu = XiaoQu(chinese_district, chinese_area, name, price, on_sale)
 95 |                 xiaoqu_list.append(xiaoqu)
 96 |         return xiaoqu_list
 97 | 
 98 |     def start(self):
 99 |         city = get_city()
100 |         self.today_path = create_date_path("{0}/xiaoqu".format(SPIDER_NAME), city, self.date_string)
101 |         t1 = time.time()  # 开始计时
102 | 
103 |         # 获得城市有多少区列表, district: 区县
104 |         districts = get_districts(city)
105 |         print('City: {0}'.format(city))
106 |         print('Districts: {0}'.format(districts))
107 | 
108 |         # 获得每个区的板块, area: 板块
109 |         areas = list()
110 |         for district in districts:
111 |             areas_of_district = get_areas(city, district)
112 |             print('{0}: Area list:  {1}'.format(district, areas_of_district))
113 |             # 用list的extend方法,L1.extend(L2)，该方法将参数L2的全部元素添加到L1的尾部
114 |             areas.extend(areas_of_district)
115 |             # 使用一个字典来存储区县和板块的对应关系, 例如{'beicai': 'pudongxinqu', }
116 |             for area in areas_of_district:
117 |                 area_dict[area] = district
118 |         print("Area:", areas)
119 |         print("District and areas:", area_dict)
120 | 
121 |         # 准备线程池用到的参数
122 |         nones = [None for i in range(len(areas))]
123 |         city_list = [city for i in range(len(areas))]
124 |         args = zip(zip(city_list, areas), nones)
125 |         # areas = areas[0: 1]
126 | 
127 |         # 针对每个板块写一个文件,启动一个线程来操作
128 |         pool_size = thread_pool_size
129 |         pool = threadpool.ThreadPool(pool_size)
130 |         my_requests = threadpool.makeRequests(self.collect_area_xiaoqu_data, args)
131 |         [pool.putRequest(req) for req in my_requests]
132 |         pool.wait()
133 |         pool.dismissWorkers(pool_size, do_join=True)  # 完成后退出
134 | 
135 |         # 计时结束，统计结果
136 |         t2 = time.time()
137 |         print("Total crawl {0} areas.".format(len(areas)))
138 |         print("Total cost {0} second to crawl {1} data items.".format(t2 - t1, self.total_num))
139 | 
140 | 
if __name__ == "__main__":
    # Run the xiaoqu spider when this module is executed directly.
    XiaoQuBaseSpider("lianjia").start()
147 | 


--------------------------------------------------------------------------------
/lib/spider/ershou_spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # author: zengyuetian
  4 | # 此代码仅供学习与交流，请勿用于商业用途。
  5 | # 爬取二手房数据的爬虫派生类
  6 | 
  7 | import re
  8 | import threadpool
  9 | from bs4 import BeautifulSoup
 10 | from lib.item.ershou import *
 11 | from lib.zone.city import get_city
 12 | from lib.spider.base_spider import *
 13 | from lib.utility.date import *
 14 | from lib.utility.path import *
 15 | from lib.zone.area import *
 16 | from lib.utility.log import *
 17 | import lib.utility.version
 18 | 
 19 | 
 20 | class ErShouSpider(BaseSpider):
 21 |     def collect_area_ershou_data(self, city_name, area_name, fmt="csv"):
 22 |         """
 23 |         对于每个板块,获得这个板块下所有二手房的信息
 24 |         并且将这些信息写入文件保存
 25 |         :param city_name: 城市
 26 |         :param area_name: 板块
 27 |         :param fmt: 保存文件格式
 28 |         :return: None
 29 |         """
 30 |         district_name = area_dict.get(area_name, "")
 31 |         csv_file = self.today_path + "/{0}_{1}.csv".format(district_name, area_name)
 32 |         with open(csv_file, "w") as f:
 33 |             # 开始获得需要的板块数据
 34 |             ershous = self.get_area_ershou_info(city_name, area_name)
 35 |             # 锁定，多线程读写
 36 |             if self.mutex.acquire(1):
 37 |                 self.total_num += len(ershous)
 38 |                 # 释放
 39 |                 self.mutex.release()
 40 |             if fmt == "csv":
 41 |                 for ershou in ershous:
 42 |                     # print(date_string + "," + xiaoqu.text())
 43 |                     f.write(self.date_string + "," + ershou.text() + "\n")
 44 |         print("Finish crawl area: " + area_name + ", save data to : " + csv_file)
 45 | 
 46 |     @staticmethod
 47 |     def get_area_ershou_info(city_name, area_name):
 48 |         """
 49 |         通过爬取页面获得城市指定版块的二手房信息
 50 |         :param city_name: 城市
 51 |         :param area_name: 版块
 52 |         :return: 二手房数据列表
 53 |         """
 54 |         total_page = 1
 55 |         district_name = area_dict.get(area_name, "")
 56 |         # 中文区县
 57 |         chinese_district = get_chinese_district(district_name)
 58 |         # 中文版块
 59 |         chinese_area = chinese_area_dict.get(area_name, "")
 60 | 
 61 |         ershou_list = list()
 62 |         page = 'http://{0}.{1}.com/ershoufang/{2}/'.format(city_name, SPIDER_NAME, area_name)
 63 |         print(page)  # 打印版块页面地址
 64 |         headers = create_headers()
 65 |         response = requests.get(page, timeout=10, headers=headers)
 66 |         html = response.content
 67 |         soup = BeautifulSoup(html, "lxml")
 68 | 
 69 |         # 获得总的页数，通过查找总页码的元素信息
 70 |         try:
 71 |             page_box = soup.find_all('div', class_='page-box')[0]
 72 |             matches = re.search('.*"totalPage":(\d+),.*', str(page_box))
 73 |             total_page = int(matches.group(1))
 74 |         except Exception as e:
 75 |             print("\tWarning: only find one page for {0}".format(area_name))
 76 |             print(e)
 77 | 
 78 |         # 从第一页开始,一直遍历到最后一页
 79 |         for num in range(1, total_page + 1):
 80 |             page = 'http://{0}.{1}.com/ershoufang/{2}/pg{3}'.format(city_name, SPIDER_NAME, area_name, num)
 81 |             print(page)  # 打印每一页的地址
 82 |             headers = create_headers()
 83 |             BaseSpider.random_delay()
 84 |             response = requests.get(page, timeout=10, headers=headers)
 85 |             html = response.content
 86 |             soup = BeautifulSoup(html, "lxml")
 87 | 
 88 |             # 获得有小区信息的panel
 89 |             house_elements = soup.find_all('li', class_="clear")
 90 |             for house_elem in house_elements:
 91 |                 price = house_elem.find('div', class_="totalPrice")
 92 |                 name = house_elem.find('div', class_='title')
 93 |                 desc = house_elem.find('div', class_="houseInfo")
 94 |                 pic = house_elem.find('a', class_="img").find('img', class_="lj-lazy")
 95 | 
 96 |                 # 继续清理数据
 97 |                 price = price.text.strip()
 98 |                 name = name.text.replace("\n", "")
 99 |                 desc = desc.text.replace("\n", "").strip()
100 |                 pic = pic.get('data-original').strip()
101 |                 # print(pic)
102 | 
103 | 
104 |                 # 作为对象保存
105 |                 ershou = ErShou(chinese_district, chinese_area, name, price, desc, pic)
106 |                 ershou_list.append(ershou)
107 |         return ershou_list
108 | 
109 |     def start(self):
110 |         city = get_city()
111 |         self.today_path = create_date_path("{0}/ershou".format(SPIDER_NAME), city, self.date_string)
112 | 
113 |         t1 = time.time()  # 开始计时
114 | 
115 |         # 获得城市有多少区列表, district: 区县
116 |         districts = get_districts(city)
117 |         print('City: {0}'.format(city))
118 |         print('Districts: {0}'.format(districts))
119 | 
120 |         # 获得每个区的板块, area: 板块
121 |         areas = list()
122 |         for district in districts:
123 |             areas_of_district = get_areas(city, district)
124 |             print('{0}: Area list:  {1}'.format(district, areas_of_district))
125 |             # 用list的extend方法,L1.extend(L2)，该方法将参数L2的全部元素添加到L1的尾部
126 |             areas.extend(areas_of_district)
127 |             # 使用一个字典来存储区县和板块的对应关系, 例如{'beicai': 'pudongxinqu', }
128 |             for area in areas_of_district:
129 |                 area_dict[area] = district
130 |         print("Area:", areas)
131 |         print("District and areas:", area_dict)
132 | 
133 |         # 准备线程池用到的参数
134 |         nones = [None for i in range(len(areas))]
135 |         city_list = [city for i in range(len(areas))]
136 |         args = zip(zip(city_list, areas), nones)
137 |         # areas = areas[0: 1]   # For debugging
138 | 
139 |         # 针对每个板块写一个文件,启动一个线程来操作
140 |         pool_size = thread_pool_size
141 |         pool = threadpool.ThreadPool(pool_size)
142 |         my_requests = threadpool.makeRequests(self.collect_area_ershou_data, args)
143 |         [pool.putRequest(req) for req in my_requests]
144 |         pool.wait()
145 |         pool.dismissWorkers(pool_size, do_join=True)  # 完成后退出
146 | 
147 |         # 计时结束，统计结果
148 |         t2 = time.time()
149 |         print("Total crawl {0} areas.".format(len(areas)))
150 |         print("Total cost {0} second to crawl {1} data items.".format(t2 - t1, self.total_num))
151 | 
152 | 
# Nothing is executed when this module is run directly.
if __name__ == '__main__':
    pass
155 | 


--------------------------------------------------------------------------------
/xiaoqu_to_db.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # author: zengyuetian
  4 | # 此代码仅供学习与交流，请勿用于商业用途。
  5 | # read data from csv, write to database
  6 | # database includes: mysql/mongodb/excel/json/csv
  7 | 
  8 | import os
  9 | import pymysql
 10 | from lib.utility.path import DATA_PATH
 11 | from lib.zone.city import *
 12 | from lib.utility.date import *
 13 | from lib.utility.version import PYTHON_3
 14 | from lib.spider.base_spider import SPIDER_NAME
 15 | 
 16 | pymysql.install_as_MySQLdb()
 17 | 
 18 | 
 19 | def create_prompt_text():
 20 |     city_info = list()
 21 |     num = 0
 22 |     for en_name, ch_name in cities.items():
 23 |         num += 1
 24 |         city_info.append(en_name)
 25 |         city_info.append(": ")
 26 |         city_info.append(ch_name)
 27 |         if num % 4 == 0:
 28 |             city_info.append("\n")
 29 |         else:
 30 |             city_info.append(", ")
 31 |     return 'Which city data do you want to save ?\n' + ''.join(city_info)
 32 | 
 33 | 
 34 | if __name__ == '__main__':
 35 |     # 设置目标数据库
 36 |     ##################################
 37 |     # mysql/mongodb/excel/json/csv
 38 |     # database = "mysql"
 39 |     # database = "mongodb"
 40 |     # database = "excel"
 41 |     # database = "json"
 42 |     database = "csv"
 43 |     ##################################
 44 |     db = None
 45 |     collection = None
 46 |     workbook = None
 47 |     csv_file = None
 48 |     datas = list()
 49 | 
 50 |     if database == "mysql":
 51 |         import records
 52 |         db = records.Database('mysql://root:123456@localhost/lianjia?charset=utf8', encoding='utf-8')
 53 |     elif database == "mongodb":
 54 |         from pymongo import MongoClient
 55 |         conn = MongoClient('localhost', 27017)
 56 |         db = conn.lianjia  # 连接lianjia数据库，没有则自动创建
 57 |         collection = db.xiaoqu  # 使用xiaoqu集合，没有则自动创建
 58 |     elif database == "excel":
 59 |         import xlsxwriter
 60 |         workbook = xlsxwriter.Workbook('xiaoqu.xlsx')
 61 |         worksheet = workbook.add_worksheet()
 62 |     elif database == "json":
 63 |         import json
 64 |     elif database == "csv":
 65 |         csv_file = open("xiaoqu.csv", "w")
 66 |         line = "{0};{1};{2};{3};{4};{5};{6}\n".format('city_ch', 'date', 'district', 'area', 'xiaoqu', 'price', 'sale')
 67 |         csv_file.write(line)
 68 | 
 69 |     city = get_city()
 70 |     # 准备日期信息，爬到的数据存放到日期相关文件夹下
 71 |     date = get_date_string()
 72 |     # 获得 csv 文件路径
 73 |     # date = "20180331"   # 指定采集数据的日期
 74 |     # city = "sh"         # 指定采集数据的城市
 75 |     city_ch = get_chinese_city(city)
 76 |     csv_dir = "{0}/{1}/xiaoqu/{2}/{3}".format(DATA_PATH, SPIDER_NAME, city, date)
 77 | 
 78 |     files = list()
 79 |     if not os.path.exists(csv_dir):
 80 |         print("{0} does not exist.".format(csv_dir))
 81 |         print("Please run 'python xiaoqu.py' firstly.")
 82 |         print("Bye.")
 83 |         exit(0)
 84 |     else:
 85 |         print('OK, start to process ' + get_chinese_city(city))
 86 |     for csv in os.listdir(csv_dir):
 87 |         data_csv = csv_dir + "/" + csv
 88 |         # print(data_csv)
 89 |         files.append(data_csv)
 90 | 
 91 |     # 清理数据
 92 |     count = 0
 93 |     row = 0
 94 |     col = 0
 95 |     for csv in files:
 96 |         with open(csv, 'r') as f:
 97 |             for line in f:
 98 |                 count += 1
 99 |                 text = line.strip()
100 |                 try:
101 |                     # 如果小区名里面没有逗号，那么总共是6项
102 |                     if text.count(',') == 5:
103 |                         date, district, area, xiaoqu, price, sale = text.split(',')
104 |                     elif text.count(',') < 5:
105 |                         continue
106 |                     else:
107 |                         fields = text.split(',')
108 |                         date = fields[0]
109 |                         district = fields[1]
110 |                         area = fields[2]
111 |                         xiaoqu = ','.join(fields[3:-2])
112 |                         price = fields[-2]
113 |                         sale = fields[-1]
114 |                 except Exception as e:
115 |                     print(text)
116 |                     print(e)
117 |                     continue
118 |                 sale = sale.replace(r'套在售二手房', '')
119 |                 price = price.replace(r'暂无', '0')
120 |                 price = price.replace(r'元/m2', '')
121 |                 price = int(price)
122 |                 sale = int(sale)
123 |                 print("{0} {1} {2} {3} {4} {5}".format(date, district, area, xiaoqu, price, sale))
124 |                 # 写入mysql数据库
125 |                 if database == "mysql":
126 |                     db.query('INSERT INTO xiaoqu (city, date, district, area, xiaoqu, price, sale) '
127 |                              'VALUES(:city, :date, :district, :area, :xiaoqu, :price, :sale)',
128 |                              city=city_ch, date=date, district=district, area=area, xiaoqu=xiaoqu, price=price,
129 |                              sale=sale)
130 |                 # 写入mongodb数据库
131 |                 elif database == "mongodb":
132 |                     data = dict(city=city_ch, date=date, district=district, area=area, xiaoqu=xiaoqu, price=price,
133 |                                 sale=sale)
134 |                     collection.insert(data)
135 |                 elif database == "excel":
136 |                     if not PYTHON_3:
137 |                         worksheet.write_string(row, col, unicode(city_ch, 'utf-8'))
138 |                         worksheet.write_string(row, col + 1, date)
139 |                         worksheet.write_string(row, col + 2, unicode(district, 'utf-8'))
140 |                         worksheet.write_string(row, col + 3, unicode(area, 'utf-8'))
141 |                         worksheet.write_string(row, col + 4, unicode(xiaoqu, 'utf-8'))
142 |                         worksheet.write_number(row, col + 5, price)
143 |                         worksheet.write_number(row, col + 6, sale)
144 |                     else:
145 |                         worksheet.write_string(row, col, city_ch)
146 |                         worksheet.write_string(row, col + 1, date)
147 |                         worksheet.write_string(row, col + 2, district)
148 |                         worksheet.write_string(row, col + 3, area)
149 |                         worksheet.write_string(row, col + 4, xiaoqu)
150 |                         worksheet.write_number(row, col + 5, price)
151 |                         worksheet.write_number(row, col + 6, sale)
152 |                     row += 1
153 |                 elif database == "json":
154 |                     data = dict(city=city_ch, date=date, district=district, area=area, xiaoqu=xiaoqu, price=price,
155 |                                 sale=sale)
156 |                     datas.append(data)
157 |                 elif database == "csv":
158 |                     line = "{0};{1};{2};{3};{4};{5};{6}\n".format(city_ch, date, district, area, xiaoqu, price, sale)
159 |                     csv_file.write(line)
160 | 
161 |     # 写入，并且关闭句柄
162 |     if database == "excel":
163 |         workbook.close()
164 |     elif database == "json":
165 |         json.dump(datas, open('xiaoqu.json', 'w'), ensure_ascii=False, indent=2)
166 |     elif database == "csv":
167 |         csv_file.close()
168 | 
169 |     print("Total write {0} items to database.".format(count))
170 | 


--------------------------------------------------------------------------------
/lib/spider/zufang_spider.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding=utf-8
  3 | # author: zengyuetian
  4 | # 此代码仅供学习与交流，请勿用于商业用途。
  5 | # 爬取租房数据的爬虫派生类
  6 | 
  7 | import re
  8 | import threadpool
  9 | from bs4 import BeautifulSoup
 10 | from lib.item.zufang import *
 11 | from lib.spider.base_spider import *
 12 | from lib.utility.date import *
 13 | from lib.utility.path import *
 14 | from lib.zone.area import *
 15 | from lib.zone.city import get_city
 16 | import lib.utility.version
 17 | 
 18 | 
 19 | class ZuFangBaseSpider(BaseSpider):
 20 |     def collect_area_zufang_data(self, city_name, area_name, fmt="csv"):
 21 |         """
 22 |         对于每个板块,获得这个板块下所有出租房的信息
 23 |         并且将这些信息写入文件保存
 24 |         :param city_name: 城市
 25 |         :param area_name: 板块
 26 |         :param fmt: 保存文件格式
 27 |         :return: None
 28 |         """
 29 |         district_name = area_dict.get(area_name, "")
 30 |         csv_file = self.today_path + "/{0}_{1}.csv".format(district_name, area_name)
 31 |         with open(csv_file, "w") as f:
 32 |             # 开始获得需要的板块数据
 33 |             zufangs = self.get_area_zufang_info(city_name, area_name)
 34 |             # 锁定
 35 |             if self.mutex.acquire(1):
 36 |                 self.total_num += len(zufangs)
 37 |                 # 释放
 38 |                 self.mutex.release()
 39 |             if fmt == "csv":
 40 |                 for zufang in zufangs:
 41 |                     f.write(self.date_string + "," + zufang.text() + "\n")
 42 |         print("Finish crawl area: " + area_name + ", save data to : " + csv_file)
 43 | 
 44 |     @staticmethod
 45 |     def get_area_zufang_info(city_name, area_name):
 46 |         matches = None
 47 |         """
 48 |         通过爬取页面获取城市指定版块的租房信息
 49 |         :param city_name: 城市
 50 |         :param area_name: 版块
 51 |         :return: 出租房信息列表
 52 |         """
 53 |         total_page = 1
 54 |         district_name = area_dict.get(area_name, "")
 55 |         chinese_district = get_chinese_district(district_name)
 56 |         chinese_area = chinese_area_dict.get(area_name, "")
 57 |         zufang_list = list()
 58 |         page = 'http://{0}.{1}.com/zufang/{2}/'.format(city_name, SPIDER_NAME, area_name)
 59 |         print(page)
 60 | 
 61 |         headers = create_headers()
 62 |         response = requests.get(page, timeout=10, headers=headers)
 63 |         html = response.content
 64 |         soup = BeautifulSoup(html, "lxml")
 65 | 
 66 |         # 获得总的页数
 67 |         try:
 68 |             if SPIDER_NAME == "lianjia":
 69 |                 page_box = soup.find_all('div', class_='page-box')[0]
 70 |                 matches = re.search('.*"totalPage":(\d+),.*', str(page_box))
 71 |             elif SPIDER_NAME == "ke":
 72 |                 page_box = soup.find_all('div', class_='content__pg')[0]
 73 |                 # print(page_box)
 74 |                 matches = re.search('.*data-totalpage="(\d+)".*', str(page_box))
 75 |             total_page = int(matches.group(1))
 76 |             # print(total_page)
 77 |         except Exception as e:
 78 |             print("\tWarning: only find one page for {0}".format(area_name))
 79 |             print(e)
 80 | 
 81 |         # 从第一页开始,一直遍历到最后一页
 82 |         headers = create_headers()
 83 |         for num in range(1, total_page + 1):
 84 |             page = 'http://{0}.{1}.com/zufang/{2}/pg{3}'.format(city_name, SPIDER_NAME, area_name, num)
 85 |             print(page)
 86 |             BaseSpider.random_delay()
 87 |             response = requests.get(page, timeout=10, headers=headers)
 88 |             html = response.content
 89 |             soup = BeautifulSoup(html, "lxml")
 90 | 
 91 |             # 获得有小区信息的panel
 92 |             if SPIDER_NAME == "lianjia":
 93 |                 ul_element = soup.find('ul', class_="house-lst")
 94 |                 house_elements = ul_element.find_all('li')
 95 |             else:
 96 |                 ul_element = soup.find('div', class_="content__list")
 97 |                 house_elements = ul_element.find_all('div', class_="content__list--item")
 98 | 
 99 |             if len(house_elements) == 0:
100 |                 continue
101 |             # else:
102 |             #     print(len(house_elements))
103 | 
104 |             for house_elem in house_elements:
105 |                 if SPIDER_NAME == "lianjia":
106 |                     price = house_elem.find('span', class_="num")
107 |                     xiaoqu = house_elem.find('span', class_='region')
108 |                     layout = house_elem.find('span', class_="zone")
109 |                     size = house_elem.find('span', class_="meters")
110 |                 else:
111 |                     price = house_elem.find('span', class_="content__list--item-price")
112 |                     desc1 = house_elem.find('p', class_="content__list--item--title")
113 |                     desc2 = house_elem.find('p', class_="content__list--item--des")
114 | 
115 |                 try:
116 |                     if SPIDER_NAME == "lianjia":
117 |                         price = price.text.strip()
118 |                         xiaoqu = xiaoqu.text.strip().replace("\n", "")
119 |                         layout = layout.text.strip()
120 |                         size = size.text.strip()
121 |                     else:
122 |                         # 继续清理数据
123 |                         price = price.text.strip().replace(" ", "").replace("元/月", "")
124 |                         # print(price)
125 |                         desc1 = desc1.text.strip().replace("\n", "")
126 |                         desc2 = desc2.text.strip().replace("\n", "").replace(" ", "")
127 |                         # print(desc1)
128 | 
129 |                         infos = desc1.split(' ')
130 |                         xiaoqu = infos[0]
131 |                         layout = infos[1]
132 |                         descs = desc2.split('/')
133 |                         # print(descs[1])
134 |                         size = descs[1].replace("㎡", "平米")
135 | 
136 |                     # print("{0} {1} {2} {3} {4} {5} {6}".format(
137 |                     #     chinese_district, chinese_area, xiaoqu, layout, size, price))
138 | 
139 |                     # 作为对象保存
140 |                     zufang = ZuFang(chinese_district, chinese_area, xiaoqu, layout, size, price)
141 |                     zufang_list.append(zufang)
142 |                 except Exception as e:
143 |                     print("=" * 20 + " page no data")
144 |                     print(e)
145 |                     print(page)
146 |                     print("=" * 20)
147 |         return zufang_list
148 | 
149 |     def start(self):
150 |         city = get_city()
151 |         self.today_path = create_date_path("{0}/zufang".format(SPIDER_NAME), city, self.date_string)
152 |         # collect_area_zufang('sh', 'beicai')  # For debugging, keep it here
153 |         t1 = time.time()  # 开始计时
154 | 
155 |         # 获得城市有多少区列表, district: 区县
156 |         districts = get_districts(city)
157 |         print('City: {0}'.format(city))
158 |         print('Districts: {0}'.format(districts))
159 | 
160 |         # 获得每个区的板块, area: 板块
161 |         areas = list()
162 |         for district in districts:
163 |             areas_of_district = get_areas(city, district)
164 |             print('{0}: Area list:  {1}'.format(district, areas_of_district))
165 |             # 用list的extend方法,L1.extend(L2)，该方法将参数L2的全部元素添加到L1的尾部
166 |             areas.extend(areas_of_district)
167 |             # 使用一个字典来存储区县和板块的对应关系, 例如{'beicai': 'pudongxinqu', }
168 |             for area in areas_of_district:
169 |                 area_dict[area] = district
170 |         print("Area:", areas)
171 |         print("District and areas:", area_dict)
172 | 
173 |         # 准备线程池用到的参数
174 |         nones = [None for i in range(len(areas))]
175 |         city_list = [city for i in range(len(areas))]
176 |         args = zip(zip(city_list, areas), nones)
177 |         # areas = areas[0: 1]
178 | 
179 |         # 针对每个板块写一个文件,启动一个线程来操作
180 |         pool_size = thread_pool_size
181 |         pool = threadpool.ThreadPool(pool_size)
182 |         my_requests = threadpool.makeRequests(self.collect_area_zufang_data, args)
183 |         [pool.putRequest(req) for req in my_requests]
184 |         pool.wait()
185 |         pool.dismissWorkers(pool_size, do_join=True)  # 完成后退出
186 | 
187 |         # 计时结束，统计结果
188 |         t2 = time.time()
189 |         print("Total crawl {0} areas.".format(len(areas)))
190 |         print("Total cost {0} second to crawl {1} data items.".format(t2 - t1, self.total_num))
191 | 
192 | 
# Nothing is executed when this module is run directly; the commented-out call
# below is kept as a manual debugging hook.
if __name__ == '__main__':
    # get_area_zufang_info("yt", "muping")
    pass
196 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
  1 |                                  Apache License
  2 |                            Version 2.0, January 2004
  3 |                         http://www.apache.org/licenses/
  4 | 
  5 |    TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
  6 | 
  7 |    1. Definitions.
  8 | 
  9 |       "License" shall mean the terms and conditions for use, reproduction,
 10 |       and distribution as defined by Sections 1 through 9 of this document.
 11 | 
 12 |       "Licensor" shall mean the copyright owner or entity authorized by
 13 |       the copyright owner that is granting the License.
 14 | 
 15 |       "Legal Entity" shall mean the union of the acting entity and all
 16 |       other entities that control, are controlled by, or are under common
 17 |       control with that entity. For the purposes of this definition,
 18 |       "control" means (i) the power, direct or indirect, to cause the
 19 |       direction or management of such entity, whether by contract or
 20 |       otherwise, or (ii) ownership of fifty percent (50%) or more of the
 21 |       outstanding shares, or (iii) beneficial ownership of such entity.
 22 | 
 23 |       "You" (or "Your") shall mean an individual or Legal Entity
 24 |       exercising permissions granted by this License.
 25 | 
 26 |       "Source" form shall mean the preferred form for making modifications,
 27 |       including but not limited to software source code, documentation
 28 |       source, and configuration files.
 29 | 
 30 |       "Object" form shall mean any form resulting from mechanical
 31 |       transformation or translation of a Source form, including but
 32 |       not limited to compiled object code, generated documentation,
 33 |       and conversions to other media types.
 34 | 
 35 |       "Work" shall mean the work of authorship, whether in Source or
 36 |       Object form, made available under the License, as indicated by a
 37 |       copyright notice that is included in or attached to the work
 38 |       (an example is provided in the Appendix below).
 39 | 
 40 |       "Derivative Works" shall mean any work, whether in Source or Object
 41 |       form, that is based on (or derived from) the Work and for which the
 42 |       editorial revisions, annotations, elaborations, or other modifications
 43 |       represent, as a whole, an original work of authorship. For the purposes
 44 |       of this License, Derivative Works shall not include works that remain
 45 |       separable from, or merely link (or bind by name) to the interfaces of,
 46 |       the Work and Derivative Works thereof.
 47 | 
 48 |       "Contribution" shall mean any work of authorship, including
 49 |       the original version of the Work and any modifications or additions
 50 |       to that Work or Derivative Works thereof, that is intentionally
 51 |       submitted to Licensor for inclusion in the Work by the copyright owner
 52 |       or by an individual or Legal Entity authorized to submit on behalf of
 53 |       the copyright owner. For the purposes of this definition, "submitted"
 54 |       means any form of electronic, verbal, or written communication sent
 55 |       to the Licensor or its representatives, including but not limited to
 56 |       communication on electronic mailing lists, source code control systems,
 57 |       and issue tracking systems that are managed by, or on behalf of, the
 58 |       Licensor for the purpose of discussing and improving the Work, but
 59 |       excluding communication that is conspicuously marked or otherwise
 60 |       designated in writing by the copyright owner as "Not a Contribution."
 61 | 
 62 |       "Contributor" shall mean Licensor and any individual or Legal Entity
 63 |       on behalf of whom a Contribution has been received by Licensor and
 64 |       subsequently incorporated within the Work.
 65 | 
 66 |    2. Grant of Copyright License. Subject to the terms and conditions of
 67 |       this License, each Contributor hereby grants to You a perpetual,
 68 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 69 |       copyright license to reproduce, prepare Derivative Works of,
 70 |       publicly display, publicly perform, sublicense, and distribute the
 71 |       Work and such Derivative Works in Source or Object form.
 72 | 
 73 |    3. Grant of Patent License. Subject to the terms and conditions of
 74 |       this License, each Contributor hereby grants to You a perpetual,
 75 |       worldwide, non-exclusive, no-charge, royalty-free, irrevocable
 76 |       (except as stated in this section) patent license to make, have made,
 77 |       use, offer to sell, sell, import, and otherwise transfer the Work,
 78 |       where such license applies only to those patent claims licensable
 79 |       by such Contributor that are necessarily infringed by their
 80 |       Contribution(s) alone or by combination of their Contribution(s)
 81 |       with the Work to which such Contribution(s) was submitted. If You
 82 |       institute patent litigation against any entity (including a
 83 |       cross-claim or counterclaim in a lawsuit) alleging that the Work
 84 |       or a Contribution incorporated within the Work constitutes direct
 85 |       or contributory patent infringement, then any patent licenses
 86 |       granted to You under this License for that Work shall terminate
 87 |       as of the date such litigation is filed.
 88 | 
 89 |    4. Redistribution. You may reproduce and distribute copies of the
 90 |       Work or Derivative Works thereof in any medium, with or without
 91 |       modifications, and in Source or Object form, provided that You
 92 |       meet the following conditions:
 93 | 
 94 |       (a) You must give any other recipients of the Work or
 95 |           Derivative Works a copy of this License; and
 96 | 
 97 |       (b) You must cause any modified files to carry prominent notices
 98 |           stating that You changed the files; and
 99 | 
100 |       (c) You must retain, in the Source form of any Derivative Works
101 |           that You distribute, all copyright, patent, trademark, and
102 |           attribution notices from the Source form of the Work,
103 |           excluding those notices that do not pertain to any part of
104 |           the Derivative Works; and
105 | 
106 |       (d) If the Work includes a "NOTICE" text file as part of its
107 |           distribution, then any Derivative Works that You distribute must
108 |           include a readable copy of the attribution notices contained
109 |           within such NOTICE file, excluding those notices that do not
110 |           pertain to any part of the Derivative Works, in at least one
111 |           of the following places: within a NOTICE text file distributed
112 |           as part of the Derivative Works; within the Source form or
113 |           documentation, if provided along with the Derivative Works; or,
114 |           within a display generated by the Derivative Works, if and
115 |           wherever such third-party notices normally appear. The contents
116 |           of the NOTICE file are for informational purposes only and
117 |           do not modify the License. You may add Your own attribution
118 |           notices within Derivative Works that You distribute, alongside
119 |           or as an addendum to the NOTICE text from the Work, provided
120 |           that such additional attribution notices cannot be construed
121 |           as modifying the License.
122 | 
123 |       You may add Your own copyright statement to Your modifications and
124 |       may provide additional or different license terms and conditions
125 |       for use, reproduction, or distribution of Your modifications, or
126 |       for any such Derivative Works as a whole, provided Your use,
127 |       reproduction, and distribution of the Work otherwise complies with
128 |       the conditions stated in this License.
129 | 
130 |    5. Submission of Contributions. Unless You explicitly state otherwise,
131 |       any Contribution intentionally submitted for inclusion in the Work
132 |       by You to the Licensor shall be under the terms and conditions of
133 |       this License, without any additional terms or conditions.
134 |       Notwithstanding the above, nothing herein shall supersede or modify
135 |       the terms of any separate license agreement you may have executed
136 |       with Licensor regarding such Contributions.
137 | 
138 |    6. Trademarks. This License does not grant permission to use the trade
139 |       names, trademarks, service marks, or product names of the Licensor,
140 |       except as required for reasonable and customary use in describing the
141 |       origin of the Work and reproducing the content of the NOTICE file.
142 | 
143 |    7. Disclaimer of Warranty. Unless required by applicable law or
144 |       agreed to in writing, Licensor provides the Work (and each
145 |       Contributor provides its Contributions) on an "AS IS" BASIS,
146 |       WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 |       implied, including, without limitation, any warranties or conditions
148 |       of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 |       PARTICULAR PURPOSE. You are solely responsible for determining the
150 |       appropriateness of using or redistributing the Work and assume any
151 |       risks associated with Your exercise of permissions under this License.
152 | 
153 |    8. Limitation of Liability. In no event and under no legal theory,
154 |       whether in tort (including negligence), contract, or otherwise,
155 |       unless required by applicable law (such as deliberate and grossly
156 |       negligent acts) or agreed to in writing, shall any Contributor be
157 |       liable to You for damages, including any direct, indirect, special,
158 |       incidental, or consequential damages of any character arising as a
159 |       result of this License or out of the use or inability to use the
160 |       Work (including but not limited to damages for loss of goodwill,
161 |       work stoppage, computer failure or malfunction, or any and all
162 |       other commercial damages or losses), even if such Contributor
163 |       has been advised of the possibility of such damages.
164 | 
165 |    9. Accepting Warranty or Additional Liability. While redistributing
166 |       the Work or Derivative Works thereof, You may choose to offer,
167 |       and charge a fee for, acceptance of support, warranty, indemnity,
168 |       or other liability obligations and/or rights consistent with this
169 |       License. However, in accepting such obligations, You may act only
170 |       on Your own behalf and on Your sole responsibility, not on behalf
171 |       of any other Contributor, and only if You agree to indemnify,
172 |       defend, and hold each Contributor harmless for any liability
173 |       incurred by, or claims asserted against, such Contributor by reason
174 |       of your accepting any such warranty or additional liability.
175 | 
176 |    END OF TERMS AND CONDITIONS
177 | 
178 |    APPENDIX: How to apply the Apache License to your work.
179 | 
180 |       To apply the Apache License to your work, attach the following
181 |       boilerplate notice, with the fields enclosed by brackets "[]"
182 |       replaced with your own identifying information. (Don't include
183 |       the brackets!)  The text should be enclosed in the appropriate
184 |       comment syntax for the file format. We also recommend that a
185 |       file or class name and description of purpose be included on the
186 |       same "printed page" as the copyright notice for easier
187 |       identification within third-party archives.
188 | 
189 |    Copyright [yyyy] [name of copyright owner]
190 | 
191 |    Licensed under the Apache License, Version 2.0 (the "License");
192 |    you may not use this file except in compliance with the License.
193 |    You may obtain a copy of the License at
194 | 
195 |        http://www.apache.org/licenses/LICENSE-2.0
196 | 
197 |    Unless required by applicable law or agreed to in writing, software
198 |    distributed under the License is distributed on an "AS IS" BASIS,
199 |    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 |    See the License for the specific language governing permissions and
201 |    limitations under the License.
202 | 


--------------------------------------------------------------------------------