├── lianjia_spiderV1.1
│   ├── try.py
│   ├── img1.jpg
│   ├── config.py
│   ├── request.py
│   └── spider.py
├── new_lianjia
│   ├── lianjia
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── spiders
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-36.pyc
│   │   │   │   ├── ershou.cpython-36.pyc
│   │   │   │   └── zufang.cpython-36.pyc
│   │   │   ├── __init__.py
│   │   │   ├── zufang.py
│   │   │   └── ershou.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   ├── settings.py
│   │   └── middlewares.py
│   ├── start.py
│   └── scrapy.cfg
└── README.md

/lianjia_spiderV1.1/try.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/new_lianjia/lianjia/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/lianjia_spiderV1.1/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/lianjia_spiderV1.1/img1.jpg
--------------------------------------------------------------------------------
/new_lianjia/lianjia/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__pycache__/ershou.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/spiders/__pycache__/ershou.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__pycache__/zufang.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/spiders/__pycache__/zufang.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/start.py:
--------------------------------------------------------------------------------
from scrapy.cmdline import execute

if __name__ == "__main__":
    # Run one spider per invocation; swap the comments to crawl rentals (zufang) instead.
    execute(['scrapy', 'crawl', 'ershoufang'])
    # execute(['scrapy', 'crawl', 'zufang'])
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/new_lianjia/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = lianjia.settings

[deploy]
#url = http://localhost:6800/
project = lianjia
--------------------------------------------------------------------------------
/lianjia_spiderV1.1/config.py:
--------------------------------------------------------------------------------
# Configuration

# URLs of the city to crawl; the values below point at Qingdao's second-hand listings
entrance_url = 'http://qd.lianjia.com/ershoufang'
head_url = 'http://qd.lianjia.com'

# PhantomJS options
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# MongoDB settings
MONGO_URL = 'localhost'
MONGO_DB = 'lianjia'
MONGO_TABLE = 'qingdao'
--------------------------------------------------------------------------------
/new_lianjia/lianjia/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class LianjiaPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/new_lianjia/lianjia/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Lianjia Second-Hand Housing Spider
#### Update 2018-02-09 (new_lianjia)
##### Uses Scrapy to crawl second-hand housing and rental data for all districts of every city.
##### Only the scraping logic is written; results are printed rather than stored.
#### Function (lianjia_spiderV1.1): crawl the second-hand housing listings of a specified city.
#### The tools used are a bit of a mix, mainly to review the past month of spider study. No multiprocessing is used; the crawler sleeps 3 seconds after each page (1 second would also do), and at that speed the IP does not get banned.
#### A city with 10,000 second-hand listings takes about half an hour to finish; which city to crawl is set in the config file.
#### First time using GitHub; it took two hours of fiddling to finally get this uploaded.
### spider.py: the main program
### request.py: the requests helper and the function that fetches the total page count
### config.py: assorted configuration
### try.py: leftover, unused file
Scraped data:
![image](https://github.com/longxiaofei/spider-lianjia/blob/master/lianjia_spiderV1.1/img1.jpg?raw=true)
--------------------------------------------------------------------------------
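The README above notes that only the scraping logic of new_lianjia is written: the spiders print their results, and the items.py and pipelines.py shown above are still the untouched Scrapy stubs. A possible way to fill them in and persist the results to MongoDB is sketched below; the field names, the MONGO_URI / MONGO_DATABASE setting names, and the pipeline itself are assumptions, not code from this repository.

# items.py (sketch): an item covering the fields both spiders collect
import scrapy

class LianjiaItem(scrapy.Item):
    title = scrapy.Field()
    link_url = scrapy.Field()
    price = scrapy.Field()
    region = scrapy.Field()
    zone = scrapy.Field()
    meters = scrapy.Field()
    direction = scrapy.Field()
    labels = scrapy.Field()
    city = scrapy.Field()
    area = scrapy.Field()


# pipelines.py (sketch): store items in MongoDB; MONGO_URI / MONGO_DATABASE
# are hypothetical settings, not defined in this project's settings.py
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'lianjia'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # one collection per spider, e.g. 'ershoufang' and 'zufang'
        self.db[spider.name].insert_one(dict(item))
        return item

To take effect, ITEM_PIPELINES in settings.py would have to point at lianjia.pipelines.MongoPipeline, and the spiders would have to yield their dicts or items instead of printing them.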
/new_lianjia/lianjia/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for lianjia project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lianjia'

SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'


ROBOTSTXT_OBEY = False

CONCURRENT_REQUESTS = 1

DOWNLOAD_DELAY = 4

COOKIES_ENABLED = False

#TELNETCONSOLE_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

#SPIDER_MIDDLEWARES = {
#    'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
#}

#DOWNLOADER_MIDDLEWARES = {
#    'lianjia.middlewares.MyCustomDownloaderMiddleware': 543,
#}

#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'lianjia.pipelines.LianjiaPipeline': 300,
#}
--------------------------------------------------------------------------------
/new_lianjia/lianjia/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class LianjiaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
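settings.py above pins a single User-Agent in DEFAULT_REQUEST_HEADERS, while the V1.1 script request.py below picks a random one from USER_LIST. If per-request rotation were wanted inside the Scrapy project as well, a small downloader middleware along the following lines could provide it. This is only a sketch; the class and the USER_AGENT_LIST setting it reads are assumed names, not part of this project.

# middlewares.py (sketch): rotate the User-Agent header on every request
import random


class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST is hypothetical; it could reuse USER_LIST from request.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        if self.user_agents:
            request.headers['User-Agent'] = random.choice(self.user_agents)
        return None

Enabling it would mean adding the class to DOWNLOADER_MIDDLEWARES, for example {'lianjia.middlewares.RandomUserAgentMiddleware': 400}.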
/lianjia_spiderV1.1/request.py:
--------------------------------------------------------------------------------
import requests
import random
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from config import SERVICE_ARGS
import time

USER_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11",
    "TaoBrowser/2.0 Safari/536.11"
]

headers = {
    'User-Agent': random.choice(USER_LIST),
    'Connection': 'keep-alive',
}

def request(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        else:
            print(url, ': error page!......')
            time.sleep(25)
            return None
    except RequestException as e:
        print('Failed to fetch', url, '......')
        time.sleep(25)
        return None

# Get the total number of listing pages for a district
def get_total_page(browser, url):
    browser.get(url)
    try:
        time.sleep(4)
        total_room = browser.find_element_by_xpath('/html/body/div[4]/div[1]/div[2]/h2/span').text
        if not total_room:
            return None
        if int(total_room) <= 30:
            return 1
        total = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div[1]/div[7]/div[2]/div/a[last()]"))
        )
        if not total.text.isdigit():
            total_page = browser.find_element_by_xpath('/html/body/div[4]/div[1]/div[7]/div[2]/div/a[last()-1]').text
        else:
            total_page = total.text
        return total_page
    except TimeoutException as e:
        print('Failed to get the total page count, retrying in 25 seconds')
        time.sleep(25)
        return get_total_page(browser, url)
--------------------------------------------------------------------------------
/lianjia_spiderV1.1/spider.py:
--------------------------------------------------------------------------------
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import re
import pymongo
import time
from selenium import webdriver
from config import *
from request import request, get_total_page

# Connect to the database
Client = pymongo.MongoClient(MONGO_URL)
db = Client[MONGO_DB]
table = db[MONGO_TABLE]

# Save one record to the database
def save_to_mongo(data):
    table.insert_one(data)
    print('Inserted one record')

# Extract the listings from one page
def parse_page(area, url):
    html = request(url)
    if html:
        doc = pq(html)
        items = doc('body > div.content > div.leftContent > ul > li').items()
        for item in items:
            all_conflg = item.find('.houseInfo').text().split(' | ')
            if all_conflg[1].find('别墅') != -1:
                del(all_conflg[1])
            if all_conflg[-1].find('有电梯') != -1:
                all_conflg[-1] = '有'
            else:
                all_conflg[-1] = '无'
            page_url = item.find('a.img').attr('href')
            title = item.find('.title').text()
            total_price = item.find('.totalPrice span').text()
            unit_price = item.find('.unitPrice').text()
            location = item.find('.positionInfo a').text()
            data = {
                '标题': title,
                'URL': page_url,
                '所在区': area,
                '所在地': location,
                '所在小区': all_conflg[0],
                '规格': all_conflg[1],
                '面积': all_conflg[2],
                '电梯': all_conflg[-1],
                '总价': total_price,
                '单价': unit_price,
                '查重url': url,
            }
            save_to_mongo(data)
        time.sleep(3)

# Build the listing URLs for each district of the target city
def ready_setup(url):
    group = []
    html = request(url)
    if html:
        soup = BeautifulSoup(html, 'lxml')
        for item in soup.find('div', class_='position').find_all('dl')[1].find_all('div')[1].find_all('a'):
            area = item.text
            second_url = head_url + item['href']
            second_html = request(second_url)
            if second_html:
                second_soup = BeautifulSoup(second_html, 'lxml')
                total_room = second_soup.find('div', class_='resultDes').find('h2').find('span').text
                if int(total_room) <= 3000:
                    goal_url = {
                        'area': area,
                        'url': second_url,
                    }
                    yield goal_url
                else:
                    for a in second_soup.find('div', class_='position').find_all('dl')[1].find_all('div')[2].find_all('a'):
                        goal_url = {
                            'area': area,
                            'url': head_url + a['href'],
                        }
                        yield goal_url

def main():
    browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
    browser.set_window_size(1400, 900)
    for url in ready_setup(entrance_url):
        print('Got a district URL, fetching its page count')
        total_page = get_total_page(browser, url['url'])
        if total_page:
            for page in range(1, int(total_page)+1):
                goal_url = url['url'] + 'pg' + str(page)
                if table.find_one({'查重url': goal_url}):
                    print(goal_url, 'has already been crawled......')
                    continue
                parse_page(url['area'], goal_url)
    print('All pages crawled.')
    browser.close()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
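spider.py above deduplicates by calling table.find_one({'查重url': ...}) before each listing page is crawled, which scans the whole collection unless that field is indexed. A one-off helper like the following (a hypothetical script, not part of the repository) would keep the lookup fast as the collection grows:

# build_index.py (hypothetical helper), reusing the settings from config.py
import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
table = client[MONGO_DB][MONGO_TABLE]

# index the dedup field queried by spider.py's find_one() check
table.create_index('查重url')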
/new_lianjia/lianjia/spiders/zufang.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
from copy import deepcopy
import json
import re


class ZufangSpider(Spider):
    name = 'zufang'
    allowed_domains = ['lianjia.com']

    def start_requests(self):
        url = 'https://bj.lianjia.com/'
        yield Request(url=url, callback=self.get_all_city)

    def get_all_city(self, response):
        for city in response.xpath('//*[contains(@class, "fc-main")]//a'):
            city_url = city.xpath('./@href').extract_first()
            is_go = re.match(r'https://[a-z]+\.lianjia\.com/', city_url)
            if is_go:
                zufang_url = city_url + 'zufang/'
                yield Request(url=zufang_url, callback=self.get_all_area)

    def get_all_area(self, response):
        for area in response.xpath('//*[@id="filter-options"]/dl[1]//a'):
            area_url = area.xpath('./@href').extract_first()
            if 'https' not in area_url:
                head_url = response.url.replace('/zufang/', '')
                area_url = head_url + area_url
                yield Request(url=area_url, callback=self.parse_index)

    def parse_index(self, response):
        ljConf = response.xpath('//script[1]').extract_first()
        city = re.search(r'city_name: \'(.*?)\',', ljConf).group(1)
        area = response.xpath('//*[@id="filter-options"]/dl[1]//div[1]//*[@class="on"]/text()').extract_first()
        house_nums = response.xpath('//*[contains(@class, "list-head")]/h2/span/text()').extract_first()

        if int(house_nums):
            for house_data in response.xpath('.//*[@id="house-lst"]//li'):
                title = house_data.xpath('.//h2/a/text()').extract_first()
                link_url = house_data.xpath('.//h2/a/@href').extract_first()
                price = house_data.xpath('.//*[@class="price"]/span/text()').extract_first()
                region = house_data.xpath('.//*[@class="region"]/text()').extract_first().replace('\xa0', '')
                zone = house_data.xpath('.//*[@class="zone"]//text()').extract_first().replace('\xa0', '')
                meters = house_data.xpath('.//*[@class="meters"]/text()').extract_first().replace('\xa0', '')
                direction = house_data.xpath('.//*[@class="where"]/span[last()]/text()').extract_first()
                other_datas = ''.join(house_data.xpath('.//*[@class="con"]//text()').extract()).split('/')
                labels = house_data.xpath('.//*[@class="chanquan"]//text()').extract()

                test = {
                    'title': title,              # title
                    'link_url': link_url,        # link
                    'price': price,              # price
                    'region': region,            # residential compound
                    'zone': zone,                # layout (rooms and halls)
                    'meters': meters,            # floor area
                    'direction': direction,      # orientation
                    'other_datas': other_datas,  # other info
                    'labels': labels,            # tags
                    'city': city,                # city
                    'area': area,                # district
                }
                print(test)

            # Follow the next page if there is one
            page_datas = response.xpath('//*[contains(@class, "page-box")]/@page-data').extract_first()
            page_datas = json.loads(page_datas)
            total_page = page_datas['totalPage']
            cur_page = page_datas['curPage']
            if cur_page < total_page:
                next_page = cur_page + 1
                head_url = re.sub(r'pg\d+/', '', response.url)
                next_page_url = head_url + 'pg' + str(next_page)
                yield Request(url=next_page_url, callback=self.parse_index)
--------------------------------------------------------------------------------
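zufang.py follows pagination by reading the page-data attribute of the page-box element and rebuilding the URL with the next pg number. The same rule, pulled out as a standalone helper with an example value, may make the logic easier to follow; the helper name and the sample URL are illustrative only, not code from the repository.

# next_page.py (sketch): the next-page rule used by parse_index, as a pure function
import json
import re


def next_page_url(current_url, page_data_attr):
    """Return the next listing page URL, or None when already on the last page."""
    page_data = json.loads(page_data_attr)              # e.g. '{"totalPage":34,"curPage":1}'
    if page_data['curPage'] >= page_data['totalPage']:
        return None
    head_url = re.sub(r'pg\d+/?$', '', current_url)     # strip any existing pgN suffix
    return head_url + 'pg' + str(page_data['curPage'] + 1)


if __name__ == '__main__':
    # hypothetical Qingdao rental listing URL, first page of 34
    print(next_page_url('https://qd.lianjia.com/zufang/shinan/', '{"totalPage":34,"curPage":1}'))
    # prints: https://qd.lianjia.com/zufang/shinan/pg2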
/new_lianjia/lianjia/spiders/ershou.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
from copy import deepcopy
import json
import re


class ErshouSpider(Spider):
    name = 'ershoufang'
    allowed_domains = ['lianjia.com']

    def start_requests(self):
        url = 'https://bj.lianjia.com/'
        yield Request(url=url, callback=self.get_all_city)

    def get_all_city(self, response):
        for city in response.xpath('//*[contains(@class, "fc-main")]//a'):
            city_url = city.xpath('./@href').extract_first()
            is_go = re.match(r'https://[a-z]+\.lianjia\.com/', city_url)
            if is_go:
                ershoufang_url = city_url + 'ershoufang/'
                yield Request(url=ershoufang_url, callback=self.get_all_area)

    def get_all_area(self, response):
        for area in response.xpath('//*[@id="position"]/dl[2]//a'):
            area_url = area.xpath('./@href').extract_first()
            if 'https' not in area_url and 'ershoufang' in area_url:
                head_url = response.url.replace('/ershoufang/', '')
                area_url = head_url + area_url
                yield Request(url=area_url, callback=self.get_all_detail_area)

    def get_all_detail_area(self, response):
        all_detail_area = response.xpath('//*[@id="position"]//*[contains(@class, "section_sub_sub_nav")]//a')
        for detail_area in all_detail_area:
            detail_area_url = detail_area.xpath('./@href').extract_first()
            head_url = re.sub(r'/ershoufang.*', '', response.url)
            detail_area_url = head_url + detail_area_url
            yield Request(url=detail_area_url, callback=self.parse_index)

    def parse_index(self, response):
        ljConf = response.xpath('//script[1]').extract_first()
        city = re.search(r'city_name: \'(.*?)\',', ljConf).group(1)
        area = response.xpath('//*[@id="position"]/dl[2]//*[@class="selected"]/text()').extract()[0]
        house_nums = response.xpath('//h2[contains(@class, "total")]/span/text()').extract_first().strip(' ')

        if int(house_nums):
            for house_data in response.xpath('.//*[@class="bigImgList"]//div[@class="item"]'):
                title = house_data.xpath('.//*[@class="title"]/text()').extract_first()
                link_url = house_data.xpath('.//*[@class="title"]/@href').extract_first()
                price = house_data.xpath('.//*[@class="price"]/span/text()').extract_first()
                house_info = house_data.xpath('.//*[@class="info"]//text()').extract()
                region = house_info[0]
                zone = house_info[2]
                meters = house_info[4]
                direction = house_info[6]
                house_style = house_info[8]
                try:
                    is_elevator = house_info[10]
                except IndexError:
                    is_elevator = '未知'
                labels = house_data.xpath('.//*[@class="tag"]//text()').extract()

                test = {
                    'title': title,                # title
                    'link_url': link_url,          # link
                    'price': price,                # price
                    'region': region,              # residential compound
                    'zone': zone,                  # layout (rooms and halls)
                    'meters': meters,              # floor area
                    'direction': direction,        # orientation
                    'house_style': house_style,    # renovation
                    'is_elevator': is_elevator,    # elevator or not
                    'labels': labels,              # tags
                    'city': city,                  # city
                    'area': area,                  # district
                }
                print(test)

            # Follow the next page if there is one
            page_datas = response.xpath('//*[contains(@class, "page-box")]/@page-data').extract_first()
            page_datas = json.loads(page_datas)
            total_page = page_datas['totalPage']
            cur_page = page_datas['curPage']
            if cur_page < total_page:
                next_page = cur_page + 1
                head_url = re.sub(r'pg\d+/', '', response.url)
                next_page_url = head_url + 'pg' + str(next_page)
                yield Request(url=next_page_url, callback=self.parse_index)
--------------------------------------------------------------------------------
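Both Scrapy spiders can also be launched from a single script instead of the scrapy CLI, which is what new_lianjia/start.py does with scrapy.cmdline for one spider at a time. The sketch below uses scrapy.crawler.CrawlerProcess; the script name and the idea of running both spiders in one process are assumptions, not something the repository provides.

# run_all.py (hypothetical), to be run from the new_lianjia directory
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('ershoufang')   # ErshouSpider
    process.crawl('zufang')       # ZufangSpider
    process.start()               # blocks until both spiders have finished

get_project_settings() only picks up lianjia.settings when the script sits next to scrapy.cfg, the same constraint start.py has.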