├── lianjia_spiderV1.1
│   ├── try.py
│   ├── img1.jpg
│   ├── config.py
│   ├── request.py
│   └── spider.py
├── new_lianjia
│   ├── lianjia
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── spiders
│   │   │   ├── __pycache__
│   │   │   │   ├── __init__.cpython-36.pyc
│   │   │   │   ├── ershou.cpython-36.pyc
│   │   │   │   └── zufang.cpython-36.pyc
│   │   │   ├── __init__.py
│   │   │   ├── zufang.py
│   │   │   └── ershou.py
│   │   ├── pipelines.py
│   │   ├── items.py
│   │   ├── settings.py
│   │   └── middlewares.py
│   ├── start.py
│   └── scrapy.cfg
└── README.md

/lianjia_spiderV1.1/try.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/new_lianjia/lianjia/__init__.py:
--------------------------------------------------------------------------------

--------------------------------------------------------------------------------
/lianjia_spiderV1.1/img1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/lianjia_spiderV1.1/img1.jpg
--------------------------------------------------------------------------------
/new_lianjia/lianjia/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/__pycache__/settings.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/spiders/__pycache__/__init__.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__pycache__/ershou.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/spiders/__pycache__/ershou.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__pycache__/zufang.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/longxiaofei/spider-lianjia/HEAD/new_lianjia/lianjia/spiders/__pycache__/zufang.cpython-36.pyc
--------------------------------------------------------------------------------
/new_lianjia/start.py:
--------------------------------------------------------------------------------
from scrapy.cmdline import execute

if __name__ == "__main__":
    # Run one spider per invocation; swap the comments to crawl rentals (zufang) instead.
    execute(['scrapy', 'crawl', 'ershoufang'])
    # execute(['scrapy', 'crawl', 'zufang'])
--------------------------------------------------------------------------------
/new_lianjia/lianjia/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.
--------------------------------------------------------------------------------
/new_lianjia/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = lianjia.settings

[deploy]
#url = http://localhost:6800/
project = lianjia
--------------------------------------------------------------------------------
/lianjia_spiderV1.1/config.py:
--------------------------------------------------------------------------------
# Configuration

# URLs of the city to crawl; the values below point at Qingdao's second-hand listings
entrance_url = 'http://qd.lianjia.com/ershoufang'
head_url = 'http://qd.lianjia.com'

# PhantomJS options
SERVICE_ARGS = ['--load-images=false', '--disk-cache=true']

# MongoDB settings
MONGO_URL = 'localhost'
MONGO_DB = 'lianjia'
MONGO_TABLE = 'qingdao'
--------------------------------------------------------------------------------
/new_lianjia/lianjia/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class LianjiaPipeline(object):
    def process_item(self, item, spider):
        return item
--------------------------------------------------------------------------------
/new_lianjia/lianjia/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class LianjiaItem(scrapy.Item):
    # define the fields for your item here like:
    # name = scrapy.Field()
    pass
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Lianjia Second-Hand Housing Spider
#### Update 2018-02-09 (new_lianjia)
##### Uses Scrapy to crawl second-hand housing and rental data for all districts of every city.
##### Only the scraping logic is written; results are printed rather than stored.
#### Function (lianjia_spiderV1.1): crawl the second-hand housing listings of a specified city.
#### The tools used are a bit of a mix, mainly to review the past month of spider study. No multiprocessing is used; the crawler sleeps 3 seconds after each page (1 second would also do), and at that speed the IP does not get banned.
#### A city with 10,000 second-hand listings takes about half an hour to finish; which city to crawl is set in the config file.
#### First time using GitHub; it took two hours of fiddling to finally get this uploaded.
### spider.py: the main program
### request.py: the requests helper and the function that fetches the total page count
### config.py: assorted configuration
### try.py: leftover, unused file
Scraped data:
![image](https://github.com/longxiaofei/spider-lianjia/blob/master/lianjia_spiderV1.1/img1.jpg?raw=true)
--------------------------------------------------------------------------------
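The README above notes that only the scraping logic of new_lianjia is written: the spiders print their results, and the items.py and pipelines.py shown above are still the untouched Scrapy stubs. A possible way to fill them in and persist the results to MongoDB is sketched below; the field names, the MONGO_URI / MONGO_DATABASE setting names, and the pipeline itself are assumptions, not code from this repository.

# items.py (sketch): an item covering the fields both spiders collect
import scrapy

class LianjiaItem(scrapy.Item):
    title = scrapy.Field()
    link_url = scrapy.Field()
    price = scrapy.Field()
    region = scrapy.Field()
    zone = scrapy.Field()
    meters = scrapy.Field()
    direction = scrapy.Field()
    labels = scrapy.Field()
    city = scrapy.Field()
    area = scrapy.Field()


# pipelines.py (sketch): store items in MongoDB; MONGO_URI / MONGO_DATABASE
# are hypothetical settings, not defined in this project's settings.py
import pymongo

class MongoPipeline(object):
    def __init__(self, mongo_uri, mongo_db):
        self.mongo_uri = mongo_uri
        self.mongo_db = mongo_db

    @classmethod
    def from_crawler(cls, crawler):
        return cls(
            mongo_uri=crawler.settings.get('MONGO_URI', 'mongodb://localhost:27017'),
            mongo_db=crawler.settings.get('MONGO_DATABASE', 'lianjia'),
        )

    def open_spider(self, spider):
        self.client = pymongo.MongoClient(self.mongo_uri)
        self.db = self.client[self.mongo_db]

    def close_spider(self, spider):
        self.client.close()

    def process_item(self, item, spider):
        # one collection per spider, e.g. 'ershoufang' and 'zufang'
        self.db[spider.name].insert_one(dict(item))
        return item

To take effect, ITEM_PIPELINES in settings.py would have to point at lianjia.pipelines.MongoPipeline, and the spiders would have to yield their dicts or items instead of printing them.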
/new_lianjia/lianjia/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for lianjia project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'lianjia'

SPIDER_MODULES = ['lianjia.spiders']
NEWSPIDER_MODULE = 'lianjia.spiders'


ROBOTSTXT_OBEY = False

CONCURRENT_REQUESTS = 1

DOWNLOAD_DELAY = 4

COOKIES_ENABLED = False

#TELNETCONSOLE_ENABLED = False

DEFAULT_REQUEST_HEADERS = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate, br',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Cache-Control': 'max-age=0',
    'Connection': 'keep-alive',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.140 Safari/537.36',
}

#SPIDER_MIDDLEWARES = {
#    'lianjia.middlewares.LianjiaSpiderMiddleware': 543,
#}

#DOWNLOADER_MIDDLEWARES = {
#    'lianjia.middlewares.MyCustomDownloaderMiddleware': 543,
#}

#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'lianjia.pipelines.LianjiaPipeline': 300,
#}
--------------------------------------------------------------------------------
/new_lianjia/lianjia/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class LianjiaSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
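settings.py above pins a single User-Agent in DEFAULT_REQUEST_HEADERS, while the V1.1 script request.py below picks a random one from USER_LIST. If per-request rotation were wanted inside the Scrapy project as well, a small downloader middleware along the following lines could provide it. This is only a sketch; the class and the USER_AGENT_LIST setting it reads are assumed names, not part of this project.

# middlewares.py (sketch): rotate the User-Agent header on every request
import random


class RandomUserAgentMiddleware(object):
    def __init__(self, user_agents):
        self.user_agents = user_agents

    @classmethod
    def from_crawler(cls, crawler):
        # USER_AGENT_LIST is hypothetical; it could reuse USER_LIST from request.py
        return cls(crawler.settings.getlist('USER_AGENT_LIST'))

    def process_request(self, request, spider):
        if self.user_agents:
            request.headers['User-Agent'] = random.choice(self.user_agents)
        return None

Enabling it would mean adding the class to DOWNLOADER_MIDDLEWARES, for example {'lianjia.middlewares.RandomUserAgentMiddleware': 400}.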
/lianjia_spiderV1.1/request.py:
--------------------------------------------------------------------------------
import requests
import random
from requests.exceptions import RequestException
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException
from config import SERVICE_ARGS
import time

USER_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.95 Safari/537.36 OPR/26.0.1656.60",
    "Opera/8.0 (Windows NT 5.1; U; en)",
    "Mozilla/5.0 (Windows NT 5.1; U; en; rv:1.8.1) Gecko/20061208 Firefox/2.0.0 Opera 9.50",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; en) Opera 9.50",
    "Mozilla/5.0 (Windows NT 6.1; WOW64; rv:34.0) Gecko/20100101 Firefox/34.0",
    "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10",
    "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)",
    "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11",
    "TaoBrowser/2.0 Safari/536.11"
]

headers = {
    'User-Agent': random.choice(USER_LIST),
    'Connection': 'keep-alive',
}

def request(url):
    try:
        response = requests.get(url, headers=headers)
        if response.status_code == 200:
            return response.text
        else:
            print(url, ': error page!......')
            time.sleep(25)
            return None
    except RequestException as e:
        print('Failed to fetch', url, '......')
        time.sleep(25)
        return None

# Get the total number of listing pages for a district
def get_total_page(browser, url):
    browser.get(url)
    try:
        time.sleep(4)
        total_room = browser.find_element_by_xpath('/html/body/div[4]/div[1]/div[2]/h2/span').text
        if not total_room:
            return None
        if int(total_room) <= 30:
            return 1
        total = WebDriverWait(browser, 30).until(
            EC.presence_of_element_located((By.XPATH, "/html/body/div[4]/div[1]/div[7]/div[2]/div/a[last()]"))
        )
        if not total.text.isdigit():
            total_page = browser.find_element_by_xpath('/html/body/div[4]/div[1]/div[7]/div[2]/div/a[last()-1]').text
        else:
            total_page = total.text
        return total_page
    except TimeoutException as e:
        print('Failed to get the total page count, retrying in 25 seconds')
        time.sleep(25)
        return get_total_page(browser, url)
--------------------------------------------------------------------------------
/lianjia_spiderV1.1/spider.py:
--------------------------------------------------------------------------------
from pyquery import PyQuery as pq
from bs4 import BeautifulSoup
import re
import pymongo
import time
from selenium import webdriver
from config import *
from request import request, get_total_page

# Connect to the database
Client = pymongo.MongoClient(MONGO_URL)
db = Client[MONGO_DB]
table = db[MONGO_TABLE]

# Save one record to the database
def save_to_mongo(data):
    table.insert_one(data)
    print('Inserted one record')

# Extract the listings from one page
def parse_page(area, url):
    html = request(url)
    if html:
        doc = pq(html)
        items = doc('body > div.content > div.leftContent > ul > li').items()
        for item in items:
            all_conflg = item.find('.houseInfo').text().split(' | ')
            if all_conflg[1].find('别墅') != -1:
                del(all_conflg[1])
            if all_conflg[-1].find('有电梯') != -1:
                all_conflg[-1] = '有'
            else:
                all_conflg[-1] = '无'
            page_url = item.find('a.img').attr('href')
            title = item.find('.title').text()
            total_price = item.find('.totalPrice span').text()
            unit_price = item.find('.unitPrice').text()
            location = item.find('.positionInfo a').text()
            data = {
                '标题': title,
                'URL': page_url,
                '所在区': area,
                '所在地': location,
                '所在小区': all_conflg[0],
                '规格': all_conflg[1],
                '面积': all_conflg[2],
                '电梯': all_conflg[-1],
                '总价': total_price,
                '单价': unit_price,
                '查重url': url,
            }
            save_to_mongo(data)
        time.sleep(3)

# Build the listing URLs for each district of the target city
def ready_setup(url):
    group = []
    html = request(url)
    if html:
        soup = BeautifulSoup(html, 'lxml')
        for item in soup.find('div', class_='position').find_all('dl')[1].find_all('div')[1].find_all('a'):
            area = item.text
            second_url = head_url + item['href']
            second_html = request(second_url)
            if second_html:
                second_soup = BeautifulSoup(second_html, 'lxml')
                total_room = second_soup.find('div', class_='resultDes').find('h2').find('span').text
                if int(total_room) <= 3000:
                    goal_url = {
                        'area': area,
                        'url': second_url,
                    }
                    yield goal_url
                else:
                    for a in second_soup.find('div', class_='position').find_all('dl')[1].find_all('div')[2].find_all('a'):
                        goal_url = {
                            'area': area,
                            'url': head_url + a['href'],
                        }
                        yield goal_url

def main():
    browser = webdriver.PhantomJS(service_args=SERVICE_ARGS)
    browser.set_window_size(1400, 900)
    for url in ready_setup(entrance_url):
        print('Got a district URL, fetching its page count')
        total_page = get_total_page(browser, url['url'])
        if total_page:
            for page in range(1, int(total_page)+1):
                goal_url = url['url'] + 'pg' + str(page)
                if table.find_one({'查重url': goal_url}):
                    print(goal_url, 'has already been crawled......')
                    continue
                parse_page(url['area'], goal_url)
    print('All pages crawled.')
    browser.close()

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
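spider.py above deduplicates by calling table.find_one({'查重url': ...}) before each listing page is crawled, which scans the whole collection unless that field is indexed. A one-off helper like the following (a hypothetical script, not part of the repository) would keep the lookup fast as the collection grows:

# build_index.py (hypothetical helper), reusing the settings from config.py
import pymongo
from config import MONGO_URL, MONGO_DB, MONGO_TABLE

client = pymongo.MongoClient(MONGO_URL)
table = client[MONGO_DB][MONGO_TABLE]

# index the dedup field queried by spider.py's find_one() check
table.create_index('查重url')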
/new_lianjia/lianjia/spiders/zufang.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
from copy import deepcopy
import json
import re


class ZufangSpider(Spider):
    name = 'zufang'
    allowed_domains = ['lianjia.com']

    def start_requests(self):
        url = 'https://bj.lianjia.com/'
        yield Request(url=url, callback=self.get_all_city)

    def get_all_city(self, response):
        for city in response.xpath('//*[contains(@class, "fc-main")]//a'):
            city_url = city.xpath('./@href').extract_first()
            is_go = re.match(r'https://[a-z]+\.lianjia\.com/', city_url)
            if is_go:
                zufang_url = city_url + 'zufang/'
                yield Request(url=zufang_url, callback=self.get_all_area)

    def get_all_area(self, response):
        for area in response.xpath('//*[@id="filter-options"]/dl[1]//a'):
            area_url = area.xpath('./@href').extract_first()
            if 'https' not in area_url:
                head_url = response.url.replace('/zufang/', '')
                area_url = head_url + area_url
                yield Request(url=area_url, callback=self.parse_index)

    def parse_index(self, response):
        ljConf = response.xpath('//script[1]').extract_first()
        city = re.search(r'city_name: \'(.*?)\',', ljConf).group(1)
        area = response.xpath('//*[@id="filter-options"]/dl[1]//div[1]//*[@class="on"]/text()').extract_first()
        house_nums = response.xpath('//*[contains(@class, "list-head")]/h2/span/text()').extract_first()

        if int(house_nums):
            for house_data in response.xpath('.//*[@id="house-lst"]//li'):
                title = house_data.xpath('.//h2/a/text()').extract_first()
                link_url = house_data.xpath('.//h2/a/@href').extract_first()
                price = house_data.xpath('.//*[@class="price"]/span/text()').extract_first()
                region = house_data.xpath('.//*[@class="region"]/text()').extract_first().replace('\xa0', '')
                zone = house_data.xpath('.//*[@class="zone"]//text()').extract_first().replace('\xa0', '')
                meters = house_data.xpath('.//*[@class="meters"]/text()').extract_first().replace('\xa0', '')
                direction = house_data.xpath('.//*[@class="where"]/span[last()]/text()').extract_first()
                other_datas = ''.join(house_data.xpath('.//*[@class="con"]//text()').extract()).split('/')
                labels = house_data.xpath('.//*[@class="chanquan"]//text()').extract()

                test = {
                    'title': title,              # title
                    'link_url': link_url,        # link
                    'price': price,              # price
                    'region': region,            # residential compound
                    'zone': zone,                # layout (rooms and halls)
                    'meters': meters,            # floor area
                    'direction': direction,      # orientation
                    'other_datas': other_datas,  # other info
                    'labels': labels,            # tags
                    'city': city,                # city
                    'area': area,                # district
                }
                print(test)

            # Follow the next page if there is one
            page_datas = response.xpath('//*[contains(@class, "page-box")]/@page-data').extract_first()
            page_datas = json.loads(page_datas)
            total_page = page_datas['totalPage']
            cur_page = page_datas['curPage']
            if cur_page < total_page:
                next_page = cur_page + 1
                head_url = re.sub(r'pg\d+/', '', response.url)
                next_page_url = head_url + 'pg' + str(next_page)
                yield Request(url=next_page_url, callback=self.parse_index)
--------------------------------------------------------------------------------
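zufang.py follows pagination by reading the page-data attribute of the page-box element and rebuilding the URL with the next pg number. The same rule, pulled out as a standalone helper with an example value, may make the logic easier to follow; the helper name and the sample URL are illustrative only, not code from the repository.

# next_page.py (sketch): the next-page rule used by parse_index, as a pure function
import json
import re


def next_page_url(current_url, page_data_attr):
    """Return the next listing page URL, or None when already on the last page."""
    page_data = json.loads(page_data_attr)              # e.g. '{"totalPage":34,"curPage":1}'
    if page_data['curPage'] >= page_data['totalPage']:
        return None
    head_url = re.sub(r'pg\d+/?$', '', current_url)     # strip any existing pgN suffix
    return head_url + 'pg' + str(page_data['curPage'] + 1)


if __name__ == '__main__':
    # hypothetical Qingdao rental listing URL, first page of 34
    print(next_page_url('https://qd.lianjia.com/zufang/shinan/', '{"totalPage":34,"curPage":1}'))
    # prints: https://qd.lianjia.com/zufang/shinan/pg2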
/new_lianjia/lianjia/spiders/ershou.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy import Spider, Request
from copy import deepcopy
import json
import re


class ErshouSpider(Spider):
    name = 'ershoufang'
    allowed_domains = ['lianjia.com']

    def start_requests(self):
        url = 'https://bj.lianjia.com/'
        yield Request(url=url, callback=self.get_all_city)

    def get_all_city(self, response):
        for city in response.xpath('//*[contains(@class, "fc-main")]//a'):
            city_url = city.xpath('./@href').extract_first()
            is_go = re.match(r'https://[a-z]+\.lianjia\.com/', city_url)
            if is_go:
                ershoufang_url = city_url + 'ershoufang/'
                yield Request(url=ershoufang_url, callback=self.get_all_area)

    def get_all_area(self, response):
        for area in response.xpath('//*[@id="position"]/dl[2]//a'):
            area_url = area.xpath('./@href').extract_first()
            if 'https' not in area_url and 'ershoufang' in area_url:
                head_url = response.url.replace('/ershoufang/', '')
                area_url = head_url + area_url
                yield Request(url=area_url, callback=self.get_all_detail_area)

    def get_all_detail_area(self, response):
        all_detail_area = response.xpath('//*[@id="position"]//*[contains(@class, "section_sub_sub_nav")]//a')
        for detail_area in all_detail_area:
            detail_area_url = detail_area.xpath('./@href').extract_first()
            head_url = re.sub(r'/ershoufang.*', '', response.url)
            detail_area_url = head_url + detail_area_url
            yield Request(url=detail_area_url, callback=self.parse_index)

    def parse_index(self, response):
        ljConf = response.xpath('//script[1]').extract_first()
        city = re.search(r'city_name: \'(.*?)\',', ljConf).group(1)
        area = response.xpath('//*[@id="position"]/dl[2]//*[@class="selected"]/text()').extract()[0]
        house_nums = response.xpath('//h2[contains(@class, "total")]/span/text()').extract_first().strip(' ')

        if int(house_nums):
            for house_data in response.xpath('.//*[@class="bigImgList"]//div[@class="item"]'):
                title = house_data.xpath('.//*[@class="title"]/text()').extract_first()
                link_url = house_data.xpath('.//*[@class="title"]/@href').extract_first()
                price = house_data.xpath('.//*[@class="price"]/span/text()').extract_first()
                house_info = house_data.xpath('.//*[@class="info"]//text()').extract()
                region = house_info[0]
                zone = house_info[2]
                meters = house_info[4]
                direction = house_info[6]
                house_style = house_info[8]
                try:
                    is_elevator = house_info[10]
                except IndexError:
                    is_elevator = '未知'
                labels = house_data.xpath('.//*[@class="tag"]//text()').extract()

                test = {
                    'title': title,                # title
                    'link_url': link_url,          # link
                    'price': price,                # price
                    'region': region,              # residential compound
                    'zone': zone,                  # layout (rooms and halls)
                    'meters': meters,              # floor area
                    'direction': direction,        # orientation
                    'house_style': house_style,    # renovation
                    'is_elevator': is_elevator,    # elevator or not
                    'labels': labels,              # tags
                    'city': city,                  # city
                    'area': area,                  # district
                }
                print(test)

            # Follow the next page if there is one
            page_datas = response.xpath('//*[contains(@class, "page-box")]/@page-data').extract_first()
            page_datas = json.loads(page_datas)
            total_page = page_datas['totalPage']
            cur_page = page_datas['curPage']
            if cur_page < total_page:
                next_page = cur_page + 1
                head_url = re.sub(r'pg\d+/', '', response.url)
                next_page_url = head_url + 'pg' + str(next_page)
                yield Request(url=next_page_url, callback=self.parse_index)
--------------------------------------------------------------------------------
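Both Scrapy spiders can also be launched from a single script instead of the scrapy CLI, which is what new_lianjia/start.py does with scrapy.cmdline for one spider at a time. The sketch below uses scrapy.crawler.CrawlerProcess; the script name and the idea of running both spiders in one process are assumptions, not something the repository provides.

# run_all.py (hypothetical), to be run from the new_lianjia directory
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

if __name__ == '__main__':
    process = CrawlerProcess(get_project_settings())
    process.crawl('ershoufang')   # ErshouSpider
    process.crawl('zufang')       # ZufangSpider
    process.start()               # blocks until both spiders have finished

get_project_settings() only picks up lianjia.settings when the script sits next to scrapy.cfg, the same constraint start.py has.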