├── README.md
├── crawl
│   ├── .DS_Store
│   ├── crawl
│   │   ├── .DS_Store
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── .DS_Store
│   │       ├── LianJiaSpider.py
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── LianJiaSpider.cpython-36.pyc
│   │       │   └── __init__.cpython-36.pyc
│   │       └── run.bat
│   ├── scrapy.cfg
│   └── sql
│       ├── data.sql
│       ├── house.sql
│       └── 说明.txt
├── data.xls
├── house.xlsx
├── images
│   ├── img1.png
│   ├── img2.png
│   ├── img3.png
│   ├── img4.png
│   ├── img5.png
│   └── img6.png
└── main.ipynb

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Lianjia Data Analysis Example

![](images/img1.png)

![](images/img2.png)

![](images/img3.png)

![](images/img4.png)

![](images/img5.png)

![](images/img6.png)

--------------------------------------------------------------------------------
/crawl/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/.DS_Store

--------------------------------------------------------------------------------
/crawl/crawl/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/.DS_Store

--------------------------------------------------------------------------------
/crawl/crawl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__init__.py

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/__init__.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/items.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/pipelines.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/settings.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class House(scrapy.Item):
    className = 'house'

    # Basic listing information
    houseTitle = scrapy.Field()             # listing title
    houseTotalMoney = scrapy.Field()        # total price
    houseSinglePrice = scrapy.Field()       # unit price (per square metre)
    houseDownPayment = scrapy.Field()       # down payment
    houseGardenName = scrapy.Field()        # residential compound name
    houseLocation = scrapy.Field()          # compound location
    houseNumber = scrapy.Field()            # Lianjia listing number

    # Basic attributes
    houseType = scrapy.Field()              # floor plan
    houseFloor = scrapy.Field()             # floor
    houseBuildingArea = scrapy.Field()      # gross floor area
    houseStructure = scrapy.Field()         # floor-plan structure
    houseInnerArea = scrapy.Field()         # net internal area
    houseBuildingType = scrapy.Field()      # building type
    houseOrientation = scrapy.Field()       # orientation
    houseBuildingStructure = scrapy.Field() # building structure
    houseDecoration = scrapy.Field()        # decoration
    houseElevatorRatio = scrapy.Field()     # elevator-to-household ratio
    houseElevator = scrapy.Field()          # elevator availability
    housePrivilege = scrapy.Field()         # property-right term

    # Transaction attributes
    houseListDate = scrapy.Field()          # listing date
    houseTradeProperty = scrapy.Field()     # trade ownership type
    houseLastTrade = scrapy.Field()         # last transaction date
    houseUsage = scrapy.Field()             # intended use
    houseAgeLimit = scrapy.Field()          # ownership duration
    housePrivilegeProperty = scrapy.Field() # property-right holder
    housePledge = scrapy.Field()            # mortgage information
    houseRecord = scrapy.Field()            # filing record
    houseImg = scrapy.Field()               # floor-plan image

    # Crawler bookkeeping
    houseUrl = scrapy.Field()               # listing URL
    houseRefererUrl = scrapy.Field()        # referring URL
    houseCrawlTime = scrapy.Field()         # crawl timestamp
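A `House` item behaves like a dict; a minimal sketch of the access pattern the
spider and `MysqlPipeline` rely on (the title string is an invented
placeholder, not real data):

```python
from crawl.items import House

item = House()
item['houseTitle'] = 'sample listing title'  # fields are assigned dict-style
missing = item.get('houseInnerArea', '')     # unset fields fall back to '',
                                             # which is how MysqlPipeline
                                             # reads every column value
print(item.className, missing)               # -> house
```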
--------------------------------------------------------------------------------
/crawl/crawl/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CrawlSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CrawlDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/crawl/crawl/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


import logging

import pymysql.cursors

logging.basicConfig(filename='scrapy.log')
logger = logging.getLogger(__name__)


class MysqlPipeline(object):

    def open_spider(self, spider):
        self.mysql_conn = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='root',
            db='temp',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor)

    def process_item(self, item, spider):
        if item.className == 'house':
            try:
                # deduplicate on the listing title: skip rows already stored
                sql_search = "select houseTitle from `house` where `houseTitle`=%s"
                with self.mysql_conn.cursor() as cursor:
                    cursor.execute(sql_search, (item.get('houseTitle', ''),))
                    houseIsExist = cursor.fetchone()

                    if houseIsExist is None:
                        sql_write = "insert into `house` (`houseId`, `houseTitle`, `houseTotalMoney`, `houseSinglePrice`, `houseDownPayment`, `houseGardenName`, \
                            `houseLocation`, `houseNumber`, `houseType`, `houseFloor`, `houseBuildingArea`, `houseStructure`, \
                            `houseInnerArea`, `houseBuildingType`, `houseOrientation`, `houseBuildingStructure`, `houseDecoration`, `houseElevatorRatio`, \
                            `houseElevator`, `housePrivilege`, `houseListDate`, `houseTradeProperty`, `houseLastTrade`, `houseUsage`, \
                            `houseAgeLimit`, `housePrivilegeProperty`, `housePledge`, `houseRecord`, `houseImg`, `houseUrl`, `houseRefererUrl`, `houseCrawlTime`) \
                            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"

                        # houseId is sent as 0 so MySQL's AUTO_INCREMENT assigns it
                        cursor.execute(sql_write, (
                            0,
                            item.get('houseTitle', ''), item.get('houseTotalMoney', ''),
                            item.get('houseSinglePrice', ''), item.get('houseDownPayment', ''),
                            item.get('houseGardenName', ''), item.get('houseLocation', ''),
                            item.get('houseNumber', ''), item.get('houseType', ''),
                            item.get('houseFloor', ''), item.get('houseBuildingArea', ''),
                            item.get('houseStructure', ''), item.get('houseInnerArea', ''),
                            item.get('houseBuildingType', ''), item.get('houseOrientation', ''),
                            item.get('houseBuildingStructure', ''), item.get('houseDecoration', ''),
                            item.get('houseElevatorRatio', ''), item.get('houseElevator', ''),
                            item.get('housePrivilege', ''), item.get('houseListDate', ''),
                            item.get('houseTradeProperty', ''), item.get('houseLastTrade', ''),
                            item.get('houseUsage', ''), item.get('houseAgeLimit', ''),
                            item.get('housePrivilegeProperty', ''), item.get('housePledge', ''),
                            item.get('houseRecord', ''), item.get('houseImg', ''),
                            item.get('houseUrl', ''), item.get('houseRefererUrl', ''),
                            item.get('houseCrawlTime', '')))

                        self.mysql_conn.commit()

            except Exception as e:
                logger.error(item.get('houseUrl', ''))
                logger.error(e)
                logger.error('---------------------------------------\n')
        return item

    def close_spider(self, spider):
        self.mysql_conn.close()
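Once the pipeline has filled the `house` table, the rows can be pulled into
pandas for the analysis in `main.ipynb`. A minimal sketch; the connection
parameters mirror `open_spider` above and are assumptions about the local
setup, not part of the project:

```python
import pandas as pd
import pymysql

# credentials copied from pipelines.py; adjust for your own MySQL instance
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='temp', charset='utf8mb4')
df = pd.read_sql('SELECT * FROM house', conn)
conn.close()

print(df.shape)  # one row per scraped listing
```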
--------------------------------------------------------------------------------
/crawl/crawl/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for crawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'crawl'

SPIDER_MODULES = ['crawl.spiders']
NEWSPIDER_MODULE = 'crawl.spiders'


ITEM_PIPELINES = {
    'crawl.pipelines.MysqlPipeline': 100,
}

LOG_LEVEL = 'ERROR'
LOG_FILE = 'scrapy.log'


# Obey robots.txt rules
ROBOTSTXT_OBEY = True
USER_AGENT = "Baiduspider"
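The project configures no throttling. A hedged sketch of standard Scrapy
options that could be added to this file to crawl more politely; the values
are illustrative assumptions, not project settings:

```python
DOWNLOAD_DELAY = 1                  # pause between requests, in seconds
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # cap parallel requests per domain
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay to latency
```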
--------------------------------------------------------------------------------
/crawl/crawl/spiders/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/spiders/.DS_Store

--------------------------------------------------------------------------------
/crawl/crawl/spiders/LianJiaSpider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# created author: tianxing
# created date: 2018-05-30

import scrapy
import random
import datetime
import re
from crawl.items import House


class LianjiaSpider(scrapy.Spider):
    # Spider name; start the crawl with: scrapy crawl lianjia
    name = 'lianjia'

    start_urls = []

    # First 100 result pages of Chengdu second-hand listings
    for page in range(1, 101):
        start_urls.append('https://cd.lianjia.com/ershoufang/pg' + str(page) + '/')

    # Shuffle so the pages are not requested in strict order
    random.shuffle(start_urls)

    def parse(self, response):
        links = response.xpath('//ul[@class="sellListContent"]/li')

        for link in links:
            item = House()
            # referring URL
            item['houseRefererUrl'] = response.url

            singleUrl = link.xpath('div[@class="info clear"]/div[@class="title"]/a/@href').extract()[0]

            # request the listing's detail page
            yield scrapy.Request(url=singleUrl, meta={'item': item}, callback=self.parse_page)

    def parse_page(self, response):
        item = response.meta['item']

        # The bare excepts below deliberately map any missing node to ''.

        # Basic information: title, total price, unit price, down payment,
        # taxes, compound name, location, Lianjia listing number
        # listing title
        try:
            item['houseTitle'] = response.xpath('//div[@class="content"]/div[@class="title"]/h1/text()').extract()[0].strip()
        except:
            item['houseTitle'] = ''

        # total price
        try:
            # numeric part
            houseTotalMoneyNum = response.xpath('//div[@class="price "]/span[@class="total"]/text()').extract()[0].replace(' ', '')
            # unit
            houseTotalMoneyUnit = response.xpath('//div[@class="price "]/span[@class="unit"]/span/text()').extract()[0].replace(' ', '')
            item['houseTotalMoney'] = houseTotalMoneyNum + houseTotalMoneyUnit
        except:
            item['houseTotalMoney'] = ''

        # unit price
        try:
            houseSinglePrice = response.xpath('//span[@class="unitPriceValue"]')
            item['houseSinglePrice'] = houseSinglePrice.xpath('string(.)').extract()[0].replace(' ', '')
        except:
            item['houseSinglePrice'] = ''

        # down payment
        # NOTE: the regex pattern is empty in the source dump (evidently lost
        # in extraction), so as written this branch always captures ''
        try:
            item['houseDownPayment'] = re.findall(r'', response.text)[0]
        except:
            item['houseDownPayment'] = ''

        # compound name
        try:
            item['houseGardenName'] = response.xpath('//div[@class="aroundInfo"]/div[@class="communityName"]/a[1]/text()').extract()[0].replace(' ', '')
        except:
            item['houseGardenName'] = ''

        # compound location
        try:
            houseLocation = response.xpath('//div[@class="aroundInfo"]/div[@class="areaName"]/span[@class="info"]')
            item['houseLocation'] = houseLocation.xpath('string(.)').extract()[0].strip().replace('\xa0', '\t')
        except:
            item['houseLocation'] = ''

        # Lianjia listing number
        try:
            item['houseNumber'] = response.xpath('//div[@class="aroundInfo"]/div[@class="houseRecord"]/span[2]/text()').extract()[0].replace('"', '')
        except:
            item['houseNumber'] = ''

        # Basic attributes: floor plan, floor, gross floor area, floor-plan
        # structure, net internal area, building type, orientation, building
        # structure, decoration, elevator ratio, elevator, property-right term
        # base XPath for the basic-attributes list
        baseXpath = response.xpath('//div[@class="introContent"]/div[@class="base"]/div[@class="content"]/ul')

        # floor plan
        try:
            item['houseType'] = baseXpath.xpath('li[1]/text()').extract()[0].replace(' ', '')
        except:
            item['houseType'] = ''

        # floor
        try:
            item['houseFloor'] = baseXpath.xpath('li[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseFloor'] = ''

        # gross floor area
        try:
            item['houseBuildingArea'] = baseXpath.xpath('li[3]/text()').extract()[0].replace(' ', '')
        except:
            item['houseBuildingArea'] = ''

        # floor-plan structure
        try:
            item['houseStructure'] = baseXpath.xpath('li[4]/text()').extract()[0].replace(' ', '')
        except:
            item['houseStructure'] = ''

        # net internal area
        try:
            item['houseInnerArea'] = baseXpath.xpath('li[5]/text()').extract()[0].replace(' ', '')
        except:
            item['houseInnerArea'] = ''

        # building type
        try:
            item['houseBuildingType'] = baseXpath.xpath('li[6]/text()').extract()[0].replace(' ', '')
        except:
            item['houseBuildingType'] = ''

        # orientation
        try:
            item['houseOrientation'] = baseXpath.xpath('li[7]/text()').extract()[0].replace(' ', '')
        except:
            item['houseOrientation'] = ''

        # building structure
        try:
            item['houseBuildingStructure'] = baseXpath.xpath('li[8]/text()').extract()[0].replace(' ', '')
        except:
            item['houseBuildingStructure'] = ''

        # decoration
        try:
            item['houseDecoration'] = baseXpath.xpath('li[9]/text()').extract()[0].replace(' ', '')
        except:
            item['houseDecoration'] = ''

        # elevator-to-household ratio
        try:
            item['houseElevatorRatio'] = baseXpath.xpath('li[10]/text()').extract()[0].replace(' ', '')
        except:
            item['houseElevatorRatio'] = ''

        # elevator availability
        try:
            item['houseElevator'] = baseXpath.xpath('li[11]/text()').extract()[0].replace(' ', '')
        except:
            item['houseElevator'] = ''

        # property-right term
        try:
            item['housePrivilege'] = baseXpath.xpath('li[12]/text()').extract()[0].replace(' ', '')
        except:
            item['housePrivilege'] = ''

        # Transaction information: listing date, trade ownership, last
        # transaction, usage, ownership duration, right holder, mortgage,
        # filing record
        # base XPath for the transaction-attributes list
        transXpath = response.xpath('//div[@class="introContent"]/div[@class="transaction"]/div[@class="content"]/ul')

        # listing date
        try:
            item['houseListDate'] = transXpath.xpath('li[1]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseListDate'] = ''
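(Continuation of LianJiaSpider.py follows; after it, a sketch of how the
repeated try/except blocks could be collapsed.)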
        # trade ownership type
        try:
            item['houseTradeProperty'] = transXpath.xpath('li[2]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseTradeProperty'] = ''

        # last transaction date
        try:
            item['houseLastTrade'] = transXpath.xpath('li[3]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseLastTrade'] = ''

        # intended use
        try:
            item['houseUsage'] = transXpath.xpath('li[4]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseUsage'] = ''

        # ownership duration
        try:
            item['houseAgeLimit'] = transXpath.xpath('li[5]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseAgeLimit'] = ''

        # property-right holder
        try:
            item['housePrivilegeProperty'] = transXpath.xpath('li[6]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['housePrivilegeProperty'] = ''

        # mortgage information
        try:
            item['housePledge'] = transXpath.xpath('li[7]/span[2]/text()').extract()[0].replace(' ', '').replace('\n', '')
        except:
            item['housePledge'] = ''

        # filing record
        try:
            item['houseRecord'] = transXpath.xpath('li[8]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseRecord'] = ''

        # floor-plan image (swap the thumbnail size for a larger rendition)
        try:
            item['houseImg'] = response.xpath("//div[@class='imgdiv']/img/@src").extract()[0].replace("240x180", "533x400")
        except:
            item['houseImg'] = ''

        # Crawler bookkeeping
        # listing URL
        item['houseUrl'] = response.url

        # crawl timestamp
        item['houseCrawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # yield the populated item
        yield item
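The twenty near-identical try/except blocks in `parse_page` could be collapsed
into one helper. A sketch of that refactor; `safe_extract` is a hypothetical
name, not part of the project:

```python
def safe_extract(selector, xpath):
    """Return the first text node at `xpath` with spaces stripped, or ''."""
    try:
        return selector.xpath(xpath).extract()[0].replace(' ', '')
    except IndexError:
        return ''

# inside parse_page the attribute blocks would then shrink to, e.g.:
#   item['houseType'] = safe_extract(baseXpath, 'li[1]/text()')
#   item['houseFloor'] = safe_extract(baseXpath, 'li[2]/text()')
```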
--------------------------------------------------------------------------------
/crawl/crawl/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/crawl/crawl/spiders/__pycache__/LianJiaSpider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/spiders/__pycache__/LianJiaSpider.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/spiders/__pycache__/__init__.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/spiders/run.bat:
--------------------------------------------------------------------------------
scrapy crawl lianjia

--------------------------------------------------------------------------------
/crawl/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = crawl.settings

[deploy]
#url = http://localhost:6800/
project = crawl

--------------------------------------------------------------------------------
/crawl/sql/house.sql:
--------------------------------------------------------------------------------
/*
Navicat MySQL Data Transfer

Source Server         : mysql_127.0.0.1_local
Source Server Version : 50722
Source Host           : 127.0.0.1:3306
Source Database       : crawl

Target Server Type    : MYSQL
Target Server Version : 50722
File Encoding         : 65001

Date: 2019-07-17 10:56:15
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for house
-- ----------------------------
DROP TABLE IF EXISTS `house`;
CREATE TABLE `house` (
  `houseId` int(11) NOT NULL AUTO_INCREMENT,
  `houseTitle` longtext,
  `houseTotalMoney` longtext,
  `houseSinglePrice` longtext,
  `houseDownPayment` longtext,
  `houseGardenName` longtext,
  `houseLocation` longtext,
  `houseNumber` longtext,
  `houseType` longtext,
  `houseFloor` longtext,
  `houseBuildingArea` longtext,
  `houseStructure` longtext,
  `houseInnerArea` longtext,
  `houseBuildingType` longtext,
  `houseOrientation` longtext,
  `houseBuildingStructure` longtext,
  `houseDecoration` longtext,
  `houseElevatorRatio` longtext,
  `houseElevator` longtext,
  `housePrivilege` longtext,
  `houseListDate` longtext,
  `houseTradeProperty` longtext,
  `houseLastTrade` longtext,
  `houseUsage` longtext,
  `houseAgeLimit` longtext,
  `housePrivilegeProperty` longtext,
  `housePledge` longtext,
  `houseRecord` longtext,
  `houseImg` longtext,
  `houseUrl` longtext,
  `houseRefererUrl` longtext,
  `houseCrawlTime` longtext,
  PRIMARY KEY (`houseId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

--------------------------------------------------------------------------------
/crawl/sql/说明.txt:
--------------------------------------------------------------------------------
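`run.bat` only works from a Windows shell. An equivalent cross-platform sketch
using Scrapy's public `CrawlerProcess` API; run it from the directory that
contains `scrapy.cfg` so the project settings are picked up:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('lianjia')  # spider name from LianjiaSpider.name
process.start()           # blocks until the crawl finishes
```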
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/sql/说明.txt

--------------------------------------------------------------------------------
/data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/data.xls

--------------------------------------------------------------------------------
/house.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/house.xlsx

--------------------------------------------------------------------------------
/images/img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img1.png

--------------------------------------------------------------------------------
/images/img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img2.png

--------------------------------------------------------------------------------
/images/img3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img3.png

--------------------------------------------------------------------------------
/images/img4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img4.png

--------------------------------------------------------------------------------
/images/img5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img5.png

--------------------------------------------------------------------------------
/images/img6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img6.png
--------------------------------------------------------------------------------