├── README.md
├── crawl
│   ├── .DS_Store
│   ├── crawl
│   │   ├── .DS_Store
│   │   ├── __init__.py
│   │   ├── __pycache__
│   │   │   ├── __init__.cpython-36.pyc
│   │   │   ├── items.cpython-36.pyc
│   │   │   ├── pipelines.cpython-36.pyc
│   │   │   └── settings.cpython-36.pyc
│   │   ├── items.py
│   │   ├── middlewares.py
│   │   ├── pipelines.py
│   │   ├── settings.py
│   │   └── spiders
│   │       ├── .DS_Store
│   │       ├── LianJiaSpider.py
│   │       ├── __init__.py
│   │       ├── __pycache__
│   │       │   ├── LianJiaSpider.cpython-36.pyc
│   │       │   └── __init__.cpython-36.pyc
│   │       └── run.bat
│   ├── scrapy.cfg
│   └── sql
│       ├── data.sql
│       ├── house.sql
│       └── 说明.txt
├── data.xls
├── house.xlsx
├── images
│   ├── img1.png
│   ├── img2.png
│   ├── img3.png
│   ├── img4.png
│   ├── img5.png
│   └── img6.png
└── main.ipynb

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Lianjia Data Analysis Example

![](images/img1.png)

![](images/img2.png)

![](images/img3.png)

![](images/img4.png)

![](images/img5.png)

![](images/img6.png)

--------------------------------------------------------------------------------
/crawl/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/.DS_Store

--------------------------------------------------------------------------------
/crawl/crawl/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/.DS_Store

--------------------------------------------------------------------------------
/crawl/crawl/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__init__.py

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/__init__.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/items.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/items.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/pipelines.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/pipelines.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/__pycache__/settings.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/__pycache__/settings.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html

import scrapy


class House(scrapy.Item):
    className = 'house'

    # Basic listing information
    houseTitle = scrapy.Field()             # listing title
    houseTotalMoney = scrapy.Field()        # total price
    houseSinglePrice = scrapy.Field()       # unit price (per square metre)
    houseDownPayment = scrapy.Field()       # down payment
    houseGardenName = scrapy.Field()        # residential compound name
    houseLocation = scrapy.Field()          # compound location
    houseNumber = scrapy.Field()            # Lianjia listing number

    # Basic attributes
    houseType = scrapy.Field()              # floor plan
    houseFloor = scrapy.Field()             # floor
    houseBuildingArea = scrapy.Field()      # gross floor area
    houseStructure = scrapy.Field()         # floor-plan structure
    houseInnerArea = scrapy.Field()         # net internal area
    houseBuildingType = scrapy.Field()      # building type
    houseOrientation = scrapy.Field()       # orientation
    houseBuildingStructure = scrapy.Field() # building structure
    houseDecoration = scrapy.Field()        # decoration
    houseElevatorRatio = scrapy.Field()     # elevator-to-household ratio
    houseElevator = scrapy.Field()          # elevator availability
    housePrivilege = scrapy.Field()         # property-right term

    # Transaction attributes
    houseListDate = scrapy.Field()          # listing date
    houseTradeProperty = scrapy.Field()     # trade ownership type
    houseLastTrade = scrapy.Field()         # last transaction date
    houseUsage = scrapy.Field()             # intended use
    houseAgeLimit = scrapy.Field()          # ownership duration
    housePrivilegeProperty = scrapy.Field() # property-right holder
    housePledge = scrapy.Field()            # mortgage information
    houseRecord = scrapy.Field()            # filing record
    houseImg = scrapy.Field()               # floor-plan image

    # Crawler bookkeeping
    houseUrl = scrapy.Field()               # listing URL
    houseRefererUrl = scrapy.Field()        # referring URL
    houseCrawlTime = scrapy.Field()         # crawl timestamp
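A `House` item behaves like a dict; a minimal sketch of the access pattern the
spider and `MysqlPipeline` rely on (the title string is an invented
placeholder, not real data):

```python
from crawl.items import House

item = House()
item['houseTitle'] = 'sample listing title'  # fields are assigned dict-style
missing = item.get('houseInnerArea', '')     # unset fields fall back to '',
                                             # which is how MysqlPipeline
                                             # reads every column value
print(item.className, missing)               # -> house
```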
--------------------------------------------------------------------------------
/crawl/crawl/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class CrawlSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)


class CrawlDownloaderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the downloader middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_request(self, request, spider):
        # Called for each request that goes through the downloader
        # middleware.

        # Must either:
        # - return None: continue processing this request
        # - or return a Response object
        # - or return a Request object
        # - or raise IgnoreRequest: process_exception() methods of
        #   installed downloader middleware will be called
        return None

    def process_response(self, request, response, spider):
        # Called with the response returned from the downloader.

        # Must either:
        # - return a Response object
        # - return a Request object
        # - or raise IgnoreRequest
        return response
    def process_exception(self, request, exception, spider):
        # Called when a download handler or a process_request()
        # (from other downloader middleware) raises an exception.

        # Must either:
        # - return None: continue processing this exception
        # - return a Response object: stops process_exception() chain
        # - return a Request object: stops process_exception() chain
        pass

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)
--------------------------------------------------------------------------------
/crawl/crawl/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html


import logging

import pymysql.cursors

logging.basicConfig(filename='scrapy.log')
logger = logging.getLogger(__name__)


class MysqlPipeline(object):

    def open_spider(self, spider):
        self.mysql_conn = pymysql.connect(
            host='127.0.0.1',
            user='root',
            password='root',
            db='temp',
            charset='utf8mb4',
            cursorclass=pymysql.cursors.DictCursor)

    def process_item(self, item, spider):
        if item.className == 'house':
            try:
                # deduplicate on the listing title: skip rows already stored
                sql_search = "select houseTitle from `house` where `houseTitle`=%s"
                with self.mysql_conn.cursor() as cursor:
                    cursor.execute(sql_search, (item.get('houseTitle', ''),))
                    houseIsExist = cursor.fetchone()

                    if houseIsExist is None:
                        sql_write = "insert into `house` (`houseId`, `houseTitle`, `houseTotalMoney`, `houseSinglePrice`, `houseDownPayment`, `houseGardenName`, \
                            `houseLocation`, `houseNumber`, `houseType`, `houseFloor`, `houseBuildingArea`, `houseStructure`, \
                            `houseInnerArea`, `houseBuildingType`, `houseOrientation`, `houseBuildingStructure`, `houseDecoration`, `houseElevatorRatio`, \
                            `houseElevator`, `housePrivilege`, `houseListDate`, `houseTradeProperty`, `houseLastTrade`, `houseUsage`, \
                            `houseAgeLimit`, `housePrivilegeProperty`, `housePledge`, `houseRecord`, `houseImg`, `houseUrl`, `houseRefererUrl`, `houseCrawlTime`) \
                            VALUES (%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s,%s)"

                        # houseId is sent as 0 so MySQL's AUTO_INCREMENT assigns it
                        cursor.execute(sql_write, (
                            0,
                            item.get('houseTitle', ''), item.get('houseTotalMoney', ''),
                            item.get('houseSinglePrice', ''), item.get('houseDownPayment', ''),
                            item.get('houseGardenName', ''), item.get('houseLocation', ''),
                            item.get('houseNumber', ''), item.get('houseType', ''),
                            item.get('houseFloor', ''), item.get('houseBuildingArea', ''),
                            item.get('houseStructure', ''), item.get('houseInnerArea', ''),
                            item.get('houseBuildingType', ''), item.get('houseOrientation', ''),
                            item.get('houseBuildingStructure', ''), item.get('houseDecoration', ''),
                            item.get('houseElevatorRatio', ''), item.get('houseElevator', ''),
                            item.get('housePrivilege', ''), item.get('houseListDate', ''),
                            item.get('houseTradeProperty', ''), item.get('houseLastTrade', ''),
                            item.get('houseUsage', ''), item.get('houseAgeLimit', ''),
                            item.get('housePrivilegeProperty', ''), item.get('housePledge', ''),
                            item.get('houseRecord', ''), item.get('houseImg', ''),
                            item.get('houseUrl', ''), item.get('houseRefererUrl', ''),
                            item.get('houseCrawlTime', '')))

                        self.mysql_conn.commit()

            except Exception as e:
                logger.error(item.get('houseUrl', ''))
                logger.error(e)
                logger.error('---------------------------------------\n')
        return item

    def close_spider(self, spider):
        self.mysql_conn.close()
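Once the pipeline has filled the `house` table, the rows can be pulled into
pandas for the analysis in `main.ipynb`. A minimal sketch; the connection
parameters mirror `open_spider` above and are assumptions about the local
setup, not part of the project:

```python
import pandas as pd
import pymysql

# credentials copied from pipelines.py; adjust for your own MySQL instance
conn = pymysql.connect(host='127.0.0.1', user='root', password='root',
                       db='temp', charset='utf8mb4')
df = pd.read_sql('SELECT * FROM house', conn)
conn.close()

print(df.shape)  # one row per scraped listing
```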
--------------------------------------------------------------------------------
/crawl/crawl/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for crawl project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
# https://doc.scrapy.org/en/latest/topics/settings.html
# https://doc.scrapy.org/en/latest/topics/downloader-middleware.html
# https://doc.scrapy.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'crawl'

SPIDER_MODULES = ['crawl.spiders']
NEWSPIDER_MODULE = 'crawl.spiders'


ITEM_PIPELINES = {
    'crawl.pipelines.MysqlPipeline': 100,
}

LOG_LEVEL = 'ERROR'
LOG_FILE = 'scrapy.log'


# Obey robots.txt rules
ROBOTSTXT_OBEY = True
USER_AGENT = "Baiduspider"
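The project configures no throttling. A hedged sketch of standard Scrapy
options that could be added to this file to crawl more politely; the values
are illustrative assumptions, not project settings:

```python
DOWNLOAD_DELAY = 1                  # pause between requests, in seconds
CONCURRENT_REQUESTS_PER_DOMAIN = 4  # cap parallel requests per domain
AUTOTHROTTLE_ENABLED = True         # let Scrapy adapt the delay to latency
```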
--------------------------------------------------------------------------------
/crawl/crawl/spiders/.DS_Store:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/spiders/.DS_Store

--------------------------------------------------------------------------------
/crawl/crawl/spiders/LianJiaSpider.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# created author: tianxing
# created date: 2018-05-30

import scrapy
import random
import datetime
import re
from crawl.items import House


class LianjiaSpider(scrapy.Spider):
    # Spider name; start the crawl with: scrapy crawl lianjia
    name = 'lianjia'

    start_urls = []

    # First 100 result pages of Chengdu second-hand listings
    for page in range(1, 101):
        start_urls.append('https://cd.lianjia.com/ershoufang/pg' + str(page) + '/')

    # Shuffle so the pages are not requested in strict order
    random.shuffle(start_urls)

    def parse(self, response):
        links = response.xpath('//ul[@class="sellListContent"]/li')

        for link in links:
            item = House()
            # referring URL
            item['houseRefererUrl'] = response.url

            singleUrl = link.xpath('div[@class="info clear"]/div[@class="title"]/a/@href').extract()[0]

            # request the listing's detail page
            yield scrapy.Request(url=singleUrl, meta={'item': item}, callback=self.parse_page)

    def parse_page(self, response):
        item = response.meta['item']

        # The bare excepts below deliberately map any missing node to ''.

        # Basic information: title, total price, unit price, down payment,
        # taxes, compound name, location, Lianjia listing number
        # listing title
        try:
            item['houseTitle'] = response.xpath('//div[@class="content"]/div[@class="title"]/h1/text()').extract()[0].strip()
        except:
            item['houseTitle'] = ''

        # total price
        try:
            # numeric part
            houseTotalMoneyNum = response.xpath('//div[@class="price "]/span[@class="total"]/text()').extract()[0].replace(' ', '')
            # unit
            houseTotalMoneyUnit = response.xpath('//div[@class="price "]/span[@class="unit"]/span/text()').extract()[0].replace(' ', '')
            item['houseTotalMoney'] = houseTotalMoneyNum + houseTotalMoneyUnit
        except:
            item['houseTotalMoney'] = ''

        # unit price
        try:
            houseSinglePrice = response.xpath('//span[@class="unitPriceValue"]')
            item['houseSinglePrice'] = houseSinglePrice.xpath('string(.)').extract()[0].replace(' ', '')
        except:
            item['houseSinglePrice'] = ''

        # down payment
        # NOTE: the regex pattern is empty in the source dump (evidently lost
        # in extraction), so as written this branch always captures ''
        try:
            item['houseDownPayment'] = re.findall(r'', response.text)[0]
        except:
            item['houseDownPayment'] = ''

        # compound name
        try:
            item['houseGardenName'] = response.xpath('//div[@class="aroundInfo"]/div[@class="communityName"]/a[1]/text()').extract()[0].replace(' ', '')
        except:
            item['houseGardenName'] = ''

        # compound location
        try:
            houseLocation = response.xpath('//div[@class="aroundInfo"]/div[@class="areaName"]/span[@class="info"]')
            item['houseLocation'] = houseLocation.xpath('string(.)').extract()[0].strip().replace('\xa0', '\t')
        except:
            item['houseLocation'] = ''

        # Lianjia listing number
        try:
            item['houseNumber'] = response.xpath('//div[@class="aroundInfo"]/div[@class="houseRecord"]/span[2]/text()').extract()[0].replace('"', '')
        except:
            item['houseNumber'] = ''

        # Basic attributes: floor plan, floor, gross floor area, floor-plan
        # structure, net internal area, building type, orientation, building
        # structure, decoration, elevator ratio, elevator, property-right term
        # base XPath for the basic-attributes list
        baseXpath = response.xpath('//div[@class="introContent"]/div[@class="base"]/div[@class="content"]/ul')

        # floor plan
        try:
            item['houseType'] = baseXpath.xpath('li[1]/text()').extract()[0].replace(' ', '')
        except:
            item['houseType'] = ''

        # floor
        try:
            item['houseFloor'] = baseXpath.xpath('li[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseFloor'] = ''

        # gross floor area
        try:
            item['houseBuildingArea'] = baseXpath.xpath('li[3]/text()').extract()[0].replace(' ', '')
        except:
            item['houseBuildingArea'] = ''

        # floor-plan structure
        try:
            item['houseStructure'] = baseXpath.xpath('li[4]/text()').extract()[0].replace(' ', '')
        except:
            item['houseStructure'] = ''

        # net internal area
        try:
            item['houseInnerArea'] = baseXpath.xpath('li[5]/text()').extract()[0].replace(' ', '')
        except:
            item['houseInnerArea'] = ''

        # building type
        try:
            item['houseBuildingType'] = baseXpath.xpath('li[6]/text()').extract()[0].replace(' ', '')
        except:
            item['houseBuildingType'] = ''

        # orientation
        try:
            item['houseOrientation'] = baseXpath.xpath('li[7]/text()').extract()[0].replace(' ', '')
        except:
            item['houseOrientation'] = ''

        # building structure
        try:
            item['houseBuildingStructure'] = baseXpath.xpath('li[8]/text()').extract()[0].replace(' ', '')
        except:
            item['houseBuildingStructure'] = ''

        # decoration
        try:
            item['houseDecoration'] = baseXpath.xpath('li[9]/text()').extract()[0].replace(' ', '')
        except:
            item['houseDecoration'] = ''

        # elevator-to-household ratio
        try:
            item['houseElevatorRatio'] = baseXpath.xpath('li[10]/text()').extract()[0].replace(' ', '')
        except:
            item['houseElevatorRatio'] = ''

        # elevator availability
        try:
            item['houseElevator'] = baseXpath.xpath('li[11]/text()').extract()[0].replace(' ', '')
        except:
            item['houseElevator'] = ''

        # property-right term
        try:
            item['housePrivilege'] = baseXpath.xpath('li[12]/text()').extract()[0].replace(' ', '')
        except:
            item['housePrivilege'] = ''

        # Transaction information: listing date, trade ownership, last
        # transaction, usage, ownership duration, right holder, mortgage,
        # filing record
        # base XPath for the transaction-attributes list
        transXpath = response.xpath('//div[@class="introContent"]/div[@class="transaction"]/div[@class="content"]/ul')

        # listing date
        try:
            item['houseListDate'] = transXpath.xpath('li[1]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseListDate'] = ''
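(Continuation of LianJiaSpider.py follows; after it, a sketch of how the
repeated try/except blocks could be collapsed.)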
        # trade ownership type
        try:
            item['houseTradeProperty'] = transXpath.xpath('li[2]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseTradeProperty'] = ''

        # last transaction date
        try:
            item['houseLastTrade'] = transXpath.xpath('li[3]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseLastTrade'] = ''

        # intended use
        try:
            item['houseUsage'] = transXpath.xpath('li[4]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseUsage'] = ''

        # ownership duration
        try:
            item['houseAgeLimit'] = transXpath.xpath('li[5]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseAgeLimit'] = ''

        # property-right holder
        try:
            item['housePrivilegeProperty'] = transXpath.xpath('li[6]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['housePrivilegeProperty'] = ''

        # mortgage information
        try:
            item['housePledge'] = transXpath.xpath('li[7]/span[2]/text()').extract()[0].replace(' ', '').replace('\n', '')
        except:
            item['housePledge'] = ''

        # filing record
        try:
            item['houseRecord'] = transXpath.xpath('li[8]/span[2]/text()').extract()[0].replace(' ', '')
        except:
            item['houseRecord'] = ''

        # floor-plan image (swap the thumbnail size for a larger rendition)
        try:
            item['houseImg'] = response.xpath("//div[@class='imgdiv']/img/@src").extract()[0].replace("240x180", "533x400")
        except:
            item['houseImg'] = ''

        # Crawler bookkeeping
        # listing URL
        item['houseUrl'] = response.url

        # crawl timestamp
        item['houseCrawlTime'] = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')

        # yield the populated item
        yield item
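The twenty near-identical try/except blocks in `parse_page` could be collapsed
into one helper. A sketch of that refactor; `safe_extract` is a hypothetical
name, not part of the project:

```python
def safe_extract(selector, xpath):
    """Return the first text node at `xpath` with spaces stripped, or ''."""
    try:
        return selector.xpath(xpath).extract()[0].replace(' ', '')
    except IndexError:
        return ''

# inside parse_page the attribute blocks would then shrink to, e.g.:
#   item['houseType'] = safe_extract(baseXpath, 'li[1]/text()')
#   item['houseFloor'] = safe_extract(baseXpath, 'li[2]/text()')
```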
--------------------------------------------------------------------------------
/crawl/crawl/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/crawl/crawl/spiders/__pycache__/LianJiaSpider.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/spiders/__pycache__/LianJiaSpider.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/spiders/__pycache__/__init__.cpython-36.pyc:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/crawl/spiders/__pycache__/__init__.cpython-36.pyc

--------------------------------------------------------------------------------
/crawl/crawl/spiders/run.bat:
--------------------------------------------------------------------------------
scrapy crawl lianjia

--------------------------------------------------------------------------------
/crawl/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.io/en/latest/deploy.html

[settings]
default = crawl.settings

[deploy]
#url = http://localhost:6800/
project = crawl

--------------------------------------------------------------------------------
/crawl/sql/house.sql:
--------------------------------------------------------------------------------
/*
Navicat MySQL Data Transfer

Source Server         : mysql_127.0.0.1_local
Source Server Version : 50722
Source Host           : 127.0.0.1:3306
Source Database       : crawl

Target Server Type    : MYSQL
Target Server Version : 50722
File Encoding         : 65001

Date: 2019-07-17 10:56:15
*/

SET FOREIGN_KEY_CHECKS=0;

-- ----------------------------
-- Table structure for house
-- ----------------------------
DROP TABLE IF EXISTS `house`;
CREATE TABLE `house` (
  `houseId` int(11) NOT NULL AUTO_INCREMENT,
  `houseTitle` longtext,
  `houseTotalMoney` longtext,
  `houseSinglePrice` longtext,
  `houseDownPayment` longtext,
  `houseGardenName` longtext,
  `houseLocation` longtext,
  `houseNumber` longtext,
  `houseType` longtext,
  `houseFloor` longtext,
  `houseBuildingArea` longtext,
  `houseStructure` longtext,
  `houseInnerArea` longtext,
  `houseBuildingType` longtext,
  `houseOrientation` longtext,
  `houseBuildingStructure` longtext,
  `houseDecoration` longtext,
  `houseElevatorRatio` longtext,
  `houseElevator` longtext,
  `housePrivilege` longtext,
  `houseListDate` longtext,
  `houseTradeProperty` longtext,
  `houseLastTrade` longtext,
  `houseUsage` longtext,
  `houseAgeLimit` longtext,
  `housePrivilegeProperty` longtext,
  `housePledge` longtext,
  `houseRecord` longtext,
  `houseImg` longtext,
  `houseUrl` longtext,
  `houseRefererUrl` longtext,
  `houseCrawlTime` longtext,
  PRIMARY KEY (`houseId`)
) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4;

--------------------------------------------------------------------------------
/crawl/sql/说明.txt:
--------------------------------------------------------------------------------
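`run.bat` only works from a Windows shell. An equivalent cross-platform sketch
using Scrapy's public `CrawlerProcess` API; run it from the directory that
contains `scrapy.cfg` so the project settings are picked up:

```python
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('lianjia')  # spider name from LianjiaSpider.name
process.start()           # blocks until the crawl finishes
```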
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/crawl/sql/说明.txt

--------------------------------------------------------------------------------
/data.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/data.xls

--------------------------------------------------------------------------------
/house.xlsx:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/house.xlsx

--------------------------------------------------------------------------------
/images/img1.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img1.png

--------------------------------------------------------------------------------
/images/img2.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img2.png

--------------------------------------------------------------------------------
/images/img3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img3.png

--------------------------------------------------------------------------------
/images/img4.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img4.png

--------------------------------------------------------------------------------
/images/img5.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img5.png

--------------------------------------------------------------------------------
/images/img6.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PowersYang/Lianjia_analysis/baaec31f9a898babeec162fea302fc642507ffe4/images/img6.png
--------------------------------------------------------------------------------