├── .gitignore
├── README.md
├── data
│   ├── data_for_sale.jl
│   └── data_sold.jl
├── scrapy.cfg
└── trulia_scraper
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── parsing.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── trulia.py
        └── trulia_sold.py


/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# trulia-scraper
Scraper for real estate listings on [Trulia.com](https://www.trulia.com/) implemented in Python with Scrapy.

## Basic usage
To run the scraper, you need to install [Python 3](https://www.python.org/download/releases/3.0/), as well as the [Scrapy](https://pypi.python.org/pypi/Scrapy) framework and the [Pyparsing](https://pypi.python.org/pypi/pyparsing/2.2.0) module. The scraper features two spiders:

1. `trulia`, which scrapes all real estate listings that are _for sale_ in a given state and city, starting from a URL such as [https://www.trulia.com/CA/San_Francisco/](https://www.trulia.com/CA/San_Francisco/);
2. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties, starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/).

To run the `trulia_sold` spider for the state of `CA` and city of `San_Francisco` (the default locale), simply run the command

```
scrapy crawl trulia_sold
```
from the project directory. To scrape listings for another city, specify the `city` and `state` arguments using the `-a` flag. For example,
```
scrapy crawl trulia_sold -a state=NY -a city=New_York
```
will scrape all listings reachable from [https://www.trulia.com/sold/New_York,NY/](https://www.trulia.com/sold/New_York,NY/).

By default, the scraped data will be stored (using Scrapy's [feed export](https://doc.scrapy.org/en/latest/topics/feed-exports.html)) in the `data` directory as a [JSON lines](http://jsonlines.org/) (`.jl`) file following the naming convention

```
data_{sold|for_sale}_{state}_{city}_{time}.jl
```

where `{sold|for_sale}` is `for_sale` or `sold` for the `trulia` and `trulia_sold` spiders, respectively, `{state}` and `{city}` are the specified state and city (e.g. `CA` and `San_Francisco`, respectively), and `{time}` is the UTC time at which the feed file was created.

If you prefer a different output file name or format, you can specify these from the command line using Scrapy's `-o` option. For example,

```
scrapy crawl trulia_sold -a state=WA -a city=Seattle -o data_Seattle.csv
```
will output the data in CSV format as `data_Seattle.csv`. (Scrapy automatically infers the file format from the specified file extension.)
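
Since each line of a `.jl` feed is a standalone JSON object, the scraped data can be loaded back into Python with nothing but the standard library. Below is a minimal sketch; the file name is only an illustration of the naming convention above, and the fields shown are a small subset of those defined in `trulia_scraper/items.py`:

```
import json

# Read one listing per line from an (example) feed file produced by the trulia_sold spider.
with open('data/data_sold_CA_San_Francisco_2017-08-30T12-00-00.jl') as feed:
    listings = [json.loads(line) for line in feed]

print(len(listings), 'listings scraped')
print(listings[0].get('address'), listings[0].get('year_built'))
```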
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = trulia_scraper.settings

[deploy]
#url = http://localhost:6800/
project = trulia_scraper

--------------------------------------------------------------------------------
/trulia_scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khpeek/trulia-scraper/2c3313d6d4fdfe55acc56965145f4aa7e5cac86c/trulia_scraper/__init__.py

--------------------------------------------------------------------------------
/trulia_scraper/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Identity, Compose
import scrapy
from trulia_scraper.parsing import remove_empty, get_number_from_string


class TruliaItem(scrapy.Item):
    url = scrapy.Field()
    address = scrapy.Field()
    city_state = scrapy.Field()
    price = scrapy.Field()  # for items on sale only
    neighborhood = scrapy.Field()
    overview = scrapy.Field()
    description = scrapy.Field()

    # Columns from the 'price events' table are stored in separate lists
    prices = scrapy.Field()
    dates = scrapy.Field()
    events = scrapy.Field()

    # Property tax information is on 'sold' pages only
    property_tax_assessment_year = scrapy.Field()
    property_tax = scrapy.Field()
    property_tax_assessment_land = scrapy.Field()
    property_tax_assessment_improvements = scrapy.Field()
    property_tax_assessment_total = scrapy.Field()
    property_tax_market_value = scrapy.Field()

    # The 'Features' section is on 'for sale' pages only
    listing_information = scrapy.Field()
    listing_information_date_updated = scrapy.Field()
    public_records = scrapy.Field()
    public_records_date_updated = scrapy.Field()

    # Items generated from further parsing of 'raw' scraped data
    area = scrapy.Field()
    lot_size = scrapy.Field()
    lot_size_units = scrapy.Field()
    price_per_square_foot = scrapy.Field()  # for properties on sale only
    bedrooms = scrapy.Field()
    bathrooms = scrapy.Field()
    year_built = scrapy.Field()
    days_on_Trulia = scrapy.Field()
    views = scrapy.Field()
    price_history = scrapy.Field()


class TruliaItemLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    price_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
    overview_out = Identity()
    description_out = Compose(remove_empty)
    prices_out = Identity()
    dates_out = Compose(remove_empty)
    events_out = Compose(remove_empty)

    listing_information_out = Identity()
    public_records_out = Identity()

    area_out = Compose(TakeFirst(), get_number_from_string)
    lot_size_out = Compose(TakeFirst(), get_number_from_string)
    price_per_square_foot_out = Compose(TakeFirst(), get_number_from_string)
    bedrooms_out = Compose(TakeFirst(), int)
    bathrooms_out = Compose(TakeFirst(), int)
    year_built_out = Compose(TakeFirst(), int)
    days_on_Trulia_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
    views_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
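
# As a rough illustration of how the output processors above behave (the input
# strings are hypothetical examples of scraped values, not actual Trulia data):
#
#   price_out(['1,200,000'])              -> 1200000  (TakeFirst, then strip commas and cast to int)
#   area_out(['2,100'])                   -> 2100.0   (TakeFirst, then get_number_from_string)
#   description_out(['Great home.', ''])  -> ['Great home.']  (remove_empty drops blank strings)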
--------------------------------------------------------------------------------
/trulia_scraper/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TruliaScraperSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

--------------------------------------------------------------------------------
/trulia_scraper/parsing.py:
--------------------------------------------------------------------------------
def remove_empty(l):
    '''Remove items which evaluate to False (such as empty strings) from the input list.'''
    return [x for x in l if x]


def get_number_from_string(string, number_type=float):
    '''Remove commas from the input string and parse as a number.'''
    return number_type(string.replace(',', ''))
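
# Informal sanity checks for the helpers above (the values are made up, purely
# for illustration):
#
#   remove_empty(['', '3 Beds', None, '2 Baths'])  -> ['3 Beds', '2 Baths']
#   get_number_from_string('1,250')                -> 1250.0
#   get_number_from_string('1,250', int)           -> 1250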
--------------------------------------------------------------------------------
/trulia_scraper/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TruliaScraperPipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/trulia_scraper/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for trulia_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'trulia_scraper'

SPIDER_MODULES = ['trulia_scraper.spiders']
NEWSPIDER_MODULE = 'trulia_scraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'trulia_scraper (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'trulia_scraper.middlewares.TruliaScraperSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'trulia_scraper.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'trulia_scraper.pipelines.TruliaScraperPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

--------------------------------------------------------------------------------
/trulia_scraper/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/trulia_scraper/spiders/trulia.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import scrapy
import math
import datetime
from scrapy.linkextractors import LinkExtractor
from trulia_scraper.items import TruliaItem, TruliaItemLoader
from trulia_scraper.parsing import get_number_from_string
from scrapy.utils.conf import closest_scrapy_cfg


class TruliaSpider(scrapy.Spider):
    name = 'trulia'
    allowed_domains = ['trulia.com']
    custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_for_sale_%(state)s_%(city)s_%(time)s.jl'),
                       'FEED_FORMAT': 'jsonlines'}

    def __init__(self, state='CA', city='San_Francisco', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/property')

    def parse(self, response):
        N = self.get_number_of_pages_to_scrape(response)
        self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    @staticmethod
    def get_number_of_pages_to_scrape(response):
        pagination = response.css('.paginationContainer').xpath('.//*/text()[contains(., "Results")]')
        number_of_results = int(pagination.re_first(r'^1 - 30 of ([\d,]+) Results$').replace(',', ''))
        return math.ceil(number_of_results/30)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        l = TruliaItemLoader(item=TruliaItem(), response=response)
        self.load_common_fields(item_loader=l, response=response)

        listing_information = l.nested_xpath('//span[text() = "LISTING INFORMATION"]')
        listing_information.add_xpath('listing_information', './parent::div/following-sibling::ul[1]/li/text()')
        listing_information.add_xpath('listing_information_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)')

        public_records = l.nested_xpath('//span[text() = "PUBLIC RECORDS"]')
        public_records.add_xpath('public_records', './parent::div/following-sibling::ul[1]/li/text()')
        public_records.add_xpath('public_records_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)')

        item = l.load_item()
        self.post_process(item=item)
        return item

    @staticmethod
    def load_common_fields(item_loader, response):
        '''Load field values which are common to "on sale" and "recently sold" properties.'''
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('address', '//*[@data-role="address"]/text()')
        item_loader.add_xpath('city_state', '//*[@data-role="cityState"]/text()')
        item_loader.add_xpath('price', '//span[@data-role="price"]/text()', re=r'\$([\d,]+)')
        item_loader.add_xpath('neighborhood', '//*[@data-role="cityState"]/parent::h1/following-sibling::span/a/text()')
        details = item_loader.nested_css('.homeDetailsHeading')
        overview = details.nested_xpath('.//span[contains(text(), "Overview")]/parent::div/following-sibling::div[1]')
        overview.add_xpath('overview', xpath='.//li/text()')
        overview.add_xpath('area', xpath='.//li/text()', re=r'([\d,]+) sqft$')
        overview.add_xpath('lot_size', xpath='.//li/text()', re=r'([\d,.]+) (?:acres|sqft) lot size$')
        overview.add_xpath('lot_size_units', xpath='.//li/text()', re=r'[\d,.]+ (acres|sqft) lot size$')
        overview.add_xpath('price_per_square_foot', xpath='.//li/text()', re=r'\$([\d,.]+)/sqft$')
        overview.add_xpath('bedrooms', xpath='.//li/text()', re=r'(\d+) (?:Beds|Bed|beds|bed)$')
        overview.add_xpath('bathrooms', xpath='.//li/text()', re=r'(\d+) (?:Baths|Bath|baths|bath)$')
        overview.add_xpath('year_built', xpath='.//li/text()', re=r'Built in (\d+)')
        overview.add_xpath('days_on_Trulia', xpath='.//li/text()', re=r'([\d,]+) days on Trulia$')
        overview.add_xpath('views', xpath='.//li/text()', re=r'([\d,]+) views$')
        item_loader.add_css('description', '#descriptionContainer *::text')

        price_events = details.nested_xpath('.//*[text() = "Price History"]/parent::*/following-sibling::*[1]/div/div')
        price_events.add_xpath('prices', './div[contains(text(), "$")]/text()')
        price_events.add_xpath('dates', './div[contains(text(), "$")]/preceding-sibling::div/text()')
        price_events.add_xpath('events', './div[contains(text(), "$")]/following-sibling::div/text()')

    @staticmethod
    def post_process(item):
        '''Add any additional data to an item after loading it.'''
        if item.get('dates') is not None:
            dates = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in item['dates']]
            prices = [int(price.lstrip('$').replace(',', '')) for price in item['prices']]
            item['price_history'] = sorted(list(zip(dates, prices, item['events'])), key=lambda x: x[0])
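
# As a rough illustration of what post_process adds (the values below are
# hypothetical, not actual Trulia data): given an item loaded with
#
#   item['dates']  == ['06/01/2017', '01/15/2016']
#   item['prices'] == ['$1,200,000', '$995,000']
#   item['events'] == ['Listed For Sale', 'Sold']
#
# the method would set, sorted chronologically,
#
#   item['price_history'] == [(datetime.datetime(2016, 1, 15, 0, 0), 995000, 'Sold'),
#                             (datetime.datetime(2017, 6, 1, 0, 0), 1200000, 'Listed For Sale')]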
--------------------------------------------------------------------------------
/trulia_scraper/spiders/trulia_sold.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import scrapy
from scrapy.linkextractors import LinkExtractor
import trulia_scraper.parsing as parsing
from trulia_scraper.items import TruliaItem, TruliaItemLoader
import trulia_scraper.spiders.trulia as trulia
from scrapy.utils.conf import closest_scrapy_cfg


class TruliaSpider(scrapy.Spider):
    name = 'trulia_sold'
    allowed_domains = ['trulia.com']
    custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'),
                       'FEED_FORMAT': 'jsonlines'}

    def __init__(self, state='CA', city='San_Francisco', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/')

    def parse(self, response):
        N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response)

        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()')

        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item

--------------------------------------------------------------------------------