├── .gitignore
├── README.md
├── data
│   ├── data_for_sale.jl
│   └── data_sold.jl
├── scrapy.cfg
└── trulia_scraper
    ├── __init__.py
    ├── items.py
    ├── middlewares.py
    ├── parsing.py
    ├── pipelines.py
    ├── settings.py
    └── spiders
        ├── __init__.py
        ├── trulia.py
        └── trulia_sold.py


/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
env/
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# dotenv
.env

# virtualenv
.venv
venv/
ENV/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# trulia-scraper
Scraper for real estate listings on [Trulia.com](https://www.trulia.com/) implemented in Python with Scrapy.

## Basic usage
To run the scraper, you need to install [Python 3](https://www.python.org/download/releases/3.0/), as well as the [Scrapy](https://pypi.python.org/pypi/Scrapy) framework and the [Pyparsing](https://pypi.python.org/pypi/pyparsing/2.2.0) module. The scraper features two spiders:

1. `trulia`, which scrapes all real estate listings that are _for sale_ in a given state and city, starting from a URL such as [https://www.trulia.com/CA/San_Francisco/](https://www.trulia.com/CA/San_Francisco/);
2. `trulia_sold`, which similarly scrapes listings of recently _sold_ properties, starting from a URL such as [https://www.trulia.com/sold/San_Francisco,CA/](https://www.trulia.com/sold/San_Francisco,CA/).

To run the `trulia_sold` spider for the state of `CA` and city of `San_Francisco` (the default locale), simply run the command

```
scrapy crawl trulia_sold
```
from the project directory. To scrape listings for another city, specify the `city` and `state` arguments using the `-a` flag. For example,
```
scrapy crawl trulia_sold -a state=NY -a city=New_York
```
will scrape all listings reachable from [https://www.trulia.com/sold/New_York,NY/](https://www.trulia.com/sold/New_York,NY/).

By default, the scraped data will be stored (using Scrapy's [feed export](https://doc.scrapy.org/en/latest/topics/feed-exports.html)) in the `data` directory as a [JSON lines](http://jsonlines.org/) (`.jl`) file following the naming convention

```
data_{sold|for_sale}_{state}_{city}_{time}.jl
```

where `{sold|for_sale}` is `for_sale` or `sold` for the `trulia` and `trulia_sold` spiders, respectively, `{state}` and `{city}` are the specified state and city (e.g. `CA` and `San_Francisco`, respectively), and `{time}` is the UTC time at which the feed file was created.

If you prefer a different output file name or format, you can specify these from the command line using Scrapy's `-o` option. For example,

```
scrapy crawl trulia_sold -a state=WA -a city=Seattle -o data_Seattle.csv
```
will output the data in CSV format as `data_Seattle.csv`. (Scrapy automatically infers the file format from the specified file extension.)
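
Since each line of a `.jl` feed is a standalone JSON object, the scraped data can be loaded back into Python with nothing but the standard library. Below is a minimal sketch; the file name is only an illustration of the naming convention above, and the fields shown are a small subset of those defined in `trulia_scraper/items.py`:

```
import json

# Read one listing per line from an (example) feed file produced by the trulia_sold spider.
with open('data/data_sold_CA_San_Francisco_2017-08-30T12-00-00.jl') as feed:
    listings = [json.loads(line) for line in feed]

print(len(listings), 'listings scraped')
print(listings[0].get('address'), listings[0].get('year_built'))
```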
--------------------------------------------------------------------------------
/scrapy.cfg:
--------------------------------------------------------------------------------
# Automatically created by: scrapy startproject
#
# For more information about the [deploy] section see:
# https://scrapyd.readthedocs.org/en/latest/deploy.html

[settings]
default = trulia_scraper.settings

[deploy]
#url = http://localhost:6800/
project = trulia_scraper

--------------------------------------------------------------------------------
/trulia_scraper/__init__.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/khpeek/trulia-scraper/2c3313d6d4fdfe55acc56965145f4aa7e5cac86c/trulia_scraper/__init__.py

--------------------------------------------------------------------------------
/trulia_scraper/items.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
from scrapy.loader import ItemLoader
from scrapy.loader.processors import TakeFirst, MapCompose, Identity, Compose
import scrapy
from trulia_scraper.parsing import remove_empty, get_number_from_string


class TruliaItem(scrapy.Item):
    url = scrapy.Field()
    address = scrapy.Field()
    city_state = scrapy.Field()
    price = scrapy.Field()  # for items on sale only
    neighborhood = scrapy.Field()
    overview = scrapy.Field()
    description = scrapy.Field()

    # Columns from the 'price events' table are stored in separate lists
    prices = scrapy.Field()
    dates = scrapy.Field()
    events = scrapy.Field()

    # Property tax information is on 'sold' pages only
    property_tax_assessment_year = scrapy.Field()
    property_tax = scrapy.Field()
    property_tax_assessment_land = scrapy.Field()
    property_tax_assessment_improvements = scrapy.Field()
    property_tax_assessment_total = scrapy.Field()
    property_tax_market_value = scrapy.Field()

    # The 'Features' section is on 'for sale' pages only
    listing_information = scrapy.Field()
    listing_information_date_updated = scrapy.Field()
    public_records = scrapy.Field()
    public_records_date_updated = scrapy.Field()

    # Items generated from further parsing of 'raw' scraped data
    area = scrapy.Field()
    lot_size = scrapy.Field()
    lot_size_units = scrapy.Field()
    price_per_square_foot = scrapy.Field()  # for properties on sale only
    bedrooms = scrapy.Field()
    bathrooms = scrapy.Field()
    year_built = scrapy.Field()
    days_on_Trulia = scrapy.Field()
    views = scrapy.Field()
    price_history = scrapy.Field()


class TruliaItemLoader(ItemLoader):
    default_input_processor = MapCompose(str.strip)
    default_output_processor = TakeFirst()

    price_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
    overview_out = Identity()
    description_out = Compose(remove_empty)
    prices_out = Identity()
    dates_out = Compose(remove_empty)
    events_out = Compose(remove_empty)

    listing_information_out = Identity()
    public_records_out = Identity()

    area_out = Compose(TakeFirst(), get_number_from_string)
    lot_size_out = Compose(TakeFirst(), get_number_from_string)
    price_per_square_foot_out = Compose(TakeFirst(), get_number_from_string)
    bedrooms_out = Compose(TakeFirst(), int)
    bathrooms_out = Compose(TakeFirst(), int)
    year_built_out = Compose(TakeFirst(), int)
    days_on_Trulia_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
    views_out = Compose(TakeFirst(), lambda s: int(s.replace(',', '')))
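
# As a rough illustration of how the output processors above behave (the input
# strings are hypothetical examples of scraped values, not actual Trulia data):
#
#   price_out(['1,200,000'])              -> 1200000  (TakeFirst, then strip commas and cast to int)
#   area_out(['2,100'])                   -> 2100.0   (TakeFirst, then get_number_from_string)
#   description_out(['Great home.', ''])  -> ['Great home.']  (remove_empty drops blank strings)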
--------------------------------------------------------------------------------
/trulia_scraper/middlewares.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define here the models for your spider middleware
#
# See documentation in:
# http://doc.scrapy.org/en/latest/topics/spider-middleware.html

from scrapy import signals


class TruliaScraperSpiderMiddleware(object):
    # Not all methods need to be defined. If a method is not defined,
    # scrapy acts as if the spider middleware does not modify the
    # passed objects.

    @classmethod
    def from_crawler(cls, crawler):
        # This method is used by Scrapy to create your spiders.
        s = cls()
        crawler.signals.connect(s.spider_opened, signal=signals.spider_opened)
        return s

    def process_spider_input(self, response, spider):
        # Called for each response that goes through the spider
        # middleware and into the spider.

        # Should return None or raise an exception.
        return None

    def process_spider_output(self, response, result, spider):
        # Called with the results returned from the Spider, after
        # it has processed the response.

        # Must return an iterable of Request, dict or Item objects.
        for i in result:
            yield i

    def process_spider_exception(self, response, exception, spider):
        # Called when a spider or process_spider_input() method
        # (from other spider middleware) raises an exception.

        # Should return either None or an iterable of Response, dict
        # or Item objects.
        pass

    def process_start_requests(self, start_requests, spider):
        # Called with the start requests of the spider, and works
        # similarly to the process_spider_output() method, except
        # that it doesn't have a response associated.

        # Must return only requests (not items).
        for r in start_requests:
            yield r

    def spider_opened(self, spider):
        spider.logger.info('Spider opened: %s' % spider.name)

--------------------------------------------------------------------------------
/trulia_scraper/parsing.py:
--------------------------------------------------------------------------------
def remove_empty(l):
    '''Remove items which evaluate to False (such as empty strings) from the input list.'''
    return [x for x in l if x]


def get_number_from_string(string, number_type=float):
    '''Remove commas from the input string and parse as a number.'''
    return number_type(string.replace(',', ''))
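
# Informal sanity checks for the helpers above (the values are made up, purely
# for illustration):
#
#   remove_empty(['', '3 Beds', None, '2 Baths'])  -> ['3 Beds', '2 Baths']
#   get_number_from_string('1,250')                -> 1250.0
#   get_number_from_string('1,250', int)           -> 1250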
--------------------------------------------------------------------------------
/trulia_scraper/pipelines.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html


class TruliaScraperPipeline(object):
    def process_item(self, item, spider):
        return item

--------------------------------------------------------------------------------
/trulia_scraper/settings.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-

# Scrapy settings for trulia_scraper project
#
# For simplicity, this file contains only settings considered important or
# commonly used. You can find more settings consulting the documentation:
#
#     http://doc.scrapy.org/en/latest/topics/settings.html
#     http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#     http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html

BOT_NAME = 'trulia_scraper'

SPIDER_MODULES = ['trulia_scraper.spiders']
NEWSPIDER_MODULE = 'trulia_scraper.spiders'


# Crawl responsibly by identifying yourself (and your website) on the user-agent
#USER_AGENT = 'trulia_scraper (+http://www.yourdomain.com)'

# Obey robots.txt rules
ROBOTSTXT_OBEY = True

# Configure maximum concurrent requests performed by Scrapy (default: 16)
#CONCURRENT_REQUESTS = 32

# Configure a delay for requests for the same website (default: 0)
# See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay
# See also autothrottle settings and docs
#DOWNLOAD_DELAY = 3
# The download delay setting will honor only one of:
#CONCURRENT_REQUESTS_PER_DOMAIN = 16
#CONCURRENT_REQUESTS_PER_IP = 16

# Disable cookies (enabled by default)
#COOKIES_ENABLED = False

# Disable Telnet Console (enabled by default)
#TELNETCONSOLE_ENABLED = False

# Override the default request headers:
#DEFAULT_REQUEST_HEADERS = {
#   'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
#   'Accept-Language': 'en',
#}

# Enable or disable spider middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html
#SPIDER_MIDDLEWARES = {
#    'trulia_scraper.middlewares.TruliaScraperSpiderMiddleware': 543,
#}

# Enable or disable downloader middlewares
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html
#DOWNLOADER_MIDDLEWARES = {
#    'trulia_scraper.middlewares.MyCustomDownloaderMiddleware': 543,
#}

# Enable or disable extensions
# See http://scrapy.readthedocs.org/en/latest/topics/extensions.html
#EXTENSIONS = {
#    'scrapy.extensions.telnet.TelnetConsole': None,
#}

# Configure item pipelines
# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
#ITEM_PIPELINES = {
#    'trulia_scraper.pipelines.TruliaScraperPipeline': 300,
#}

# Enable and configure the AutoThrottle extension (disabled by default)
# See http://doc.scrapy.org/en/latest/topics/autothrottle.html
#AUTOTHROTTLE_ENABLED = True
# The initial download delay
#AUTOTHROTTLE_START_DELAY = 5
# The maximum download delay to be set in case of high latencies
#AUTOTHROTTLE_MAX_DELAY = 60
# The average number of requests Scrapy should be sending in parallel to
# each remote server
#AUTOTHROTTLE_TARGET_CONCURRENCY = 1.0
# Enable showing throttling stats for every response received:
#AUTOTHROTTLE_DEBUG = False

# Enable and configure HTTP caching (disabled by default)
# See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings
#HTTPCACHE_ENABLED = True
#HTTPCACHE_EXPIRATION_SECS = 0
#HTTPCACHE_DIR = 'httpcache'
#HTTPCACHE_IGNORE_HTTP_CODES = []
#HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage'

--------------------------------------------------------------------------------
/trulia_scraper/spiders/__init__.py:
--------------------------------------------------------------------------------
# This package will contain the spiders of your Scrapy project
#
# Please refer to the documentation for information on how to create and manage
# your spiders.

--------------------------------------------------------------------------------
/trulia_scraper/spiders/trulia.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import scrapy
import math
import datetime
from scrapy.linkextractors import LinkExtractor
from trulia_scraper.items import TruliaItem, TruliaItemLoader
from trulia_scraper.parsing import get_number_from_string
from scrapy.utils.conf import closest_scrapy_cfg


class TruliaSpider(scrapy.Spider):
    name = 'trulia'
    allowed_domains = ['trulia.com']
    custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_for_sale_%(state)s_%(city)s_%(time)s.jl'),
                       'FEED_FORMAT': 'jsonlines'}

    def __init__(self, state='CA', city='San_Francisco', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start_urls = ['http://trulia.com/{state}/{city}'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/property')

    def parse(self, response):
        N = self.get_number_of_pages_to_scrape(response)
        self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    @staticmethod
    def get_number_of_pages_to_scrape(response):
        pagination = response.css('.paginationContainer').xpath('.//*/text()[contains(., "Results")]')
        number_of_results = int(pagination.re_first(r'^1 - 30 of ([\d,]+) Results$').replace(',', ''))
        return math.ceil(number_of_results/30)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        l = TruliaItemLoader(item=TruliaItem(), response=response)
        self.load_common_fields(item_loader=l, response=response)

        listing_information = l.nested_xpath('//span[text() = "LISTING INFORMATION"]')
        listing_information.add_xpath('listing_information', './parent::div/following-sibling::ul[1]/li/text()')
        listing_information.add_xpath('listing_information_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)')

        public_records = l.nested_xpath('//span[text() = "PUBLIC RECORDS"]')
        public_records.add_xpath('public_records', './parent::div/following-sibling::ul[1]/li/text()')
        public_records.add_xpath('public_records_date_updated', './following-sibling::span/text()', re=r'^Updated: (.*)')

        item = l.load_item()
        self.post_process(item=item)
        return item

    @staticmethod
    def load_common_fields(item_loader, response):
        '''Load field values which are common to "on sale" and "recently sold" properties.'''
        item_loader.add_value('url', response.url)
        item_loader.add_xpath('address', '//*[@data-role="address"]/text()')
        item_loader.add_xpath('city_state', '//*[@data-role="cityState"]/text()')
        item_loader.add_xpath('price', '//span[@data-role="price"]/text()', re=r'\$([\d,]+)')
        item_loader.add_xpath('neighborhood', '//*[@data-role="cityState"]/parent::h1/following-sibling::span/a/text()')
        details = item_loader.nested_css('.homeDetailsHeading')
        overview = details.nested_xpath('.//span[contains(text(), "Overview")]/parent::div/following-sibling::div[1]')
        overview.add_xpath('overview', xpath='.//li/text()')
        overview.add_xpath('area', xpath='.//li/text()', re=r'([\d,]+) sqft$')
        overview.add_xpath('lot_size', xpath='.//li/text()', re=r'([\d,.]+) (?:acres|sqft) lot size$')
        overview.add_xpath('lot_size_units', xpath='.//li/text()', re=r'[\d,.]+ (acres|sqft) lot size$')
        overview.add_xpath('price_per_square_foot', xpath='.//li/text()', re=r'\$([\d,.]+)/sqft$')
        overview.add_xpath('bedrooms', xpath='.//li/text()', re=r'(\d+) (?:Beds|Bed|beds|bed)$')
        overview.add_xpath('bathrooms', xpath='.//li/text()', re=r'(\d+) (?:Baths|Bath|baths|bath)$')
        overview.add_xpath('year_built', xpath='.//li/text()', re=r'Built in (\d+)')
        overview.add_xpath('days_on_Trulia', xpath='.//li/text()', re=r'([\d,]+) days on Trulia$')
        overview.add_xpath('views', xpath='.//li/text()', re=r'([\d,]+) views$')
        item_loader.add_css('description', '#descriptionContainer *::text')

        price_events = details.nested_xpath('.//*[text() = "Price History"]/parent::*/following-sibling::*[1]/div/div')
        price_events.add_xpath('prices', './div[contains(text(), "$")]/text()')
        price_events.add_xpath('dates', './div[contains(text(), "$")]/preceding-sibling::div/text()')
        price_events.add_xpath('events', './div[contains(text(), "$")]/following-sibling::div/text()')

    @staticmethod
    def post_process(item):
        '''Add any additional data to an item after loading it.'''
        if item.get('dates') is not None:
            dates = [datetime.datetime.strptime(date, '%m/%d/%Y') for date in item['dates']]
            prices = [int(price.lstrip('$').replace(',', '')) for price in item['prices']]
            item['price_history'] = sorted(list(zip(dates, prices, item['events'])), key=lambda x: x[0])
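
# As a rough illustration of what post_process adds (the values below are
# hypothetical, not actual Trulia data): given an item loaded with
#
#   item['dates']  == ['06/01/2017', '01/15/2016']
#   item['prices'] == ['$1,200,000', '$995,000']
#   item['events'] == ['Listed For Sale', 'Sold']
#
# the method would set, sorted chronologically,
#
#   item['price_history'] == [(datetime.datetime(2016, 1, 15, 0, 0), 995000, 'Sold'),
#                             (datetime.datetime(2017, 6, 1, 0, 0), 1200000, 'Listed For Sale')]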
--------------------------------------------------------------------------------
/trulia_scraper/spiders/trulia_sold.py:
--------------------------------------------------------------------------------
# -*- coding: utf-8 -*-
import os
import scrapy
from scrapy.linkextractors import LinkExtractor
import trulia_scraper.parsing as parsing
from trulia_scraper.items import TruliaItem, TruliaItemLoader
import trulia_scraper.spiders.trulia as trulia
from scrapy.utils.conf import closest_scrapy_cfg


class TruliaSpider(scrapy.Spider):
    name = 'trulia_sold'
    allowed_domains = ['trulia.com']
    custom_settings = {'FEED_URI': os.path.join(os.path.dirname(closest_scrapy_cfg()), 'data/data_sold_%(state)s_%(city)s_%(time)s.jl'),
                       'FEED_FORMAT': 'jsonlines'}

    def __init__(self, state='CA', city='San_Francisco', *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.state = state
        self.city = city
        self.start_urls = ['http://trulia.com/sold/{city},{state}/'.format(state=state, city=city)]
        self.le = LinkExtractor(allow=r'^https://www.trulia.com/homes/.+/sold/')

    def parse(self, response):
        N = trulia.TruliaSpider.get_number_of_pages_to_scrape(response)
        self.logger.info("Determined that property pages are contained on {N} different index pages, each containing at most 30 properties. Proceeding to scrape each index page...".format(N=N))
        for url in [response.urljoin("{n}_p/".format(n=n)) for n in range(1, N+1)]:
            yield scrapy.Request(url=url, callback=self.parse_index_page)

    def parse_index_page(self, response):
        for link in self.le.extract_links(response):
            yield scrapy.Request(url=link.url, callback=self.parse_property_page)

    def parse_property_page(self, response):
        item_loader = TruliaItemLoader(item=TruliaItem(), response=response)
        trulia.TruliaSpider.load_common_fields(item_loader=item_loader, response=response)

        details = item_loader.nested_css('.homeDetailsHeading')
        taxes = details.nested_xpath('.//*[text() = "Property Taxes and Assessment"]/parent::div')
        taxes.add_xpath('property_tax_assessment_year', './following-sibling::div/div[contains(text(), "Year")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax', './following-sibling::div/div[contains(text(), "Tax")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_land', './following-sibling::div/div/div[contains(text(), "Land")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_improvements', './following-sibling::div/div/div[contains(text(), "Improvements")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_assessment_total', './following-sibling::div/div/div[contains(text(), "Total")]/following-sibling::div/text()')
        taxes.add_xpath('property_tax_market_value', './following-sibling::div/div[contains(text(), "Market Value")]/following-sibling::div/text()')

        item = item_loader.load_item()
        trulia.TruliaSpider.post_process(item=item)
        return item

--------------------------------------------------------------------------------