├── .gitignore ├── LICENSE.txt ├── Procfile ├── README.md ├── alembic ├── README ├── alembic.ini ├── env.py ├── script.py.mako └── versions │ ├── 25fd903281e5_initial_property_tables.py │ └── a91f96702e22_update_property_fields.py ├── docker-compose.yml ├── install.sh ├── parcel_id_extractor.py ├── parcel_ids.txt ├── requirements.txt ├── scraper ├── __init__.py ├── items.py ├── middlewares.py ├── models.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ ├── assessment_spider.py │ └── test_assessment_spider.py ├── scrapy.cfg ├── setup.py └── terraform └── scraper_ec2.tf /.gitignore: -------------------------------------------------------------------------------- 1 | venv* 2 | 3 | data/ 4 | output.csv 5 | parcel_ids.txt 6 | .idea 7 | 8 | # below from: https://github.com/github/gitignore/blob/master/Python.gitignore 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | output.csv 110 | 111 | # Local .terraform directories 112 | **/.terraform/* 113 | 114 | # .tfstate files 115 | *.tfstate 116 | *.tfstate.* 117 | 118 | # Crash log files 119 | crash.log 120 | 121 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most 122 | # .tfvars files are managed as part of configuration and so should be included in 123 | # version control. 
124 | #
125 | # example.tfvars
126 | 
127 | # Ignore override files as they are usually used to override resources locally and so
128 | # are not checked in
129 | override.tf
130 | override.tf.json
131 | *_override.tf
132 | *_override.tf.json
133 | 
134 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Code For New Orleans
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | worker: scrapy runspider scraper/spiders/assessment_spider.py
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # assessor-scraper
2 | 
3 | The goal of this project is to transform the data from the Orleans Parish
4 | Assessor's Office [website](http://nolaassessor.com/) into formats that
5 | are better suited for data analysis.
6 | 
7 | ## development environment setup
8 | 
9 | ### prerequisites
10 | 
11 | You must have Python 3 installed. You can download it
12 | [here](https://www.python.org/downloads/).
13 | 
14 | ### first, set up a python [virtual environment](https://docs.python.org/3/library/venv.html#creating-virtual-environments)
15 | 
16 | ```
17 | python3 -m venv .venv
18 | . .venv/bin/activate
19 | ```
20 | 
21 | ### install the dependencies with [pip](https://pip.pypa.io/en/stable/user_guide/#requirements-files)
22 | ```
23 | pip install -r requirements.txt
24 | ```
25 | 
26 | 
27 | ## Getting started
28 | 
29 | ### Set up the database
30 | By default, the scraper is set up to load data into a PostgreSQL database.
31 | Docs on setting up and making changes to the database are [here](alembic/README).
32 | You can quickly get the database running locally using [Docker](https://store.docker.com/search?type=edition&offering=community).
33 | ```
34 | docker-compose up -d db
35 | ```
36 | 
37 | If you want to explore how to extract data using scrapy, use the [scrapy
38 | shell](https://doc.scrapy.org/en/latest/intro/tutorial.html#extracting-data) to interactively
39 | work with the response.
40 | 
41 | For example,
42 | ```
43 | scrapy shell 'http://qpublic9.qpublic.net/la_orleans_display.php?KEY=1500-SUGARBOWLDR'
44 | owner = response.xpath('//td[@class="owner_value"]/text()').get()
45 | total_value = response.xpath('//td[@class="tax_value"]/text()')[3].get().strip()
46 | next_page = response.xpath('//td[@class="header_link"]/a/@href').get()
47 | ```
48 | 
49 | ### Get all the parcel ids
50 | 
51 | Getting a list of parcel ids allows us to build URLs for every property
52 | so we can scrape the data for that parcel. These parcel ids are used
53 | in the URL like `http://qpublic9.qpublic.net/la_orleans_display.php?KEY=701-POYDRASST`,
54 | where `701-POYDRASST` is the parcel id.
55 | 
56 | Running the `parcel_id_extractor.py` script will cleverly use the owner search to
57 | extract all available parcel ids, then save them in a file `parcel_ids.txt`.
58 | 
59 | The file is checked in to the repo, but if you want to run it yourself
60 | to update it with the latest parcel ids, run
61 | 
62 | ```
63 | python parcel_id_extractor.py
64 | ```
65 | 
66 | 
67 | ### Running the spider
68 | Running the spider from the command line will crawl the assessor's website and
69 | [output the data](https://doc.scrapy.org/en/latest/topics/feed-exports.html) to a destination of your choice.
70 | 
71 | By default, the spider will output data to a postgres database, which is configured
72 | in `scraper/settings.py`. You can use a hosted postgres instance or run one locally using
73 | [Docker](https://store.docker.com/search?type=edition&offering=community).
74 | 
75 | > Important Note: Scraping should always be done responsibly, so check the site's [robots.txt](http://www.robotstxt.org/robotstxt.html) to make sure it doesn't explicitly disallow crawling. Also, when running the scraper, be careful not to put unexpected load on the assessor's website - consider running during off-peak hours and keeping an eye on response latency so you aren't overwhelming the servers.
76 | 
77 | 
78 | To run the spider,
79 | ```
80 | scrapy runspider scraper/spiders/assessment_spider.py
81 | ```
82 | > Warning: this will take a long time to run; you can kill the process with ctrl+c.
83 | 
84 | To run the spider and output to a csv,
85 | ```
86 | scrapy runspider scraper/spiders/assessment_spider.py -o output.csv
87 | ```
88 | 
89 | #### Running on Heroku
90 | Set the required environment variables:
91 | ```
92 | heroku config:set DATABASE_URL=postgres://user:pass@host:5432/assessordb
93 | ```
94 | 
95 | You can run the scraper on Heroku by scaling up the worker dyno:
96 | ```
97 | heroku ps:scale worker=1
98 | ```
99 | 
100 | See [the Heroku docs](https://devcenter.heroku.com/articles/getting-started-with-python#introduction) for more info on how to deploy Python code.
101 | 
102 | #### Running in AWS with Terraform
103 | 1) Install [Terraform](https://www.terraform.io/)
104 | 2) `cd terraform`
105 | 3) `terraform init`
106 | 4) `terraform plan`
107 | 5) `terraform apply`
108 | 6) `ssh ubuntu@{public_dns}`, using the `public_dns` value output by `terraform apply`
109 | 
--------------------------------------------------------------------------------
/alembic/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
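The connection string Alembic uses is set in `alembic.ini` (`sqlalchemy.url`) and defaults to the
local `assessor`/`assessor` database created below. If your database runs elsewhere, update that
line before running migrations, for example (hypothetical host shown):

- `sqlalchemy.url = postgresql://assessor:assessor@some-other-host/assessor`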
2 | 3 | # Creating db 4 | - `createuser -d -P assessor` 5 | - `createdb -U assessor assessor` 6 | 7 | # Updating db 8 | From within the `assessor-scraper/alembic` directory, run: 9 | 10 | - `alembic upgrade head` 11 | 12 | # Autogenerating new migration 13 | After making changes to scraper/models.py you'll need to create a database migration to allow others 14 | to update their db with new fields. 15 | 16 | Make sure to include a description of changes: 17 | 18 | - `alembic revision --autogenerate -m description_of_changes` 19 | -------------------------------------------------------------------------------- /alembic/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = . 6 | 7 | # template used to generate migration files 8 | file_template = %%(year)d%%(month).2d%%(day).2d%%(hour).2d%%(minute).2d%%(second).2d_%%(slug)s 9 | 10 | # timezone to use when rendering the date 11 | # within the migration file as well as the filename. 12 | # string value is passed to dateutil.tz.gettz() 13 | # leave blank for localtime 14 | # timezone = 15 | 16 | # max length of characters to apply to the 17 | # "slug" field 18 | #truncate_slug_length = 40 19 | 20 | # set to 'true' to run the environment during 21 | # the 'revision' command, regardless of autogenerate 22 | # revision_environment = false 23 | 24 | # set to 'true' to allow .pyc and .pyo files without 25 | # a source .py file to be detected as revisions in the 26 | # versions/ directory 27 | # sourceless = false 28 | 29 | # version location specification; this defaults 30 | # to alembic/versions. When using multiple version 31 | # directories, initial revisions must be specified with --version-path 32 | # version_locations = %(here)s/bar %(here)s/bat alembic/versions 33 | 34 | # the output encoding used when revision files 35 | # are written from script.py.mako 36 | # output_encoding = utf-8 37 | 38 | sqlalchemy.url = postgresql://assessor:assessor@localhost/assessor 39 | 40 | 41 | # Logging configuration 42 | [loggers] 43 | keys = root,sqlalchemy,alembic 44 | 45 | [handlers] 46 | keys = console 47 | 48 | [formatters] 49 | keys = generic 50 | 51 | [logger_root] 52 | level = WARN 53 | handlers = console 54 | qualname = 55 | 56 | [logger_sqlalchemy] 57 | level = WARN 58 | handlers = 59 | qualname = sqlalchemy.engine 60 | 61 | [logger_alembic] 62 | level = INFO 63 | handlers = 64 | qualname = alembic 65 | 66 | [handler_console] 67 | class = StreamHandler 68 | args = (sys.stderr,) 69 | level = NOTSET 70 | formatter = generic 71 | 72 | [formatter_generic] 73 | format = %(levelname)-5.5s [%(name)s] %(message)s 74 | datefmt = %H:%M:%S 75 | -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | from logging.config import fileConfig 4 | 5 | from sqlalchemy import engine_from_config 6 | from sqlalchemy import pool 7 | 8 | from alembic import context 9 | 10 | # this is the Alembic Config object, which provides 11 | # access to the values within the .ini file in use. 12 | config = context.config 13 | 14 | # Interpret the config file for Python logging. 15 | # This line sets up loggers basically. 
16 | fileConfig(config.config_file_name) 17 | 18 | # add your model's MetaData object here 19 | # for 'autogenerate' support 20 | # from myapp import mymodel 21 | # target_metadata = mymodel.Base.metadata 22 | import os 23 | import sys 24 | parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..")) 25 | sys.path.append(parent_dir) 26 | from scraper import models 27 | target_metadata = models.Base.metadata 28 | 29 | # other values from the config, defined by the needs of env.py, 30 | # can be acquired: 31 | # my_important_option = config.get_main_option("my_important_option") 32 | # ... etc. 33 | 34 | 35 | def run_migrations_offline(): 36 | """Run migrations in 'offline' mode. 37 | 38 | This configures the context with just a URL 39 | and not an Engine, though an Engine is acceptable 40 | here as well. By skipping the Engine creation 41 | we don't even need a DBAPI to be available. 42 | 43 | Calls to context.execute() here emit the given string to the 44 | script output. 45 | 46 | """ 47 | url = config.get_main_option("sqlalchemy.url") 48 | context.configure( 49 | url=url, target_metadata=target_metadata, literal_binds=True 50 | ) 51 | 52 | with context.begin_transaction(): 53 | context.run_migrations() 54 | 55 | 56 | def run_migrations_online(): 57 | """Run migrations in 'online' mode. 58 | 59 | In this scenario we need to create an Engine 60 | and associate a connection with the context. 61 | 62 | """ 63 | connectable = engine_from_config( 64 | config.get_section(config.config_ini_section), 65 | prefix="sqlalchemy.", 66 | poolclass=pool.NullPool, 67 | ) 68 | 69 | with connectable.connect() as connection: 70 | context.configure( 71 | connection=connection, target_metadata=target_metadata 72 | ) 73 | 74 | with context.begin_transaction(): 75 | context.run_migrations() 76 | 77 | 78 | if context.is_offline_mode(): 79 | run_migrations_offline() 80 | else: 81 | run_migrations_online() 82 | -------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /alembic/versions/25fd903281e5_initial_property_tables.py: -------------------------------------------------------------------------------- 1 | """initial_property_tables 2 | 3 | Revision ID: 25fd903281e5 4 | Revises: 5 | Create Date: 2019-01-16 19:49:42.948323 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '25fd903281e5' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('properties', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('property_key', sa.String(), nullable=False), 24 | sa.Column('todays_date', sa.String(), nullable=True), 25 | sa.Column('location', sa.String(), nullable=True), 26 | sa.Column('owner_name', sa.String(), nullable=True), 27 | sa.Column('mailing_address', sa.String(), nullable=True), 28 | sa.Column('municipal_district', sa.String(), nullable=True), 29 | sa.Column('location_address', sa.String(), nullable=True), 30 | sa.Column('tax_bill_number', sa.String(), nullable=True), 31 | sa.Column('property_class', sa.String(), nullable=True), 32 | sa.Column('special_tax_district', sa.String(), nullable=True), 33 | sa.Column('subdivision_name', sa.String(), nullable=True), 34 | sa.Column('land_area_sq_ft', sa.String(), nullable=True), 35 | sa.Column('zoning_district', sa.String(), nullable=True), 36 | sa.Column('building_area_sq_ft', sa.String(), nullable=True), 37 | sa.Column('square', sa.String(), nullable=True), 38 | sa.Column('lot', sa.String(), nullable=True), 39 | sa.Column('book', sa.String(), nullable=True), 40 | sa.Column('folio', sa.String(), nullable=True), 41 | sa.Column('line', sa.String(), nullable=True), 42 | sa.Column('parcel_map', sa.String(), nullable=True), 43 | sa.Column('legal_description', sa.String(), nullable=True), 44 | sa.Column('assessment_area', sa.String(), nullable=True), 45 | sa.PrimaryKeyConstraint('id') 46 | ) 47 | op.create_table('property_transfers', 48 | sa.Column('id', sa.Integer(), nullable=False), 49 | sa.Column('property_id', sa.Integer(), nullable=True), 50 | sa.Column('sale_transfer_date', sa.String(), nullable=True), 51 | sa.Column('price', sa.String(), nullable=True), 52 | sa.Column('grantor', sa.String(), nullable=True), 53 | sa.Column('grantee', sa.String(), nullable=True), 54 | sa.Column('notarial_archive_number', sa.String(), nullable=True), 55 | sa.Column('instrument_number', sa.String(), nullable=True), 56 | sa.ForeignKeyConstraint(['property_id'], ['properties.id'], ), 57 | sa.PrimaryKeyConstraint('id') 58 | ) 59 | op.create_table('property_values', 60 | sa.Column('id', sa.Integer(), nullable=False), 61 | sa.Column('property_id', sa.Integer(), nullable=True), 62 | sa.Column('year', sa.String(), nullable=True), 63 | sa.Column('land_value', sa.String(), nullable=True), 64 | sa.Column('building_value', sa.String(), nullable=True), 65 | sa.Column('total_value', sa.String(), nullable=True), 66 | sa.Column('assessed_land_value', sa.String(), nullable=True), 67 | sa.Column('assessed_building_value', sa.String(), nullable=True), 68 | sa.Column('total_assessed_value', sa.String(), nullable=True), 69 | sa.Column('homestead_exemption_value', sa.String(), nullable=True), 70 | sa.Column('taxable_assessment', sa.String(), nullable=True), 71 | sa.Column('age_freeze', sa.String(), nullable=True), 72 | sa.Column('disability_freeze', sa.String(), nullable=True), 73 | sa.Column('assmnt_change', sa.String(), nullable=True), 74 | sa.Column('tax_contract', sa.String(), nullable=True), 75 | sa.ForeignKeyConstraint(['property_id'], ['properties.id'], ), 76 | sa.PrimaryKeyConstraint('id') 77 | ) 78 | # ### end Alembic commands ### 79 | 80 | 81 | def downgrade(): 82 | # ### commands auto generated by Alembic - please adjust! 
### 83 | op.drop_table('property_values') 84 | op.drop_table('property_transfers') 85 | op.drop_table('properties') 86 | # ### end Alembic commands ### 87 | -------------------------------------------------------------------------------- /alembic/versions/a91f96702e22_update_property_fields.py: -------------------------------------------------------------------------------- 1 | """update_property_fields 2 | 3 | Revision ID: a91f96702e22 4 | Revises: 25fd903281e5 5 | Create Date: 2019-01-16 19:51:33.633171 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a91f96702e22' 14 | down_revision = '25fd903281e5' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('properties', sa.Column('lot_folio', sa.String(), nullable=True)) 22 | op.add_column('properties', sa.Column('revised_bldg_area_sqft', sa.String(), nullable=True)) 23 | op.drop_column('properties', 'lot') 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade(): 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | op.add_column('properties', sa.Column('lot', sa.VARCHAR(), autoincrement=False, nullable=True)) 30 | op.drop_column('properties', 'revised_bldg_area_sqft') 31 | op.drop_column('properties', 'lot_folio') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | db: 4 | image: postgres:alpine 5 | restart: always 6 | environment: 7 | POSTGRES_USER: assessor 8 | POSTGRES_PASSWORD: assessor 9 | ports: 10 | - 5432:5432 11 | elasticsearch1: 12 | image: docker.elastic.co/elasticsearch/elasticsearch:5.5.0 13 | container_name: elasticsearch1 14 | environment: 15 | - cluster.name=docker-cluster 16 | - bootstrap.memory_lock=true 17 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 18 | ulimits: 19 | memlock: 20 | soft: -1 21 | hard: -1 22 | mem_limit: 1g 23 | volumes: 24 | - ./data/esdata1:/usr/share/elasticsearch/data 25 | ports: 26 | - 9200:9200 27 | networks: 28 | - esnet 29 | elasticsearch2: 30 | container_name: elasticsearch2 31 | image: docker.elastic.co/elasticsearch/elasticsearch:5.5.0 32 | environment: 33 | - cluster.name=docker-cluster 34 | - bootstrap.memory_lock=true 35 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 36 | - "discovery.zen.ping.unicast.hosts=elasticsearch1" 37 | ulimits: 38 | memlock: 39 | soft: -1 40 | hard: -1 41 | mem_limit: 1g 42 | volumes: 43 | - ./data/esdata2:/usr/share/elasticsearch/data 44 | networks: 45 | - esnet 46 | kibana: 47 | image: docker.elastic.co/kibana/kibana:5.5.0 48 | container_name: kibana 49 | environment: 50 | ELASTICSEARCH_URL: http://elasticsearch1:9200 51 | ports: 52 | - 5601:5601 53 | networks: 54 | - esnet 55 | volumes: 56 | esdata1: 57 | driver: local 58 | esdata2: 59 | driver: local 60 | 61 | networks: 62 | esnet: 63 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | sudo apt-get -y update 6 | sudo apt-get -y install build-essential python3-dev 7 | sudo apt-get -y install python-pip 8 | 9 | sudo apt-get -y install postgresql postgresql-contrib libpq-dev 10 | sudo service postgresql start 11 | 12 | sudo -u postgres 
createuser -s -d assessor 13 | sudo -u postgres psql -c "ALTER USER assessor WITH PASSWORD 'assessor';" 14 | sudo -u postgres createdb assessor 15 | sudo -u postgres psql -c "grant all privileges on database assessor to assessor" 16 | 17 | sudo pip install virtualenv 18 | 19 | git clone https://github.com/codefornola/assessor-scraper.git 20 | 21 | virtualenv -p python3 venv_scraper 22 | 23 | . venv_scraper/bin/activate 24 | 25 | pip install requests 26 | pip install psycopg2==2.7.3.2 27 | pip install pyproj 28 | pip install SQLAlchemy==1.1.15 29 | pip install Scrapy==1.4.0 30 | -------------------------------------------------------------------------------- /parcel_id_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import threading 5 | from queue import Queue 6 | from string import ascii_uppercase, digits 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | 11 | SEARCH_URL = "http://qpublic9.qpublic.net/la_orleans_alsearch.php?" \ 12 | "searchType=owner_name&INPUT={}&BEGIN={}" 13 | 14 | Q = Queue() 15 | 16 | 17 | class ParcelIdExtractor(object): 18 | """ 19 | Fuzzes the owner search to extract all available parcel ids 20 | """ 21 | 22 | def __init__(self): 23 | self.parcel_ids = frozenset() 24 | self.lock = threading.Lock() 25 | 26 | def update_ids(self, ids): 27 | """Use a lock to prevent multiple threads from updating parcel_ids""" 28 | self.lock.acquire() 29 | self.parcel_ids |= frozenset(ids) 30 | self.lock.release() 31 | 32 | def search_all_terms(self): 33 | """ 34 | Puts all the search terms on a queue to be processed by worker threads. 35 | Note: all owner names are capitalized on the assessor's site, so we 36 | only use capitalized letters 37 | """ 38 | # 0-9 + A-Z 39 | terms = [d for d in digits] + [l for l in ascii_uppercase] 40 | [Q.put(t) for t in terms] 41 | 42 | def search(self, search_term, begin=0): 43 | """ 44 | Searches by owner name, extracts the parcel ids, then recursively pages 45 | through the results until no more ids are found for the search_term 46 | """ 47 | thread = threading.current_thread().getName() 48 | url = SEARCH_URL.format(search_term, begin) 49 | print('{} searching {}'.format(thread, url)) 50 | r = requests.get(url) 51 | if 'No Records Found.' 
in r.text: 52 | return 53 | else: 54 | soup = BeautifulSoup(r.text, 'html.parser') 55 | pids = [td.a.text for td in soup.select('td.search_value') 56 | if td.a is not None and td.a.text != 'Map It'] 57 | if len(pids) > 0: 58 | self.update_ids(pids) 59 | self.search(search_term, begin + len(pids)) 60 | 61 | def process_queue(self): 62 | while not Q.empty(): 63 | term = Q.get() 64 | self.search(term) 65 | Q.task_done() 66 | 67 | def main(self, file_name='parcel_ids.txt', num_worker_threads=10): 68 | try: 69 | # populate queue with all the search terms 70 | self.search_all_terms() 71 | # start worker threads to process queue 72 | threads = [] 73 | for i in range(num_worker_threads): 74 | t = threading.Thread(target=self.process_queue) 75 | threads.append(t) 76 | t.start() 77 | # wait for all threads to complete 78 | [t.join() for t in threads] 79 | with open(file_name, 'w') as f: 80 | print('writing {} parcel ids'.format(len(self.parcel_ids))) 81 | for id in self.parcel_ids: 82 | f.write(id + os.linesep) 83 | except Exception as error: 84 | print(error) 85 | 86 | 87 | if __name__ == '__main__': 88 | ParcelIdExtractor().main() 89 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.0.8 2 | asn1crypto==0.24.0 3 | attrs>=19.1.0 4 | Automat==0.7.0 5 | beautifulsoup4==4.7.1 6 | certifi==2019.3.9 7 | cffi==1.12.2 8 | chardet==3.0.4 9 | constantly==15.1.0 10 | cryptography>=2.6.1 11 | cssselect==1.0.3 12 | elasticsearch==7.0.0 13 | hyperlink==19.0.0 14 | idna==2.8 15 | incremental==17.5.0 16 | lxml==4.3.3 17 | parsel==1.5.1 18 | psycopg2==2.8.1 19 | pyasn1==0.4.5 20 | pyasn1-modules==0.2.4 21 | pycparser==2.19 22 | PyDispatcher==2.0.5 23 | pyopenssl>=19.0.0 24 | pyproj==2.1.3 25 | queuelib==1.5.0 26 | requests>=2.21.0 27 | Scrapy==1.6.0 28 | ScrapyElasticSearch==0.9.1 29 | service-identity==18.1.0 30 | six==1.12.0 31 | SQLAlchemy==1.3.2 32 | Twisted==19.2.0 33 | urllib3>=1.24.1 34 | w3lib==1.20.0 35 | zope.interface==4.6.0 36 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefornola/assessor-scraper/8245c6640daddd6679f793f63f4e46eb4a4bb5ab/scraper/__init__.py -------------------------------------------------------------------------------- /scraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class Property(scrapy.Item): 10 | property_key = scrapy.Field() 11 | owner_name = scrapy.Field() 12 | todays_date = scrapy.Field() 13 | mailing_address = scrapy.Field() 14 | municipal_district = scrapy.Field() 15 | location_address = scrapy.Field() 16 | location = scrapy.Field() 17 | tax_bill_number = scrapy.Field() 18 | property_class = scrapy.Field() 19 | special_tax_district = scrapy.Field() 20 | subdivision_name = scrapy.Field() 21 | land_area_sq_ft = scrapy.Field() 22 | zoning_district = scrapy.Field() 23 | building_area_sq_ft = scrapy.Field() 24 | square = scrapy.Field() 25 | lot_folio = scrapy.Field() 26 | book = scrapy.Field() 27 | folio = scrapy.Field() 28 | line = scrapy.Field() 29 | parcel_map = scrapy.Field() 30 | legal_description = scrapy.Field() 31 | assessment_area = scrapy.Field() 32 | 
sales = scrapy.Field() 33 | values = scrapy.Field() 34 | revised_bldg_area_sqft = scrapy.Field() 35 | -------------------------------------------------------------------------------- /scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ScraperSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 
--------------------------------------------------------------------------------
/scraper/models.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import os
4 | import logging
5 | from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
6 | from sqlalchemy.engine.url import URL
7 | from sqlalchemy.ext.declarative import declarative_base
8 | from sqlalchemy.orm import relationship
9 | 
10 | from scraper import settings
11 | 
12 | Base = declarative_base()
13 | 
14 | 
15 | def db_connect():
16 |     """
17 |     Returns a sqlalchemy engine instance for the DATABASE_URL env var, or settings.DATABASE
18 |     """
19 |     if 'DATABASE_URL' in os.environ:
20 |         DATABASE_URL = os.environ['DATABASE_URL']
21 |         logging.debug("Connecting to %s", DATABASE_URL)
22 |     else:
23 |         DATABASE_URL = URL(**settings.DATABASE)
24 |         logging.debug("Connecting with settings %s", DATABASE_URL)
25 |     return create_engine(DATABASE_URL)
26 | 
27 | 
28 | def create_tables(engine):
29 |     Base.metadata.create_all(engine)
30 | 
31 | 
32 | class Property(Base):
33 |     __tablename__ = 'properties'
34 | 
35 |     id = Column(Integer, primary_key=True)
36 |     property_key = Column(String, nullable=False)
37 |     todays_date = Column(String)
38 |     location = Column(String)
39 |     owner_name = Column(String)
40 |     mailing_address = Column(String)
41 |     municipal_district = Column(String)
42 |     location_address = Column(String)
43 |     tax_bill_number = Column(String)
44 |     property_class = Column(String)
45 |     special_tax_district = Column(String)
46 |     subdivision_name = Column(String)
47 |     land_area_sq_ft = Column(String)
48 |     revised_bldg_area_sqft = Column(String)
49 |     zoning_district = Column(String)
50 |     building_area_sq_ft = Column(String)
51 |     square = Column(String)
52 |     lot_folio = Column(String)
53 |     book = Column(String)
54 |     folio = Column(String)
55 |     line = Column(String)
56 |     parcel_map = Column(String)
57 |     legal_description = Column(String)
58 |     assessment_area = Column(String)
59 |     values = relationship('PropertyValue')
60 |     transfers = relationship('PropertyTransfer')
61 | 
62 | 
63 | class PropertyValue(Base):
64 |     __tablename__ = 'property_values'
65 | 
66 |     id = Column(Integer, primary_key=True)
67 |     property_id = Column(Integer, ForeignKey('properties.id'))
68 |     year = Column(String)
69 |     land_value = Column(String)
70 |     building_value = Column(String)
71 |     total_value = Column(String)
72 |     assessed_land_value = Column(String)
73 |     assessed_building_value = Column(String)
74 |     total_assessed_value = Column(String)
75 |     homestead_exemption_value = Column(String)
76 |     taxable_assessment = Column(String)
77 |     age_freeze = Column(String)
78 |     disability_freeze = Column(String)
79 |     assmnt_change = Column(String)
80 |     tax_contract = Column(String)
81 | 
82 | 
83 | class PropertyTransfer(Base):
84 |     __tablename__ = 'property_transfers'
85 | 
86 |     id = Column(Integer, primary_key=True)
87 |     property_id = Column(Integer, ForeignKey('properties.id'))
88 |     sale_transfer_date = Column(String)
89 |     price = Column(String)
90 |     grantor = Column(String)
91 |     grantee = Column(String)
92 |     notarial_archive_number = Column(String)
93 |     instrument_number = Column(String)
94 | 
--------------------------------------------------------------------------------
/scraper/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your
item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from sqlalchemy.orm import sessionmaker 9 | 10 | from scraper.models import Property, PropertyTransfer, PropertyValue, db_connect, create_tables 11 | 12 | 13 | class PostgresPipeline(object): 14 | """Pipeline for storing scraped items in postgres""" 15 | 16 | def __init__(self): 17 | engine = db_connect() 18 | create_tables(engine) 19 | self.Session = sessionmaker(bind=engine) 20 | 21 | def process_item(self, item, spider): 22 | """ 23 | This method is called for every item emitted by the spider. 24 | """ 25 | session = self.Session() 26 | sales = item['sales'] 27 | values = item['values'] 28 | del item['sales'] 29 | del item['values'] 30 | property = Property(**item) 31 | 32 | try: 33 | session.add(property) 34 | # flush to obtain the id of property to be used as the foreign key 35 | session.flush() 36 | 37 | for sale in sales: 38 | sale['property_id'] = property.id 39 | session.add(PropertyTransfer(**sale)) 40 | for value in values: 41 | value['property_id'] = property.id 42 | session.add(PropertyValue(**value)) 43 | session.commit() 44 | except: 45 | session.rollback() 46 | raise 47 | finally: 48 | session.close() 49 | 50 | return item 51 | -------------------------------------------------------------------------------- /scraper/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for scraper project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'scraper' 13 | 14 | SPIDER_MODULES = ['scraper.spiders'] 15 | NEWSPIDER_MODULE = 'scraper.spiders' 16 | 17 | # Obey robots.txt rules 18 | ROBOTSTXT_OBEY = False 19 | 20 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 21 | # See also autothrottle settings 22 | # CONCURRENT_REQUESTS = 32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # DOWNLOAD_DELAY = 3 27 | # RANDOMIZE_DOWNLOAD_DELAY = True 28 | # The download delay setting will honor only one of: 29 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 30 | # CONCURRENT_REQUESTS_PER_IP = 16 31 | 32 | # Enable and configure the AutoThrottle extension (disabled by default) 33 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 34 | AUTOTHROTTLE_ENABLED = True 35 | # The initial download delay 36 | # AUTOTHROTTLE_START_DELAY = 5 37 | # The maximum download delay to be set in case of high latencies 38 | # AUTOTHROTTLE_MAX_DELAY = 60 39 | # The average number of requests Scrapy should be sending in parallel to 40 | # each remote server 41 | AUTOTHROTTLE_TARGET_CONCURRENCY = 8 42 | # Enable showing throttling stats for every response received: 43 | # AUTOTHROTTLE_DEBUG = True 44 | 45 | # Disable cookies (enabled by default) 46 | # COOKIES_ENABLED = False 47 | 48 | # Disable Telnet Console (enabled by default) 49 | # TELNETCONSOLE_ENABLED = False 50 | 51 | # Override the default request headers: 52 | DEFAULT_REQUEST_HEADERS = { 53 | 
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 54 | 'Accept-Language': 'en', 55 | 'Accept-Encoding': 'gzip, deflate, sdch', 56 | } 57 | 58 | # Enable or disable spider middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 60 | # SPIDER_MIDDLEWARES = { 61 | # 'scraper.middlewares.ScraperSpiderMiddleware': 543, 62 | # } 63 | 64 | # Enable or disable downloader middlewares 65 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 66 | # DOWNLOADER_MIDDLEWARES = { 67 | # 'scraper.middlewares.MyCustomDownloaderMiddleware': 543, 68 | # } 69 | 70 | # Enable or disable extensions 71 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 72 | # EXTENSIONS = { 73 | # 'scrapy.extensions.telnet.TelnetConsole': None, 74 | # } 75 | 76 | # Configure item pipelines 77 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 78 | ITEM_PIPELINES = { 79 | 'scraper.pipelines.PostgresPipeline': 300, 80 | } 81 | # ITEM_PIPELINES = { 82 | # 'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline': 500 83 | # } 84 | 85 | ELASTICSEARCH_SERVERS = ['http://elastic:changeme@localhost:9200'] 86 | ELASTICSEARCH_INDEX = 'assessor' 87 | ELASTICSEARCH_INDEX_DATE_FORMAT = '%Y-%m' 88 | ELASTICSEARCH_TYPE = 'property' 89 | ELASTICSEARCH_UNIQ_KEY = 'property_key' 90 | 91 | DATABASE = { 92 | 'drivername': 'postgres', 93 | 'host': 'localhost', 94 | 'port': '5432', 95 | 'username': 'assessor', 96 | 'password': 'assessor', 97 | 'database': 'assessor' 98 | } 99 | 100 | # Enable and configure HTTP caching (disabled by default) 101 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 102 | # HTTPCACHE_ENABLED = True 103 | # HTTPCACHE_EXPIRATION_SECS = 0 104 | # HTTPCACHE_DIR = 'httpcache' 105 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 106 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 107 | -------------------------------------------------------------------------------- /scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
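# Spiders in this package are discovered via the SPIDER_MODULES setting in
# scraper/settings.py; run them with `scrapy runspider` as described in the README.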
5 | 
--------------------------------------------------------------------------------
/scraper/spiders/assessment_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import logging
4 | import pprint
5 | import re
6 | import os
7 | from urllib.parse import urlparse, parse_qs
8 | 
9 | import requests
10 | import scrapy
11 | from pyproj import Proj, transform
12 | 
13 | from scraper.items import Property
14 | from scrapy.exceptions import DropItem
15 | 
16 | logging.getLogger('scrapy').setLevel(logging.WARNING)
17 | logging.getLogger('scrapy.extensions.throttle').setLevel(logging.INFO)
18 | logging.getLogger('urllib3').setLevel(logging.WARNING)
19 | pp = pprint.PrettyPrinter()
20 | 
21 | URL = "http://qpublic9.qpublic.net/la_orleans_display.php?KEY={}"
22 | 
23 | 
24 | class AssessmentSpider(scrapy.Spider):
25 |     """
26 |     All spiders must subclass scrapy.Spider
27 |     https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider
28 |     """
29 |     name = "assessment_spider"
30 |     with open('parcel_ids.txt') as f:  # one start url per parcel id
31 |         start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
32 | 
33 |     def parse(self, response):
34 |         """
35 |         Default callback function with response for the crawled url
36 |         https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
37 |         """
38 |         response = response.replace(body=re.sub(r"<br\s*/?>", "\n", response.body.decode('utf-8')))  # turn <br> tags into newlines
39 |         property_key = response.url.split('=')[1].replace('&', '')
40 |         # logging.debug("Parsing property_key: %s", property_key)
41 |         if 'No Data at this time' in response.text:
42 |             msg = "No data for " + response.url
43 |             logging.warning(msg)
44 |             raise DropItem(msg)
45 |         else:
46 |             property_info = self.parse_property_info(response)
47 |             property_values = self.parse_property_values(response)
48 |             property_sales = self.parse_property_sales(response)
49 |             property_info['sales'] = property_sales
50 |             property_info['values'] = property_values
51 |             property_info['property_key'] = property_key
52 |             yield Property(property_info)
53 | 
54 |     @staticmethod
55 |     def get_address_location(parcel_map_link):
56 |         """
57 |         Parses the parcel map link and calculates coordinates from the extent.
58 | An example link looks like this: 59 | http://qpublic9.qpublic.net/qpmap4/map.php?county=la_orleans&parcel=41050873&extent=3667340+524208+3667804+524540&layers=parcels+aerials+roads+lakes 60 | """ 61 | o = urlparse(parcel_map_link) 62 | query = parse_qs(o.query) 63 | bbox = query['extent'][0].split(' ') 64 | x1, y1, x2, y2 = [float(pt) for pt in bbox] 65 | # get the midpoint of the extent 66 | midpoint = [(x1 + x2) / 2, (y1 + y2) / 2] 67 | # transform projected coordinates to latitude and longitude 68 | in_proj = Proj(init='epsg:3452', preserve_units=True) 69 | out_proj = Proj(init='epsg:4326') 70 | return transform(in_proj, out_proj, midpoint[0], midpoint[1]) 71 | 72 | def parse_property_info(self, response): 73 | hdrs = [h.extract().strip() for h in response.xpath('//td[@class="owner_header"]/font/text()')] 74 | value_cells = response.xpath('//td[@class="owner_value"]') 75 | value_texts = [self._extract_text_from_value_cell(value_cell) for value_cell in value_cells] 76 | value_fonts = [self._extract_font_from_value_cell(value_cell) for value_cell in value_cells] 77 | value_hrefs = [self._extract_href_from_value_cell(value_cell) for value_cell in value_cells] 78 | vals = [' '.join([v1, v2, v3]).strip() for v1, v2, v3 in zip(value_texts, value_fonts, value_hrefs)] 79 | keys = [self._clean_key(h) for h in hdrs] 80 | info = dict(zip(keys, vals)) 81 | # get href to parcel map if it exists 82 | links = response.xpath('//td[@class="owner_value"]/a[contains(@href,"extent")]/@href') 83 | if len(links) > 0: 84 | parcel_map_link = links[0].extract() 85 | [lng, lat] = self.get_address_location(parcel_map_link) 86 | info['location'] = [lng, lat] 87 | return info 88 | 89 | @staticmethod 90 | def _extract_text_from_value_cell(value_cell): 91 | return '\n'.join([v.extract().strip() for v in value_cell.xpath('text()')]) 92 | 93 | @staticmethod 94 | def _extract_font_from_value_cell(value_cell): 95 | return '\n'.join([v.extract().strip() for v in value_cell.xpath('font/text()')]) 96 | 97 | @staticmethod 98 | def _extract_href_from_value_cell(value_cell): 99 | return '\n'.join([v.extract().strip() for v in value_cell.xpath('a/@href')]) 100 | 101 | def parse_property_sales(self, response): 102 | hdrs = response.css('td[class="sales_header"] > font::text').extract() 103 | keys = [self._clean_key(h) for h in hdrs] 104 | value_info = response.css('td[class="sales_value"]').xpath('./text()').extract() 105 | values = [v.replace('\xa0', '').strip().replace(' ', '') for v in value_info] 106 | sales = [] 107 | for i in range(0, len(values), 6): 108 | sale = values[i:i + 6] 109 | sales.append(dict(zip(keys, sale))) 110 | return sales 111 | 112 | @staticmethod 113 | def _clean_key(key): 114 | cleaned_key = re.sub(r"[\(|\)\']", "", key.lower()) 115 | cleaned_key = re.sub(r"[/ \n]", "_", cleaned_key) 116 | cleaned_key = re.sub(r"_+", "_", cleaned_key) 117 | return cleaned_key.strip() 118 | 119 | def parse_property_values(self, response): 120 | hdrs = response.css('td[class="tax_header"] > font::text').extract() 121 | keys = [self._clean_key(h) for h in hdrs] 122 | value_info = response.css('.tax_value').xpath('./text()').extract() 123 | values = [v.replace('\xa0', '').replace(' ', '') for v in value_info] 124 | special_treatment_info = response.css('.tax_value').xpath('./font').extract() 125 | special_treatment_info = [re.sub('<[^>]*>', '', s) for s in special_treatment_info] 126 | year1_vals = values[0:9] 127 | year1_vals.extend(special_treatment_info[0:4]) 128 | year2_vals = values[16:25] 129 | 
year2_vals.extend(special_treatment_info[4:8]) 130 | year3_vals = values[32:41] 131 | year3_vals.extend(special_treatment_info[8:12]) 132 | return [dict(zip(keys, year1_vals)), 133 | dict(zip(keys, year2_vals)), 134 | dict(zip(keys, year3_vals))] -------------------------------------------------------------------------------- /scraper/spiders/test_assessment_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from unittest.mock import patch, mock_open 5 | 6 | class AssessmentSpiderTestCase(unittest.TestCase): 7 | def setUp(self): 8 | with patch("builtins.open", mock_open(read_data="data1\ndata2")) as mock_file: 9 | from scraper.spiders.assessment_spider import AssessmentSpider 10 | self.spider = AssessmentSpider() 11 | 12 | def test_clean_key__lot_folio(self): 13 | result = self.spider._clean_key('Lot / Folio') 14 | self.assertEqual( 15 | result, 16 | 'lot_folio' 17 | ) 18 | 19 | def test_clean_key__land_area(self): 20 | result = self.spider._clean_key('Land Area (sq ft) ') 21 | self.assertEqual( 22 | result, 23 | 'land_area_sq_ft' 24 | ) 25 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scraper 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='assessor-scraper', 5 | version='0.1', 6 | description='', 7 | url='https://github.com/codefornola/assessor-scraper', 8 | author='CodeForNola', 9 | author_email='', 10 | packages=['scraper'], 11 | install_requires=[ 12 | "psycopg2==2.7.3.2", 13 | "pyproj", 14 | "requests", 15 | "Scrapy==1.4.0", 16 | "SQLAlchemy==1.1.15", 17 | ], 18 | zip_safe=False 19 | ) 20 | -------------------------------------------------------------------------------- /terraform/scraper_ec2.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = "us-east-1" 3 | } 4 | 5 | resource "aws_instance" "scraper_server" { 6 | ami = "ami-2757f631" 7 | instance_type = "t2.micro" 8 | key_name = "deployer-key" 9 | 10 | vpc_security_group_ids = [ 11 | "${aws_security_group.allow_inbound_ssh.id}", 12 | "${aws_security_group.allow_outbound_http.id}" 13 | ] 14 | 15 | connection { 16 | type = "ssh" 17 | user = "ubuntu" 18 | private_key = "${file("~/.ssh/id_rsa")}" 19 | } 20 | 21 | provisioner "remote-exec" { 22 | script = "../install.sh" 23 | } 24 | } 25 | 26 | resource "aws_key_pair" "deployer" { 27 | key_name = "deployer-key" 28 | public_key = "${file("~/.ssh/id_rsa.pub")}" 29 | } 30 | 31 | resource "aws_security_group" "allow_inbound_ssh" { 32 | name = "allow_inbound_ssh" 33 | description = "Allow ssh inbound traffic" 34 | 35 | ingress { 36 | from_port = 22 37 | to_port = 22 38 | protocol = "tcp" 39 | cidr_blocks = ["0.0.0.0/0"] 40 | ipv6_cidr_blocks = ["::/0"] 41 | description = "Allow ssh" 42 | } 43 | } 44 | 45 | resource "aws_security_group" "allow_outbound_http" { 46 | name = "allow_outbound_http" 47 | description = "Allow outbound 
http and https traffic" 48 | 49 | egress { 50 | from_port = 80 51 | to_port = 80 52 | protocol = "tcp" 53 | cidr_blocks = ["0.0.0.0/0"] 54 | ipv6_cidr_blocks = ["::/0"] 55 | description = "Allow http" 56 | } 57 | 58 | egress { 59 | from_port = 443 60 | to_port = 443 61 | protocol = "tcp" 62 | cidr_blocks = ["0.0.0.0/0"] 63 | ipv6_cidr_blocks = ["::/0"] 64 | description = "Allow https" 65 | } 66 | } 67 | 68 | output "public_dns" { 69 | value = "${aws_instance.scraper_server.public_dns}" 70 | } 71 | --------------------------------------------------------------------------------