├── .gitignore ├── LICENSE.txt ├── Procfile ├── README.md ├── alembic ├── README ├── alembic.ini ├── env.py ├── script.py.mako └── versions │ ├── 25fd903281e5_initial_property_tables.py │ └── a91f96702e22_update_property_fields.py ├── docker-compose.yml ├── install.sh ├── parcel_id_extractor.py ├── parcel_ids.txt ├── requirements.txt ├── scraper ├── __init__.py ├── items.py ├── middlewares.py ├── models.py ├── pipelines.py ├── settings.py └── spiders │ ├── __init__.py │ ├── assessment_spider.py │ └── test_assessment_spider.py ├── scrapy.cfg ├── setup.py └── terraform └── scraper_ec2.tf /.gitignore: -------------------------------------------------------------------------------- 1 | venv* 2 | 3 | data/ 4 | output.csv 5 | parcel_ids.txt 6 | .idea 7 | 8 | # below from: https://github.com/github/gitignore/blob/master/Python.gitignore 9 | # Byte-compiled / optimized / DLL files 10 | __pycache__/ 11 | *.py[cod] 12 | *$py.class 13 | 14 | # C extensions 15 | *.so 16 | 17 | # Distribution / packaging 18 | .Python 19 | build/ 20 | develop-eggs/ 21 | dist/ 22 | downloads/ 23 | eggs/ 24 | .eggs/ 25 | lib/ 26 | lib64/ 27 | parts/ 28 | sdist/ 29 | var/ 30 | wheels/ 31 | *.egg-info/ 32 | .installed.cfg 33 | *.egg 34 | 35 | # PyInstaller 36 | # Usually these files are written by a python script from a template 37 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 38 | *.manifest 39 | *.spec 40 | 41 | # Installer logs 42 | pip-log.txt 43 | pip-delete-this-directory.txt 44 | 45 | # Unit test / coverage reports 46 | htmlcov/ 47 | .tox/ 48 | .coverage 49 | .coverage.* 50 | .cache 51 | nosetests.xml 52 | coverage.xml 53 | *.cover 54 | .hypothesis/ 55 | 56 | # Translations 57 | *.mo 58 | *.pot 59 | 60 | # Django stuff: 61 | *.log 62 | local_settings.py 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # pyenv 81 | .python-version 82 | 83 | # celery beat schedule file 84 | celerybeat-schedule 85 | 86 | # SageMath parsed files 87 | *.sage.py 88 | 89 | # Environments 90 | .env 91 | .venv 92 | env/ 93 | venv/ 94 | ENV/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | output.csv 110 | 111 | # Local .terraform directories 112 | **/.terraform/* 113 | 114 | # .tfstate files 115 | *.tfstate 116 | *.tfstate.* 117 | 118 | # Crash log files 119 | crash.log 120 | 121 | # Ignore any .tfvars files that are generated automatically for each Terraform run. Most 122 | # .tfvars files are managed as part of configuration and so should be included in 123 | # version control. 
124 | #
125 | # example.tfvars
126 | 
127 | # Ignore override files as they are usually used to override resources locally and so
128 | # are not checked in
129 | override.tf
130 | override.tf.json
131 | *_override.tf
132 | *_override.tf.json
133 | 
134 | 
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Copyright (c) 2017 Code For New Orleans
2 | 
3 | Permission is hereby granted, free of charge, to any person obtaining a copy
4 | of this software and associated documentation files (the "Software"), to deal
5 | in the Software without restriction, including without limitation the rights
6 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
7 | copies of the Software, and to permit persons to whom the Software is
8 | furnished to do so, subject to the following conditions:
9 | 
10 | The above copyright notice and this permission notice shall be included in
11 | all copies or substantial portions of the Software.
12 | 
13 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
14 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
15 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
16 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
17 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
18 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
19 | THE SOFTWARE.
20 | 
--------------------------------------------------------------------------------
/Procfile:
--------------------------------------------------------------------------------
1 | worker: scrapy runspider scraper/spiders/assessment_spider.py
2 | 
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # assessor-scraper
2 | 
3 | The goal of this project is to transform the data from the Orleans Parish
4 | Assessor's Office [website](http://nolaassessor.com/) into formats that
5 | are better suited for data analysis.
6 | 
7 | ## development environment setup
8 | 
9 | ### prerequisites
10 | 
11 | You must have Python 3 installed. You can download it
12 | [here](https://www.python.org/downloads/).
13 | 
14 | ### first, set up a python [virtual environment](https://docs.python.org/3/library/venv.html#creating-virtual-environments)
15 | 
16 | ```
17 | python3 -m venv .venv
18 | . .venv/bin/activate
19 | ```
20 | 
21 | ### install the dependencies with [pip](https://pip.pypa.io/en/stable/user_guide/#requirements-files)
22 | ```
23 | pip install -r requirements.txt
24 | ```
25 | 
26 | 
27 | ## Getting started
28 | 
29 | ### Set up the database
30 | By default, the scraper is set up to load data into a PostgreSQL database.
31 | Docs on setting up and making changes to the database are [here](alembic/README).
32 | You can quickly get the database running locally using [Docker](https://store.docker.com/search?type=edition&offering=community).
33 | ```
34 | docker-compose up -d db
35 | ```
36 | 
37 | If you want to explore how to extract data using scrapy, use the [scrapy
38 | shell](https://doc.scrapy.org/en/latest/intro/tutorial.html#extracting-data) to interactively
39 | work with the response.
40 | 
41 | For example,
42 | ```
43 | scrapy shell 'http://qpublic9.qpublic.net/la_orleans_display.php?KEY=1500-SUGARBOWLDR'
44 | owner = response.xpath('//td[@class="owner_value"]/text()').get()
45 | total_value = response.xpath('//td[@class="tax_value"]/text()')[3].get().strip()
46 | next_page = response.xpath('//td[@class="header_link"]/a/@href').get()
47 | ```
48 | 
49 | ### Get all the parcel ids
50 | 
51 | Getting a list of parcel ids allows us to build URLs for every property
52 | so we can scrape the data for that parcel. These parcel ids are used
53 | in the URL like `http://qpublic9.qpublic.net/la_orleans_display.php?KEY=701-POYDRASST`,
54 | where `701-POYDRASST` is the parcel id.
55 | 
56 | Running the `parcel_id_extractor.py` script will cleverly use the owner search to
57 | extract all available parcel ids, then save them in a file `parcel_ids.txt`.
58 | 
59 | The file is checked in to the repo, but if you want to run it yourself
60 | to update it with the latest parcel ids, run
61 | 
62 | ```
63 | python parcel_id_extractor.py
64 | ```
65 | 
66 | 
67 | ### Running the spider
68 | Running the spider from the command line will crawl the assessor's website and
69 | [output the data](https://doc.scrapy.org/en/latest/topics/feed-exports.html) to a destination of your choice.
70 | 
71 | By default, the spider will output data to a postgres database, which is configured
72 | in `scraper/settings.py`. You can use a hosted postgres instance or run one locally using
73 | [Docker](https://store.docker.com/search?type=edition&offering=community).
74 | 
75 | > Important Note: Scraping should always be done responsibly, so check the site's [robots.txt](http://www.robotstxt.org/robotstxt.html) to make sure it doesn't explicitly disallow crawling. Also, when running the scraper, be careful not to put unexpected load on the assessor's website - consider running during off-peak hours and keeping an eye on response latency so you aren't overwhelming the servers.
76 | 
77 | 
78 | To run the spider,
79 | ```
80 | scrapy runspider scraper/spiders/assessment_spider.py
81 | ```
82 | > Warning: this will take a long time to run; you can kill the process with ctrl+c.
83 | 
84 | To run the spider and output to a csv,
85 | ```
86 | scrapy runspider scraper/spiders/assessment_spider.py -o output.csv
87 | ```
88 | 
89 | #### Running on Heroku
90 | Set the required environment variables:
91 | ```
92 | heroku config:set DATABASE_URL=postgres://user:pass@host:5432/assessordb
93 | ```
94 | 
95 | You can run the scraper on Heroku by scaling up the worker dyno:
96 | ```
97 | heroku ps:scale worker=1
98 | ```
99 | 
100 | See [the Heroku docs](https://devcenter.heroku.com/articles/getting-started-with-python#introduction) for more info on how to deploy Python code.
101 | 
102 | #### Running in AWS with Terraform
103 | 1) Install [Terraform](https://www.terraform.io/)
104 | 2) `cd terraform`
105 | 3) `terraform init`
106 | 4) `terraform plan`
107 | 5) `terraform apply`
108 | 6) `ssh ubuntu@{public_dns}`, using the `public_dns` value output by `terraform apply`
109 | 
--------------------------------------------------------------------------------
/alembic/README:
--------------------------------------------------------------------------------
1 | Generic single-database configuration.
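The connection string Alembic uses is set in `alembic.ini` (`sqlalchemy.url`) and defaults to the
local `assessor`/`assessor` database created below. If your database runs elsewhere, update that
line before running migrations, for example (hypothetical host shown):

- `sqlalchemy.url = postgresql://assessor:assessor@some-other-host/assessor`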
2 | 3 | # Creating db 4 | - `createuser -d -P assessor` 5 | - `createdb -U assessor assessor` 6 | 7 | # Updating db 8 | From within the `assessor-scraper/alembic` directory, run: 9 | 10 | - `alembic upgrade head` 11 | 12 | # Autogenerating new migration 13 | After making changes to scraper/models.py you'll need to create a database migration to allow others 14 | to update their db with new fields. 15 | 16 | Make sure to include a description of changes: 17 | 18 | - `alembic revision --autogenerate -m description_of_changes` 19 | -------------------------------------------------------------------------------- /alembic/alembic.ini: -------------------------------------------------------------------------------- 1 | # A generic, single database configuration. 2 | 3 | [alembic] 4 | # path to migration scripts 5 | script_location = . 6 | 7 | # template used to generate migration files 8 | file_template = %%(year)d%%(month).2d%%(day).2d%%(hour).2d%%(minute).2d%%(second).2d_%%(slug)s 9 | 10 | # timezone to use when rendering the date 11 | # within the migration file as well as the filename. 12 | # string value is passed to dateutil.tz.gettz() 13 | # leave blank for localtime 14 | # timezone = 15 | 16 | # max length of characters to apply to the 17 | # "slug" field 18 | #truncate_slug_length = 40 19 | 20 | # set to 'true' to run the environment during 21 | # the 'revision' command, regardless of autogenerate 22 | # revision_environment = false 23 | 24 | # set to 'true' to allow .pyc and .pyo files without 25 | # a source .py file to be detected as revisions in the 26 | # versions/ directory 27 | # sourceless = false 28 | 29 | # version location specification; this defaults 30 | # to alembic/versions. When using multiple version 31 | # directories, initial revisions must be specified with --version-path 32 | # version_locations = %(here)s/bar %(here)s/bat alembic/versions 33 | 34 | # the output encoding used when revision files 35 | # are written from script.py.mako 36 | # output_encoding = utf-8 37 | 38 | sqlalchemy.url = postgresql://assessor:assessor@localhost/assessor 39 | 40 | 41 | # Logging configuration 42 | [loggers] 43 | keys = root,sqlalchemy,alembic 44 | 45 | [handlers] 46 | keys = console 47 | 48 | [formatters] 49 | keys = generic 50 | 51 | [logger_root] 52 | level = WARN 53 | handlers = console 54 | qualname = 55 | 56 | [logger_sqlalchemy] 57 | level = WARN 58 | handlers = 59 | qualname = sqlalchemy.engine 60 | 61 | [logger_alembic] 62 | level = INFO 63 | handlers = 64 | qualname = alembic 65 | 66 | [handler_console] 67 | class = StreamHandler 68 | args = (sys.stderr,) 69 | level = NOTSET 70 | formatter = generic 71 | 72 | [formatter_generic] 73 | format = %(levelname)-5.5s [%(name)s] %(message)s 74 | datefmt = %H:%M:%S 75 | -------------------------------------------------------------------------------- /alembic/env.py: -------------------------------------------------------------------------------- 1 | from __future__ import with_statement 2 | 3 | from logging.config import fileConfig 4 | 5 | from sqlalchemy import engine_from_config 6 | from sqlalchemy import pool 7 | 8 | from alembic import context 9 | 10 | # this is the Alembic Config object, which provides 11 | # access to the values within the .ini file in use. 12 | config = context.config 13 | 14 | # Interpret the config file for Python logging. 15 | # This line sets up loggers basically. 
16 | fileConfig(config.config_file_name) 17 | 18 | # add your model's MetaData object here 19 | # for 'autogenerate' support 20 | # from myapp import mymodel 21 | # target_metadata = mymodel.Base.metadata 22 | import os 23 | import sys 24 | parent_dir = os.path.abspath(os.path.join(os.getcwd(), "..")) 25 | sys.path.append(parent_dir) 26 | from scraper import models 27 | target_metadata = models.Base.metadata 28 | 29 | # other values from the config, defined by the needs of env.py, 30 | # can be acquired: 31 | # my_important_option = config.get_main_option("my_important_option") 32 | # ... etc. 33 | 34 | 35 | def run_migrations_offline(): 36 | """Run migrations in 'offline' mode. 37 | 38 | This configures the context with just a URL 39 | and not an Engine, though an Engine is acceptable 40 | here as well. By skipping the Engine creation 41 | we don't even need a DBAPI to be available. 42 | 43 | Calls to context.execute() here emit the given string to the 44 | script output. 45 | 46 | """ 47 | url = config.get_main_option("sqlalchemy.url") 48 | context.configure( 49 | url=url, target_metadata=target_metadata, literal_binds=True 50 | ) 51 | 52 | with context.begin_transaction(): 53 | context.run_migrations() 54 | 55 | 56 | def run_migrations_online(): 57 | """Run migrations in 'online' mode. 58 | 59 | In this scenario we need to create an Engine 60 | and associate a connection with the context. 61 | 62 | """ 63 | connectable = engine_from_config( 64 | config.get_section(config.config_ini_section), 65 | prefix="sqlalchemy.", 66 | poolclass=pool.NullPool, 67 | ) 68 | 69 | with connectable.connect() as connection: 70 | context.configure( 71 | connection=connection, target_metadata=target_metadata 72 | ) 73 | 74 | with context.begin_transaction(): 75 | context.run_migrations() 76 | 77 | 78 | if context.is_offline_mode(): 79 | run_migrations_offline() 80 | else: 81 | run_migrations_online() 82 | -------------------------------------------------------------------------------- /alembic/script.py.mako: -------------------------------------------------------------------------------- 1 | """${message} 2 | 3 | Revision ID: ${up_revision} 4 | Revises: ${down_revision | comma,n} 5 | Create Date: ${create_date} 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | ${imports if imports else ""} 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = ${repr(up_revision)} 14 | down_revision = ${repr(down_revision)} 15 | branch_labels = ${repr(branch_labels)} 16 | depends_on = ${repr(depends_on)} 17 | 18 | 19 | def upgrade(): 20 | ${upgrades if upgrades else "pass"} 21 | 22 | 23 | def downgrade(): 24 | ${downgrades if downgrades else "pass"} 25 | -------------------------------------------------------------------------------- /alembic/versions/25fd903281e5_initial_property_tables.py: -------------------------------------------------------------------------------- 1 | """initial_property_tables 2 | 3 | Revision ID: 25fd903281e5 4 | Revises: 5 | Create Date: 2019-01-16 19:49:42.948323 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = '25fd903281e5' 14 | down_revision = None 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! 
### 21 | op.create_table('properties', 22 | sa.Column('id', sa.Integer(), nullable=False), 23 | sa.Column('property_key', sa.String(), nullable=False), 24 | sa.Column('todays_date', sa.String(), nullable=True), 25 | sa.Column('location', sa.String(), nullable=True), 26 | sa.Column('owner_name', sa.String(), nullable=True), 27 | sa.Column('mailing_address', sa.String(), nullable=True), 28 | sa.Column('municipal_district', sa.String(), nullable=True), 29 | sa.Column('location_address', sa.String(), nullable=True), 30 | sa.Column('tax_bill_number', sa.String(), nullable=True), 31 | sa.Column('property_class', sa.String(), nullable=True), 32 | sa.Column('special_tax_district', sa.String(), nullable=True), 33 | sa.Column('subdivision_name', sa.String(), nullable=True), 34 | sa.Column('land_area_sq_ft', sa.String(), nullable=True), 35 | sa.Column('zoning_district', sa.String(), nullable=True), 36 | sa.Column('building_area_sq_ft', sa.String(), nullable=True), 37 | sa.Column('square', sa.String(), nullable=True), 38 | sa.Column('lot', sa.String(), nullable=True), 39 | sa.Column('book', sa.String(), nullable=True), 40 | sa.Column('folio', sa.String(), nullable=True), 41 | sa.Column('line', sa.String(), nullable=True), 42 | sa.Column('parcel_map', sa.String(), nullable=True), 43 | sa.Column('legal_description', sa.String(), nullable=True), 44 | sa.Column('assessment_area', sa.String(), nullable=True), 45 | sa.PrimaryKeyConstraint('id') 46 | ) 47 | op.create_table('property_transfers', 48 | sa.Column('id', sa.Integer(), nullable=False), 49 | sa.Column('property_id', sa.Integer(), nullable=True), 50 | sa.Column('sale_transfer_date', sa.String(), nullable=True), 51 | sa.Column('price', sa.String(), nullable=True), 52 | sa.Column('grantor', sa.String(), nullable=True), 53 | sa.Column('grantee', sa.String(), nullable=True), 54 | sa.Column('notarial_archive_number', sa.String(), nullable=True), 55 | sa.Column('instrument_number', sa.String(), nullable=True), 56 | sa.ForeignKeyConstraint(['property_id'], ['properties.id'], ), 57 | sa.PrimaryKeyConstraint('id') 58 | ) 59 | op.create_table('property_values', 60 | sa.Column('id', sa.Integer(), nullable=False), 61 | sa.Column('property_id', sa.Integer(), nullable=True), 62 | sa.Column('year', sa.String(), nullable=True), 63 | sa.Column('land_value', sa.String(), nullable=True), 64 | sa.Column('building_value', sa.String(), nullable=True), 65 | sa.Column('total_value', sa.String(), nullable=True), 66 | sa.Column('assessed_land_value', sa.String(), nullable=True), 67 | sa.Column('assessed_building_value', sa.String(), nullable=True), 68 | sa.Column('total_assessed_value', sa.String(), nullable=True), 69 | sa.Column('homestead_exemption_value', sa.String(), nullable=True), 70 | sa.Column('taxable_assessment', sa.String(), nullable=True), 71 | sa.Column('age_freeze', sa.String(), nullable=True), 72 | sa.Column('disability_freeze', sa.String(), nullable=True), 73 | sa.Column('assmnt_change', sa.String(), nullable=True), 74 | sa.Column('tax_contract', sa.String(), nullable=True), 75 | sa.ForeignKeyConstraint(['property_id'], ['properties.id'], ), 76 | sa.PrimaryKeyConstraint('id') 77 | ) 78 | # ### end Alembic commands ### 79 | 80 | 81 | def downgrade(): 82 | # ### commands auto generated by Alembic - please adjust! 
### 83 | op.drop_table('property_values') 84 | op.drop_table('property_transfers') 85 | op.drop_table('properties') 86 | # ### end Alembic commands ### 87 | -------------------------------------------------------------------------------- /alembic/versions/a91f96702e22_update_property_fields.py: -------------------------------------------------------------------------------- 1 | """update_property_fields 2 | 3 | Revision ID: a91f96702e22 4 | Revises: 25fd903281e5 5 | Create Date: 2019-01-16 19:51:33.633171 6 | 7 | """ 8 | from alembic import op 9 | import sqlalchemy as sa 10 | 11 | 12 | # revision identifiers, used by Alembic. 13 | revision = 'a91f96702e22' 14 | down_revision = '25fd903281e5' 15 | branch_labels = None 16 | depends_on = None 17 | 18 | 19 | def upgrade(): 20 | # ### commands auto generated by Alembic - please adjust! ### 21 | op.add_column('properties', sa.Column('lot_folio', sa.String(), nullable=True)) 22 | op.add_column('properties', sa.Column('revised_bldg_area_sqft', sa.String(), nullable=True)) 23 | op.drop_column('properties', 'lot') 24 | # ### end Alembic commands ### 25 | 26 | 27 | def downgrade(): 28 | # ### commands auto generated by Alembic - please adjust! ### 29 | op.add_column('properties', sa.Column('lot', sa.VARCHAR(), autoincrement=False, nullable=True)) 30 | op.drop_column('properties', 'revised_bldg_area_sqft') 31 | op.drop_column('properties', 'lot_folio') 32 | # ### end Alembic commands ### 33 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | version: '2' 2 | services: 3 | db: 4 | image: postgres:alpine 5 | restart: always 6 | environment: 7 | POSTGRES_USER: assessor 8 | POSTGRES_PASSWORD: assessor 9 | ports: 10 | - 5432:5432 11 | elasticsearch1: 12 | image: docker.elastic.co/elasticsearch/elasticsearch:5.5.0 13 | container_name: elasticsearch1 14 | environment: 15 | - cluster.name=docker-cluster 16 | - bootstrap.memory_lock=true 17 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 18 | ulimits: 19 | memlock: 20 | soft: -1 21 | hard: -1 22 | mem_limit: 1g 23 | volumes: 24 | - ./data/esdata1:/usr/share/elasticsearch/data 25 | ports: 26 | - 9200:9200 27 | networks: 28 | - esnet 29 | elasticsearch2: 30 | container_name: elasticsearch2 31 | image: docker.elastic.co/elasticsearch/elasticsearch:5.5.0 32 | environment: 33 | - cluster.name=docker-cluster 34 | - bootstrap.memory_lock=true 35 | - "ES_JAVA_OPTS=-Xms512m -Xmx512m" 36 | - "discovery.zen.ping.unicast.hosts=elasticsearch1" 37 | ulimits: 38 | memlock: 39 | soft: -1 40 | hard: -1 41 | mem_limit: 1g 42 | volumes: 43 | - ./data/esdata2:/usr/share/elasticsearch/data 44 | networks: 45 | - esnet 46 | kibana: 47 | image: docker.elastic.co/kibana/kibana:5.5.0 48 | container_name: kibana 49 | environment: 50 | ELASTICSEARCH_URL: http://elasticsearch1:9200 51 | ports: 52 | - 5601:5601 53 | networks: 54 | - esnet 55 | volumes: 56 | esdata1: 57 | driver: local 58 | esdata2: 59 | driver: local 60 | 61 | networks: 62 | esnet: 63 | -------------------------------------------------------------------------------- /install.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | set -ex 4 | 5 | sudo apt-get -y update 6 | sudo apt-get -y install build-essential python3-dev 7 | sudo apt-get -y install python-pip 8 | 9 | sudo apt-get -y install postgresql postgresql-contrib libpq-dev 10 | sudo service postgresql start 11 | 12 | sudo -u postgres 
createuser -s -d assessor 13 | sudo -u postgres psql -c "ALTER USER assessor WITH PASSWORD 'assessor';" 14 | sudo -u postgres createdb assessor 15 | sudo -u postgres psql -c "grant all privileges on database assessor to assessor" 16 | 17 | sudo pip install virtualenv 18 | 19 | git clone https://github.com/codefornola/assessor-scraper.git 20 | 21 | virtualenv -p python3 venv_scraper 22 | 23 | . venv_scraper/bin/activate 24 | 25 | pip install requests 26 | pip install psycopg2==2.7.3.2 27 | pip install pyproj 28 | pip install SQLAlchemy==1.1.15 29 | pip install Scrapy==1.4.0 30 | -------------------------------------------------------------------------------- /parcel_id_extractor.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import os 4 | import threading 5 | from queue import Queue 6 | from string import ascii_uppercase, digits 7 | 8 | import requests 9 | from bs4 import BeautifulSoup 10 | 11 | SEARCH_URL = "http://qpublic9.qpublic.net/la_orleans_alsearch.php?" \ 12 | "searchType=owner_name&INPUT={}&BEGIN={}" 13 | 14 | Q = Queue() 15 | 16 | 17 | class ParcelIdExtractor(object): 18 | """ 19 | Fuzzes the owner search to extract all available parcel ids 20 | """ 21 | 22 | def __init__(self): 23 | self.parcel_ids = frozenset() 24 | self.lock = threading.Lock() 25 | 26 | def update_ids(self, ids): 27 | """Use a lock to prevent multiple threads from updating parcel_ids""" 28 | self.lock.acquire() 29 | self.parcel_ids |= frozenset(ids) 30 | self.lock.release() 31 | 32 | def search_all_terms(self): 33 | """ 34 | Puts all the search terms on a queue to be processed by worker threads. 35 | Note: all owner names are capitalized on the assessor's site, so we 36 | only use capitalized letters 37 | """ 38 | # 0-9 + A-Z 39 | terms = [d for d in digits] + [l for l in ascii_uppercase] 40 | [Q.put(t) for t in terms] 41 | 42 | def search(self, search_term, begin=0): 43 | """ 44 | Searches by owner name, extracts the parcel ids, then recursively pages 45 | through the results until no more ids are found for the search_term 46 | """ 47 | thread = threading.current_thread().getName() 48 | url = SEARCH_URL.format(search_term, begin) 49 | print('{} searching {}'.format(thread, url)) 50 | r = requests.get(url) 51 | if 'No Records Found.' 
in r.text: 52 | return 53 | else: 54 | soup = BeautifulSoup(r.text, 'html.parser') 55 | pids = [td.a.text for td in soup.select('td.search_value') 56 | if td.a is not None and td.a.text != 'Map It'] 57 | if len(pids) > 0: 58 | self.update_ids(pids) 59 | self.search(search_term, begin + len(pids)) 60 | 61 | def process_queue(self): 62 | while not Q.empty(): 63 | term = Q.get() 64 | self.search(term) 65 | Q.task_done() 66 | 67 | def main(self, file_name='parcel_ids.txt', num_worker_threads=10): 68 | try: 69 | # populate queue with all the search terms 70 | self.search_all_terms() 71 | # start worker threads to process queue 72 | threads = [] 73 | for i in range(num_worker_threads): 74 | t = threading.Thread(target=self.process_queue) 75 | threads.append(t) 76 | t.start() 77 | # wait for all threads to complete 78 | [t.join() for t in threads] 79 | with open(file_name, 'w') as f: 80 | print('writing {} parcel ids'.format(len(self.parcel_ids))) 81 | for id in self.parcel_ids: 82 | f.write(id + os.linesep) 83 | except Exception as error: 84 | print(error) 85 | 86 | 87 | if __name__ == '__main__': 88 | ParcelIdExtractor().main() 89 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.0.8 2 | asn1crypto==0.24.0 3 | attrs>=19.1.0 4 | Automat==0.7.0 5 | beautifulsoup4==4.7.1 6 | certifi==2019.3.9 7 | cffi==1.12.2 8 | chardet==3.0.4 9 | constantly==15.1.0 10 | cryptography>=2.6.1 11 | cssselect==1.0.3 12 | elasticsearch==7.0.0 13 | hyperlink==19.0.0 14 | idna==2.8 15 | incremental==17.5.0 16 | lxml==4.3.3 17 | parsel==1.5.1 18 | psycopg2==2.8.1 19 | pyasn1==0.4.5 20 | pyasn1-modules==0.2.4 21 | pycparser==2.19 22 | PyDispatcher==2.0.5 23 | pyopenssl>=19.0.0 24 | pyproj==2.1.3 25 | queuelib==1.5.0 26 | requests>=2.21.0 27 | Scrapy==1.6.0 28 | ScrapyElasticSearch==0.9.1 29 | service-identity==18.1.0 30 | six==1.12.0 31 | SQLAlchemy==1.3.2 32 | Twisted==19.2.0 33 | urllib3>=1.24.1 34 | w3lib==1.20.0 35 | zope.interface==4.6.0 36 | -------------------------------------------------------------------------------- /scraper/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/codefornola/assessor-scraper/8245c6640daddd6679f793f63f4e46eb4a4bb5ab/scraper/__init__.py -------------------------------------------------------------------------------- /scraper/items.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # See documentation in: 4 | # http://doc.scrapy.org/en/latest/topics/items.html 5 | 6 | import scrapy 7 | 8 | 9 | class Property(scrapy.Item): 10 | property_key = scrapy.Field() 11 | owner_name = scrapy.Field() 12 | todays_date = scrapy.Field() 13 | mailing_address = scrapy.Field() 14 | municipal_district = scrapy.Field() 15 | location_address = scrapy.Field() 16 | location = scrapy.Field() 17 | tax_bill_number = scrapy.Field() 18 | property_class = scrapy.Field() 19 | special_tax_district = scrapy.Field() 20 | subdivision_name = scrapy.Field() 21 | land_area_sq_ft = scrapy.Field() 22 | zoning_district = scrapy.Field() 23 | building_area_sq_ft = scrapy.Field() 24 | square = scrapy.Field() 25 | lot_folio = scrapy.Field() 26 | book = scrapy.Field() 27 | folio = scrapy.Field() 28 | line = scrapy.Field() 29 | parcel_map = scrapy.Field() 30 | legal_description = scrapy.Field() 31 | assessment_area = scrapy.Field() 32 | 
sales = scrapy.Field() 33 | values = scrapy.Field() 34 | revised_bldg_area_sqft = scrapy.Field() 35 | -------------------------------------------------------------------------------- /scraper/middlewares.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Define here the models for your spider middleware 4 | # 5 | # See documentation in: 6 | # http://doc.scrapy.org/en/latest/topics/spider-middleware.html 7 | 8 | from scrapy import signals 9 | 10 | 11 | class ScraperSpiderMiddleware(object): 12 | # Not all methods need to be defined. If a method is not defined, 13 | # scrapy acts as if the spider middleware does not modify the 14 | # passed objects. 15 | 16 | @classmethod 17 | def from_crawler(cls, crawler): 18 | # This method is used by Scrapy to create your spiders. 19 | s = cls() 20 | crawler.signals.connect(s.spider_opened, signal=signals.spider_opened) 21 | return s 22 | 23 | def process_spider_input(self, response, spider): 24 | # Called for each response that goes through the spider 25 | # middleware and into the spider. 26 | 27 | # Should return None or raise an exception. 28 | return None 29 | 30 | def process_spider_output(self, response, result, spider): 31 | # Called with the results returned from the Spider, after 32 | # it has processed the response. 33 | 34 | # Must return an iterable of Request, dict or Item objects. 35 | for i in result: 36 | yield i 37 | 38 | def process_spider_exception(self, response, exception, spider): 39 | # Called when a spider or process_spider_input() method 40 | # (from other spider middleware) raises an exception. 41 | 42 | # Should return either None or an iterable of Response, dict 43 | # or Item objects. 44 | pass 45 | 46 | def process_start_requests(self, start_requests, spider): 47 | # Called with the start requests of the spider, and works 48 | # similarly to the process_spider_output() method, except 49 | # that it doesn’t have a response associated. 50 | 51 | # Must return only requests (not items). 
52 |         for r in start_requests:
53 |             yield r
54 | 
55 |     def spider_opened(self, spider):
56 |         spider.logger.info('Spider opened: %s' % spider.name)
57 | 
--------------------------------------------------------------------------------
/scraper/models.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import os
4 | import logging
5 | from sqlalchemy import create_engine, Column, Integer, String, ForeignKey
6 | from sqlalchemy.engine.url import URL
7 | from sqlalchemy.ext.declarative import declarative_base
8 | from sqlalchemy.orm import relationship
9 | 
10 | from scraper import settings
11 | 
12 | Base = declarative_base()
13 | 
14 | 
15 | def db_connect():
16 |     """
17 |     Returns a sqlalchemy engine instance for the DATABASE_URL env var, or settings.DATABASE
18 |     """
19 |     if 'DATABASE_URL' in os.environ:
20 |         DATABASE_URL = os.environ['DATABASE_URL']
21 |         logging.debug("Connecting to %s", DATABASE_URL)
22 |     else:
23 |         DATABASE_URL = URL(**settings.DATABASE)
24 |         logging.debug("Connecting with settings %s", DATABASE_URL)
25 |     return create_engine(DATABASE_URL)
26 | 
27 | 
28 | def create_tables(engine):
29 |     Base.metadata.create_all(engine)
30 | 
31 | 
32 | class Property(Base):
33 |     __tablename__ = 'properties'
34 | 
35 |     id = Column(Integer, primary_key=True)
36 |     property_key = Column(String, nullable=False)
37 |     todays_date = Column(String)
38 |     location = Column(String)
39 |     owner_name = Column(String)
40 |     mailing_address = Column(String)
41 |     municipal_district = Column(String)
42 |     location_address = Column(String)
43 |     tax_bill_number = Column(String)
44 |     property_class = Column(String)
45 |     special_tax_district = Column(String)
46 |     subdivision_name = Column(String)
47 |     land_area_sq_ft = Column(String)
48 |     revised_bldg_area_sqft = Column(String)
49 |     zoning_district = Column(String)
50 |     building_area_sq_ft = Column(String)
51 |     square = Column(String)
52 |     lot_folio = Column(String)
53 |     book = Column(String)
54 |     folio = Column(String)
55 |     line = Column(String)
56 |     parcel_map = Column(String)
57 |     legal_description = Column(String)
58 |     assessment_area = Column(String)
59 |     values = relationship('PropertyValue')
60 |     transfers = relationship('PropertyTransfer')
61 | 
62 | 
63 | class PropertyValue(Base):
64 |     __tablename__ = 'property_values'
65 | 
66 |     id = Column(Integer, primary_key=True)
67 |     property_id = Column(Integer, ForeignKey('properties.id'))
68 |     year = Column(String)
69 |     land_value = Column(String)
70 |     building_value = Column(String)
71 |     total_value = Column(String)
72 |     assessed_land_value = Column(String)
73 |     assessed_building_value = Column(String)
74 |     total_assessed_value = Column(String)
75 |     homestead_exemption_value = Column(String)
76 |     taxable_assessment = Column(String)
77 |     age_freeze = Column(String)
78 |     disability_freeze = Column(String)
79 |     assmnt_change = Column(String)
80 |     tax_contract = Column(String)
81 | 
82 | 
83 | class PropertyTransfer(Base):
84 |     __tablename__ = 'property_transfers'
85 | 
86 |     id = Column(Integer, primary_key=True)
87 |     property_id = Column(Integer, ForeignKey('properties.id'))
88 |     sale_transfer_date = Column(String)
89 |     price = Column(String)
90 |     grantor = Column(String)
91 |     grantee = Column(String)
92 |     notarial_archive_number = Column(String)
93 |     instrument_number = Column(String)
94 | 
--------------------------------------------------------------------------------
/scraper/pipelines.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | # Define your
item pipelines here 4 | # 5 | # Don't forget to add your pipeline to the ITEM_PIPELINES setting 6 | # See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html 7 | 8 | from sqlalchemy.orm import sessionmaker 9 | 10 | from scraper.models import Property, PropertyTransfer, PropertyValue, db_connect, create_tables 11 | 12 | 13 | class PostgresPipeline(object): 14 | """Pipeline for storing scraped items in postgres""" 15 | 16 | def __init__(self): 17 | engine = db_connect() 18 | create_tables(engine) 19 | self.Session = sessionmaker(bind=engine) 20 | 21 | def process_item(self, item, spider): 22 | """ 23 | This method is called for every item emitted by the spider. 24 | """ 25 | session = self.Session() 26 | sales = item['sales'] 27 | values = item['values'] 28 | del item['sales'] 29 | del item['values'] 30 | property = Property(**item) 31 | 32 | try: 33 | session.add(property) 34 | # flush to obtain the id of property to be used as the foreign key 35 | session.flush() 36 | 37 | for sale in sales: 38 | sale['property_id'] = property.id 39 | session.add(PropertyTransfer(**sale)) 40 | for value in values: 41 | value['property_id'] = property.id 42 | session.add(PropertyValue(**value)) 43 | session.commit() 44 | except: 45 | session.rollback() 46 | raise 47 | finally: 48 | session.close() 49 | 50 | return item 51 | -------------------------------------------------------------------------------- /scraper/settings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | # Scrapy settings for scraper project 4 | # 5 | # For simplicity, this file contains only settings considered important or 6 | # commonly used. You can find more settings consulting the documentation: 7 | # 8 | # http://doc.scrapy.org/en/latest/topics/settings.html 9 | # http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 10 | # http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 11 | 12 | BOT_NAME = 'scraper' 13 | 14 | SPIDER_MODULES = ['scraper.spiders'] 15 | NEWSPIDER_MODULE = 'scraper.spiders' 16 | 17 | # Obey robots.txt rules 18 | ROBOTSTXT_OBEY = False 19 | 20 | # Configure maximum concurrent requests performed by Scrapy (default: 16) 21 | # See also autothrottle settings 22 | # CONCURRENT_REQUESTS = 32 23 | 24 | # Configure a delay for requests for the same website (default: 0) 25 | # See http://scrapy.readthedocs.org/en/latest/topics/settings.html#download-delay 26 | # DOWNLOAD_DELAY = 3 27 | # RANDOMIZE_DOWNLOAD_DELAY = True 28 | # The download delay setting will honor only one of: 29 | # CONCURRENT_REQUESTS_PER_DOMAIN = 16 30 | # CONCURRENT_REQUESTS_PER_IP = 16 31 | 32 | # Enable and configure the AutoThrottle extension (disabled by default) 33 | # See http://doc.scrapy.org/en/latest/topics/autothrottle.html 34 | AUTOTHROTTLE_ENABLED = True 35 | # The initial download delay 36 | # AUTOTHROTTLE_START_DELAY = 5 37 | # The maximum download delay to be set in case of high latencies 38 | # AUTOTHROTTLE_MAX_DELAY = 60 39 | # The average number of requests Scrapy should be sending in parallel to 40 | # each remote server 41 | AUTOTHROTTLE_TARGET_CONCURRENCY = 8 42 | # Enable showing throttling stats for every response received: 43 | # AUTOTHROTTLE_DEBUG = True 44 | 45 | # Disable cookies (enabled by default) 46 | # COOKIES_ENABLED = False 47 | 48 | # Disable Telnet Console (enabled by default) 49 | # TELNETCONSOLE_ENABLED = False 50 | 51 | # Override the default request headers: 52 | DEFAULT_REQUEST_HEADERS = { 53 | 
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 54 | 'Accept-Language': 'en', 55 | 'Accept-Encoding': 'gzip, deflate, sdch', 56 | } 57 | 58 | # Enable or disable spider middlewares 59 | # See http://scrapy.readthedocs.org/en/latest/topics/spider-middleware.html 60 | # SPIDER_MIDDLEWARES = { 61 | # 'scraper.middlewares.ScraperSpiderMiddleware': 543, 62 | # } 63 | 64 | # Enable or disable downloader middlewares 65 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html 66 | # DOWNLOADER_MIDDLEWARES = { 67 | # 'scraper.middlewares.MyCustomDownloaderMiddleware': 543, 68 | # } 69 | 70 | # Enable or disable extensions 71 | # See http://scrapy.readthedocs.org/en/latest/topics/extensions.html 72 | # EXTENSIONS = { 73 | # 'scrapy.extensions.telnet.TelnetConsole': None, 74 | # } 75 | 76 | # Configure item pipelines 77 | # See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html 78 | ITEM_PIPELINES = { 79 | 'scraper.pipelines.PostgresPipeline': 300, 80 | } 81 | # ITEM_PIPELINES = { 82 | # 'scrapyelasticsearch.scrapyelasticsearch.ElasticSearchPipeline': 500 83 | # } 84 | 85 | ELASTICSEARCH_SERVERS = ['http://elastic:changeme@localhost:9200'] 86 | ELASTICSEARCH_INDEX = 'assessor' 87 | ELASTICSEARCH_INDEX_DATE_FORMAT = '%Y-%m' 88 | ELASTICSEARCH_TYPE = 'property' 89 | ELASTICSEARCH_UNIQ_KEY = 'property_key' 90 | 91 | DATABASE = { 92 | 'drivername': 'postgres', 93 | 'host': 'localhost', 94 | 'port': '5432', 95 | 'username': 'assessor', 96 | 'password': 'assessor', 97 | 'database': 'assessor' 98 | } 99 | 100 | # Enable and configure HTTP caching (disabled by default) 101 | # See http://scrapy.readthedocs.org/en/latest/topics/downloader-middleware.html#httpcache-middleware-settings 102 | # HTTPCACHE_ENABLED = True 103 | # HTTPCACHE_EXPIRATION_SECS = 0 104 | # HTTPCACHE_DIR = 'httpcache' 105 | # HTTPCACHE_IGNORE_HTTP_CODES = [] 106 | # HTTPCACHE_STORAGE = 'scrapy.extensions.httpcache.FilesystemCacheStorage' 107 | -------------------------------------------------------------------------------- /scraper/spiders/__init__.py: -------------------------------------------------------------------------------- 1 | # This package will contain the spiders of your Scrapy project 2 | # 3 | # Please refer to the documentation for information on how to create and manage 4 | # your spiders. 
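# Spiders in this package are discovered via the SPIDER_MODULES setting in
# scraper/settings.py; run them with `scrapy runspider` as described in the README.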
5 | 
--------------------------------------------------------------------------------
/scraper/spiders/assessment_spider.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | 
3 | import logging
4 | import pprint
5 | import re
6 | import os
7 | from urllib.parse import urlparse, parse_qs
8 | 
9 | import requests
10 | import scrapy
11 | from pyproj import Proj, transform
12 | 
13 | from scraper.items import Property
14 | from scrapy.exceptions import DropItem
15 | 
16 | logging.getLogger('scrapy').setLevel(logging.WARNING)
17 | logging.getLogger('scrapy.extensions.throttle').setLevel(logging.INFO)
18 | logging.getLogger('urllib3').setLevel(logging.WARNING)
19 | pp = pprint.PrettyPrinter()
20 | 
21 | URL = "http://qpublic9.qpublic.net/la_orleans_display.php?KEY={}"
22 | 
23 | 
24 | class AssessmentSpider(scrapy.Spider):
25 |     """
26 |     All spiders must subclass scrapy.Spider
27 |     https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider
28 |     """
29 |     name = "assessment_spider"
30 |     with open('parcel_ids.txt') as f:  # one start url per parcel id
31 |         start_urls = [URL.format(pid.strip()) for pid in f.readlines()]
32 | 
33 |     def parse(self, response):
34 |         """
35 |         Default callback function with response for the crawled url
36 |         https://doc.scrapy.org/en/latest/topics/spiders.html#scrapy.spiders.Spider.parse
37 |         """
38 |         response = response.replace(body=re.sub(r"<br\s*/?>", "\n", response.body.decode('utf-8')))  # turn <br> tags into newlines
39 |         property_key = response.url.split('=')[1].replace('&', '')
40 |         # logging.debug("Parsing property_key: %s", property_key)
41 |         if 'No Data at this time' in response.text:
42 |             msg = "No data for " + response.url
43 |             logging.warning(msg)
44 |             raise DropItem(msg)
45 |         else:
46 |             property_info = self.parse_property_info(response)
47 |             property_values = self.parse_property_values(response)
48 |             property_sales = self.parse_property_sales(response)
49 |             property_info['sales'] = property_sales
50 |             property_info['values'] = property_values
51 |             property_info['property_key'] = property_key
52 |             yield Property(property_info)
53 | 
54 |     @staticmethod
55 |     def get_address_location(parcel_map_link):
56 |         """
57 |         Parses the parcel map link and calculates coordinates from the extent.
58 | An example link looks like this: 59 | http://qpublic9.qpublic.net/qpmap4/map.php?county=la_orleans&parcel=41050873&extent=3667340+524208+3667804+524540&layers=parcels+aerials+roads+lakes 60 | """ 61 | o = urlparse(parcel_map_link) 62 | query = parse_qs(o.query) 63 | bbox = query['extent'][0].split(' ') 64 | x1, y1, x2, y2 = [float(pt) for pt in bbox] 65 | # get the midpoint of the extent 66 | midpoint = [(x1 + x2) / 2, (y1 + y2) / 2] 67 | # transform projected coordinates to latitude and longitude 68 | in_proj = Proj(init='epsg:3452', preserve_units=True) 69 | out_proj = Proj(init='epsg:4326') 70 | return transform(in_proj, out_proj, midpoint[0], midpoint[1]) 71 | 72 | def parse_property_info(self, response): 73 | hdrs = [h.extract().strip() for h in response.xpath('//td[@class="owner_header"]/font/text()')] 74 | value_cells = response.xpath('//td[@class="owner_value"]') 75 | value_texts = [self._extract_text_from_value_cell(value_cell) for value_cell in value_cells] 76 | value_fonts = [self._extract_font_from_value_cell(value_cell) for value_cell in value_cells] 77 | value_hrefs = [self._extract_href_from_value_cell(value_cell) for value_cell in value_cells] 78 | vals = [' '.join([v1, v2, v3]).strip() for v1, v2, v3 in zip(value_texts, value_fonts, value_hrefs)] 79 | keys = [self._clean_key(h) for h in hdrs] 80 | info = dict(zip(keys, vals)) 81 | # get href to parcel map if it exists 82 | links = response.xpath('//td[@class="owner_value"]/a[contains(@href,"extent")]/@href') 83 | if len(links) > 0: 84 | parcel_map_link = links[0].extract() 85 | [lng, lat] = self.get_address_location(parcel_map_link) 86 | info['location'] = [lng, lat] 87 | return info 88 | 89 | @staticmethod 90 | def _extract_text_from_value_cell(value_cell): 91 | return '\n'.join([v.extract().strip() for v in value_cell.xpath('text()')]) 92 | 93 | @staticmethod 94 | def _extract_font_from_value_cell(value_cell): 95 | return '\n'.join([v.extract().strip() for v in value_cell.xpath('font/text()')]) 96 | 97 | @staticmethod 98 | def _extract_href_from_value_cell(value_cell): 99 | return '\n'.join([v.extract().strip() for v in value_cell.xpath('a/@href')]) 100 | 101 | def parse_property_sales(self, response): 102 | hdrs = response.css('td[class="sales_header"] > font::text').extract() 103 | keys = [self._clean_key(h) for h in hdrs] 104 | value_info = response.css('td[class="sales_value"]').xpath('./text()').extract() 105 | values = [v.replace('\xa0', '').strip().replace(' ', '') for v in value_info] 106 | sales = [] 107 | for i in range(0, len(values), 6): 108 | sale = values[i:i + 6] 109 | sales.append(dict(zip(keys, sale))) 110 | return sales 111 | 112 | @staticmethod 113 | def _clean_key(key): 114 | cleaned_key = re.sub(r"[\(|\)\']", "", key.lower()) 115 | cleaned_key = re.sub(r"[/ \n]", "_", cleaned_key) 116 | cleaned_key = re.sub(r"_+", "_", cleaned_key) 117 | return cleaned_key.strip() 118 | 119 | def parse_property_values(self, response): 120 | hdrs = response.css('td[class="tax_header"] > font::text').extract() 121 | keys = [self._clean_key(h) for h in hdrs] 122 | value_info = response.css('.tax_value').xpath('./text()').extract() 123 | values = [v.replace('\xa0', '').replace(' ', '') for v in value_info] 124 | special_treatment_info = response.css('.tax_value').xpath('./font').extract() 125 | special_treatment_info = [re.sub('<[^>]*>', '', s) for s in special_treatment_info] 126 | year1_vals = values[0:9] 127 | year1_vals.extend(special_treatment_info[0:4]) 128 | year2_vals = values[16:25] 129 | 
year2_vals.extend(special_treatment_info[4:8]) 130 | year3_vals = values[32:41] 131 | year3_vals.extend(special_treatment_info[8:12]) 132 | return [dict(zip(keys, year1_vals)), 133 | dict(zip(keys, year2_vals)), 134 | dict(zip(keys, year3_vals))] -------------------------------------------------------------------------------- /scraper/spiders/test_assessment_spider.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | 3 | import unittest 4 | from unittest.mock import patch, mock_open 5 | 6 | class AssessmentSpiderTestCase(unittest.TestCase): 7 | def setUp(self): 8 | with patch("builtins.open", mock_open(read_data="data1\ndata2")) as mock_file: 9 | from scraper.spiders.assessment_spider import AssessmentSpider 10 | self.spider = AssessmentSpider() 11 | 12 | def test_clean_key__lot_folio(self): 13 | result = self.spider._clean_key('Lot / Folio') 14 | self.assertEqual( 15 | result, 16 | 'lot_folio' 17 | ) 18 | 19 | def test_clean_key__land_area(self): 20 | result = self.spider._clean_key('Land Area (sq ft) ') 21 | self.assertEqual( 22 | result, 23 | 'land_area_sq_ft' 24 | ) 25 | -------------------------------------------------------------------------------- /scrapy.cfg: -------------------------------------------------------------------------------- 1 | # Automatically created by: scrapy startproject 2 | # 3 | # For more information about the [deploy] section see: 4 | # https://scrapyd.readthedocs.org/en/latest/deploy.html 5 | 6 | [settings] 7 | default = scraper.settings 8 | 9 | [deploy] 10 | #url = http://localhost:6800/ 11 | project = scraper 12 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup 2 | 3 | setup( 4 | name='assessor-scraper', 5 | version='0.1', 6 | description='', 7 | url='https://github.com/codefornola/assessor-scraper', 8 | author='CodeForNola', 9 | author_email='', 10 | packages=['scraper'], 11 | install_requires=[ 12 | "psycopg2==2.7.3.2", 13 | "pyproj", 14 | "requests", 15 | "Scrapy==1.4.0", 16 | "SQLAlchemy==1.1.15", 17 | ], 18 | zip_safe=False 19 | ) 20 | -------------------------------------------------------------------------------- /terraform/scraper_ec2.tf: -------------------------------------------------------------------------------- 1 | provider "aws" { 2 | region = "us-east-1" 3 | } 4 | 5 | resource "aws_instance" "scraper_server" { 6 | ami = "ami-2757f631" 7 | instance_type = "t2.micro" 8 | key_name = "deployer-key" 9 | 10 | vpc_security_group_ids = [ 11 | "${aws_security_group.allow_inbound_ssh.id}", 12 | "${aws_security_group.allow_outbound_http.id}" 13 | ] 14 | 15 | connection { 16 | type = "ssh" 17 | user = "ubuntu" 18 | private_key = "${file("~/.ssh/id_rsa")}" 19 | } 20 | 21 | provisioner "remote-exec" { 22 | script = "../install.sh" 23 | } 24 | } 25 | 26 | resource "aws_key_pair" "deployer" { 27 | key_name = "deployer-key" 28 | public_key = "${file("~/.ssh/id_rsa.pub")}" 29 | } 30 | 31 | resource "aws_security_group" "allow_inbound_ssh" { 32 | name = "allow_inbound_ssh" 33 | description = "Allow ssh inbound traffic" 34 | 35 | ingress { 36 | from_port = 22 37 | to_port = 22 38 | protocol = "tcp" 39 | cidr_blocks = ["0.0.0.0/0"] 40 | ipv6_cidr_blocks = ["::/0"] 41 | description = "Allow ssh" 42 | } 43 | } 44 | 45 | resource "aws_security_group" "allow_outbound_http" { 46 | name = "allow_outbound_http" 47 | description = "Allow outbound 
http and https traffic" 48 | 49 | egress { 50 | from_port = 80 51 | to_port = 80 52 | protocol = "tcp" 53 | cidr_blocks = ["0.0.0.0/0"] 54 | ipv6_cidr_blocks = ["::/0"] 55 | description = "Allow http" 56 | } 57 | 58 | egress { 59 | from_port = 443 60 | to_port = 443 61 | protocol = "tcp" 62 | cidr_blocks = ["0.0.0.0/0"] 63 | ipv6_cidr_blocks = ["::/0"] 64 | description = "Allow https" 65 | } 66 | } 67 | 68 | output "public_dns" { 69 | value = "${aws_instance.scraper_server.public_dns}" 70 | } 71 | --------------------------------------------------------------------------------