├── .gitignore
├── LICENSE
├── README.md
├── dag.png
├── pipeline.png
├── requirements.txt
└── src
    ├── airflow
    │   ├── start_workers.sh
    │   ├── stop_workers.sh
    │   └── where_cycle_dag.py
    ├── config
    │   ├── database.py
    │   ├── geometries.py
    │   ├── ref
    │   │   ├── check_citibike_schema.py
    │   │   ├── check_tlc_schemas.py
    │   │   ├── get_geometries.sql
    │   │   └── tlc_schemas.txt
    │   └── schemas.py
    ├── dash
    │   ├── app.py
    │   └── assets
    │       └── background.css
    ├── postGIS_tables
    │   ├── geo_joined
    │   │   ├── citibike_stations.sql
    │   │   └── past_tlc_visits.sql
    │   ├── production
    │   │   ├── all_time_stats.sql
    │   │   └── taxi_zones.sql
    │   └── statistics
    │       ├── citibike.sql
    │       ├── tlc_visits.sql
    │       └── yelp_businesses.sql
    ├── preparation
    │   ├── extract.py
    │   ├── load.py
    │   └── transform.py
    └── spark_reduction
        ├── driver.py
        ├── extract.py
        ├── load.py
        └── transform.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Temporary resources
2 | benchmark/
3 | dash_project_medium.py
4 | spark-warehouse/
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 | 
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 | 
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 | 
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | 
59 | # Translations
60 | *.mo
61 | *.pot
62 | 
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 | 
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 | 
73 | # Scrapy stuff:
74 | .scrapy
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # Jupyter Notebook
83 | *.ipynb
84 | .ipynb_checkpoints/
85 | 
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 | 
90 | # pyenv
91 | .python-version
92 | 
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 | 
100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | where-cycle-env/ 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Josh Lang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Python 3.6.9](https://img.shields.io/badge/Python-3.6.9-light) 2 | ![Airflow 1.10.11](https://img.shields.io/badge/Airflow-1.10.11-red) 3 | ![GeoPandas 0.7](https://img.shields.io/badge/GeoPandas-0.7-purple) 4 | ![Spark 2.4.5](https://img.shields.io/badge/Spark-2.4.5-orange) 5 | ![PostGIS 2.4](https://img.shields.io/badge/PostGIS-2.4-darkblue) 6 | ![Dash 1.12](https://img.shields.io/badge/Dash-1.12-blue) 7 | ![MIT License](https://img.shields.io/badge/License-MIT-lightgrey) 8 | # Where Cycle 9 | 10 | *Getting New Yorkers Back to Business, Safely* 11 | 12 | ## Contents 13 | 1. [Purpose](README.md#purpose) 14 | 1. [Pipeline](README.md#pipeline) 15 | 1. [Summary](README.md#summary) 16 | - [Data](README.md#data) 17 | - [Preparation](README.md#preparation) 18 | - [Spark Reduction](README.md#spark-reduction) 19 | - [PostGIS Tables](README.md#postgis-tables) 20 | - [Dash & Airflow](README.md#dash-and-airflow) 21 | 1. [Spark Optimization](README.md#spark-optimization) 22 | 1. [Setup](README.md#setup) 23 | 1. [Directory Structure](README.md#directory-structure) 24 | 1. [License](README.md#license) 25 | 26 | ## Purpose 27 | As health officials advised social distancing and businesses closed earlier this year, subway and bus ridership plummeted in many large cities. New York saw an almost 90% reduction by late April. 
Now, as the city is tentatively opening back up, people may be looking to return to their places of work and to support their favorite businesses, but they might be hesitant to utilize public transit, instead seeking open-air alternatives. 28 | 29 | A cursory glance at some transit coverage in NYC makes it clear that, while Citibike is an awesome open-air solution, the available stations can’t immediately meet the needs of the outer boroughs: some expansion is required. **The goal of this pipeline is to synthesize data that may help city planners and Citibike analysts determine which areas could be ideal for Citibike expansion. As an initial step toward that end, it aggregates historical taxi & for-hire vehicle trips, Citibike trips & station density, and business review statistics by taxi zone.** 30 | 31 | *This project was developed by Josh Lang as part of his data engineering fellowship with Insight Data Science in the summer of 2020.* 32 | 33 | ## Pipeline 34 | ![Pipeline](https://github.com/josh-lang/where-cycle/blob/master/pipeline.png)
35 | ![DAG](https://github.com/josh-lang/where-cycle/blob/master/dag.png) 36 | 37 | ## Summary 38 | If you'd prefer to jump right in and start clicking into the functions from that DAG above, then the file that produced it is [here](https://github.com/josh-lang/where-cycle/blob/master/src/airflow/where_cycle_dag.py). Since you can't navigate directly to everything from there, you may also find a glance at the [directory structure](README.md#directory-structure) below handy. 39 | 40 | ### Data 41 | - Citibike Trip Histories: [S3 bucket](https://s3.console.aws.amazon.com/s3/buckets/tripdata), [documentation](https://www.citibikenyc.com/system-data) 42 | - NYC Taxi & Limousine Commission Trip Records: [S3 bucket](https://s3.console.aws.amazon.com/s3/buckets/nyc-tlc), [documentation](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 43 | - Yelp Business Search API: [documentation](https://www.yelp.com/developers/documentation/v3/business_search) 44 | 45 | ### Preparation 46 | - In order to index everything by taxi zone, NYC TLC's shapefile needs to be pulled down from S3, processed, and saved to PostgreSQL 47 | - Coordinate reference system is converted from NAD83 to WGS84 48 | - Each polygon is replaced with its equivalent multipolygon 49 | - All geometries are converted to well-known text 50 | - Centroids are then calculated for each taxi zone and used to query Yelp's API, requesting the 50 nearest businesses. These are cleaned and written as well 51 | - Invalid results and duplicates are removed 52 | - Coordinates are unnested and combined into point geometries 53 | - Like with taxi zones, geometries are converted to well-known text 54 | - Citibike's zipped files need to be pulled out of S3, unzipped, and sent back to another S3 bucket before batch processing since Spark can't ingest zip files natively 55 | - This is because Hadoop, which provides its underlying filesystem interface, does not support that compression codec 56 | - Python's `io.BytesIO` class reads S3's *bytes-like objects* and makes this a quick streaming process 57 | 58 | ### Spark Reduction 59 | - Spark can read csv files directly via the s3a connector for Hadoop, and multiple URIs can be specified with globbing 60 | - Citibike's trip data is consistent, so parsing all of it requires only one path and one schema definition 61 | - That schema can be truncated because this project isn't concerned with any columns that appear after trip dates and endpoint locations 62 | - TLC data is messier with 15 distinct csv headers over the corpus, but 10 truncated schemas are sufficient for pulling everything in correctly 63 | - TLC trips before 2016-07 use coordinates for pickup and dropoff locations, while trips after 2016-06 use taxi zone IDs 64 | - TLC's timestamps aren't always valid, so schemas are simplified further by not including those. Dates are instead assumed from csv filenames, which represent each month of trips 65 | - Relevant columns are selected from csvs, and then they're unioned together into 4 cached tables: Citibike trips, past TLC trips, modern TLC trips, and a small table for just the earliest for-hire vehicle trips 66 | - To aggregate visits by taxi zone, trip beginnings and endings need to be combined into endpoints and grouped by location. 
Four tables are created in PostgreSQL:
67 | - Coordinates for unique Citibike stations within the taxi zone map's extent are pulled out separately from visit aggregation
68 | - Citibike visits are then aggregated by station ID
69 | - Past TLC visits are aggregated by coordinates within taxi zone extent rounded to 3 decimal places — neighborhood resolution
70 | - Modern TLC visits and those early for-hire vehicle visits are aggregated simply by taxi zone ID
71 | 
72 | ### PostGIS Tables
73 | - All tables so far have been written to the *staging* schema in PostgreSQL. Now that everything's there, some final processing with the PostGIS extension can be done
74 | - *geo_joined* schema
75 | - Citibike station coordinates are matched to taxi zone polygons to create a join table for Citibike visits
76 | - Past TLC visits are aggregated by the taxi zone their coordinates are within
77 | - *statistics* schema
78 | - Citibike stations and trips are aggregated by taxi zone using the join table
79 | - Past TLC visits are unioned and summed with modern TLC visits using taxi zone IDs
80 | - Yelp business ratings and reviews are aggregated by the taxi zone their coordinates are within
81 | - *production* schema
82 | - Taxi zone geometries are converted to GeoJSON for Dash to plot on choropleth maps
83 | - Citibike, TLC, and Yelp statistics are joined to taxi zone dimensions for Dash to define toggleable scales
84 | 
85 | ### Dash and Airflow
86 | - A rudimentary dashboard built with Dash lives at [dats.work/where-cycle](http://dats.work/where-cycle)
87 | - GeoJSON geometries from PostGIS need to be wrapped as a GeoJSON Feature Collection inside the Dash app to be plotted on choropleth maps
88 | - Statistics from PostGIS define the choropleth map scales and are also used to create a supplementary bar chart of the top 15 taxi zones for whichever metric is selected
89 | - Airflow adds some fault tolerance and runs the pipeline on a regular basis to keep data up-to-date
90 | - Dependencies between tasks prevent things from running out of order or unnecessarily when an upstream task has failed
91 | - The pipeline runs every week so that Yelp has enough time to update meaningfully and so that Citibike and TLC updates can be captured with relatively minimal delay
92 | - Both Citibike and TLC batch their trip data by month, but the date they update their S3 buckets isn't consistent
93 | - Yelp's data is queried directly from their API and may return updated or simply different results each time
94 | - Startup and shutdown of the standalone Spark cluster are automated within the pipeline to save money
95 | 
96 | ## Spark Optimization
97 | I tested a handful of methods and configuration changes trying to make the Spark piece of the pipeline run more efficiently. First, since I had already defined each TLC schema while taking my initial stab at ingestion, I wanted to see whether those explicit definitions were, in fact, significantly faster than just using Spark's `inferSchema` option. Defining schemas before reading files was faster (as expected), but it only reduced total runtime by **~2.1%**.
98 | 
99 | The most dramatic improvement came with caching each table of source CSVs before running the Spark SQL queries that transform them. This increased my total runtime savings to **~32.9%**!
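To make that concrete, here's a minimal sketch of the caching pattern, assuming a hypothetical bucket path and a truncated three-column schema in place of this project's real sources:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.appName('cache-before-sql').getOrCreate()

# An explicit, truncated schema skips the extra pass over the data
# that inferSchema would otherwise trigger
schema = StructType([
    StructField('month', StringType(), True),
    StructField('pickup_longitude', DoubleType(), True),
    StructField('pickup_latitude', DoubleType(), True)
])

# Read the whole corpus with one globbed path (hypothetical bucket),
# register it as a SQL view, and cache it so each query below scans
# the in-memory columnar copy instead of re-parsing the CSVs from S3
trips = spark.read.csv('s3a://example-bucket/trips/*.csv', schema = schema, header = True)
trips.createOrReplaceTempView('trips')
spark.catalog.cacheTable('trips')

# The first query materializes the cache; later ones reuse it
visits = spark.sql('SELECT month, COUNT(*) AS visits FROM trips GROUP BY month')
extent = spark.sql('SELECT MIN(pickup_latitude), MAX(pickup_latitude) FROM trips')
```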
100 | 101 | After that, I found that lowering the number of shuffle partitions so that it matched the number of cores in my small cluster and doubling the maximum bytes in cached storage batches and in each partition could make things even faster, but only by so much. Changing these settings in my `spark-defaults.conf` file brought total runtime reduction to **~36.6%**: 102 | | Property | Setting | 103 | | -------- | ------- | 104 | | spark.sql.files.maxPartitionBytes | 268435456 | 105 | | spark.sql.inMemoryColumnarStorage.batchSize | 20000 | 106 | | spark.sql.inMemoryColumnarStorage.compressed | true | 107 | | spark.sql.shuffle.partitions | 12 | 108 | 109 | ## Setup 110 | Python dependencies can be installed with the following command: 111 | ```sh 112 | pip install -r requirements.txt 113 | ``` 114 | 115 | This project was built using an Apache Spark 2.4.5 / Hadoop 2.7 binary downloaded from [spark.apache.org](https://spark.apache.org/downloads.html). It reads from AWS S3 and writes to PostgreSQL, so a driver from [jdbc.postgresql.org](https://jdbc.postgresql.org) should be placed in `spark/jars/` and some configuration should be added to `spark-defaults.conf`: 116 | | Property | Setting | 117 | | -------- | ------- | 118 | | spark.driver.extraClassPath | /usr/local/spark/jars/postgresql-42.2.14.jar | 119 | | spark.driver.extraJavaOptions | -Dcom.amazonaws.services.s3.enableV4=true | 120 | | spark.executor.extraJavaOptions | -Dcom.amazonaws.services.s3.enableV4=true | 121 | | spark.hadoop.fs.s3a.awsAccessKeyId | $AWS_ACCESS_KEY_ID | 122 | | spark.hadoop.fs.s3a.awsSecretAccessKey | $AWS_SECRET_ACCESS_KEY | 123 | | spark.hadoop.fs.s3a.endpoint | $AWS_S3_ENDPOINT | 124 | | spark.hadoop.com.amazonaws.services.s3a.enableV4 | true | 125 | | spark.hadoop.fs.s3a.impl | org.apache.hadoop.fs.s3a.S3AFileSystem | 126 | | spark.jars | /usr/local/spark/jars/postgresql-42.2.14.jar | 127 | | spark.jars.packages | com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.7 | 128 | 129 | This project also depends on PostgreSQL's PostGIS extension, which can be installed with the `CREATE EXTENSION` command: 130 | ```sh 131 | psql -d yourdatabase -c 'CREATE EXTENSION postgis;' 132 | ``` 133 | 134 | ## Directory Structure 135 | ```sh 136 | . 137 | ├── LICENSE 138 | ├── README.md 139 | ├── dag.png 140 | ├── pipeline.png 141 | ├── requirements.txt 142 | └── src/ 143 | ├── airflow/ 144 | │ ├── start_workers.sh* 145 | │ ├── stop_workers.sh* 146 | │ └── where_cycle_dag.py 147 | ├── config/ 148 | │ ├── database.py 149 | │ ├── geometries.py 150 | │ ├── ref/ 151 | │ │ ├── check_citibike_schema.py 152 | │ │ ├── check_tlc_schemas.py 153 | │ │ ├── get_geometries.sql 154 | │ │ └── tlc_schemas.txt 155 | │ └── schemas.py 156 | ├── dash/ 157 | │ ├── app.py 158 | │ └── assets/ 159 | │ └── background.css 160 | ├── postGIS_tables/ 161 | │ ├── geo_joined/ 162 | │ │ ├── citibike_stations.sql 163 | │ │ └── past_tlc_visits.sql 164 | │ ├── production/ 165 | │ │ ├── all_time_stats.sql 166 | │ │ └── taxi_zones.sql 167 | │ └── statistics/ 168 | │ ├── citibike.sql 169 | │ ├── tlc_visits.sql 170 | │ └── yelp_businesses.sql 171 | ├── preparation/ 172 | │ ├── extract.py 173 | │ ├── load.py 174 | │ └── transform.py 175 | └── spark_reduction/ 176 | ├── driver.py 177 | ├── extract.py 178 | ├── load.py 179 | └── transform.py 180 | ``` 181 | 182 | ## License 183 | [MIT License](LICENSE)
184 | Copyright (c) 2020 Josh Lang 185 | -------------------------------------------------------------------------------- /dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josh-lang/where-cycle/e11283acf13221f91b45baba12a08816def2a7fd/dag.png -------------------------------------------------------------------------------- /pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josh-lang/where-cycle/e11283acf13221f91b45baba12a08816def2a7fd/pipeline.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.4.2 2 | apache-airflow==2.9.3 3 | apispec==1.3.3 4 | argcomplete==1.12.0 5 | attrs==19.3.0 6 | Babel==2.9.1 7 | backcall==0.1.0 8 | bleach==3.3.0 9 | boto3==1.13.23 10 | botocore==1.16.23 11 | Brotli==1.0.7 12 | cached-property==1.5.1 13 | cattrs==1.0.0 14 | certifi==2024.7.4 15 | chardet==3.0.4 16 | click==7.1.2 17 | click-plugins==1.1.1 18 | cligj==0.5.0 19 | colorama==0.4.3 20 | colorlog==4.0.2 21 | configparser==3.5.3 22 | croniter==0.3.34 23 | dash-core-components==2.0.0 24 | dash-html-components==2.0.0 25 | dash-renderer==1.4.1 26 | dash-table==4.7.0 27 | decorator==4.4.2 28 | defusedxml==0.6.0 29 | dill==0.3.2 30 | dnspython==2.6.1 31 | docutils==0.16 32 | email-validator==1.1.1 33 | entrypoints==0.3 34 | findspark==1.4.1 35 | Fiona==1.8.13.post1 36 | Flask==2.3.2 37 | Flask-Admin==1.5.4 38 | Flask-AppBuilder==4.3.11 39 | Flask-Babel==1.0.0 40 | Flask-Caching==1.11.0 41 | Flask-Compress==1.5.0 42 | Flask-JWT-Extended==3.24.1 43 | Flask-Login==0.4.1 44 | Flask-OpenID==1.2.5 45 | Flask-SQLAlchemy==2.4.4 46 | flask-swagger==0.2.13 47 | Flask-WTF==0.14.3 48 | funcsigs==1.0.2 49 | future==0.18.3 50 | GeoAlchemy2==0.8.3 51 | geopandas==0.7.0 52 | graphviz==0.14.1 53 | gunicorn==22.0.0 54 | idna==3.7 55 | importlib-metadata==1.7.0 56 | ipykernel==5.3.0 57 | ipython==8.10.0 58 | ipython-genutils==0.2.0 59 | ipywidgets==7.5.1 60 | iso8601==0.1.12 61 | itsdangerous==1.1.0 62 | jedi==0.17.0 63 | Jinja2==3.1.4 64 | jmespath==0.10.0 65 | json-merge-patch==0.2 66 | jsonschema==3.2.0 67 | jupyter==1.0.0 68 | jupyter-client==6.1.3 69 | jupyter-console==6.1.0 70 | jupyter-core==4.11.2 71 | lazy-object-proxy==1.5.1 72 | lockfile==0.12.2 73 | Mako==1.2.2 74 | Markdown==2.6.11 75 | MarkupSafe==1.1.1 76 | marshmallow==2.21.0 77 | marshmallow-enum==1.5.1 78 | marshmallow-sqlalchemy==0.23.1 79 | mistune==2.0.3 80 | munch==2.5.0 81 | natsort==7.0.1 82 | nbconvert==6.5.1 83 | nbformat==5.0.6 84 | notebook==6.4.12 85 | numpy==1.22.0 86 | packaging==20.4 87 | pandas==1.0.5 88 | pandocfilters==1.4.2 89 | parso==0.7.0 90 | pendulum==1.4.4 91 | pexpect==4.8.0 92 | pickleshare==0.7.5 93 | pip-autoremove==0.9.1 94 | pkg-resources==0.0.0 95 | plotly==4.8.1 96 | prison==0.1.3 97 | prometheus-client==0.8.0 98 | prompt-toolkit==3.0.5 99 | psutil==5.7.2 100 | psycopg2-binary==2.8.5 101 | ptyprocess==0.6.0 102 | Pygments==2.15.0 103 | PyJWT==2.4.0 104 | pyparsing==2.4.7 105 | pyproj==2.6.1.post1 106 | pyrsistent==0.16.0 107 | python-daemon==2.2.4 108 | python-dateutil==2.8.1 109 | python-editor==1.0.4 110 | python-nvd3==0.15.0 111 | python-slugify==4.0.1 112 | python3-openid==3.2.0 113 | pytz==2020.1 114 | pytzdata==2020.1 115 | PyYAML==5.4 116 | pyzmq==19.0.1 117 | qtconsole==4.7.4 118 | QtPy==1.9.0 119 | requests==2.32.0 120 | 
retrying==1.3.3 121 | s3transfer==0.3.3 122 | Send2Trash==1.5.0 123 | setproctitle==1.1.10 124 | Shapely==1.7.0 125 | six==1.15.0 126 | SQLAlchemy==1.3.18 127 | SQLAlchemy-JSONField==0.9.0 128 | SQLAlchemy-Utils==0.36.8 129 | tabulate==0.8.7 130 | tenacity==4.12.0 131 | termcolor==1.1.0 132 | terminado==0.8.3 133 | testpath==0.4.4 134 | text-unidecode==1.3 135 | thrift==0.13.0 136 | tornado==6.4.1 137 | traitlets==4.3.3 138 | typing==3.7.4.3 139 | typing-extensions==3.7.4.2 140 | tzlocal==1.5.1 141 | unicodecsv==0.14.1 142 | urllib3==1.26.19 143 | wcwidth==0.2.3 144 | webencodings==0.5.1 145 | Werkzeug==3.0.3 146 | widgetsnbextension==3.5.1 147 | WTForms==2.3.1 148 | zipp==3.19.1 149 | zope.deprecation==4.4.0 150 | -------------------------------------------------------------------------------- /src/airflow/start_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Send start command to spark_worker instances, wait for all three 4 | # to reach 'running' state, sleep for 15 more seconds just to be 5 | # safe, and then launch spark 6 | 7 | set -e 8 | 9 | aws ec2 start-instances --instance-ids \ 10 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID && 11 | aws ec2 wait instance-running --instance-ids \ 12 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID && 13 | sleep 15 && 14 | /usr/local/spark/sbin/start-all.sh 15 | -------------------------------------------------------------------------------- /src/airflow/stop_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stop spark on all workers and then send stop-instances 4 | # command to AWS 5 | 6 | set -e 7 | 8 | /usr/local/spark/sbin/stop-all.sh && 9 | aws ec2 stop-instances --instance-ids \ 10 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID 11 | -------------------------------------------------------------------------------- /src/airflow/where_cycle_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.bash_operator import BashOperator 4 | from airflow.operators.python_operator import PythonOperator 5 | from preparation.extract import \ 6 | get_taxi_zones, get_businesses, unzip_csvs 7 | from preparation.transform import \ 8 | clean_taxi_zones, calculate_centroids, clean_businesses 9 | from preparation.load import \ 10 | write_taxi_zones, write_businesses 11 | 12 | 13 | airflow_path = '/home/ubuntu/where-cycle/src/airflow/' 14 | spark_str = 'cd /home/ubuntu/where-cycle/src/spark_reduction && ' 15 | psql_str = 'psql -h $PSQL_HOST -p $PSQL_PORT -U $PSQL_USER -d ' + \ 16 | '$PSQL_DATABASE -f /home/ubuntu/where-cycle/src/postGIS_tables/' 17 | 18 | defaults = { 19 | 'owner': 'airflow', 20 | 'start_date': datetime(2020, 6, 21), 21 | 'depends_on_past': False, 22 | 'retries': 2, 23 | 'retry_delay': timedelta(minutes=5) 24 | } 25 | 26 | with DAG( 27 | 'where_cycle', 28 | default_args = defaults, 29 | schedule_interval = '@weekly' 30 | ) as dag: 31 | #******** PREPARATION ********# 32 | 33 | t1 = PythonOperator( 34 | task_id = 'get_taxi_zones', 35 | python_callable = get_taxi_zones 36 | ) 37 | 38 | t2 = PythonOperator( 39 | task_id = 'clean_taxi_zones', 40 | python_callable = clean_taxi_zones, 41 | provide_context = True 42 | ) 43 | 44 | t3 = PythonOperator( 45 | task_id = 'write_taxi_zones', 46 | python_callable = write_taxi_zones, 47 | provide_context = True 48 | ) 49 | 50 | t4 = 
PythonOperator( 51 | task_id = 'calculate_centroids', 52 | python_callable = calculate_centroids, 53 | provide_context = True 54 | ) 55 | 56 | t5 = PythonOperator( 57 | task_id = 'get_businesses', 58 | python_callable = get_businesses, 59 | provide_context = True 60 | ) 61 | 62 | t6 = PythonOperator( 63 | task_id = 'clean_businesses', 64 | python_callable = clean_businesses, 65 | provide_context = True 66 | ) 67 | 68 | t7 = PythonOperator( 69 | task_id = 'write_businesses', 70 | python_callable = write_businesses, 71 | provide_context = True 72 | ) 73 | 74 | t8 = PythonOperator( 75 | task_id = 'unzip_csvs', 76 | python_callable = unzip_csvs 77 | ) 78 | 79 | t1 >> t2 >> t3 80 | t1 >> t4 >> t5 >> t6 >> t7 81 | 82 | 83 | #******** SPARK REDUCTION ********# 84 | 85 | t9 = BashOperator( 86 | task_id = 'start_spark_workers', 87 | bash_command = airflow_path + 'start_workers.sh ' 88 | ) 89 | 90 | t10 = BashOperator( 91 | task_id = 'submit_spark_driver', 92 | bash_command = spark_str + 'spark-submit driver.py' 93 | ) 94 | 95 | t11 = BashOperator( 96 | task_id = 'stop_spark_workers', 97 | bash_command = airflow_path + 'stop_workers.sh ', 98 | trigger_rule = 'all_done' 99 | ) 100 | 101 | t8 >> t9 >> t10 >> t11 102 | 103 | 104 | #******** POSTGIS TABLES ********# 105 | 106 | t12 = BashOperator( 107 | task_id = 'create_production_taxi_zones', 108 | bash_command = psql_str + 'production/taxi_zones.sql' 109 | ) 110 | 111 | t13 = BashOperator( 112 | task_id = 'create_statistics_yelp_businesses', 113 | bash_command = psql_str + 'statistics/yelp_businesses.sql' 114 | ) 115 | 116 | t14 = BashOperator( 117 | task_id = 'create_geo_joined_citibike_stations', 118 | bash_command = psql_str + 'geo_joined/citibike_stations.sql' 119 | ) 120 | 121 | t15 = BashOperator( 122 | task_id = 'create_statistics_citibike', 123 | bash_command = psql_str + 'statistics/citibike.sql' 124 | ) 125 | 126 | t16 = BashOperator( 127 | task_id = 'create_geo_joined_past_tlc_visits', 128 | bash_command = psql_str + 'geo_joined/past_tlc_visits.sql' 129 | ) 130 | 131 | t17 = BashOperator( 132 | task_id = 'create_statistics_tlc_visits', 133 | bash_command = psql_str + 'statistics/tlc_visits.sql' 134 | ) 135 | 136 | t18 = BashOperator( 137 | task_id = 'create_production_all_time_stats', 138 | bash_command = psql_str + 'production/all_time_stats.sql' 139 | ) 140 | 141 | t3 >> t12 142 | t7 >> t13 143 | t10 >> t14 >> t15 144 | t10 >> t16 >> t17 145 | [t13, t15, t17] >> t18 146 | -------------------------------------------------------------------------------- /src/config/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlalchemy as sql 3 | 4 | 5 | jdbc_props = { 6 | 'driver': 'org.postgresql.Driver', 7 | 'user': os.environ['PSQL_USER'], 8 | 'password': os.environ['PSQL_PASSWORD'] 9 | } 10 | 11 | jdbc_url = 'jdbc:postgresql://' + \ 12 | os.environ['PSQL_HOST'] + ':' + os.environ['PSQL_PORT'] + \ 13 | '/' + os.environ['PSQL_DATABASE'] 14 | 15 | py_engine = sql.create_engine( 16 | 'postgresql://' + 17 | os.environ['PSQL_USER'] + ':' + os.environ['PSQL_PASSWORD'] + 18 | '@' + os.environ['PSQL_HOST'] + ':' + os.environ['PSQL_PORT'] + 19 | '/' + os.environ['PSQL_DATABASE'] 20 | ) 21 | -------------------------------------------------------------------------------- /src/config/geometries.py: -------------------------------------------------------------------------------- 1 | TAXI_ZONE_CENTROID_LAT = 40.7058240860865 2 | TAXI_ZONE_CENTROID_LON = -73.9778002135437 3 | 
TAXI_ZONE_LAT_MIN = 40.4961153951704 4 | TAXI_ZONE_LAT_MAX = 40.9155327770026 5 | TAXI_ZONE_LON_MIN = -74.2555913631521 6 | TAXI_ZONE_LON_MAX = -73.7000090639354 7 | -------------------------------------------------------------------------------- /src/config/ref/check_citibike_schema.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | s3 = boto3.resource('s3') 6 | bucket = s3.Bucket('jlang-20b-de-ny') 7 | 8 | spark = SparkSession.builder \ 9 | .appName('check-citibike-schema') \ 10 | .getOrCreate() 11 | 12 | for obj in bucket.objects.all(): 13 | key = obj.key 14 | if key.startswith('citibike/') and key.endswith('.csv'): 15 | path = 's3a://jlang-20b-de-ny/' + key 16 | csv_df = spark.read.csv( 17 | path = path, 18 | header = True, 19 | inferSchema = True, 20 | enforceSchema = False, 21 | ignoreLeadingWhiteSpace = True, 22 | ignoreTrailingWhiteSpace = True, 23 | samplingRatio = 0.1 24 | ) 25 | print(path) 26 | csv_df.printSchema() 27 | -------------------------------------------------------------------------------- /src/config/ref/check_tlc_schemas.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | s3 = boto3.resource('s3') 6 | nyc_tlc = s3.Bucket('nyc-tlc') 7 | 8 | spark = SparkSession.builder \ 9 | .appName('check_tlc_schemas') \ 10 | .getOrCreate() 11 | 12 | for obj in nyc_tlc.objects.all(): 13 | key = obj.key 14 | if key.startswith('trip data/') and key.endswith('.csv'): 15 | path = 's3a://nyc-tlc/' + key 16 | csv_df = spark.read.csv( 17 | path = path, 18 | header = True, 19 | inferSchema = True, 20 | enforceSchema = False, 21 | ignoreLeadingWhiteSpace = True, 22 | ignoreTrailingWhiteSpace = True, 23 | samplingRatio = 0.1 24 | ) 25 | print(path) 26 | csv_df.printSchema() 27 | -------------------------------------------------------------------------------- /src/config/ref/get_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT ST_Extent(geometry) AS bbox 2 | FROM staging.taxi_zones; 3 | 4 | SELECT ST_AsText(ST_Centroid(ST_Extent(geometry))) AS center 5 | FROM staging.taxi_zones; 6 | -------------------------------------------------------------------------------- /src/config/ref/tlc_schemas.txt: -------------------------------------------------------------------------------- 1 | lat-lon data 2 | green_tripdata_2013-08_2016-06_9-cols 3 | green_tripdata_2013-08_2014-12 4 | |-- VendorID: integer (nullable = true) 5 | |-- lpep_pickup_datetime: timestamp (nullable = true) 6 | |-- Lpep_dropoff_datetime: timestamp (nullable = true) 7 | |-- Store_and_fwd_flag: string (nullable = true) 8 | |-- RateCodeID: integer (nullable = true) 9 | |-- Pickup_longitude: double (nullable = true) 10 | |-- Pickup_latitude: double (nullable = true) 11 | |-- Dropoff_longitude: double (nullable = true) 12 | |-- Dropoff_latitude: double (nullable = true) 13 | |-- Passenger_count: integer (nullable = true) 14 | |-- Trip_distance: double (nullable = true) 15 | |-- Fare_amount: double (nullable = true) 16 | |-- Extra: double (nullable = true) 17 | |-- MTA_tax: double (nullable = true) 18 | |-- Tip_amount: double (nullable = true) 19 | |-- Tolls_amount: double (nullable = true) 20 | |-- Ehail_fee: string (nullable = true) 21 | |-- Total_amount: double (nullable = true) 22 | |-- Payment_type: integer (nullable = true) 23 | |-- Trip_type: string (nullable = true) 24 | 
25 | green_tripdata_2015-01_2016-06_9-cols 26 | |-- VendorID: integer (nullable = true) 27 | |-- lpep_pickup_datetime: timestamp (nullable = true) 28 | |-- Lpep_dropoff_datetime: timestamp (nullable = true) 29 | |-- Store_and_fwd_flag: string (nullable = true) 30 | |-- RateCodeID: integer (nullable = true) 31 | |-- Pickup_longitude: double (nullable = true) 32 | |-- Pickup_latitude: double (nullable = true) 33 | |-- Dropoff_longitude: double (nullable = true) 34 | |-- Dropoff_latitude: double (nullable = true) 35 | |-- Passenger_count: integer (nullable = true) 36 | |-- Trip_distance: double (nullable = true) 37 | |-- Fare_amount: double (nullable = true) 38 | |-- Extra: double (nullable = true) 39 | |-- MTA_tax: double (nullable = true) 40 | |-- Tip_amount: double (nullable = true) 41 | |-- Tolls_amount: double (nullable = true) 42 | |-- Ehail_fee: string (nullable = true) 43 | |-- improvement_surcharge: double (nullable = true) 44 | |-- Total_amount: double (nullable = true) 45 | |-- Payment_type: integer (nullable = true) 46 | |-- Trip_type: integer (nullable = true) 47 | 48 | yellow_tripdata_2009-01__2016-06_11-cols 49 | yellow_tripdata_2009-01_2009-12 50 | |-- vendor_name: string (nullable = true) 51 | |-- Trip_Pickup_DateTime: timestamp (nullable = true) 52 | |-- Trip_Dropoff_DateTime: timestamp (nullable = true) 53 | |-- Passenger_Count: integer (nullable = true) 54 | |-- Trip_Distance: double (nullable = true) 55 | |-- Start_Lon: double (nullable = true) 56 | |-- Start_Lat: double (nullable = true) 57 | |-- Rate_Code: string (nullable = true) 58 | |-- store_and_forward: integer (nullable = true) 59 | |-- End_Lon: double (nullable = true) 60 | |-- End_Lat: double (nullable = true) 61 | |-- Payment_Type: string (nullable = true) 62 | |-- Fare_Amt: double (nullable = true) 63 | |-- surcharge: double (nullable = true) 64 | |-- mta_tax: string (nullable = true) 65 | |-- Tip_Amt: double (nullable = true) 66 | |-- Tolls_Amt: double (nullable = true) 67 | |-- Total_Amt: double (nullable = true) 68 | 69 | yellow_tripdata_2010-01_2014-12 70 | |-- vendor_id: string (nullable = true) 71 | |-- pickup_datetime: timestamp (nullable = true) 72 | |-- dropoff_datetime: timestamp (nullable = true) 73 | |-- passenger_count: integer (nullable = true) 74 | |-- trip_distance: double (nullable = true) 75 | |-- pickup_longitude: double (nullable = true) 76 | |-- pickup_latitude: double (nullable = true) 77 | |-- rate_code: integer (nullable = true) 78 | |-- store_and_fwd_flag: integer (nullable = true) 79 | |-- dropoff_longitude: double (nullable = true) 80 | |-- dropoff_latitude: double (nullable = true) 81 | |-- payment_type: string (nullable = true) 82 | |-- fare_amount: double (nullable = true) 83 | |-- surcharge: double (nullable = true) 84 | |-- mta_tax: double (nullable = true) 85 | |-- tip_amount: double (nullable = true) 86 | |-- tolls_amount: double (nullable = true) 87 | |-- total_amount: double (nullable = true) 88 | 89 | yellow_tripdata_2015-01_2016-06 90 | |-- VendorID: integer (nullable = true) 91 | |-- tpep_pickup_datetime: timestamp (nullable = true) 92 | |-- tpep_dropoff_datetime: timestamp (nullable = true) 93 | |-- passenger_count: integer (nullable = true) 94 | |-- trip_distance: double (nullable = true) 95 | |-- pickup_longitude: double (nullable = true) 96 | |-- pickup_latitude: double (nullable = true) 97 | |-- RateCodeID: integer (nullable = true) 98 | |-- store_and_fwd_flag: string (nullable = true) 99 | |-- dropoff_longitude: double (nullable = true) 100 | |-- dropoff_latitude: 
double (nullable = true) 101 | |-- payment_type: integer (nullable = true) 102 | |-- fare_amount: double (nullable = true) 103 | |-- extra: double (nullable = true) 104 | |-- mta_tax: double (nullable = true) 105 | |-- tip_amount: double (nullable = true) 106 | |-- tolls_amount: double (nullable = true) 107 | |-- improvement_surcharge: double (nullable = true) 108 | |-- total_amount: double (nullable = true) 109 | 110 | LocationID data 111 | fhv_tripdata_2015-01_2016-12_3-cols 112 | |-- Dispatching_base_num: string (nullable = true) 113 | |-- Pickup_date: timestamp (nullable = true) 114 | |-- locationID: integer (nullable = true) 115 | 116 | fhv_tripdata_2017-01_2019-12_5-cols 117 | fhv_tripdata_2017-01_2017-06 118 | |-- Dispatching_base_num: string (nullable = true) 119 | |-- Pickup_DateTime: timestamp (nullable = true) 120 | |-- DropOff_datetime: string (nullable = true) 121 | |-- PUlocationID: integer (nullable = true) 122 | |-- DOlocationID: integer (nullable = true) 123 | 124 | fhv_tripdata_2017-07_2017-12 125 | |-- Dispatching_base_num: string (nullable = true) 126 | |-- Pickup_DateTime: timestamp (nullable = true) 127 | |-- DropOff_datetime: string (nullable = true) 128 | |-- PUlocationID: integer (nullable = true) 129 | |-- DOlocationID: integer (nullable = true) 130 | |-- SR_Flag: integer (nullable = true) 131 | 132 | fhv_tripdata_2019-01_2019-12 133 | |-- dispatching_base_num: string (nullable = true) 134 | |-- pickup_datetime: timestamp (nullable = true) 135 | |-- dropoff_datetime: timestamp (nullable = true) 136 | |-- PULocationID: integer (nullable = true) 137 | |-- DOLocationID: integer (nullable = true) 138 | |-- SR_Flag: integer (nullable = true) 139 | 140 | fhv_tripdata_2018-01_2018-12_4-cols 141 | |-- Pickup_DateTime: timestamp (nullable = true) 142 | |-- DropOff_datetime: timestamp (nullable = true) 143 | |-- PUlocationID: integer (nullable = true) 144 | |-- DOlocationID: integer (nullable = true) 145 | |-- SR_Flag: integer (nullable = true) 146 | |-- Dispatching_base_number: string (nullable = true) 147 | |-- Dispatching_base_num: string (nullable = true) 148 | 149 | fhvhv_tripdata_6-cols 150 | |-- hvfhs_license_num: string (nullable = true) 151 | |-- dispatching_base_num: string (nullable = true) 152 | |-- pickup_datetime: timestamp (nullable = true) 153 | |-- dropoff_datetime: timestamp (nullable = true) 154 | |-- PULocationID: integer (nullable = true) 155 | |-- DOLocationID: integer (nullable = true) 156 | |-- SR_Flag: integer (nullable = true) 157 | 158 | green_tripdata_2016-07_2019-12_7-cols 159 | green_tripdata_2016-07_2018-12 160 | |-- VendorID: integer (nullable = true) 161 | |-- lpep_pickup_datetime: timestamp (nullable = true) 162 | |-- lpep_dropoff_datetime: timestamp (nullable = true) 163 | |-- store_and_fwd_flag: string (nullable = true) 164 | |-- RatecodeID: integer (nullable = true) 165 | |-- PULocationID: integer (nullable = true) 166 | |-- DOLocationID: integer (nullable = true) 167 | |-- passenger_count: integer (nullable = true) 168 | |-- trip_distance: double (nullable = true) 169 | |-- fare_amount: double (nullable = true) 170 | |-- extra: double (nullable = true) 171 | |-- mta_tax: double (nullable = true) 172 | |-- tip_amount: double (nullable = true) 173 | |-- tolls_amount: double (nullable = true) 174 | |-- ehail_fee: string (nullable = true) 175 | |-- improvement_surcharge: double (nullable = true) 176 | |-- total_amount: double (nullable = true) 177 | |-- payment_type: integer (nullable = true) 178 | |-- trip_type: integer (nullable = true) 
179 | 180 | green_tripdata_2019-01_2019-12_7-cols 181 | |-- VendorID: integer (nullable = true) 182 | |-- lpep_pickup_datetime: timestamp (nullable = true) 183 | |-- lpep_dropoff_datetime: timestamp (nullable = true) 184 | |-- store_and_fwd_flag: string (nullable = true) 185 | |-- RatecodeID: integer (nullable = true) 186 | |-- PULocationID: integer (nullable = true) 187 | |-- DOLocationID: integer (nullable = true) 188 | |-- passenger_count: integer (nullable = true) 189 | |-- trip_distance: double (nullable = true) 190 | |-- fare_amount: double (nullable = true) 191 | |-- extra: double (nullable = true) 192 | |-- mta_tax: double (nullable = true) 193 | |-- tip_amount: double (nullable = true) 194 | |-- tolls_amount: double (nullable = true) 195 | |-- ehail_fee: string (nullable = true) 196 | |-- improvement_surcharge: double (nullable = true) 197 | |-- total_amount: double (nullable = true) 198 | |-- payment_type: integer (nullable = true) 199 | |-- trip_type: integer (nullable = true) 200 | |-- congestion_surcharge: integer (nullable = true) 201 | 202 | yellow_tripdata_2016-07_2019-12_9-cols 203 | yellow_tripdata_2016-07_2018-12 204 | |-- VendorID: integer (nullable = true) 205 | |-- tpep_pickup_datetime: timestamp (nullable = true) 206 | |-- tpep_dropoff_datetime: timestamp (nullable = true) 207 | |-- passenger_count: integer (nullable = true) 208 | |-- trip_distance: double (nullable = true) 209 | |-- RatecodeID: integer (nullable = true) 210 | |-- store_and_fwd_flag: string (nullable = true) 211 | |-- PULocationID: integer (nullable = true) 212 | |-- DOLocationID: integer (nullable = true) 213 | |-- payment_type: integer (nullable = true) 214 | |-- fare_amount: double (nullable = true) 215 | |-- extra: double (nullable = true) 216 | |-- mta_tax: double (nullable = true) 217 | |-- tip_amount: double (nullable = true) 218 | |-- tolls_amount: double (nullable = true) 219 | |-- improvement_surcharge: double (nullable = true) 220 | |-- total_amount: double (nullable = true) 221 | 222 | yellow_tripdata_2019-01_2019-12_9-cols 223 | |-- VendorID: integer (nullable = true) 224 | |-- tpep_pickup_datetime: timestamp (nullable = true) 225 | |-- tpep_dropoff_datetime: timestamp (nullable = true) 226 | |-- passenger_count: integer (nullable = true) 227 | |-- trip_distance: double (nullable = true) 228 | |-- RatecodeID: integer (nullable = true) 229 | |-- store_and_fwd_flag: string (nullable = true) 230 | |-- PULocationID: integer (nullable = true) 231 | |-- DOLocationID: integer (nullable = true) 232 | |-- payment_type: integer (nullable = true) 233 | |-- fare_amount: double (nullable = true) 234 | |-- extra: double (nullable = true) 235 | |-- mta_tax: double (nullable = true) 236 | |-- tip_amount: double (nullable = true) 237 | |-- tolls_amount: double (nullable = true) 238 | |-- improvement_surcharge: double (nullable = true) 239 | |-- total_amount: double (nullable = true) 240 | |-- congestion_surcharge: double (nullable = true) 241 | -------------------------------------------------------------------------------- /src/config/schemas.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StructField, \ 2 | IntegerType, TimestampType, StringType, DoubleType 3 | 4 | citibike_schema = StructType( 5 | [ 6 | StructField('tripduration', IntegerType(), True), 7 | StructField('starttime', TimestampType(), True), 8 | StructField('stoptime', TimestampType(), True), 9 | StructField('start station id', IntegerType(), True), 10 | 
StructField('start station name', StringType(), True), 11 | StructField('start station latitude', DoubleType(), True), 12 | StructField('start station longitude', DoubleType(), True), 13 | StructField('end station id', IntegerType(), True), 14 | StructField('end station name', StringType(), True), 15 | StructField('end station latitude', DoubleType(), True), 16 | StructField('end station longitude', DoubleType(), True) 17 | ] 18 | ) 19 | 20 | past_schema = StructType( 21 | [ 22 | StructField('month', StringType(), True), 23 | StructField('pickup_longitude', DoubleType(), True), 24 | StructField('pickup_latitude', DoubleType(), True), 25 | StructField('dropoff_longitude', DoubleType(), True), 26 | StructField('dropoff_latitude', DoubleType(), True) 27 | ] 28 | ) 29 | 30 | modern_schema = StructType( 31 | [ 32 | StructField('month', StringType(), True), 33 | StructField('PULocationID', IntegerType(), True), 34 | StructField('DOLocationID', IntegerType(), True) 35 | ] 36 | ) 37 | 38 | green_13_16_schema = StructType( 39 | [ 40 | StructField('VendorID', IntegerType(), True), 41 | StructField('lpep_pickup_datetime', TimestampType(), True), 42 | StructField('Lpep_dropoff_datetime', TimestampType(), True), 43 | StructField('Store_and_fwd_flag', StringType(), True), 44 | StructField('RateCodeID', IntegerType(), True), 45 | StructField('pickup_longitude', DoubleType(), True), 46 | StructField('pickup_latitude', DoubleType(), True), 47 | StructField('dropoff_longitude', DoubleType(), True), 48 | StructField('dropoff_latitude', DoubleType(), True) 49 | ] 50 | ) 51 | 52 | yellow_09_16_schema = StructType( 53 | [ 54 | StructField('VendorID', StringType(), True), 55 | StructField('tpep_pickup_datetime', TimestampType(), True), 56 | StructField('tpep_dropoff_datetime', TimestampType(), True), 57 | StructField('passenger_count', IntegerType(), True), 58 | StructField('trip_distance', DoubleType(), True), 59 | StructField('pickup_longitude', DoubleType(), True), 60 | StructField('pickup_latitude', DoubleType(), True), 61 | StructField('RateCodeID', StringType(), True), 62 | StructField('store_and_fwd_flag', StringType(), True), 63 | StructField('dropoff_longitude', DoubleType(), True), 64 | StructField('dropoff_latitude', DoubleType(), True) 65 | ] 66 | ) 67 | 68 | fhv_15_16_schema = StructType( 69 | [ 70 | StructField('Dispatching_base_num', StringType(), True), 71 | StructField('Pickup_date', TimestampType(), True), 72 | StructField('locationID', IntegerType(), True) 73 | ] 74 | ) 75 | 76 | fhv_17_19_schema = StructType( 77 | [ 78 | StructField('Dispatching_base_num', StringType(), True), 79 | StructField('Pickup_DateTime', TimestampType(), True), 80 | StructField('DropOff_datetime', TimestampType(), True), 81 | StructField('PUlocationID', IntegerType(), True), 82 | StructField('DOlocationID', IntegerType(), True) 83 | ] 84 | ) 85 | 86 | fhv_18_schema = StructType( 87 | [ 88 | StructField('Pickup_DateTime', TimestampType(), True), 89 | StructField('DropOff_datetime', TimestampType(), True), 90 | StructField('PUlocationID', IntegerType(), True), 91 | StructField('DOlocationID', IntegerType(), True) 92 | ] 93 | ) 94 | 95 | fhvhv_schema = StructType( 96 | [ 97 | StructField('hvfhs_license_num', StringType(), True), 98 | StructField('dispatching_base_num', StringType(), True), 99 | StructField('pickup_datetime', TimestampType(), True), 100 | StructField('dropoff_datetime', TimestampType(), True), 101 | StructField('PULocationID', IntegerType(), True), 102 | StructField('DOLocationID', IntegerType(), True) 
103 | ] 104 | ) 105 | 106 | green_16_19_schema = StructType( 107 | [ 108 | StructField('VendorID', IntegerType(), True), 109 | StructField('lpep_pickup_datetime', TimestampType(), True), 110 | StructField('lpep_dropoff_datetime', TimestampType(), True), 111 | StructField('store_and_fwd_flag', StringType(), True), 112 | StructField('RatecodeID', IntegerType(), True), 113 | StructField('PULocationID', IntegerType(), True), 114 | StructField('DOLocationID', IntegerType(), True) 115 | ] 116 | ) 117 | 118 | yellow_16_19_schema = StructType( 119 | [ 120 | StructField('VendorID', IntegerType(), True), 121 | StructField('tpep_pickup_datetime', TimestampType(), True), 122 | StructField('tpep_dropoff_datetime', TimestampType(), True), 123 | StructField('passenger_count', IntegerType(), True), 124 | StructField('trip_distance', DoubleType(), True), 125 | StructField('RatecodeID', IntegerType(), True), 126 | StructField('store_and_fwd_flag', StringType(), True), 127 | StructField('PULocationID', IntegerType(), True), 128 | StructField('DOLocationID', IntegerType(), True) 129 | ] 130 | ) 131 | -------------------------------------------------------------------------------- /src/dash/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import dash 4 | import dash_core_components as dcc 5 | import dash_html_components as html 6 | import flask 7 | import pandas as pd 8 | import plotly 9 | import plotly.graph_objects as go 10 | from config.database import py_engine 11 | from config.geometries import \ 12 | TAXI_ZONE_CENTROID_LAT, TAXI_ZONE_CENTROID_LON 13 | 14 | 15 | zones = pd.read_sql_table( 16 | table_name = 'taxi_zones', 17 | con = py_engine, 18 | schema = 'production' 19 | ) 20 | 21 | json_zones = {'type': 'FeatureCollection', 'features': []} 22 | for _, row in zones.iterrows(): 23 | feature = { 24 | 'type':'Feature', 25 | 'id': row['zone_id'], 26 | 'geometry': json.loads(row['geometry']) 27 | } 28 | json_zones['features'].append(feature) 29 | 30 | stats = pd.read_sql_table( 31 | table_name = 'all_time_stats', 32 | con = py_engine, 33 | schema = 'production' 34 | ) 35 | 36 | columns = [ 37 | 'tlc_visits', 38 | 'citibike_visits', 39 | 'citibike_stations', 40 | # 'yelp_avg_rating', 41 | # 'yelp_sum_reviews', 42 | 'yelp_weighted_sum_reviews' 43 | ] 44 | 45 | map_views = [] 46 | bar_charts = [] 47 | 48 | for column in columns: 49 | map_views.append( 50 | go.Choroplethmapbox( 51 | geojson = json_zones, 52 | locations = stats['zone_id'].tolist(), 53 | z = stats[column].tolist(), 54 | text = stats['zone_name'] + ', ' + stats['borough'], 55 | visible = False, 56 | subplot = 'mapbox', 57 | hovertemplate = '%{text}
<br>' +
58 | '%{z}
<br>' +
59 | '<extra></extra>'
60 | )
61 | )
62 | 
63 | top = stats.sort_values([column], ascending = False).head(15)
64 | bar_charts.append(
65 | go.Bar(
66 | x = top[column],
67 | y = top['zone_name'] + ', ' + top['borough'],
68 | text = top['zone_name'] + ', ' + top['borough'],
69 | textposition = 'inside',
70 | hovertemplate = '%{text}
<br>' +
71 | '%{x}
<br>' +
72 | '<extra></extra>',
73 | xaxis = 'x',
74 | yaxis = 'y',
75 | marker = dict(color = 'blue'),
76 | visible = False,
77 | name = '',
78 | orientation = 'h'
79 | )
80 | )
81 | 
82 | map_views[0]['visible'] = True
83 | bar_charts[0]['visible'] = True
84 | 
85 | fig = go.Figure(data = map_views + bar_charts)
86 | 
87 | fig.update_layout(
88 | title = dict(
89 | text = 'Where Cycle',
90 | font = dict(size = 36),
91 | x = 0.5,
92 | xanchor = 'center'
93 | ),
94 | autosize = True,
95 | height = 700,
96 | mapbox = dict(
97 | domain = dict(x = [0.25, 1], y = [0, 1]),
98 | accesstoken = os.environ['MAPBOX_ACCESS_TOKEN'],
99 | style = 'dark',
100 | center = dict(
101 | lon = TAXI_ZONE_CENTROID_LON,
102 | lat = TAXI_ZONE_CENTROID_LAT
103 | ),
104 | zoom = 9.35
105 | ),
106 | xaxis = dict(
107 | domain = [0, 0.25],
108 | anchor = 'x',
109 | showticklabels = True,
110 | showgrid = True
111 | ),
112 | yaxis = dict(
113 | domain = [0, 1],
114 | anchor = 'y',
115 | autorange = 'reversed',
116 | visible = False
117 | ),
118 | margin = dict(l = 0, r = 0, t = 70, b = 50),
119 | paper_bgcolor='black',
120 | plot_bgcolor='black'
121 | )
122 | 
123 | fig.update_layout(
124 | updatemenus = [dict(
125 | x = 0,
126 | y = 1,
127 | xanchor = 'left',
128 | yanchor = 'bottom',
129 | buttons = list([
130 | dict(
131 | args = [
132 | 'visible',
133 | [True, False, False, False] # , False, False]
134 | ],
135 | label = 'Taxi Visits',
136 | method = 'restyle'
137 | ),
138 | dict(
139 | args = [
140 | 'visible',
141 | [False, True, False, False] # , False, False]
142 | ],
143 | label = 'Citibike Visits',
144 | method = 'restyle'
145 | ),
146 | dict(
147 | args = [
148 | 'visible',
149 | [False, False, True, False] # , False, False]
150 | ],
151 | label = 'Citibike Stations',
152 | method = 'restyle'
153 | ),
154 | # dict(
155 | # args = [
156 | # 'visible',
157 | # [False, False, False, True, False, False]
158 | # ],
159 | # label = 'Yelp Average Rating',
160 | # method = 'restyle'
161 | # ),
162 | # dict(
163 | # args = [
164 | # 'visible',
165 | # [False, False, False, False, True, False]
166 | # ],
167 | # label = 'Yelp Reviews',
168 | # method = 'restyle'
169 | # ),
170 | dict(
171 | args = [
172 | 'visible',
173 | [False, False, False, True] # , False, True]
174 | ],
175 | label = 'Yelp Stars (weighted review count)',
176 | method = 'restyle'
177 | )
178 | ]),
179 | )]
180 | )
181 | 
182 | server = flask.Flask(__name__)
183 | stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
184 | 
185 | app = dash.Dash(
186 | __name__,
187 | external_stylesheets = stylesheets,
188 | server = server
189 | )
190 | 
191 | app.layout = html.Div([
192 | dcc.Location(
193 | id = 'url',
194 | pathname = '/where-cycle',
195 | refresh = False
196 | ),
197 | dcc.Graph(figure = fig),
198 | html.Div([
199 | 'Read more about this project on ',
200 | html.A(
201 | ['Github'],
202 | href = 'https://github.com/josh-lang/where-cycle'
203 | )
204 | ])
205 | ])
206 | 
207 | app.title = 'Where Cycle'
208 | 
209 | if __name__ == '__main__':
210 | app.run_server(
211 | debug = False,
212 | dev_tools_props_check = False,
213 | dev_tools_ui = False
214 | )
215 | 
--------------------------------------------------------------------------------
/src/dash/assets/background.css:
--------------------------------------------------------------------------------
1 | body {
2 | background-color: black;
3 | color: rgb(42, 63, 95);
4 | }
5 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/geo_joined/citibike_stations.sql:
-------------------------------------------------------------------------------- 1 | -- Create join table for taxi zones and Citibike stations 2 | 3 | DROP TABLE IF EXISTS geo_joined.citibike_stations; 4 | 5 | CREATE TABLE geo_joined.citibike_stations AS 6 | SELECT 7 | z.zone_id, 8 | c.station_id 9 | FROM 10 | staging.taxi_zones AS z 11 | JOIN ( 12 | SELECT 13 | station_id, 14 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS geometry 15 | FROM staging.citibike_stations 16 | ) AS c 17 | ON ST_WITHIN(c.geometry, z.geometry) 18 | GROUP BY 1, 2; 19 | -------------------------------------------------------------------------------- /src/postGIS_tables/geo_joined/past_tlc_visits.sql: -------------------------------------------------------------------------------- 1 | -- Aggregate past TLC visits by the taxi zone their coordinates are within 2 | 3 | DROP TABLE IF EXISTS geo_joined.past_tlc_visits; 4 | 5 | CREATE TABLE geo_joined.past_tlc_visits AS 6 | SELECT 7 | p.month, 8 | z.zone_id, 9 | SUM(p.visits) AS visits 10 | FROM 11 | staging.taxi_zones AS z 12 | JOIN ( 13 | SELECT 14 | month, 15 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS geometry, 16 | visits 17 | FROM staging.past_tlc_visits 18 | ) AS p 19 | ON ST_WITHIN(p.geometry, z.geometry) 20 | GROUP BY 1, 2; 21 | -------------------------------------------------------------------------------- /src/postGIS_tables/production/all_time_stats.sql: -------------------------------------------------------------------------------- 1 | -- Join Citibike, TLC, and Yelp statistics to taxi zones for Dash 2 | 3 | DROP TABLE IF EXISTS production.all_time_stats; 4 | 5 | CREATE TABLE production.all_time_stats AS 6 | SELECT 7 | v.zone_id, 8 | v.zone_name, 9 | v.borough, 10 | v.tlc_visits, 11 | v.citibike_visits, 12 | v.citibike_stations, 13 | y.avg_rating AS yelp_avg_rating, 14 | y.sum_reviews AS yelp_sum_reviews, 15 | y.weighted_sum_reviews AS yelp_weighted_sum_reviews 16 | FROM 17 | ( 18 | SELECT 19 | z.zone_id, 20 | z.zone_name, 21 | z.borough, 22 | COALESCE(SUM(t.visits), 0) AS tlc_visits, 23 | COALESCE(SUM(c.visits), 0) AS citibike_visits, 24 | COALESCE(MAX(c.stations), 0) AS citibike_stations 25 | FROM 26 | staging.taxi_zones AS z 27 | LEFT JOIN statistics.tlc_visits AS t USING (zone_id) 28 | LEFT JOIN statistics.citibike AS c 29 | ON t.zone_id = c.zone_id AND t.month = c.month 30 | GROUP BY 1, 2, 3 31 | ) AS v 32 | LEFT JOIN statistics.yelp_businesses AS y USING (zone_id) 33 | ORDER BY 1; 34 | -------------------------------------------------------------------------------- /src/postGIS_tables/production/taxi_zones.sql: -------------------------------------------------------------------------------- 1 | -- Convert taxi zone geometries to GeoJSON for Dash 2 | 3 | DROP TABLE IF EXISTS production.taxi_zones; 4 | 5 | CREATE TABLE production.taxi_zones AS 6 | SELECT 7 | zone_id, 8 | ST_ASGeoJSON(ST_ForcePolygonCW(geometry)) AS geometry 9 | FROM staging.taxi_zones 10 | ORDER BY 1; 11 | -------------------------------------------------------------------------------- /src/postGIS_tables/statistics/citibike.sql: -------------------------------------------------------------------------------- 1 | -- Aggregate Citibike visits by taxi zone 2 | -- and estimate monthly station additions with rolling maximum 3 | 4 | DROP TABLE IF EXISTS statistics.citibike; 5 | 6 | CREATE TABLE statistics.citibike AS 7 | SELECT 8 | t.month, 9 | t.zone_id, 10 | MAX(active_stations) OVER ( 11 | PARTITION BY t.zone_id 12 | ORDER BY t.month ROWS BETWEEN UNBOUNDED PRECEDING 
AND CURRENT ROW
13 | ) AS stations,
14 | visits
15 | FROM (
16 | SELECT
17 | v.month,
18 | s.zone_id,
19 | COUNT(s.station_id) AS active_stations,
20 | SUM(v.visits) AS visits
21 | FROM
22 | geo_joined.citibike_stations AS s
23 | JOIN staging.citibike_visits AS v
24 | USING (station_id)
25 | GROUP BY 1, 2
26 | ) AS t;
27 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/tlc_visits.sql:
--------------------------------------------------------------------------------
1 | -- Combine past TLC visits with modern TLC visits
2 | -- and aggregate by taxi zone ID
3 | 
4 | DROP TABLE IF EXISTS statistics.tlc_visits;
5 | 
6 | CREATE TABLE statistics.tlc_visits AS
7 | SELECT
8 | t.month,
9 | t.zone_id,
10 | SUM(t.visits) AS visits
11 | FROM (
12 | SELECT
13 | month,
14 | zone_id,
15 | visits
16 | FROM geo_joined.past_tlc_visits
17 | UNION ALL
18 | SELECT
19 | month,
20 | zone_id,
21 | visits
22 | FROM staging.modern_tlc_visits
23 | ) AS t
24 | GROUP BY 1, 2;
25 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/yelp_businesses.sql:
--------------------------------------------------------------------------------
1 | -- Aggregate Yelp business ratings and reviews by taxi zone
2 | 
3 | DROP TABLE IF EXISTS statistics.yelp_businesses;
4 | 
5 | CREATE TABLE statistics.yelp_businesses AS
6 | SELECT
7 | z.zone_id,
8 | AVG(y.rating) AS avg_rating,
9 | SUM(y.review_count) AS sum_reviews,
10 | SUM(y.review_count * y.rating) AS weighted_sum_reviews
11 | FROM
12 | staging.taxi_zones AS z
13 | JOIN staging.yelp_businesses AS y
14 | ON ST_Within(y.geometry, z.geometry)
15 | GROUP BY 1;
16 | 
--------------------------------------------------------------------------------
/src/preparation/extract.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import requests
4 | import time
5 | import zipfile
6 | import boto3
7 | import geopandas as gpd
8 | import pandas as pd
9 | 
10 | 
11 | s3 = boto3.resource('s3')
12 | 
13 | def get_taxi_zones():
14 | '''Pull taxi zone shapefile and convert to WGS 84 (EPSG:4326)'''
15 | s3.meta.client.download_file(
16 | 'nyc-tlc',
17 | 'misc/taxi_zones.zip',
18 | 'taxi_zones.zip'
19 | )
20 | taxi_zones = gpd.read_file('zip://taxi_zones.zip') \
21 | .to_crs('EPSG:4326') \
22 | .filter(
23 | [
24 | 'OBJECTID',
25 | 'zone',
26 | 'borough',
27 | 'geometry'
28 | ],
29 | axis = 1
30 | ).rename(
31 | columns = {
32 | 'OBJECTID': 'zone_id',
33 | 'zone': 'zone_name'
34 | }
35 | )
36 | os.remove('taxi_zones.zip')
37 | return taxi_zones
38 | 
39 | def get_businesses(**kwargs):
40 | '''For each taxi zone, query the Yelp API for businesses closest to centroid'''
41 | ti = kwargs['ti']
42 | centroids = ti.xcom_pull(task_ids = 'calculate_centroids')
43 | 
44 | api_key = 'Bearer ' + os.environ['YELP_API_KEY']
45 | head = {'Authorization': api_key}
46 | url = 'https://api.yelp.com/v3/businesses/search'
47 | businesses = pd.DataFrame()
48 | 
49 | for _, row in centroids.iterrows():
50 | query = {
51 | 'latitude': row['latitude'],
52 | 'longitude': row['longitude'],
53 | 'radius': 3000,
54 | 'limit': 50,
55 | 'sort_by': 'distance'
56 | }
57 | response = requests.get(url, headers = head, params = query)
58 | json = response.json()
59 | 
60 | retries = 0
61 | while retries <= 10 and 'error' in json:
62 | retries += 1
63 | time.sleep(1)
64 | response = requests.get(url, headers = head, params = query)
65 | json = response.json()
66 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/tlc_visits.sql:
--------------------------------------------------------------------------------
-- Combine past TLC visits with modern TLC visits
-- and aggregate by taxi zone ID

DROP TABLE IF EXISTS statistics.tlc_visits;

CREATE TABLE statistics.tlc_visits AS
SELECT
    t.month,
    t.zone_id,
    SUM(t.visits) AS visits
FROM (
    SELECT
        month,
        zone_id,
        visits
    FROM geo_joined.past_tlc_visits
    UNION ALL
    SELECT
        month,
        zone_id,
        visits
    FROM staging.modern_tlc_visits
) AS t
GROUP BY 1, 2;
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/yelp_businesses.sql:
--------------------------------------------------------------------------------
-- Aggregate Yelp business ratings and reviews by taxi zone

DROP TABLE IF EXISTS statistics.yelp_businesses;

CREATE TABLE statistics.yelp_businesses AS
SELECT
    z.zone_id,
    AVG(y.rating) AS avg_rating,
    SUM(y.review_count) AS sum_reviews,
    SUM(y.review_count * y.rating) AS weighted_sum_reviews
FROM
    staging.taxi_zones AS z
    JOIN staging.yelp_businesses AS y
        ON ST_Within(y.geometry, z.geometry)
GROUP BY 1;
--------------------------------------------------------------------------------
/src/preparation/extract.py:
--------------------------------------------------------------------------------
import io
import os
import requests
import time
import zipfile
import boto3
import geopandas as gpd
import pandas as pd


s3 = boto3.resource('s3')

def get_taxi_zones():
    '''Pull taxi zone shapefile and convert to WGS 84 (EPSG:4326)'''
    s3.meta.client.download_file(
        'nyc-tlc',
        'misc/taxi_zones.zip',
        'taxi_zones.zip'
    )
    taxi_zones = gpd.read_file('zip://taxi_zones.zip') \
        .to_crs('EPSG:4326') \
        .filter(
            [
                'OBJECTID',
                'zone',
                'borough',
                'geometry'
            ],
            axis = 1
        ).rename(
            columns = {
                'OBJECTID': 'zone_id',
                'zone': 'zone_name'
            }
        )
    os.remove('taxi_zones.zip')
    return taxi_zones

def get_businesses(**kwargs):
    '''For each taxi zone, query Yelp API for businesses closest to centroid'''
    ti = kwargs['ti']
    centroids = ti.xcom_pull(task_ids = 'calculate_centroids')

    api_key = 'Bearer ' + os.environ['YELP_API_KEY']
    head = {'Authorization': api_key}
    url = 'https://api.yelp.com/v3/businesses/search'
    businesses = pd.DataFrame()

    for _, row in centroids.iterrows():
        query = {
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'radius': 3000,
            'limit': 50,
            'sort_by': 'distance'
        }
        response = requests.get(url, headers = head, params = query)
        json = response.json()

        retries = 0
        while retries <= 10 and 'error' in json:
            retries += 1
            time.sleep(1)
            response = requests.get(url, headers = head, params = query)
            json = response.json()

        matches = json['businesses']
        businesses = businesses.append(matches, ignore_index = True)
    return businesses

def unzip_csvs():
    '''Iterate over relevant zipped files, unzip, and upload to private S3'''
    source = s3.Bucket('tripdata')

    for obj in source.objects.all():
        key = obj.key

        if not key.startswith('201307-201402') and key.endswith('.zip'):
            buffer = io.BytesIO(obj.get()['Body'].read())
            zipped = zipfile.ZipFile(buffer)

            for name in zipped.namelist():

                if not name.startswith('_') and name.endswith('.csv'):
                    s3.meta.client.upload_fileobj(
                        zipped.open(name),
                        Bucket = 'jlang-20b-de-ny',
                        Key = 'citibike/' + name
                    )
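if __name__ == '__main__':
    # Smoke test (illustrative, not part of the Airflow DAG): pull the public
    # taxi zone shapefile and preview the first rows. Assumes AWS credentials
    # are configured for boto3.
    print(get_taxi_zones().head())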
--------------------------------------------------------------------------------
/src/preparation/load.py:
--------------------------------------------------------------------------------
from geoalchemy2 import Geometry
from sqlalchemy import Float, Integer, String
from config.database import py_engine


def write_taxi_zones(**kwargs):
    '''Write taxi zone map to postgres'''
    ti = kwargs['ti']
    taxi_zones = ti.xcom_pull(task_ids = 'clean_taxi_zones')

    taxi_zones.to_sql(
        name = 'taxi_zones',
        con = py_engine,
        schema = 'staging',
        if_exists = 'replace',
        index = False,
        index_label = 'zone_id',
        dtype = {
            'zone_id': Integer(),
            'zone_name': String(length = 45),
            'borough': String(length = 13),
            'geometry': Geometry('MULTIPOLYGON', 4326)
        }
    )

def write_businesses(**kwargs):
    '''Write Yelp business data to postgres for further processing'''
    ti = kwargs['ti']
    businesses = ti.xcom_pull(task_ids = 'clean_businesses')

    businesses.to_sql(
        name = 'yelp_businesses',
        con = py_engine,
        schema = 'staging',
        if_exists = 'replace',
        index = False,
        index_label = 'business_id',
        dtype = {
            'business_id': String(22),
            'review_count': Integer(),
            'rating': Float(),
            'geometry': Geometry('POINT', 4326)
        }
    )
--------------------------------------------------------------------------------
/src/preparation/transform.py:
--------------------------------------------------------------------------------
import geopandas as gpd
import pandas as pd
from geoalchemy2 import WKTElement
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.polygon import Polygon


def clean_taxi_zones(**kwargs):
    '''Make geometry column consistent for writing to postgres'''
    ti = kwargs['ti']
    taxi_zones = ti.xcom_pull(task_ids = 'get_taxi_zones')

    def homogenize(geometry):
        '''
        Convert any Polygon to a MultiPolygon,
        then convert either to a WKTElement
        '''
        multi = MultiPolygon([geometry]) if type(geometry) == Polygon else geometry
        return WKTElement(multi.wkt, srid = 4326)

    taxi_zones['geometry'] = taxi_zones['geometry'].apply(homogenize)
    return taxi_zones

def calculate_centroids(**kwargs):
    '''Calculate centroids for each taxi zone and extract lat-lons'''
    ti = kwargs['ti']
    taxi_zones = ti.xcom_pull(task_ids = 'get_taxi_zones')

    centroids = pd.DataFrame.from_dict({
        'latitude': taxi_zones['geometry'].centroid.y,
        'longitude': taxi_zones['geometry'].centroid.x
    })
    return centroids

def clean_businesses(**kwargs):
    '''
    Drop invalid and duplicated businesses,
    unnest lat-lons, & combine into geometry column
    '''
    ti = kwargs['ti']
    businesses = ti.xcom_pull(task_ids = 'get_businesses')

    businesses.drop(
        businesses[businesses.distance > 3000].index,
        inplace = True
    )
    # Keep each business's nearest occurrence only; the chained result must
    # be reassigned, since none of these methods act in place
    businesses = businesses.sort_values('distance') \
        .drop_duplicates('id', keep = 'first') \
        .sort_index()
    businesses.reset_index(
        drop = True,
        inplace = True
    )

    business_coordinates = pd.json_normalize(businesses.coordinates)
    business_coordinates.dropna(how = 'any', inplace = True)

    businesses_flat = businesses.join(business_coordinates, how = 'inner')
    businesses_flat.reset_index(drop = True, inplace = True)

    businesses_geo = gpd.GeoDataFrame(
        businesses_flat,
        geometry = gpd.points_from_xy(
            businesses_flat.longitude,
            businesses_flat.latitude
        )
    )
    businesses_geo['geometry'] = businesses_geo.geometry.apply(
        lambda point: WKTElement(point.wkt, srid = 4326)
    )

    businesses_writable = businesses_geo.filter(
        [
            'id',
            'review_count',
            'rating',
            'geometry'
        ],
        axis = 1
    )
    return businesses_writable
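if __name__ == '__main__':
    # Minimal illustration of the normalization in homogenize() (not part of
    # the Airflow DAG): a plain Polygon becomes a single-member MultiPolygon
    # before being wrapped as a WKTElement with SRID 4326.
    square = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
    multi = MultiPolygon([square])
    print(multi.wkt)  # e.g. MULTIPOLYGON (((0 0, 1 0, 1 1, 0 1, 0 0)))
    element = WKTElement(multi.wkt, srid = 4326)
    print(element.srid)  # 4326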
--------------------------------------------------------------------------------
/src/spark_reduction/driver.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from spark_reduction.extract import \
    get_citibike_trips, get_past_tlc_trips, get_modern_tlc_trips
from spark_reduction.transform import \
    distill_citibike_stations, aggregate_citibike_visits, \
    aggregate_past_tlc_visits, aggregate_modern_tlc_visits
from spark_reduction.load import \
    write_citibike, write_tlc


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

# Parse CSVs from S3 and cache tables
get_citibike_trips()
get_past_tlc_trips()
get_modern_tlc_trips()

# Reduce tables to meaningful dataframes
stations = distill_citibike_stations()
citibike_visits = aggregate_citibike_visits()
past_visits = aggregate_past_tlc_visits()
modern_visits = aggregate_modern_tlc_visits()

# Write dataframes to postgres
write_citibike(stations, citibike_visits)
write_tlc(past_visits, modern_visits)

spark.stop()
--------------------------------------------------------------------------------
/src/spark_reduction/extract.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, regexp_extract
from config.schemas import *


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

def get_citibike_trips():
    '''Parse Citibike CSVs, format date columns, & rename location columns'''
    citibike_df = spark.read.csv(
        path = 's3a://jlang-20b-de-ny/citibike/*.csv',
        schema = citibike_schema,
        header = True,
        ignoreLeadingWhiteSpace = True,
        ignoreTrailingWhiteSpace = True
    ).withColumnRenamed('start station id', 'start_id') \
        .withColumnRenamed('start station latitude', 'start_latitude') \
        .withColumnRenamed('start station longitude', 'start_longitude') \
        .withColumnRenamed('end station id', 'end_id') \
        .withColumnRenamed('end station latitude', 'end_latitude') \
        .withColumnRenamed('end station longitude', 'end_longitude') \
        .selectExpr(
            'DATE_FORMAT(starttime, "yyyy-MM") AS start_month',
            'DATE_FORMAT(stoptime, "yyyy-MM") AS end_month',
            'start_id',
            'start_latitude',
            'start_longitude',
            'end_id',
            'end_latitude',
            'end_longitude'
        )
    citibike_df.createOrReplaceTempView('citibike')
    spark.catalog.cacheTable('citibike')

def parse_tlc(path, schema):
    '''Parse TLC CSVs, assuming trip month from filename'''
    tlc_df = spark.read.csv(
        path = path,
        schema = schema,
        header = True,
        ignoreLeadingWhiteSpace = True,
        ignoreTrailingWhiteSpace = True
    ).withColumn(
        'month',
        regexp_extract(
            input_file_name(),
            r'tripdata_(\d{4}-\d{2})\.csv',
            1
        )
    )
    return tlc_df

def get_past_tlc_trips():
    '''Parse TLC CSVs from before 2016-07, filtering for lat-lon columns'''
    past_df = spark.createDataFrame(data = [], schema = past_schema)
    past_pairs = [
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_201[345]-*.csv',
            green_13_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_2016-0[1-6].csv',
            green_13_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2009-*.csv',
            yellow_09_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_201[0-5]-*.csv',
            yellow_09_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-0[1-6].csv',
            yellow_09_16_schema
        )
    ]
    for path, schema in past_pairs:
        csv_df = parse_tlc(path, schema).select(
            'month',
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude'
        )
        past_df = past_df.union(csv_df)
    past_df.createOrReplaceTempView('past')
    spark.catalog.cacheTable('past')

def get_modern_tlc_trips():
    '''Parse TLC CSVs from after 2016-06, filtering for taxi zone ID columns'''
    fhv_15_16_df = parse_tlc(
        's3a://nyc-tlc/trip\ data/fhv_tripdata_201[56]-*.csv',
        fhv_15_16_schema
    ).select(
        'month',
        'locationID'
    )
    fhv_15_16_df.createOrReplaceTempView('fhv_15_16')
    spark.catalog.cacheTable('fhv_15_16')

    modern_df = spark.createDataFrame(data = [], schema = modern_schema)
    modern_pairs = [
        (
            's3a://nyc-tlc/trip\ data/fhv_tripdata_201[79]-*.csv',
            fhv_17_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/fhv_tripdata_2018-*.csv',
            fhv_18_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/fhvhv_tripdata_*.csv',
            fhvhv_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_2016-0[789].csv',
            green_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_2016-1*.csv',
            green_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_201[789]-*.csv',
            green_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-0[789].csv',
            yellow_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-1*.csv',
            yellow_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_201[789]-*.csv',
            yellow_16_19_schema
        )
    ]
    for path, schema in modern_pairs:
        csv_df = parse_tlc(path, schema).select(
            'month',
            'PULocationID',
            'DOLocationID'
        )
        modern_df = modern_df.union(csv_df)
    modern_df.createOrReplaceTempView('modern')
    spark.catalog.cacheTable('modern')
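# Quick check of the month-extraction regex in parse_tlc (illustrative only):
# Spark applies a Java regex to each filename, but this pattern behaves the
# same under Python's re module:
#
#   >>> import re
#   >>> re.search(
#   ...     r'tripdata_(\d{4}-\d{2})\.csv', 'yellow_tripdata_2016-07.csv'
#   ... ).group(1)
#   '2016-07'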
--------------------------------------------------------------------------------
/src/spark_reduction/load.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from config.database import jdbc_props, jdbc_url


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

def write_citibike(stations, visits):
    '''Write Citibike stations & visits to postgres, then uncache trips'''
    stations.write.jdbc(
        url = jdbc_url,
        table = 'staging.citibike_stations',
        mode = 'overwrite',
        properties = jdbc_props
    )
    visits.write.jdbc(
        url = jdbc_url,
        table = 'staging.citibike_visits',
        mode = 'overwrite',
        properties = jdbc_props
    )
    spark.catalog.uncacheTable('citibike')

def write_tlc(past, modern):
    '''Write past & modern TLC visits to postgres, then uncache trips'''
    past.write.jdbc(
        url = jdbc_url,
        table = 'staging.past_tlc_visits',
        mode = 'overwrite',
        properties = jdbc_props
    )
    spark.catalog.uncacheTable('past')

    modern.write.jdbc(
        url = jdbc_url,
        table = 'staging.modern_tlc_visits',
        mode = 'overwrite',
        properties = jdbc_props
    )
    spark.catalog.uncacheTable('fhv_15_16')
    spark.catalog.uncacheTable('modern')
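# Note (illustrative, not part of the pipeline): mode = 'overwrite' drops and
# recreates each target table. If the staging tables ever carry indexes or
# grants worth keeping, Spark's JDBC 'truncate' option replaces rows without
# dropping the table. Assuming jdbc_props is a plain dict, e.g.:
#
#   visits.write.jdbc(
#       url = jdbc_url,
#       table = 'staging.citibike_visits',
#       mode = 'overwrite',
#       properties = {**jdbc_props, 'truncate': 'true'}
#   )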
--------------------------------------------------------------------------------
/src/spark_reduction/transform.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from config.geometries import \
    TAXI_ZONE_LAT_MIN, TAXI_ZONE_LAT_MAX, \
    TAXI_ZONE_LON_MIN, TAXI_ZONE_LON_MAX


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

def distill_citibike_stations():
    '''Create list of unique Citibike stations across all trip endpoints'''
    stations_df = spark.sql(f'''
        SELECT
            start_id AS station_id,
            start_latitude AS latitude,
            start_longitude AS longitude
        FROM citibike
        WHERE
            start_latitude BETWEEN
                {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            AND
            start_longitude BETWEEN
                {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
        GROUP BY 1, 2, 3
        UNION
        SELECT
            end_id AS station_id,
            end_latitude AS latitude,
            end_longitude AS longitude
        FROM citibike
        WHERE
            end_latitude BETWEEN
                {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            AND
            end_longitude BETWEEN
                {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
        GROUP BY 1, 2, 3'''.translate({ord(c): ' ' for c in '\n\t'})
    )
    return stations_df

def aggregate_citibike_visits():
    '''Convert Citibike trips to visits and sum by station_id'''
    visits_df = spark.sql('''
        SELECT
            month,
            station_id,
            SUM(visits) AS visits
        FROM (
            SELECT
                start_month AS month,
                start_id AS station_id,
                COUNT(*) AS visits
            FROM citibike
            GROUP BY 1, 2
            UNION ALL
            SELECT
                end_month AS month,
                end_id AS station_id,
                COUNT(*) AS visits
            FROM citibike
            GROUP BY 1, 2
        )
        GROUP BY 1, 2
    ''')
    return visits_df

def aggregate_past_tlc_visits():
    '''
    Convert past TLC trips to visits,
    round lat-lon precision to street level,
    and sum by lat-lon
    '''
    past_df = spark.sql(f'''
        SELECT
            month,
            longitude,
            latitude,
            SUM(visits) AS visits
        FROM (
            SELECT
                month,
                ROUND(pickup_longitude, 3) AS longitude,
                ROUND(pickup_latitude, 3) AS latitude,
                COUNT(*) AS visits
            FROM past
            WHERE
                pickup_longitude BETWEEN
                    {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
                AND
                pickup_latitude BETWEEN
                    {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            GROUP BY 1, 2, 3
            UNION ALL
            SELECT
                month,
                ROUND(dropoff_longitude, 3) AS longitude,
                ROUND(dropoff_latitude, 3) AS latitude,
                COUNT(*) AS visits
            FROM past
            WHERE
                dropoff_longitude BETWEEN
                    {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
                AND
                dropoff_latitude BETWEEN
                    {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            GROUP BY 1, 2, 3
        )
        GROUP BY 1, 2, 3'''.translate({ord(c): ' ' for c in '\n\t'})
    )
    return past_df

def aggregate_modern_tlc_visits():
    '''
    Convert modern TLC trips to visits,
    ignoring unknown taxi zone IDs,
    and sum by taxi zone ID
    '''
    modern_df = spark.sql('''
        SELECT
            month,
            zone_id,
            SUM(visits) AS visits
        FROM (
            SELECT
                month,
                locationID AS zone_id,
                COUNT(*) AS visits
            FROM fhv_15_16
            WHERE locationID BETWEEN 1 AND 263
            GROUP BY 1, 2
            UNION ALL
            SELECT
                month,
                PULocationID AS zone_id,
                COUNT(*) AS visits
            FROM modern
            WHERE PULocationID BETWEEN 1 AND 263
            GROUP BY 1, 2
            UNION ALL
            SELECT
                month,
                DOLocationID AS zone_id,
                COUNT(*) AS visits
            FROM modern
            WHERE DOLocationID BETWEEN 1 AND 263
            GROUP BY 1, 2
        )
        GROUP BY 1, 2
    ''')
    return modern_df
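# Notes on the filters above (illustrative): ROUND(longitude/latitude, 3)
# buckets coordinates to 0.001 degrees (roughly 100 m, about a city block),
# so past trips collapse to street-level points before the PostGIS zone join.
# The BETWEEN 1 AND 263 predicates keep only real NYC taxi zone IDs; in the
# TLC lookup table, 264 and 265 mark unknown or out-of-town locations.
--------------------------------------------------------------------------------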