├── .gitignore
├── LICENSE
├── README.md
├── dag.png
├── pipeline.png
├── requirements.txt
└── src
    ├── airflow
    │   ├── start_workers.sh
    │   ├── stop_workers.sh
    │   └── where_cycle_dag.py
    ├── config
    │   ├── database.py
    │   ├── geometries.py
    │   ├── ref
    │   │   ├── check_citibike_schema.py
    │   │   ├── check_tlc_schemas.py
    │   │   ├── get_geometries.sql
    │   │   └── tlc_schemas.txt
    │   └── schemas.py
    ├── dash
    │   ├── app.py
    │   └── assets
    │       └── background.css
    ├── postGIS_tables
    │   ├── geo_joined
    │   │   ├── citibike_stations.sql
    │   │   └── past_tlc_visits.sql
    │   ├── production
    │   │   ├── all_time_stats.sql
    │   │   └── taxi_zones.sql
    │   └── statistics
    │       ├── citibike.sql
    │       ├── tlc_visits.sql
    │       └── yelp_businesses.sql
    ├── preparation
    │   ├── extract.py
    │   ├── load.py
    │   └── transform.py
    └── spark_reduction
        ├── driver.py
        ├── extract.py
        ├── load.py
        └── transform.py

/.gitignore:
--------------------------------------------------------------------------------
1 | # Temporary resources
2 | benchmark/
3 | dash_project_medium.py
4 | spark-warehouse/
5 | 
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 | 
11 | # C extensions
12 | *.so
13 | 
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 | 
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 | 
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 | 
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 | 
59 | # Translations
60 | *.mo
61 | *.pot
62 | 
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 | 
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 | 
73 | # Scrapy stuff:
74 | .scrapy
75 | 
76 | # Sphinx documentation
77 | docs/_build/
78 | 
79 | # PyBuilder
80 | target/
81 | 
82 | # Jupyter Notebook
83 | *.ipynb
84 | .ipynb_checkpoints/
85 | 
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 | 
90 | # pyenv
91 | .python-version
92 | 
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 | 
100 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow 101 | __pypackages__/ 102 | 103 | # Celery stuff 104 | celerybeat-schedule 105 | celerybeat.pid 106 | 107 | # SageMath parsed files 108 | *.sage.py 109 | 110 | # Environments 111 | where-cycle-env/ 112 | .env 113 | .venv 114 | env/ 115 | venv/ 116 | ENV/ 117 | env.bak/ 118 | venv.bak/ 119 | 120 | # Spyder project settings 121 | .spyderproject 122 | .spyproject 123 | 124 | # Rope project settings 125 | .ropeproject 126 | 127 | # mkdocs documentation 128 | /site 129 | 130 | # mypy 131 | .mypy_cache/ 132 | .dmypy.json 133 | dmypy.json 134 | 135 | # Pyre type checker 136 | .pyre/ 137 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2020 Josh Lang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ![Python 3.6.9](https://img.shields.io/badge/Python-3.6.9-light) 2 | ![Airflow 1.10.11](https://img.shields.io/badge/Airflow-1.10.11-red) 3 | ![GeoPandas 0.7](https://img.shields.io/badge/GeoPandas-0.7-purple) 4 | ![Spark 2.4.5](https://img.shields.io/badge/Spark-2.4.5-orange) 5 | ![PostGIS 2.4](https://img.shields.io/badge/PostGIS-2.4-darkblue) 6 | ![Dash 1.12](https://img.shields.io/badge/Dash-1.12-blue) 7 | ![MIT License](https://img.shields.io/badge/License-MIT-lightgrey) 8 | # Where Cycle 9 | 10 | *Getting New Yorkers Back to Business, Safely* 11 | 12 | ## Contents 13 | 1. [Purpose](README.md#purpose) 14 | 1. [Pipeline](README.md#pipeline) 15 | 1. [Summary](README.md#summary) 16 | - [Data](README.md#data) 17 | - [Preparation](README.md#preparation) 18 | - [Spark Reduction](README.md#spark-reduction) 19 | - [PostGIS Tables](README.md#postgis-tables) 20 | - [Dash & Airflow](README.md#dash-and-airflow) 21 | 1. [Spark Optimization](README.md#spark-optimization) 22 | 1. [Setup](README.md#setup) 23 | 1. [Directory Structure](README.md#directory-structure) 24 | 1. [License](README.md#license) 25 | 26 | ## Purpose 27 | As health officials advised social distancing and businesses closed earlier this year, subway and bus ridership plummeted in many large cities. New York saw an almost 90% reduction by late April. 
Now, as the city is tentatively opening back up, people may be looking to return to their places of work and to support their favorite businesses, but they might be hesitant to utilize public transit, instead seeking open-air alternatives. 28 | 29 | A cursory glance at some transit coverage in NYC makes it clear that, while Citibike is an awesome open-air solution, the available stations can’t immediately meet the needs of the outer boroughs: some expansion is required. **The goal of this pipeline is to synthesize data that may help city planners and Citibike analysts determine which areas could be ideal for Citibike expansion. As an initial step toward that end, it aggregates historical taxi & for-hire vehicle trips, Citibike trips & station density, and business review statistics by taxi zone.** 30 | 31 | *This project was developed by Josh Lang as part of his data engineering fellowship with Insight Data Science in the summer of 2020.* 32 | 33 | ## Pipeline 34 | ![Pipeline](https://github.com/josh-lang/where-cycle/blob/master/pipeline.png)
35 | ![DAG](https://github.com/josh-lang/where-cycle/blob/master/dag.png) 36 | 37 | ## Summary 38 | If you'd prefer to jump right in and start clicking into the functions from that DAG above, then the file that produced it is [here](https://github.com/josh-lang/where-cycle/blob/master/src/airflow/where_cycle_dag.py). Since you can't navigate directly to everything from there, you may also find a glance at the [directory structure](README.md#directory-structure) below handy. 39 | 40 | ### Data 41 | - Citibike Trip Histories: [S3 bucket](https://s3.console.aws.amazon.com/s3/buckets/tripdata), [documentation](https://www.citibikenyc.com/system-data) 42 | - NYC Taxi & Limousine Commission Trip Records: [S3 bucket](https://s3.console.aws.amazon.com/s3/buckets/nyc-tlc), [documentation](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) 43 | - Yelp Business Search API: [documentation](https://www.yelp.com/developers/documentation/v3/business_search) 44 | 45 | ### Preparation 46 | - In order to index everything by taxi zone, NYC TLC's shapefile needs to be pulled down from S3, processed, and saved to PostgreSQL 47 | - Coordinate reference system is converted from NAD83 to WGS84 48 | - Each polygon is replaced with its equivalent multipolygon 49 | - All geometries are converted to well-known text 50 | - Centroids are then calculated for each taxi zone and used to query Yelp's API, requesting the 50 nearest businesses. These are cleaned and written as well 51 | - Invalid results and duplicates are removed 52 | - Coordinates are unnested and combined into point geometries 53 | - Like with taxi zones, geometries are converted to well-known text 54 | - Citibike's zipped files need to be pulled out of S3, unzipped, and sent back to another S3 bucket before batch processing since Spark can't ingest zip files natively 55 | - This is because Hadoop, which provides its underlying filesystem interface, does not support that compression codec 56 | - Python's `io.BytesIO` class reads S3's *bytes-like objects* and makes this a quick streaming process 57 | 58 | ### Spark Reduction 59 | - Spark can read csv files directly via the s3a connector for Hadoop, and multiple URIs can be specified with globbing 60 | - Citibike's trip data is consistent, so parsing all of it requires only one path and one schema definition 61 | - That schema can be truncated because this project isn't concerned with any columns that appear after trip dates and endpoint locations 62 | - TLC data is messier with 15 distinct csv headers over the corpus, but 10 truncated schemas are sufficient for pulling everything in correctly 63 | - TLC trips before 2016-07 use coordinates for pickup and dropoff locations, while trips after 2016-06 use taxi zone IDs 64 | - TLC's timestamps aren't always valid, so schemas are simplified further by not including those. Dates are instead assumed from csv filenames, which represent each month of trips 65 | - Relevant columns are selected from csvs, and then they're unioned together into 4 cached tables: Citibike trips, past TLC trips, modern TLC trips, and a small table for just the earliest for-hire vehicle trips 66 | - To aggregate visits by taxi zone, trip beginnings and endings need to be combined into endpoints and grouped by location. 
Four tables are created in PostgreSQL:
67 | - Coordinates for unique Citibike stations within the taxi zone map's extent are pulled out separately from visit aggregation
68 | - Citibike visits are then aggregated by station ID
69 | - Past TLC visits are aggregated by coordinates within taxi zone extent rounded to 3 decimal places — neighborhood resolution
70 | - Modern TLC visits and those early for-hire vehicle visits are aggregated simply by taxi zone ID
71 | 
72 | ### PostGIS Tables
73 | - All tables so far have been written to the *staging* schema in PostgreSQL. Now that everything's there, some final processing with the PostGIS extension can be done
74 | - *geo_joined* schema
75 | - Citibike station coordinates are matched to taxi zone polygons to create a join table for Citibike visits
76 | - Past TLC visits are aggregated by the taxi zone their coordinates are within
77 | - *statistics* schema
78 | - Citibike stations and trips are aggregated by taxi zone using the join table
79 | - Past TLC visits are unioned and summed with modern TLC visits using taxi zone IDs
80 | - Yelp business ratings and reviews are aggregated by the taxi zone their coordinates are within
81 | - *production* schema
82 | - Taxi zone geometries are converted to GeoJSON for Dash to plot on choropleth maps
83 | - Citibike, TLC, and Yelp statistics are joined to taxi zone dimensions for Dash to define toggleable scales
84 | 
85 | ### Dash and Airflow
86 | - A rudimentary dashboard built with Dash lives at [dats.work/where-cycle](http://dats.work/where-cycle)
87 | - GeoJSON geometries from PostGIS need to be wrapped as a GeoJSON Feature Collection inside the Dash app to be plotted on choropleth maps
88 | - Statistics from PostGIS define the choropleth map scales and are also used to create a supplementary bar chart of the top 15 taxi zones for whichever metric is selected
89 | - Airflow adds some fault tolerance and runs the pipeline on a regular basis to keep data up-to-date
90 | - Dependencies between tasks prevent things from running out of order or unnecessarily when an upstream task has failed
91 | - The pipeline runs every week so that Yelp has enough time to update meaningfully and so that Citibike and TLC updates can be captured with relatively minimal delay
92 | - Both Citibike and TLC batch their trip data by month, but the date they update their S3 buckets isn't consistent
93 | - Yelp's data is queried directly from their API and may return updated or simply different results each time
94 | - Startup and shutdown of the standalone Spark cluster are automated within the pipeline to save money
95 | 
96 | ## Spark Optimization
97 | I tested a handful of methods and configuration changes trying to make the Spark piece of the pipeline run more efficiently. First, since I had already defined each TLC schema while taking my initial stab at ingestion, I wanted to see whether those explicit definitions were, in fact, significantly faster than just using Spark's `inferSchema` option. Defining schemas before reading files was faster (as expected), but it only reduced total runtime by **~2.1%**.
98 | 
99 | The most dramatic improvement came with caching each table of source CSVs before running the Spark SQL queries that transform them. This increased my total runtime savings to **~32.9%**!
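To make that concrete, here's a minimal sketch of the caching pattern, assuming a hypothetical bucket path and a truncated three-column schema in place of this project's real sources:

```python
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

spark = SparkSession.builder.appName('cache-before-sql').getOrCreate()

# An explicit, truncated schema skips the extra pass over the data
# that inferSchema would otherwise trigger
schema = StructType([
    StructField('month', StringType(), True),
    StructField('pickup_longitude', DoubleType(), True),
    StructField('pickup_latitude', DoubleType(), True)
])

# Read the whole corpus with one globbed path (hypothetical bucket),
# register it as a SQL view, and cache it so each query below scans
# the in-memory columnar copy instead of re-parsing the CSVs from S3
trips = spark.read.csv('s3a://example-bucket/trips/*.csv', schema = schema, header = True)
trips.createOrReplaceTempView('trips')
spark.catalog.cacheTable('trips')

# The first query materializes the cache; later ones reuse it
visits = spark.sql('SELECT month, COUNT(*) AS visits FROM trips GROUP BY month')
extent = spark.sql('SELECT MIN(pickup_latitude), MAX(pickup_latitude) FROM trips')
```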
100 | 101 | After that, I found that lowering the number of shuffle partitions so that it matched the number of cores in my small cluster and doubling the maximum bytes in cached storage batches and in each partition could make things even faster, but only by so much. Changing these settings in my `spark-defaults.conf` file brought total runtime reduction to **~36.6%**: 102 | | Property | Setting | 103 | | -------- | ------- | 104 | | spark.sql.files.maxPartitionBytes | 268435456 | 105 | | spark.sql.inMemoryColumnarStorage.batchSize | 20000 | 106 | | spark.sql.inMemoryColumnarStorage.compressed | true | 107 | | spark.sql.shuffle.partitions | 12 | 108 | 109 | ## Setup 110 | Python dependencies can be installed with the following command: 111 | ```sh 112 | pip install -r requirements.txt 113 | ``` 114 | 115 | This project was built using an Apache Spark 2.4.5 / Hadoop 2.7 binary downloaded from [spark.apache.org](https://spark.apache.org/downloads.html). It reads from AWS S3 and writes to PostgreSQL, so a driver from [jdbc.postgresql.org](https://jdbc.postgresql.org) should be placed in `spark/jars/` and some configuration should be added to `spark-defaults.conf`: 116 | | Property | Setting | 117 | | -------- | ------- | 118 | | spark.driver.extraClassPath | /usr/local/spark/jars/postgresql-42.2.14.jar | 119 | | spark.driver.extraJavaOptions | -Dcom.amazonaws.services.s3.enableV4=true | 120 | | spark.executor.extraJavaOptions | -Dcom.amazonaws.services.s3.enableV4=true | 121 | | spark.hadoop.fs.s3a.awsAccessKeyId | $AWS_ACCESS_KEY_ID | 122 | | spark.hadoop.fs.s3a.awsSecretAccessKey | $AWS_SECRET_ACCESS_KEY | 123 | | spark.hadoop.fs.s3a.endpoint | $AWS_S3_ENDPOINT | 124 | | spark.hadoop.com.amazonaws.services.s3a.enableV4 | true | 125 | | spark.hadoop.fs.s3a.impl | org.apache.hadoop.fs.s3a.S3AFileSystem | 126 | | spark.jars | /usr/local/spark/jars/postgresql-42.2.14.jar | 127 | | spark.jars.packages | com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.7 | 128 | 129 | This project also depends on PostgreSQL's PostGIS extension, which can be installed with the `CREATE EXTENSION` command: 130 | ```sh 131 | psql -d yourdatabase -c 'CREATE EXTENSION postgis;' 132 | ``` 133 | 134 | ## Directory Structure 135 | ```sh 136 | . 137 | ├── LICENSE 138 | ├── README.md 139 | ├── dag.png 140 | ├── pipeline.png 141 | ├── requirements.txt 142 | └── src/ 143 | ├── airflow/ 144 | │ ├── start_workers.sh* 145 | │ ├── stop_workers.sh* 146 | │ └── where_cycle_dag.py 147 | ├── config/ 148 | │ ├── database.py 149 | │ ├── geometries.py 150 | │ ├── ref/ 151 | │ │ ├── check_citibike_schema.py 152 | │ │ ├── check_tlc_schemas.py 153 | │ │ ├── get_geometries.sql 154 | │ │ └── tlc_schemas.txt 155 | │ └── schemas.py 156 | ├── dash/ 157 | │ ├── app.py 158 | │ └── assets/ 159 | │ └── background.css 160 | ├── postGIS_tables/ 161 | │ ├── geo_joined/ 162 | │ │ ├── citibike_stations.sql 163 | │ │ └── past_tlc_visits.sql 164 | │ ├── production/ 165 | │ │ ├── all_time_stats.sql 166 | │ │ └── taxi_zones.sql 167 | │ └── statistics/ 168 | │ ├── citibike.sql 169 | │ ├── tlc_visits.sql 170 | │ └── yelp_businesses.sql 171 | ├── preparation/ 172 | │ ├── extract.py 173 | │ ├── load.py 174 | │ └── transform.py 175 | └── spark_reduction/ 176 | ├── driver.py 177 | ├── extract.py 178 | ├── load.py 179 | └── transform.py 180 | ``` 181 | 182 | ## License 183 | [MIT License](LICENSE)
184 | Copyright (c) 2020 Josh Lang 185 | -------------------------------------------------------------------------------- /dag.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josh-lang/where-cycle/e11283acf13221f91b45baba12a08816def2a7fd/dag.png -------------------------------------------------------------------------------- /pipeline.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/josh-lang/where-cycle/e11283acf13221f91b45baba12a08816def2a7fd/pipeline.png -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | alembic==1.4.2 2 | apache-airflow==2.9.3 3 | apispec==1.3.3 4 | argcomplete==1.12.0 5 | attrs==19.3.0 6 | Babel==2.9.1 7 | backcall==0.1.0 8 | bleach==3.3.0 9 | boto3==1.13.23 10 | botocore==1.16.23 11 | Brotli==1.0.7 12 | cached-property==1.5.1 13 | cattrs==1.0.0 14 | certifi==2024.7.4 15 | chardet==3.0.4 16 | click==7.1.2 17 | click-plugins==1.1.1 18 | cligj==0.5.0 19 | colorama==0.4.3 20 | colorlog==4.0.2 21 | configparser==3.5.3 22 | croniter==0.3.34 23 | dash-core-components==2.0.0 24 | dash-html-components==2.0.0 25 | dash-renderer==1.4.1 26 | dash-table==4.7.0 27 | decorator==4.4.2 28 | defusedxml==0.6.0 29 | dill==0.3.2 30 | dnspython==2.6.1 31 | docutils==0.16 32 | email-validator==1.1.1 33 | entrypoints==0.3 34 | findspark==1.4.1 35 | Fiona==1.8.13.post1 36 | Flask==2.3.2 37 | Flask-Admin==1.5.4 38 | Flask-AppBuilder==4.3.11 39 | Flask-Babel==1.0.0 40 | Flask-Caching==1.11.0 41 | Flask-Compress==1.5.0 42 | Flask-JWT-Extended==3.24.1 43 | Flask-Login==0.4.1 44 | Flask-OpenID==1.2.5 45 | Flask-SQLAlchemy==2.4.4 46 | flask-swagger==0.2.13 47 | Flask-WTF==0.14.3 48 | funcsigs==1.0.2 49 | future==0.18.3 50 | GeoAlchemy2==0.8.3 51 | geopandas==0.7.0 52 | graphviz==0.14.1 53 | gunicorn==22.0.0 54 | idna==3.7 55 | importlib-metadata==1.7.0 56 | ipykernel==5.3.0 57 | ipython==8.10.0 58 | ipython-genutils==0.2.0 59 | ipywidgets==7.5.1 60 | iso8601==0.1.12 61 | itsdangerous==1.1.0 62 | jedi==0.17.0 63 | Jinja2==3.1.4 64 | jmespath==0.10.0 65 | json-merge-patch==0.2 66 | jsonschema==3.2.0 67 | jupyter==1.0.0 68 | jupyter-client==6.1.3 69 | jupyter-console==6.1.0 70 | jupyter-core==4.11.2 71 | lazy-object-proxy==1.5.1 72 | lockfile==0.12.2 73 | Mako==1.2.2 74 | Markdown==2.6.11 75 | MarkupSafe==1.1.1 76 | marshmallow==2.21.0 77 | marshmallow-enum==1.5.1 78 | marshmallow-sqlalchemy==0.23.1 79 | mistune==2.0.3 80 | munch==2.5.0 81 | natsort==7.0.1 82 | nbconvert==6.5.1 83 | nbformat==5.0.6 84 | notebook==6.4.12 85 | numpy==1.22.0 86 | packaging==20.4 87 | pandas==1.0.5 88 | pandocfilters==1.4.2 89 | parso==0.7.0 90 | pendulum==1.4.4 91 | pexpect==4.8.0 92 | pickleshare==0.7.5 93 | pip-autoremove==0.9.1 94 | pkg-resources==0.0.0 95 | plotly==4.8.1 96 | prison==0.1.3 97 | prometheus-client==0.8.0 98 | prompt-toolkit==3.0.5 99 | psutil==5.7.2 100 | psycopg2-binary==2.8.5 101 | ptyprocess==0.6.0 102 | Pygments==2.15.0 103 | PyJWT==2.4.0 104 | pyparsing==2.4.7 105 | pyproj==2.6.1.post1 106 | pyrsistent==0.16.0 107 | python-daemon==2.2.4 108 | python-dateutil==2.8.1 109 | python-editor==1.0.4 110 | python-nvd3==0.15.0 111 | python-slugify==4.0.1 112 | python3-openid==3.2.0 113 | pytz==2020.1 114 | pytzdata==2020.1 115 | PyYAML==5.4 116 | pyzmq==19.0.1 117 | qtconsole==4.7.4 118 | QtPy==1.9.0 119 | requests==2.32.0 120 | 
retrying==1.3.3 121 | s3transfer==0.3.3 122 | Send2Trash==1.5.0 123 | setproctitle==1.1.10 124 | Shapely==1.7.0 125 | six==1.15.0 126 | SQLAlchemy==1.3.18 127 | SQLAlchemy-JSONField==0.9.0 128 | SQLAlchemy-Utils==0.36.8 129 | tabulate==0.8.7 130 | tenacity==4.12.0 131 | termcolor==1.1.0 132 | terminado==0.8.3 133 | testpath==0.4.4 134 | text-unidecode==1.3 135 | thrift==0.13.0 136 | tornado==6.4.1 137 | traitlets==4.3.3 138 | typing==3.7.4.3 139 | typing-extensions==3.7.4.2 140 | tzlocal==1.5.1 141 | unicodecsv==0.14.1 142 | urllib3==1.26.19 143 | wcwidth==0.2.3 144 | webencodings==0.5.1 145 | Werkzeug==3.0.3 146 | widgetsnbextension==3.5.1 147 | WTForms==2.3.1 148 | zipp==3.19.1 149 | zope.deprecation==4.4.0 150 | -------------------------------------------------------------------------------- /src/airflow/start_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Send start command to spark_worker instances, wait for all three 4 | # to reach 'running' state, sleep for 15 more seconds just to be 5 | # safe, and then launch spark 6 | 7 | set -e 8 | 9 | aws ec2 start-instances --instance-ids \ 10 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID && 11 | aws ec2 wait instance-running --instance-ids \ 12 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID && 13 | sleep 15 && 14 | /usr/local/spark/sbin/start-all.sh 15 | -------------------------------------------------------------------------------- /src/airflow/stop_workers.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | # Stop spark on all workers and then send stop-instances 4 | # command to AWS 5 | 6 | set -e 7 | 8 | /usr/local/spark/sbin/stop-all.sh && 9 | aws ec2 stop-instances --instance-ids \ 10 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID 11 | -------------------------------------------------------------------------------- /src/airflow/where_cycle_dag.py: -------------------------------------------------------------------------------- 1 | from datetime import datetime, timedelta 2 | from airflow import DAG 3 | from airflow.operators.bash_operator import BashOperator 4 | from airflow.operators.python_operator import PythonOperator 5 | from preparation.extract import \ 6 | get_taxi_zones, get_businesses, unzip_csvs 7 | from preparation.transform import \ 8 | clean_taxi_zones, calculate_centroids, clean_businesses 9 | from preparation.load import \ 10 | write_taxi_zones, write_businesses 11 | 12 | 13 | airflow_path = '/home/ubuntu/where-cycle/src/airflow/' 14 | spark_str = 'cd /home/ubuntu/where-cycle/src/spark_reduction && ' 15 | psql_str = 'psql -h $PSQL_HOST -p $PSQL_PORT -U $PSQL_USER -d ' + \ 16 | '$PSQL_DATABASE -f /home/ubuntu/where-cycle/src/postGIS_tables/' 17 | 18 | defaults = { 19 | 'owner': 'airflow', 20 | 'start_date': datetime(2020, 6, 21), 21 | 'depends_on_past': False, 22 | 'retries': 2, 23 | 'retry_delay': timedelta(minutes=5) 24 | } 25 | 26 | with DAG( 27 | 'where_cycle', 28 | default_args = defaults, 29 | schedule_interval = '@weekly' 30 | ) as dag: 31 | #******** PREPARATION ********# 32 | 33 | t1 = PythonOperator( 34 | task_id = 'get_taxi_zones', 35 | python_callable = get_taxi_zones 36 | ) 37 | 38 | t2 = PythonOperator( 39 | task_id = 'clean_taxi_zones', 40 | python_callable = clean_taxi_zones, 41 | provide_context = True 42 | ) 43 | 44 | t3 = PythonOperator( 45 | task_id = 'write_taxi_zones', 46 | python_callable = write_taxi_zones, 47 | provide_context = True 48 | ) 49 | 50 | t4 = 
PythonOperator( 51 | task_id = 'calculate_centroids', 52 | python_callable = calculate_centroids, 53 | provide_context = True 54 | ) 55 | 56 | t5 = PythonOperator( 57 | task_id = 'get_businesses', 58 | python_callable = get_businesses, 59 | provide_context = True 60 | ) 61 | 62 | t6 = PythonOperator( 63 | task_id = 'clean_businesses', 64 | python_callable = clean_businesses, 65 | provide_context = True 66 | ) 67 | 68 | t7 = PythonOperator( 69 | task_id = 'write_businesses', 70 | python_callable = write_businesses, 71 | provide_context = True 72 | ) 73 | 74 | t8 = PythonOperator( 75 | task_id = 'unzip_csvs', 76 | python_callable = unzip_csvs 77 | ) 78 | 79 | t1 >> t2 >> t3 80 | t1 >> t4 >> t5 >> t6 >> t7 81 | 82 | 83 | #******** SPARK REDUCTION ********# 84 | 85 | t9 = BashOperator( 86 | task_id = 'start_spark_workers', 87 | bash_command = airflow_path + 'start_workers.sh ' 88 | ) 89 | 90 | t10 = BashOperator( 91 | task_id = 'submit_spark_driver', 92 | bash_command = spark_str + 'spark-submit driver.py' 93 | ) 94 | 95 | t11 = BashOperator( 96 | task_id = 'stop_spark_workers', 97 | bash_command = airflow_path + 'stop_workers.sh ', 98 | trigger_rule = 'all_done' 99 | ) 100 | 101 | t8 >> t9 >> t10 >> t11 102 | 103 | 104 | #******** POSTGIS TABLES ********# 105 | 106 | t12 = BashOperator( 107 | task_id = 'create_production_taxi_zones', 108 | bash_command = psql_str + 'production/taxi_zones.sql' 109 | ) 110 | 111 | t13 = BashOperator( 112 | task_id = 'create_statistics_yelp_businesses', 113 | bash_command = psql_str + 'statistics/yelp_businesses.sql' 114 | ) 115 | 116 | t14 = BashOperator( 117 | task_id = 'create_geo_joined_citibike_stations', 118 | bash_command = psql_str + 'geo_joined/citibike_stations.sql' 119 | ) 120 | 121 | t15 = BashOperator( 122 | task_id = 'create_statistics_citibike', 123 | bash_command = psql_str + 'statistics/citibike.sql' 124 | ) 125 | 126 | t16 = BashOperator( 127 | task_id = 'create_geo_joined_past_tlc_visits', 128 | bash_command = psql_str + 'geo_joined/past_tlc_visits.sql' 129 | ) 130 | 131 | t17 = BashOperator( 132 | task_id = 'create_statistics_tlc_visits', 133 | bash_command = psql_str + 'statistics/tlc_visits.sql' 134 | ) 135 | 136 | t18 = BashOperator( 137 | task_id = 'create_production_all_time_stats', 138 | bash_command = psql_str + 'production/all_time_stats.sql' 139 | ) 140 | 141 | t3 >> t12 142 | t7 >> t13 143 | t10 >> t14 >> t15 144 | t10 >> t16 >> t17 145 | [t13, t15, t17] >> t18 146 | -------------------------------------------------------------------------------- /src/config/database.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sqlalchemy as sql 3 | 4 | 5 | jdbc_props = { 6 | 'driver': 'org.postgresql.Driver', 7 | 'user': os.environ['PSQL_USER'], 8 | 'password': os.environ['PSQL_PASSWORD'] 9 | } 10 | 11 | jdbc_url = 'jdbc:postgresql://' + \ 12 | os.environ['PSQL_HOST'] + ':' + os.environ['PSQL_PORT'] + \ 13 | '/' + os.environ['PSQL_DATABASE'] 14 | 15 | py_engine = sql.create_engine( 16 | 'postgresql://' + 17 | os.environ['PSQL_USER'] + ':' + os.environ['PSQL_PASSWORD'] + 18 | '@' + os.environ['PSQL_HOST'] + ':' + os.environ['PSQL_PORT'] + 19 | '/' + os.environ['PSQL_DATABASE'] 20 | ) 21 | -------------------------------------------------------------------------------- /src/config/geometries.py: -------------------------------------------------------------------------------- 1 | TAXI_ZONE_CENTROID_LAT = 40.7058240860865 2 | TAXI_ZONE_CENTROID_LON = -73.9778002135437 3 | 
TAXI_ZONE_LAT_MIN = 40.4961153951704 4 | TAXI_ZONE_LAT_MAX = 40.9155327770026 5 | TAXI_ZONE_LON_MIN = -74.2555913631521 6 | TAXI_ZONE_LON_MAX = -73.7000090639354 7 | -------------------------------------------------------------------------------- /src/config/ref/check_citibike_schema.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | s3 = boto3.resource('s3') 6 | bucket = s3.Bucket('jlang-20b-de-ny') 7 | 8 | spark = SparkSession.builder \ 9 | .appName('check-citibike-schema') \ 10 | .getOrCreate() 11 | 12 | for obj in bucket.objects.all(): 13 | key = obj.key 14 | if key.startswith('citibike/') and key.endswith('.csv'): 15 | path = 's3a://jlang-20b-de-ny/' + key 16 | csv_df = spark.read.csv( 17 | path = path, 18 | header = True, 19 | inferSchema = True, 20 | enforceSchema = False, 21 | ignoreLeadingWhiteSpace = True, 22 | ignoreTrailingWhiteSpace = True, 23 | samplingRatio = 0.1 24 | ) 25 | print(path) 26 | csv_df.printSchema() 27 | -------------------------------------------------------------------------------- /src/config/ref/check_tlc_schemas.py: -------------------------------------------------------------------------------- 1 | import boto3 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | s3 = boto3.resource('s3') 6 | nyc_tlc = s3.Bucket('nyc-tlc') 7 | 8 | spark = SparkSession.builder \ 9 | .appName('check_tlc_schemas') \ 10 | .getOrCreate() 11 | 12 | for obj in nyc_tlc.objects.all(): 13 | key = obj.key 14 | if key.startswith('trip data/') and key.endswith('.csv'): 15 | path = 's3a://nyc-tlc/' + key 16 | csv_df = spark.read.csv( 17 | path = path, 18 | header = True, 19 | inferSchema = True, 20 | enforceSchema = False, 21 | ignoreLeadingWhiteSpace = True, 22 | ignoreTrailingWhiteSpace = True, 23 | samplingRatio = 0.1 24 | ) 25 | print(path) 26 | csv_df.printSchema() 27 | -------------------------------------------------------------------------------- /src/config/ref/get_geometries.sql: -------------------------------------------------------------------------------- 1 | SELECT ST_Extent(geometry) AS bbox 2 | FROM staging.taxi_zones; 3 | 4 | SELECT ST_AsText(ST_Centroid(ST_Extent(geometry))) AS center 5 | FROM staging.taxi_zones; 6 | -------------------------------------------------------------------------------- /src/config/ref/tlc_schemas.txt: -------------------------------------------------------------------------------- 1 | lat-lon data 2 | green_tripdata_2013-08_2016-06_9-cols 3 | green_tripdata_2013-08_2014-12 4 | |-- VendorID: integer (nullable = true) 5 | |-- lpep_pickup_datetime: timestamp (nullable = true) 6 | |-- Lpep_dropoff_datetime: timestamp (nullable = true) 7 | |-- Store_and_fwd_flag: string (nullable = true) 8 | |-- RateCodeID: integer (nullable = true) 9 | |-- Pickup_longitude: double (nullable = true) 10 | |-- Pickup_latitude: double (nullable = true) 11 | |-- Dropoff_longitude: double (nullable = true) 12 | |-- Dropoff_latitude: double (nullable = true) 13 | |-- Passenger_count: integer (nullable = true) 14 | |-- Trip_distance: double (nullable = true) 15 | |-- Fare_amount: double (nullable = true) 16 | |-- Extra: double (nullable = true) 17 | |-- MTA_tax: double (nullable = true) 18 | |-- Tip_amount: double (nullable = true) 19 | |-- Tolls_amount: double (nullable = true) 20 | |-- Ehail_fee: string (nullable = true) 21 | |-- Total_amount: double (nullable = true) 22 | |-- Payment_type: integer (nullable = true) 23 | |-- Trip_type: string (nullable = true) 24 | 
25 | green_tripdata_2015-01_2016-06_9-cols 26 | |-- VendorID: integer (nullable = true) 27 | |-- lpep_pickup_datetime: timestamp (nullable = true) 28 | |-- Lpep_dropoff_datetime: timestamp (nullable = true) 29 | |-- Store_and_fwd_flag: string (nullable = true) 30 | |-- RateCodeID: integer (nullable = true) 31 | |-- Pickup_longitude: double (nullable = true) 32 | |-- Pickup_latitude: double (nullable = true) 33 | |-- Dropoff_longitude: double (nullable = true) 34 | |-- Dropoff_latitude: double (nullable = true) 35 | |-- Passenger_count: integer (nullable = true) 36 | |-- Trip_distance: double (nullable = true) 37 | |-- Fare_amount: double (nullable = true) 38 | |-- Extra: double (nullable = true) 39 | |-- MTA_tax: double (nullable = true) 40 | |-- Tip_amount: double (nullable = true) 41 | |-- Tolls_amount: double (nullable = true) 42 | |-- Ehail_fee: string (nullable = true) 43 | |-- improvement_surcharge: double (nullable = true) 44 | |-- Total_amount: double (nullable = true) 45 | |-- Payment_type: integer (nullable = true) 46 | |-- Trip_type: integer (nullable = true) 47 | 48 | yellow_tripdata_2009-01__2016-06_11-cols 49 | yellow_tripdata_2009-01_2009-12 50 | |-- vendor_name: string (nullable = true) 51 | |-- Trip_Pickup_DateTime: timestamp (nullable = true) 52 | |-- Trip_Dropoff_DateTime: timestamp (nullable = true) 53 | |-- Passenger_Count: integer (nullable = true) 54 | |-- Trip_Distance: double (nullable = true) 55 | |-- Start_Lon: double (nullable = true) 56 | |-- Start_Lat: double (nullable = true) 57 | |-- Rate_Code: string (nullable = true) 58 | |-- store_and_forward: integer (nullable = true) 59 | |-- End_Lon: double (nullable = true) 60 | |-- End_Lat: double (nullable = true) 61 | |-- Payment_Type: string (nullable = true) 62 | |-- Fare_Amt: double (nullable = true) 63 | |-- surcharge: double (nullable = true) 64 | |-- mta_tax: string (nullable = true) 65 | |-- Tip_Amt: double (nullable = true) 66 | |-- Tolls_Amt: double (nullable = true) 67 | |-- Total_Amt: double (nullable = true) 68 | 69 | yellow_tripdata_2010-01_2014-12 70 | |-- vendor_id: string (nullable = true) 71 | |-- pickup_datetime: timestamp (nullable = true) 72 | |-- dropoff_datetime: timestamp (nullable = true) 73 | |-- passenger_count: integer (nullable = true) 74 | |-- trip_distance: double (nullable = true) 75 | |-- pickup_longitude: double (nullable = true) 76 | |-- pickup_latitude: double (nullable = true) 77 | |-- rate_code: integer (nullable = true) 78 | |-- store_and_fwd_flag: integer (nullable = true) 79 | |-- dropoff_longitude: double (nullable = true) 80 | |-- dropoff_latitude: double (nullable = true) 81 | |-- payment_type: string (nullable = true) 82 | |-- fare_amount: double (nullable = true) 83 | |-- surcharge: double (nullable = true) 84 | |-- mta_tax: double (nullable = true) 85 | |-- tip_amount: double (nullable = true) 86 | |-- tolls_amount: double (nullable = true) 87 | |-- total_amount: double (nullable = true) 88 | 89 | yellow_tripdata_2015-01_2016-06 90 | |-- VendorID: integer (nullable = true) 91 | |-- tpep_pickup_datetime: timestamp (nullable = true) 92 | |-- tpep_dropoff_datetime: timestamp (nullable = true) 93 | |-- passenger_count: integer (nullable = true) 94 | |-- trip_distance: double (nullable = true) 95 | |-- pickup_longitude: double (nullable = true) 96 | |-- pickup_latitude: double (nullable = true) 97 | |-- RateCodeID: integer (nullable = true) 98 | |-- store_and_fwd_flag: string (nullable = true) 99 | |-- dropoff_longitude: double (nullable = true) 100 | |-- dropoff_latitude: 
double (nullable = true) 101 | |-- payment_type: integer (nullable = true) 102 | |-- fare_amount: double (nullable = true) 103 | |-- extra: double (nullable = true) 104 | |-- mta_tax: double (nullable = true) 105 | |-- tip_amount: double (nullable = true) 106 | |-- tolls_amount: double (nullable = true) 107 | |-- improvement_surcharge: double (nullable = true) 108 | |-- total_amount: double (nullable = true) 109 | 110 | LocationID data 111 | fhv_tripdata_2015-01_2016-12_3-cols 112 | |-- Dispatching_base_num: string (nullable = true) 113 | |-- Pickup_date: timestamp (nullable = true) 114 | |-- locationID: integer (nullable = true) 115 | 116 | fhv_tripdata_2017-01_2019-12_5-cols 117 | fhv_tripdata_2017-01_2017-06 118 | |-- Dispatching_base_num: string (nullable = true) 119 | |-- Pickup_DateTime: timestamp (nullable = true) 120 | |-- DropOff_datetime: string (nullable = true) 121 | |-- PUlocationID: integer (nullable = true) 122 | |-- DOlocationID: integer (nullable = true) 123 | 124 | fhv_tripdata_2017-07_2017-12 125 | |-- Dispatching_base_num: string (nullable = true) 126 | |-- Pickup_DateTime: timestamp (nullable = true) 127 | |-- DropOff_datetime: string (nullable = true) 128 | |-- PUlocationID: integer (nullable = true) 129 | |-- DOlocationID: integer (nullable = true) 130 | |-- SR_Flag: integer (nullable = true) 131 | 132 | fhv_tripdata_2019-01_2019-12 133 | |-- dispatching_base_num: string (nullable = true) 134 | |-- pickup_datetime: timestamp (nullable = true) 135 | |-- dropoff_datetime: timestamp (nullable = true) 136 | |-- PULocationID: integer (nullable = true) 137 | |-- DOLocationID: integer (nullable = true) 138 | |-- SR_Flag: integer (nullable = true) 139 | 140 | fhv_tripdata_2018-01_2018-12_4-cols 141 | |-- Pickup_DateTime: timestamp (nullable = true) 142 | |-- DropOff_datetime: timestamp (nullable = true) 143 | |-- PUlocationID: integer (nullable = true) 144 | |-- DOlocationID: integer (nullable = true) 145 | |-- SR_Flag: integer (nullable = true) 146 | |-- Dispatching_base_number: string (nullable = true) 147 | |-- Dispatching_base_num: string (nullable = true) 148 | 149 | fhvhv_tripdata_6-cols 150 | |-- hvfhs_license_num: string (nullable = true) 151 | |-- dispatching_base_num: string (nullable = true) 152 | |-- pickup_datetime: timestamp (nullable = true) 153 | |-- dropoff_datetime: timestamp (nullable = true) 154 | |-- PULocationID: integer (nullable = true) 155 | |-- DOLocationID: integer (nullable = true) 156 | |-- SR_Flag: integer (nullable = true) 157 | 158 | green_tripdata_2016-07_2019-12_7-cols 159 | green_tripdata_2016-07_2018-12 160 | |-- VendorID: integer (nullable = true) 161 | |-- lpep_pickup_datetime: timestamp (nullable = true) 162 | |-- lpep_dropoff_datetime: timestamp (nullable = true) 163 | |-- store_and_fwd_flag: string (nullable = true) 164 | |-- RatecodeID: integer (nullable = true) 165 | |-- PULocationID: integer (nullable = true) 166 | |-- DOLocationID: integer (nullable = true) 167 | |-- passenger_count: integer (nullable = true) 168 | |-- trip_distance: double (nullable = true) 169 | |-- fare_amount: double (nullable = true) 170 | |-- extra: double (nullable = true) 171 | |-- mta_tax: double (nullable = true) 172 | |-- tip_amount: double (nullable = true) 173 | |-- tolls_amount: double (nullable = true) 174 | |-- ehail_fee: string (nullable = true) 175 | |-- improvement_surcharge: double (nullable = true) 176 | |-- total_amount: double (nullable = true) 177 | |-- payment_type: integer (nullable = true) 178 | |-- trip_type: integer (nullable = true) 
179 | 180 | green_tripdata_2019-01_2019-12_7-cols 181 | |-- VendorID: integer (nullable = true) 182 | |-- lpep_pickup_datetime: timestamp (nullable = true) 183 | |-- lpep_dropoff_datetime: timestamp (nullable = true) 184 | |-- store_and_fwd_flag: string (nullable = true) 185 | |-- RatecodeID: integer (nullable = true) 186 | |-- PULocationID: integer (nullable = true) 187 | |-- DOLocationID: integer (nullable = true) 188 | |-- passenger_count: integer (nullable = true) 189 | |-- trip_distance: double (nullable = true) 190 | |-- fare_amount: double (nullable = true) 191 | |-- extra: double (nullable = true) 192 | |-- mta_tax: double (nullable = true) 193 | |-- tip_amount: double (nullable = true) 194 | |-- tolls_amount: double (nullable = true) 195 | |-- ehail_fee: string (nullable = true) 196 | |-- improvement_surcharge: double (nullable = true) 197 | |-- total_amount: double (nullable = true) 198 | |-- payment_type: integer (nullable = true) 199 | |-- trip_type: integer (nullable = true) 200 | |-- congestion_surcharge: integer (nullable = true) 201 | 202 | yellow_tripdata_2016-07_2019-12_9-cols 203 | yellow_tripdata_2016-07_2018-12 204 | |-- VendorID: integer (nullable = true) 205 | |-- tpep_pickup_datetime: timestamp (nullable = true) 206 | |-- tpep_dropoff_datetime: timestamp (nullable = true) 207 | |-- passenger_count: integer (nullable = true) 208 | |-- trip_distance: double (nullable = true) 209 | |-- RatecodeID: integer (nullable = true) 210 | |-- store_and_fwd_flag: string (nullable = true) 211 | |-- PULocationID: integer (nullable = true) 212 | |-- DOLocationID: integer (nullable = true) 213 | |-- payment_type: integer (nullable = true) 214 | |-- fare_amount: double (nullable = true) 215 | |-- extra: double (nullable = true) 216 | |-- mta_tax: double (nullable = true) 217 | |-- tip_amount: double (nullable = true) 218 | |-- tolls_amount: double (nullable = true) 219 | |-- improvement_surcharge: double (nullable = true) 220 | |-- total_amount: double (nullable = true) 221 | 222 | yellow_tripdata_2019-01_2019-12_9-cols 223 | |-- VendorID: integer (nullable = true) 224 | |-- tpep_pickup_datetime: timestamp (nullable = true) 225 | |-- tpep_dropoff_datetime: timestamp (nullable = true) 226 | |-- passenger_count: integer (nullable = true) 227 | |-- trip_distance: double (nullable = true) 228 | |-- RatecodeID: integer (nullable = true) 229 | |-- store_and_fwd_flag: string (nullable = true) 230 | |-- PULocationID: integer (nullable = true) 231 | |-- DOLocationID: integer (nullable = true) 232 | |-- payment_type: integer (nullable = true) 233 | |-- fare_amount: double (nullable = true) 234 | |-- extra: double (nullable = true) 235 | |-- mta_tax: double (nullable = true) 236 | |-- tip_amount: double (nullable = true) 237 | |-- tolls_amount: double (nullable = true) 238 | |-- improvement_surcharge: double (nullable = true) 239 | |-- total_amount: double (nullable = true) 240 | |-- congestion_surcharge: double (nullable = true) 241 | -------------------------------------------------------------------------------- /src/config/schemas.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql.types import StructType, StructField, \ 2 | IntegerType, TimestampType, StringType, DoubleType 3 | 4 | citibike_schema = StructType( 5 | [ 6 | StructField('tripduration', IntegerType(), True), 7 | StructField('starttime', TimestampType(), True), 8 | StructField('stoptime', TimestampType(), True), 9 | StructField('start station id', IntegerType(), True), 10 | 
StructField('start station name', StringType(), True), 11 | StructField('start station latitude', DoubleType(), True), 12 | StructField('start station longitude', DoubleType(), True), 13 | StructField('end station id', IntegerType(), True), 14 | StructField('end station name', StringType(), True), 15 | StructField('end station latitude', DoubleType(), True), 16 | StructField('end station longitude', DoubleType(), True) 17 | ] 18 | ) 19 | 20 | past_schema = StructType( 21 | [ 22 | StructField('month', StringType(), True), 23 | StructField('pickup_longitude', DoubleType(), True), 24 | StructField('pickup_latitude', DoubleType(), True), 25 | StructField('dropoff_longitude', DoubleType(), True), 26 | StructField('dropoff_latitude', DoubleType(), True) 27 | ] 28 | ) 29 | 30 | modern_schema = StructType( 31 | [ 32 | StructField('month', StringType(), True), 33 | StructField('PULocationID', IntegerType(), True), 34 | StructField('DOLocationID', IntegerType(), True) 35 | ] 36 | ) 37 | 38 | green_13_16_schema = StructType( 39 | [ 40 | StructField('VendorID', IntegerType(), True), 41 | StructField('lpep_pickup_datetime', TimestampType(), True), 42 | StructField('Lpep_dropoff_datetime', TimestampType(), True), 43 | StructField('Store_and_fwd_flag', StringType(), True), 44 | StructField('RateCodeID', IntegerType(), True), 45 | StructField('pickup_longitude', DoubleType(), True), 46 | StructField('pickup_latitude', DoubleType(), True), 47 | StructField('dropoff_longitude', DoubleType(), True), 48 | StructField('dropoff_latitude', DoubleType(), True) 49 | ] 50 | ) 51 | 52 | yellow_09_16_schema = StructType( 53 | [ 54 | StructField('VendorID', StringType(), True), 55 | StructField('tpep_pickup_datetime', TimestampType(), True), 56 | StructField('tpep_dropoff_datetime', TimestampType(), True), 57 | StructField('passenger_count', IntegerType(), True), 58 | StructField('trip_distance', DoubleType(), True), 59 | StructField('pickup_longitude', DoubleType(), True), 60 | StructField('pickup_latitude', DoubleType(), True), 61 | StructField('RateCodeID', StringType(), True), 62 | StructField('store_and_fwd_flag', StringType(), True), 63 | StructField('dropoff_longitude', DoubleType(), True), 64 | StructField('dropoff_latitude', DoubleType(), True) 65 | ] 66 | ) 67 | 68 | fhv_15_16_schema = StructType( 69 | [ 70 | StructField('Dispatching_base_num', StringType(), True), 71 | StructField('Pickup_date', TimestampType(), True), 72 | StructField('locationID', IntegerType(), True) 73 | ] 74 | ) 75 | 76 | fhv_17_19_schema = StructType( 77 | [ 78 | StructField('Dispatching_base_num', StringType(), True), 79 | StructField('Pickup_DateTime', TimestampType(), True), 80 | StructField('DropOff_datetime', TimestampType(), True), 81 | StructField('PUlocationID', IntegerType(), True), 82 | StructField('DOlocationID', IntegerType(), True) 83 | ] 84 | ) 85 | 86 | fhv_18_schema = StructType( 87 | [ 88 | StructField('Pickup_DateTime', TimestampType(), True), 89 | StructField('DropOff_datetime', TimestampType(), True), 90 | StructField('PUlocationID', IntegerType(), True), 91 | StructField('DOlocationID', IntegerType(), True) 92 | ] 93 | ) 94 | 95 | fhvhv_schema = StructType( 96 | [ 97 | StructField('hvfhs_license_num', StringType(), True), 98 | StructField('dispatching_base_num', StringType(), True), 99 | StructField('pickup_datetime', TimestampType(), True), 100 | StructField('dropoff_datetime', TimestampType(), True), 101 | StructField('PULocationID', IntegerType(), True), 102 | StructField('DOLocationID', IntegerType(), True) 
103 | ] 104 | ) 105 | 106 | green_16_19_schema = StructType( 107 | [ 108 | StructField('VendorID', IntegerType(), True), 109 | StructField('lpep_pickup_datetime', TimestampType(), True), 110 | StructField('lpep_dropoff_datetime', TimestampType(), True), 111 | StructField('store_and_fwd_flag', StringType(), True), 112 | StructField('RatecodeID', IntegerType(), True), 113 | StructField('PULocationID', IntegerType(), True), 114 | StructField('DOLocationID', IntegerType(), True) 115 | ] 116 | ) 117 | 118 | yellow_16_19_schema = StructType( 119 | [ 120 | StructField('VendorID', IntegerType(), True), 121 | StructField('tpep_pickup_datetime', TimestampType(), True), 122 | StructField('tpep_dropoff_datetime', TimestampType(), True), 123 | StructField('passenger_count', IntegerType(), True), 124 | StructField('trip_distance', DoubleType(), True), 125 | StructField('RatecodeID', IntegerType(), True), 126 | StructField('store_and_fwd_flag', StringType(), True), 127 | StructField('PULocationID', IntegerType(), True), 128 | StructField('DOLocationID', IntegerType(), True) 129 | ] 130 | ) 131 | -------------------------------------------------------------------------------- /src/dash/app.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import dash 4 | import dash_core_components as dcc 5 | import dash_html_components as html 6 | import flask 7 | import pandas as pd 8 | import plotly 9 | import plotly.graph_objects as go 10 | from config.database import py_engine 11 | from config.geometries import \ 12 | TAXI_ZONE_CENTROID_LAT, TAXI_ZONE_CENTROID_LON 13 | 14 | 15 | zones = pd.read_sql_table( 16 | table_name = 'taxi_zones', 17 | con = py_engine, 18 | schema = 'production' 19 | ) 20 | 21 | json_zones = {'type': 'FeatureCollection', 'features': []} 22 | for _, row in zones.iterrows(): 23 | feature = { 24 | 'type':'Feature', 25 | 'id': row['zone_id'], 26 | 'geometry': json.loads(row['geometry']) 27 | } 28 | json_zones['features'].append(feature) 29 | 30 | stats = pd.read_sql_table( 31 | table_name = 'all_time_stats', 32 | con = py_engine, 33 | schema = 'production' 34 | ) 35 | 36 | columns = [ 37 | 'tlc_visits', 38 | 'citibike_visits', 39 | 'citibike_stations', 40 | # 'yelp_avg_rating', 41 | # 'yelp_sum_reviews', 42 | 'yelp_weighted_sum_reviews' 43 | ] 44 | 45 | map_views = [] 46 | bar_charts = [] 47 | 48 | for column in columns: 49 | map_views.append( 50 | go.Choroplethmapbox( 51 | geojson = json_zones, 52 | locations = stats['zone_id'].tolist(), 53 | z = stats[column].tolist(), 54 | text = stats['zone_name'] + ', ' + stats['borough'], 55 | visible = False, 56 | subplot = 'mapbox', 57 | hovertemplate = '%{text}
<br>' +
58 | '%{z}
<br>' +
59 | '<extra></extra>'
60 | )
61 | )
62 | 
63 | top = stats.sort_values([column], ascending = False).head(15)
64 | bar_charts.append(
65 | go.Bar(
66 | x = top[column],
67 | y = top['zone_name'] + ', ' + top['borough'],
68 | text = top['zone_name'] + ', ' + top['borough'],
69 | textposition = 'inside',
70 | hovertemplate = '%{text}
<br>' +
71 | '%{x}
<br>' +
72 | '<extra></extra>',
73 | xaxis = 'x',
74 | yaxis = 'y',
75 | marker = dict(color = 'blue'),
76 | visible = False,
77 | name = '',
78 | orientation = 'h'
79 | )
80 | )
81 | 
82 | map_views[0]['visible'] = True
83 | bar_charts[0]['visible'] = True
84 | 
85 | fig = go.Figure(data = map_views + bar_charts)
86 | 
87 | fig.update_layout(
88 | title = dict(
89 | text = 'Where Cycle',
90 | font = dict(size = 36),
91 | x = 0.5,
92 | xanchor = 'center'
93 | ),
94 | autosize = True,
95 | height = 700,
96 | mapbox = dict(
97 | domain = dict(x = [0.25, 1], y = [0, 1]),
98 | accesstoken = os.environ['MAPBOX_ACCESS_TOKEN'],
99 | style = 'dark',
100 | center = dict(
101 | lon = TAXI_ZONE_CENTROID_LON,
102 | lat = TAXI_ZONE_CENTROID_LAT
103 | ),
104 | zoom = 9.35
105 | ),
106 | xaxis = dict(
107 | domain = [0, 0.25],
108 | anchor = 'x',
109 | showticklabels = True,
110 | showgrid = True
111 | ),
112 | yaxis = dict(
113 | domain = [0, 1],
114 | anchor = 'y',
115 | autorange = 'reversed',
116 | visible = False
117 | ),
118 | margin = dict(l = 0, r = 0, t = 70, b = 50),
119 | paper_bgcolor='black',
120 | plot_bgcolor='black'
121 | )
122 | 
123 | fig.update_layout(
124 | updatemenus = [dict(
125 | x = 0,
126 | y = 1,
127 | xanchor = 'left',
128 | yanchor = 'bottom',
129 | buttons = list([
130 | dict(
131 | args = [
132 | 'visible',
133 | [True, False, False, False] # , False, False]
134 | ],
135 | label = 'Taxi Visits',
136 | method = 'restyle'
137 | ),
138 | dict(
139 | args = [
140 | 'visible',
141 | [False, True, False, False] # , False, False]
142 | ],
143 | label = 'Citibike Visits',
144 | method = 'restyle'
145 | ),
146 | dict(
147 | args = [
148 | 'visible',
149 | [False, False, True, False] # , False, False]
150 | ],
151 | label = 'Citibike Stations',
152 | method = 'restyle'
153 | ),
154 | # dict(
155 | # args = [
156 | # 'visible',
157 | # [False, False, False, True, False, False]
158 | # ],
159 | # label = 'Yelp Average Rating',
160 | # method = 'restyle'
161 | # ),
162 | # dict(
163 | # args = [
164 | # 'visible',
165 | # [False, False, False, False, True, False]
166 | # ],
167 | # label = 'Yelp Reviews',
168 | # method = 'restyle'
169 | # ),
170 | dict(
171 | args = [
172 | 'visible',
173 | [False, False, False, True] # , False, True]
174 | ],
175 | label = 'Yelp Stars (weighted review count)',
176 | method = 'restyle'
177 | )
178 | ]),
179 | )]
180 | )
181 | 
182 | server = flask.Flask(__name__)
183 | stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
184 | 
185 | app = dash.Dash(
186 | __name__,
187 | external_stylesheets = stylesheets,
188 | server = server
189 | )
190 | 
191 | app.layout = html.Div([
192 | dcc.Location(
193 | id = 'url',
194 | pathname = '/where-cycle',
195 | refresh = False
196 | ),
197 | dcc.Graph(figure = fig),
198 | html.Div([
199 | 'Read more about this project on ',
200 | html.A(
201 | ['Github'],
202 | href = 'https://github.com/josh-lang/where-cycle'
203 | )
204 | ])
205 | ])
206 | 
207 | app.title = 'Where Cycle'
208 | 
209 | if __name__ == '__main__':
210 | app.run_server(
211 | debug = False,
212 | dev_tools_props_check = False,
213 | dev_tools_ui = False
214 | )
215 | 
--------------------------------------------------------------------------------
/src/dash/assets/background.css:
--------------------------------------------------------------------------------
1 | body {
2 | background-color: black;
3 | color: rgb(42, 63, 95);
4 | }
5 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/geo_joined/citibike_stations.sql:
-------------------------------------------------------------------------------- 1 | -- Create join table for taxi zones and Citibike stations 2 | 3 | DROP TABLE IF EXISTS geo_joined.citibike_stations; 4 | 5 | CREATE TABLE geo_joined.citibike_stations AS 6 | SELECT 7 | z.zone_id, 8 | c.station_id 9 | FROM 10 | staging.taxi_zones AS z 11 | JOIN ( 12 | SELECT 13 | station_id, 14 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS geometry 15 | FROM staging.citibike_stations 16 | ) AS c 17 | ON ST_WITHIN(c.geometry, z.geometry) 18 | GROUP BY 1, 2; 19 | -------------------------------------------------------------------------------- /src/postGIS_tables/geo_joined/past_tlc_visits.sql: -------------------------------------------------------------------------------- 1 | -- Aggregate past TLC visits by the taxi zone their coordinates are within 2 | 3 | DROP TABLE IF EXISTS geo_joined.past_tlc_visits; 4 | 5 | CREATE TABLE geo_joined.past_tlc_visits AS 6 | SELECT 7 | p.month, 8 | z.zone_id, 9 | SUM(p.visits) AS visits 10 | FROM 11 | staging.taxi_zones AS z 12 | JOIN ( 13 | SELECT 14 | month, 15 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS geometry, 16 | visits 17 | FROM staging.past_tlc_visits 18 | ) AS p 19 | ON ST_WITHIN(p.geometry, z.geometry) 20 | GROUP BY 1, 2; 21 | -------------------------------------------------------------------------------- /src/postGIS_tables/production/all_time_stats.sql: -------------------------------------------------------------------------------- 1 | -- Join Citibike, TLC, and Yelp statistics to taxi zones for Dash 2 | 3 | DROP TABLE IF EXISTS production.all_time_stats; 4 | 5 | CREATE TABLE production.all_time_stats AS 6 | SELECT 7 | v.zone_id, 8 | v.zone_name, 9 | v.borough, 10 | v.tlc_visits, 11 | v.citibike_visits, 12 | v.citibike_stations, 13 | y.avg_rating AS yelp_avg_rating, 14 | y.sum_reviews AS yelp_sum_reviews, 15 | y.weighted_sum_reviews AS yelp_weighted_sum_reviews 16 | FROM 17 | ( 18 | SELECT 19 | z.zone_id, 20 | z.zone_name, 21 | z.borough, 22 | COALESCE(SUM(t.visits), 0) AS tlc_visits, 23 | COALESCE(SUM(c.visits), 0) AS citibike_visits, 24 | COALESCE(MAX(c.stations), 0) AS citibike_stations 25 | FROM 26 | staging.taxi_zones AS z 27 | LEFT JOIN statistics.tlc_visits AS t USING (zone_id) 28 | LEFT JOIN statistics.citibike AS c 29 | ON t.zone_id = c.zone_id AND t.month = c.month 30 | GROUP BY 1, 2, 3 31 | ) AS v 32 | LEFT JOIN statistics.yelp_businesses AS y USING (zone_id) 33 | ORDER BY 1; 34 | -------------------------------------------------------------------------------- /src/postGIS_tables/production/taxi_zones.sql: -------------------------------------------------------------------------------- 1 | -- Convert taxi zone geometries to GeoJSON for Dash 2 | 3 | DROP TABLE IF EXISTS production.taxi_zones; 4 | 5 | CREATE TABLE production.taxi_zones AS 6 | SELECT 7 | zone_id, 8 | ST_ASGeoJSON(ST_ForcePolygonCW(geometry)) AS geometry 9 | FROM staging.taxi_zones 10 | ORDER BY 1; 11 | -------------------------------------------------------------------------------- /src/postGIS_tables/statistics/citibike.sql: -------------------------------------------------------------------------------- 1 | -- Aggregate Citibike visits by taxi zone 2 | -- and estimate monthly station additions with rolling maximum 3 | 4 | DROP TABLE IF EXISTS statistics.citibike; 5 | 6 | CREATE TABLE statistics.citibike AS 7 | SELECT 8 | t.month, 9 | t.zone_id, 10 | MAX(active_stations) OVER ( 11 | PARTITION BY t.zone_id 12 | ORDER BY t.month ROWS BETWEEN UNBOUNDED PRECEDING 
AND CURRENT ROW
13 | ) AS stations,
14 | visits
15 | FROM (
16 | SELECT
17 | v.month,
18 | s.zone_id,
19 | COUNT(s.station_id) AS active_stations,
20 | SUM(v.visits) AS visits
21 | FROM
22 | geo_joined.citibike_stations AS s
23 | JOIN staging.citibike_visits AS v
24 | USING (station_id)
25 | GROUP BY 1, 2
26 | ) AS t;
27 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/tlc_visits.sql:
--------------------------------------------------------------------------------
1 | -- Combine past TLC visits with modern TLC visits
2 | -- and aggregate by taxi zone ID
3 | 
4 | DROP TABLE IF EXISTS statistics.tlc_visits;
5 | 
6 | CREATE TABLE statistics.tlc_visits AS
7 | SELECT
8 | t.month,
9 | t.zone_id,
10 | SUM(t.visits) AS visits
11 | FROM (
12 | SELECT
13 | month,
14 | zone_id,
15 | visits
16 | FROM geo_joined.past_tlc_visits
17 | UNION ALL
18 | SELECT
19 | month,
20 | zone_id,
21 | visits
22 | FROM staging.modern_tlc_visits
23 | ) AS t
24 | GROUP BY 1, 2;
25 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/yelp_businesses.sql:
--------------------------------------------------------------------------------
1 | -- Aggregate Yelp business ratings and reviews by taxi zone
2 | 
3 | DROP TABLE IF EXISTS statistics.yelp_businesses;
4 | 
5 | CREATE TABLE statistics.yelp_businesses AS
6 | SELECT
7 | z.zone_id,
8 | AVG(y.rating) AS avg_rating,
9 | SUM(y.review_count) AS sum_reviews,
10 | SUM(y.review_count * y.rating) AS weighted_sum_reviews
11 | FROM
12 | staging.taxi_zones AS z
13 | JOIN staging.yelp_businesses AS y
14 | ON ST_Within(y.geometry, z.geometry)
15 | GROUP BY 1;
16 | 
--------------------------------------------------------------------------------
/src/preparation/extract.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import requests
4 | import time
5 | import zipfile
6 | import boto3
7 | import geopandas as gpd
8 | import pandas as pd
9 | 
10 | 
11 | s3 = boto3.resource('s3')
12 | 
13 | def get_taxi_zones():
14 | '''Pull taxi zone shapefile and convert to WGS 84 (EPSG:4326)'''
15 | s3.meta.client.download_file(
16 | 'nyc-tlc',
17 | 'misc/taxi_zones.zip',
18 | 'taxi_zones.zip'
19 | )
20 | taxi_zones = gpd.read_file('zip://taxi_zones.zip') \
21 | .to_crs('EPSG:4326') \
22 | .filter(
23 | [
24 | 'OBJECTID',
25 | 'zone',
26 | 'borough',
27 | 'geometry'
28 | ],
29 | axis = 1
30 | ).rename(
31 | columns = {
32 | 'OBJECTID': 'zone_id',
33 | 'zone': 'zone_name'
34 | }
35 | )
36 | os.remove('taxi_zones.zip')
37 | return taxi_zones
38 | 
39 | def get_businesses(**kwargs):
40 | '''For each taxi zone, query the Yelp API for businesses closest to centroid'''
41 | ti = kwargs['ti']
42 | centroids = ti.xcom_pull(task_ids = 'calculate_centroids')
43 | 
44 | api_key = 'Bearer ' + os.environ['YELP_API_KEY']
45 | head = {'Authorization': api_key}
46 | url = 'https://api.yelp.com/v3/businesses/search'
47 | businesses = pd.DataFrame()
48 | 
49 | for _, row in centroids.iterrows():
50 | query = {
51 | 'latitude': row['latitude'],
52 | 'longitude': row['longitude'],
53 | 'radius': 3000,
54 | 'limit': 50,
55 | 'sort_by': 'distance'
56 | }
57 | response = requests.get(url, headers = head, params = query)
58 | json = response.json()
59 | 
60 | retries = 0
61 | while retries <= 10 and 'error' in json:
62 | retries += 1
63 | time.sleep(1)
64 | response = requests.get(url, headers = head, params = query)
65 | json = response.json()
66 | 
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/tlc_visits.sql:
--------------------------------------------------------------------------------
-- Combine past TLC visits with modern TLC visits
-- and aggregate by taxi zone ID

DROP TABLE IF EXISTS statistics.tlc_visits;

CREATE TABLE statistics.tlc_visits AS
SELECT
    t.month,
    t.zone_id,
    SUM(t.visits) AS visits
FROM (
    SELECT
        month,
        zone_id,
        visits
    FROM geo_joined.past_tlc_visits
    UNION ALL
    SELECT
        month,
        zone_id,
        visits
    FROM staging.modern_tlc_visits
) AS t
GROUP BY 1, 2;
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/yelp_businesses.sql:
--------------------------------------------------------------------------------
-- Aggregate Yelp business ratings and reviews by taxi zone

DROP TABLE IF EXISTS statistics.yelp_businesses;

CREATE TABLE statistics.yelp_businesses AS
SELECT
    z.zone_id,
    AVG(y.rating) AS avg_rating,
    SUM(y.review_count) AS sum_reviews,
    SUM(y.review_count * y.rating) AS weighted_sum_reviews
FROM
    staging.taxi_zones AS z
    JOIN staging.yelp_businesses AS y
        ON ST_Within(y.geometry, z.geometry)
GROUP BY 1;
--------------------------------------------------------------------------------
/src/preparation/extract.py:
--------------------------------------------------------------------------------
import io
import os
import requests
import time
import zipfile
import boto3
import geopandas as gpd
import pandas as pd


s3 = boto3.resource('s3')

def get_taxi_zones():
    '''Pull taxi zone shapefile and convert to WGS 84 (EPSG:4326)'''
    s3.meta.client.download_file(
        'nyc-tlc',
        'misc/taxi_zones.zip',
        'taxi_zones.zip'
    )
    taxi_zones = gpd.read_file('zip://taxi_zones.zip') \
        .to_crs('EPSG:4326') \
        .filter(
            [
                'OBJECTID',
                'zone',
                'borough',
                'geometry'
            ],
            axis = 1
        ).rename(
            columns = {
                'OBJECTID': 'zone_id',
                'zone': 'zone_name'
            }
        )
    os.remove('taxi_zones.zip')
    return taxi_zones

def get_businesses(**kwargs):
    '''For each taxi zone, query Yelp API for businesses closest to centroid'''
    ti = kwargs['ti']
    centroids = ti.xcom_pull(task_ids = 'calculate_centroids')

    api_key = 'Bearer ' + os.environ['YELP_API_KEY']
    head = {'Authorization': api_key}
    url = 'https://api.yelp.com/v3/businesses/search'
    businesses = pd.DataFrame()

    for _, row in centroids.iterrows():
        query = {
            'latitude': row['latitude'],
            'longitude': row['longitude'],
            'radius': 3000,
            'limit': 50,
            'sort_by': 'distance'
        }
        response = requests.get(url, headers = head, params = query)
        json = response.json()

        retries = 0
        while retries <= 10 and 'error' in json:
            retries += 1
            time.sleep(1)
            response = requests.get(url, headers = head, params = query)
            json = response.json()

        matches = json['businesses']
        businesses = businesses.append(matches, ignore_index = True)
    return businesses

def unzip_csvs():
    '''Iterate over relevant zipped files, unzip, and upload to private S3'''
    source = s3.Bucket('tripdata')

    for obj in source.objects.all():
        key = obj.key

        if not key.startswith('201307-201402') and key.endswith('.zip'):
            buffer = io.BytesIO(obj.get()['Body'].read())
            zipped = zipfile.ZipFile(buffer)

            for name in zipped.namelist():

                if not name.startswith('_') and name.endswith('.csv'):
                    s3.meta.client.upload_fileobj(
                        zipped.open(name),
                        Bucket = 'jlang-20b-de-ny',
                        Key = 'citibike/' + name
                    )
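if __name__ == '__main__':
    # Smoke test (illustrative, not part of the Airflow DAG): pull the public
    # taxi zone shapefile and preview the first rows. Assumes AWS credentials
    # are configured for boto3.
    print(get_taxi_zones().head())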
--------------------------------------------------------------------------------
/src/preparation/load.py:
--------------------------------------------------------------------------------
from geoalchemy2 import Geometry
from sqlalchemy import Float, Integer, String
from config.database import py_engine


def write_taxi_zones(**kwargs):
    '''Write taxi zone map to postgres'''
    ti = kwargs['ti']
    taxi_zones = ti.xcom_pull(task_ids = 'clean_taxi_zones')

    taxi_zones.to_sql(
        name = 'taxi_zones',
        con = py_engine,
        schema = 'staging',
        if_exists = 'replace',
        index = False,
        index_label = 'zone_id',
        dtype = {
            'zone_id': Integer(),
            'zone_name': String(length = 45),
            'borough': String(length = 13),
            'geometry': Geometry('MULTIPOLYGON', 4326)
        }
    )

def write_businesses(**kwargs):
    '''Write Yelp business data to postgres for further processing'''
    ti = kwargs['ti']
    businesses = ti.xcom_pull(task_ids = 'clean_businesses')

    businesses.to_sql(
        name = 'yelp_businesses',
        con = py_engine,
        schema = 'staging',
        if_exists = 'replace',
        index = False,
        index_label = 'business_id',
        dtype = {
            'business_id': String(22),
            'review_count': Integer(),
            'rating': Float(),
            'geometry': Geometry('POINT', 4326)
        }
    )
--------------------------------------------------------------------------------
/src/preparation/transform.py:
--------------------------------------------------------------------------------
import geopandas as gpd
import pandas as pd
from geoalchemy2 import WKTElement
from shapely.geometry.multipolygon import MultiPolygon
from shapely.geometry.polygon import Polygon


def clean_taxi_zones(**kwargs):
    '''Make geometry column consistent for writing to postgres'''
    ti = kwargs['ti']
    taxi_zones = ti.xcom_pull(task_ids = 'get_taxi_zones')

    def homogenize(geometry):
        '''
        Convert any Polygon to a MultiPolygon,
        then convert either to a WKTElement
        '''
        multi = MultiPolygon([geometry]) if type(geometry) == Polygon else geometry
        return WKTElement(multi.wkt, srid = 4326)

    taxi_zones['geometry'] = taxi_zones['geometry'].apply(homogenize)
    return taxi_zones

def calculate_centroids(**kwargs):
    '''Calculate centroids for each taxi zone and extract lat-lons'''
    ti = kwargs['ti']
    taxi_zones = ti.xcom_pull(task_ids = 'get_taxi_zones')

    centroids = pd.DataFrame.from_dict({
        'latitude': taxi_zones['geometry'].centroid.y,
        'longitude': taxi_zones['geometry'].centroid.x
    })
    return centroids

def clean_businesses(**kwargs):
    '''
    Drop invalid and duplicated businesses,
    unnest lat-lons, & combine into geometry column
    '''
    ti = kwargs['ti']
    businesses = ti.xcom_pull(task_ids = 'get_businesses')

    businesses.drop(
        businesses[businesses.distance > 3000].index,
        inplace = True
    )
    # Keep each business's nearest occurrence only; the chained result must
    # be reassigned, since none of these methods act in place
    businesses = businesses.sort_values('distance') \
        .drop_duplicates('id', keep = 'first') \
        .sort_index()
    businesses.reset_index(
        drop = True,
        inplace = True
    )

    business_coordinates = pd.json_normalize(businesses.coordinates)
    business_coordinates.dropna(how = 'any', inplace = True)

    businesses_flat = businesses.join(business_coordinates, how = 'inner')
    businesses_flat.reset_index(drop = True, inplace = True)

    businesses_geo = gpd.GeoDataFrame(
        businesses_flat,
        geometry = gpd.points_from_xy(
            businesses_flat.longitude,
            businesses_flat.latitude
        )
    )
    businesses_geo['geometry'] = businesses_geo.geometry.apply(
        lambda point: WKTElement(point.wkt, srid = 4326)
    )

    businesses_writable = businesses_geo.filter(
        [
            'id',
            'review_count',
            'rating',
            'geometry'
        ],
        axis = 1
    )
    return businesses_writable
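if __name__ == '__main__':
    # Minimal illustration of the normalization in homogenize() (not part of
    # the Airflow DAG): a plain Polygon becomes a single-member MultiPolygon
    # before being wrapped as a WKTElement with SRID 4326.
    square = Polygon([(0, 0), (1, 0), (1, 1), (0, 1)])
    multi = MultiPolygon([square])
    print(multi.wkt)  # e.g. MULTIPOLYGON (((0 0, 1 0, 1 1, 0 1, 0 0)))
    element = WKTElement(multi.wkt, srid = 4326)
    print(element.srid)  # 4326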
--------------------------------------------------------------------------------
/src/spark_reduction/driver.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from spark_reduction.extract import \
    get_citibike_trips, get_past_tlc_trips, get_modern_tlc_trips
from spark_reduction.transform import \
    distill_citibike_stations, aggregate_citibike_visits, \
    aggregate_past_tlc_visits, aggregate_modern_tlc_visits
from spark_reduction.load import \
    write_citibike, write_tlc


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

# Parse CSVs from S3 and cache tables
get_citibike_trips()
get_past_tlc_trips()
get_modern_tlc_trips()

# Reduce tables to meaningful dataframes
stations = distill_citibike_stations()
citibike_visits = aggregate_citibike_visits()
past_visits = aggregate_past_tlc_visits()
modern_visits = aggregate_modern_tlc_visits()

# Write dataframes to postgres
write_citibike(stations, citibike_visits)
write_tlc(past_visits, modern_visits)

spark.stop()
--------------------------------------------------------------------------------
/src/spark_reduction/extract.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import input_file_name, regexp_extract
from config.schemas import *


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

def get_citibike_trips():
    '''Parse Citibike CSVs, format date columns, & rename location columns'''
    citibike_df = spark.read.csv(
        path = 's3a://jlang-20b-de-ny/citibike/*.csv',
        schema = citibike_schema,
        header = True,
        ignoreLeadingWhiteSpace = True,
        ignoreTrailingWhiteSpace = True
    ).withColumnRenamed('start station id', 'start_id') \
        .withColumnRenamed('start station latitude', 'start_latitude') \
        .withColumnRenamed('start station longitude', 'start_longitude') \
        .withColumnRenamed('end station id', 'end_id') \
        .withColumnRenamed('end station latitude', 'end_latitude') \
        .withColumnRenamed('end station longitude', 'end_longitude') \
        .selectExpr(
            'DATE_FORMAT(starttime, "yyyy-MM") AS start_month',
            'DATE_FORMAT(stoptime, "yyyy-MM") AS end_month',
            'start_id',
            'start_latitude',
            'start_longitude',
            'end_id',
            'end_latitude',
            'end_longitude'
        )
    citibike_df.createOrReplaceTempView('citibike')
    spark.catalog.cacheTable('citibike')

def parse_tlc(path, schema):
    '''Parse TLC CSVs, assuming trip month from filename'''
    tlc_df = spark.read.csv(
        path = path,
        schema = schema,
        header = True,
        ignoreLeadingWhiteSpace = True,
        ignoreTrailingWhiteSpace = True
    ).withColumn(
        'month',
        regexp_extract(
            input_file_name(),
            r'tripdata_(\d{4}-\d{2})\.csv',
            1
        )
    )
    return tlc_df

def get_past_tlc_trips():
    '''Parse TLC CSVs from before 2016-07, filtering for lat-lon columns'''
    past_df = spark.createDataFrame(data = [], schema = past_schema)
    past_pairs = [
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_201[345]-*.csv',
            green_13_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_2016-0[1-6].csv',
            green_13_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2009-*.csv',
            yellow_09_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_201[0-5]-*.csv',
            yellow_09_16_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-0[1-6].csv',
            yellow_09_16_schema
        )
    ]
    for path, schema in past_pairs:
        csv_df = parse_tlc(path, schema).select(
            'month',
            'pickup_longitude',
            'pickup_latitude',
            'dropoff_longitude',
            'dropoff_latitude'
        )
        past_df = past_df.union(csv_df)
    past_df.createOrReplaceTempView('past')
    spark.catalog.cacheTable('past')

def get_modern_tlc_trips():
    '''Parse TLC CSVs from after 2016-06, filtering for taxi zone ID columns'''
    fhv_15_16_df = parse_tlc(
        's3a://nyc-tlc/trip\ data/fhv_tripdata_201[56]-*.csv',
        fhv_15_16_schema
    ).select(
        'month',
        'locationID'
    )
    fhv_15_16_df.createOrReplaceTempView('fhv_15_16')
    spark.catalog.cacheTable('fhv_15_16')

    modern_df = spark.createDataFrame(data = [], schema = modern_schema)
    modern_pairs = [
        (
            's3a://nyc-tlc/trip\ data/fhv_tripdata_201[79]-*.csv',
            fhv_17_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/fhv_tripdata_2018-*.csv',
            fhv_18_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/fhvhv_tripdata_*.csv',
            fhvhv_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_2016-0[789].csv',
            green_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_2016-1*.csv',
            green_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/green_tripdata_201[789]-*.csv',
            green_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-0[789].csv',
            yellow_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-1*.csv',
            yellow_16_19_schema
        ),
        (
            's3a://nyc-tlc/trip\ data/yellow_tripdata_201[789]-*.csv',
            yellow_16_19_schema
        )
    ]
    for path, schema in modern_pairs:
        csv_df = parse_tlc(path, schema).select(
            'month',
            'PULocationID',
            'DOLocationID'
        )
        modern_df = modern_df.union(csv_df)
    modern_df.createOrReplaceTempView('modern')
    spark.catalog.cacheTable('modern')
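# Quick check of the month-extraction regex in parse_tlc (illustrative only):
# Spark applies a Java regex to each filename, but this pattern behaves the
# same under Python's re module:
#
#   >>> import re
#   >>> re.search(
#   ...     r'tripdata_(\d{4}-\d{2})\.csv', 'yellow_tripdata_2016-07.csv'
#   ... ).group(1)
#   '2016-07'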
--------------------------------------------------------------------------------
/src/spark_reduction/load.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from config.database import jdbc_props, jdbc_url


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

def write_citibike(stations, visits):
    '''Write Citibike stations & visits to postgres, then uncache trips'''
    stations.write.jdbc(
        url = jdbc_url,
        table = 'staging.citibike_stations',
        mode = 'overwrite',
        properties = jdbc_props
    )
    visits.write.jdbc(
        url = jdbc_url,
        table = 'staging.citibike_visits',
        mode = 'overwrite',
        properties = jdbc_props
    )
    spark.catalog.uncacheTable('citibike')

def write_tlc(past, modern):
    '''Write past & modern TLC visits to postgres, then uncache trips'''
    past.write.jdbc(
        url = jdbc_url,
        table = 'staging.past_tlc_visits',
        mode = 'overwrite',
        properties = jdbc_props
    )
    spark.catalog.uncacheTable('past')

    modern.write.jdbc(
        url = jdbc_url,
        table = 'staging.modern_tlc_visits',
        mode = 'overwrite',
        properties = jdbc_props
    )
    spark.catalog.uncacheTable('fhv_15_16')
    spark.catalog.uncacheTable('modern')
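# Note (illustrative, not part of the pipeline): mode = 'overwrite' drops and
# recreates each target table. If the staging tables ever carry indexes or
# grants worth keeping, Spark's JDBC 'truncate' option replaces rows without
# dropping the table. Assuming jdbc_props is a plain dict, e.g.:
#
#   visits.write.jdbc(
#       url = jdbc_url,
#       table = 'staging.citibike_visits',
#       mode = 'overwrite',
#       properties = {**jdbc_props, 'truncate': 'true'}
#   )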
--------------------------------------------------------------------------------
/src/spark_reduction/transform.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from config.geometries import \
    TAXI_ZONE_LAT_MIN, TAXI_ZONE_LAT_MAX, \
    TAXI_ZONE_LON_MIN, TAXI_ZONE_LON_MAX


spark = SparkSession.builder \
    .appName('where-cycle') \
    .getOrCreate()

def distill_citibike_stations():
    '''Create list of unique Citibike stations across all trip endpoints'''
    stations_df = spark.sql(f'''
        SELECT
            start_id AS station_id,
            start_latitude AS latitude,
            start_longitude AS longitude
        FROM citibike
        WHERE
            start_latitude BETWEEN
                {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            AND
            start_longitude BETWEEN
                {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
        GROUP BY 1, 2, 3
        UNION
        SELECT
            end_id AS station_id,
            end_latitude AS latitude,
            end_longitude AS longitude
        FROM citibike
        WHERE
            end_latitude BETWEEN
                {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            AND
            end_longitude BETWEEN
                {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
        GROUP BY 1, 2, 3'''.translate({ord(c): ' ' for c in '\n\t'})
    )
    return stations_df

def aggregate_citibike_visits():
    '''Convert Citibike trips to visits and sum by station_id'''
    visits_df = spark.sql('''
        SELECT
            month,
            station_id,
            SUM(visits) AS visits
        FROM (
            SELECT
                start_month AS month,
                start_id AS station_id,
                COUNT(*) AS visits
            FROM citibike
            GROUP BY 1, 2
            UNION ALL
            SELECT
                end_month AS month,
                end_id AS station_id,
                COUNT(*) AS visits
            FROM citibike
            GROUP BY 1, 2
        )
        GROUP BY 1, 2
    ''')
    return visits_df

def aggregate_past_tlc_visits():
    '''
    Convert past TLC trips to visits,
    round lat-lon precision to street level,
    and sum by lat-lon
    '''
    past_df = spark.sql(f'''
        SELECT
            month,
            longitude,
            latitude,
            SUM(visits) AS visits
        FROM (
            SELECT
                month,
                ROUND(pickup_longitude, 3) AS longitude,
                ROUND(pickup_latitude, 3) AS latitude,
                COUNT(*) AS visits
            FROM past
            WHERE
                pickup_longitude BETWEEN
                    {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
                AND
                pickup_latitude BETWEEN
                    {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            GROUP BY 1, 2, 3
            UNION ALL
            SELECT
                month,
                ROUND(dropoff_longitude, 3) AS longitude,
                ROUND(dropoff_latitude, 3) AS latitude,
                COUNT(*) AS visits
            FROM past
            WHERE
                dropoff_longitude BETWEEN
                    {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
                AND
                dropoff_latitude BETWEEN
                    {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
            GROUP BY 1, 2, 3
        )
        GROUP BY 1, 2, 3'''.translate({ord(c): ' ' for c in '\n\t'})
    )
    return past_df

def aggregate_modern_tlc_visits():
    '''
    Convert modern TLC trips to visits,
    ignoring unknown taxi zone IDs,
    and sum by taxi zone ID
    '''
    modern_df = spark.sql('''
        SELECT
            month,
            zone_id,
            SUM(visits) AS visits
        FROM (
            SELECT
                month,
                locationID AS zone_id,
                COUNT(*) AS visits
            FROM fhv_15_16
            WHERE locationID BETWEEN 1 AND 263
            GROUP BY 1, 2
            UNION ALL
            SELECT
                month,
                PULocationID AS zone_id,
                COUNT(*) AS visits
            FROM modern
            WHERE PULocationID BETWEEN 1 AND 263
            GROUP BY 1, 2
            UNION ALL
            SELECT
                month,
                DOLocationID AS zone_id,
                COUNT(*) AS visits
            FROM modern
            WHERE DOLocationID BETWEEN 1 AND 263
            GROUP BY 1, 2
        )
        GROUP BY 1, 2
    ''')
    return modern_df
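# Notes on the filters above (illustrative): ROUND(longitude/latitude, 3)
# buckets coordinates to 0.001 degrees (roughly 100 m, about a city block),
# so past trips collapse to street-level points before the PostGIS zone join.
# The BETWEEN 1 AND 263 predicates keep only real NYC taxi zone IDs; in the
# TLC lookup table, 264 and 265 mark unknown or out-of-town locations.
--------------------------------------------------------------------------------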