├── .gitignore
├── LICENSE
├── README.md
├── dag.png
├── pipeline.png
├── requirements.txt
└── src
    ├── airflow
    │   ├── start_workers.sh
    │   ├── stop_workers.sh
    │   └── where_cycle_dag.py
    ├── config
    │   ├── database.py
    │   ├── geometries.py
    │   ├── ref
    │   │   ├── check_citibike_schema.py
    │   │   ├── check_tlc_schemas.py
    │   │   ├── get_geometries.sql
    │   │   └── tlc_schemas.txt
    │   └── schemas.py
    ├── dash
    │   ├── app.py
    │   └── assets
    │       └── background.css
    ├── postGIS_tables
    │   ├── geo_joined
    │   │   ├── citibike_stations.sql
    │   │   └── past_tlc_visits.sql
    │   ├── production
    │   │   ├── all_time_stats.sql
    │   │   └── taxi_zones.sql
    │   └── statistics
    │       ├── citibike.sql
    │       ├── tlc_visits.sql
    │       └── yelp_businesses.sql
    ├── preparation
    │   ├── extract.py
    │   ├── load.py
    │   └── transform.py
    └── spark_reduction
        ├── driver.py
        ├── extract.py
        ├── load.py
        └── transform.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Temporary resources
2 | benchmark/
3 | dash_project_medium.py
4 | spark-warehouse/
5 |
6 | # Byte-compiled / optimized / DLL files
7 | __pycache__/
8 | *.py[cod]
9 | *$py.class
10 |
11 | # C extensions
12 | *.so
13 |
14 | # Distribution / packaging
15 | .Python
16 | build/
17 | develop-eggs/
18 | dist/
19 | downloads/
20 | eggs/
21 | .eggs/
22 | lib/
23 | lib64/
24 | parts/
25 | sdist/
26 | var/
27 | wheels/
28 | pip-wheel-metadata/
29 | share/python-wheels/
30 | *.egg-info/
31 | .installed.cfg
32 | *.egg
33 | MANIFEST
34 |
35 | # PyInstaller
36 | # Usually these files are written by a python script from a template
37 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
38 | *.manifest
39 | *.spec
40 |
41 | # Installer logs
42 | pip-log.txt
43 | pip-delete-this-directory.txt
44 |
45 | # Unit test / coverage reports
46 | htmlcov/
47 | .tox/
48 | .nox/
49 | .coverage
50 | .coverage.*
51 | .cache
52 | nosetests.xml
53 | coverage.xml
54 | *.cover
55 | *.py,cover
56 | .hypothesis/
57 | .pytest_cache/
58 |
59 | # Translations
60 | *.mo
61 | *.pot
62 |
63 | # Django stuff:
64 | *.log
65 | local_settings.py
66 | db.sqlite3
67 | db.sqlite3-journal
68 |
69 | # Flask stuff:
70 | instance/
71 | .webassets-cache
72 |
73 | # Scrapy stuff:
74 | .scrapy
75 |
76 | # Sphinx documentation
77 | docs/_build/
78 |
79 | # PyBuilder
80 | target/
81 |
82 | # Jupyter Notebook
83 | *.ipynb
84 | .ipynb_checkpoints/
85 |
86 | # IPython
87 | profile_default/
88 | ipython_config.py
89 |
90 | # pyenv
91 | .python-version
92 |
93 | # pipenv
94 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
95 | # However, in case of collaboration, if having platform-specific dependencies or dependencies
96 | # having no cross-platform support, pipenv may install dependencies that don't work, or not
97 | # install all needed dependencies.
98 | #Pipfile.lock
99 |
100 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow
101 | __pypackages__/
102 |
103 | # Celery stuff
104 | celerybeat-schedule
105 | celerybeat.pid
106 |
107 | # SageMath parsed files
108 | *.sage.py
109 |
110 | # Environments
111 | where-cycle-env/
112 | .env
113 | .venv
114 | env/
115 | venv/
116 | ENV/
117 | env.bak/
118 | venv.bak/
119 |
120 | # Spyder project settings
121 | .spyderproject
122 | .spyproject
123 |
124 | # Rope project settings
125 | .ropeproject
126 |
127 | # mkdocs documentation
128 | /site
129 |
130 | # mypy
131 | .mypy_cache/
132 | .dmypy.json
133 | dmypy.json
134 |
135 | # Pyre type checker
136 | .pyre/
137 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2020 Josh Lang
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 | 
3 | 
4 | 
5 | 
6 | 
7 | 
8 | # Where Cycle
9 |
10 | *Getting New Yorkers Back to Business, Safely*
11 |
12 | ## Contents
13 | 1. [Purpose](README.md#purpose)
14 | 1. [Pipeline](README.md#pipeline)
15 | 1. [Summary](README.md#summary)
16 | - [Data](README.md#data)
17 | - [Preparation](README.md#preparation)
18 | - [Spark Reduction](README.md#spark-reduction)
19 | - [PostGIS Tables](README.md#postgis-tables)
20 | - [Dash & Airflow](README.md#dash-and-airflow)
21 | 1. [Spark Optimization](README.md#spark-optimization)
22 | 1. [Setup](README.md#setup)
23 | 1. [Directory Structure](README.md#directory-structure)
24 | 1. [License](README.md#license)
25 |
26 | ## Purpose
27 | As health officials advised social distancing and businesses closed earlier this year, subway and bus ridership plummeted in many large cities. New York saw an almost 90% reduction by late April. Now, as the city is tentatively opening back up, people may be looking to return to their places of work and to support their favorite businesses, but they might be hesitant to utilize public transit, instead seeking open-air alternatives.
28 |
29 | A cursory glance at some transit coverage in NYC makes it clear that, while Citibike is an awesome open-air solution, the available stations can’t immediately meet the needs of the outer boroughs: some expansion is required. **The goal of this pipeline is to synthesize data that may help city planners and Citibike analysts determine which areas could be ideal for Citibike expansion. As an initial step toward that end, it aggregates historical taxi & for-hire vehicle trips, Citibike trips & station density, and business review statistics by taxi zone.**
30 |
31 | *This project was developed by Josh Lang as part of his data engineering fellowship with Insight Data Science in the summer of 2020.*
32 |
33 | ## Pipeline
34 | ![pipeline](pipeline.png)
35 | ![dag](dag.png)
36 |
37 | ## Summary
38 | If you'd prefer to jump right in and start clicking into the functions from that DAG above, then the file that produced it is [here](https://github.com/josh-lang/where-cycle/blob/master/src/airflow/where_cycle_dag.py). Since you can't navigate directly to everything from there, you may also find a glance at the [directory structure](README.md#directory-structure) below handy.
39 |
40 | ### Data
41 | - Citibike Trip Histories: [S3 bucket](https://s3.console.aws.amazon.com/s3/buckets/tripdata), [documentation](https://www.citibikenyc.com/system-data)
42 | - NYC Taxi & Limousine Commission Trip Records: [S3 bucket](https://s3.console.aws.amazon.com/s3/buckets/nyc-tlc), [documentation](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page)
43 | - Yelp Business Search API: [documentation](https://www.yelp.com/developers/documentation/v3/business_search)
44 |
45 | ### Preparation
46 | - In order to index everything by taxi zone, NYC TLC's shapefile needs to be pulled down from S3, processed, and saved to PostgreSQL
47 | - Coordinate reference system is converted from NAD83 to WGS84
48 | - Each polygon is replaced with its equivalent multipolygon
49 | - All geometries are converted to well-known text
50 | - Centroids are then calculated for each taxi zone and used to query Yelp's API, requesting the 50 nearest businesses. These are cleaned and written as well
51 | - Invalid results and duplicates are removed
52 | - Coordinates are unnested and combined into point geometries
53 | - Like with taxi zones, geometries are converted to well-known text
54 | - Citibike's zipped files need to be pulled out of S3, unzipped, and sent back to another S3 bucket before batch processing since Spark can't ingest zip files natively
55 | - This is because Hadoop, which provides its underlying filesystem interface, does not support that compression codec
56 |     - Python's `io.BytesIO` class reads S3's *bytes-like objects* and makes this a quick streaming process, condensed in the sketch after this list
57 |
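That streaming step, condensed from `unzip_csvs` in `src/preparation/extract.py` (the bucket names are the ones this repo uses):

```python
import io
import zipfile
import boto3

s3 = boto3.resource('s3')
source = s3.Bucket('tripdata')  # Citibike's public bucket

for obj in source.objects.all():
    if obj.key.endswith('.zip'):
        # Read the whole zip into memory as a bytes-like object -- no temp files
        buffer = io.BytesIO(obj.get()['Body'].read())
        with zipfile.ZipFile(buffer) as zipped:
            for name in zipped.namelist():
                if name.endswith('.csv'):
                    # Stream each member straight back up to the private bucket
                    s3.meta.client.upload_fileobj(
                        zipped.open(name),
                        Bucket = 'jlang-20b-de-ny',
                        Key = 'citibike/' + name
                    )
```
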
58 | ### Spark Reduction
59 | - Spark can read CSV files directly via Hadoop's s3a connector, and multiple URIs can be specified with globbing; a sketch of the pattern follows this list
60 | - Citibike's trip data is consistent, so parsing all of it requires only one path and one schema definition
61 | - That schema can be truncated because this project isn't concerned with any columns that appear after trip dates and endpoint locations
62 | - TLC data is messier, with 15 distinct CSV headers across the corpus, but 10 truncated schemas are sufficient for pulling everything in correctly
63 | - TLC trips before 2016-07 use coordinates for pickup and dropoff locations, while trips after 2016-06 use taxi zone IDs
64 |     - TLC's timestamps aren't always valid, so schemas are simplified further by omitting them. Dates are instead inferred from CSV filenames, each of which represents one month of trips
65 | - Relevant columns are selected from the CSVs, which are then unioned into 4 cached tables: Citibike trips, past TLC trips, modern TLC trips, and a small table for just the earliest for-hire vehicle trips
66 | - To aggregate visits by taxi zone, trip beginnings and endings need to be combined into endpoints and grouped by location. Four tables are created in PostgreSQL:
67 | - Coordinates for unique Citibike stations within the taxi zone map's extent are pulled out separately from visit aggregation
68 |     - Citibike visits are then aggregated by station ID
69 | - Past TLC visits are aggregated by coordinates within taxi zone extent rounded to 3 decimal places — neighborhood resolution
70 | - Modern TLC visits and those early for-hire vehicle visits are aggregated simply by taxi zone ID
71 |
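A minimal sketch of that read-and-aggregate pattern (the glob and the column choices here are illustrative rather than the pipeline's exact query; `yellow_16_19_schema` comes from `src/config/schemas.py`):

```python
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from config.schemas import yellow_16_19_schema

spark = SparkSession.builder.appName('where-cycle-sketch').getOrCreate()

# One globbed URI covers every monthly CSV that shares this truncated schema
yellow = spark.read.csv(
    's3a://nyc-tlc/trip data/yellow_tripdata_2017-*.csv',
    schema = yellow_16_19_schema,
    header = True
)

# Pickups and dropoffs both count as visits, so stack them
# into a single column of endpoints before grouping
endpoints = yellow.select(F.col('PULocationID').alias('zone_id')) \
    .union(yellow.select(F.col('DOLocationID').alias('zone_id')))

visits = endpoints.groupBy('zone_id').count() \
    .withColumnRenamed('count', 'visits')
```
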
72 | ### PostGIS Tables
73 | - All tables so far have been written to the *staging* schema in PostgreSQL. Now that everything's there, some final processing can be done with the PostGIS extension; a note on running these scripts follows this list
74 | - *geo_joined* schema
75 | - Citibike station coordinates are matched to taxi zone polygons to create a join table for Citibike visits
76 | - Past TLC visits are aggregated by the taxi zone their coordinates are within
77 | - *statistics* schema
78 |     - Citibike stations and trips are aggregated by taxi zone using the join table
79 | - Past TLC visits are unioned and summed with modern TLC visits using taxi zone IDs
80 | - Yelp business ratings and reviews are aggregated by the taxi zone their coordinates are within
81 | - *production* schema
82 | - Taxi zone geometries are converted to GeoJSON for Dash to plot on choropleth maps
83 | - Citibike, TLC, and Yelp statistics are joined to taxi zone dimensions for Dash to define toggleable scales
84 |
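Within the pipeline these scripts are run with `psql -f` (see the DAG tasks below), but the same files could be executed from Python through the project's SQLAlchemy engine. A hypothetical helper, not part of the repo, assuming psycopg2 accepts the multi-statement scripts as-is:

```python
from sqlalchemy import text
from config.database import py_engine

def run_sql_file(path):
    '''Run one postGIS_tables script inside a transaction, e.g.
    run_sql_file('src/postGIS_tables/statistics/yelp_businesses.sql')'''
    with open(path) as f, py_engine.begin() as conn:
        conn.execute(text(f.read()))
```
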
85 | ### Dash and Airflow
86 | - A rudimentary dashboard built with Dash lives at [dats.work/where-cycle](http://dats.work/where-cycle)
87 |     - GeoJSON geometries from PostGIS need to be wrapped as a GeoJSON FeatureCollection inside the Dash app to be plotted on choropleth maps (condensed below)
88 | - Statistics from PostGIS define the choropleth map scales and are also used to create a supplementary bar chart of the top 15 taxi zones for whichever metric is selected
89 | - Airflow adds some fault tolerance and runs the pipeline on a regular basis to keep data up to date
90 | - Dependencies between tasks prevent things from running out of order or unnecessarily when an upstream task has failed
91 |     - The pipeline runs every week so that Yelp has enough time to update meaningfully and so that Citibike and TLC updates can be captured with relatively minimal delay
92 | - Both Citibike and TLC batch their trip data by month, but the date they update their S3 buckets isn't consistent
93 | - Yelp's data is queried directly from their API and may return updated or simply different results each time
94 |     - Startup and shutdown of the standalone Spark cluster are automated within the pipeline to save money
95 |
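The wrapping step is short; condensed from `src/dash/app.py`:

```python
import json
import pandas as pd
from config.database import py_engine

zones = pd.read_sql_table(
    table_name = 'taxi_zones',
    con = py_engine,
    schema = 'production'
)

# Each row holds a zone_id and its geometry as a GeoJSON string, so
# wrapping the rows as Features yields a FeatureCollection Plotly can map
json_zones = {'type': 'FeatureCollection', 'features': []}
for _, row in zones.iterrows():
    json_zones['features'].append({
        'type': 'Feature',
        'id': row['zone_id'],
        'geometry': json.loads(row['geometry'])
    })
```
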
96 | ## Spark Optimization
97 | I tested a handful of methods and configuration changes trying to make the Spark piece of the pipeline run more efficiently. First, since I had already defined each TLC schema while taking my initial stab at ingestion, I wanted to see whether those explicit definitions were, in fact, significantly faster than just using Spark's `inferSchema` option. Defining schemas before reading files was faster (as expected), but it only reduced total runtime by **~2.1%**.
98 |
99 | The most dramatic improvement came with caching each table of source CSVs before running the Spark SQL queries that transform them. This increased my total runtime savings to **~32.9%**! (A sketch of this caching step follows the settings table below.)
100 |
101 | After that, I found that lowering the number of shuffle partitions to match the number of cores in my small cluster, and doubling the maximum bytes in cached storage batches and in each partition, could make things faster still, but only marginally. Changing these settings in my `spark-defaults.conf` file brought the total runtime reduction to **~36.6%**:
102 | | Property | Setting |
103 | | -------- | ------- |
104 | | spark.sql.files.maxPartitionBytes | 268435456 |
105 | | spark.sql.inMemoryColumnarStorage.batchSize | 20000 |
106 | | spark.sql.inMemoryColumnarStorage.compressed | true |
107 | | spark.sql.shuffle.partitions | 12 |
108 |
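For reference, the caching step amounts to calling `.cache()` on each source table before querying it; a minimal sketch (the temp view name is illustrative, while the bucket path and `citibike_schema` are this repo's own):

```python
from pyspark.sql import SparkSession
from config.schemas import citibike_schema

spark = SparkSession.builder.appName('where-cycle-sketch').getOrCreate()

# Shuffle partitions can also be matched to the cluster's core count
# per session instead of in spark-defaults.conf
spark.conf.set('spark.sql.shuffle.partitions', 12)

# Cache the source table once so the Spark SQL queries that follow hit
# in-memory columnar batches instead of re-reading S3
citibike = spark.read.csv(
    's3a://jlang-20b-de-ny/citibike/*.csv',
    schema = citibike_schema,
    header = True
).cache()
citibike.createOrReplaceTempView('citibike_trips')
```
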
109 | ## Setup
110 | Python dependencies can be installed with the following command:
111 | ```sh
112 | pip install -r requirements.txt
113 | ```
114 |
115 | This project was built using an Apache Spark 2.4.5 / Hadoop 2.7 binary downloaded from [spark.apache.org](https://spark.apache.org/downloads.html). It reads from AWS S3 and writes to PostgreSQL, so a driver from [jdbc.postgresql.org](https://jdbc.postgresql.org) should be placed in `spark/jars/` and some configuration should be added to `spark-defaults.conf`:
116 | | Property | Setting |
117 | | -------- | ------- |
118 | | spark.driver.extraClassPath | /usr/local/spark/jars/postgresql-42.2.14.jar |
119 | | spark.driver.extraJavaOptions | -Dcom.amazonaws.services.s3.enableV4=true |
120 | | spark.executor.extraJavaOptions | -Dcom.amazonaws.services.s3.enableV4=true |
121 | | spark.hadoop.fs.s3a.awsAccessKeyId | $AWS_ACCESS_KEY_ID |
122 | | spark.hadoop.fs.s3a.awsSecretAccessKey | $AWS_SECRET_ACCESS_KEY |
123 | | spark.hadoop.fs.s3a.endpoint | $AWS_S3_ENDPOINT |
124 | | spark.hadoop.com.amazonaws.services.s3a.enableV4 | true |
125 | | spark.hadoop.fs.s3a.impl | org.apache.hadoop.fs.s3a.S3AFileSystem |
126 | | spark.jars | /usr/local/spark/jars/postgresql-42.2.14.jar |
127 | | spark.jars.packages | com.amazonaws:aws-java-sdk:1.7.4,org.apache.hadoop:hadoop-aws:2.7.7 |
128 |
129 | This project also depends on PostgreSQL's PostGIS extension, which can be enabled in your database with the `CREATE EXTENSION` command:
130 | ```sh
131 | psql -d yourdatabase -c 'CREATE EXTENSION postgis;'
132 | ```
133 |
134 | ## Directory Structure
135 | ```sh
136 | .
137 | ├── LICENSE
138 | ├── README.md
139 | ├── dag.png
140 | ├── pipeline.png
141 | ├── requirements.txt
142 | └── src/
143 | ├── airflow/
144 | │ ├── start_workers.sh*
145 | │ ├── stop_workers.sh*
146 | │ └── where_cycle_dag.py
147 | ├── config/
148 | │ ├── database.py
149 | │ ├── geometries.py
150 | │ ├── ref/
151 | │ │ ├── check_citibike_schema.py
152 | │ │ ├── check_tlc_schemas.py
153 | │ │ ├── get_geometries.sql
154 | │ │ └── tlc_schemas.txt
155 | │ └── schemas.py
156 | ├── dash/
157 | │ ├── app.py
158 | │ └── assets/
159 | │ └── background.css
160 | ├── postGIS_tables/
161 | │ ├── geo_joined/
162 | │ │ ├── citibike_stations.sql
163 | │ │ └── past_tlc_visits.sql
164 | │ ├── production/
165 | │ │ ├── all_time_stats.sql
166 | │ │ └── taxi_zones.sql
167 | │ └── statistics/
168 | │ ├── citibike.sql
169 | │ ├── tlc_visits.sql
170 | │ └── yelp_businesses.sql
171 | ├── preparation/
172 | │ ├── extract.py
173 | │ ├── load.py
174 | │ └── transform.py
175 | └── spark_reduction/
176 | ├── driver.py
177 | ├── extract.py
178 | ├── load.py
179 | └── transform.py
180 | ```
181 |
182 | ## License
183 | [MIT License](LICENSE)
184 | Copyright (c) 2020 Josh Lang
185 |
--------------------------------------------------------------------------------
/dag.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josh-lang/where-cycle/e11283acf13221f91b45baba12a08816def2a7fd/dag.png
--------------------------------------------------------------------------------
/pipeline.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/josh-lang/where-cycle/e11283acf13221f91b45baba12a08816def2a7fd/pipeline.png
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | alembic==1.4.2
2 | apache-airflow==2.9.3
3 | apispec==1.3.3
4 | argcomplete==1.12.0
5 | attrs==19.3.0
6 | Babel==2.9.1
7 | backcall==0.1.0
8 | bleach==3.3.0
9 | boto3==1.13.23
10 | botocore==1.16.23
11 | Brotli==1.0.7
12 | cached-property==1.5.1
13 | cattrs==1.0.0
14 | certifi==2024.7.4
15 | chardet==3.0.4
16 | click==7.1.2
17 | click-plugins==1.1.1
18 | cligj==0.5.0
19 | colorama==0.4.3
20 | colorlog==4.0.2
21 | configparser==3.5.3
22 | croniter==0.3.34
23 | dash-core-components==2.0.0
24 | dash-html-components==2.0.0
25 | dash-renderer==1.4.1
26 | dash-table==4.7.0
27 | decorator==4.4.2
28 | defusedxml==0.6.0
29 | dill==0.3.2
30 | dnspython==2.6.1
31 | docutils==0.16
32 | email-validator==1.1.1
33 | entrypoints==0.3
34 | findspark==1.4.1
35 | Fiona==1.8.13.post1
36 | Flask==2.3.2
37 | Flask-Admin==1.5.4
38 | Flask-AppBuilder==4.3.11
39 | Flask-Babel==1.0.0
40 | Flask-Caching==1.11.0
41 | Flask-Compress==1.5.0
42 | Flask-JWT-Extended==3.24.1
43 | Flask-Login==0.4.1
44 | Flask-OpenID==1.2.5
45 | Flask-SQLAlchemy==2.4.4
46 | flask-swagger==0.2.13
47 | Flask-WTF==0.14.3
48 | funcsigs==1.0.2
49 | future==0.18.3
50 | GeoAlchemy2==0.8.3
51 | geopandas==0.7.0
52 | graphviz==0.14.1
53 | gunicorn==22.0.0
54 | idna==3.7
55 | importlib-metadata==1.7.0
56 | ipykernel==5.3.0
57 | ipython==8.10.0
58 | ipython-genutils==0.2.0
59 | ipywidgets==7.5.1
60 | iso8601==0.1.12
61 | itsdangerous==1.1.0
62 | jedi==0.17.0
63 | Jinja2==3.1.4
64 | jmespath==0.10.0
65 | json-merge-patch==0.2
66 | jsonschema==3.2.0
67 | jupyter==1.0.0
68 | jupyter-client==6.1.3
69 | jupyter-console==6.1.0
70 | jupyter-core==4.11.2
71 | lazy-object-proxy==1.5.1
72 | lockfile==0.12.2
73 | Mako==1.2.2
74 | Markdown==2.6.11
75 | MarkupSafe==1.1.1
76 | marshmallow==2.21.0
77 | marshmallow-enum==1.5.1
78 | marshmallow-sqlalchemy==0.23.1
79 | mistune==2.0.3
80 | munch==2.5.0
81 | natsort==7.0.1
82 | nbconvert==6.5.1
83 | nbformat==5.0.6
84 | notebook==6.4.12
85 | numpy==1.22.0
86 | packaging==20.4
87 | pandas==1.0.5
88 | pandocfilters==1.4.2
89 | parso==0.7.0
90 | pendulum==1.4.4
91 | pexpect==4.8.0
92 | pickleshare==0.7.5
93 | pip-autoremove==0.9.1
95 | plotly==4.8.1
96 | prison==0.1.3
97 | prometheus-client==0.8.0
98 | prompt-toolkit==3.0.5
99 | psutil==5.7.2
100 | psycopg2-binary==2.8.5
101 | ptyprocess==0.6.0
102 | Pygments==2.15.0
103 | PyJWT==2.4.0
104 | pyparsing==2.4.7
105 | pyproj==2.6.1.post1
106 | pyrsistent==0.16.0
107 | python-daemon==2.2.4
108 | python-dateutil==2.8.1
109 | python-editor==1.0.4
110 | python-nvd3==0.15.0
111 | python-slugify==4.0.1
112 | python3-openid==3.2.0
113 | pytz==2020.1
114 | pytzdata==2020.1
115 | PyYAML==5.4
116 | pyzmq==19.0.1
117 | qtconsole==4.7.4
118 | QtPy==1.9.0
119 | requests==2.32.0
120 | retrying==1.3.3
121 | s3transfer==0.3.3
122 | Send2Trash==1.5.0
123 | setproctitle==1.1.10
124 | Shapely==1.7.0
125 | six==1.15.0
126 | SQLAlchemy==1.3.18
127 | SQLAlchemy-JSONField==0.9.0
128 | SQLAlchemy-Utils==0.36.8
129 | tabulate==0.8.7
130 | tenacity==4.12.0
131 | termcolor==1.1.0
132 | terminado==0.8.3
133 | testpath==0.4.4
134 | text-unidecode==1.3
135 | thrift==0.13.0
136 | tornado==6.4.1
137 | traitlets==4.3.3
138 | typing==3.7.4.3
139 | typing-extensions==3.7.4.2
140 | tzlocal==1.5.1
141 | unicodecsv==0.14.1
142 | urllib3==1.26.19
143 | wcwidth==0.2.3
144 | webencodings==0.5.1
145 | Werkzeug==3.0.3
146 | widgetsnbextension==3.5.1
147 | WTForms==2.3.1
148 | zipp==3.19.1
149 | zope.deprecation==4.4.0
150 |
--------------------------------------------------------------------------------
/src/airflow/start_workers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Send start command to spark_worker instances, wait for all three
4 | # to reach 'running' state, sleep for 15 more seconds just to be
5 | # safe, and then launch Spark
6 |
7 | set -e
8 |
9 | aws ec2 start-instances --instance-ids \
10 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID &&
11 | aws ec2 wait instance-running --instance-ids \
12 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID &&
13 | sleep 15 &&
14 | /usr/local/spark/sbin/start-all.sh
15 |
--------------------------------------------------------------------------------
/src/airflow/stop_workers.sh:
--------------------------------------------------------------------------------
1 | #!/bin/bash
2 |
3 | # Stop spark on all workers and then send stop-instances
4 | # command to AWS
5 |
6 | set -e
7 |
8 | /usr/local/spark/sbin/stop-all.sh &&
9 | aws ec2 stop-instances --instance-ids \
10 | $EC2_WORKER1_ID $EC2_WORKER2_ID $EC2_WORKER3_ID
11 |
--------------------------------------------------------------------------------
/src/airflow/where_cycle_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import datetime, timedelta
2 | from airflow import DAG
3 | from airflow.operators.bash_operator import BashOperator
4 | from airflow.operators.python_operator import PythonOperator
5 | from preparation.extract import \
6 | get_taxi_zones, get_businesses, unzip_csvs
7 | from preparation.transform import \
8 | clean_taxi_zones, calculate_centroids, clean_businesses
9 | from preparation.load import \
10 | write_taxi_zones, write_businesses
11 |
12 |
13 | airflow_path = '/home/ubuntu/where-cycle/src/airflow/'
14 | spark_str = 'cd /home/ubuntu/where-cycle/src/spark_reduction && '
15 | psql_str = 'psql -h $PSQL_HOST -p $PSQL_PORT -U $PSQL_USER -d ' + \
16 | '$PSQL_DATABASE -f /home/ubuntu/where-cycle/src/postGIS_tables/'
17 |
18 | defaults = {
19 | 'owner': 'airflow',
20 | 'start_date': datetime(2020, 6, 21),
21 | 'depends_on_past': False,
22 | 'retries': 2,
23 | 'retry_delay': timedelta(minutes=5)
24 | }
25 |
26 | with DAG(
27 | 'where_cycle',
28 | default_args = defaults,
29 | schedule_interval = '@weekly'
30 | ) as dag:
31 | #******** PREPARATION ********#
32 |
33 | t1 = PythonOperator(
34 | task_id = 'get_taxi_zones',
35 | python_callable = get_taxi_zones
36 | )
37 |
38 | t2 = PythonOperator(
39 | task_id = 'clean_taxi_zones',
40 | python_callable = clean_taxi_zones,
41 | provide_context = True
42 | )
43 |
44 | t3 = PythonOperator(
45 | task_id = 'write_taxi_zones',
46 | python_callable = write_taxi_zones,
47 | provide_context = True
48 | )
49 |
50 | t4 = PythonOperator(
51 | task_id = 'calculate_centroids',
52 | python_callable = calculate_centroids,
53 | provide_context = True
54 | )
55 |
56 | t5 = PythonOperator(
57 | task_id = 'get_businesses',
58 | python_callable = get_businesses,
59 | provide_context = True
60 | )
61 |
62 | t6 = PythonOperator(
63 | task_id = 'clean_businesses',
64 | python_callable = clean_businesses,
65 | provide_context = True
66 | )
67 |
68 | t7 = PythonOperator(
69 | task_id = 'write_businesses',
70 | python_callable = write_businesses,
71 | provide_context = True
72 | )
73 |
74 | t8 = PythonOperator(
75 | task_id = 'unzip_csvs',
76 | python_callable = unzip_csvs
77 | )
78 |
79 | t1 >> t2 >> t3
80 | t1 >> t4 >> t5 >> t6 >> t7
81 |
82 |
83 | #******** SPARK REDUCTION ********#
84 |
85 | t9 = BashOperator(
86 | task_id = 'start_spark_workers',
87 |         bash_command = airflow_path + 'start_workers.sh '  # trailing space keeps Airflow from rendering the .sh file as a Jinja template
88 | )
89 |
90 | t10 = BashOperator(
91 | task_id = 'submit_spark_driver',
92 | bash_command = spark_str + 'spark-submit driver.py'
93 | )
94 |
95 | t11 = BashOperator(
96 | task_id = 'stop_spark_workers',
97 | bash_command = airflow_path + 'stop_workers.sh ',
98 | trigger_rule = 'all_done'
99 | )
100 |
101 | t8 >> t9 >> t10 >> t11
102 |
103 |
104 | #******** POSTGIS TABLES ********#
105 |
106 | t12 = BashOperator(
107 | task_id = 'create_production_taxi_zones',
108 | bash_command = psql_str + 'production/taxi_zones.sql'
109 | )
110 |
111 | t13 = BashOperator(
112 | task_id = 'create_statistics_yelp_businesses',
113 | bash_command = psql_str + 'statistics/yelp_businesses.sql'
114 | )
115 |
116 | t14 = BashOperator(
117 | task_id = 'create_geo_joined_citibike_stations',
118 | bash_command = psql_str + 'geo_joined/citibike_stations.sql'
119 | )
120 |
121 | t15 = BashOperator(
122 | task_id = 'create_statistics_citibike',
123 | bash_command = psql_str + 'statistics/citibike.sql'
124 | )
125 |
126 | t16 = BashOperator(
127 | task_id = 'create_geo_joined_past_tlc_visits',
128 | bash_command = psql_str + 'geo_joined/past_tlc_visits.sql'
129 | )
130 |
131 | t17 = BashOperator(
132 | task_id = 'create_statistics_tlc_visits',
133 | bash_command = psql_str + 'statistics/tlc_visits.sql'
134 | )
135 |
136 | t18 = BashOperator(
137 | task_id = 'create_production_all_time_stats',
138 | bash_command = psql_str + 'production/all_time_stats.sql'
139 | )
140 |
141 | t3 >> t12
142 | t7 >> t13
143 | t10 >> t14 >> t15
144 | t10 >> t16 >> t17
145 | [t13, t15, t17] >> t18
146 |
--------------------------------------------------------------------------------
/src/config/database.py:
--------------------------------------------------------------------------------
1 | import os
2 | import sqlalchemy as sql
3 |
4 |
5 | jdbc_props = {
6 | 'driver': 'org.postgresql.Driver',
7 | 'user': os.environ['PSQL_USER'],
8 | 'password': os.environ['PSQL_PASSWORD']
9 | }
10 |
11 | jdbc_url = 'jdbc:postgresql://' + \
12 | os.environ['PSQL_HOST'] + ':' + os.environ['PSQL_PORT'] + \
13 | '/' + os.environ['PSQL_DATABASE']
14 |
15 | py_engine = sql.create_engine(
16 | 'postgresql://' +
17 | os.environ['PSQL_USER'] + ':' + os.environ['PSQL_PASSWORD'] +
18 | '@' + os.environ['PSQL_HOST'] + ':' + os.environ['PSQL_PORT'] +
19 | '/' + os.environ['PSQL_DATABASE']
20 | )
21 |
--------------------------------------------------------------------------------
/src/config/geometries.py:
--------------------------------------------------------------------------------
1 | TAXI_ZONE_CENTROID_LAT = 40.7058240860865
2 | TAXI_ZONE_CENTROID_LON = -73.9778002135437
3 | TAXI_ZONE_LAT_MIN = 40.4961153951704
4 | TAXI_ZONE_LAT_MAX = 40.9155327770026
5 | TAXI_ZONE_LON_MIN = -74.2555913631521
6 | TAXI_ZONE_LON_MAX = -73.7000090639354
7 |
--------------------------------------------------------------------------------
/src/config/ref/check_citibike_schema.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | from pyspark.sql import SparkSession
3 |
4 |
5 | s3 = boto3.resource('s3')
6 | bucket = s3.Bucket('jlang-20b-de-ny')
7 |
8 | spark = SparkSession.builder \
9 | .appName('check-citibike-schema') \
10 | .getOrCreate()
11 |
12 | for obj in bucket.objects.all():
13 | key = obj.key
14 | if key.startswith('citibike/') and key.endswith('.csv'):
15 | path = 's3a://jlang-20b-de-ny/' + key
16 | csv_df = spark.read.csv(
17 | path = path,
18 | header = True,
19 | inferSchema = True,
20 | enforceSchema = False,
21 | ignoreLeadingWhiteSpace = True,
22 | ignoreTrailingWhiteSpace = True,
23 | samplingRatio = 0.1
24 | )
25 | print(path)
26 | csv_df.printSchema()
27 |
--------------------------------------------------------------------------------
/src/config/ref/check_tlc_schemas.py:
--------------------------------------------------------------------------------
1 | import boto3
2 | from pyspark.sql import SparkSession
3 |
4 |
5 | s3 = boto3.resource('s3')
6 | nyc_tlc = s3.Bucket('nyc-tlc')
7 |
8 | spark = SparkSession.builder \
9 | .appName('check_tlc_schemas') \
10 | .getOrCreate()
11 |
12 | for obj in nyc_tlc.objects.all():
13 | key = obj.key
14 | if key.startswith('trip data/') and key.endswith('.csv'):
15 | path = 's3a://nyc-tlc/' + key
16 | csv_df = spark.read.csv(
17 | path = path,
18 | header = True,
19 | inferSchema = True,
20 | enforceSchema = False,
21 | ignoreLeadingWhiteSpace = True,
22 | ignoreTrailingWhiteSpace = True,
23 | samplingRatio = 0.1
24 | )
25 | print(path)
26 | csv_df.printSchema()
27 |
--------------------------------------------------------------------------------
/src/config/ref/get_geometries.sql:
--------------------------------------------------------------------------------
1 | SELECT ST_Extent(geometry) AS bbox
2 | FROM staging.taxi_zones;
3 |
4 | SELECT ST_AsText(ST_Centroid(ST_Extent(geometry))) AS center
5 | FROM staging.taxi_zones;
6 |
--------------------------------------------------------------------------------
/src/config/ref/tlc_schemas.txt:
--------------------------------------------------------------------------------
1 | lat-lon data
2 | green_tripdata_2013-08_2016-06_9-cols
3 | green_tripdata_2013-08_2014-12
4 | |-- VendorID: integer (nullable = true)
5 | |-- lpep_pickup_datetime: timestamp (nullable = true)
6 | |-- Lpep_dropoff_datetime: timestamp (nullable = true)
7 | |-- Store_and_fwd_flag: string (nullable = true)
8 | |-- RateCodeID: integer (nullable = true)
9 | |-- Pickup_longitude: double (nullable = true)
10 | |-- Pickup_latitude: double (nullable = true)
11 | |-- Dropoff_longitude: double (nullable = true)
12 | |-- Dropoff_latitude: double (nullable = true)
13 | |-- Passenger_count: integer (nullable = true)
14 | |-- Trip_distance: double (nullable = true)
15 | |-- Fare_amount: double (nullable = true)
16 | |-- Extra: double (nullable = true)
17 | |-- MTA_tax: double (nullable = true)
18 | |-- Tip_amount: double (nullable = true)
19 | |-- Tolls_amount: double (nullable = true)
20 | |-- Ehail_fee: string (nullable = true)
21 | |-- Total_amount: double (nullable = true)
22 | |-- Payment_type: integer (nullable = true)
23 | |-- Trip_type: string (nullable = true)
24 |
25 | green_tripdata_2015-01_2016-06_9-cols
26 | |-- VendorID: integer (nullable = true)
27 | |-- lpep_pickup_datetime: timestamp (nullable = true)
28 | |-- Lpep_dropoff_datetime: timestamp (nullable = true)
29 | |-- Store_and_fwd_flag: string (nullable = true)
30 | |-- RateCodeID: integer (nullable = true)
31 | |-- Pickup_longitude: double (nullable = true)
32 | |-- Pickup_latitude: double (nullable = true)
33 | |-- Dropoff_longitude: double (nullable = true)
34 | |-- Dropoff_latitude: double (nullable = true)
35 | |-- Passenger_count: integer (nullable = true)
36 | |-- Trip_distance: double (nullable = true)
37 | |-- Fare_amount: double (nullable = true)
38 | |-- Extra: double (nullable = true)
39 | |-- MTA_tax: double (nullable = true)
40 | |-- Tip_amount: double (nullable = true)
41 | |-- Tolls_amount: double (nullable = true)
42 | |-- Ehail_fee: string (nullable = true)
43 | |-- improvement_surcharge: double (nullable = true)
44 | |-- Total_amount: double (nullable = true)
45 | |-- Payment_type: integer (nullable = true)
46 | |-- Trip_type: integer (nullable = true)
47 |
48 | yellow_tripdata_2009-01_2016-06_11-cols
49 | yellow_tripdata_2009-01_2009-12
50 | |-- vendor_name: string (nullable = true)
51 | |-- Trip_Pickup_DateTime: timestamp (nullable = true)
52 | |-- Trip_Dropoff_DateTime: timestamp (nullable = true)
53 | |-- Passenger_Count: integer (nullable = true)
54 | |-- Trip_Distance: double (nullable = true)
55 | |-- Start_Lon: double (nullable = true)
56 | |-- Start_Lat: double (nullable = true)
57 | |-- Rate_Code: string (nullable = true)
58 | |-- store_and_forward: integer (nullable = true)
59 | |-- End_Lon: double (nullable = true)
60 | |-- End_Lat: double (nullable = true)
61 | |-- Payment_Type: string (nullable = true)
62 | |-- Fare_Amt: double (nullable = true)
63 | |-- surcharge: double (nullable = true)
64 | |-- mta_tax: string (nullable = true)
65 | |-- Tip_Amt: double (nullable = true)
66 | |-- Tolls_Amt: double (nullable = true)
67 | |-- Total_Amt: double (nullable = true)
68 |
69 | yellow_tripdata_2010-01_2014-12
70 | |-- vendor_id: string (nullable = true)
71 | |-- pickup_datetime: timestamp (nullable = true)
72 | |-- dropoff_datetime: timestamp (nullable = true)
73 | |-- passenger_count: integer (nullable = true)
74 | |-- trip_distance: double (nullable = true)
75 | |-- pickup_longitude: double (nullable = true)
76 | |-- pickup_latitude: double (nullable = true)
77 | |-- rate_code: integer (nullable = true)
78 | |-- store_and_fwd_flag: integer (nullable = true)
79 | |-- dropoff_longitude: double (nullable = true)
80 | |-- dropoff_latitude: double (nullable = true)
81 | |-- payment_type: string (nullable = true)
82 | |-- fare_amount: double (nullable = true)
83 | |-- surcharge: double (nullable = true)
84 | |-- mta_tax: double (nullable = true)
85 | |-- tip_amount: double (nullable = true)
86 | |-- tolls_amount: double (nullable = true)
87 | |-- total_amount: double (nullable = true)
88 |
89 | yellow_tripdata_2015-01_2016-06
90 | |-- VendorID: integer (nullable = true)
91 | |-- tpep_pickup_datetime: timestamp (nullable = true)
92 | |-- tpep_dropoff_datetime: timestamp (nullable = true)
93 | |-- passenger_count: integer (nullable = true)
94 | |-- trip_distance: double (nullable = true)
95 | |-- pickup_longitude: double (nullable = true)
96 | |-- pickup_latitude: double (nullable = true)
97 | |-- RateCodeID: integer (nullable = true)
98 | |-- store_and_fwd_flag: string (nullable = true)
99 | |-- dropoff_longitude: double (nullable = true)
100 | |-- dropoff_latitude: double (nullable = true)
101 | |-- payment_type: integer (nullable = true)
102 | |-- fare_amount: double (nullable = true)
103 | |-- extra: double (nullable = true)
104 | |-- mta_tax: double (nullable = true)
105 | |-- tip_amount: double (nullable = true)
106 | |-- tolls_amount: double (nullable = true)
107 | |-- improvement_surcharge: double (nullable = true)
108 | |-- total_amount: double (nullable = true)
109 |
110 | LocationID data
111 | fhv_tripdata_2015-01_2016-12_3-cols
112 | |-- Dispatching_base_num: string (nullable = true)
113 | |-- Pickup_date: timestamp (nullable = true)
114 | |-- locationID: integer (nullable = true)
115 |
116 | fhv_tripdata_2017-01_2019-12_5-cols
117 | fhv_tripdata_2017-01_2017-06
118 | |-- Dispatching_base_num: string (nullable = true)
119 | |-- Pickup_DateTime: timestamp (nullable = true)
120 | |-- DropOff_datetime: string (nullable = true)
121 | |-- PUlocationID: integer (nullable = true)
122 | |-- DOlocationID: integer (nullable = true)
123 |
124 | fhv_tripdata_2017-07_2017-12
125 | |-- Dispatching_base_num: string (nullable = true)
126 | |-- Pickup_DateTime: timestamp (nullable = true)
127 | |-- DropOff_datetime: string (nullable = true)
128 | |-- PUlocationID: integer (nullable = true)
129 | |-- DOlocationID: integer (nullable = true)
130 | |-- SR_Flag: integer (nullable = true)
131 |
132 | fhv_tripdata_2019-01_2019-12
133 | |-- dispatching_base_num: string (nullable = true)
134 | |-- pickup_datetime: timestamp (nullable = true)
135 | |-- dropoff_datetime: timestamp (nullable = true)
136 | |-- PULocationID: integer (nullable = true)
137 | |-- DOLocationID: integer (nullable = true)
138 | |-- SR_Flag: integer (nullable = true)
139 |
140 | fhv_tripdata_2018-01_2018-12_4-cols
141 | |-- Pickup_DateTime: timestamp (nullable = true)
142 | |-- DropOff_datetime: timestamp (nullable = true)
143 | |-- PUlocationID: integer (nullable = true)
144 | |-- DOlocationID: integer (nullable = true)
145 | |-- SR_Flag: integer (nullable = true)
146 | |-- Dispatching_base_number: string (nullable = true)
147 | |-- Dispatching_base_num: string (nullable = true)
148 |
149 | fhvhv_tripdata_6-cols
150 | |-- hvfhs_license_num: string (nullable = true)
151 | |-- dispatching_base_num: string (nullable = true)
152 | |-- pickup_datetime: timestamp (nullable = true)
153 | |-- dropoff_datetime: timestamp (nullable = true)
154 | |-- PULocationID: integer (nullable = true)
155 | |-- DOLocationID: integer (nullable = true)
156 | |-- SR_Flag: integer (nullable = true)
157 |
158 | green_tripdata_2016-07_2019-12_7-cols
159 | green_tripdata_2016-07_2018-12
160 | |-- VendorID: integer (nullable = true)
161 | |-- lpep_pickup_datetime: timestamp (nullable = true)
162 | |-- lpep_dropoff_datetime: timestamp (nullable = true)
163 | |-- store_and_fwd_flag: string (nullable = true)
164 | |-- RatecodeID: integer (nullable = true)
165 | |-- PULocationID: integer (nullable = true)
166 | |-- DOLocationID: integer (nullable = true)
167 | |-- passenger_count: integer (nullable = true)
168 | |-- trip_distance: double (nullable = true)
169 | |-- fare_amount: double (nullable = true)
170 | |-- extra: double (nullable = true)
171 | |-- mta_tax: double (nullable = true)
172 | |-- tip_amount: double (nullable = true)
173 | |-- tolls_amount: double (nullable = true)
174 | |-- ehail_fee: string (nullable = true)
175 | |-- improvement_surcharge: double (nullable = true)
176 | |-- total_amount: double (nullable = true)
177 | |-- payment_type: integer (nullable = true)
178 | |-- trip_type: integer (nullable = true)
179 |
180 | green_tripdata_2019-01_2019-12_7-cols
181 | |-- VendorID: integer (nullable = true)
182 | |-- lpep_pickup_datetime: timestamp (nullable = true)
183 | |-- lpep_dropoff_datetime: timestamp (nullable = true)
184 | |-- store_and_fwd_flag: string (nullable = true)
185 | |-- RatecodeID: integer (nullable = true)
186 | |-- PULocationID: integer (nullable = true)
187 | |-- DOLocationID: integer (nullable = true)
188 | |-- passenger_count: integer (nullable = true)
189 | |-- trip_distance: double (nullable = true)
190 | |-- fare_amount: double (nullable = true)
191 | |-- extra: double (nullable = true)
192 | |-- mta_tax: double (nullable = true)
193 | |-- tip_amount: double (nullable = true)
194 | |-- tolls_amount: double (nullable = true)
195 | |-- ehail_fee: string (nullable = true)
196 | |-- improvement_surcharge: double (nullable = true)
197 | |-- total_amount: double (nullable = true)
198 | |-- payment_type: integer (nullable = true)
199 | |-- trip_type: integer (nullable = true)
200 | |-- congestion_surcharge: integer (nullable = true)
201 |
202 | yellow_tripdata_2016-07_2019-12_9-cols
203 | yellow_tripdata_2016-07_2018-12
204 | |-- VendorID: integer (nullable = true)
205 | |-- tpep_pickup_datetime: timestamp (nullable = true)
206 | |-- tpep_dropoff_datetime: timestamp (nullable = true)
207 | |-- passenger_count: integer (nullable = true)
208 | |-- trip_distance: double (nullable = true)
209 | |-- RatecodeID: integer (nullable = true)
210 | |-- store_and_fwd_flag: string (nullable = true)
211 | |-- PULocationID: integer (nullable = true)
212 | |-- DOLocationID: integer (nullable = true)
213 | |-- payment_type: integer (nullable = true)
214 | |-- fare_amount: double (nullable = true)
215 | |-- extra: double (nullable = true)
216 | |-- mta_tax: double (nullable = true)
217 | |-- tip_amount: double (nullable = true)
218 | |-- tolls_amount: double (nullable = true)
219 | |-- improvement_surcharge: double (nullable = true)
220 | |-- total_amount: double (nullable = true)
221 |
222 | yellow_tripdata_2019-01_2019-12_9-cols
223 | |-- VendorID: integer (nullable = true)
224 | |-- tpep_pickup_datetime: timestamp (nullable = true)
225 | |-- tpep_dropoff_datetime: timestamp (nullable = true)
226 | |-- passenger_count: integer (nullable = true)
227 | |-- trip_distance: double (nullable = true)
228 | |-- RatecodeID: integer (nullable = true)
229 | |-- store_and_fwd_flag: string (nullable = true)
230 | |-- PULocationID: integer (nullable = true)
231 | |-- DOLocationID: integer (nullable = true)
232 | |-- payment_type: integer (nullable = true)
233 | |-- fare_amount: double (nullable = true)
234 | |-- extra: double (nullable = true)
235 | |-- mta_tax: double (nullable = true)
236 | |-- tip_amount: double (nullable = true)
237 | |-- tolls_amount: double (nullable = true)
238 | |-- improvement_surcharge: double (nullable = true)
239 | |-- total_amount: double (nullable = true)
240 | |-- congestion_surcharge: double (nullable = true)
241 |
--------------------------------------------------------------------------------
/src/config/schemas.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql.types import StructType, StructField, \
2 | IntegerType, TimestampType, StringType, DoubleType
3 |
4 | citibike_schema = StructType(
5 | [
6 | StructField('tripduration', IntegerType(), True),
7 | StructField('starttime', TimestampType(), True),
8 | StructField('stoptime', TimestampType(), True),
9 | StructField('start station id', IntegerType(), True),
10 | StructField('start station name', StringType(), True),
11 | StructField('start station latitude', DoubleType(), True),
12 | StructField('start station longitude', DoubleType(), True),
13 | StructField('end station id', IntegerType(), True),
14 | StructField('end station name', StringType(), True),
15 | StructField('end station latitude', DoubleType(), True),
16 | StructField('end station longitude', DoubleType(), True)
17 | ]
18 | )
19 |
20 | past_schema = StructType(
21 | [
22 | StructField('month', StringType(), True),
23 | StructField('pickup_longitude', DoubleType(), True),
24 | StructField('pickup_latitude', DoubleType(), True),
25 | StructField('dropoff_longitude', DoubleType(), True),
26 | StructField('dropoff_latitude', DoubleType(), True)
27 | ]
28 | )
29 |
30 | modern_schema = StructType(
31 | [
32 | StructField('month', StringType(), True),
33 | StructField('PULocationID', IntegerType(), True),
34 | StructField('DOLocationID', IntegerType(), True)
35 | ]
36 | )
37 |
38 | green_13_16_schema = StructType(
39 | [
40 | StructField('VendorID', IntegerType(), True),
41 | StructField('lpep_pickup_datetime', TimestampType(), True),
42 | StructField('Lpep_dropoff_datetime', TimestampType(), True),
43 | StructField('Store_and_fwd_flag', StringType(), True),
44 | StructField('RateCodeID', IntegerType(), True),
45 | StructField('pickup_longitude', DoubleType(), True),
46 | StructField('pickup_latitude', DoubleType(), True),
47 | StructField('dropoff_longitude', DoubleType(), True),
48 | StructField('dropoff_latitude', DoubleType(), True)
49 | ]
50 | )
51 |
52 | yellow_09_16_schema = StructType(
53 | [
54 | StructField('VendorID', StringType(), True),
55 | StructField('tpep_pickup_datetime', TimestampType(), True),
56 | StructField('tpep_dropoff_datetime', TimestampType(), True),
57 | StructField('passenger_count', IntegerType(), True),
58 | StructField('trip_distance', DoubleType(), True),
59 | StructField('pickup_longitude', DoubleType(), True),
60 | StructField('pickup_latitude', DoubleType(), True),
61 | StructField('RateCodeID', StringType(), True),
62 | StructField('store_and_fwd_flag', StringType(), True),
63 | StructField('dropoff_longitude', DoubleType(), True),
64 | StructField('dropoff_latitude', DoubleType(), True)
65 | ]
66 | )
67 |
68 | fhv_15_16_schema = StructType(
69 | [
70 | StructField('Dispatching_base_num', StringType(), True),
71 | StructField('Pickup_date', TimestampType(), True),
72 | StructField('locationID', IntegerType(), True)
73 | ]
74 | )
75 |
76 | fhv_17_19_schema = StructType(
77 | [
78 | StructField('Dispatching_base_num', StringType(), True),
79 | StructField('Pickup_DateTime', TimestampType(), True),
80 | StructField('DropOff_datetime', TimestampType(), True),
81 | StructField('PUlocationID', IntegerType(), True),
82 | StructField('DOlocationID', IntegerType(), True)
83 | ]
84 | )
85 |
86 | fhv_18_schema = StructType(
87 | [
88 | StructField('Pickup_DateTime', TimestampType(), True),
89 | StructField('DropOff_datetime', TimestampType(), True),
90 | StructField('PUlocationID', IntegerType(), True),
91 | StructField('DOlocationID', IntegerType(), True)
92 | ]
93 | )
94 |
95 | fhvhv_schema = StructType(
96 | [
97 | StructField('hvfhs_license_num', StringType(), True),
98 | StructField('dispatching_base_num', StringType(), True),
99 | StructField('pickup_datetime', TimestampType(), True),
100 | StructField('dropoff_datetime', TimestampType(), True),
101 | StructField('PULocationID', IntegerType(), True),
102 | StructField('DOLocationID', IntegerType(), True)
103 | ]
104 | )
105 |
106 | green_16_19_schema = StructType(
107 | [
108 | StructField('VendorID', IntegerType(), True),
109 | StructField('lpep_pickup_datetime', TimestampType(), True),
110 | StructField('lpep_dropoff_datetime', TimestampType(), True),
111 | StructField('store_and_fwd_flag', StringType(), True),
112 | StructField('RatecodeID', IntegerType(), True),
113 | StructField('PULocationID', IntegerType(), True),
114 | StructField('DOLocationID', IntegerType(), True)
115 | ]
116 | )
117 |
118 | yellow_16_19_schema = StructType(
119 | [
120 | StructField('VendorID', IntegerType(), True),
121 | StructField('tpep_pickup_datetime', TimestampType(), True),
122 | StructField('tpep_dropoff_datetime', TimestampType(), True),
123 | StructField('passenger_count', IntegerType(), True),
124 | StructField('trip_distance', DoubleType(), True),
125 | StructField('RatecodeID', IntegerType(), True),
126 | StructField('store_and_fwd_flag', StringType(), True),
127 | StructField('PULocationID', IntegerType(), True),
128 | StructField('DOLocationID', IntegerType(), True)
129 | ]
130 | )
131 |
--------------------------------------------------------------------------------
/src/dash/app.py:
--------------------------------------------------------------------------------
1 | import json
2 | import os
3 | import dash
4 | import dash_core_components as dcc
5 | import dash_html_components as html
6 | import flask
7 | import pandas as pd
8 | import plotly
9 | import plotly.graph_objects as go
10 | from config.database import py_engine
11 | from config.geometries import \
12 | TAXI_ZONE_CENTROID_LAT, TAXI_ZONE_CENTROID_LON
13 |
14 |
15 | zones = pd.read_sql_table(
16 | table_name = 'taxi_zones',
17 | con = py_engine,
18 | schema = 'production'
19 | )
20 |
21 | json_zones = {'type': 'FeatureCollection', 'features': []}
22 | for _, row in zones.iterrows():
23 | feature = {
24 | 'type':'Feature',
25 | 'id': row['zone_id'],
26 | 'geometry': json.loads(row['geometry'])
27 | }
28 | json_zones['features'].append(feature)
29 |
30 | stats = pd.read_sql_table(
31 | table_name = 'all_time_stats',
32 | con = py_engine,
33 | schema = 'production'
34 | )
35 |
36 | columns = [
37 | 'tlc_visits',
38 | 'citibike_visits',
39 | 'citibike_stations',
40 | # 'yelp_avg_rating',
41 | # 'yelp_sum_reviews',
42 | 'yelp_weighted_sum_reviews'
43 | ]
44 |
45 | map_views = []
46 | bar_charts = []
47 |
48 | for column in columns:
49 | map_views.append(
50 | go.Choroplethmapbox(
51 | geojson = json_zones,
52 | locations = stats['zone_id'].tolist(),
53 | z = stats[column].tolist(),
54 | text = stats['zone_name'] + ', ' + stats['borough'],
55 | visible = False,
56 | subplot = 'mapbox',
57 |             hovertemplate = '%{text}<br>' +
58 |                 '%{z}<br>' +
59 |                 '<extra></extra>'
60 | )
61 | )
62 |
63 | top = stats.sort_values([column], ascending = False).head(15)
64 | bar_charts.append(
65 | go.Bar(
66 | x = top[column],
67 | y = top['zone_name'] + ', ' + top['borough'],
68 | text = top['zone_name'] + ', ' + top['borough'],
69 | textposition = 'inside',
70 |             hovertemplate = '%{text}<br>' +
71 |                 '%{x}<br>' +
72 |                 '<extra></extra>',
73 | xaxis = 'x',
74 | yaxis = 'y',
75 | marker = dict(color = 'blue'),
76 | visible = False,
77 | name = '',
78 | orientation = 'h'
79 | )
80 | )
81 |
82 | map_views[0]['visible'] = True
83 | bar_charts[0]['visible'] = True
84 |
85 | fig = go.Figure(data = map_views + bar_charts)
86 |
87 | fig.update_layout(
88 | title = dict(
89 | text = 'Where Cycle',
90 | font = dict(size = 36),
91 | x = 0.5,
92 | xanchor = 'center'
93 | ),
94 | autosize = True,
95 | height = 700,
96 | mapbox = dict(
97 | domain = dict(x = [0.25, 1], y = [0, 1]),
98 | accesstoken = os.environ['MAPBOX_ACCESS_TOKEN'],
99 | style = 'dark',
100 | center = dict(
101 | lon = TAXI_ZONE_CENTROID_LON,
102 | lat = TAXI_ZONE_CENTROID_LAT
103 | ),
104 | zoom = 9.35
105 | ),
106 | xaxis = dict(
107 | domain = [0, 0.25],
108 | anchor = 'x',
109 | showticklabels = True,
110 | showgrid = True
111 | ),
112 | yaxis = dict(
113 | domain = [0, 1],
114 | anchor = 'y',
115 | autorange = 'reversed',
116 | visible = False
117 | ),
118 | margin = dict(l = 0, r = 0, t = 70, b = 50),
119 | paper_bgcolor='black',
120 | plot_bgcolor='black'
121 | )
122 |
123 | fig.update_layout(
124 | updatemenus = [dict(
125 | x = 0,
126 | y = 1,
127 | xanchor = 'left',
128 | yanchor = 'bottom',
129 | buttons = list([
130 | dict(
131 | args = [
132 | 'visible',
133 | [True, False, False, False] # , False, False]
134 | ],
135 | label = 'Taxi Visits',
136 | method = 'restyle'
137 | ),
138 | dict(
139 | args = [
140 | 'visible',
141 | [False, True, False, False] # , False, False]
142 | ],
143 | label = 'Citibike Visits',
144 | method = 'restyle'
145 | ),
146 | dict(
147 | args = [
148 | 'visible',
149 | [False, False, True, False] # , False, False]
150 | ],
151 | label = 'Citibike Stations',
152 | method = 'restyle'
153 | ),
154 | # dict(
155 | # args = [
156 | # 'visible',
157 | # [False, False, False, True, False, False]
158 | # ],
159 | # label = 'Yelp Average Rating',
160 | # method = 'restyle'
161 | # ),
162 | # dict(
163 | # args = [
164 | # 'visible',
165 | # [False, False, False, False, True, False]
166 | # ],
167 | # label = 'Yelp Reviews',
168 | # method = 'restyle'
169 | # ),
170 | dict(
171 | args = [
172 | 'visible',
173 | [False, False, False, True] # , False, True]
174 | ],
175 | label = 'Yelp Stars (weighted review count)',
176 | method = 'restyle'
177 | )
178 | ]),
179 | )]
180 | )
181 |
182 | server = flask.Flask(__name__)
183 | stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']
184 |
185 | app = dash.Dash(
186 | __name__,
187 | external_stylesheets = stylesheets,
188 | server = server
189 | )
190 |
191 | app.layout = html.Div([
192 | dcc.Location(
193 | id = 'url',
194 | pathname = '/where-cycle',
195 | refresh = False
196 | ),
197 | dcc.Graph(figure = fig),
198 | html.Div([
199 | 'Read more about this project on ',
200 | html.A(
201 | ['Github'],
202 | href = 'https://github.com/josh-lang/where-cycle'
203 | )
204 | ])
205 | ])
206 |
207 | app.title = 'Where Cycle'
208 |
209 | if __name__ == '__main__':
210 | app.run_server(
211 | debug = False,
212 | dev_tools_props_check = False,
213 | dev_tools_ui = False
214 | )
215 |
--------------------------------------------------------------------------------
/src/dash/assets/background.css:
--------------------------------------------------------------------------------
1 | body {
2 | background-color: black;
3 | color: rgb(42, 63, 95);
4 | }
5 |
--------------------------------------------------------------------------------
/src/postGIS_tables/geo_joined/citibike_stations.sql:
--------------------------------------------------------------------------------
1 | -- Create join table for taxi zones and Citibike stations
2 |
3 | DROP TABLE IF EXISTS geo_joined.citibike_stations;
4 |
5 | CREATE TABLE geo_joined.citibike_stations AS
6 | SELECT
7 | z.zone_id,
8 | c.station_id
9 | FROM
10 | staging.taxi_zones AS z
11 | JOIN (
12 | SELECT
13 | station_id,
14 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS geometry
15 | FROM staging.citibike_stations
16 | ) AS c
17 | ON ST_WITHIN(c.geometry, z.geometry)
18 | GROUP BY 1, 2;
19 |
--------------------------------------------------------------------------------
/src/postGIS_tables/geo_joined/past_tlc_visits.sql:
--------------------------------------------------------------------------------
1 | -- Aggregate past TLC visits by the taxi zone their coordinates are within
2 |
3 | DROP TABLE IF EXISTS geo_joined.past_tlc_visits;
4 |
5 | CREATE TABLE geo_joined.past_tlc_visits AS
6 | SELECT
7 | p.month,
8 | z.zone_id,
9 | SUM(p.visits) AS visits
10 | FROM
11 | staging.taxi_zones AS z
12 | JOIN (
13 | SELECT
14 | month,
15 | ST_SetSRID(ST_MakePoint(longitude, latitude), 4326) AS geometry,
16 | visits
17 | FROM staging.past_tlc_visits
18 | ) AS p
19 | ON ST_WITHIN(p.geometry, z.geometry)
20 | GROUP BY 1, 2;
21 |
--------------------------------------------------------------------------------
/src/postGIS_tables/production/all_time_stats.sql:
--------------------------------------------------------------------------------
1 | -- Join Citibike, TLC, and Yelp statistics to taxi zones for Dash
2 |
3 | DROP TABLE IF EXISTS production.all_time_stats;
4 |
5 | CREATE TABLE production.all_time_stats AS
6 | SELECT
7 | v.zone_id,
8 | v.zone_name,
9 | v.borough,
10 | v.tlc_visits,
11 | v.citibike_visits,
12 | v.citibike_stations,
13 | y.avg_rating AS yelp_avg_rating,
14 | y.sum_reviews AS yelp_sum_reviews,
15 | y.weighted_sum_reviews AS yelp_weighted_sum_reviews
16 | FROM
17 | (
18 | SELECT
19 | z.zone_id,
20 | z.zone_name,
21 | z.borough,
22 | COALESCE(SUM(t.visits), 0) AS tlc_visits,
23 | COALESCE(SUM(c.visits), 0) AS citibike_visits,
24 | COALESCE(MAX(c.stations), 0) AS citibike_stations
25 | FROM
26 | staging.taxi_zones AS z
27 | LEFT JOIN statistics.tlc_visits AS t USING (zone_id)
28 | LEFT JOIN statistics.citibike AS c
29 | ON t.zone_id = c.zone_id AND t.month = c.month
30 | GROUP BY 1, 2, 3
31 | ) AS v
32 | LEFT JOIN statistics.yelp_businesses AS y USING (zone_id)
33 | ORDER BY 1;
34 |
--------------------------------------------------------------------------------
/src/postGIS_tables/production/taxi_zones.sql:
--------------------------------------------------------------------------------
1 | -- Convert taxi zone geometries to GeoJSON for Dash
2 |
3 | DROP TABLE IF EXISTS production.taxi_zones;
4 |
5 | CREATE TABLE production.taxi_zones AS
6 | SELECT
7 | zone_id,
8 | ST_ASGeoJSON(ST_ForcePolygonCW(geometry)) AS geometry
9 | FROM staging.taxi_zones
10 | ORDER BY 1;
11 |
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/citibike.sql:
--------------------------------------------------------------------------------
1 | -- Aggregate Citibike visits by taxi zone
2 | -- and estimate monthly station additions with rolling maximum
3 |
4 | DROP TABLE IF EXISTS statistics.citibike;
5 |
6 | CREATE TABLE statistics.citibike AS
7 | SELECT
8 | t.month,
9 | t.zone_id,
10 | MAX(active_stations) OVER (
11 | PARTITION BY t.zone_id
12 | ORDER BY t.month ROWS BETWEEN UNBOUNDED PRECEDING AND CURRENT ROW
13 | ) AS stations,
14 | visits
15 | FROM (
16 | SELECT
17 | v.month,
18 | s.zone_id,
19 | COUNT(s.station_id) AS active_stations,
20 | SUM(v.visits) AS visits
21 | FROM
22 | geo_joined.citibike_stations AS s
23 | JOIN staging.citibike_visits AS v
24 | USING (station_id)
25 | GROUP BY 1, 2
26 | ) AS t;
27 |
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/tlc_visits.sql:
--------------------------------------------------------------------------------
1 | -- Combine past TLC visits with modern TLC visits
2 | -- and aggregate by taxi zone ID
3 |
4 | DROP TABLE IF EXISTS statistics.tlc_visits;
5 |
6 | CREATE TABLE statistics.tlc_visits AS
7 | SELECT
8 | t.month,
9 | t.zone_id,
10 | SUM(t.visits) AS visits
11 | FROM (
12 | SELECT
13 | month,
14 | zone_id,
15 | visits
16 | FROM geo_joined.past_tlc_visits
17 | UNION ALL
18 | SELECT
19 | month,
20 | zone_id,
21 | visits
22 | FROM staging.modern_tlc_visits
23 | ) AS t
24 | GROUP BY 1, 2;
25 |
--------------------------------------------------------------------------------
/src/postGIS_tables/statistics/yelp_businesses.sql:
--------------------------------------------------------------------------------
1 | -- Aggregate Yelp business ratings and reviews by taxi zone
2 |
3 | DROP TABLE IF EXISTS statistics.yelp_businesses;
4 |
5 | CREATE TABLE statistics.yelp_businesses AS
6 | SELECT
7 | z.zone_id,
8 | AVG(y.rating) AS avg_rating,
9 | SUM(y.review_count) AS sum_reviews,
10 | SUM(y.review_count * y.rating) AS weighted_sum_reviews
11 | FROM
12 | staging.taxi_zones AS z
13 | JOIN staging.yelp_businesses AS y
14 | ON ST_Within(y.geometry, z.geometry)
15 | GROUP BY 1;
16 |
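Note: `ST_Within` can use a spatial index, so if this join is slow, a GiST index on the point geometries may help (a sketch; the index name is hypothetical, and `py_engine` is the engine defined in `config/database.py`):

```python
from sqlalchemy import text
from config.database import py_engine

# Hypothetical GiST index to accelerate the ST_Within point-in-polygon join
with py_engine.begin() as conn:
    conn.execute(text(
        'CREATE INDEX IF NOT EXISTS yelp_businesses_geometry_idx '
        'ON staging.yelp_businesses USING GIST (geometry)'
    ))
```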
--------------------------------------------------------------------------------
/src/preparation/extract.py:
--------------------------------------------------------------------------------
1 | import io
2 | import os
3 | import requests
4 | import time
5 | import zipfile
6 | import boto3
7 | import geopandas as gpd
8 | import pandas as pd
9 |
10 |
11 | s3 = boto3.resource('s3')
12 |
13 | def get_taxi_zones():
14 | '''Pull taxi zone shapefile and convert to WGS 84 (EPSG:4326)'''
15 | s3.meta.client.download_file(
16 | 'nyc-tlc',
17 | 'misc/taxi_zones.zip',
18 | 'taxi_zones.zip'
19 | )
20 | taxi_zones = gpd.read_file('zip://taxi_zones.zip') \
21 | .to_crs('EPSG:4326') \
22 | .filter(
23 | [
24 | 'OBJECTID',
25 | 'zone',
26 | 'borough',
27 | 'geometry'
28 | ],
29 | axis = 1
30 | ).rename(
31 | columns = {
32 | 'OBJECTID': 'zone_id',
33 | 'zone': 'zone_name'
34 | }
35 | )
36 | os.remove('taxi_zones.zip')
37 | return taxi_zones
38 |
39 | def get_businesses(**kwargs):
40 | '''For each taxi zone, query the Yelp API for the businesses closest to its centroid'''
41 | ti = kwargs['ti']
42 | centroids = ti.xcom_pull(task_ids = 'calculate_centroids')
43 |
44 | api_key = 'Bearer ' + os.environ['YELP_API_KEY']
45 | head = {'Authorization': api_key}
46 | url = 'https://api.yelp.com/v3/businesses/search'
47 | businesses = pd.DataFrame()
48 |
49 | for _, row in centroids.iterrows():
50 | query = {
51 | 'latitude': row['latitude'],
52 | 'longitude': row['longitude'],
53 | 'radius': 3000,
54 | 'limit': 50,
55 | 'sort_by': 'distance'
56 | }
57 | response = requests.get(url, headers = head, params = query)
58 | json = response.json()
59 |
60 | retries = 0
61 | while retries <= 10 and 'error' in json:
62 | retries += 1
63 | time.sleep(1)
64 | response = requests.get(url, headers = head, params = query)
65 | json = response.json()
66 | matches = json.get('businesses', [])  # empty if retries exhausted on an error payload
67 | businesses = businesses.append(matches, ignore_index = True)
68 | return businesses
69 |
70 | def unzip_csvs():
71 | '''Iterate over relevant zipped files, unzip, and upload to private S3'''
72 | source = s3.Bucket('tripdata')
73 |
74 | for obj in source.objects.all():
75 | key = obj.key
76 |
77 | if not key.startswith('201307-201402') and key.endswith('.zip'):
78 | buffer = io.BytesIO(obj.get()['Body'].read())
79 | zipped = zipfile.ZipFile(buffer)
80 |
81 | for name in zipped.namelist():
82 |
83 | if not name.startswith('_') and name.endswith('.csv'):
84 | s3.meta.client.upload_fileobj(
85 | zipped.open(name),
86 | Bucket = 'jlang-20b-de-ny',
87 | Key = 'citibike/' + name
88 | )
89 |
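The retry loop above waits a flat second between attempts; a variant with exponential backoff might look like this (a sketch, not the project's code; `max_retries` and the backoff base are assumptions):

```python
import time
import requests

def search_with_backoff(url, head, query, max_retries = 5):
    '''Sketch: retry a Yelp search with exponential backoff.'''
    for attempt in range(max_retries + 1):
        response = requests.get(url, headers = head, params = query)
        payload = response.json()
        if 'error' not in payload:
            return payload.get('businesses', [])
        time.sleep(2 ** attempt)  # 1s, 2s, 4s, ...
    return []  # give up on this centroid instead of raising
```

A caller could then accumulate results with `pd.concat([businesses, pd.DataFrame(matches)], ignore_index = True)`, which also sidesteps `DataFrame.append` having been removed in pandas 2.0.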
--------------------------------------------------------------------------------
/src/preparation/load.py:
--------------------------------------------------------------------------------
1 | import pandas
2 | from geoalchemy2 import Geometry
3 | from sqlalchemy import Float, Integer, String
4 | from config.database import py_engine
5 |
6 |
7 | def write_taxi_zones(**kwargs):
8 | '''Write taxi zone map to postgres'''
9 | ti = kwargs['ti']
10 | taxi_zones = ti.xcom_pull(task_ids = 'clean_taxi_zones')
11 |
12 | taxi_zones.to_sql(
13 | name = 'taxi_zones',
14 | con = py_engine,
15 | schema = 'staging',
16 | if_exists = 'replace',
17 | index = False,
18 | index_label = 'zone_id',
19 | dtype = {
20 | 'zone_id': Integer(),
21 | 'zone_name': String(length = 45),
22 | 'borough': String(length = 13),
23 | 'geometry': Geometry('MULTIPOLYGON', 4326)
24 | }
25 | )
26 |
27 | def write_businesses(**kwargs):
28 | '''Write Yelp business data to postgres for further processing'''
29 | ti = kwargs['ti']
30 | businesses = ti.xcom_pull(task_ids = 'clean_businesses')
31 |
32 | businesses.to_sql(
33 | name = 'yelp_businesses',
34 | con = py_engine,
35 | schema = 'staging',
36 | if_exists = 'replace',
37 | index = False,
38 | index_label = 'business_id',
39 | dtype = {
40 | 'business_id': String(22),
41 | 'review_count': Integer(),
42 | 'rating': Float(),
43 | 'geometry': Geometry('POINT', 4326)
44 | }
45 | )
46 |
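A quick read-back check for the geometry round-trip (a sketch):

```python
import pandas as pd
from config.database import py_engine

# WKTElement values written above should come back as MULTIPOLYGON WKT
sample = pd.read_sql(
    'SELECT zone_id, ST_AsText(geometry) AS wkt FROM staging.taxi_zones LIMIT 3',
    con = py_engine
)
print(sample)
```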
--------------------------------------------------------------------------------
/src/preparation/transform.py:
--------------------------------------------------------------------------------
1 | import geopandas as gpd
2 | import pandas as pd
3 | from geoalchemy2 import WKTElement
4 | from shapely.geometry.multipolygon import MultiPolygon
5 | from shapely.geometry.polygon import Polygon
6 |
7 |
8 | def clean_taxi_zones(**kwargs):
9 | '''Make geometry column consistent for writing to postgres'''
10 | ti = kwargs['ti']
11 | taxi_zones = ti.xcom_pull(task_ids = 'get_taxi_zones')
12 |
13 | def homogenize(geometry):
14 | '''
15 | Convert any Polygon to a MultiPolygon,
16 | then convert either geometry type to a WKTElement
17 | '''
18 | multi = MultiPolygon([geometry]) if isinstance(geometry, Polygon) else geometry
19 | return WKTElement(multi.wkt, srid = 4326)
20 |
21 | taxi_zones['geometry'] = taxi_zones['geometry'].apply(homogenize)
22 | return taxi_zones
23 |
24 | def calculate_centroids(**kwargs):
25 | '''Calculate centroids for each taxi zone and extract lat-lons'''
26 | ti = kwargs['ti']
27 | taxi_zones = ti.xcom_pull(task_ids = 'get_taxi_zones')
28 |
29 | centroids = pd.DataFrame.from_dict({
30 | 'latitude': taxi_zones['geometry'].centroid.y,
31 | 'longitude': taxi_zones['geometry'].centroid.x
32 | })
33 | return centroids
34 |
35 | def clean_businesses(**kwargs):
36 | '''
37 | Drop invalid and duplicated businesses,
38 | unnest lat-lons, & combine into geometry column
39 | '''
40 | ti = kwargs['ti']
41 | businesses = ti.xcom_pull(task_ids = 'get_businesses')
42 |
43 | businesses.drop(
44 | businesses[businesses.distance > 3000].index,
45 | inplace = True
46 | )
47 | businesses = businesses.sort_values('distance') \
48 | .drop_duplicates('id', keep = 'first') \
49 | .sort_index()
50 | businesses.reset_index(
51 | drop = True,
52 | inplace = True
53 | )
54 |
55 | business_coordinates = pd.json_normalize(businesses.coordinates)
56 | business_coordinates.dropna(how = 'any', inplace = True)
57 |
58 | businesses_flat = businesses.join(business_coordinates, how = 'inner')
59 | businesses_flat.reset_index(drop = True, inplace = True)
60 |
61 | businesses_geo = gpd.GeoDataFrame(
62 | businesses_flat,
63 | geometry = gpd.points_from_xy(
64 | businesses_flat.longitude,
65 | businesses_flat.latitude
66 | )
67 | )
68 | businesses_geo['geometry'] = businesses_geo.geometry.apply(
69 | lambda point: WKTElement(point.wkt, srid = 4326)
70 | )
71 |
72 | businesses_writable = businesses_geo.filter(
73 | [
74 | 'id',
75 | 'review_count',
76 | 'rating',
77 | 'geometry'
78 | ],
79 | axis = 1
80 | )
81 | return businesses_writable
82 |
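Note: taking `.centroid` of geometries in EPSG:4326 computes centroids on unprojected lat-lons, which GeoPandas warns can be inaccurate. A variant that projects first (a sketch; EPSG:2263, the NY Long Island state plane, is an assumed choice):

```python
import pandas as pd

def calculate_centroids_projected(taxi_zones):
    '''Sketch: compute centroids in a planar CRS, then convert back.'''
    projected = taxi_zones['geometry'].to_crs('EPSG:2263')
    centroids = projected.centroid.to_crs('EPSG:4326')
    return pd.DataFrame({
        'latitude': centroids.y,
        'longitude': centroids.x
    })
```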
--------------------------------------------------------------------------------
/src/spark_reduction/driver.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from spark_reduction.extract import \
3 | get_citibike_trips, get_past_tlc_trips, get_modern_tlc_trips
4 | from spark_reduction.transform import \
5 | distill_citibike_stations, aggregate_citibike_visits, \
6 | aggregate_past_tlc_visits, aggregate_modern_tlc_visits
7 | from spark_reduction.load import \
8 | write_citibike, write_tlc
9 |
10 |
11 | spark = SparkSession.builder \
12 | .appName('where-cycle') \
13 | .getOrCreate()
14 |
15 | # Parse CSVs from S3 and cache tables
16 | get_citibike_trips()
17 | get_past_tlc_trips()
18 | get_modern_tlc_trips()
19 |
20 | # Reduce tables to meaningful dataframes
21 | stations = distill_citibike_stations()
22 | citibike_visits = aggregate_citibike_visits()
23 | past_visits = aggregate_past_tlc_visits()
24 | modern_visits = aggregate_modern_tlc_visits()
25 |
26 | # Write dataframes to postgres
27 | write_citibike(stations, citibike_visits)
28 | write_tlc(past_visits, modern_visits)
29 |
30 | spark.stop()
31 |
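Note: this driver is meant to be launched through spark-submit with the PostgreSQL JDBC driver on the classpath, e.g. `spark-submit --packages org.postgresql:postgresql:<version> src/spark_reduction/driver.py`; the project's actual invocation is not reproduced in this excerpt, so treat the flags as assumptions.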
--------------------------------------------------------------------------------
/src/spark_reduction/extract.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql.functions import input_file_name, regexp_extract
3 | from config.schemas import *
4 |
5 |
6 | spark = SparkSession.builder \
7 | .appName('where-cycle') \
8 | .getOrCreate()
9 |
10 | def get_citibike_trips():
11 | '''Parse Citibike CSVs, format date columns, & rename location columns'''
12 | citibike_df = spark.read.csv(
13 | path = 's3a://jlang-20b-de-ny/citibike/*.csv',
14 | schema = citibike_schema,
15 | header = True,
16 | ignoreLeadingWhiteSpace = True,
17 | ignoreTrailingWhiteSpace = True
18 | ).withColumnRenamed('start station id', 'start_id') \
19 | .withColumnRenamed('start station latitude', 'start_latitude') \
20 | .withColumnRenamed('start station longitude', 'start_longitude') \
21 | .withColumnRenamed('end station id', 'end_id') \
22 | .withColumnRenamed('end station latitude', 'end_latitude') \
23 | .withColumnRenamed('end station longitude', 'end_longitude') \
24 | .selectExpr(
25 | 'DATE_FORMAT(starttime, "yyyy-MM") AS start_month',
26 | 'DATE_FORMAT(stoptime, "yyyy-MM") AS end_month',
27 | 'start_id',
28 | 'start_latitude',
29 | 'start_longitude',
30 | 'end_id',
31 | 'end_latitude',
32 | 'end_longitude'
33 | )
34 | citibike_df.createOrReplaceTempView('citibike')
35 | spark.catalog.cacheTable('citibike')
36 |
37 | def parse_tlc(path, schema):
38 | '''Parse TLC CSVs, inferring the trip month from the filename'''
39 | tlc_df = spark.read.csv(
40 | path = path,
41 | schema = schema,
42 | header = True,
43 | ignoreLeadingWhiteSpace = True,
44 | ignoreTrailingWhiteSpace = True
45 | ).withColumn(
46 | 'month',
47 | regexp_extract(
48 | input_file_name(),
49 | r'tripdata_(\d{4}-\d{2})\.csv',
50 | 1
51 | )
52 | )
53 | return tlc_df
54 |
55 | def get_past_tlc_trips():
56 | '''Parse TLC CSVs from before 2016-07, filtering for lat-lon columns'''
57 | past_df = spark.createDataFrame(data = [], schema = past_schema)
58 | past_pairs = [
59 | (
60 | 's3a://nyc-tlc/trip\ data/green_tripdata_201[345]-*.csv',
61 | green_13_16_schema
62 | ),
63 | (
64 | 's3a://nyc-tlc/trip\ data/green_tripdata_2016-0[1-6].csv',
65 | green_13_16_schema
66 | ),
67 | (
68 | 's3a://nyc-tlc/trip\ data/yellow_tripdata_2009-*.csv',
69 | yellow_09_16_schema
70 | ),
71 | (
72 | 's3a://nyc-tlc/trip\ data/yellow_tripdata_201[0-5]-*.csv',
73 | yellow_09_16_schema
74 | ),
75 | (
76 | 's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-0[1-6].csv',
77 | yellow_09_16_schema
78 | )
79 | ]
80 | for path, schema in past_pairs:
81 | csv_df = parse_tlc(path, schema).select(
82 | 'month',
83 | 'pickup_longitude',
84 | 'pickup_latitude',
85 | 'dropoff_longitude',
86 | 'dropoff_latitude'
87 | )
88 | past_df = past_df.union(csv_df)
89 | past_df.createOrReplaceTempView('past')
90 | spark.catalog.cacheTable('past')
91 |
92 | def get_modern_tlc_trips():
93 | '''Parse TLC CSVs from after 2016-06, filtering for taxi zone ID columns'''
94 | fhv_15_16_df = parse_tlc(
95 | 's3a://nyc-tlc/trip\ data/fhv_tripdata_201[56]-*.csv',
96 | fhv_15_16_schema
97 | ).select(
98 | 'month',
99 | 'locationID'
100 | )
101 | fhv_15_16_df.createOrReplaceTempView('fhv_15_16')
102 | spark.catalog.cacheTable('fhv_15_16')
103 |
104 | modern_df = spark.createDataFrame(data = [], schema = modern_schema)
105 | modern_pairs = [
106 | (
107 | 's3a://nyc-tlc/trip\ data/fhv_tripdata_201[79]-*.csv',
108 | fhv_17_19_schema
109 | ),
110 | (
111 | 's3a://nyc-tlc/trip\ data/fhv_tripdata_2018-*.csv',
112 | fhv_18_schema
113 | ),
114 | (
115 | 's3a://nyc-tlc/trip\ data/fhvhv_tripdata_*.csv',
116 | fhvhv_schema
117 | ),
118 | (
119 | 's3a://nyc-tlc/trip\ data/green_tripdata_2016-0[789].csv',
120 | green_16_19_schema
121 | ),
122 | (
123 | 's3a://nyc-tlc/trip\ data/green_tripdata_2016-1*.csv',
124 | green_16_19_schema
125 | ),
126 | (
127 | 's3a://nyc-tlc/trip\ data/green_tripdata_201[789]-*.csv',
128 | green_16_19_schema
129 | ),
130 | (
131 | 's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-0[789].csv',
132 | yellow_16_19_schema
133 | ),
134 | (
135 | 's3a://nyc-tlc/trip\ data/yellow_tripdata_2016-1*.csv',
136 | yellow_16_19_schema
137 | ),
138 | (
139 | 's3a://nyc-tlc/trip\ data/yellow_tripdata_201[789]-*.csv',
140 | yellow_16_19_schema
141 | )
142 | ]
143 | for path, schema in modern_pairs:
144 | csv_df = parse_tlc(path, schema).select(
145 | 'month',
146 | 'PULocationID',
147 | 'DOLocationID'
148 | )
149 | modern_df = modern_df.union(csv_df)
150 | modern_df.createOrReplaceTempView('modern')
151 | spark.catalog.cacheTable('modern')
152 |
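The month-extraction regex in `parse_tlc` can be checked in isolation (illustrative):

```python
import re

sample = 'trip data/yellow_tripdata_2016-07.csv'
match = re.search(r'tripdata_(\d{4}-\d{2})\.csv', sample)
print(match.group(1))  # 2016-07
```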
--------------------------------------------------------------------------------
/src/spark_reduction/load.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from config.database import jdbc_props, jdbc_url
3 |
4 |
5 | spark = SparkSession.builder \
6 | .appName('where-cycle') \
7 | .getOrCreate()
8 |
9 | def write_citibike(stations, visits):
10 | stations.write.jdbc(
11 | url = jdbc_url,
12 | table = 'staging.citibike_stations',
13 | mode = 'overwrite',
14 | properties = jdbc_props
15 | )
16 | visits.write.jdbc(
17 | url = jdbc_url,
18 | table = 'staging.citibike_visits',
19 | mode = 'overwrite',
20 | properties = jdbc_props
21 | )
22 | spark.catalog.uncacheTable('citibike')
23 |
24 | def write_tlc(past, modern):
25 | past.write.jdbc(
26 | url = jdbc_url,
27 | table = 'staging.past_tlc_visits',
28 | mode = 'overwrite',
29 | properties = jdbc_props
30 | )
31 | spark.catalog.uncacheTable('past')
32 |
33 | modern.write.jdbc(
34 | url = jdbc_url,
35 | table = 'staging.modern_tlc_visits',
36 | mode = 'overwrite',
37 | properties = jdbc_props
38 | )
39 | spark.catalog.uncacheTable('fhv_15_16')
40 | spark.catalog.uncacheTable('modern')
41 |
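`jdbc_url` and `jdbc_props` come from `config/database.py`, which is not reproduced in this excerpt; for a PostgreSQL sink they would typically take this shape (assumed values):

```python
# Assumed shapes only; the real values live in config/database.py
jdbc_url = 'jdbc:postgresql://<host>:5432/<database>'
jdbc_props = {
    'user': '<user>',
    'password': '<password>',
    'driver': 'org.postgresql.Driver'
}
```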
--------------------------------------------------------------------------------
/src/spark_reduction/transform.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from config.geometries import \
3 | TAXI_ZONE_LAT_MIN, TAXI_ZONE_LAT_MAX, \
4 | TAXI_ZONE_LON_MIN, TAXI_ZONE_LON_MAX
5 |
6 |
7 | spark = SparkSession.builder \
8 | .appName('where-cycle') \
9 | .getOrCreate()
10 |
11 | def distill_citibike_stations():
12 | '''Create a list of unique Citibike stations from all trip endpoints'''
13 | stations_df = spark.sql(f'''
14 | SELECT
15 | start_id AS station_id,
16 | start_latitude AS latitude,
17 | start_longitude AS longitude
18 | FROM citibike
19 | WHERE
20 | start_latitude BETWEEN
21 | {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
22 | AND
23 | start_longitude BETWEEN
24 | {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
25 | GROUP BY 1, 2, 3
26 | UNION
27 | SELECT
28 | end_id AS station_id,
29 | end_latitude AS latitude,
30 | end_longitude AS longitude
31 | FROM citibike
32 | WHERE
33 | end_latitude BETWEEN
34 | {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
35 | AND
36 | end_longitude BETWEEN
37 | {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
38 | GROUP BY 1, 2, 3'''.translate({ord(c): ' ' for c in '\n\t'})
39 | )
40 | return stations_df
41 |
42 | def aggregate_citibike_visits():
43 | '''Convert Citibike trips to visits and sum by station_id'''
44 | visits_df = spark.sql('''
45 | SELECT
46 | month,
47 | station_id,
48 | SUM(visits) AS visits
49 | FROM (
50 | SELECT
51 | start_month AS month,
52 | start_id AS station_id,
53 | COUNT(*) AS visits
54 | FROM citibike
55 | GROUP BY 1, 2
56 | UNION ALL
57 | SELECT
58 | end_month AS month,
59 | end_id AS station_id,
60 | COUNT(*) AS visits
61 | FROM citibike
62 | GROUP BY 1, 2
63 | )
64 | GROUP BY 1, 2
65 | ''')
66 | return visits_df
67 |
68 | def aggregate_past_tlc_visits():
69 | '''
70 | Convert past TLC trips to visits,
71 | round lat-lon precision to street level,
72 | and sum by lat-lon
73 | '''
74 | past_df = spark.sql(f'''
75 | SELECT
76 | month,
77 | longitude,
78 | latitude,
79 | SUM(visits) AS visits
80 | FROM (
81 | SELECT
82 | month,
83 | ROUND(pickup_longitude, 3) AS longitude,
84 | ROUND(pickup_latitude, 3) AS latitude,
85 | COUNT(*) AS visits
86 | FROM past
87 | WHERE
88 | pickup_longitude BETWEEN
89 | {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
90 | AND
91 | pickup_latitude BETWEEN
92 | {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
93 | GROUP BY 1, 2, 3
94 | UNION ALL
95 | SELECT
96 | month,
97 | ROUND(dropoff_longitude, 3) AS longitude,
98 | ROUND(dropoff_latitude, 3) AS latitude,
99 | COUNT(*) AS visits
100 | FROM past
101 | WHERE
102 | dropoff_longitude BETWEEN
103 | {TAXI_ZONE_LON_MIN} AND {TAXI_ZONE_LON_MAX}
104 | AND
105 | dropoff_latitude BETWEEN
106 | {TAXI_ZONE_LAT_MIN} AND {TAXI_ZONE_LAT_MAX}
107 | GROUP BY 1, 2, 3
108 | )
109 | GROUP BY 1, 2, 3'''.translate({ord(c): ' ' for c in '\n\t'})
110 | )
111 | return past_df
112 |
113 | def aggregate_modern_tlc_visits():
114 | '''
115 | Convert modern TLC trips to visits,
116 | ignoring unknown taxi zone IDs,
117 | and sum by taxi zone ID
118 | '''
119 | modern_df = spark.sql('''
120 | SELECT
121 | month,
122 | zone_id,
123 | SUM(visits) AS visits
124 | FROM (
125 | SELECT
126 | month,
127 | locationID AS zone_id,
128 | COUNT(*) AS visits
129 | FROM fhv_15_16
130 | WHERE locationID BETWEEN 1 AND 263
131 | GROUP BY 1, 2
132 | UNION ALL
133 | SELECT
134 | month,
135 | PULocationID AS zone_id,
136 | COUNT(*) AS visits
137 | FROM modern
138 | WHERE PULocationID BETWEEN 1 AND 263
139 | GROUP BY 1, 2
140 | UNION ALL
141 | SELECT
142 | month,
143 | DOLocationID AS zone_id,
144 | COUNT(*) AS visits
145 | FROM modern
146 | WHERE DOLocationID BETWEEN 1 AND 263
147 | GROUP BY 1, 2
148 | )
149 | GROUP BY 1, 2
150 | ''')
151 | return modern_df
152 |
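The `.translate(...)` suffix on two of these queries maps newlines and tabs to spaces, flattening the multi-line f-string into a single line before Spark parses it (presumably to keep logged SQL on one line); a quick demonstration:

```python
query = 'SELECT\n\t1'
flat = query.translate({ord(c): ' ' for c in '\n\t'})
print(repr(flat))  # 'SELECT  1'
```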
--------------------------------------------------------------------------------