├── pictures
    ├── nyc-how-log.png
    ├── nyc-matplotlib.png
    ├── taxi-overview.png
    └── green_dropoff_plot_hd.png
├── downloadDatasetsWithWget.sh
├── countTotalRecords.q
├── createTableYellowTaxi_2009_2014.q
├── createTableGreenTaxi.q
├── createTableYellowTaxi_2016-07_2016-12.q
├── createTableYellowTaxi_2015_2016-06.q
├── README.md
└── downloadDatasetsWithRequest.py

/pictures/nyc-how-log.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/nyc-how-log.png
--------------------------------------------------------------------------------
/pictures/nyc-matplotlib.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/nyc-matplotlib.png
--------------------------------------------------------------------------------
/pictures/taxi-overview.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/taxi-overview.png
--------------------------------------------------------------------------------
/pictures/green_dropoff_plot_hd.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/green_dropoff_plot_hd.png
--------------------------------------------------------------------------------
/downloadDatasetsWithWget.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Downloads one CSV per month for each year range below.
3 | # The {13..16} / {01..12} ranges are expanded by the shell before wget runs;
4 | # zero-padded ranges need bash 4+, hence the explicit bash shebang.
5 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_20{13..16}-{01..12}.csv
6 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_20{09..16}-{01..12}.csv
7 | 
--------------------------------------------------------------------------------
/countTotalRecords.q:
--------------------------------------------------------------------------------
1 | -- Total number of rows across the green-taxi table and the three yellow-taxi tables.
2 | -- Each branch returns a single row holding that table's COUNT(*). UNION ALL keeps
3 | -- all four rows (a plain UNION would merge equal counts) and the outer SUM adds them.
4 | SELECT SUM(cnt) AS total_records
5 | FROM (
6 |     SELECT COUNT(*) AS cnt FROM green_taxi
7 |     UNION ALL
8 |     SELECT COUNT(*) AS cnt FROM yellow_taxi_2009_2014
9 |     UNION ALL
10 |     SELECT COUNT(*) AS cnt FROM yellow_taxi_2015_2016_06
11 |     UNION ALL
12 |     SELECT COUNT(*) AS cnt FROM yellow_taxi_2016_07_2016_12
13 | ) AS per_table;
--------------------------------------------------------------------------------
/createTableYellowTaxi_2009_2014.q:
--------------------------------------------------------------------------------
1 | -- External Hive table over the comma-delimited files stored under LOCATION.
2 | -- Delimited text is matched to columns by position, so the column order below
3 | -- must follow the field order in the files.
4 | -- IF NOT EXISTS lets this script be re-run without failing on an existing table.
5 | -- NOTE(review): payment_type is VARCHAR(10) here but VARCHAR(3) in the other
6 | -- tables -- presumably these files carry longer values; confirm against the data.
7 | CREATE EXTERNAL TABLE IF NOT EXISTS yellow_taxi_2009_2014 (
8 |     vendorID              VARCHAR(3),
9 |     pickup_datetime       TIMESTAMP,
10 |     dropoff_datetime      TIMESTAMP,
11 |     passenger_count       SMALLINT,
12 |     trip_distance         DECIMAL(6,3),
13 |     pickup_longitude      DECIMAL(9,6),
14 |     pickup_latitude       DECIMAL(9,6),
15 |     ratecodeID            SMALLINT,
16 |     store_and_fwd_flag    VARCHAR(1),
17 |     dropoff_longitude     DECIMAL(9,6),
18 |     dropoff_latitude      DECIMAL(9,6),
19 |     payment_type          VARCHAR(10),
20 |     fare_amount           DECIMAL(6,2),
21 |     extra                 DECIMAL(6,2),
22 |     mta_tax               DECIMAL(6,2),
23 |     tip_amount            DECIMAL(6,2),
24 |     tolls_amount          DECIMAL(6,2),
25 |     total_amount          DECIMAL(6,2)
26 | )
27 | ROW FORMAT DELIMITED
28 |     FIELDS TERMINATED BY ','
29 |     LINES TERMINATED BY '\n'
30 | -- State the storage format explicitly instead of relying on hive.default.fileformat.
31 | STORED AS TEXTFILE
32 | LOCATION 's3a://nyctaxidataset/yellow_taxi/yellow_taxi_2009_2014/'
33 | -- Skip the first line of every file (the CSV header).
34 | TBLPROPERTIES ("skip.header.line.count"="1");
--------------------------------------------------------------------------------
/createTableGreenTaxi.q:
--------------------------------------------------------------------------------
1 | -- External Hive table over the comma-delimited files stored under LOCATION.
2 | -- Delimited text is matched to columns by position, so the column order below
3 | -- must follow the field order in the files.
4 | -- IF NOT EXISTS lets this script be re-run without failing on an existing table.
5 | CREATE EXTERNAL TABLE IF NOT EXISTS green_taxi (
6 |     vendorID              VARCHAR(3),
7 |     pickup_datetime       TIMESTAMP,
8 |     dropoff_datetime      TIMESTAMP,
9 |     store_and_fwd_flag    VARCHAR(1),
10 |     ratecodeID            SMALLINT,
11 |     pickup_longitude      DECIMAL(9,6),
12 |     pickup_latitude       DECIMAL(9,6),
13 |     dropoff_longitude     DECIMAL(9,6),
14 |     dropoff_latitude      DECIMAL(9,6),
15 |     passenger_count       SMALLINT,
16 |     trip_distance         DECIMAL(6,3),
17 |     fare_amount           DECIMAL(6,2),
18 |     extra                 DECIMAL(6,2),
19 |     mta_tax               DECIMAL(6,2),
20 |     tip_amount            DECIMAL(6,2),
21 |     tolls_amount          DECIMAL(6,2),
22 |     ehail_fee             DECIMAL(6,2),
23 |     total_amount          DECIMAL(6,2),
24 |     payment_type          VARCHAR(3),
25 |     trip_type             SMALLINT
26 | )
27 | ROW FORMAT DELIMITED
28 |     FIELDS TERMINATED BY ','
29 |     LINES TERMINATED BY '\n'
30 | -- State the storage format explicitly instead of relying on hive.default.fileformat.
31 | STORED AS TEXTFILE
32 | LOCATION 's3a://nyctaxidataset/green_taxi/'
33 | -- Skip the first line of every file (the CSV header).
34 | TBLPROPERTIES ("skip.header.line.count"="1");
--------------------------------------------------------------------------------
/createTableYellowTaxi_2016-07_2016-12.q:
--------------------------------------------------------------------------------
1 | -- External Hive table over the comma-delimited files stored under LOCATION.
2 | -- Unlike the other yellow-taxi tables, this one has PULocationID / DOLocationID
3 | -- columns in place of the pickup/dropoff longitude and latitude columns.
4 | -- Delimited text is matched to columns by position, so the column order below
5 | -- must follow the field order in the files.
6 | -- IF NOT EXISTS lets this script be re-run without failing on an existing table.
7 | CREATE EXTERNAL TABLE IF NOT EXISTS yellow_taxi_2016_07_2016_12 (
8 |     vendorID              VARCHAR(3),
9 |     pickup_datetime       TIMESTAMP,
10 |     dropoff_datetime      TIMESTAMP,
11 |     passenger_count       SMALLINT,
12 |     trip_distance         DECIMAL(6,3),
13 |     ratecodeID            SMALLINT,
14 |     store_and_fwd_flag    VARCHAR(1),
15 |     PULocationID          SMALLINT,
16 |     DOLocationID          SMALLINT,
17 |     payment_type          VARCHAR(3),
18 |     fare_amount           DECIMAL(6,2),
19 |     extra                 DECIMAL(6,2),
20 |     mta_tax               DECIMAL(6,2),
21 |     tip_amount            DECIMAL(6,2),
22 |     tolls_amount          DECIMAL(6,2),
23 |     improvement_surcharge DECIMAL(6,2),
24 |     total_amount          DECIMAL(6,2)
25 | )
26 | ROW FORMAT DELIMITED
27 |     FIELDS TERMINATED BY ','
28 |     LINES TERMINATED BY '\n'
29 | -- State the storage format explicitly instead of relying on hive.default.fileformat.
30 | STORED AS TEXTFILE
31 | LOCATION 's3a://nyctaxidataset/yellow_taxi/yellow_taxi_2016-07_2016-12/'
32 | -- Skip the first line of every file (the CSV header).
33 | TBLPROPERTIES ("skip.header.line.count"="1");
--------------------------------------------------------------------------------
/createTableYellowTaxi_2015_2016-06.q:
--------------------------------------------------------------------------------
1 | -- External Hive table over the comma-delimited files stored under LOCATION.
2 | -- Delimited text is matched to columns by position, so the column order below
3 | -- must follow the field order in the files.
4 | -- IF NOT EXISTS lets this script be re-run without failing on an existing table.
5 | CREATE EXTERNAL TABLE IF NOT EXISTS yellow_taxi_2015_2016_06 (
6 |     vendorID              VARCHAR(3),
7 |     pickup_datetime       TIMESTAMP,
8 |     dropoff_datetime      TIMESTAMP,
9 |     passenger_count       SMALLINT,
10 |     trip_distance         DECIMAL(6,3),
11 |     pickup_longitude      DECIMAL(9,6),
12 |     pickup_latitude       DECIMAL(9,6),
13 |     ratecodeID            SMALLINT,
14 |     store_and_fwd_flag    VARCHAR(1),
15 |     dropoff_longitude     DECIMAL(9,6),
16 |     dropoff_latitude      DECIMAL(9,6),
17 |     payment_type          VARCHAR(3),
18 |     fare_amount           DECIMAL(6,2),
19 |     extra                 DECIMAL(6,2),
20 |     mta_tax               DECIMAL(6,2),
21 |     tip_amount            DECIMAL(6,2),
22 |     tolls_amount          DECIMAL(6,2),
23 |     improvement_surcharge DECIMAL(6,2),
24 |     total_amount          DECIMAL(6,2)
25 | )
26 | ROW FORMAT DELIMITED
27 |     FIELDS TERMINATED BY ','
28 |     LINES TERMINATED BY '\n'
29 | -- State the storage format explicitly instead of relying on hive.default.fileformat.
30 | STORED AS TEXTFILE
31 | LOCATION 's3a://nyctaxidataset/yellow_taxi/yellow_taxi_2015_2016-06/'
32 | -- Skip the first line of every file (the CSV header).
33 | TBLPROPERTIES ("skip.header.line.count"="1");
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # How to handle 200 GB of data with AWS EC2 Hadoop cluster
2 | http://www.filipyoo.com/handle-200-GB-of-data-with-AWS-EC2-hadoop-cluster/
3 | 
4 | Storing 200 GB of NYC taxi dataset and deploying a Cloudera Hadoop cluster to visualize it.
5 | 
6 | # Plot and visualization of Hadoop large dataset with Python Datashader
7 | http://www.filipyoo.com/plot-visualization-Hadoop-large-dataset-with-python-datashader/
8 | 
9 | ### Visualization without Datashader
10 | 
11 |
12 |
16 |
17 |
21 |
22 |
26 |
27 |