├── pictures ├── nyc-how-log.png ├── nyc-matplotlib.png ├── taxi-overview.png └── green_dropoff_plot_hd.png ├── downloadDatasetsWithWget.sh ├── countTotalRecords.q ├── createTableYellowTaxi_2009_2014.q ├── createTableGreenTaxi.q ├── createTableYellowTaxi_2016-07_2016-12.q ├── createTableYellowTaxi_2015_2016-06.q ├── README.md └── downloadDatasetsWithRequest.py /pictures/nyc-how-log.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/nyc-how-log.png -------------------------------------------------------------------------------- /pictures/nyc-matplotlib.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/nyc-matplotlib.png -------------------------------------------------------------------------------- /pictures/taxi-overview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/taxi-overview.png -------------------------------------------------------------------------------- /pictures/green_dropoff_plot_hd.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/filipyoo/nyc-taxi-analysis/HEAD/pictures/green_dropoff_plot_hd.png -------------------------------------------------------------------------------- /downloadDatasetsWithWget.sh: -------------------------------------------------------------------------------- 1 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/green_tripdata_20{13..16}-{01..12}.csv 2 | wget https://s3.amazonaws.com/nyc-tlc/trip+data/yellow_tripdata_20{09..16}-{01..12}.csv 3 | -------------------------------------------------------------------------------- /countTotalRecords.q: -------------------------------------------------------------------------------- 1 | 
SELECT SUM(cnt) as total_records 2 | FROM ( 3 | SELECT COUNT(*) as cnt FROM green_taxi 4 | UNION ALL 5 | SELECT COUNT(*) as cnt FROM yellow_taxi_2009_2014 6 | UNION ALL 7 | SELECT COUNT(*) as cnt FROM yellow_taxi_2015_2016_06 8 | UNION ALL 9 | SELECT COUNT(*) as cnt FROM yellow_taxi_2016_07_2016_12 10 | ) as subquery -------------------------------------------------------------------------------- /createTableYellowTaxi_2009_2014.q: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE yellow_taxi_2009_2014 ( 2 | vendorID VARCHAR(3), 3 | pickup_datetime TIMESTAMP, 4 | dropoff_datetime TIMESTAMP, 5 | passenger_count SMALLINT, 6 | trip_distance DECIMAL(6,3), 7 | pickup_longitude DECIMAL(9,6), 8 | pickup_latitude DECIMAL(9,6), 9 | ratecodeID SMALLINT, 10 | store_and_fwd_flag VARCHAR(1), 11 | dropoff_longitude DECIMAL(9,6), 12 | dropoff_latitude DECIMAL(9,6), 13 | payment_type VARCHAR(10), 14 | fare_amount DECIMAL(6,2), 15 | extra DECIMAL(6,2), 16 | mta_tax DECIMAL(6,2), 17 | tip_amount DECIMAL(6,2), 18 | tolls_amount DECIMAL(6,2), 19 | total_amount DECIMAL(6,2) 20 | ) 21 | ROW FORMAT DELIMITED 22 | FIELDS TERMINATED BY ',' 23 | LINES TERMINATED BY '\n' 24 | LOCATION 's3a://nyctaxidataset/yellow_taxi/yellow_taxi_2009_2014/' 25 | tblproperties("skip.header.line.count"="1"); -------------------------------------------------------------------------------- /createTableGreenTaxi.q: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE green_taxi ( 2 | vendorID VARCHAR(3), 3 | pickup_datetime TIMESTAMP, 4 | dropoff_datetime TIMESTAMP, 5 | store_and_fwd_flag VARCHAR(1), 6 | ratecodeID SMALLINT, 7 | pickup_longitude DECIMAL(9,6), 8 | pickup_latitude DECIMAL(9,6), 9 | dropoff_longitude DECIMAL(9,6), 10 | dropoff_latitude DECIMAL(9,6), 11 | passenger_count SMALLINT, 12 | trip_distance DECIMAL(6,3), 13 | fare_amount DECIMAL(6,2), 14 | extra DECIMAL(6,2), 15 | mta_tax 
DECIMAL(6,2), 16 | tip_amount DECIMAL(6,2), 17 | tolls_amount DECIMAL(6,2), 18 | ehail_fee DECIMAL(6,2), 19 | total_amount DECIMAL(6,2), 20 | payment_type VARCHAR(3), 21 | trip_type SMALLINT 22 | ) 23 | ROW FORMAT DELIMITED 24 | FIELDS TERMINATED BY ',' 25 | LINES TERMINATED BY '\n' 26 | LOCATION 's3a://nyctaxidataset/green_taxi/' 27 | tblproperties("skip.header.line.count"="1"); -------------------------------------------------------------------------------- /createTableYellowTaxi_2016-07_2016-12.q: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE yellow_taxi_2016_07_2016_12 ( 2 | vendorID VARCHAR(3), 3 | pickup_datetime TIMESTAMP, 4 | dropoff_datetime TIMESTAMP, 5 | passenger_count SMALLINT, 6 | trip_distance DECIMAL(6,3), 7 | ratecodeID SMALLINT, 8 | store_and_fwd_flag VARCHAR(1), 9 | PULocationID SMALLINT, 10 | DOLocationID SMALLINT, 11 | payment_type VARCHAR(3), 12 | fare_amount DECIMAL(6,2), 13 | extra DECIMAL(6,2), 14 | mta_tax DECIMAL(6,2), 15 | tip_amount DECIMAL(6,2), 16 | tolls_amount DECIMAL(6,2), 17 | improvement_surcharge DECIMAL(6,2), 18 | total_amount DECIMAL(6,2) 19 | ) 20 | ROW FORMAT DELIMITED 21 | FIELDS TERMINATED BY ',' 22 | LINES TERMINATED BY '\n' 23 | LOCATION 's3a://nyctaxidataset/yellow_taxi/yellow_taxi_2016-07_2016-12/' 24 | tblproperties("skip.header.line.count"="1"); -------------------------------------------------------------------------------- /createTableYellowTaxi_2015_2016-06.q: -------------------------------------------------------------------------------- 1 | CREATE EXTERNAL TABLE yellow_taxi_2015_2016_06 ( 2 | vendorID VARCHAR(3), 3 | pickup_datetime TIMESTAMP, 4 | dropoff_datetime TIMESTAMP, 5 | passenger_count SMALLINT, 6 | trip_distance DECIMAL(6,3), 7 | pickup_longitude DECIMAL(9,6), 8 | pickup_latitude DECIMAL(9,6), 9 | ratecodeID SMALLINT, 10 | store_and_fwd_flag VARCHAR(1), 11 | dropoff_longitude DECIMAL(9,6), 12 | dropoff_latitude DECIMAL(9,6), 13 | 
payment_type VARCHAR(3), 14 | fare_amount DECIMAL(6,2), 15 | extra DECIMAL(6,2), 16 | mta_tax DECIMAL(6,2), 17 | tip_amount DECIMAL(6,2), 18 | tolls_amount DECIMAL(6,2), 19 | improvement_surcharge DECIMAL(6,2), 20 | total_amount DECIMAL(6,2) 21 | ) 22 | ROW FORMAT DELIMITED 23 | FIELDS TERMINATED BY ',' 24 | LINES TERMINATED BY '\n' 25 | LOCATION 's3a://nyctaxidataset/yellow_taxi/yellow_taxi_2015_2016-06/' 26 | tblproperties("skip.header.line.count"="1"); -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # How to handle 200 GB of data with AWS EC2 Hadoop cluster 2 | http://www.filipyoo.com/handle-200-GB-of-data-with-AWS-EC2-hadoop-cluster/ 3 | 4 | Storing 200 GB of NYC taxi dataset and deploying a Cloudera Hadoop cluster to visualize it. 5 | 6 | # Plot and visualization of Hadoop large dataset with Python Datashader 7 | http://www.filipyoo.com/plot-visualization-Hadoop-large-dataset-with-python-datashader/ 8 | 9 | ### Visualization without Datashader 10 |

11 | ![visualization hadoop](pictures/nyc-matplotlib.png) 12 |

13 | 14 | ### Workflow overview 15 |

16 | ![visualization hadoop](pictures/taxi-overview.png) 17 |

18 | 19 | ### Using Datashader 20 |

21 | ![visualization hadoop](pictures/nyc-how-log.png) 22 |

23 | 24 | ### Final visualization 25 |

26 | ![visualization hadoop big data dataset spark pyspark cloudera aws ec2 hive s3 python dask datashader nyc taxi dataset](pictures/green_dropoff_plot_hd.png) 27 |

# downloadDatasetsWithRequest.py
"""Download the NYC taxi trip CSV files (green and yellow) month by month.

For every taxi color, every year from 2016 down to 2009 and every month, the
script requests the matching CSV from the S3 URL below and streams it into a
local directory.  A file whose local size already equals the size announced
by the server is not downloaded again.
"""
import os

# Directory used when the caller does not pass one explicitly.
DEFAULT_DATA_DIR = r"F:\NYC_taxi_dataset"

# Placeholders: taxi color, four-digit year, zero-padded month.
DATASET_URL_TEMPLATE = 'https://s3.amazonaws.com/nyc-tlc/trip+data/{}_tripdata_{}-{}.csv'

# Number of bytes requested from the response per iteration.
CHUNK_SIZE = 1024000

# Seconds to wait for the server before giving up instead of hanging forever.
REQUEST_TIMEOUT = 60


def fileAlreadyExists(request, file_name, data_dir):
    """Return True when ``file_name`` is already completely downloaded.

    Args:
        request: Response-like object exposing a ``headers`` mapping.
        file_name: Name of the file inside ``data_dir``.
        data_dir: Directory the datasets are stored in.

    Returns:
        True only if the server announced a ``content-length`` and the local
        file exists with exactly that size; False otherwise.
    """
    announced_size = request.headers.get('content-length')
    if announced_size is None:
        # Without a size from the server there is nothing to compare against,
        # so an (empty) local file must not be mistaken for a finished one.
        return False
    try:
        local_size = os.path.getsize(os.path.join(data_dir, file_name))
    except OSError:
        # Path does not exist (or cannot be inspected): not downloaded.
        return False
    return local_size == int(announced_size)


def write_file_chunks(request, file_name, data_dir):
    """Stream the body of ``request`` into ``data_dir/file_name``.

    Progress is reported in bytes, so the bar advances by the size of each
    chunk rather than by one per chunk.

    Args:
        request: Streaming response exposing ``headers`` and ``iter_content``.
        file_name: Name of the file to create inside ``data_dir``.
        data_dir: Destination directory; created if it does not exist yet.
    """
    # Imported here so the helpers that do not need a progress bar stay
    # importable when tqdm is not installed.
    import tqdm

    total_file_size = int(request.headers.get('content-length', 0))
    os.makedirs(data_dir, exist_ok=True)
    target_path = os.path.join(data_dir, file_name)
    # total=None lets tqdm show a plain byte counter when the size is unknown.
    with open(target_path, 'wb') as f, \
            tqdm.tqdm(total=total_file_size or None, unit='B', unit_scale=True) as progress:
        for chunk in request.iter_content(chunk_size=CHUNK_SIZE):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
                progress.update(len(chunk))


def downloadFile(request, file_name, data_dir=None):
    """Download ``file_name`` from ``request`` unless it is already present.

    Args:
        request: Streaming response for the file.
        file_name: Name of the file inside ``data_dir``.
        data_dir: Destination directory; defaults to ``DEFAULT_DATA_DIR``.
    """
    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    if fileAlreadyExists(request, file_name, data_dir):
        print("File {} already downloaded.".format(file_name))
        return
    print("Start downloading file: {}".format(file_name))
    write_file_chunks(request, file_name, data_dir)
    print("Downloaded file: {}".format(file_name))


def downloadAllFiles(data_dir=None):
    """Download every monthly green/yellow file from 2016 back to 2009.

    Args:
        data_dir: Destination directory; defaults to ``DEFAULT_DATA_DIR``.
    """
    # Imported here so the size-check helpers stay importable when requests
    # is not installed.
    import requests

    if data_dir is None:
        data_dir = DEFAULT_DATA_DIR
    for taxi_color in ('green', 'yellow'):
        for year in range(2016, 2008, -1):
            for month in range(1, 13):
                dataset_url = DATASET_URL_TEMPLATE.format(taxi_color, year, str(month).zfill(2))
                file_name = dataset_url.split('/')[-1]
                # The context manager releases the connection even when the
                # file is skipped and the body is never read.
                with requests.get(dataset_url, stream=True, timeout=REQUEST_TIMEOUT) as request:
                    if not request.ok:
                        # Do not store an HTTP error page as if it were a CSV.
                        print("Skipping file {}: HTTP status {}".format(file_name, request.status_code))
                        continue
                    downloadFile(request, file_name, data_dir)


if __name__ == '__main__':
    downloadAllFiles(DEFAULT_DATA_DIR)