├── Images
│   └── arch_dig.png
├── Code
│   ├── dataingestion.sh
│   ├── CFT.json
│   └── finalScript.py
└── README.md
/Images/arch_dig.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/AWS-Big-Data-Projects/Analysis-Of-NYC-Yellow-Taxi/HEAD/Images/arch_dig.png
--------------------------------------------------------------------------------
/Code/dataingestion.sh:
--------------------------------------------------------------------------------
#!/bin/sh
# Download the 2014 NYC TLC yellow-taxi trip files month by month, stage each
# file in HDFS, copy it to the S3 bucket, then delete the HDFS and local
# copies so the cluster's disk is not exhausted.

hdfs dfs -mkdir -p /user/hadoop/rawdata

for month in 01 02 03 04 05 06 07 08 09 10 11 12
do
    file="yellow_tripdata_2014-${month}.csv"
    wget "https://s3.amazonaws.com/nyc-tlc/trip+data/${file}"
    hdfs dfs -put "${file}" /user/hadoop/rawdata/
    hdfs dfs -cp "/user/hadoop/rawdata/${file}" s3a://nycproject23/rawdatas3/
    hdfs dfs -rm "/user/hadoop/rawdata/${file}"
    rm "${file}"
done
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# *Analysis of New York Yellow Taxi Trip Data*

## Introduction to NYC Yellow Taxis:
- ### Yellow Taxis are the only vehicles licensed to pick up street-hailing passengers anywhere in NYC.
- ### Yellow Taxis charge standard [metered fares](https://www1.nyc.gov/site/tlc/passengers/taxi-fare.page) for all street-hail trips.
- ### Yellow Taxi smartphone apps can offer set, upfront pricing for trips booked through an app.
- ### Yellow Taxis are easily identified by their yellow color, taxi “T” markings, and license numbers on the roof and sides of the vehicle.

## Introduction to the NYC Yellow Taxi Trip Data:
Variable Name | Description |
--------------|------------------|
**vendorid:** | A code indicating the [TPEP provider](https://www1.nyc.gov/site/tlc/about/tlc-trip-record-data.page) that provided the record.
**pickup_datetime:** | The date and time when the meter was engaged.
**dropoff_datetime:** | The date and time when the meter was disengaged.
**passenger_count:** | The number of passengers in the vehicle. This is a driver-entered value.
**trip_distance:** | The elapsed trip distance in miles reported by the taximeter.
**PULocationID:** | The taxi zone in which the taximeter was engaged.
**DOLocationID:** | The taxi zone in which the taximeter was disengaged.
**ratecodeid:** | The final [rate code](https://www1.nyc.gov/site/tlc/passengers/taxi-fare.page) in effect at the end of the trip.
**store_fwd_flg:** | A flag indicating whether the trip record was held in vehicle memory ("store and forward") before being sent to the vendor because the vehicle had no connection to the server.
**pay_type:** | A code signifying how the passenger paid for the trip.
**fare_amount:** | The time-and-distance fare calculated by the meter.
**surcharge:** | Extra charges: the $0.30 improvement surcharge, the $0.50 overnight surcharge (8pm to 6am), and the $2.50 New York State congestion surcharge.
**mta_tax:** | The $0.50 MTA tax that is automatically triggered based on the metered rate in use.
**tip_amount:** | This field is automatically populated for credit card tips. Cash tips are not included.
**toll_amount:** | The total amount of all tolls paid in the trip.
**total_amount:** | The total amount charged to passengers. Does not include cash tips.

## Objective:
- **The core objective of this project is to analyse the factors driving taxi demand: where the most pickups and drop-offs occur, when traffic is heaviest, and how taxi supply can better meet the public's needs.**

## Architecture Diagram:


## Code to Get Raw Data from the NYC Website and Store It in an S3 Bucket:
[Dataingestion](https://github.com/nileshsingal/Analysis-Of-NYC-Yellow-Taxi-/blob/master/Code/dataingestion.sh)

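For reference, here is a minimal Python sketch of the same per-month flow that downloads each file and uploads it straight to S3 with boto3, skipping the HDFS staging the shell script uses. Bucket and key names follow the script; credentials are assumed to come from the environment (e.g. the EMR instance role):

```python
import os
import urllib.request

import boto3  # assumes AWS credentials are available in the environment

s3 = boto3.client("s3")
for m in range(1, 13):
    fname = "yellow_tripdata_2014-{:02d}.csv".format(m)
    url = "https://s3.amazonaws.com/nyc-tlc/trip+data/" + fname
    urllib.request.urlretrieve(url, fname)                       # download one month
    s3.upload_file(fname, "nycproject23", "rawdatas3/" + fname)  # copy into the raw-data bucket
    os.remove(fname)                                             # reclaim local disk space
```
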
## Code to Read Raw Data from the S3 Bucket into a PySpark DataFrame, Perform Cleaning and Transformations, and Create a Hive Table:
[PyScript](https://github.com/nileshsingal/Analysis-Of-NYC-Yellow-Taxi-/blob/master/Code/finalScript.py)

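In outline, the script reads each month's CSV, normalises the headers, samples the rows, derives time features, and persists a Hive table. A condensed sketch (paths and the 50% sample follow the script; Hive support mirrors the `spark.sql.catalogImplementation=hive` setting the CloudFormation Spark step passes):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import hour

spark = SparkSession.builder.appName("nyc-taxi").enableHiveSupport().getOrCreate()

# Read one month of raw data; the 2014 TLC headers contain leading spaces, so strip them.
df = spark.read.option("header", True).csv("s3a://nycproject23/rawdata/yellow_tripdata_2014-01.csv")
df = df.toDF(*[c.strip() for c in df.columns]).sample(False, 0.50, seed=0)

# Derive a time feature, register the frame, and persist it as a Hive table.
df = df.na.drop().withColumn("pickup_hour", hour("pickup_datetime"))
df.createOrReplaceTempView("taxitable")
spark.sql("CREATE TABLE nyctaxi AS SELECT * FROM taxitable")
```
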
## We Used AWS CloudFormation to Automate the Above Process
[CloudFormation template](https://github.com/nileshsingal/Analysis-Of-NYC-Yellow-Taxi-/blob/master/Code/CFT.json)

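A stack can be launched from the template with boto3, for example (a sketch; the stack name is hypothetical, and `CAPABILITY_IAM` is required because the template creates IAM roles):

```python
import boto3

cfn = boto3.client("cloudformation")
with open("Code/CFT.json") as f:
    template_body = f.read()

cfn.create_stack(
    StackName="nyc-taxi-emr",         # hypothetical stack name
    TemplateBody=template_body,
    Capabilities=["CAPABILITY_IAM"],  # needed since the template defines IAM roles
)
cfn.get_waiter("stack_create_complete").wait(StackName="nyc-taxi-emr")
```
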
## We Then Connected the Hive Table to Tableau Public Using the Amazon EMR Hadoop Hive Connector

## Challenges Faced:
**Slow query performance**
- Because of the large volume (30 GB) of our data, query performance became very poor. We tackled this problem with the help of different big-data file formats (ORC, Parquet, etc.), as sketched below.
- After converting to Parquet, our data volume dropped to 8 GB and performance improved.

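The conversion itself is short in PySpark; a sketch for one month (the columnar, compressed Parquet layout is what cut the data from 30 GB to about 8 GB):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("csv-to-parquet").getOrCreate()

# Rewrite one raw CSV as Parquet; repeat per month (paths follow the project layout).
df = spark.read.option("header", True).csv("s3a://nycproject23/rawdata/yellow_tripdata_2014-01.csv")
df.write.mode("overwrite").parquet("s3a://nycproject23/sampledrawdata/df1.parquet")
```
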
**EMR steps**
- Initially we struggled a lot with EMR steps, but after reading the AWS documentation and some trial and error we solved the problem; an example of submitting a step is shown below.

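As a concrete example, the ingestion step can be submitted to a running cluster with boto3 (a sketch; the cluster ID is hypothetical, while the jar and script locations match `TestStep` in CFT.json):

```python
import boto3

emr = boto3.client("emr")
emr.add_job_flow_steps(
    JobFlowId="j-XXXXXXXXXXXXX",  # hypothetical cluster ID
    Steps=[{
        "Name": "DataIngestion",
        "ActionOnFailure": "CANCEL_AND_WAIT",
        "HadoopJarStep": {
            "Jar": "s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar",
            "Args": ["s3://nycproject23/dataingestion.sh"],
        },
    }],
)
```
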
**CloudFormation**
- Our EMR cluster used to throw errors while we were setting up CloudFormation, but after backtracking through the error messages we were able to launch the EMR cluster successfully using the CFT.

**Loading data in Tableau**
- Our biggest challenge was connecting to and loading the data into Tableau. Even after converting our data to big-data file formats (ORC, Parquet), the volume was still too large for Tableau to execute queries and build visualizations. We solved this by saving an extract of the data on the local machine, after which everything went very smoothly.
--------------------------------------------------------------------------------
/Code/CFT.json:
--------------------------------------------------------------------------------
{
  "AWSTemplateFormatVersion": "2010-09-09",
  "Parameters": {
    "InstanceType": {
      "Default": "m4.xlarge",
      "Type": "String"
    },
    "ReleaseLabel": {
      "Default": "emr-5.32.0",
      "Type": "String"
    },
    "SubnetId": {
      "Default": "subnet-5276aa63",
      "Type": "String"
    },
    "TerminationProtected": {
      "Type": "String",
      "Default": "false"
    },
    "ElasticMapReducePrincipal": {
      "Default": "elasticmapreduce.amazonaws.com",
      "Type": "String"
    },
    "Ec2Principal": {
      "Default": "ec2.amazonaws.com",
      "Type": "String"
    },
    "EMRLogDir": {
      "Description": "Log Dir for the EMR cluster",
      "Default": "s3://aws-logs-749482940850-us-east-1/elasticmapreduce/",
      "Type": "String"
    },
    "KeyName": {
      "Description": "Name of an existing EC2 KeyPair to enable SSH to the instances",
      "Default": "project",
      "Type": "String"
    }
  },
  "Resources": {
    "cluster": {
      "Type": "AWS::EMR::Cluster",
      "Properties": {
        "Applications": [
          { "Name": "Hadoop" },
          { "Name": "Hive" },
          { "Name": "Spark" },
          { "Name": "Zeppelin" },
          { "Name": "ZooKeeper" }
        ],
        "Instances": {
          "MasterInstanceGroup": {
            "InstanceCount": 1,
            "InstanceType": { "Ref": "InstanceType" },
            "Market": "ON_DEMAND",
            "Name": "Master"
          },
          "CoreInstanceGroup": {
            "InstanceCount": 2,
            "InstanceType": { "Ref": "InstanceType" },
            "Market": "ON_DEMAND",
            "Name": "Core"
          },
          "TerminationProtected": { "Ref": "TerminationProtected" },
          "Ec2SubnetId": { "Ref": "SubnetId" },
          "Ec2KeyName": { "Ref": "KeyName" }
        },
        "LogUri": { "Ref": "EMRLogDir" },
        "Name": "NYCTAXI",
        "JobFlowRole": { "Ref": "emrEc2InstanceProfile" },
        "ServiceRole": { "Ref": "emrRole" },
        "ReleaseLabel": { "Ref": "ReleaseLabel" },
        "VisibleToAllUsers": true,
        "Tags": [
          { "Key": "key1", "Value": "value1" }
        ]
      }
    },
    "emrRole": {
      "Type": "AWS::IAM::Role",
      "Properties": {
        "AssumeRolePolicyDocument": {
          "Version": "2008-10-17",
          "Statement": [
            {
              "Sid": "",
              "Effect": "Allow",
              "Principal": { "Service": { "Ref": "ElasticMapReducePrincipal" } },
              "Action": "sts:AssumeRole"
            }
          ]
        },
        "Path": "/",
        "ManagedPolicyArns": [
          "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceRole"
        ]
      }
    },
    "emrEc2Role": {
      "Type": "AWS::IAM::Role",
      "Properties": {
        "AssumeRolePolicyDocument": {
          "Version": "2008-10-17",
          "Statement": [
            {
              "Sid": "",
              "Effect": "Allow",
              "Principal": { "Service": { "Ref": "Ec2Principal" } },
              "Action": "sts:AssumeRole"
            }
          ]
        },
        "Path": "/",
        "ManagedPolicyArns": [
          "arn:aws:iam::aws:policy/service-role/AmazonElasticMapReduceforEC2Role"
        ]
      }
    },
    "emrEc2InstanceProfile": {
      "Type": "AWS::IAM::InstanceProfile",
      "Properties": {
        "Path": "/",
        "Roles": [
          { "Ref": "emrEc2Role" }
        ]
      }
    },
    "TestStep": {
      "Properties": {
        "ActionOnFailure": "CANCEL_AND_WAIT",
        "HadoopJarStep": {
          "Args": [
            "s3://nycproject23/dataingestion.sh"
          ],
          "Jar": "s3://us-east-1.elasticmapreduce/libs/script-runner/script-runner.jar"
        },
        "JobFlowId": { "Ref": "cluster" },
        "Name": "TestStep"
      },
      "Type": "AWS::EMR::Step"
    },
    "SparkStep": {
      "Properties": {
        "ActionOnFailure": "CONTINUE",
        "HadoopJarStep": {
          "Args": [
            "spark-submit",
            "--deploy-mode",
            "cluster",
            "--conf",
            "spark.sql.catalogImplementation=hive",
            "s3://nycproject23/finalpyscript.py"
          ],
          "Jar": "command-runner.jar"
        },
        "JobFlowId": { "Ref": "cluster" },
        "Name": "SparkStep"
      },
      "Type": "AWS::EMR::Step"
    }
  }
}
--------------------------------------------------------------------------------
/Code/finalScript.py:
--------------------------------------------------------------------------------
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, date_format, hour, month, year

# Build a single SparkSession with Hive support; enableHiveSupport() matches the
# spark.sql.catalogImplementation=hive conf that the EMR SparkStep passes. No
# master is hard-coded here, since the step submits with --deploy-mode cluster.
spark = SparkSession.builder.appName("nyc-taxi-analysis").enableHiveSupport().getOrCreate()

# Sampling data: read each month's raw CSV, keep a 50% sample, normalise the
# column names (the 2014 TLC headers contain leading spaces, e.g. " passenger_count"),
# and write the result back to S3 as Parquet.
for m in range(1, 13):
    raw_path = "s3a://nycproject23/rawdata/yellow_tripdata_2014-{:02d}.csv".format(m)
    sampledf = spark.read.option("header", True).csv(raw_path)
    sampledf = sampledf.sample(False, 0.50, seed=0)
    sampledf = sampledf.toDF(*[c.strip() for c in sampledf.columns])
    sampledf.write.parquet("s3a://nycproject23/sampledrawdata/df{}.parquet".format(m))

# Cleaning data: load all twelve sampled months back as one DataFrame.
df1 = spark.read.parquet("s3a://nycproject23/sampledrawdata/*.parquet")

# Drop a clearly corrupt record (a passenger_count of 208).
df2 = df1.filter("passenger_count != '208'")

# Drop rows whose rate_code is not a valid TLC rate code.
df3 = df2.filter(~df2.rate_code.isin('156', '208', '210', '28', '65', '77', '7', '8', '9', '0', '16'))

# store_and_fwd_flag is not used in the analysis; drop it, then drop null rows.
final_df = df3.drop('store_and_fwd_flag')

final_df = final_df.na.drop()

# Transformations on the pickup and dropoff datetime (timestamp) columns.
df1 = final_df.withColumn('pickup_hour', hour(final_df.pickup_datetime))
df1 = df1.withColumn('dropoff_hour', hour(df1.dropoff_datetime))
df1 = df1.withColumn('pickup_day', date_format(col("pickup_datetime"), "EEEE"))
df1 = df1.withColumn('dropoff_day', date_format(col("dropoff_datetime"), "EEEE"))
df1 = df1.withColumn('pickup_year', year(df1.pickup_datetime))
df1 = df1.withColumn('dropoff_year', year(df1.dropoff_datetime))
df1 = df1.withColumn('pickup_month', month(df1.pickup_datetime))
df1 = df1.withColumn('dropoff_month', month(df1.dropoff_datetime))

# Register the result and persist it as a Hive table for Tableau to query.
df1.createOrReplaceTempView("taxitable")

spark.sql("CREATE TABLE nyctaxi AS SELECT * FROM taxitable")
--------------------------------------------------------------------------------