├── ClickHouse
    └── create.sql
├── GreenPlum
    └── faa_otp_loadX.sql
├── README.md
├── download.sh
├── mysql
    ├── create_table.sql
    └── loaddata.sh
└── spark
    └── csv_to_parquet.txt

/ClickHouse/create.sql:
--------------------------------------------------------------------------------
1 | -- ClickHouse schema for the BTS "On-Time Performance" flight records.
2 | -- Column order matches mysql/create_table.sql in this repo.
3 | --
4 | -- The engine clause uses the explicit PARTITION BY / ORDER BY syntax. The
5 | -- legacy form MergeTree(FlightDate, (Year, FlightDate), 8192) is deprecated
6 | -- and recent servers reject it unless allow_deprecated_syntax_for_merge_tree
7 | -- is enabled. toYYYYMM(FlightDate) keeps the monthly partitioning that the
8 | -- legacy date-column argument implied.
9 | CREATE TABLE `ontime` (
10 |     `Year` UInt16,
11 |     `Quarter` UInt8,
12 |     `Month` UInt8,
13 |     `DayofMonth` UInt8,
14 |     `DayOfWeek` UInt8,
15 |     `FlightDate` Date,
16 |     `UniqueCarrier` FixedString(7),
17 |     `AirlineID` Int32,
18 |     `Carrier` FixedString(2),
19 |     `TailNum` String,
20 |     `FlightNum` String,
21 |     `OriginAirportID` Int32,
22 |     `OriginAirportSeqID` Int32,
23 |     `OriginCityMarketID` Int32,
24 |     `Origin` FixedString(5),
25 |     `OriginCityName` String,
26 |     `OriginState` FixedString(2),
27 |     `OriginStateFips` String,
28 |     `OriginStateName` String,
29 |     `OriginWac` Int32,
30 |     `DestAirportID` Int32,
31 |     `DestAirportSeqID` Int32,
32 |     `DestCityMarketID` Int32,
33 |     `Dest` FixedString(5),
34 |     `DestCityName` String,
35 |     `DestState` FixedString(2),
36 |     `DestStateFips` String,
37 |     `DestStateName` String,
38 |     `DestWac` Int32,
39 |     `CRSDepTime` Int32,
40 |     `DepTime` Int32,
41 |     `DepDelay` Int32,
42 |     `DepDelayMinutes` Int32,
43 |     `DepDel15` Int32,
44 |     -- NOTE(review): String here, while ArrivalDelayGroups below is Int32 - confirm whether this is intentional.
45 |     `DepartureDelayGroups` String,
46 |     `DepTimeBlk` String,
47 |     `TaxiOut` Int32,
48 |     `WheelsOff` Int32,
49 |     `WheelsOn` Int32,
50 |     `TaxiIn` Int32,
51 |     `CRSArrTime` Int32,
52 |     `ArrTime` Int32,
53 |     `ArrDelay` Int32,
54 |     `ArrDelayMinutes` Int32,
55 |     `ArrDel15` Int32,
56 |     `ArrivalDelayGroups` Int32,
57 |     `ArrTimeBlk` String,
58 |     `Cancelled` UInt8,
59 |     `CancellationCode` FixedString(1),
60 |     `Diverted` UInt8,
61 |     `CRSElapsedTime` Int32,
62 |     `ActualElapsedTime` Int32,
63 |     `AirTime` Int32,
64 |     `Flights` Int32,
65 |     `Distance` Int32,
66 |     `DistanceGroup` UInt8,
67 |     `CarrierDelay` Int32,
68 |     `WeatherDelay` Int32,
69 |     `NASDelay` Int32,
70 |     `SecurityDelay` Int32,
71 |     `LateAircraftDelay` Int32,
72 |     `FirstDepTime` String,
73 |     `TotalAddGTime` String,
74 |     `LongestAddGTime` String,
75 |     `DivAirportLandings` String,
76 |     `DivReachedDest` String,
77 |     `DivActualElapsedTime` String,
78 |     `DivArrDelay` String,
79 |     `DivDistance` String,
80 |     -- Div1..Div5: five repeated groups of eight columns each.
81 |     `Div1Airport` String,
82 |     `Div1AirportID` Int32,
83 |     `Div1AirportSeqID` Int32,
84 |     `Div1WheelsOn` String,
85 |     `Div1TotalGTime` String,
86 |     `Div1LongestGTime` String,
87 |     `Div1WheelsOff` String,
88 |     `Div1TailNum` String,
89 |     `Div2Airport` String,
90 |     `Div2AirportID` Int32,
91 |     `Div2AirportSeqID` Int32,
92 |     `Div2WheelsOn` String,
93 |     `Div2TotalGTime` String,
94 |     `Div2LongestGTime` String,
95 |     `Div2WheelsOff` String,
96 |     `Div2TailNum` String,
97 |     `Div3Airport` String,
98 |     `Div3AirportID` Int32,
99 |     `Div3AirportSeqID` Int32,
100 |     `Div3WheelsOn` String,
101 |     `Div3TotalGTime` String,
102 |     `Div3LongestGTime` String,
103 |     `Div3WheelsOff` String,
104 |     `Div3TailNum` String,
105 |     `Div4Airport` String,
106 |     `Div4AirportID` Int32,
107 |     `Div4AirportSeqID` Int32,
108 |     `Div4WheelsOn` String,
109 |     `Div4TotalGTime` String,
110 |     `Div4LongestGTime` String,
111 |     `Div4WheelsOff` String,
112 |     `Div4TailNum` String,
113 |     `Div5Airport` String,
114 |     `Div5AirportID` Int32,
115 |     `Div5AirportSeqID` Int32,
116 |     `Div5WheelsOn` String,
117 |     `Div5TotalGTime` String,
118 |     `Div5LongestGTime` String,
119 |     `Div5WheelsOff` String,
120 |     `Div5TailNum` String
121 | ) ENGINE = MergeTree
122 | PARTITION BY toYYYYMM(FlightDate)
123 | ORDER BY (Year, FlightDate)
124 | SETTINGS index_granularity = 8192;
125 |
--------------------------------------------------------------------------------
/GreenPlum/faa_otp_loadX.sql:
--------------------------------------------------------------------------------
1 | -- Greenplum load table for the BTS "On-Time Performance" CSV files.
2 | -- The faa schema must already exist; this script does not create it.
3 | -- Re-running the script drops and recreates the table.
4 | drop table if exists faa.faa_otp_load;
5 | create table faa.faa_otp_load (
6 |     Flt_Year smallint,
7 |     Flt_Quarter smallint,
8 |     Flt_Month smallint,
9 |     Flt_DayofMonth smallint,
10 |     Flt_DayOfWeek smallint,
11 |     FlightDate date,
12 |     UniqueCarrier text,
13 |     AirlineID integer,
14 |     Carrier text,
15 |     TailNum text,
16 |     FlightNum text,
17 |     OriginAirportID integer,
18 |     OriginAirportSeqID integer,
19 |     OriginCityMarketID integer,
20 |     Origin text,
21 |     OriginCityName text,
22 |     OriginState text,
23 |     OriginStateFips text,
24 |     OriginStateName text,
25 |     OriginWac smallint,
26 |     DestAirportID integer,
27 |     DestAirportSeqID integer,
28 |     DestCityMarketID integer,
29 |     Dest text,
30 |     DestCityName text,
31 |     DestState text,
32 |     DestStateFips text,
33 |     DestStateName text,
34 |     DestWac smallint,
35 |     CRSDepTime text,
36 |     DepTime text,
37 |     DepDelay numeric,
38 |     DepDelayMinutes numeric,
39 |     DepDel15 numeric,
40 |     DepartureDelayGroups smallint,
41 |     DepTimeBlk text,
42 |     TaxiOut numeric,
43 |     WheelsOff text,
44 |     WheelsOn text,
45 |     TaxiIn numeric,
46 |     CRSArrTime text,
47 |     ArrTime text,
48 |     ArrDelay numeric,
49 |     ArrDelayMinutes numeric,
50 |     ArrDel15 numeric,
51 |     ArrivalDelayGroups smallint,
52 |     ArrTimeBlk text,
53 |     Cancelled numeric,
54 |     CancellationCode text,
55 |     Diverted numeric,
56 |     CRSElapsedTime numeric,
57 |     ActualElapsedTime numeric,
58 |     AirTime numeric,
59 |     Flights numeric,
60 |     Distance numeric,
61 |     DistanceGroup smallint,
62 |     CarrierDelay numeric,
63 |     WeatherDelay numeric,
64 |     NASDelay numeric,
65 |     SecurityDelay numeric,
66 |     LateAircraftDelay numeric,
67 |     FirstDepTime text,
68 |     TotalAddGTime numeric,
69 |     LongestAddGTime numeric,
70 |     DivAirportLandings numeric,
71 |     DivReachedDest numeric,
72 |     DivActualElapsedTime numeric,
73 |     DivArrDelay numeric,
74 |     DivDistance numeric,
75 |     Div1Airport text,
76 |     Div1AirportID integer,
77 |     Div1AirportSeqID integer,
78 |     Div1WheelsOn text,
79 |     Div1TotalGTime numeric,
80 |     Div1LongestGTime numeric,
81 |     Div1WheelsOff text,
82 |     Div1TailNum text,
83 |     Div2Airport text,
84 |     Div2AirportID integer,
85 |     Div2AirportSeqID integer,
86 |     Div2WheelsOn text,
87 |     Div2TotalGTime numeric,
88 |     Div2LongestGTime numeric,
89 |     Div2WheelsOff text,
90 |     Div2TailNum text,
91 |     Div3Airport text,
92 |     Div3AirportID integer,
93 |     Div3AirportSeqID integer,
94 |     Div3WheelsOn text,
95 |     Div3TotalGTime numeric,
96 |     Div3LongestGTime numeric,
97 |     Div3WheelsOff text,
98 |     Div3TailNum text,
99 |     Div4Airport text,
100 |     Div4AirportID integer,
101 |     Div4AirportSeqID integer,
102 |     Div4WheelsOn text,
103 |     Div4TotalGTime numeric,
104 |     Div4LongestGTime numeric,
105 |     Div4WheelsOff text,
106 |     Div4TailNum text,
107 |     Div5Airport text,
108 |     Div5AirportID integer,
109 |     Div5AirportSeqID integer,
110 |     Div5WheelsOn text,
111 |     Div5TotalGTime numeric,
112 |     Div5LongestGTime numeric,
113 |     Div5WheelsOff text,
114 |     Div5TailNum text,
115 |     -- NOTE(review): extra column not present in the other schemas; presumably absorbs a trailing empty CSV field - confirm.
116 |     trailer smallint
117 | )
118 | -- No column is a natural key for this load table. Without this clause the
119 | -- default policy would typically hash on the first column (Flt_Year) and skew the data.
120 | distributed randomly;
121 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # ontime-airline-performance
2 |
3 | This repo contains scripts and loaders for the "On-Time Performance" data set:
4 | http://www.transtats.bts.gov/DL_SelectFields.asp?Table_ID=236&DB_Short_Name=On-Time
5 |
6 | The data comes from the U.S. government without a license and is therefore [assumed](https://en.wikipedia.org/wiki/Copyright_status_of_works_by_the_federal_government_of_the_United_States) to be in the public domain in the United States.
7 |
--------------------------------------------------------------------------------
/download.sh:
--------------------------------------------------------------------------------
1 | #!/bin/sh
2 | # Download the monthly "On-Time Performance" zip archives from
3 | # transtats.bts.gov into the current directory, one archive per year/month.
4 | #
5 | # Environment overrides (defaults reproduce the original 1987-2018 range):
6 | #   START_YEAR  first year to fetch (default 1987)
7 | #   END_YEAR    last year to fetch (default 2018)
8 |
9 | START_YEAR=${START_YEAR:-1987}
10 | END_YEAR=${END_YEAR:-2018}
11 | BASE_URL=https://transtats.bts.gov/PREZIP/On_Time_Reporting_Carrier_On_Time_Performance_1987_present
12 |
13 | year=$START_YEAR
14 | while [ "$year" -le "$END_YEAR" ]
15 | do
16 |     month=1
17 |     while [ "$month" -le 12 ]
18 |     do
19 |         # NOTE(review): --no-check-certificate turns off TLS certificate
20 |         # verification; it is kept from the original script. Drop it if the
21 |         # server certificate validates in your environment.
22 |         # --continue lets a re-run resume a partial archive instead of
23 |         # saving a second copy as *.zip.1.
24 |         # A failed month is reported on stderr and the loop moves on.
25 |         wget --no-check-certificate --continue "${BASE_URL}_${year}_${month}.zip" \
26 |             || echo "download failed: ${year}-${month}" >&2
27 |         month=$((month + 1))
28 |     done
29 |     year=$((year + 1))
30 | done
31 |
--------------------------------------------------------------------------------
/mysql/create_table.sql:
--------------------------------------------------------------------------------
1 | -- MySQL table for the BTS "On-Time Performance" CSV files. mysql/loaddata.sh
2 | -- loads into it without a column list, so column order must follow the CSV.
3 | -- Integer and YEAR display widths (int(11), tinyint(4), year(4)) were dropped:
4 | -- they do not affect storage or range and are deprecated in MySQL 8.0.
5 | CREATE TABLE `ontime` (
6 |     `Year` year DEFAULT NULL,
7 |     `Quarter` tinyint DEFAULT NULL,
8 |     `Month` tinyint DEFAULT NULL,
9 |     `DayofMonth` tinyint DEFAULT NULL,
10 |     `DayOfWeek` tinyint DEFAULT NULL,
11 |     `FlightDate` date DEFAULT NULL,
12 |     `UniqueCarrier` char(7) DEFAULT NULL,
13 |     `AirlineID` int DEFAULT NULL,
14 |     `Carrier` char(2) DEFAULT NULL,
15 |     `TailNum` varchar(50) DEFAULT NULL,
16 |     `FlightNum` varchar(10) DEFAULT NULL,
17 |     `OriginAirportID` int DEFAULT NULL,
18 |     `OriginAirportSeqID` int DEFAULT NULL,
19 |     `OriginCityMarketID` int DEFAULT NULL,
20 |     `Origin` char(5) DEFAULT NULL,
21 |     `OriginCityName` varchar(100) DEFAULT NULL,
22 |     `OriginState` char(2) DEFAULT NULL,
23 |     `OriginStateFips` varchar(10) DEFAULT NULL,
24 |     `OriginStateName` varchar(100) DEFAULT NULL,
25 |     `OriginWac` int DEFAULT NULL,
26 |     `DestAirportID` int DEFAULT NULL,
27 |     `DestAirportSeqID` int DEFAULT NULL,
28 |     `DestCityMarketID` int DEFAULT NULL,
29 |     `Dest` char(5) DEFAULT NULL,
30 |     `DestCityName` varchar(100) DEFAULT NULL,
31 |     `DestState` char(2) DEFAULT NULL,
32 |     `DestStateFips` varchar(10) DEFAULT NULL,
33 |     `DestStateName` varchar(100) DEFAULT NULL,
34 |     `DestWac` int DEFAULT NULL,
35 |     `CRSDepTime` int DEFAULT NULL,
36 |     `DepTime` int DEFAULT NULL,
37 |     `DepDelay` int DEFAULT NULL,
38 |     `DepDelayMinutes` int DEFAULT NULL,
39 |     `DepDel15` int DEFAULT NULL,
40 |     `DepartureDelayGroups` int DEFAULT NULL,
41 |     `DepTimeBlk` varchar(20) DEFAULT NULL,
42 |     `TaxiOut` int DEFAULT NULL,
43 |     `WheelsOff` int DEFAULT NULL,
44 |     `WheelsOn` int DEFAULT NULL,
45 |     `TaxiIn` int DEFAULT NULL,
46 |     `CRSArrTime` int DEFAULT NULL,
47 |     `ArrTime` int DEFAULT NULL,
48 |     `ArrDelay` int DEFAULT NULL,
49 |     `ArrDelayMinutes` int DEFAULT NULL,
50 |     `ArrDel15` int DEFAULT NULL,
51 |     `ArrivalDelayGroups` int DEFAULT NULL,
52 |     `ArrTimeBlk` varchar(20) DEFAULT NULL,
53 |     `Cancelled` tinyint DEFAULT NULL,
54 |     `CancellationCode` char(1) DEFAULT NULL,
55 |     `Diverted` tinyint DEFAULT NULL,
56 |     `CRSElapsedTime` int DEFAULT NULL,
57 |     `ActualElapsedTime` int DEFAULT NULL,
58 |     `AirTime` int DEFAULT NULL,
59 |     `Flights` int DEFAULT NULL,
60 |     `Distance` int DEFAULT NULL,
61 |     `DistanceGroup` tinyint DEFAULT NULL,
62 |     `CarrierDelay` int DEFAULT NULL,
63 |     `WeatherDelay` int DEFAULT NULL,
64 |     `NASDelay` int DEFAULT NULL,
65 |     `SecurityDelay` int DEFAULT NULL,
66 |     `LateAircraftDelay` int DEFAULT NULL,
67 |     `FirstDepTime` varchar(10) DEFAULT NULL,
68 |     `TotalAddGTime` varchar(10) DEFAULT NULL,
69 |     `LongestAddGTime` varchar(10) DEFAULT NULL,
70 |     `DivAirportLandings` varchar(10) DEFAULT NULL,
71 |     `DivReachedDest` varchar(10) DEFAULT NULL,
72 |     `DivActualElapsedTime` varchar(10) DEFAULT NULL,
73 |     `DivArrDelay` varchar(10) DEFAULT NULL,
74 |     `DivDistance` varchar(10) DEFAULT NULL,
75 |     `Div1Airport` varchar(10) DEFAULT NULL,
76 |     `Div1AirportID` int DEFAULT NULL,
77 |     `Div1AirportSeqID` int DEFAULT NULL,
78 |     `Div1WheelsOn` varchar(10) DEFAULT NULL,
79 |     `Div1TotalGTime` varchar(10) DEFAULT NULL,
80 |     `Div1LongestGTime` varchar(10) DEFAULT NULL,
81 |     `Div1WheelsOff` varchar(10) DEFAULT NULL,
82 |     `Div1TailNum` varchar(10) DEFAULT NULL,
83 |     `Div2Airport` varchar(10) DEFAULT NULL,
84 |     `Div2AirportID` int DEFAULT NULL,
85 |     `Div2AirportSeqID` int DEFAULT NULL,
86 |     `Div2WheelsOn` varchar(10) DEFAULT NULL,
87 |     `Div2TotalGTime` varchar(10) DEFAULT NULL,
88 |     `Div2LongestGTime` varchar(10) DEFAULT NULL,
89 |     `Div2WheelsOff` varchar(10) DEFAULT NULL,
90 |     `Div2TailNum` varchar(10) DEFAULT NULL,
91 |     `Div3Airport` varchar(10) DEFAULT NULL,
92 |     `Div3AirportID` int DEFAULT NULL,
93 |     `Div3AirportSeqID` int DEFAULT NULL,
94 |     `Div3WheelsOn` varchar(10) DEFAULT NULL,
95 |     `Div3TotalGTime` varchar(10) DEFAULT NULL,
96 |     `Div3LongestGTime` varchar(10) DEFAULT NULL,
97 |     `Div3WheelsOff` varchar(10) DEFAULT NULL,
98 |     `Div3TailNum` varchar(10) DEFAULT NULL,
99 |     `Div4Airport` varchar(10) DEFAULT NULL,
100 |     `Div4AirportID` int DEFAULT NULL,
101 |     `Div4AirportSeqID` int DEFAULT NULL,
102 |     `Div4WheelsOn` varchar(10) DEFAULT NULL,
103 |     `Div4TotalGTime` varchar(10) DEFAULT NULL,
104 |     `Div4LongestGTime` varchar(10) DEFAULT NULL,
105 |     `Div4WheelsOff` varchar(10) DEFAULT NULL,
106 |     `Div4TailNum` varchar(10) DEFAULT NULL,
107 |     `Div5Airport` varchar(10) DEFAULT NULL,
108 |     `Div5AirportID` int DEFAULT NULL,
109 |     `Div5AirportSeqID` int DEFAULT NULL,
110 |     `Div5WheelsOn` varchar(10) DEFAULT NULL,
111 |     `Div5TotalGTime` varchar(10) DEFAULT NULL,
112 |     `Div5LongestGTime` varchar(10) DEFAULT NULL,
113 |     `Div5WheelsOff` varchar(10) DEFAULT NULL,
114 |     `Div5TailNum` varchar(10) DEFAULT NULL
115 | ) DEFAULT CHARSET=latin1;
116 |
--------------------------------------------------------------------------------
/mysql/loaddata.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | # Load the monthly On-Time Performance CSV files into otp.ontime, one
3 | # LOAD DATA INFILE statement per year/month.
4 | #
5 | # Credentials are not passed on the command line: the original -p<password>
6 | # option put the password in the repository and in the process list. Store
7 | # them in the [client] section of ~/.my.cnf, which the mysql client reads by
8 | # default.
9 | #
10 | # Environment overrides (defaults reproduce the original values):
11 | #   START_YEAR, END_YEAR  inclusive year range (default 1987 and 1988)
12 | #   DATA_DIR              directory with the CSV files (default /mnt/md0/otp)
13 | #   MYSQL_SOCKET          server socket (default /var/lib/mysql/mysql.sock)
14 | #
15 | # NOTE(review): download.sh fetches archives named
16 | # On_Time_Reporting_Carrier_On_Time_Performance_1987_present_<year>_<month>.zip
17 | # while this script reads On_Time_On_Time_Performance_<year>_<month>.csv -
18 | # confirm the extracted CSV names match before running.
19 | #
20 | # SET GLOBAL sql_mode = ''
21 | # NOTE(review): the statement above is an original hint, presumably to relax
22 | # strict SQL mode for the load - confirm whether it is still needed.
23 |
24 | START_YEAR=${START_YEAR:-1987}
25 | END_YEAR=${END_YEAR:-1988}
26 | DATA_DIR=${DATA_DIR:-/mnt/md0/otp}
27 | MYSQL_SOCKET=${MYSQL_SOCKET:-/var/lib/mysql/mysql.sock}
28 |
29 | for ((y = START_YEAR; y <= END_YEAR; y++))
30 | do
31 |     for ((i = 1; i <= 12; i++))
32 |     do
33 |         echo "$y - $i"
34 |         mysql -S "$MYSQL_SOCKET" -e "load data infile '${DATA_DIR}/On_Time_On_Time_Performance_${y}_${i}.csv' into table otp.ontime fields terminated by ',' OPTIONALLY ENCLOSED BY '\"' ignore 1 lines"
35 |     done
36 | done
37 |
--------------------------------------------------------------------------------
/spark/csv_to_parquet.txt:
--------------------------------------------------------------------------------
1 | // Spark-shell notes: convert the On-Time Performance CSV files to Parquet.
2 | // Comments use "//" because the original "#" lines are not valid Scala and
3 | // fail when pasted into the shell.
4 | // Start the shell with the CSV reader package:
5 | //   bin/spark-shell --packages com.databricks:spark-csv_2.11:1.3.0
6 | val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").load("/data/opt/otp/On_Time_On_Time_Performance_*.csv")
7 | // or, with schema inference enabled:
8 | val df = sqlContext.read.format("com.databricks.spark.csv").option("header", "true").option("inferSchema", "true").load("/data/opt/otp/On_Time_On_Time_Performance_*.csv")
9 |
10 | sqlContext.setConf("spark.sql.parquet.compression.codec", "snappy")
11 | df.write.partitionBy("Year").parquet("/data/flash/spark/otp")
12 |
13 | // read data
14 | // NOTE(review): this reads /mnt/i3600/spark/otp while the write above targets
15 | // /data/flash/spark/otp - confirm both paths refer to the same data set.
16 | val pFile = sqlContext.read.parquet("/mnt/i3600/spark/otp").cache()
17 |
18 | (pFile.groupBy("Year","Month").count()).agg(avg("count")).collect()
19 | // Timed run of the filtered query; kept on one line so the shell evaluates
20 | // both clock reads and the query as a single unit.
21 | val t1=System.currentTimeMillis; (pFile.filter("DepDel15=1").groupBy("Year","Month").count()).agg(avg("count")).collect(); val t2=System.currentTimeMillis; t2-t1
22 |
--------------------------------------------------------------------------------