├── README.md ├── airline ├── data │ ├── clean.sh │ ├── download.sh │ ├── import.php │ ├── import.sh │ └── readme.html ├── ddl │ └── schema.sql └── readme.txt ├── nyc-taxi-rides ├── data │ ├── central_park_weather_observations.tsv │ └── taxi_zones.tsv ├── ddl │ ├── import_trip_data.php │ ├── schema.sql │ ├── taxi_zones.xml │ └── weather.xml └── readme.txt └── star ├── data ├── download.sh └── import.php ├── ddl └── schema.sql ├── readme.txt └── sql ├── test1.sql ├── test2.sql ├── test3.sql ├── test4.sql ├── test5.sql ├── test6.sql ├── test7.sql ├── test8.sql └── test9.sql /README.md: -------------------------------------------------------------------------------- 1 | # clickhouse-benchmarks 2 | 3 | Benchmarks are organized as follows: 4 | 5 | * \<benchmark\>/ -- root folder, benchmark description, reference data etc. 6 | * \<benchmark\>/ddl/ -- schema files, e.g. 'create table' statements 7 | * \<benchmark\>/data/ -- data files and load scripts 8 | * \<benchmark\>/sql/ -- queries 9 | 10 | -------------------------------------------------------------------------------- /airline/data/clean.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ZIP_FILES_DIR="zip" 4 | CSV_FILES_DIR="csv" 5 | 6 | rm -f "$CSV_FILES_DIR"/readme.html 7 | rm -f "$CSV_FILES_DIR"/*.csv 8 | rm -f "$ZIP_FILES_DIR"/*.zip 9 | 10 | rmdir "$CSV_FILES_DIR" 11 | rmdir "$ZIP_FILES_DIR" 12 | 13 | -------------------------------------------------------------------------------- /airline/data/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | echo "Check required commands availability" 4 | if command -v wget && command -v unzip && command -v clickhouse-client && command -v wc && command -v awk; then 5 | echo "Looks like all required commands are available" 6 | else 7 | echo "Please ensure availability of: wget && unzip && clickhouse-client && wc && awk" 8 | exit 1 9 | fi 10 | 11 | # download database 12 | 13 | ZIP_FILES_DIR="zip" 14 | 
echo "Create dir $ZIP_FILES_DIR for downloading zip files" 15 | mkdir "$ZIP_FILES_DIR" 16 | 17 | if [ ! -d "$ZIP_FILES_DIR" ]; then 18 | "Can' use dir: $ZIP_FILES_DIR - not available" 19 | exit 1 20 | fi 21 | 22 | echo "Download files into $ZIP_FILES_DIR" 23 | for year in `seq 1987 2017`; do 24 | for month in `seq 1 12`; do 25 | FILE_NAME="On_Time_On_Time_Performance_${year}_${month}.zip" 26 | wget -O "$ZIP_FILES_DIR/$FILE_NAME" "http://transtats.bts.gov/PREZIP/$FILE_NAME" 27 | done 28 | done 29 | 30 | -------------------------------------------------------------------------------- /airline/data/import.php: -------------------------------------------------------------------------------- 1 | read()) { 8 | if (stripos($entry, '.zip') !== false) { 9 | exec('unzip -o ' . $sourceDir . $entry); 10 | exec('unlink ' . $sourceDir . $entry); 11 | $entry = str_replace('.zip', '.csv', $entry); 12 | } 13 | 14 | if (stripos($entry, '.csv') === false) { 15 | continue; 16 | } 17 | 18 | echo "processing '$entry'...\n"; 19 | 20 | $entry = $sourceDir . 
$entry; 21 | exec("tail -n +2 $entry | clickhouse-client --query=\"INSERT INTO ontime ( Year, Quarter, Month, DayofMonth, DayOfWeek, FlightDate, UniqueCarrier, AirlineID, Carrier, TailNum, FlightNum, OriginAirportID, OriginAirportSeqID, OriginCityMarketID, Origin, OriginCityName, OriginState, OriginStateFips, OriginStateName, OriginWac, DestAirportID, DestAirportSeqID, DestCityMarketID, Dest, DestCityName, DestState, DestStateFips, DestStateName, DestWac, CRSDepTime, DepTime, DepDelay, DepDelayMinutes, DepDel15, DepartureDelayGroups, DepTimeBlk, TaxiOut, WheelsOff, WheelsOn, TaxiIn, CRSArrTime, ArrTime, ArrDelay, ArrDelayMinutes, ArrDel15, ArrivalDelayGroups, ArrTimeBlk, Cancelled, CancellationCode, Diverted, CRSElapsedTime, ActualElapsedTime, AirTime, Flights, Distance, DistanceGroup, CarrierDelay, WeatherDelay, NASDelay, SecurityDelay, LateAircraftDelay, FirstDepTime, TotalAddGTime, LongestAddGTime, DivAirportLandings, DivReachedDest, DivActualElapsedTime, DivArrDelay, DivDistance, Div1Airport, Div1AirportID, Div1AirportSeqID, Div1WheelsOn, Div1TotalGTime, Div1LongestGTime, Div1WheelsOff, Div1TailNum, Div2Airport, Div2AirportID, Div2AirportSeqID, Div2WheelsOn, Div2TotalGTime, Div2LongestGTime, Div2WheelsOff, Div2TailNum, Div3Airport, Div3AirportID, Div3AirportSeqID, Div3WheelsOn, Div3TotalGTime, Div3LongestGTime, Div3WheelsOff, Div3TailNum, Div4Airport, Div4AirportID, Div4AirportSeqID, Div4WheelsOn, Div4TotalGTime, Div4LongestGTime, Div4WheelsOff, Div4TailNum, Div5Airport, Div5AirportID, Div5AirportSeqID, Div5WheelsOn, Div5TotalGTime, Div5LongestGTime, Div5WheelsOff, Div5TailNum) FORMAT CSV\""); 22 | } 23 | 24 | echo "Complete in " . (time() - $start) . 
" seconds.\n"; 25 | 26 | -------------------------------------------------------------------------------- /airline/data/import.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | ZIP_FILES_DIR="zip" 4 | CSV_FILES_DIR="csv" 5 | 6 | mkdir "$CSV_FILES_DIR" 7 | 8 | if [ ! -d "$CSV_FILES_DIR" ]; then 9 | "Can' use dir: $CSV_FILES_DIR - not available" 10 | exit 1 11 | fi 12 | 13 | for ZIP_FILENAME in `ls "$ZIP_FILES_DIR"/*.zip`; do 14 | echo "Unzipping $ZIP_FILENAME to $CSV_FILES_DIR/" 15 | unzip -o "$ZIP_FILENAME" -d "$CSV_FILES_DIR/" 16 | done 17 | 18 | clickhouse-client --multiline --multiquery --query="$(cat ../ddl/schema.sql)" 19 | 20 | for CSV_FILENAME in `ls "$CSV_FILES_DIR"/*.csv`; do 21 | LINES_NUM=`wc -l "$CSV_FILENAME"|awk '{print $1}'` 22 | echo "*** Importing $CSV_FILENAME, see $LINES_NUM lines in it" 23 | if [[ $LINES_NUM < 2 ]]; then 24 | echo "NOTICE: too few lines in $CSV_FILENAME, skip import" 25 | else 26 | tail -n +2 $CSV_FILENAME | clickhouse-client --query="INSERT INTO airline.ontime ( 27 | Year, 28 | Quarter, 29 | Month, 30 | DayofMonth, 31 | DayOfWeek, 32 | FlightDate, 33 | UniqueCarrier, 34 | AirlineID, 35 | Carrier, 36 | TailNum, 37 | FlightNum, 38 | OriginAirportID, 39 | OriginAirportSeqID, 40 | OriginCityMarketID, 41 | Origin, 42 | OriginCityName, 43 | OriginState, 44 | OriginStateFips, 45 | OriginStateName, 46 | OriginWac, 47 | DestAirportID, 48 | DestAirportSeqID, 49 | DestCityMarketID, 50 | Dest, 51 | DestCityName, 52 | DestState, 53 | DestStateFips, 54 | DestStateName, 55 | DestWac, 56 | CRSDepTime, 57 | DepTime, 58 | DepDelay, 59 | DepDelayMinutes, 60 | DepDel15, 61 | DepartureDelayGroups, 62 | DepTimeBlk, 63 | TaxiOut, 64 | WheelsOff, 65 | WheelsOn, 66 | TaxiIn, 67 | CRSArrTime, 68 | ArrTime, 69 | ArrDelay, 70 | ArrDelayMinutes, 71 | ArrDel15, 72 | ArrivalDelayGroups, 73 | ArrTimeBlk, 74 | Cancelled, 75 | CancellationCode, 76 | Diverted, 77 | CRSElapsedTime, 78 | ActualElapsedTime, 
79 | AirTime, 80 | Flights, 81 | Distance, 82 | DistanceGroup, 83 | CarrierDelay, 84 | WeatherDelay, 85 | NASDelay, 86 | SecurityDelay, 87 | LateAircraftDelay, 88 | FirstDepTime, 89 | TotalAddGTime, 90 | LongestAddGTime, 91 | DivAirportLandings, 92 | DivReachedDest, 93 | DivActualElapsedTime, 94 | DivArrDelay, 95 | DivDistance, 96 | Div1Airport, 97 | Div1AirportID, 98 | Div1AirportSeqID, 99 | Div1WheelsOn, 100 | Div1TotalGTime, 101 | Div1LongestGTime, 102 | Div1WheelsOff, 103 | Div1TailNum, 104 | Div2Airport, 105 | Div2AirportID, 106 | Div2AirportSeqID, 107 | Div2WheelsOn, 108 | Div2TotalGTime, 109 | Div2LongestGTime, 110 | Div2WheelsOff, 111 | Div2TailNum, 112 | Div3Airport, 113 | Div3AirportID, 114 | Div3AirportSeqID, 115 | Div3WheelsOn, 116 | Div3TotalGTime, 117 | Div3LongestGTime, 118 | Div3WheelsOff, 119 | Div3TailNum, 120 | Div4Airport, 121 | Div4AirportID, 122 | Div4AirportSeqID, 123 | Div4WheelsOn, 124 | Div4TotalGTime, 125 | Div4LongestGTime, 126 | Div4WheelsOff, 127 | Div4TailNum, 128 | Div5Airport, 129 | Div5AirportID, 130 | Div5AirportSeqID, 131 | Div5WheelsOn, 132 | Div5TotalGTime, 133 | Div5LongestGTime, 134 | Div5WheelsOff, 135 | Div5TailNum 136 | ) FORMAT CSV" 137 | 138 | fi 139 | done 140 | 141 | -------------------------------------------------------------------------------- /airline/data/readme.html: -------------------------------------------------------------------------------- 1 | 2 | 3 | readme.html 4 | 5 | 6 | 7 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 
115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 |

BACKGROUND

The data contained in the compressed file has been extracted from the On-Time Performance data table of the "On-Time" database from the TranStats data library. The time period is indicated in the name of the compressed file; for example, XXX_XXXXX_2001_1 contains data of the first month of the year 2001.
 

RECORD LAYOUT

Below are fields in the order that they appear on the records:
Year — Year
Quarter — Quarter (1-4)
Month — Month
DayofMonth — Day of Month
DayOfWeek — Day of Week
FlightDate — Flight Date (yyyymmdd)
UniqueCarrier — Unique Carrier Code. When the same code has been used by multiple carriers, a numeric suffix is used for earlier users, for example, PA, PA(1), PA(2). Use this field for analysis across a range of years.
AirlineID — An identification number assigned by US DOT to identify a unique airline (carrier). A unique airline (carrier) is defined as one holding and reporting under the same DOT certificate regardless of its Code, Name, or holding company/corporation.
Carrier — Code assigned by IATA and commonly used to identify a carrier. As the same code may have been assigned to different carriers over time, the code is not always unique. For analysis, use the Unique Carrier Code.
TailNum — Tail Number
FlightNum — Flight Number
OriginAirportID — Origin Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
OriginAirportSeqID — Origin Airport, Airport Sequence ID. An identification number assigned by US DOT to identify a unique airport at a given point of time. Airport attributes, such as airport name or coordinates, may change over time.
OriginCityMarketID — Origin Airport, City Market ID. City Market ID is an identification number assigned by US DOT to identify a city market. Use this field to consolidate airports serving the same city market.
Origin — Origin Airport
OriginCityName — Origin Airport, City Name
OriginState — Origin Airport, State Code
OriginStateFips — Origin Airport, State Fips
OriginStateName — Origin Airport, State Name
OriginWac — Origin Airport, World Area Code
DestAirportID — Destination Airport, Airport ID. An identification number assigned by US DOT to identify a unique airport. Use this field for airport analysis across a range of years because an airport can change its airport code and airport codes can be reused.
DestAirportSeqID — Destination Airport, Airport Sequence ID. An identification number assigned by US DOT to identify a unique airport at a given point of time. Airport attributes, such as airport name or coordinates, may change over time.
DestCityMarketID — Destination Airport, City Market ID. City Market ID is an identification number assigned by US DOT to identify a city market. Use this field to consolidate airports serving the same city market.
Dest — Destination Airport
DestCityName — Destination Airport, City Name
DestState — Destination Airport, State Code
DestStateFips — Destination Airport, State Fips
DestStateName — Destination Airport, State Name
DestWac — Destination Airport, World Area Code
CRSDepTime — CRS Departure Time (local time: hhmm)
DepTime — Actual Departure Time (local time: hhmm)
DepDelay — Difference in minutes between scheduled and actual departure time. Early departures show negative numbers.
DepDelayMinutes — Difference in minutes between scheduled and actual departure time. Early departures set to 0.
DepDel15 — Departure Delay Indicator, 15 Minutes or More (1=Yes)
DepartureDelayGroups — Departure Delay intervals, every (15 minutes from <-15 to >180)
DepTimeBlk — CRS Departure Time Block, Hourly Intervals
TaxiOut — Taxi Out Time, in Minutes
WheelsOff — Wheels Off Time (local time: hhmm)
WheelsOn — Wheels On Time (local time: hhmm)
TaxiIn — Taxi In Time, in Minutes
CRSArrTime — CRS Arrival Time (local time: hhmm)
ArrTime — Actual Arrival Time (local time: hhmm)
ArrDelay — Difference in minutes between scheduled and actual arrival time. Early arrivals show negative numbers.
ArrDelayMinutes — Difference in minutes between scheduled and actual arrival time. Early arrivals set to 0.
ArrDel15 — Arrival Delay Indicator, 15 Minutes or More (1=Yes)
ArrivalDelayGroups — Arrival Delay intervals, every (15-minutes from <-15 to >180)
ArrTimeBlk — CRS Arrival Time Block, Hourly Intervals
Cancelled — Cancelled Flight Indicator (1=Yes)
CancellationCode — Specifies The Reason For Cancellation
Diverted — Diverted Flight Indicator (1=Yes)
CRSElapsedTime — CRS Elapsed Time of Flight, in Minutes
ActualElapsedTime — Elapsed Time of Flight, in Minutes
AirTime — Flight Time, in Minutes
Flights — Number of Flights
Distance — Distance between airports (miles)
DistanceGroup — Distance Intervals, every 250 Miles, for Flight Segment
CarrierDelay — Carrier Delay, in Minutes
WeatherDelay — Weather Delay, in Minutes
NASDelay — National Air System Delay, in Minutes
SecurityDelay — Security Delay, in Minutes
LateAircraftDelay — Late Aircraft Delay, in Minutes
FirstDepTime — First Gate Departure Time at Origin Airport
TotalAddGTime — Total Ground Time Away from Gate for Gate Return or Cancelled Flight
LongestAddGTime — Longest Time Away from Gate for Gate Return or Cancelled Flight
DivAirportLandings — Number of Diverted Airport Landings
DivReachedDest — Diverted Flight Reaching Scheduled Destination Indicator (1=Yes)
DivActualElapsedTime — Elapsed Time of Diverted Flight Reaching Scheduled Destination, in Minutes. The ActualElapsedTime column remains NULL for all diverted flights.
DivArrDelay — Difference in minutes between scheduled and actual arrival time for a diverted flight reaching scheduled destination. The ArrDelay column remains NULL for all diverted flights.
DivDistance — Distance between scheduled destination and final diverted airport (miles). Value will be 0 for diverted flight reaching scheduled destination.
Div1Airport — Diverted Airport Code1
Div1AirportID — Airport ID of Diverted Airport 1. Airport ID is a Unique Key for an Airport
Div1AirportSeqID — Airport Sequence ID of Diverted Airport 1. Unique Key for Time Specific Information for an Airport
Div1WheelsOn — Wheels On Time (local time: hhmm) at Diverted Airport Code1
Div1TotalGTime — Total Ground Time Away from Gate at Diverted Airport Code1
Div1LongestGTime — Longest Ground Time Away from Gate at Diverted Airport Code1
Div1WheelsOff — Wheels Off Time (local time: hhmm) at Diverted Airport Code1
Div1TailNum — Aircraft Tail Number for Diverted Airport Code1
Div2Airport — Diverted Airport Code2
Div2AirportID — Airport ID of Diverted Airport 2. Airport ID is a Unique Key for an Airport
Div2AirportSeqID — Airport Sequence ID of Diverted Airport 2. Unique Key for Time Specific Information for an Airport
Div2WheelsOn — Wheels On Time (local time: hhmm) at Diverted Airport Code2
Div2TotalGTime — Total Ground Time Away from Gate at Diverted Airport Code2
Div2LongestGTime — Longest Ground Time Away from Gate at Diverted Airport Code2
Div2WheelsOff — Wheels Off Time (local time: hhmm) at Diverted Airport Code2
Div2TailNum — Aircraft Tail Number for Diverted Airport Code2
Div3Airport — Diverted Airport Code3
Div3AirportID — Airport ID of Diverted Airport 3. Airport ID is a Unique Key for an Airport
Div3AirportSeqID — Airport Sequence ID of Diverted Airport 3. Unique Key for Time Specific Information for an Airport
Div3WheelsOn — Wheels On Time (local time: hhmm) at Diverted Airport Code3
Div3TotalGTime — Total Ground Time Away from Gate at Diverted Airport Code3
Div3LongestGTime — Longest Ground Time Away from Gate at Diverted Airport Code3
Div3WheelsOff — Wheels Off Time (local time: hhmm) at Diverted Airport Code3
Div3TailNum — Aircraft Tail Number for Diverted Airport Code3
Div4Airport — Diverted Airport Code4
Div4AirportID — Airport ID of Diverted Airport 4. Airport ID is a Unique Key for an Airport
Div4AirportSeqID — Airport Sequence ID of Diverted Airport 4. Unique Key for Time Specific Information for an Airport
Div4WheelsOn — Wheels On Time (local time: hhmm) at Diverted Airport Code4
Div4TotalGTime — Total Ground Time Away from Gate at Diverted Airport Code4
Div4LongestGTime — Longest Ground Time Away from Gate at Diverted Airport Code4
Div4WheelsOff — Wheels Off Time (local time: hhmm) at Diverted Airport Code4
Div4TailNum — Aircraft Tail Number for Diverted Airport Code4
Div5Airport — Diverted Airport Code5
Div5AirportID — Airport ID of Diverted Airport 5. Airport ID is a Unique Key for an Airport
Div5AirportSeqID — Airport Sequence ID of Diverted Airport 5. Unique Key for Time Specific Information for an Airport
Div5WheelsOn — Wheels On Time (local time: hhmm) at Diverted Airport Code5
Div5TotalGTime — Total Ground Time Away from Gate at Diverted Airport Code5
Div5LongestGTime — Longest Ground Time Away from Gate at Diverted Airport Code5
Div5WheelsOff — Wheels Off Time (local time: hhmm) at Diverted Airport Code5
Div5TailNum — Aircraft Tail Number for Diverted Airport Code5
124 | 125 | 126 | -------------------------------------------------------------------------------- /airline/ddl/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE DATABASE IF NOT EXISTS `airline`; 2 | CREATE TABLE IF NOT EXISTS `airline`.`ontime` ( 3 | `Year` UInt16, 4 | `Quarter` UInt8, 5 | `Month` UInt8, 6 | `DayofMonth` UInt8, 7 | `DayOfWeek` UInt8, 8 | `FlightDate` Date, 9 | `UniqueCarrier` String, 10 | `AirlineID` UInt32, 11 | `Carrier` String, 12 | `TailNum` String, 13 | `FlightNum` String, 14 | `OriginAirportID` UInt32, 15 | `OriginAirportSeqID` UInt32, 16 | `OriginCityMarketID` UInt32, 17 | `Origin` String, 18 | `OriginCityName` String, 19 | `OriginState` String, 20 | `OriginStateFips` String, 21 | `OriginStateName` String, 22 | `OriginWac` UInt32, 23 | `DestAirportID` UInt32, 24 | `DestAirportSeqID` UInt32, 25 | `DestCityMarketID` UInt32, 26 | `Dest` String, 27 | `DestCityName` String, 28 | `DestState` String, 29 | `DestStateFips` String, 30 | `DestStateName` String, 31 | `DestWac` UInt32, 32 | `CRSDepTime` UInt32, 33 | `DepTime` UInt32, 34 | `DepDelay` Float32, 35 | `DepDelayMinutes` Float32, 36 | `DepDel15` Float32, 37 | `DepartureDelayGroups` Int32, 38 | `DepTimeBlk` String, 39 | `TaxiOut` Float32, 40 | `WheelsOff` UInt32, 41 | `WheelsOn` UInt32, 42 | `TaxiIn` Float32, 43 | `CRSArrTime` UInt32, 44 | `ArrTime` UInt32, 45 | `ArrDelay` Float32, 46 | `ArrDelayMinutes` Float32, 47 | `ArrDel15` Float32, 48 | `ArrivalDelayGroups` Int32, 49 | `ArrTimeBlk` String, 50 | `Cancelled` Float32, 51 | `CancellationCode` String, 52 | `Diverted` Float32, 53 | `CRSElapsedTime` Float32, 54 | `ActualElapsedTime` Float32, 55 | `AirTime` Float32, 56 | `Flights` Float32, 57 | `Distance` Float32, 58 | `DistanceGroup` Float32, 59 | `CarrierDelay` Float32, 60 | `WeatherDelay` Float32, 61 | `NASDelay` Float32, 62 | `SecurityDelay` Float32, 63 | `LateAircraftDelay` Float32, 64 | `FirstDepTime` String, 65 | `TotalAddGTime` String, 
66 | `LongestAddGTime` String, 67 | `DivAirportLandings` String, 68 | `DivReachedDest` String, 69 | `DivActualElapsedTime` String, 70 | `DivArrDelay` String, 71 | `DivDistance` String, 72 | `Div1Airport` String, 73 | `Div1AirportID` UInt32, 74 | `Div1AirportSeqID` UInt32, 75 | `Div1WheelsOn` String, 76 | `Div1TotalGTime` String, 77 | `Div1LongestGTime` String, 78 | `Div1WheelsOff` String, 79 | `Div1TailNum` String, 80 | `Div2Airport` String, 81 | `Div2AirportID` UInt32, 82 | `Div2AirportSeqID` UInt32, 83 | `Div2WheelsOn` String, 84 | `Div2TotalGTime` String, 85 | `Div2LongestGTime` String, 86 | `Div2WheelsOff` String, 87 | `Div2TailNum` String, 88 | `Div3Airport` String, 89 | `Div3AirportID` UInt32, 90 | `Div3AirportSeqID` UInt32, 91 | `Div3WheelsOn` String, 92 | `Div3TotalGTime` String, 93 | `Div3LongestGTime` String, 94 | `Div3WheelsOff` String, 95 | `Div3TailNum` String, 96 | `Div4Airport` String, 97 | `Div4AirportID` UInt32, 98 | `Div4AirportSeqID` UInt32, 99 | `Div4WheelsOn` String, 100 | `Div4TotalGTime` String, 101 | `Div4LongestGTime` String, 102 | `Div4WheelsOff` String, 103 | `Div4TailNum` String, 104 | `Div5Airport` String, 105 | `Div5AirportID` UInt32, 106 | `Div5AirportSeqID` UInt32, 107 | `Div5WheelsOn` String, 108 | `Div5TotalGTime` String, 109 | `Div5LongestGTime` String, 110 | `Div5WheelsOff` String, 111 | `Div5TailNum` String 112 | ) 113 | ENGINE = MergeTree(FlightDate, (FlightDate, `Year`, `Month`, DepDel15), 8192); 114 | 115 | -------------------------------------------------------------------------------- /airline/readme.txt: -------------------------------------------------------------------------------- 1 | Airline On-Time Performance Data 2 | 3 | This dataset contains on-time arrival data for non-stop domestic flights by major air carriers, and provides such additional items as departure and arrival delays, origin and destination airports, flight numbers, scheduled and actual departure and arrival times, cancelled or diverted flights, 
taxi-out and taxi-in times, air time, and non-stop distance. This is an official dataset taken from https://transtats.bts.gov/. 4 | 5 | This repo has scripts that download the data from its source and prepare the ClickHouse schema. 6 | 7 | The Schema 8 | 9 | There is only one table, `ontime`, which has 109 columns and more than 175 million rows. -------------------------------------------------------------------------------- /nyc-taxi-rides/data/taxi_zones.tsv: -------------------------------------------------------------------------------- 1 | 1 Newark Airport 0000-00-00 2 | 2 Jamaica Bay 0000-00-00 3 | 3 Allerton/Pelham Gardens 0000-00-00 4 | 4 Alphabet City 0000-00-00 5 | 5 Arden Heights 0000-00-00 6 | 6 Arrochar/Fort Wadsworth 0000-00-00 7 | 7 Astoria 0000-00-00 8 | 8 Astoria Park 0000-00-00 9 | 9 Auburndale 0000-00-00 10 | 10 Baisley Park 0000-00-00 11 | 11 Bath Beach 0000-00-00 12 | 12 Battery Park 0000-00-00 13 | 13 Battery Park City 0000-00-00 14 | 14 Bay Ridge 0000-00-00 15 | 15 Bay Terrace/Fort Totten 0000-00-00 16 | 16 Bayside 0000-00-00 17 | 17 Bedford 0000-00-00 18 | 18 Bedford Park 0000-00-00 19 | 19 Bellerose 0000-00-00 20 | 20 Belmont 0000-00-00 21 | 21 Bensonhurst East 0000-00-00 22 | 22 Bensonhurst West 0000-00-00 23 | 23 Bloomfield/Emerson Hill 0000-00-00 24 | 24 Bloomingdale 0000-00-00 25 | 25 Boerum Hill 0000-00-00 26 | 26 Borough Park 0000-00-00 27 | 27 Breezy Point/Fort Tilden/Riis Beach 0000-00-00 28 | 28 Briarwood/Jamaica Hills 0000-00-00 29 | 29 Brighton Beach 0000-00-00 30 | 30 Broad Channel 0000-00-00 31 | 31 Bronx Park 0000-00-00 32 | 32 Bronxdale 0000-00-00 33 | 33 Brooklyn Heights 0000-00-00 34 | 34 Brooklyn Navy Yard 0000-00-00 35 | 35 Brownsville 0000-00-00 36 | 36 Bushwick North 0000-00-00 37 | 37 Bushwick South 0000-00-00 38 | 38 Cambria Heights 0000-00-00 39 | 39 Canarsie 0000-00-00 40 | 40 Carroll Gardens 0000-00-00 41 | 41 Central Harlem 0000-00-00 42 | 42 Central Harlem North 0000-00-00 43 | 43 Central Park 0000-00-00 44 | 44 
Charleston/Tottenville 0000-00-00 45 | 45 Chinatown 0000-00-00 46 | 46 City Island 0000-00-00 47 | 47 Claremont/Bathgate 0000-00-00 48 | 48 Clinton East 0000-00-00 49 | 49 Clinton Hill 0000-00-00 50 | 50 Clinton West 0000-00-00 51 | 51 Co-Op City 0000-00-00 52 | 52 Cobble Hill 0000-00-00 53 | 53 College Point 0000-00-00 54 | 54 Columbia Street 0000-00-00 55 | 55 Coney Island 0000-00-00 56 | 56 Corona 0000-00-00 57 | 56 Corona 0000-00-00 58 | 58 Country Club 0000-00-00 59 | 59 Crotona Park 0000-00-00 60 | 60 Crotona Park East 0000-00-00 61 | 61 Crown Heights North 0000-00-00 62 | 62 Crown Heights South 0000-00-00 63 | 63 Cypress Hills 0000-00-00 64 | 64 Douglaston 0000-00-00 65 | 65 Downtown Brooklyn/MetroTech 0000-00-00 66 | 66 DUMBO/Vinegar Hill 0000-00-00 67 | 67 Dyker Heights 0000-00-00 68 | 68 East Chelsea 0000-00-00 69 | 69 East Concourse/Concourse Village 0000-00-00 70 | 70 East Elmhurst 0000-00-00 71 | 71 East Flatbush/Farragut 0000-00-00 72 | 72 East Flatbush/Remsen Village 0000-00-00 73 | 73 East Flushing 0000-00-00 74 | 74 East Harlem North 0000-00-00 75 | 75 East Harlem South 0000-00-00 76 | 76 East New York 0000-00-00 77 | 77 East New York/Pennsylvania Avenue 0000-00-00 78 | 78 East Tremont 0000-00-00 79 | 79 East Village 0000-00-00 80 | 80 East Williamsburg 0000-00-00 81 | 81 Eastchester 0000-00-00 82 | 82 Elmhurst 0000-00-00 83 | 83 Elmhurst/Maspeth 0000-00-00 84 | 84 Eltingville/Annadale/Prince\'s Bay 0000-00-00 85 | 85 Erasmus 0000-00-00 86 | 86 Far Rockaway 0000-00-00 87 | 87 Financial District North 0000-00-00 88 | 88 Financial District South 0000-00-00 89 | 89 Flatbush/Ditmas Park 0000-00-00 90 | 90 Flatiron 0000-00-00 91 | 91 Flatlands 0000-00-00 92 | 92 Flushing 0000-00-00 93 | 93 Flushing Meadows-Corona Park 0000-00-00 94 | 94 Fordham South 0000-00-00 95 | 95 Forest Hills 0000-00-00 96 | 96 Forest Park/Highland Park 0000-00-00 97 | 97 Fort Greene 0000-00-00 98 | 98 Fresh Meadows 0000-00-00 99 | 99 Freshkills Park 0000-00-00 100 | 100 Garment 
District 0000-00-00 101 | 101 Glen Oaks 0000-00-00 102 | 102 Glendale 0000-00-00 103 | 103 Governor\'s Island/Ellis Island/Liberty Island 0000-00-00 104 | 103 Governor\'s Island/Ellis Island/Liberty Island 0000-00-00 105 | 103 Governor\'s Island/Ellis Island/Liberty Island 0000-00-00 106 | 106 Gowanus 0000-00-00 107 | 107 Gramercy 0000-00-00 108 | 108 Gravesend 0000-00-00 109 | 109 Great Kills 0000-00-00 110 | 110 Great Kills Park 0000-00-00 111 | 111 Green-Wood Cemetery 0000-00-00 112 | 112 Greenpoint 0000-00-00 113 | 113 Greenwich Village North 0000-00-00 114 | 114 Greenwich Village South 0000-00-00 115 | 115 Grymes Hill/Clifton 0000-00-00 116 | 116 Hamilton Heights 0000-00-00 117 | 117 Hammels/Arverne 0000-00-00 118 | 118 Heartland Village/Todt Hill 0000-00-00 119 | 119 Highbridge 0000-00-00 120 | 120 Highbridge Park 0000-00-00 121 | 121 Hillcrest/Pomonok 0000-00-00 122 | 122 Hollis 0000-00-00 123 | 123 Homecrest 0000-00-00 124 | 124 Howard Beach 0000-00-00 125 | 125 Hudson Sq 0000-00-00 126 | 126 Hunts Point 0000-00-00 127 | 127 Inwood 0000-00-00 128 | 128 Inwood Hill Park 0000-00-00 129 | 129 Jackson Heights 0000-00-00 130 | 130 Jamaica 0000-00-00 131 | 131 Jamaica Estates 0000-00-00 132 | 132 JFK Airport 0000-00-00 133 | 133 Kensington 0000-00-00 134 | 134 Kew Gardens 0000-00-00 135 | 135 Kew Gardens Hills 0000-00-00 136 | 136 Kingsbridge Heights 0000-00-00 137 | 137 Kips Bay 0000-00-00 138 | 138 LaGuardia Airport 0000-00-00 139 | 139 Laurelton 0000-00-00 140 | 140 Lenox Hill East 0000-00-00 141 | 141 Lenox Hill West 0000-00-00 142 | 142 Lincoln Square East 0000-00-00 143 | 143 Lincoln Square West 0000-00-00 144 | 144 Little Italy/NoLiTa 0000-00-00 145 | 145 Long Island City/Hunters Point 0000-00-00 146 | 146 Long Island City/Queens Plaza 0000-00-00 147 | 147 Longwood 0000-00-00 148 | 148 Lower East Side 0000-00-00 149 | 149 Madison 0000-00-00 150 | 150 Manhattan Beach 0000-00-00 151 | 151 Manhattan Valley 0000-00-00 152 | 152 Manhattanville 0000-00-00 153 | 
153 Marble Hill 0000-00-00 154 | 154 Marine Park/Floyd Bennett Field 0000-00-00 155 | 155 Marine Park/Mill Basin 0000-00-00 156 | 156 Mariners Harbor 0000-00-00 157 | 157 Maspeth 0000-00-00 158 | 158 Meatpacking/West Village West 0000-00-00 159 | 159 Melrose South 0000-00-00 160 | 160 Middle Village 0000-00-00 161 | 161 Midtown Center 0000-00-00 162 | 162 Midtown East 0000-00-00 163 | 163 Midtown North 0000-00-00 164 | 164 Midtown South 0000-00-00 165 | 165 Midwood 0000-00-00 166 | 166 Morningside Heights 0000-00-00 167 | 167 Morrisania/Melrose 0000-00-00 168 | 168 Mott Haven/Port Morris 0000-00-00 169 | 169 Mount Hope 0000-00-00 170 | 170 Murray Hill 0000-00-00 171 | 171 Murray Hill-Queens 0000-00-00 172 | 172 New Dorp/Midland Beach 0000-00-00 173 | 173 North Corona 0000-00-00 174 | 174 Norwood 0000-00-00 175 | 175 Oakland Gardens 0000-00-00 176 | 176 Oakwood 0000-00-00 177 | 177 Ocean Hill 0000-00-00 178 | 178 Ocean Parkway South 0000-00-00 179 | 179 Old Astoria 0000-00-00 180 | 180 Ozone Park 0000-00-00 181 | 181 Park Slope 0000-00-00 182 | 182 Parkchester 0000-00-00 183 | 183 Pelham Bay 0000-00-00 184 | 184 Pelham Bay Park 0000-00-00 185 | 185 Pelham Parkway 0000-00-00 186 | 186 Penn Station/Madison Sq West 0000-00-00 187 | 187 Port Richmond 0000-00-00 188 | 188 Prospect-Lefferts Gardens 0000-00-00 189 | 189 Prospect Heights 0000-00-00 190 | 190 Prospect Park 0000-00-00 191 | 191 Queens Village 0000-00-00 192 | 192 Queensboro Hill 0000-00-00 193 | 193 Queensbridge/Ravenswood 0000-00-00 194 | 194 Randalls Island 0000-00-00 195 | 195 Red Hook 0000-00-00 196 | 196 Rego Park 0000-00-00 197 | 197 Richmond Hill 0000-00-00 198 | 198 Ridgewood 0000-00-00 199 | 199 Rikers Island 0000-00-00 200 | 200 Riverdale/North Riverdale/Fieldston 0000-00-00 201 | 201 Rockaway Park 0000-00-00 202 | 202 Roosevelt Island 0000-00-00 203 | 203 Rosedale 0000-00-00 204 | 204 Rossville/Woodrow 0000-00-00 205 | 205 Saint Albans 0000-00-00 206 | 206 Saint George/New Brighton 0000-00-00 207 | 
207 Saint Michaels Cemetery/Woodside 0000-00-00 208 | 208 Schuylerville/Edgewater Park 0000-00-00 209 | 209 Seaport 0000-00-00 210 | 210 Sheepshead Bay 0000-00-00 211 | 211 SoHo 0000-00-00 212 | 212 Soundview/Bruckner 0000-00-00 213 | 213 Soundview/Castle Hill 0000-00-00 214 | 214 South Beach/Dongan Hills 0000-00-00 215 | 215 South Jamaica 0000-00-00 216 | 216 South Ozone Park 0000-00-00 217 | 217 South Williamsburg 0000-00-00 218 | 218 Springfield Gardens North 0000-00-00 219 | 219 Springfield Gardens South 0000-00-00 220 | 220 Spuyten Duyvil/Kingsbridge 0000-00-00 221 | 221 Stapleton 0000-00-00 222 | 222 Starrett City 0000-00-00 223 | 223 Steinway 0000-00-00 224 | 224 Stuy Town/Peter Cooper Village 0000-00-00 225 | 225 Stuyvesant Heights 0000-00-00 226 | 226 Sunnyside 0000-00-00 227 | 227 Sunset Park East 0000-00-00 228 | 228 Sunset Park West 0000-00-00 229 | 229 Sutton Place/Turtle Bay North 0000-00-00 230 | 230 Times Sq/Theatre District 0000-00-00 231 | 231 TriBeCa/Civic Center 0000-00-00 232 | 232 Two Bridges/Seward Park 0000-00-00 233 | 233 UN/Turtle Bay South 0000-00-00 234 | 234 Union Sq 0000-00-00 235 | 235 University Heights/Morris Heights 0000-00-00 236 | 236 Upper East Side North 0000-00-00 237 | 237 Upper East Side South 0000-00-00 238 | 238 Upper West Side North 0000-00-00 239 | 239 Upper West Side South 0000-00-00 240 | 240 Van Cortlandt Park 0000-00-00 241 | 241 Van Cortlandt Village 0000-00-00 242 | 242 Van Nest/Morris Park 0000-00-00 243 | 243 Washington Heights North 0000-00-00 244 | 244 Washington Heights South 0000-00-00 245 | 245 West Brighton 0000-00-00 246 | 246 West Chelsea/Hudson Yards 0000-00-00 247 | 247 West Concourse 0000-00-00 248 | 248 West Farms/Bronx River 0000-00-00 249 | 249 West Village 0000-00-00 250 | 250 Westchester Village/Unionport 0000-00-00 251 | 251 Westerleigh 0000-00-00 252 | 252 Whitestone 0000-00-00 253 | 253 Willets Point 0000-00-00 254 | 254 Williamsbridge/Olinville 0000-00-00 255 | 255 Williamsburg (North Side) 
0000-00-00 256 | 256 Williamsburg (South Side) 0000-00-00 257 | 257 Windsor Terrace 0000-00-00 258 | 258 Woodhaven 0000-00-00 259 | 259 Woodlawn/Wakefield 0000-00-00 260 | 260 Woodside 0000-00-00 261 | 261 World Trade Center 0000-00-00 262 | 262 Yorkville East 0000-00-00 263 | 263 Yorkville West 0000-00-00 264 | -------------------------------------------------------------------------------- /nyc-taxi-rides/ddl/import_trip_data.php: -------------------------------------------------------------------------------- 1 | read()) !== false) { 18 | if (preg_match($year_month_regex, $file, $m)) { 19 | $year = $m[1]; 20 | $month = $m[2]; 21 | } 22 | $schema = ''; 23 | 24 | 25 | if (stripos($file, 'yellow') !== false) { 26 | if ($year < 2015) { 27 | $schema = $yellow_schema_pre_2015; 28 | } elseif ($year == 2015 || $year == 2016 && $month < 7) { 29 | $schema = $yellow_schema_2015_2016_h1; 30 | } else { 31 | $schema = $yellow_schema_2016_h2; 32 | } 33 | 34 | echo date('Y-m-d H:i:s'), ": beginning load for $file\n"; 35 | exec("tail -n +4 $dataFolder/$file | clickhouse-client --query=\"INSERT INTO yellow_tripdata_staging $schema FORMAT CSV\""); 36 | echo date('Y-m-d H:i:s'), ": finish load for $file\n"; 37 | } 38 | 39 | } -------------------------------------------------------------------------------- /nyc-taxi-rides/ddl/schema.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE test.taxi_zones ( 2 | location_id UInt32, 3 | zone String, 4 | create_date Date DEFAULT toDate(0) 5 | ) ENGINE = MergeTree(create_date, location_id, 8192); 6 | 7 | CREATE TABLE test.central_park_weather_observations ( 8 | station_id String, 9 | station_name String, 10 | weather_date Date, 11 | precipitation Float32, 12 | snow_depth Float32, 13 | snowfall Int32, 14 | max_temperature Float32, 15 | min_temperature Float32, 16 | average_wind_speed Float32) 17 | ENGINE = MergeTree(weather_date, station_id, 8192); 18 | 19 | CREATE TABLE 
test.yellow_tripdata_staging ( 20 | pickup_date Date DEFAULT toDate(tpep_pickup_datetime), 21 | id UInt64, 22 | vendor_id String, 23 | tpep_pickup_datetime DateTime, 24 | tpep_dropoff_datetime DateTime, 25 | passenger_count Int32, 26 | trip_distance Float32, 27 | pickup_longitude Float32, 28 | pickup_latitude Float32, 29 | rate_code_id String, 30 | store_and_fwd_flag String, 31 | dropoff_longitude Float32, 32 | dropoff_latitude Float32, 33 | payment_type String, 34 | fare_amount String, 35 | extra String, 36 | mta_tax String, 37 | tip_amount String, 38 | tolls_amount String, 39 | improvement_surcharge String, 40 | total_amount Float32, 41 | pickup_location_id UInt32, 42 | dropoff_location_id UInt32, 43 | junk1 String, 44 | junk2 String 45 | ) ENGINE = MergeTree(pickup_date, (id, pickup_location_id, dropoff_location_id, vendor_id), 8192); 46 | 47 | -------------------------------------------------------------------------------- /nyc-taxi-rides/ddl/taxi_zones.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | taxi_zones 4 | 5 | 6 | localhost 7 | 9000 8 | default 9 | 10 | test 11 | taxi_zones
12 |
13 | 14 | 15 | 300 16 | 360 17 | 18 | 19 | 20 | 21 | 22 | 23 | location_id 24 | 25 | 26 | zone 27 | String 28 | 29 | 30 | 31 |
32 |
33 | 34 | -------------------------------------------------------------------------------- /nyc-taxi-rides/ddl/weather.xml: -------------------------------------------------------------------------------- 1 | 2 | 3 | weather 4 | 5 | 6 | localhost 7 | 9000 8 | default 9 | 10 | test 11 | central_park_weather_observations
12 |
13 | 14 | 15 | 300 16 | 360 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | station_id 25 | String 26 | 27 | 28 | 29 | weather_date 30 | Date 31 | 0000-00-00 32 | 33 | 34 | 35 | station_id 36 | String 37 | 38 | 39 | 40 | station_name 41 | String 42 | 43 | 44 | 45 | precipitation 46 | Float32 47 | 0 48 | 49 | 50 | snow_depth 51 | Float32 52 | 0 53 | 54 | 55 | snowfall 56 | UInt32 57 | 0 58 | 59 | 60 | max_temperature 61 | Float32 62 | 0 63 | 64 | 65 | min_temperature 66 | Float32 67 | 0 68 | 69 | 70 | average_wind_speed 71 | Float32 72 | 0 73 | 74 | 75 |
76 |
77 | -------------------------------------------------------------------------------- /nyc-taxi-rides/readme.txt: -------------------------------------------------------------------------------- 1 | New York City Taxi Dataset 2 | 3 | The official TLC trip record dataset contains data for over 1.1 billion taxi trips from January 2009 through June 2015, covering both yellow and green taxis. Each individual trip record contains precise location coordinates for where the trip started and ended, timestamps for when the trip started and ended, plus a few other variables including fare amount, payment method, and distance traveled. 4 | 5 | This repo provides scripts to download that data and prepare the schema for ClickHouse. 6 | 7 | The Schema 8 | * yellow_tripdata_staging table contains all yellow taxi trips. Each trip maps census tracts to NYC's official neighborhood tabulation areas by pickup/dropoff location IDs. 9 | * tripdata table is a distributed table across the cluster with the same data as yellow_tripdata_staging. 10 | * taxi_zones table contains the TLC's official taxi zone boundaries. Starting in July 2016, the TLC no longer provides pickup and dropoff coordinates. 
Instead, each trip comes with taxi zone pickup and dropoff location IDs. 11 | * central_park_weather_observations dictionary has summary weather data by date. 12 | * taxi_zones dictionary has official neighborhood tabulation areas data. 13 | 14 | 15 | -------------------------------------------------------------------------------- /star/data/download.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | wget http://sdm.lbl.gov/fastbit/data/star2002-full.csv.gz 3 | gzip -d star2002-full.csv.gz -------------------------------------------------------------------------------- /star/data/import.php: -------------------------------------------------------------------------------- 1 | 1 -------------------------------------------------------------------------------- /star/sql/test3.sql: -------------------------------------------------------------------------------- 1 | SELECT count(*) 2 | FROM starexp 3 | WHERE eventNumber > 20000 -------------------------------------------------------------------------------- /star/sql/test4.sql: -------------------------------------------------------------------------------- 1 | SELECT count(*) 2 | FROM starexp 3 | WHERE eventNumber > 500000 -------------------------------------------------------------------------------- /star/sql/test5.sql: -------------------------------------------------------------------------------- 1 | SELECT eventFile, count(*) 2 | FROM starexp 3 | GROUP BY eventFile -------------------------------------------------------------------------------- /star/sql/test6.sql: -------------------------------------------------------------------------------- 1 | SELECT eventFile, count(*) 2 | FROM starexp 3 | WHERE eventNumber > 525000 4 | GROUP BY eventFile -------------------------------------------------------------------------------- /star/sql/test7.sql: -------------------------------------------------------------------------------- 1 | SELECT eventFile, eventTime, 
count(*) 2 | FROM starexp 3 | WHERE eventNumber > 525000 4 | GROUP BY eventFile, eventTime 5 | ORDER BY eventFile DESC, eventTime ASC -------------------------------------------------------------------------------- /star/sql/test8.sql: -------------------------------------------------------------------------------- 1 | -- Two independent statements; the ';' separator is required (run with clickhouse-client --multiquery). 2 | SELECT MAX(runNumber) 3 | FROM starexp; 4 | SELECT AVG(eventTime) 5 | FROM starexp 6 | WHERE eventNumber > 20000 -------------------------------------------------------------------------------- /star/sql/test9.sql: -------------------------------------------------------------------------------- 1 | SELECT eventFile, 2 | AVG(eventTime), AVG(multiplicity), 3 | MAX(runNumber), count(*) 4 | FROM starexp 5 | WHERE eventNumber > 20000 6 | GROUP BY eventFile --------------------------------------------------------------------------------