├── README.md
├── hu.bme.bigdata.homework.mapreduce.flight2
├── .classpath
├── .project
├── .settings
│ └── org.eclipse.jdt.core.prefs
├── build.sh
└── src
│ └── hu
│ └── bme
│ └── bigdata
│ └── homework
│ └── mapreduce
│ └── flight2
│ ├── JobRunner.java
│ ├── MapReduceApplication.java
│ ├── mappers
│ ├── RawDataToOriginMapper.java
│ └── RowToPairMapper.java
│ └── reducers
│ ├── MaxOccurenceReducer.java
│ └── OriginCounterReducer.java
├── hu.bme.bigdata.homework.spark.flight2
├── .classpath
├── .project
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── hu
│ │ └── bme
│ │ └── bigdata
│ │ └── homework
│ │ └── spark
│ │ └── flight2
│ │ ├── application
│ │ ├── SparkApplication.java
│ │ ├── TransformationManager.java
│ │ └── jopt
│ │ │ └── ParameterManager.java
│ │ ├── transformation
│ │ ├── OccurenceSummarizer.java
│ │ ├── RawToPairTransformer.java
│ │ └── ValidRecordFilter.java
│ │ └── utility
│ │ └── OriginTupleComparator.java
│ └── resources
│ └── log4j.properties
├── hu.bme.bigdata.homework.spark.flight3
├── .classpath
├── .project
├── .settings
│ ├── org.eclipse.jdt.core.prefs
│ └── org.eclipse.m2e.core.prefs
├── pom.xml
└── src
│ └── main
│ ├── java
│ └── hu
│ │ └── bme
│ │ └── bigdata
│ │ └── homework
│ │ └── spark
│ │ └── flight3
│ │ ├── application
│ │ ├── SparkApplication.java
│ │ ├── TransformationManager.java
│ │ └── jopt
│ │ │ └── ParameterManager.java
│ │ ├── enums
│ │ └── Season.java
│ │ ├── transformation
│ │ ├── ArrDelayToIsDelayedTransformer.java
│ │ ├── AverageCalculator.java
│ │ ├── CancelledExcluder.java
│ │ ├── DateToSeasonTransformer.java
│ │ ├── RawToPairTransformer.java
│ │ └── SumCalculator.java
│ │ └── utility
│ │ ├── SeasonTupleComparator.java
│ │ └── SeasonTuplePrinter.java
│ └── resources
│ └── log4j.properties
└── images
└── benchmark.png

/README.md:
--------------------------------------------------------------------------------
1 | # bigdata-projects
2 | Student projects in the Big Data field, from my studies at Budapest University of Technology and Economics (BME).
3 | All the projects were homework assignments that I implemented myself.
4 | 
5 | **Comments and suggestions are warmly welcome.**
6 | 
7 | **This repository uses large files. Please consider integrating [Git LFS](https://git-lfs.github.com/) into your Git workflow as well! (Track *.7z files.)**
8 | 
9 | # Projects
10 | ## Flight data analysis
11 | 
12 | ### Task
13 | 
14 | Flight records in the USA are stored, and some of them are made available for research purposes at [Statistical Computing](http://stat-computing.org/dataexpo/2009/the-data.html). The data are separated by year, from 1987 to 2008. The attributes include the common properties a flight record has (e.g. date, origin and destination airports, air time, scheduled and actual departure and arrival times, etc.).
15 | 
16 | During a practical course called 'Big Data Analytics Tools with Open-Source Platforms' at BME we had a homework assignment which contained two questions. The questions had to be answered by implementing a data analysis chain that retrieves the necessary information from the input files. We could use several technologies from the Hadoop framework. I used Apache Spark™ and native Java MapReduce.
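(For reference: the implementations below address the CSV fields by position. Assuming the standard 29-column layout of the data expo files, the 0-based indices used in the code are 0 = Year, 1 = Month, 2 = DayofMonth, 14 = ArrDelay, 16 = Origin and 21 = Cancelled.)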
17 | 
18 | **We had to work with the dataset of year 2008, which is also stored on the [datasets branch](https://github.com/benedekh/bigdata-projects/tree/datasets) of this repository.**
19 | 
20 | ### Question #1
21 | 
22 | *From which airport did the most airplanes take off?*
23 | 
24 | ### Answer #1
25 | 
26 | *ATL, 408683 times*
27 | 
28 | According to the dataset of year 2008.
29 | 
30 | I used Apache Spark™ and Java MapReduce to answer the question. The Apache Spark™ solution is available [here](https://github.com/benedekh/bigdata-projects/tree/master/hu.bme.bigdata.homework.spark.flight2/), while the Java MapReduce solution is available [here](https://github.com/benedekh/bigdata-projects/tree/master/hu.bme.bigdata.homework.mapreduce.flight2/).
31 | 
32 | #### Apache Spark™
33 | 
34 | You should use Apache Spark™ 1.5.1 for Hadoop 2.6. I downloaded a pre-built version [here](http://spark.apache.org/downloads.html), and I used Apache Spark™ in standalone mode, without Hadoop.
35 | 
36 | To compile the source code of the implementation, you should use Maven:
37 | 
38 | ```bash
39 | cd hu.bme.bigdata.homework.spark.flight2
40 | 
41 | mvn clean
42 | 
43 | mvn install
44 | ```
45 | 
46 | The compiled jar (spark.flight2-0.0.1-SNAPSHOT.jar) is available in the *target* folder. To run the solution, you should either download a CSV from [Statistical Computing](http://stat-computing.org/dataexpo/2009/the-data.html) or switch to the *datasets* branch of this repository and download the *2008.csv*.
47 | 
48 | To run from the command line:
49 | 
50 | ```bash
51 | cd target
52 | 
53 | java -jar spark.flight2-0.0.1-SNAPSHOT.jar --home <Spark home directory> --data <2008.csv path> --partitions <number of partitions>
54 | ```
55 | 
56 | The parameters are self-explanatory, though the partitions parameter should be set to the number of cores your computer's CPU has (use *--partitions 1* if you are not sure how many cores your CPU has).
57 | 
58 | 
59 | #### Java MapReduce
60 | 
61 | I used the *Cloudera QuickStart 5.4.2.0* virtual machine image for VirtualBox; you can download it [here](http://www.cloudera.com/content/www/en-us/downloads/quickstart_vms/5-4.html). I used the local Hadoop cluster offered by the image, via the Cloudera web console. I uploaded the data file (2008.csv) to HDFS.
62 | 
63 | To compile the source code of the homework assignment:
64 | 
65 | ```bash
66 | cd hu.bme.bigdata.homework.mapreduce.flight2
67 | 
68 | javac -cp /usr/lib/hadoop/*:/usr/lib/hadoop-mapreduce/* hu/bme/bigdata/homework/mapreduce/flight2/*.java hu/bme/bigdata/homework/mapreduce/flight2/mappers/*.java hu/bme/bigdata/homework/mapreduce/flight2/reducers/*.java -d build -Xlint
69 | 
70 | jar -cvf demo.jar -C build/ .
71 | ```
72 | 
73 | The *demo.jar* is the compiled jar that should be run on Hadoop:
74 | 
75 | ```bash
76 | hadoop jar demo.jar hu.bme.bigdata.homework.mapreduce.flight2.MapReduceApplication <2008.csv path on HDFS> <output folder path on HDFS>
77 | ```
78 | 
79 | The output of the job is written to the *[output folder path on HDFS]-result* folder (pay attention to the *-result* suffix).
80 | 
81 | 
82 | ### Question #2
83 | 
84 | *Are proportionally more planes delayed in winter or in summer?*
85 | 
86 | Winter lasts from 1st November to 7th March. All other dates belong to summer.
87 | 
88 | ### Answer #2
89 | 
90 | |SEASON|NUMBER OF DELAYS|NUMBER OF RECORDS|
91 | |--------|------------------|-------------------|
92 | |SUMMER|1894499|4554528|
93 | |WINTER|1085005|2317766|
94 | 
95 | *On average, about 5 percentage points more of the planes are delayed in WINTER than in SUMMER.*
96 | 
97 | According to the dataset of year 2008.
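The figure follows from the table above: 1085005 / 2317766 ≈ 46.8% of the winter flights were delayed, while 1894499 / 4554528 ≈ 41.6% of the summer flights were delayed, a difference of roughly 5.2 percentage points.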
98 | 
99 | I used Apache Spark™ to answer the question. The Apache Spark™ solution is available [here](https://github.com/benedekh/bigdata-projects/tree/master/hu.bme.bigdata.homework.spark.flight3/).
100 | 
101 | #### Apache Spark™
102 | 
103 | You should use Apache Spark™ 1.5.1 for Hadoop 2.6. I downloaded a pre-built version [here](http://spark.apache.org/downloads.html), and I used Apache Spark™ in standalone mode, without Hadoop.
104 | 
105 | To compile the source code of the implementation, you should use Maven:
106 | 
107 | ```bash
108 | cd hu.bme.bigdata.homework.spark.flight3
109 | 
110 | mvn clean
111 | 
112 | mvn install
113 | ```
114 | 
115 | The compiled jar (spark.flight3-0.0.1-SNAPSHOT.jar) is available in the *target* folder. To run the solution, you should either download a CSV from [Statistical Computing](http://stat-computing.org/dataexpo/2009/the-data.html) or switch to the *datasets* branch of this repository and download the *2008.csv*.
116 | 
117 | To run from the command line:
118 | 
119 | ```bash
120 | cd target
121 | 
122 | java -jar spark.flight3-0.0.1-SNAPSHOT.jar --home <Spark home directory> --data <2008.csv path> --partitions <number of partitions>
123 | ```
124 | 
125 | The parameters are self-explanatory, though the partitions parameter should be set to the number of cores your computer's CPU has (use *--partitions 1* if you are not sure how many cores your CPU has).
126 | 
127 | 
128 | ### Benchmark
129 | 
130 | I benchmarked the two Apache Spark™ solutions for the questions, and the Java MapReduce implementation for the first question.
131 | 
132 | _The input data was the **2008.csv**, which is available in a compressed archive [here](https://github.com/benedekh/bigdata-projects/tree/datasets) in the repository, and [here](http://stat-computing.org/dataexpo/2009/the-data.html) on the original website._
133 | 
134 | The benchmarking was done on a computer with an Intel Core i7-4700MQ @ 2.4 GHz CPU and 8 GB RAM.
135 | 
136 | Apache Spark™ was run on a VirtualBox virtual machine using 4 CPU cores and 5 GB RAM. The Apache Spark™ implementations of the assignments were run using 4 partitions as a parameter. The Apache Spark™ solution of the 1st assignment is called **Flight2-Spark**, and that of the 2nd assignment is called **Flight3-Spark** on the figure.
137 | 
138 | Java MapReduce was run on the Cloudera VM using 4 CPU cores and 5 GB RAM. The Java MapReduce solution of the 1st assignment is called **Flight2-MR** on the figure.
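For instance, a **Flight3-Spark** benchmark run was launched like this (the Spark home and CSV paths are illustrative, substitute your own):

```bash
java -jar spark.flight3-0.0.1-SNAPSHOT.jar --home /opt/spark-1.5.1-bin-hadoop2.6 --data /path/to/2008.csv --partitions 4
```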
139 | 140 | [![](images/benchmark.png)](images/benchmark.png) 141 | 142 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.mapreduce.flight2/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | 38 | 39 | 40 | 41 | 42 | 43 | 44 | 45 | 46 | 47 | 48 | 49 | 50 | 51 | 52 | 53 | 54 | 55 | 56 | 57 | 58 | 59 | 60 | 61 | 62 | 63 | 64 | 65 | 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | 76 | 77 | 78 | 79 | 80 | 81 | 82 | 83 | 84 | 85 | 86 | 87 | 88 | 89 | 90 | 91 | 92 | 93 | 94 | 95 | 96 | 97 | 98 | 99 | 100 | 101 | 102 | 103 | 104 | 105 | 106 | 107 | 108 | 109 | 110 | 111 | 112 | 113 | 114 | 115 | 116 | 117 | 118 | 119 | 120 | 121 | 122 | 123 | 124 | 125 | 126 | 127 | 128 | 129 | 130 | 131 | 132 | 133 | 134 | 135 | 136 | 137 | 138 | 139 | 140 | 141 | 142 | 143 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.mapreduce.flight2/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | hu.bme.bigdata.homework.mapreduce.flight2 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | 15 | org.eclipse.jdt.core.javanature 16 | 17 | 18 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.mapreduce.flight2/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.source=1.7 12 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.mapreduce.flight2/build.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | javac -cp /usr/lib/hadoop/*:/usr/lib/hadoop-mapreduce/* hu/bme/bigdata/homework/mapreduce/flight2/*.java hu/bme/bigdata/homework/mapreduce/flight2/mappers/*.java hu/bme/bigdata/homework/mapreduce/flight2/reducers/*.java -d build -Xlint 4 | jar -cvf demo.jar -C build/ . 
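# run the job on the local Hadoop cluster; input/2008.csv and output1 are example HDFS paths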
5 | hadoop jar demo.jar hu.bme.bigdata.homework.mapreduce.flight2.MapReduceApplication input/2008.csv output1 6 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.mapreduce.flight2/src/hu/bme/bigdata/homework/mapreduce/flight2/JobRunner.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.mapreduce.flight2; 2 | 3 | import hu.bme.bigdata.homework.mapreduce.flight2.mappers.RawDataToOriginMapper; 4 | import hu.bme.bigdata.homework.mapreduce.flight2.mappers.RowToPairMapper; 5 | import hu.bme.bigdata.homework.mapreduce.flight2.reducers.MaxOccurenceReducer; 6 | import hu.bme.bigdata.homework.mapreduce.flight2.reducers.OriginCounterReducer; 7 | 8 | import java.io.File; 9 | 10 | import org.apache.hadoop.conf.Configured; 11 | import org.apache.hadoop.fs.Path; 12 | import org.apache.hadoop.io.IntWritable; 13 | import org.apache.hadoop.io.MapWritable; 14 | import org.apache.hadoop.io.Text; 15 | import org.apache.hadoop.mapreduce.Job; 16 | import org.apache.hadoop.mapreduce.lib.input.FileInputFormat; 17 | import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat; 18 | import org.apache.hadoop.util.Tool; 19 | 20 | public class JobRunner extends Configured implements Tool { 21 | 22 | @Override 23 | public int run(String[] args) throws Exception { 24 | String partialPath = args[1] + File.separator + "partial"; 25 | 26 | Job firstJob = Job.getInstance(getConf(), "Flight-2 - 1st phase"); 27 | firstJob.setJarByClass(this.getClass()); 28 | 29 | FileInputFormat.addInputPath(firstJob, new Path(args[0])); 30 | FileOutputFormat.setOutputPath(firstJob, new Path(partialPath)); 31 | 32 | firstJob.setMapperClass(RawDataToOriginMapper.class); 33 | firstJob.setReducerClass(OriginCounterReducer.class); 34 | 35 | firstJob.setOutputKeyClass(Text.class); 36 | firstJob.setOutputValueClass(IntWritable.class); 37 | 38 | firstJob.waitForCompletion(true); 39 | 40 | Job secondJob = Job.getInstance(getConf(), "Flight-2 - 2nd phase"); 41 | secondJob.setJarByClass(this.getClass()); 42 | 43 | FileInputFormat.addInputPath(secondJob, new Path(partialPath 44 | + File.separator + "part-r-00000")); 45 | FileOutputFormat 46 | .setOutputPath(secondJob, new Path(args[1] + "-result")); 47 | 48 | secondJob.setMapperClass(RowToPairMapper.class); 49 | secondJob.setMapOutputValueClass(MapWritable.class); 50 | secondJob.setReducerClass(MaxOccurenceReducer.class); 51 | 52 | secondJob.setOutputKeyClass(Text.class); 53 | secondJob.setOutputValueClass(IntWritable.class); 54 | 55 | int completionCode = secondJob.waitForCompletion(true) ? 
0 : 1;
56 | 
57 | return completionCode;
58 | }
59 | }
60 | 
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.mapreduce.flight2/src/hu/bme/bigdata/homework/mapreduce/flight2/MapReduceApplication.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.mapreduce.flight2;
2 | 
3 | import org.apache.hadoop.util.ToolRunner;
4 | 
5 | public class MapReduceApplication {
6 | 
7 | public static void main(String[] args) throws Exception {
8 | int res = ToolRunner.run(new JobRunner(), args);
9 | System.exit(res);
10 | }
11 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.mapreduce.flight2/src/hu/bme/bigdata/homework/mapreduce/flight2/mappers/RawDataToOriginMapper.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.mapreduce.flight2.mappers;
2 | 
3 | import java.io.IOException;
4 | 
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.LongWritable;
7 | import org.apache.hadoop.io.Text;
8 | import org.apache.hadoop.mapreduce.Mapper;
9 | 
10 | public class RawDataToOriginMapper extends
11 | Mapper {
12 | 
13 | private final static IntWritable one = new IntWritable(1);
14 | 
15 | public void map(LongWritable offset, Text lineText, Context context)
16 | throws IOException, InterruptedException {
17 | String[] splitted = lineText.toString().split(",");
18 | String cancelled = splitted[21]; // column 21 = Cancelled (0-based), per the dataset's published column order
19 | String origin = splitted[16]; // column 16 = Origin airport code
20 | 
21 | if ("0".equals(cancelled)) {
22 | Text originText = new Text(origin);
23 | context.write(originText, one);
24 | }
25 | }
26 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.mapreduce.flight2/src/hu/bme/bigdata/homework/mapreduce/flight2/mappers/RowToPairMapper.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.mapreduce.flight2.mappers;
2 | 
3 | import java.io.IOException;
4 | 
5 | import org.apache.hadoop.io.IntWritable;
6 | import org.apache.hadoop.io.LongWritable;
7 | import org.apache.hadoop.io.MapWritable;
8 | import org.apache.hadoop.io.Text;
9 | import org.apache.hadoop.mapreduce.Mapper;
10 | 
11 | public class RowToPairMapper extends
12 | Mapper {
13 | 
14 | public static final Text occurenceKey = new Text("occurence");
15 | public static final Text originKey = new Text("origin");
16 | 
17 | private static final Text max = new Text("Max");
18 | 
19 | public void map(LongWritable offset, Text lineText, Context context)
20 | throws IOException, InterruptedException {
// the first job's TextOutputFormat separates its key and value with a tab by default, so split on any whitespace rather than a single space
21 | String[] splitted = lineText.toString().split("\\s+");
22 | 
23 | String origin = splitted[0];
24 | Integer occurence = Integer.parseInt(splitted[1]);
25 | 
26 | MapWritable occurenceMap = new MapWritable();
27 | occurenceMap.put(originKey, new Text(origin));
28 | occurenceMap.put(occurenceKey, new IntWritable(occurence));
29 | 
30 | context.write(max, occurenceMap);
31 | }
32 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.mapreduce.flight2/src/hu/bme/bigdata/homework/mapreduce/flight2/reducers/MaxOccurenceReducer.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.mapreduce.flight2.reducers;
2 | 
3 | import static 
hu.bme.bigdata.homework.mapreduce.flight2.mappers.RowToPairMapper.occurenceKey; 4 | import static hu.bme.bigdata.homework.mapreduce.flight2.mappers.RowToPairMapper.originKey; 5 | 6 | import java.io.IOException; 7 | 8 | import org.apache.hadoop.io.IntWritable; 9 | import org.apache.hadoop.io.MapWritable; 10 | import org.apache.hadoop.io.Text; 11 | import org.apache.hadoop.mapreduce.Reducer; 12 | 13 | public class MaxOccurenceReducer extends 14 | Reducer { 15 | 16 | @Override 17 | public void reduce(Text word, Iterable records, Context context) 18 | throws IOException, InterruptedException { 19 | int maxOccurences = -1; 20 | String place = ""; 21 | 22 | for (MapWritable occurenceMap : records) { 23 | int occurences = ((IntWritable) occurenceMap.get(occurenceKey)) 24 | .get(); 25 | if (occurences > maxOccurences) { 26 | maxOccurences = occurences; 27 | place = ((Text) occurenceMap.get(originKey)).toString(); 28 | } 29 | } 30 | 31 | System.out.println("Most of the airplanes took off from " + place 32 | + " (" + maxOccurences + " times)."); 33 | 34 | Text maxPlaceName = new Text(place); 35 | IntWritable maxPlaceOccurences = new IntWritable(maxOccurences); 36 | 37 | context.write(maxPlaceName, maxPlaceOccurences); 38 | } 39 | } 40 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.mapreduce.flight2/src/hu/bme/bigdata/homework/mapreduce/flight2/reducers/OriginCounterReducer.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.mapreduce.flight2.reducers; 2 | 3 | import java.io.IOException; 4 | 5 | import org.apache.hadoop.io.IntWritable; 6 | import org.apache.hadoop.io.Text; 7 | import org.apache.hadoop.mapreduce.Reducer; 8 | 9 | public class OriginCounterReducer extends 10 | Reducer { 11 | @Override 12 | public void reduce(Text origin, Iterable counts, 13 | Context context) throws IOException, InterruptedException { 14 | int sum = 0; 15 | for (IntWritable count : counts) { 16 | sum += count.get(); 17 | } 18 | context.write(origin, new IntWritable(sum)); 19 | } 20 | } -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | hu.bme.bigdata.homework.spark.flight2 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | 
org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.7 13 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | hu.bme.bigdata.homework 5 | spark.flight2 6 | 0.0.1-SNAPSHOT 7 | 8 | 9 | 10 | 11 | com.fasterxml.jackson.core 12 | jackson-core 13 | 2.13.0 14 | 15 | 16 | org.apache.hadoop 17 | hadoop-client 18 | 2.10.0 19 | 20 | 21 | com.fasterxml.jackson.core 22 | jackson-databind 23 | 2.13.4.2 24 | 25 | 26 | commons-beanutils 27 | commons-beanutils 28 | 1.11.0 29 | 30 | 31 | io.netty 32 | netty-codec 33 | 4.1.68.Final 34 | 35 | 36 | io.netty 37 | netty-handler 38 | 4.1.118.Final 39 | 40 | 41 | xerces 42 | xercesImpl 43 | 2.12.2 44 | 45 | 46 | org.apache.hadoop 47 | hadoop-hdfs 48 | 2.9.2 49 | 50 | 51 | org.apache.hadoop 52 | hadoop-yarn-common 53 | 2.9.2 54 | 55 | 56 | org.apache.hadoop 57 | hadoop-mapreduce-client-core 58 | 2.9.2 59 | 60 | 61 | org.apache.hadoop 62 | hadoop-mapreduce-client-common 63 | 2.9.2 64 | 65 | 66 | org.apache.hadoop 67 | hadoop-mapreduce-client-app 68 | 2.9.2 69 | 70 | 71 | jline 72 | jline 73 | 2.14.6 74 | 75 | 76 | org.apache.httpcomponents 77 | httpclient 78 | 4.5.13 79 | 80 | 81 | org.apache.commons 82 | commons-compress 83 | 1.26.0 84 | 85 | 86 | com.google.guava 87 | guava 88 | 32.1.2-jre 89 | 90 | 91 | org.tukaani 92 | xz 93 | 1.8 94 | 95 | 96 | com.google.protobuf 97 | protobuf-java 98 | 3.25.5 99 | 100 | 101 | org.lz4 102 | lz4-java 103 | 1.6.0 104 | 105 | 106 | org.apache.avro 107 | avro 108 | 1.11.4 109 | 110 | 111 | io.netty 112 | netty-all 113 | 4.1.44.Final 114 | 115 | 116 | com.nimbusds 117 | nimbus-jose-jwt 118 | 9.37.2 119 | 120 | 121 | commons-codec 122 | commons-codec 123 | 1.14 124 | 125 | 126 | org.glassfish.jersey.media 127 | jersey-media-jaxb 128 | 2.31 129 | 130 | 131 | 132 | 133 | 134 | org.apache.spark 135 | spark-core_2.12 136 | 3.5.4 137 | 138 | 139 | net.sf.jopt-simple 140 | jopt-simple 141 | 4.9 142 | 143 | 144 | org.apache.zookeeper 145 | zookeeper 146 | 3.9.3 147 | 148 | 149 | 150 | 151 | 152 | org.apache.maven.plugins 153 | maven-dependency-plugin 154 | 2.9 155 | 156 | 157 | copy-dependencies 158 | package 159 | 160 | copy-dependencies 161 | 162 | 163 | ${project.build.directory}/lib 164 | false 165 | false 166 | true 167 | 168 | 169 | 170 | 171 | 172 | org.apache.maven.plugins 173 | maven-jar-plugin 174 | 2.4 175 | 176 | 177 | 178 | true 179 | lib/ 180 | hu.bme.bigdata.homework.spark.flight2.application.SparkApplication 181 | false 182 | 183 | 184 | 185 | 186 | 187 | org.apache.maven.plugins 188 | maven-compiler-plugin 189 | 3.1 190 | 191 | 1.7 192 | 1.7 193 | 194 | 195 | 196 | 197 | 198 | 
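(Note: the maven-dependency-plugin configuration above copies every runtime dependency into target/lib, and the jar plugin writes a manifest whose Class-Path points at lib/ with SparkApplication as the main class; this is what makes a plain java -jar invocation work.)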
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.spark.flight2/src/main/java/hu/bme/bigdata/homework/spark/flight2/application/SparkApplication.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.spark.flight2.application;
2 | 
3 | import hu.bme.bigdata.homework.spark.flight2.application.jopt.ParameterManager;
4 | 
5 | import org.apache.log4j.Logger;
6 | import org.apache.spark.SparkConf;
7 | import org.apache.spark.api.java.JavaSparkContext;
8 | 
9 | public class SparkApplication {
10 | 
11 | /**
12 | *
13 | * @param args
14 | * --home <Spark home directory>
15 | * --data <flight data CSV path>
16 | * --partitions <# PARTITIONS>
17 | */
18 | public static final void main(String[] args) {
19 | try {
20 | ParameterManager parameters = new ParameterManager();
21 | parameters.storeParameters(args, System.err);
22 | 
23 | JavaSparkContext ctx = createJavaSparkContext(parameters);
24 | TransformationManager tm = new TransformationManager();
25 | 
26 | tm.calculateMaxTakeOff(ctx, parameters.getFlightData(), parameters.getPartitions());
27 | ctx.stop();
28 | } catch (Exception ex) {
29 | Logger.getLogger(SparkApplication.class.getName()).error(
30 | "Error while calculating the maximum number of take-offs: " + ex.getMessage());
31 | }
32 | }
33 | 
34 | private static JavaSparkContext createJavaSparkContext(ParameterManager parameters) {
35 | SparkConf sparkConf = new SparkConf().setAppName("Flight-2 TakeOffs");
36 | sparkConf.setSparkHome(parameters.getSparkHome());
37 | sparkConf.setMaster("local[" + parameters.getPartitions() + "]");
38 | 
39 | return new JavaSparkContext(sparkConf);
40 | }
41 | 
42 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.spark.flight2/src/main/java/hu/bme/bigdata/homework/spark/flight2/application/TransformationManager.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.spark.flight2.application;
2 | 
3 | import hu.bme.bigdata.homework.spark.flight2.transformation.OccurenceSummarizer;
4 | import hu.bme.bigdata.homework.spark.flight2.transformation.RawToPairTransformer;
5 | import hu.bme.bigdata.homework.spark.flight2.transformation.ValidRecordFilter;
6 | import hu.bme.bigdata.homework.spark.flight2.utility.OriginTupleComparator;
7 | 
8 | import org.apache.spark.api.java.JavaPairRDD;
9 | import org.apache.spark.api.java.JavaRDD;
10 | import org.apache.spark.api.java.JavaRDDLike;
11 | import org.apache.spark.api.java.JavaSparkContext;
12 | 
13 | import scala.Tuple2;
14 | 
15 | public class TransformationManager {
16 | 
17 | public void calculateMaxTakeOff(JavaSparkContext ctx, String flightData, int partitions) {
18 | // transformers initialization
19 | ValidRecordFilter validRecordFilter = new ValidRecordFilter();
20 | RawToPairTransformer rawToPairTransformer = new RawToPairTransformer();
21 | OccurenceSummarizer occurenceSummarizer = new OccurenceSummarizer();
22 | OriginTupleComparator originTupleComparator = new OriginTupleComparator();
23 | 
24 | // RDD transformations
25 | JavaRDD lines = ctx.textFile(flightData, partitions);
26 | JavaRDD validRecords = lines.filter(validRecordFilter);
27 | JavaPairRDD flightOrigins = validRecords.mapToPair(rawToPairTransformer);
28 | JavaPairRDD occurencesByKey = flightOrigins.reduceByKey(occurenceSummarizer);
29 | // workaround, otherwise the .max function could not be invoked (SPARK-3266)
30 | Tuple2 max = ((JavaRDDLike, ?>) 
occurencesByKey) 31 | .max(originTupleComparator); 32 | 33 | // print result 34 | System.out.println("\nMost of the airplanes took off from " + max._1 + ", " + max._2 + " times."); 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/src/main/java/hu/bme/bigdata/homework/spark/flight2/application/jopt/ParameterManager.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight2.application.jopt; 2 | 3 | import java.io.IOException; 4 | import java.io.PrintStream; 5 | 6 | import joptsimple.ArgumentAcceptingOptionSpec; 7 | import joptsimple.OptionParser; 8 | import joptsimple.OptionSet; 9 | 10 | public class ParameterManager { 11 | 12 | private String sparkHome; 13 | private String flightData; 14 | private int partitions; 15 | 16 | public void storeParameters(String[] args, PrintStream stdOut) throws IOException { 17 | OptionParser parser = new OptionParser(); 18 | 19 | ArgumentAcceptingOptionSpec sparkHomeArg = parser.accepts("home", "Spark home directory [mandatory]") 20 | .withRequiredArg().ofType(String.class); 21 | ArgumentAcceptingOptionSpec flightDataArg = parser 22 | .accepts("data", "Flight data CSV location [mandatory]").withRequiredArg().ofType(String.class); 23 | ArgumentAcceptingOptionSpec partitionsArg = parser 24 | .accepts("partitions", "Number of partitions [mandatory]").withRequiredArg().ofType(Integer.class); 25 | 26 | OptionSet parsed = parser.parse(args); 27 | 28 | if (!parsed.has(sparkHomeArg) || !parsed.has(flightDataArg) || !parsed.has(partitionsArg)) { 29 | parser.printHelpOn(stdOut); 30 | } 31 | 32 | sparkHome = parsed.valueOf(sparkHomeArg); 33 | flightData = parsed.valueOf(flightDataArg); 34 | partitions = parsed.valueOf(partitionsArg).intValue(); 35 | } 36 | 37 | public String getSparkHome() { 38 | return sparkHome; 39 | } 40 | 41 | public String getFlightData() { 42 | return flightData; 43 | } 44 | 45 | public int getPartitions() { 46 | return partitions; 47 | } 48 | 49 | 50 | 51 | } 52 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/src/main/java/hu/bme/bigdata/homework/spark/flight2/transformation/OccurenceSummarizer.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight2.transformation; 2 | 3 | import org.apache.spark.api.java.function.Function2; 4 | 5 | public class OccurenceSummarizer implements Function2 { 6 | 7 | private static final long serialVersionUID = -4541162907421330727L; 8 | 9 | @Override 10 | public Integer call(Integer accumulator, Integer value) throws Exception { 11 | return accumulator + value; 12 | } 13 | 14 | } 15 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/src/main/java/hu/bme/bigdata/homework/spark/flight2/transformation/RawToPairTransformer.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight2.transformation; 2 | 3 | import org.apache.spark.api.java.function.PairFunction; 4 | 5 | import scala.Tuple2; 6 | 7 | public class RawToPairTransformer implements PairFunction { 8 | 9 | private static final long serialVersionUID = 399006031280178508L; 10 | 11 | @Override 12 | public Tuple2 call(String record) throws Exception { 13 | String[] splitted = record.split(","); 14 | 
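// index 16 is the Origin airport code (0-based), assuming the standard column order of the 2008 dataset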
String origin = splitted[16]; 15 | 16 | return new Tuple2<>(origin, Integer.valueOf(1)); 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/src/main/java/hu/bme/bigdata/homework/spark/flight2/transformation/ValidRecordFilter.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight2.transformation; 2 | 3 | import org.apache.spark.api.java.function.Function; 4 | 5 | public class ValidRecordFilter implements Function { 6 | 7 | private static final long serialVersionUID = 3508379993559478037L; 8 | 9 | public Boolean call(String line) throws Exception { 10 | String[] splitted = line.split(","); 11 | String cancelled = splitted[21]; 12 | 13 | return "0".equals(cancelled); 14 | } 15 | 16 | } 17 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/src/main/java/hu/bme/bigdata/homework/spark/flight2/utility/OriginTupleComparator.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight2.utility; 2 | 3 | import java.io.Serializable; 4 | import java.util.Comparator; 5 | 6 | import scala.Tuple2; 7 | 8 | public class OriginTupleComparator implements Comparator>, Serializable { 9 | 10 | private static final long serialVersionUID = -2644667108949856177L; 11 | 12 | @Override 13 | public int compare(Tuple2 firstTuple, Tuple2 secondTuple) { 14 | return firstTuple._2.compareTo(secondTuple._2); 15 | } 16 | 17 | } 18 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight2/src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Root logger option 2 | log4j.rootLogger=WARN, stdout 3 | 4 | # Direct log messages to stdout 5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender 6 | log4j.appender.stdout.Target=System.err 7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout 8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | 23 | 24 | 25 | 26 | 27 | 28 | 29 | 30 | 31 | 32 | 33 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | hu.bme.bigdata.homework.spark.flight3 4 | 5 | 6 | 7 | 8 | 9 | org.eclipse.jdt.core.javabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.jdt.core.javanature 21 | org.eclipse.m2e.core.maven2Nature 22 | 23 | 24 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.7 4 | 
org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.7 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.7 13 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | hu.bme.bigdata.homework 5 | spark.flight3 6 | 0.0.1-SNAPSHOT 7 | 8 | 9 | 10 | 11 | com.fasterxml.jackson.core 12 | jackson-core 13 | 2.9.8 14 | 15 | 16 | org.apache.hadoop 17 | hadoop-client 18 | 2.10.0 19 | 20 | 21 | com.fasterxml.jackson.core 22 | jackson-databind 23 | 2.13.4.2 24 | 25 | 26 | commons-beanutils 27 | commons-beanutils 28 | 1.11.0 29 | 30 | 31 | io.netty 32 | netty-codec 33 | 4.1.68.Final 34 | 35 | 36 | io.netty 37 | netty-handler 38 | 4.1.118.Final 39 | 40 | 41 | xerces 42 | xercesImpl 43 | 2.12.2 44 | 45 | 46 | org.apache.hadoop 47 | hadoop-hdfs 48 | 2.9.2 49 | 50 | 51 | org.apache.hadoop 52 | hadoop-yarn-common 53 | 2.9.2 54 | 55 | 56 | org.apache.hadoop 57 | hadoop-mapreduce-client-core 58 | 2.9.2 59 | 60 | 61 | org.apache.hadoop 62 | hadoop-mapreduce-client-common 63 | 2.9.2 64 | 65 | 66 | org.apache.hadoop 67 | hadoop-mapreduce-client-app 68 | 2.9.2 69 | 70 | 71 | jline 72 | jline 73 | 2.14.6 74 | 75 | 76 | org.apache.httpcomponents 77 | httpclient 78 | 4.5.13 79 | 80 | 81 | org.apache.commons 82 | commons-compress 83 | 1.26.0 84 | 85 | 86 | com.google.guava 87 | guava 88 | 32.1.2-jre 89 | 90 | 91 | org.tukaani 92 | xz 93 | 1.8 94 | 95 | 96 | com.google.protobuf 97 | protobuf-java 98 | 3.25.5 99 | 100 | 101 | org.lz4 102 | lz4-java 103 | 1.6.0 104 | 105 | 106 | org.apache.avro 107 | avro 108 | 1.11.4 109 | 110 | 111 | io.netty 112 | netty-all 113 | 4.1.44.Final 114 | 115 | 116 | com.nimbusds 117 | nimbus-jose-jwt 118 | 9.37.2 119 | 120 | 121 | commons-codec 122 | commons-codec 123 | 1.14 124 | 125 | 126 | org.glassfish.jersey.media 127 | jersey-media-jaxb 128 | 2.31 129 | 130 | 131 | 132 | 133 | 134 | org.apache.spark 135 | spark-core_2.12 136 | 3.5.4 137 | 138 | 139 | net.sf.jopt-simple 140 | jopt-simple 141 | 4.9 142 | 143 | 144 | org.apache.zookeeper 145 | zookeeper 146 | 3.9.3 147 | 148 | 149 | 150 | 151 | 152 | org.apache.maven.plugins 153 | maven-dependency-plugin 154 | 2.9 155 | 156 | 157 | copy-dependencies 158 | package 159 | 160 | copy-dependencies 161 | 162 | 163 | ${project.build.directory}/lib 164 | false 165 | false 166 | true 167 | 168 | 169 | 170 | 171 | 172 | org.apache.maven.plugins 173 | maven-jar-plugin 174 | 2.4 175 | 176 | 177 | 178 | true 179 | lib/ 180 | hu.bme.bigdata.homework.spark.flight3.application.SparkApplication 181 | false 182 | 183 | 184 | 185 | 186 | 187 | org.apache.maven.plugins 188 | maven-compiler-plugin 189 | 3.1 190 | 191 | 1.7 192 | 
1.7 193 | 194 | 195 | 196 | 197 | 198 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/application/SparkApplication.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.application; 2 | 3 | import hu.bme.bigdata.homework.spark.flight3.application.jopt.ParameterManager; 4 | 5 | import org.apache.log4j.Logger; 6 | import org.apache.spark.SparkConf; 7 | import org.apache.spark.api.java.JavaSparkContext; 8 | 9 | public class SparkApplication { 10 | 11 | /** 12 | * 13 | * @param args 14 | * --home 15 | * --data 16 | * --partitions <# PARTITIONS> 17 | */ 18 | public static final void main(String[] args) { 19 | try { 20 | ParameterManager parameters = new ParameterManager(); 21 | parameters.storeParameters(args, System.err); 22 | 23 | JavaSparkContext ctx = createJavaSparkContext(parameters); 24 | TransformationManager tm = new TransformationManager(); 25 | 26 | tm.calculateAvgDelays(ctx, parameters.getFlightData(), parameters.getPartitions()); 27 | ctx.stop(); 28 | } catch (Exception ex) { 29 | Logger.getLogger(SparkApplication.class.getName()).error( 30 | "Error while calculating the average of delays: " + ex.getMessage()); 31 | } 32 | } 33 | 34 | private static JavaSparkContext createJavaSparkContext(ParameterManager parameters) { 35 | SparkConf sparkConf = new SparkConf().setAppName("Flight-3 ArrDelays"); 36 | sparkConf.setSparkHome(parameters.getSparkHome()); 37 | sparkConf.setMaster("local[" + parameters.getPartitions() + "]"); 38 | 39 | return new JavaSparkContext(sparkConf); 40 | } 41 | 42 | } 43 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/application/TransformationManager.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.application; 2 | 3 | import hu.bme.bigdata.homework.spark.flight3.enums.Season; 4 | import hu.bme.bigdata.homework.spark.flight3.transformation.ArrDelayToIsDelayedTransformer; 5 | import hu.bme.bigdata.homework.spark.flight3.transformation.AverageCalculator; 6 | import hu.bme.bigdata.homework.spark.flight3.transformation.CancelledExcluder; 7 | import hu.bme.bigdata.homework.spark.flight3.transformation.DateToSeasonTransformer; 8 | import hu.bme.bigdata.homework.spark.flight3.transformation.RawToPairTransformer; 9 | import hu.bme.bigdata.homework.spark.flight3.transformation.SumCalculator; 10 | import hu.bme.bigdata.homework.spark.flight3.utility.SeasonTupleComparator; 11 | import hu.bme.bigdata.homework.spark.flight3.utility.SeasonTuplePrinter; 12 | 13 | import java.time.LocalDateTime; 14 | 15 | import org.apache.spark.api.java.JavaPairRDD; 16 | import org.apache.spark.api.java.JavaRDD; 17 | import org.apache.spark.api.java.JavaRDDLike; 18 | import org.apache.spark.api.java.JavaSparkContext; 19 | 20 | import scala.Tuple2; 21 | 22 | public class TransformationManager { 23 | 24 | public void calculateAvgDelays(JavaSparkContext ctx, String flightData, int partitions) { 25 | // transformers initialization 26 | CancelledExcluder cancelledExcluder = new CancelledExcluder(); 27 | RawToPairTransformer rawToPairTransformer = new RawToPairTransformer(); 28 | DateToSeasonTransformer dateToSeasonTransformer = new DateToSeasonTransformer(); 29 | 
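// pipeline below: drop cancelled flights -> (date, arrDelay) pairs -> (season, arrDelay) -> (season, 0/1 delayed) -> per-season (sum, count) -> per-season delay ratio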
ArrDelayToIsDelayedTransformer arrDelayToIsDelayedTransformer = new ArrDelayToIsDelayedTransformer();
30 | 
31 | SumCalculator sumCalculator = new SumCalculator();
32 | SumCalculator.InPartitionSumCalculator inpartitionSummarize = sumCalculator.new InPartitionSumCalculator();
33 | SumCalculator.CrossPartitionSumCalculator crosspartitionSummarize = sumCalculator.new CrossPartitionSumCalculator();
34 | 
35 | AverageCalculator avgCalculator = new AverageCalculator();
36 | 
37 | // RDD transformations
38 | JavaRDD lines = ctx.textFile(flightData, partitions);
39 | JavaRDD preparedInput = lines.filter(cancelledExcluder);
40 | JavaPairRDD dates = preparedInput.mapToPair(rawToPairTransformer);
41 | JavaPairRDD seasonedDates = dates.mapToPair(dateToSeasonTransformer);
42 | JavaPairRDD seasonedIsDelayed = seasonedDates.mapToPair(arrDelayToIsDelayedTransformer);
43 | JavaPairRDD> summarizedBySeason = seasonedIsDelayed.aggregateByKey(
44 | sumCalculator.initialValue(), inpartitionSummarize, crosspartitionSummarize);
45 | JavaPairRDD averageBySeason = summarizedBySeason.mapValues(avgCalculator);
46 | 
47 | // print the result
48 | printResult(summarizedBySeason, averageBySeason);
49 | }
50 | 
51 | private void printResult(JavaPairRDD> summarizedBySeason,
52 | JavaPairRDD averageBySeason) {
53 | SeasonTuplePrinter printer = new SeasonTuplePrinter();
54 | summarizedBySeason.foreach(printer);
55 | 
56 | SeasonTupleComparator seasonTupleComparator = new SeasonTupleComparator();
57 | 
58 | // workaround, otherwise the .max function could not be invoked (SPARK-3266)
59 | Tuple2 max = ((JavaRDDLike, ?>) averageBySeason)
60 | .max(seasonTupleComparator);
61 | 
62 | // workaround, otherwise the .min function could not be invoked (SPARK-3266)
63 | Tuple2 min = ((JavaRDDLike, ?>) averageBySeason)
64 | .min(seasonTupleComparator);
65 | 
66 | double percentagePoints = Math.round((max._2 - min._2) * 1000.0) / 10.0; // delay-ratio difference expressed in percentage points, one decimal
67 | System.out.println();
68 | System.out.println("On average, " + percentagePoints + " percentage points more of the planes are delayed in " + max._1
69 | + " than in " + min._1 + ".");
70 | }
71 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/application/jopt/ParameterManager.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.spark.flight3.application.jopt;
2 | 
3 | import java.io.IOException;
4 | import java.io.PrintStream;
5 | 
6 | import joptsimple.ArgumentAcceptingOptionSpec;
7 | import joptsimple.OptionParser;
8 | import joptsimple.OptionSet;
9 | 
10 | public class ParameterManager {
11 | 
12 | private String sparkHome;
13 | private String flightData;
14 | private int partitions;
15 | 
16 | public void storeParameters(String[] args, PrintStream stdOut) throws IOException {
17 | OptionParser parser = new OptionParser();
18 | 
19 | ArgumentAcceptingOptionSpec sparkHomeArg = parser.accepts("home", "Spark home directory [mandatory]")
20 | .withRequiredArg().ofType(String.class);
21 | ArgumentAcceptingOptionSpec flightDataArg = parser
22 | .accepts("data", "Flight data CSV location [mandatory]").withRequiredArg().ofType(String.class);
23 | ArgumentAcceptingOptionSpec partitionsArg = parser
24 | .accepts("partitions", "Number of partitions [mandatory]").withRequiredArg().ofType(Integer.class);
25 | 
26 | OptionSet parsed = parser.parse(args);
27 | 
28 | if (!parsed.has(sparkHomeArg) || !parsed.has(flightDataArg) || !parsed.has(partitionsArg)) {
29 | 
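// NOTE: this prints the usage text but does not abort; if a mandatory argument is missing, the valueOf calls below return null and partitions.intValue() throws a NullPointerException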
parser.printHelpOn(stdOut); 30 | } 31 | 32 | sparkHome = parsed.valueOf(sparkHomeArg); 33 | flightData = parsed.valueOf(flightDataArg); 34 | partitions = parsed.valueOf(partitionsArg).intValue(); 35 | } 36 | 37 | public String getSparkHome() { 38 | return sparkHome; 39 | } 40 | 41 | public String getFlightData() { 42 | return flightData; 43 | } 44 | 45 | public int getPartitions() { 46 | return partitions; 47 | } 48 | 49 | 50 | 51 | } 52 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/enums/Season.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.enums; 2 | 3 | public enum Season { 4 | 5 | WINTER, SUMMER; 6 | 7 | } 8 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/transformation/ArrDelayToIsDelayedTransformer.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.transformation; 2 | 3 | import hu.bme.bigdata.homework.spark.flight3.enums.Season; 4 | 5 | import org.apache.spark.api.java.function.PairFunction; 6 | 7 | import scala.Tuple2; 8 | 9 | public class ArrDelayToIsDelayedTransformer implements PairFunction, Season, Integer> { 10 | 11 | private static final long serialVersionUID = 1338211487290991471L; 12 | 13 | @Override 14 | public Tuple2 call(Tuple2 record) throws Exception { 15 | int arrDelay = record._2.intValue(); 16 | int isDelayed = (arrDelay > 0) ? 1 : 0; 17 | 18 | return new Tuple2<>(record._1, isDelayed); 19 | } 20 | 21 | } 22 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/transformation/AverageCalculator.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.transformation; 2 | 3 | import org.apache.spark.api.java.function.Function; 4 | 5 | import scala.Tuple2; 6 | 7 | public class AverageCalculator implements Function, Double> { 8 | 9 | private static final long serialVersionUID = -3602170028043630989L; 10 | 11 | @Override 12 | public Double call(Tuple2 record) throws Exception { 13 | Double sum = Double.valueOf(record._1); 14 | Integer count = record._2; 15 | 16 | return sum / count; 17 | } 18 | 19 | } 20 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/transformation/CancelledExcluder.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.transformation; 2 | 3 | import org.apache.spark.api.java.function.Function; 4 | 5 | public class CancelledExcluder implements Function { 6 | 7 | private static final long serialVersionUID = -4513475604270181839L; 8 | 9 | @Override 10 | public Boolean call(String line) throws Exception { 11 | String[] splitted = line.split(","); 12 | String cancelled = splitted[21]; 13 | 14 | return "0".equals(cancelled); 15 | } 16 | } 17 | -------------------------------------------------------------------------------- 
/hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/transformation/DateToSeasonTransformer.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.transformation; 2 | 3 | import hu.bme.bigdata.homework.spark.flight3.enums.Season; 4 | 5 | import java.time.LocalDateTime; 6 | import java.time.Month; 7 | 8 | import org.apache.spark.api.java.function.PairFunction; 9 | 10 | import scala.Tuple2; 11 | 12 | public class DateToSeasonTransformer implements PairFunction, Season, Integer> { 13 | 14 | private static final long serialVersionUID = -8940629807933769588L; 15 | 16 | @Override 17 | public Tuple2 call(Tuple2 tuple) throws Exception { 18 | Season season = getSeasonFromDate(tuple._1); 19 | return new Tuple2<>(season, tuple._2); 20 | } 21 | 22 | private Season getSeasonFromDate(LocalDateTime date) { 23 | Month month = date.getMonth(); 24 | int dayOfMonth = date.getDayOfMonth(); 25 | 26 | Season season; 27 | switch (month) { 28 | case NOVEMBER: 29 | season = Season.WINTER; 30 | break; 31 | case DECEMBER: 32 | season = Season.WINTER; 33 | break; 34 | case JANUARY: 35 | season = Season.WINTER; 36 | break; 37 | case FEBRUARY: 38 | season = Season.WINTER; 39 | break; 40 | case MARCH: 41 | if (dayOfMonth <= 7) { 42 | season = Season.WINTER; 43 | } else { 44 | season = Season.SUMMER; 45 | } 46 | break; 47 | default: 48 | season = Season.SUMMER; 49 | break; 50 | } 51 | 52 | return season; 53 | } 54 | 55 | } 56 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/transformation/RawToPairTransformer.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.transformation; 2 | 3 | import java.time.LocalDateTime; 4 | import java.time.Month; 5 | 6 | import org.apache.spark.api.java.function.PairFunction; 7 | 8 | import scala.Tuple2; 9 | 10 | public class RawToPairTransformer implements PairFunction { 11 | 12 | private static final long serialVersionUID = 1592716716673893751L; 13 | 14 | @Override 15 | public Tuple2 call(String line) throws Exception { 16 | String[] splitted = line.split(","); 17 | String arrDelayString = splitted[14]; 18 | String yearString = splitted[0]; 19 | String monthString = splitted[1]; 20 | String dayString = splitted[2]; 21 | 22 | Integer arrDelay = 0; 23 | try { 24 | arrDelay = Integer.valueOf(arrDelayString); 25 | } catch (NumberFormatException ex) { 26 | } 27 | 28 | Integer month = 5; 29 | try { 30 | month = Integer.valueOf(monthString); 31 | } catch (NumberFormatException ex) { 32 | } 33 | 34 | Integer day = 1; 35 | try { 36 | day = Integer.valueOf(dayString); 37 | } catch (NumberFormatException ex) { 38 | } 39 | 40 | Integer year = 0; 41 | try { 42 | year = Integer.valueOf(yearString); 43 | } catch (NumberFormatException ex) { 44 | } 45 | 46 | LocalDateTime time = LocalDateTime.of(year, Month.of(month), day, 0, 0); 47 | 48 | return new Tuple2<>(time, arrDelay); 49 | } 50 | 51 | } 52 | -------------------------------------------------------------------------------- /hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/transformation/SumCalculator.java: -------------------------------------------------------------------------------- 1 | package hu.bme.bigdata.homework.spark.flight3.transformation; 2 | 3 | import 
java.io.Serializable;
4 | 
5 | import org.apache.spark.api.java.function.Function2;
6 | 
7 | import scala.Tuple2;
8 | 
9 | public class SumCalculator implements Serializable {
10 | 
11 | private static final long serialVersionUID = -5458753512529207349L;
12 | 
13 | public class InPartitionSumCalculator implements Function2, Integer, Tuple2> {
14 | 
15 | private static final long serialVersionUID = -8607578313978918953L;
16 | 
17 | /**
18 | * Summarizes the values in-partition.
19 | *
20 | * @param accumulator
21 | * ._1 stores the sum
22 | * @param accumulator
23 | * ._2 stores the count
24 | * @param isDelayed
25 | * is the next value in the sum
26 | */
27 | @Override
28 | public Tuple2 call(Tuple2 accumulator, Integer isDelayed) throws Exception {
29 | Long sum = accumulator._1 + isDelayed;
30 | Integer count = accumulator._2 + 1;
31 | return new Tuple2<>(sum, count);
32 | }
33 | }
34 | 
35 | public class CrossPartitionSumCalculator implements
36 | Function2, Tuple2, Tuple2> {
37 | 
38 | private static final long serialVersionUID = 2198752266676874321L;
39 | 
40 | /**
41 | * Summarizes the values cross-partitions.
42 | *
43 | * @param accumulator
44 | * ._1 stores the sum (cross-partition)
45 | * @param accumulator
46 | * ._2 stores the count (cross-partition)
47 | * @param nextPartition
48 | * ._1 stores the next partition's sum
49 | * @param nextPartition
50 | * ._2 stores the next partition's count
51 | */
52 | @Override
53 | public Tuple2 call(Tuple2 accumulator, Tuple2 nextPartition)
54 | throws Exception {
55 | Long crossSum = accumulator._1 + nextPartition._1;
56 | Integer crossCount = accumulator._2 + nextPartition._2;
57 | return new Tuple2<>(crossSum, crossCount);
58 | }
59 | }
60 | 
61 | /**
62 | * @return initial value of the aggregation
63 | */
64 | public Tuple2 initialValue() {
65 | return new Tuple2<>(Long.valueOf(0), Integer.valueOf(0));
66 | }
67 | 
68 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/utility/SeasonTupleComparator.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.spark.flight3.utility;
2 | 
3 | import hu.bme.bigdata.homework.spark.flight3.enums.Season;
4 | 
5 | import java.io.Serializable;
6 | import java.util.Comparator;
7 | 
8 | import scala.Tuple2;
9 | 
10 | public class SeasonTupleComparator implements Comparator>, Serializable {
11 | 
12 | private static final long serialVersionUID = 13826599364549827L;
13 | 
14 | @Override
15 | public int compare(Tuple2 firstTuple, Tuple2 secondTuple) {
16 | return firstTuple._2.compareTo(secondTuple._2);
17 | }
18 | 
19 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.spark.flight3/src/main/java/hu/bme/bigdata/homework/spark/flight3/utility/SeasonTuplePrinter.java:
--------------------------------------------------------------------------------
1 | package hu.bme.bigdata.homework.spark.flight3.utility;
2 | 
3 | import hu.bme.bigdata.homework.spark.flight3.enums.Season;
4 | 
5 | import org.apache.spark.api.java.function.VoidFunction;
6 | 
7 | import scala.Tuple2;
8 | 
9 | public class SeasonTuplePrinter implements VoidFunction>> {
10 | 
11 | private static final long serialVersionUID = -3624526999361038180L;
12 | 
13 | private static boolean isHeaderPrinted = false;
14 | 
15 | @Override
16 | public void call(Tuple2> tuple) throws Exception {
17 | if (!isHeaderPrinted) {
18 | 
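// the header flag is a static, JVM-local field: it works in local mode, but on a real cluster each executor would print its own header line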
System.out.println();
19 | System.out.println("SEASON" + '\t' + "NUMBER OF DELAYS" + '\t' + "NUMBER OF RECORDS"); // assumed header labels, mirroring the README's answer table
20 | isHeaderPrinted = true;
21 | }
22 | System.out.println(tuple._1.toString() + '\t' + tuple._2._1 + '\t' + tuple._2._2);
23 | }
24 | 
25 | }
--------------------------------------------------------------------------------
/hu.bme.bigdata.homework.spark.flight3/src/main/resources/log4j.properties:
--------------------------------------------------------------------------------
1 | # Root logger option
2 | log4j.rootLogger=WARN, stdout
3 | 
4 | # Direct log messages to stdout
5 | log4j.appender.stdout=org.apache.log4j.ConsoleAppender
6 | log4j.appender.stdout.Target=System.err
7 | log4j.appender.stdout.layout=org.apache.log4j.PatternLayout
8 | log4j.appender.stdout.layout.ConversionPattern=%d{yyyy-MM-dd HH:mm:ss} %-5p %c{1}:%L - %m%n
--------------------------------------------------------------------------------
/images/benchmark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/benedekh/bigdata-projects/8c50147e984930665e475dc5da1ae4e5e1c43dc6/images/benchmark.png
--------------------------------------------------------------------------------