├── src ├── main │ ├── scala │ │ └── org │ │ │ └── anish │ │ │ └── spark │ │ │ └── etl │ │ │ ├── hive │ │ │ ├── Constants.scala │ │ │ ├── HiveSetup.scala │ │ │ ├── DemoRunner.scala │ │ │ └── LoadToHive.scala │ │ │ └── ProcessData.scala │ └── resources │ │ └── log4j.properties └── test │ ├── resources │ ├── expectedOutputs │ │ └── cleanedEmails │ │ │ └── correctEmailIds.csv │ ├── log4j.properties │ └── input_data │ │ └── testData.csv │ └── scala │ └── org │ └── anish │ └── spark │ ├── SparkTestUtils.scala │ └── etl │ └── ProcessDataTest.scala ├── .gitignore ├── README.md └── pom.xml /src/main/scala/org/anish/spark/etl/hive/Constants.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | /** 4 | * Contains configs requred by the Hive component. 5 | * 6 | * Created by anish on 24/01/17. 7 | */ 8 | object Constants { 9 | val pathOfAlreadyExistingData = "data/alreadyExistingData" 10 | val pathOfIncrementalData = "data/newIncrement" 11 | val hiveDatabaseName = "default" 12 | val hiveTableName = "member_details" 13 | val hiveWareHouseLocation = System.getProperty("user.dir") + "/warehouse/member_details/" 14 | } 15 | -------------------------------------------------------------------------------- /src/test/resources/expectedOutputs/cleanedEmails/correctEmailIds.csv: -------------------------------------------------------------------------------- 1 | corrected_email 2 | apnok@wp.pl 3 | afael.rafa@hotmail.com 4 | nnenbba@outlook.com 5 | fhdfshdf@hotmail.com 6 | rcemola30@gmail.com 7 | rcemola30@gmail.com 8 | dam987_3@hotmail.com 9 | unman@hotmail.com 10 | anos-origi@hotmail.com 11 | a9os44@gmail.com 12 | otoskrotos@gmail.com 13 | deepza007@gmail.om 14 | afaelagbra@hotmail.com 15 | uy@gmail.com 16 | mtucu@gmail.com 17 | ptic_xenon@hotmail.com 18 | zan.22.ozt@gmail.com 19 | imothy001@icloud.com 20 | halajmi2@gmai.com 21 | askvirus@yahoo.com 22 | yphly@mail.com 23 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %C: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark=WARN 11 | log4j.logger.org.apache.spark.mllib=INFO 12 | log4j.logger.org.spark-project=WARN 13 | log4j.logger.org.spark-project.mllib=INFO 14 | log4j.logger.akka.event=WARN -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %C: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark=WARN 11 | 
log4j.logger.org.apache.spark.mllib=INFO 12 | log4j.logger.org.spark-project=WARN 13 | log4j.logger.org.spark-project.mllib=INFO 14 | log4j.logger.akka.event=WARN -------------------------------------------------------------------------------- /src/test/scala/org/anish/spark/SparkTestUtils.scala: -------------------------------------------------------------------------------- 1 | package org.anish.spark 2 | 3 | import org.apache.spark.sql.DataFrame 4 | import org.scalatest.Matchers 5 | 6 | /** 7 | * Created by anish on 24/01/17. 8 | */ 9 | object SparkTestUtils extends Matchers { 10 | 11 | /** 12 | * Gets absolute file path of a resource. 13 | * 14 | * @param pathInResource 15 | * @return actual path of file 16 | */ 17 | def getResourcePath(pathInResource: String): String = { 18 | getClass.getResource(pathInResource).getPath 19 | } 20 | 21 | /** 22 | * Compares two dataframes and ensures that they have the same schema (ignore nullable) and the same values 23 | * This collects both data frames in the driver, thus not suitable for very large test data. Good for unit testing. 24 | * 25 | * @param actualDF The DF we want to check for correctness 26 | * @param expectedDF The correct DF we use for comparison 27 | * @param onlySchema only compare the schemas of the dataframes 28 | */ 29 | def dfEquals(actualDF: DataFrame, expectedDF: DataFrame, onlySchema: Boolean = false): Unit = { 30 | actualDF.schema.map(f => (f.name, f.dataType)).toSet shouldBe expectedDF.schema.map(f => (f.name, f.dataType)).toSet 31 | if (!onlySchema) { 32 | actualDF.collect.map(_.toSeq.toSet).toSet shouldBe expectedDF.collect.map(_.toSeq.toSet).toSet 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/hive/HiveSetup.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * This class creates the hive tables and loads some initial data to begin with. 7 | * 8 | * Created by anish on 24/01/17. 9 | */ 10 | object HiveSetup { 11 | 12 | /** 13 | * Create a table with already existing data. Increments that arrive should get added (updated) to this data. 14 | * This should be run once to setup the metastore 15 | * 16 | * @param spark the SparkSession object 17 | */ 18 | def loadAlreadyExistingData(spark: SparkSession): Unit = { 19 | val data = spark.read.option("header", "true").csv(Constants.pathOfAlreadyExistingData) 20 | val alreadyExistingData_df = data.toDF(data.columns.map(x => x.trim): _*) 21 | 22 | spark.catalog.setCurrentDatabase(Constants.hiveDatabaseName) 23 | 24 | 25 | spark.sql("CREATE EXTERNAL TABLE " + Constants.hiveDatabaseName + "." 
+ Constants.hiveTableName + 26 | "( member_id int" + 27 | ",name string" + 28 | ",email string" + 29 | ",joined long" + 30 | ",ip_address string" + 31 | ",posts int" + 32 | ",bday_day int" + 33 | ",bday_month int" + 34 | ",bday_year int" + 35 | ",members_profile_views int" + 36 | ",referred_by int" + 37 | " ) STORED AS AVRO" + 38 | " LOCATION '" + Constants.hiveWareHouseLocation + "'") 39 | 40 | alreadyExistingData_df 41 | .write 42 | .format("com.databricks.spark.avro") 43 | .mode("overwrite") 44 | .saveAsTable(Constants.hiveTableName) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/hive/DemoRunner.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | /** 7 | * This class orchestrates the Hive demo and calls various classes. 8 | * 9 | * Created by anish on 24/01/17. 10 | */ 11 | object DemoRunner { 12 | def main(args: Array[String]): Unit = { 13 | val spark = getSparkSession 14 | 15 | println("Setting up a Hive Metastore and load the data there.") 16 | HiveSetup.loadAlreadyExistingData(spark) 17 | 18 | // check for files loaded 19 | spark.catalog.setCurrentDatabase(Constants.hiveDatabaseName) 20 | val loaded_data = spark.table(Constants.hiveTableName) 21 | println("Loaded : " + loaded_data.count + " record(s)") 22 | // Initial files have been loaded 23 | 24 | 25 | // Load incremental data now 26 | println("Loading incremental data from " + Constants.pathOfIncrementalData) 27 | LoadToHive.loadIncrement(spark) 28 | val afterIncrementLoad_df = spark.table(Constants.hiveTableName) 29 | println("Increment load complete. 
Total " + afterIncrementLoad_df.count + " record(s)") 30 | } 31 | 32 | /** 33 | * Get the Spark Session 34 | * 35 | * @return SparkSession object 36 | */ 37 | def getSparkSession: SparkSession = { 38 | val sparkConf = new SparkConf 39 | sparkConf.set("spark.sql.crossJoin.enabled", "true") 40 | if (!sparkConf.contains("spark.master")) { 41 | sparkConf.setMaster("local[3]") 42 | } 43 | if (!sparkConf.contains("spark.app.name")) { 44 | sparkConf.setAppName("MastGlobalDataProcessing-" + getClass.getName) 45 | } 46 | SparkSession 47 | .builder() 48 | .config(sparkConf) 49 | .getOrCreate() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/resources/input_data/testData.csv: -------------------------------------------------------------------------------- 1 | member_id, name, email, joined, ip_address, posts, bday_day, bday_month, bday_year, members_profile_views, referred_by 2 | 598700,Wapno,apnok@wp.p@wp.pl,1462453725,67.71.23.95,233,0,0,0,0,0 3 | 598701,fatal,afael.rafa@hotmail.com,1462453780,187.180.176.123,1,0,0,0,0,0 4 | 598702,ulasd,nnenbba@ou@outlook.com,1462454015,67.71.23.113,3,0,0,0,0,0 5 | 598703,gdsdf,fhdfshdf@h@hotmail.com,1462454033,67.71.23.115,3,0,0,0,0,0 6 | 598882,es232,rcemola30@@gmail.com,1462468252,95.14.204.112,2,23,3,1997,0,0 7 | 980003,es232,rcemola30@@gmail.com,1462468252,67.71.23.112,2,24,3,1997,0,0 8 | 598883,pfhrt,dam987_3@h@hotmail.com,1462468259,67.71.23.103,2,0,0,0,0,0 9 | 598884,axisb,unman@hotm@hotmail.com,1462468298,67.71.23.78,1,0,0,0,0,0 10 | 598885,dikef,anos-origi@hotmail.com,1462468303,193.92.228.232,2,0,0,0,0,0 11 | 599180,redo5,a9os44@gma@gmail.com,1462491665,67.71.23.34,0,0,0,0,0,0 12 | 599181,Panda,otoskrotos@gmail.com,1462491934,67.71.23.123,3,0,0,0,0,0 13 | 599184,adeep,deepza007@@gmail.om,1462492091,67.71.23.114,0,0,0,0,0,0 14 | 599185,rafin,afaelagbra@hotmail.com,1462492196,67.71.23.4,0,0,0,0,0,599181 15 | 599186,Monge,randonmong@gmail,1462492239,67.71.23.215,0,0,0,0,0,0 16 | 599187,erois,uy@gmail.c@gmail.com,1462492311,67.71.23.44,0,0,0,0,0,599181 17 | 599188,jmtuc,mtucu@gmai@gmail.com,1462492430,190.230.223.156,1,0,0,0,0,599199 18 | 599189,Ferlu,ptic_xenon@hotmail.com,1462492650,67.71.23.152,3,0,0,0,0,599199 19 | 599190,imsta,zan.22.ozt@gmail.com,1462492670,78.165.195.23,6,0,0,0,0,0 20 | 599193,Bruws,imothy001@@icloud.com,1462492960,67.71.23.116,0,0,0,0,0,0 21 | 599194,LeCla,halajmi2@g@gmai.com,1462493191,67.71.23.183,0,0,0,0,0,599199 22 | 599198,maskv,askvirus@h@yahoo.com,1462493370,67.71.23.232,0,0,0,0,0,0 23 | 599199,wyohl,yphly@mail@mail.com,1462493395,108.209.248.132,0,0,0,0,0,0 24 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/hive/LoadToHive.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | /** 7 | * Class for loading increment data to Hive tables. 8 | * This also updates old data while an increment is being loaded. 9 | * 10 | * Created by anish on 24/01/17. 
11 | */ 12 | object LoadToHive { 13 | 14 | /** 15 | * This function performs an incremental update to data that is already present 16 | * 17 | * @param spark 18 | */ 19 | def loadIncrement(spark: SparkSession): Unit = { 20 | // Create a DF out of the increment 21 | val increment_data = spark.read.option("header", "true").csv(Constants.pathOfIncrementalData) 22 | 23 | // Update the already existing data with the increment data received 24 | spark.catalog.setCurrentDatabase(Constants.hiveDatabaseName) 25 | val masterData_df = spark.table(Constants.hiveTableName) 26 | 27 | // Do an upsert - Updates old data with new data, and addes new data if it is not existing. 28 | // Member_id is used as unique key 29 | val upsert_df: DataFrame = upsert(spark, masterData_df, increment_data, "member_id") 30 | 31 | // Write upserted data to the same table (overwritten) 32 | upsert_df 33 | .write 34 | .format("com.databricks.spark.avro") 35 | .mode("overwrite") 36 | .saveAsTable(Constants.hiveTableName) 37 | } 38 | 39 | /** 40 | * Update a table with an increment data coming it. It does an update else inserts. 41 | * @param spark 42 | * @param masterData_df 43 | * @param increment_data 44 | * @param uniqueKey 45 | * @return 46 | */ 47 | def upsert(spark: SparkSession, masterData_df: DataFrame, increment_data: DataFrame, uniqueKey: String): DataFrame = { 48 | import spark.implicits._ 49 | val columns = masterData_df.columns 50 | val increment_df = increment_data.toDF(increment_data.columns.map(x => x.trim + "_i"): _*) 51 | val joined_df = masterData_df.as("m").join(increment_df.as("i"), $"m.$uniqueKey" === $"i.${uniqueKey}_i", "outer") 52 | val upsert_df = columns.foldLeft(joined_df) { 53 | (acc: DataFrame, colName: String) => 54 | acc.withColumn(colName + "_j", coalesce(col(colName + "_i"), col(colName))) 55 | .drop(colName) 56 | .drop(colName + "_i") 57 | .withColumnRenamed(colName + "_j", colName) 58 | } 59 | upsert_df 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/scala,osx,windows,eclipse,intellij,maven 3 | 4 | ### Eclipse ### 5 | 6 | .metadata 7 | bin/ 8 | tmp/ 9 | *.tmp 10 | *.bak 11 | *.swp 12 | *~.nib 13 | local.properties 14 | .settings/ 15 | .loadpath 16 | .recommenders 17 | 18 | # Eclipse Core 19 | .project 20 | 21 | # External tool builders 22 | .externalToolBuilders/ 23 | 24 | # Locally stored "Eclipse launch configurations" 25 | *.launch 26 | 27 | # PyDev specific (Python IDE for Eclipse) 28 | *.pydevproject 29 | 30 | # CDT-specific (C/C++ Development Tooling) 31 | .cproject 32 | 33 | # JDT-specific (Eclipse Java Development Tools) 34 | .classpath 35 | 36 | # Java annotation processor (APT) 37 | .factorypath 38 | 39 | # PDT-specific (PHP Development Tools) 40 | .buildpath 41 | 42 | # sbteclipse plugin 43 | .target 44 | 45 | # Tern plugin 46 | .tern-project 47 | 48 | # TeXlipse plugin 49 | .texlipse 50 | 51 | # STS (Spring Tool Suite) 52 | .springBeans 53 | 54 | # Code Recommenders 55 | .recommenders/ 56 | 57 | ### Intellij ### 58 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 59 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 60 | 61 | # User-specific stuff: 62 | .idea/**/workspace.xml 63 | .idea/**/tasks.xml 64 | 65 | # Sensitive or high-churn files: 66 | .idea/**/dataSources/ 67 | 
.idea/**/dataSources.ids 68 | .idea/**/dataSources.xml 69 | .idea/**/dataSources.local.xml 70 | .idea/**/sqlDataSources.xml 71 | .idea/**/dynamic.xml 72 | .idea/**/uiDesigner.xml 73 | 74 | # Gradle: 75 | .idea/**/gradle.xml 76 | .idea/**/libraries 77 | 78 | # Mongo Explorer plugin: 79 | .idea/**/mongoSettings.xml 80 | 81 | ## File-based project format: 82 | *.iws 83 | 84 | ## Plugin-specific files: 85 | 86 | # IntelliJ 87 | /out/ 88 | 89 | # mpeltonen/sbt-idea plugin 90 | .idea_modules/ 91 | 92 | # JIRA plugin 93 | atlassian-ide-plugin.xml 94 | 95 | # Crashlytics plugin (for Android Studio and IntelliJ) 96 | com_crashlytics_export_strings.xml 97 | crashlytics.properties 98 | crashlytics-build.properties 99 | fabric.properties 100 | 101 | ### Intellij Patch ### 102 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 103 | 104 | # *.iml 105 | # modules.xml 106 | # .idea/misc.xml 107 | # *.ipr 108 | 109 | ### Maven ### 110 | target/ 111 | pom.xml.tag 112 | pom.xml.releaseBackup 113 | pom.xml.versionsBackup 114 | pom.xml.next 115 | release.properties 116 | dependency-reduced-pom.xml 117 | buildNumber.properties 118 | .mvn/timing.properties 119 | 120 | # Exclude maven wrapper 121 | !/.mvn/wrapper/maven-wrapper.jar 122 | 123 | ### OSX ### 124 | *.DS_Store 125 | .AppleDouble 126 | .LSOverride 127 | 128 | # Icon must end with two \r 129 | Icon 130 | 131 | 132 | # Thumbnails 133 | ._* 134 | 135 | # Files that might appear in the root of a volume 136 | .DocumentRevisions-V100 137 | .fseventsd 138 | .Spotlight-V100 139 | .TemporaryItems 140 | .Trashes 141 | .VolumeIcon.icns 142 | .com.apple.timemachine.donotpresent 143 | 144 | # Directories potentially created on remote AFP share 145 | .AppleDB 146 | .AppleDesktop 147 | Network Trash Folder 148 | Temporary Items 149 | .apdisk 150 | 151 | ### Scala ### 152 | *.class 153 | *.log 154 | 155 | # sbt specific 156 | .cache 157 | .history 158 | .lib/ 159 | dist/* 160 | lib_managed/ 161 | src_managed/ 162 | project/boot/ 163 | project/plugins/project/ 164 | 165 | # Scala-IDE specific 166 | .ensime 167 | .ensime_cache/ 168 | .scala_dependencies 169 | .worksheet 170 | 171 | # ENSIME specific 172 | 173 | ### Windows ### 174 | # Windows thumbnail cache files 175 | Thumbs.db 176 | ehthumbs.db 177 | ehthumbs_vista.db 178 | 179 | # Folder config file 180 | Desktop.ini 181 | 182 | # Recycle Bin used on file shares 183 | $RECYCLE.BIN/ 184 | 185 | # Windows Installer files 186 | *.cab 187 | *.msi 188 | *.msm 189 | *.msp 190 | 191 | # Windows shortcuts 192 | *.lnk 193 | 194 | # End of https://www.gitignore.io/api/scala,osx,windows,eclipse,intellij,maven 195 | 196 | # Project related files 197 | .idea/* 198 | *.iml 199 | spark-warehouse/* 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Transformations using Apache Spark 2.0.0 2 | A project with examples of using few commonly used data manipulation/processing/transformation APIs in Apache Spark 2.0.0 3 | 4 | ### Tech Stack used: 5 | **Framework**: Spark v2.0.0 6 | 7 | **Programming Language**: Scala v2.11.6 8 | 9 | ### About the project 10 | The project can be loaded in IntelliJ IDEA and the class _org.anish.spark.etc.ProcessData_ can be directly run. This produces all the output. 11 | 12 | ### Code File descriptions 13 | **org.anish.spark.etc.ProcessData.scala** : Main object along with all transformations and aggregations to process data. 
Running this object (tested on a local system) should produce all the required results. 14 | The input data has the following fields: 15 | ``` 16 | member_id, name, email, joined, ip_address, posts, bday_day, bday_month, bday_year, members_profile_views, referred_by 17 | ``` 18 | A sample output is saved in SampleOutput.txt. 19 | The output of the occurrence of IP addresses grouped by their first 3 octets has been truncated at 500 rows to keep it presentable; the complete data frame is, however, saved in the Hive tables. 20 | 21 | Build with Maven: 22 | ``` 23 | mvn clean install package 24 | ``` 25 | To run the main Scala object: 26 | Data (for testing) should be in _data/allData/_ 27 | ``` 28 | java -jar target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar 29 | ``` 30 | 31 | **org.anish.spark.etl.hive.Constants.scala** : Configuration values stored as Strings in an object. Can be made externally configurable later. 32 | 33 | **org.anish.spark.etl.hive.HiveSetup.scala** : Creates the Hive tables and loads the initial data. 34 | 35 | **org.anish.spark.etl.hive.LoadToHive.scala** : Does incremental loads to Hive. Also has a function that performs an update-else-insert (upsert) on the whole data set in a Hive table. 36 | 37 | **org.anish.spark.etl.hive.DemoRunner.scala** : Runs a demo that loads initial data to Hive and then one increment. All sources are taken from the appropriate folders in the data/* directory. This requires being run from an edge node with Hive and Spark clients running and connected to a Hive Metastore and Spark server. 38 | 39 | 40 | **org.anish.spark.etl.ProcessDataTest.scala** : Test class covering the utility methods defined in the ProcessData and LoadToHive objects. 41 | 42 | ### Avro Outputs: 43 | For analyses that produce a single number or a list of numbers, such as the day with the most birthdays, the month with the fewest birthdays, and the years with the most signups, the output from the provided sample is in SampleOutput.txt, along with data frames truncated at 500 records. 44 | 45 | All queries that produce a dataset as output are saved as Avro files in the folder _spark-warehouse/_. This can be recreated by executing _java -jar target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar_ 46 | 47 | 48 | ### Running the project 49 | 1. Run _mvn clean install_ to build the project 50 | 2. Scala tests run automatically as part of the build 51 | 3. The build completes successfully 52 | 4. Run _java -jar target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar_ to produce the analysis results. This also shows the following outputs: 53 | - Most birthdays are on: 1 day(s) 54 | - Least birthdays are on: 11 month(s) 55 | 5. Continuation of output: 56 | - Email providers with more than 10K members 57 | - Posts by email providers 58 | - Year(s) with max sign ups: 2015. 59 | - Class C IP address frequency by 1st octet 60 | 6. Continuation of output: 61 | - Frequency of IP addresses based on the first 3 octets (truncated) 62 | 7. Continuation of output: 63 | - Number of referrals by members 64 | 65 | ### Hive related Demo 66 | For loading incremental data to Hive tables: 67 | This creates a table in Hive with the already existing data and loads the data that is already present. 68 | 69 | Increment Load: Loads increment data, updating rows that are already present based on member_id and appending rows that are not. (New members are added; data for existing members is updated.) For the sample data I have not partitioned or bucketed the table, since the frequency of incoming increments, the data size, and the query pattern are not known.
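
The heart of the increment load is `LoadToHive.upsert`: the master table and the increment are outer-joined on the unique key, and for every column the increment's value is preferred over the master's via `coalesce`. Below is a condensed, self-contained sketch of that idea; the function name `upsertSketch` and the simplified signature are illustrative only, not part of the project API.
```
import org.apache.spark.sql.functions.{coalesce, col}
import org.apache.spark.sql.{DataFrame, SparkSession}

// Illustrative sketch of the coalesce-based upsert implemented in LoadToHive.upsert.
def upsertSketch(spark: SparkSession, master: DataFrame, increment: DataFrame, key: String): DataFrame = {
  import spark.implicits._
  // Suffix the increment's columns so both sides survive the join unambiguously.
  val inc = increment.toDF(increment.columns.map(_ + "_i"): _*)
  // A full outer join keeps unmatched rows from both sides (new members and untouched old members).
  val joined = master.as("m").join(inc.as("i"), $"m.$key" === $"i.${key}_i", "outer")
  // For each column, take the increment's value when present, otherwise fall back to the old value.
  master.columns.foldLeft(joined) { (df, c) =>
    df.withColumn(c + "_new", coalesce(col(c + "_i"), col(c)))
      .drop(c)
      .drop(c + "_i")
      .withColumnRenamed(c + "_new", c)
  }
}
```
The merged frame is then written back over the same table with `mode("overwrite").saveAsTable(...)`, so the whole table is rewritten on every increment; this is simple and acceptable while the table stays small and unpartitioned, as noted above.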
70 | 71 | This assumes that Hive metastore is up and running. Also HiveServer2 should be running and hive client jars present. This should ideally be run from an 'edge node' of a cluster. I've tested it in Spark Local, and not on cluster mode. 72 | ``` 73 | java -cp target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar org.anish.spark.etl.hive.DemoRunner 74 | ``` 75 | 76 | 77 | ### Submitting to Spark Standalone 78 | ``` 79 | spark-submit --class org.anish.spark.etl.ProcessData --master local[4] \ 80 | --jars $(find '<***lib directory with spark jars***>' -name '*.jar' | xargs echo | tr ' ' ',') \ 81 | --packages com.databricks:spark-avro_2.11:3.1.0 \ 82 | spark2-etl-examples-1.0-SNAPSHOT.jar 83 | ``` 84 | 85 | Currently the source is coded to take from local as _data/all_data/_ 86 | To read from HDFS, the path should be appropriately given. Eg - _hdfs://data/all_data/_ 87 | It would automatically take HDFS path if HDFS is running on the same node. 88 | 89 | Submitting from "edge nodes" (Yarn Client Mode) 90 | ``` 91 | spark-submit --class org.anish.spark.etl.ProcessData --master yarn-client \ 92 | --jars $(find '<***lib directory with spark jars***>' -name '*.jar' | xargs echo | tr ' ' ',') \ 93 | --packages com.databricks:spark-avro_2.11:3.1.0 \ 94 | spark2-etl-examples-1.0-SNAPSHOT.jar 95 | ``` 96 | 97 | ### Use for educational purposes 98 | If you are trying to run these examples to understand Spark, and you need data, kindly have a look at the 'data' branch 99 | 100 | ___ 101 | -------------------------------------------------------------------------------- /src/test/scala/org/anish/spark/etl/ProcessDataTest.scala: -------------------------------------------------------------------------------- 1 | package org.anish.spark.etl 2 | 3 | import java.io.File 4 | 5 | import org.anish.hackerearth.mastglobal.ProcessData 6 | import org.anish.hackerearth.mastglobal.hive.LoadToHive 7 | import org.anish.spark.SparkTestUtils 8 | import org.apache.commons.io.FileUtils 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | //import org.junit.runner.RunWith 13 | //import org.scalatest.junit.JUnitRunner 14 | import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} 15 | 16 | /** 17 | * Created by anish on 24/01/17. 
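 * The suite builds a local[3] SparkSession in before {} and reads its fixture CSV from src/test/resources/input_data,
 * so it needs no external Spark or Hive services; expected results live under src/test/resources/expectedOutputs.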
18 | */ 19 | //@RunWith(classOf[JUnitRunner]) 20 | class ProcessDataTest extends FlatSpec with Matchers with BeforeAndAfter { 21 | var spark: SparkSession = _ 22 | var source_data: DataFrame = _ 23 | before { 24 | val sparkConf = new SparkConf 25 | sparkConf.set("spark.sql.crossJoin.enabled", "true") 26 | if (!sparkConf.contains("spark.master")) { 27 | sparkConf.setMaster("local[3]") 28 | } 29 | if (!sparkConf.contains("spark.app.name")) { 30 | sparkConf.setAppName("UnitTest-" + getClass.getName) 31 | } 32 | spark = SparkSession 33 | .builder() 34 | .config(sparkConf) 35 | .getOrCreate() 36 | 37 | val data = spark.read 38 | .option("header", "true") 39 | .option("inferSchema", "true") 40 | .csv(SparkTestUtils.getResourcePath("/input_data/")) 41 | source_data = data.toDF(data.columns.map(x => x.trim): _*) 42 | } 43 | behavior of "Process Data" 44 | it should "getMostFrequentBday should return the most frequent day" in { 45 | val mostFreqBday = ProcessData.getMostFrequentBday(source_data) 46 | mostFreqBday shouldBe List(23, 24) 47 | } 48 | 49 | it should "getLeastFrequentBmonth should return the least frequent month" in { 50 | val mostFreqBday = ProcessData.getLeastFrequentBmonth(source_data) 51 | mostFreqBday shouldBe List(3) 52 | } 53 | 54 | it should "clean email address" in { 55 | val cleanEmails = ProcessData.cleanEmails(source_data) 56 | val tempPath = "tmp_unitTestTemp_" + System.currentTimeMillis() 57 | cleanEmails 58 | .select("corrected_email") 59 | .write.format("com.databricks.spark.avro") 60 | .save(tempPath) 61 | 62 | val actualData = spark.read.format("com.databricks.spark.avro").load(tempPath) 63 | val expectedData = spark.read.option("header", "true").csv(SparkTestUtils.getResourcePath("/expectedOutputs/cleanedEmails")) 64 | 65 | // Check if the two DF are equal 66 | SparkTestUtils.dfEquals(actualData, expectedData) 67 | // Check and make sure that the output was generated. 
And then delete it 68 | val errorFile = new File(tempPath) 69 | errorFile.exists() shouldBe true 70 | if (errorFile.isDirectory) { 71 | FileUtils.deleteDirectory(errorFile) 72 | errorFile.exists() shouldBe false 73 | } 74 | } 75 | 76 | it should "find the year with max signups" in { 77 | val yearWithMaxSignUps = ProcessData.yearWithMaxSignUps(source_data) 78 | yearWithMaxSignUps shouldBe List(2016) 79 | } 80 | 81 | it should "find the max referrals in given set" in { 82 | val maxReferralsActual = ProcessData.maxReferrals(source_data) 83 | val maxReferralsExpected = spark.createDataFrame( 84 | Seq( 85 | ("wyohl", 3L), 86 | ("Panda", 2L) 87 | )).toDF("referred_by_name", "no_of_people_referred") 88 | 89 | SparkTestUtils.dfEquals(maxReferralsActual, maxReferralsExpected) 90 | } 91 | 92 | it should "get post by provider" in { 93 | val postsByProviderActual = ProcessData.getPostsByProvider(ProcessData.cleanEmails(source_data)) 94 | val postsByProviderExpected = spark.createDataFrame( 95 | Seq( 96 | ("Gmail", 14L), 97 | ("Hotmail", 12L), 98 | ("Yahoo", 0L) 99 | )).toDF("provider", "posts_count") 100 | 101 | SparkTestUtils.dfEquals(postsByProviderActual, postsByProviderExpected) 102 | } 103 | 104 | it should "list TLD with more than 10K members" in { 105 | val moreThan10KActual = ProcessData.findMoreThan10KTld(ProcessData.cleanEmails(source_data)) 106 | val moreThan10KExpected = spark.createDataFrame( 107 | Seq(("", 0L)) 108 | ).toDF("tld", "tld_count") 109 | SparkTestUtils.dfEquals(moreThan10KActual, moreThan10KExpected, onlySchema = true) 110 | } 111 | 112 | it should "List edu and gov email ids" in { 113 | val eduGovIdsActual = ProcessData.findEduGovEmailIds(ProcessData.cleanEmails(source_data)) 114 | val eduGovIdsExpected = spark.createDataFrame( 115 | Seq(("", 0L)) 116 | ).toDF("provider", "eduGov_Count") 117 | SparkTestUtils.dfEquals(eduGovIdsActual, eduGovIdsExpected, onlySchema = true) 118 | } 119 | 120 | it should "count occurence of class C IP" in { 121 | val classCipActual = ProcessData.classCipByFirstOctet(spark, source_data) 122 | val classCipExpected = spark.createDataFrame( 123 | Seq(("193.x.x.x", 1L)) 124 | ).toDF("ipClassC", "count_octet1") 125 | SparkTestUtils.dfEquals(classCipActual, classCipExpected, onlySchema = true) 126 | } 127 | 128 | it should "count occurence of IP by first 3 octets" in { 129 | val ipBy3OctetsActual = ProcessData.ipAddressFreqBy3Octets(source_data) 130 | val ipBy3OctetsExpected = spark.createDataFrame( 131 | Seq(("67.71.23.x", 16L), 132 | ("95.14.204.x", 1L), 133 | ("108.209.248.x", 1L), 134 | ("95.14.204.x", 1L), 135 | ("190.230.223.x", 1L), 136 | ("95.14.204.x", 1L), 137 | ("193.92.228.x", 1L), 138 | ("187.180.176.x", 1L) 139 | )).toDF("octet13", "occurrence") 140 | SparkTestUtils.dfEquals(ipBy3OctetsActual, ipBy3OctetsExpected, onlySchema = true) 141 | } 142 | 143 | behavior of "LoadToHive class" 144 | it should "Do an upsert, i.e. 
update instead of only append when new data arrives" in { 145 | val oldData = spark.createDataFrame( 146 | Seq( 147 | ("1", "Data"), 148 | ("2", "AnotherData"), 149 | ("3", "JustAnotherOldData") 150 | )).toDF("id", "data") 151 | 152 | val newData = spark.createDataFrame( 153 | Seq( 154 | ("1", "UpdatedData"), 155 | ("4", "NewData"), 156 | ("5", "AnotherNewData") 157 | )).toDF("id", "data") 158 | 159 | val expectedMergedData = spark.createDataFrame( 160 | Seq( 161 | ("1", "UpdatedData"), 162 | ("2", "AnotherData"), 163 | ("3", "JustAnotherOldData"), 164 | ("4", "NewData"), 165 | ("5", "AnotherNewData") 166 | )).toDF("id", "data") 167 | 168 | val actualMergedData = LoadToHive.upsert(spark, oldData, newData, "id") 169 | 170 | SparkTestUtils.dfEquals(actualMergedData, expectedMergedData) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | org.anish.spark 5 | spark2-etl-examples 6 | 1.0-SNAPSHOT 7 | 2008 8 | 9 | 2.0.0 10 | 2.11 11 | 2.11.6 12 | 13 | 14 | 15 | 16 | scala-tools.org 17 | Scala-Tools Maven2 Repository 18 | http://scala-tools.org/repo-releases 19 | 20 | 21 | 22 | 23 | 24 | scala-tools.org 25 | Scala-Tools Maven2 Repository 26 | http://scala-tools.org/repo-releases 27 | 28 | 29 | 30 | 31 | 32 | org.apache.spark 33 | spark-core_${scala.tools.version} 34 | ${spark.version} 35 | 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-sql_${scala.tools.version} 41 | ${spark.version} 42 | 43 | 44 | 45 | com.databricks 46 | spark-avro_${scala.tools.version} 47 | 3.1.0 48 | 49 | 50 | 51 | 52 | org.apache.spark 53 | spark-hive_${scala.tools.version} 54 | ${spark.version} 55 | 56 | 57 | org.scala-lang 58 | scala-library 59 | ${scala.version} 60 | 61 | 62 | junit 63 | junit 64 | 4.4 65 | test 66 | 67 | 68 | org.scala-tools.testing 69 | specs 70 | 1.6.2.2_1.5.0 71 | test 72 | 73 | 74 | org.scalatest 75 | scalatest_${scala.tools.version} 76 | 2.2.5 77 | 78 | 79 | 80 | 81 | src/main/scala 82 | src/test/scala 83 | 84 | 85 | org.scala-tools 86 | maven-scala-plugin 87 | 88 | 89 | 90 | compile 91 | testCompile 92 | 93 | 94 | 95 | 96 | ${scala.version} 97 | 98 | -target:jvm-1.5 99 | 100 | 101 | 102 | 103 | org.apache.maven.plugins 104 | maven-eclipse-plugin 105 | 106 | true 107 | 108 | ch.epfl.lamp.sdt.core.scalabuilder 109 | 110 | 111 | ch.epfl.lamp.sdt.core.scalanature 112 | 113 | 114 | org.eclipse.jdt.launching.JRE_CONTAINER 115 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 116 | 117 | 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-surefire-plugin 122 | 2.19.1 123 | 124 | false 125 | 126 | 127 | 128 | 129 | 130 | org.scalatest 131 | scalatest-maven-plugin 132 | 1.0 133 | 134 | ${project.build.directory}/surefire-reports 135 | . 
136 | WDF TestSuite.txt 137 | 138 | 139 | 140 | test 141 | 142 | test 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | org.scala-tools 153 | maven-scala-plugin 154 | 155 | ${scala.version} 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/ProcessData.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.types.{IntegerType, LongType} 5 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode, SparkSession} 6 | import org.apache.spark.{SparkConf, sql} 7 | import org.joda.time.format.DateTimeFormat 8 | 9 | /** 10 | * Class to process data to get findings 11 | * 12 | * Created by anish on 24/01/17. 13 | */ 14 | object ProcessData { 15 | 16 | implicit class Regex(sc: StringContext) { 17 | def r = new util.matching.Regex(sc.parts.mkString, sc.parts.tail.map(_ => "x"): _*) 18 | } 19 | 20 | /** 21 | * This function works with the data in local system as well. 22 | * It thus reads from a folder (which can be HDFS/S3 path as well) 23 | * This can be modified to read from Hive as well. 24 | * 25 | * @param args 26 | */ 27 | def main(args: Array[String]): Unit = { 28 | val spark = getSparkSession 29 | val data_df = spark.read 30 | .option("header", "true") 31 | .option("inferSchema", "true") 32 | .csv("data/allData/") 33 | 34 | // Remove extra space in column header names and cache the source 35 | val source_df = data_df.toDF(data_df.columns.map(x => x.trim): _*) 36 | .cache 37 | 38 | 39 | val mostFrequentBday: List[Int] = getMostFrequentBday(source_df) 40 | println("Most birthdays are on: " + mostFrequentBday.mkString(",") + " day(s)") 41 | 42 | val leastFrequentBmonth: List[Int] = getLeastFrequentBmonth(source_df) 43 | println("Least birthdays are on: " + leastFrequentBmonth.mkString(",") + " month(s)") 44 | 45 | 46 | // Work with emails 47 | val emailCorrected_df: DataFrame = cleanEmails(source_df) 48 | .cache() 49 | 50 | val eduGovCount_df: Dataset[Row] = findEduGovEmailIds(emailCorrected_df) 51 | println("Email id from government and educational TLDs : ") 52 | eduGovCount_df.show(500, truncate = false) 53 | 54 | val moreThan10KTld: Dataset[Row] = findMoreThan10KTld(emailCorrected_df) 55 | println("Email providers with more than 10K : ") 56 | moreThan10KTld.show() 57 | 58 | val providerGrp_df: Dataset[Row] = getPostsByProvider(emailCorrected_df) 59 | println("Posts by email providers: ") 60 | providerGrp_df.show() 61 | 62 | emailCorrected_df.unpersist 63 | 64 | val year_maxJoined: List[Int] = yearWithMaxSignUps(source_df) 65 | // List because 2 years can experience exact same number of signups 66 | println("Year(s) with max sign ups: " + year_maxJoined.mkString(",") + ".") 67 | 68 | // Find class C ip address 69 | val classCip_df: DataFrame = classCipByFirstOctet(spark, source_df) 70 | println("Class C IP address frequency by 1st octet:") 71 | classCip_df.show(50) 72 | 73 | val ip_occurBy3octets: DataFrame = ipAddressFreqBy3Octets(source_df) 74 | println("Frequency of IP address based on first 3 octets") 75 | ip_occurBy3octets.show(500) 76 | 77 | val max_referral: DataFrame = maxReferrals(source_df) 78 | println("Number of referral by members: ") 79 | max_referral.show(500) 80 | 81 | // Save to Hive 82 | saveDFsToHive(eduGovCount_df, moreThan10KTld, providerGrp_df, classCip_df, ip_occurBy3octets, max_referral) 83 | 84 | 
// All done, now Unpersist the sourceDF 85 | source_df.unpersist 86 | } 87 | 88 | 89 | /** 90 | * Save output data to Hive tables. 91 | */ 92 | def saveDFsToHive(eduGovCount_df: Dataset[Row], moreThan10KTld: Dataset[Row], providerGrp_df: Dataset[Row], classCip_df: DataFrame, ip_occurBy3octets: DataFrame, max_referral: DataFrame): Unit = { 93 | eduGovCount_df 94 | .write 95 | .mode(SaveMode.Overwrite) 96 | .format("com.databricks.spark.avro") 97 | .saveAsTable("eduGovCount_df") 98 | moreThan10KTld 99 | .write 100 | .mode(SaveMode.Overwrite) 101 | .format("com.databricks.spark.avro") 102 | .saveAsTable("moreThan10KTld") 103 | providerGrp_df 104 | .write 105 | .mode(SaveMode.Overwrite) 106 | .format("com.databricks.spark.avro") 107 | .saveAsTable("providerGrp_df") 108 | classCip_df 109 | .write 110 | .mode(SaveMode.Overwrite) 111 | .format("com.databricks.spark.avro") 112 | .saveAsTable("classCip_df") 113 | ip_occurBy3octets 114 | .write 115 | .mode(SaveMode.Overwrite) 116 | .format("com.databricks.spark.avro") 117 | .saveAsTable("ip_occurBy3octets") 118 | max_referral 119 | .write 120 | .mode(SaveMode.Overwrite) 121 | .format("com.databricks.spark.avro") 122 | .saveAsTable("max_referral") 123 | } 124 | 125 | /** 126 | * Find ipAdress occurences by first 3 octets 127 | * @param source_df 128 | * @return 129 | */ 130 | def ipAddressFreqBy3Octets(source_df: DataFrame): DataFrame = { 131 | val ip_occurBy3octets = source_df 132 | .select("ip_address") 133 | .withColumn("octet13", concat(substring_index(col("ip_address"), ".", 3), lit(".x"))) 134 | .filter("octet13 <> '.x'") 135 | .groupBy("octet13") 136 | .agg(count("*").alias("occurrence")) 137 | .sort(desc("occurrence")) 138 | .toDF() 139 | ip_occurBy3octets 140 | } 141 | 142 | /** 143 | * Count hits from class C IP addresses 144 | * @param spark 145 | * @param source_df 146 | * @return 147 | */ 148 | def classCipByFirstOctet(spark: SparkSession, source_df: DataFrame): DataFrame = { 149 | import spark.implicits._ 150 | val classCip_df = source_df 151 | .select("ip_address") 152 | .withColumn("octet1", split(col("ip_address"), "\\.")(0)) 153 | .filter($"octet1" >= 192 && $"octet1" <= 223) 154 | .withColumn("ipClassC", concat(col("octet1"), lit(".x.x.x"))) 155 | .groupBy("ipClassC") 156 | .agg(count("*").alias("count_octet1")) 157 | .sort(desc("count_octet1")) 158 | .toDF() 159 | classCip_df 160 | } 161 | 162 | /** 163 | * Count total number of posts by Email Provider 164 | * @param emailCorrected_df 165 | * @return 166 | */ 167 | def getPostsByProvider(emailCorrected_df: DataFrame): Dataset[Row] = { 168 | val providerGrp_df = emailCorrected_df 169 | .filter("provider in ('Gmail', 'Yahoo', 'Hotmail')") 170 | .groupBy("provider") 171 | .agg(sum("posts").cast(LongType).alias("posts_count")) 172 | .sort(desc("posts_count")) 173 | providerGrp_df 174 | } 175 | 176 | /** 177 | * Emails of TLDs of more than 10K occurences 178 | * @param emailCorrected_df 179 | * @return 180 | */ 181 | def findMoreThan10KTld(emailCorrected_df: DataFrame): Dataset[Row] = { 182 | val moreThan10KTld = emailCorrected_df 183 | .groupBy("tld") 184 | .agg(count("*").alias("tld_count")) 185 | .filter("tld_count > 10000") 186 | .sort(desc("tld_count")) 187 | moreThan10KTld 188 | } 189 | 190 | /** 191 | * Count of members from edu and gov email ids 192 | * @param emailCorrected_df 193 | * @return 194 | */ 195 | def findEduGovEmailIds(emailCorrected_df: DataFrame): Dataset[Row] = { 196 | val eduGovCount_df = emailCorrected_df 197 | .filter("tld like '%gov%' OR tld like '%edu%'") 
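      // tld holds the full email domain (set in cleanEmails); this LIKE match is broad, so the next filter drops domains that merely contain 'edu'/'gov' as a substring (classified as provider 'Others').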
198 | .filter("provider <> 'Others'") // Filters out edu.*.* OR edubs.ch-> This is not edu 199 | .groupBy("provider") 200 | .agg(count("*").alias("eduGov_Count")) 201 | .sort(desc("eduGov_Count")) 202 | eduGovCount_df 203 | } 204 | 205 | /** 206 | * Count of referrals by members 207 | * @param source_df 208 | * @return 209 | */ 210 | def maxReferrals(source_df: DataFrame): DataFrame = { 211 | val rootFiltered_df = source_df 212 | .filter("referred_by <> 0 ") 213 | .select("member_id", "name", "referred_by") 214 | val referrers = source_df 215 | .select("member_id", "name") 216 | .withColumnRenamed("member_id", "referer_id") 217 | .withColumnRenamed("name", "referred_by_name") 218 | 219 | val refJoined_df = rootFiltered_df 220 | .join(referrers, rootFiltered_df("referred_by") === referrers("referer_id")) 221 | .drop("referer_id") 222 | 223 | val refferedGroups_df = refJoined_df 224 | .groupBy("referred_by_name") 225 | .agg(count("*").alias("no_of_people_referred")) 226 | .select("referred_by_name", "no_of_people_referred") 227 | .sort(desc("no_of_people_referred")) 228 | .toDF() 229 | 230 | refferedGroups_df 231 | } 232 | 233 | /** 234 | * Year(s) which had max signups 235 | * @param source_df 236 | * @return A List of Years which experienced the max signups 237 | */ 238 | def yearWithMaxSignUps(source_df: DataFrame): List[Int] = { 239 | val sql_yearFromEpoch = udf((epoch: Long) => { 240 | DateTimeFormat.forPattern("YYYY").print(epoch * 1000) 241 | }) 242 | 243 | val yrJoinedGrp_df = source_df 244 | .select("joined") 245 | .withColumn("year_joined", sql_yearFromEpoch(col("joined")).cast(IntegerType)) 246 | .groupBy("year_joined") 247 | .agg(count("*").alias("year_joined_count")) 248 | .select("year_joined", "year_joined_count") 249 | .cache 250 | 251 | val maxJoined = yrJoinedGrp_df 252 | .agg(max("year_joined_count").alias("max_year_joined_count")) 253 | .collect 254 | .head.get(0).toString 255 | 256 | val year_maxJoined = yrJoinedGrp_df 257 | .filter("year_joined_count = " + maxJoined) 258 | .select("year_joined") 259 | .collect() 260 | 261 | val asScalaList = year_maxJoined.map(x => x.getAs[Int]("year_joined")).toList 262 | 263 | yrJoinedGrp_df.unpersist 264 | asScalaList 265 | } 266 | 267 | /** 268 | * Clean Junk email addresses 269 | * @param source_df 270 | * @return 271 | */ 272 | def cleanEmails(source_df: DataFrame): sql.DataFrame = { 273 | val sql_emailCorrector = udf((email: String) => { 274 | email match { 275 | case r"(.*@)${id}.*@(.*)${dom}" => // Remove everything between multiple @ 276 | id + dom 277 | case x => 278 | if (x.contains("@") && x.split("@").length == 2) { 279 | val y = x.split("@")(1) 280 | if (!y.contains(".")) 281 | "invalid" 282 | else 283 | x // has 1 @ and dots 284 | } 285 | else "invalid" 286 | } 287 | }) 288 | 289 | val sql_tld = udf((email: String) => { 290 | if (email.contains("@")) 291 | email.split("@")(1) 292 | else 293 | email 294 | }) 295 | 296 | val sql_provider = udf((tld: String) => { 297 | if (tld.contains("gmail")) 298 | "Gmail" 299 | else if (tld.contains("yahoo")) 300 | "Yahoo" 301 | else if (tld.contains("hotmail")) 302 | "Hotmail" 303 | else if (tld.endsWith(".edu")) 304 | ".edu" 305 | else if (tld.contains(".edu.")) 306 | tld.substring(tld.indexOf(".edu.")) 307 | else if (tld.endsWith(".gov")) 308 | ".gov" 309 | else if (tld.contains(".gov.")) 310 | tld.substring(tld.indexOf(".gov.")) 311 | else { 312 | "Others" 313 | } 314 | }) 315 | 316 | val emailCorrected_df = source_df 317 | .select("email", "posts") 318 | 
.withColumn("corrected_email", sql_emailCorrector(col("email"))) 319 | .filter("corrected_email <> 'invalid'") 320 | .withColumn("tld", sql_tld(col("corrected_email"))) 321 | .withColumn("provider", sql_provider(col("tld"))) 322 | 323 | emailCorrected_df 324 | } 325 | 326 | /** 327 | * Find the least frequent birth month among all users 328 | * @param source_df 329 | * @return 330 | */ 331 | def getLeastFrequentBmonth(source_df: DataFrame): List[Int] = { 332 | // Find the least frequent birth month 333 | val bday_monthGrp = source_df 334 | .filter("bday_day >= 1 AND bday_day <= 31 AND bday_month >= 1 AND bday_month <= 12 AND bday_year > 0") // Filter out invalid values 335 | .groupBy("bday_month") 336 | .agg(count("*").alias("count_bday_month")) 337 | .select("bday_month", "count_bday_month") 338 | .cache 339 | 340 | val minCountBmonth = bday_monthGrp 341 | .agg(min("count_bday_month").alias("leastFrequentBmonth")) 342 | .collect 343 | .head.get(0).toString 344 | 345 | val leastFrequentBmonth = bday_monthGrp 346 | .filter("count_bday_month = " + minCountBmonth) // This is because there can be multiple dates having the same count 347 | .select("bday_month") 348 | .collect() 349 | 350 | val asScalaList = leastFrequentBmonth.map(x => x.getAs[Int]("bday_month")).toList 351 | 352 | bday_monthGrp.unpersist 353 | asScalaList 354 | } 355 | 356 | /** 357 | * Most frequent Birthdays of members 358 | * @param source_df 359 | * @return 360 | */ 361 | def getMostFrequentBday(source_df: DataFrame): List[Int] = { 362 | // Find the most frequent birthday 363 | val bday_dayGrp = source_df 364 | .filter("bday_day >= 1 AND bday_day <= 31 AND bday_month >= 1 AND bday_month <= 12 AND bday_year > 0") // Filter out invalid values 365 | .groupBy("bday_day") 366 | .agg(count("*").alias("count_bday_day")) 367 | .select("bday_day", "count_bday_day") 368 | .cache 369 | 370 | val maxCountBday = bday_dayGrp 371 | .agg(max("count_bday_day").alias("mostFrequentBday")) 372 | .collect 373 | .head.get(0).toString 374 | 375 | val mostFrequentBday = bday_dayGrp 376 | .filter("count_bday_day = " + maxCountBday) // This is because there can be multiple dates having the same count 377 | .select("bday_day") 378 | .toDF() 379 | .collect() 380 | val asScalaList = mostFrequentBday.map(x => x.getAs[Int]("bday_day")).toList 381 | 382 | bday_dayGrp.unpersist 383 | asScalaList 384 | } 385 | 386 | /** 387 | * Returns the SparkSession object. Manages the configs for the spark session 388 | * @return 389 | */ 390 | def getSparkSession: SparkSession = { 391 | val sparkConf = new SparkConf 392 | sparkConf.set("spark.sql.crossJoin.enabled", "true") 393 | if (!sparkConf.contains("spark.master")) { 394 | sparkConf.setMaster("local[3]") 395 | } 396 | if (!sparkConf.contains("spark.app.name")) { 397 | sparkConf.setAppName("MastGlobalDataProcessing-" + getClass.getName) 398 | } 399 | SparkSession 400 | .builder() 401 | .config(sparkConf) 402 | // .enableHiveSupport() 403 | .getOrCreate() 404 | } 405 | } --------------------------------------------------------------------------------