├── src ├── main │ ├── scala │ │ └── org │ │ │ └── anish │ │ │ └── spark │ │ │ └── etl │ │ │ ├── hive │ │ │ ├── Constants.scala │ │ │ ├── HiveSetup.scala │ │ │ ├── DemoRunner.scala │ │ │ └── LoadToHive.scala │ │ │ └── ProcessData.scala │ └── resources │ │ └── log4j.properties └── test │ ├── resources │ ├── expectedOutputs │ │ └── cleanedEmails │ │ │ └── correctEmailIds.csv │ ├── log4j.properties │ └── input_data │ │ └── testData.csv │ └── scala │ └── org │ └── anish │ └── spark │ ├── SparkTestUtils.scala │ └── etl │ └── ProcessDataTest.scala ├── .gitignore ├── README.md └── pom.xml /src/main/scala/org/anish/spark/etl/hive/Constants.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | /** 4 | * Contains configs requred by the Hive component. 5 | * 6 | * Created by anish on 24/01/17. 7 | */ 8 | object Constants { 9 | val pathOfAlreadyExistingData = "data/alreadyExistingData" 10 | val pathOfIncrementalData = "data/newIncrement" 11 | val hiveDatabaseName = "default" 12 | val hiveTableName = "member_details" 13 | val hiveWareHouseLocation = System.getProperty("user.dir") + "/warehouse/member_details/" 14 | } 15 | -------------------------------------------------------------------------------- /src/test/resources/expectedOutputs/cleanedEmails/correctEmailIds.csv: -------------------------------------------------------------------------------- 1 | corrected_email 2 | apnok@wp.pl 3 | afael.rafa@hotmail.com 4 | nnenbba@outlook.com 5 | fhdfshdf@hotmail.com 6 | rcemola30@gmail.com 7 | rcemola30@gmail.com 8 | dam987_3@hotmail.com 9 | unman@hotmail.com 10 | anos-origi@hotmail.com 11 | a9os44@gmail.com 12 | otoskrotos@gmail.com 13 | deepza007@gmail.om 14 | afaelagbra@hotmail.com 15 | uy@gmail.com 16 | mtucu@gmail.com 17 | ptic_xenon@hotmail.com 18 | zan.22.ozt@gmail.com 19 | imothy001@icloud.com 20 | halajmi2@gmai.com 21 | askvirus@yahoo.com 22 | yphly@mail.com 23 | -------------------------------------------------------------------------------- /src/main/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %C: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark=WARN 11 | log4j.logger.org.apache.spark.mllib=INFO 12 | log4j.logger.org.spark-project=WARN 13 | log4j.logger.org.spark-project.mllib=INFO 14 | log4j.logger.akka.event=WARN -------------------------------------------------------------------------------- /src/test/resources/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set everything to be logged to the console 2 | log4j.rootCategory=ERROR, console 3 | log4j.appender.console=org.apache.log4j.ConsoleAppender 4 | log4j.appender.console.target=System.err 5 | log4j.appender.console.layout=org.apache.log4j.PatternLayout 6 | log4j.appender.console.layout.ConversionPattern=%d{yy/MM/dd HH:mm:ss} %p %C: %m%n 7 | 8 | # Settings to quiet third party logs that are too verbose 9 | log4j.logger.org.eclipse.jetty=WARN 10 | log4j.logger.org.apache.spark=WARN 11 | 
log4j.logger.org.apache.spark.mllib=INFO 12 | log4j.logger.org.spark-project=WARN 13 | log4j.logger.org.spark-project.mllib=INFO 14 | log4j.logger.akka.event=WARN -------------------------------------------------------------------------------- /src/test/scala/org/anish/spark/SparkTestUtils.scala: -------------------------------------------------------------------------------- 1 | package org.anish.spark 2 | 3 | import org.apache.spark.sql.DataFrame 4 | import org.scalatest.Matchers 5 | 6 | /** 7 | * Created by anish on 24/01/17. 8 | */ 9 | object SparkTestUtils extends Matchers { 10 | 11 | /** 12 | * Gets absolute file path of a resource. 13 | * 14 | * @param pathInResource 15 | * @return actual path of file 16 | */ 17 | def getResourcePath(pathInResource: String): String = { 18 | getClass.getResource(pathInResource).getPath 19 | } 20 | 21 | /** 22 | * Compares two dataframes and ensures that they have the same schema (ignore nullable) and the same values 23 | * This collects both data frames in the driver, thus not suitable for very large test data. Good for unit testing. 24 | * 25 | * @param actualDF The DF we want to check for correctness 26 | * @param expectedDF The correct DF we use for comparison 27 | * @param onlySchema only compare the schemas of the dataframes 28 | */ 29 | def dfEquals(actualDF: DataFrame, expectedDF: DataFrame, onlySchema: Boolean = false): Unit = { 30 | actualDF.schema.map(f => (f.name, f.dataType)).toSet shouldBe expectedDF.schema.map(f => (f.name, f.dataType)).toSet 31 | if (!onlySchema) { 32 | actualDF.collect.map(_.toSeq.toSet).toSet shouldBe expectedDF.collect.map(_.toSeq.toSet).toSet 33 | } 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/hive/HiveSetup.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | import org.apache.spark.sql.SparkSession 4 | 5 | /** 6 | * This class creates the hive tables and loads some initial data to begin with. 7 | * 8 | * Created by anish on 24/01/17. 9 | */ 10 | object HiveSetup { 11 | 12 | /** 13 | * Create a table with already existing data. Increments that arrive should get added (updated) to this data. 14 | * This should be run once to setup the metastore 15 | * 16 | * @param spark the SparkSession object 17 | */ 18 | def loadAlreadyExistingData(spark: SparkSession): Unit = { 19 | val data = spark.read.option("header", "true").csv(Constants.pathOfAlreadyExistingData) 20 | val alreadyExistingData_df = data.toDF(data.columns.map(x => x.trim): _*) 21 | 22 | spark.catalog.setCurrentDatabase(Constants.hiveDatabaseName) 23 | 24 | 25 | spark.sql("CREATE EXTERNAL TABLE " + Constants.hiveDatabaseName + "." 
+ Constants.hiveTableName + 26 | "( member_id int" + 27 | ",name string" + 28 | ",email string" + 29 | ",joined long" + 30 | ",ip_address string" + 31 | ",posts int" + 32 | ",bday_day int" + 33 | ",bday_month int" + 34 | ",bday_year int" + 35 | ",members_profile_views int" + 36 | ",referred_by int" + 37 | " ) STORED AS AVRO" + 38 | " LOCATION '" + Constants.hiveWareHouseLocation + "'") 39 | 40 | alreadyExistingData_df 41 | .write 42 | .format("com.databricks.spark.avro") 43 | .mode("overwrite") 44 | .saveAsTable(Constants.hiveTableName) 45 | } 46 | } 47 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/hive/DemoRunner.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | import org.apache.spark.SparkConf 4 | import org.apache.spark.sql.SparkSession 5 | 6 | /** 7 | * This class orchestrates the Hive demo and calls various classes. 8 | * 9 | * Created by anish on 24/01/17. 10 | */ 11 | object DemoRunner { 12 | def main(args: Array[String]): Unit = { 13 | val spark = getSparkSession 14 | 15 | println("Setting up a Hive Metastore and load the data there.") 16 | HiveSetup.loadAlreadyExistingData(spark) 17 | 18 | // check for files loaded 19 | spark.catalog.setCurrentDatabase(Constants.hiveDatabaseName) 20 | val loaded_data = spark.table(Constants.hiveTableName) 21 | println("Loaded : " + loaded_data.count + " record(s)") 22 | // Initial files have been loaded 23 | 24 | 25 | // Load incremental data now 26 | println("Loading incremental data from " + Constants.pathOfIncrementalData) 27 | LoadToHive.loadIncrement(spark) 28 | val afterIncrementLoad_df = spark.table(Constants.hiveTableName) 29 | println("Increment load complete. 
Total " + afterIncrementLoad_df.count + " record(s)") 30 | } 31 | 32 | /** 33 | * Get the Spark Session 34 | * 35 | * @return SparkSession object 36 | */ 37 | def getSparkSession: SparkSession = { 38 | val sparkConf = new SparkConf 39 | sparkConf.set("spark.sql.crossJoin.enabled", "true") 40 | if (!sparkConf.contains("spark.master")) { 41 | sparkConf.setMaster("local[3]") 42 | } 43 | if (!sparkConf.contains("spark.app.name")) { 44 | sparkConf.setAppName("MastGlobalDataProcessing-" + getClass.getName) 45 | } 46 | SparkSession 47 | .builder() 48 | .config(sparkConf) 49 | .getOrCreate() 50 | } 51 | } 52 | -------------------------------------------------------------------------------- /src/test/resources/input_data/testData.csv: -------------------------------------------------------------------------------- 1 | member_id, name, email, joined, ip_address, posts, bday_day, bday_month, bday_year, members_profile_views, referred_by 2 | 598700,Wapno,apnok@wp.p@wp.pl,1462453725,67.71.23.95,233,0,0,0,0,0 3 | 598701,fatal,afael.rafa@hotmail.com,1462453780,187.180.176.123,1,0,0,0,0,0 4 | 598702,ulasd,nnenbba@ou@outlook.com,1462454015,67.71.23.113,3,0,0,0,0,0 5 | 598703,gdsdf,fhdfshdf@h@hotmail.com,1462454033,67.71.23.115,3,0,0,0,0,0 6 | 598882,es232,rcemola30@@gmail.com,1462468252,95.14.204.112,2,23,3,1997,0,0 7 | 980003,es232,rcemola30@@gmail.com,1462468252,67.71.23.112,2,24,3,1997,0,0 8 | 598883,pfhrt,dam987_3@h@hotmail.com,1462468259,67.71.23.103,2,0,0,0,0,0 9 | 598884,axisb,unman@hotm@hotmail.com,1462468298,67.71.23.78,1,0,0,0,0,0 10 | 598885,dikef,anos-origi@hotmail.com,1462468303,193.92.228.232,2,0,0,0,0,0 11 | 599180,redo5,a9os44@gma@gmail.com,1462491665,67.71.23.34,0,0,0,0,0,0 12 | 599181,Panda,otoskrotos@gmail.com,1462491934,67.71.23.123,3,0,0,0,0,0 13 | 599184,adeep,deepza007@@gmail.om,1462492091,67.71.23.114,0,0,0,0,0,0 14 | 599185,rafin,afaelagbra@hotmail.com,1462492196,67.71.23.4,0,0,0,0,0,599181 15 | 599186,Monge,randonmong@gmail,1462492239,67.71.23.215,0,0,0,0,0,0 16 | 599187,erois,uy@gmail.c@gmail.com,1462492311,67.71.23.44,0,0,0,0,0,599181 17 | 599188,jmtuc,mtucu@gmai@gmail.com,1462492430,190.230.223.156,1,0,0,0,0,599199 18 | 599189,Ferlu,ptic_xenon@hotmail.com,1462492650,67.71.23.152,3,0,0,0,0,599199 19 | 599190,imsta,zan.22.ozt@gmail.com,1462492670,78.165.195.23,6,0,0,0,0,0 20 | 599193,Bruws,imothy001@@icloud.com,1462492960,67.71.23.116,0,0,0,0,0,0 21 | 599194,LeCla,halajmi2@g@gmai.com,1462493191,67.71.23.183,0,0,0,0,0,599199 22 | 599198,maskv,askvirus@h@yahoo.com,1462493370,67.71.23.232,0,0,0,0,0,0 23 | 599199,wyohl,yphly@mail@mail.com,1462493395,108.209.248.132,0,0,0,0,0,0 24 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/hive/LoadToHive.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal.hive 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.{DataFrame, SparkSession} 5 | 6 | /** 7 | * Class for loading increment data to Hive tables. 8 | * This also updates old data while an increment is being loaded. 9 | * 10 | * Created by anish on 24/01/17. 
11 | */ 12 | object LoadToHive { 13 | 14 | /** 15 | * This function performs an incremental update to data that is already present 16 | * 17 | * @param spark 18 | */ 19 | def loadIncrement(spark: SparkSession): Unit = { 20 | // Create a DF out of the increment 21 | val increment_data = spark.read.option("header", "true").csv(Constants.pathOfIncrementalData) 22 | 23 | // Update the already existing data with the increment data received 24 | spark.catalog.setCurrentDatabase(Constants.hiveDatabaseName) 25 | val masterData_df = spark.table(Constants.hiveTableName) 26 | 27 | // Do an upsert - Updates old data with new data, and addes new data if it is not existing. 28 | // Member_id is used as unique key 29 | val upsert_df: DataFrame = upsert(spark, masterData_df, increment_data, "member_id") 30 | 31 | // Write upserted data to the same table (overwritten) 32 | upsert_df 33 | .write 34 | .format("com.databricks.spark.avro") 35 | .mode("overwrite") 36 | .saveAsTable(Constants.hiveTableName) 37 | } 38 | 39 | /** 40 | * Update a table with an increment data coming it. It does an update else inserts. 41 | * @param spark 42 | * @param masterData_df 43 | * @param increment_data 44 | * @param uniqueKey 45 | * @return 46 | */ 47 | def upsert(spark: SparkSession, masterData_df: DataFrame, increment_data: DataFrame, uniqueKey: String): DataFrame = { 48 | import spark.implicits._ 49 | val columns = masterData_df.columns 50 | val increment_df = increment_data.toDF(increment_data.columns.map(x => x.trim + "_i"): _*) 51 | val joined_df = masterData_df.as("m").join(increment_df.as("i"), $"m.$uniqueKey" === $"i.${uniqueKey}_i", "outer") 52 | val upsert_df = columns.foldLeft(joined_df) { 53 | (acc: DataFrame, colName: String) => 54 | acc.withColumn(colName + "_j", coalesce(col(colName + "_i"), col(colName))) 55 | .drop(colName) 56 | .drop(colName + "_i") 57 | .withColumnRenamed(colName + "_j", colName) 58 | } 59 | upsert_df 60 | } 61 | } 62 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | # Created by https://www.gitignore.io/api/scala,osx,windows,eclipse,intellij,maven 3 | 4 | ### Eclipse ### 5 | 6 | .metadata 7 | bin/ 8 | tmp/ 9 | *.tmp 10 | *.bak 11 | *.swp 12 | *~.nib 13 | local.properties 14 | .settings/ 15 | .loadpath 16 | .recommenders 17 | 18 | # Eclipse Core 19 | .project 20 | 21 | # External tool builders 22 | .externalToolBuilders/ 23 | 24 | # Locally stored "Eclipse launch configurations" 25 | *.launch 26 | 27 | # PyDev specific (Python IDE for Eclipse) 28 | *.pydevproject 29 | 30 | # CDT-specific (C/C++ Development Tooling) 31 | .cproject 32 | 33 | # JDT-specific (Eclipse Java Development Tools) 34 | .classpath 35 | 36 | # Java annotation processor (APT) 37 | .factorypath 38 | 39 | # PDT-specific (PHP Development Tools) 40 | .buildpath 41 | 42 | # sbteclipse plugin 43 | .target 44 | 45 | # Tern plugin 46 | .tern-project 47 | 48 | # TeXlipse plugin 49 | .texlipse 50 | 51 | # STS (Spring Tool Suite) 52 | .springBeans 53 | 54 | # Code Recommenders 55 | .recommenders/ 56 | 57 | ### Intellij ### 58 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio and Webstorm 59 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 60 | 61 | # User-specific stuff: 62 | .idea/**/workspace.xml 63 | .idea/**/tasks.xml 64 | 65 | # Sensitive or high-churn files: 66 | .idea/**/dataSources/ 67 | 
.idea/**/dataSources.ids 68 | .idea/**/dataSources.xml 69 | .idea/**/dataSources.local.xml 70 | .idea/**/sqlDataSources.xml 71 | .idea/**/dynamic.xml 72 | .idea/**/uiDesigner.xml 73 | 74 | # Gradle: 75 | .idea/**/gradle.xml 76 | .idea/**/libraries 77 | 78 | # Mongo Explorer plugin: 79 | .idea/**/mongoSettings.xml 80 | 81 | ## File-based project format: 82 | *.iws 83 | 84 | ## Plugin-specific files: 85 | 86 | # IntelliJ 87 | /out/ 88 | 89 | # mpeltonen/sbt-idea plugin 90 | .idea_modules/ 91 | 92 | # JIRA plugin 93 | atlassian-ide-plugin.xml 94 | 95 | # Crashlytics plugin (for Android Studio and IntelliJ) 96 | com_crashlytics_export_strings.xml 97 | crashlytics.properties 98 | crashlytics-build.properties 99 | fabric.properties 100 | 101 | ### Intellij Patch ### 102 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 103 | 104 | # *.iml 105 | # modules.xml 106 | # .idea/misc.xml 107 | # *.ipr 108 | 109 | ### Maven ### 110 | target/ 111 | pom.xml.tag 112 | pom.xml.releaseBackup 113 | pom.xml.versionsBackup 114 | pom.xml.next 115 | release.properties 116 | dependency-reduced-pom.xml 117 | buildNumber.properties 118 | .mvn/timing.properties 119 | 120 | # Exclude maven wrapper 121 | !/.mvn/wrapper/maven-wrapper.jar 122 | 123 | ### OSX ### 124 | *.DS_Store 125 | .AppleDouble 126 | .LSOverride 127 | 128 | # Icon must end with two \r 129 | Icon 130 | 131 | 132 | # Thumbnails 133 | ._* 134 | 135 | # Files that might appear in the root of a volume 136 | .DocumentRevisions-V100 137 | .fseventsd 138 | .Spotlight-V100 139 | .TemporaryItems 140 | .Trashes 141 | .VolumeIcon.icns 142 | .com.apple.timemachine.donotpresent 143 | 144 | # Directories potentially created on remote AFP share 145 | .AppleDB 146 | .AppleDesktop 147 | Network Trash Folder 148 | Temporary Items 149 | .apdisk 150 | 151 | ### Scala ### 152 | *.class 153 | *.log 154 | 155 | # sbt specific 156 | .cache 157 | .history 158 | .lib/ 159 | dist/* 160 | lib_managed/ 161 | src_managed/ 162 | project/boot/ 163 | project/plugins/project/ 164 | 165 | # Scala-IDE specific 166 | .ensime 167 | .ensime_cache/ 168 | .scala_dependencies 169 | .worksheet 170 | 171 | # ENSIME specific 172 | 173 | ### Windows ### 174 | # Windows thumbnail cache files 175 | Thumbs.db 176 | ehthumbs.db 177 | ehthumbs_vista.db 178 | 179 | # Folder config file 180 | Desktop.ini 181 | 182 | # Recycle Bin used on file shares 183 | $RECYCLE.BIN/ 184 | 185 | # Windows Installer files 186 | *.cab 187 | *.msi 188 | *.msm 189 | *.msp 190 | 191 | # Windows shortcuts 192 | *.lnk 193 | 194 | # End of https://www.gitignore.io/api/scala,osx,windows,eclipse,intellij,maven 195 | 196 | # Project related files 197 | .idea/* 198 | *.iml 199 | spark-warehouse/* 200 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Transformations using Apache Spark 2.0.0 2 | A project with examples of using few commonly used data manipulation/processing/transformation APIs in Apache Spark 2.0.0 3 | 4 | ### Tech Stack used: 5 | **Framework**: Spark v2.0.0 6 | 7 | **Programming Language**: Scala v2.11.6 8 | 9 | ### About the project 10 | The project can be loaded in IntelliJ IDEA and the class _org.anish.spark.etc.ProcessData_ can be directly run. This produces all the output. 11 | 12 | ### Code File descriptions 13 | **org.anish.spark.etc.ProcessData.scala** : Main object along with all transformations and aggregations to process data. 
Running this object (tested on a local system) should produce all the required results. 14 | The input data has the following fields: 15 | ``` 16 | member_id, name, email, joined, ip_address, posts, bday_day, bday_month, bday_year, members_profile_views, referred_by 17 | ``` 18 | A sample output is saved in SampleOutput.txt. 19 | The output of the occurrence of IP addresses grouped by their first 3 octets has been truncated at 500 rows to keep it presentable; the complete data frame is, however, saved in the Hive tables. 20 | 21 | Build with Maven: 22 | ``` 23 | mvn clean install package 24 | ``` 25 | To run the main Scala object: 26 | Data (for testing) should be in _data/allData/_ 27 | ``` 28 | java -jar target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar 29 | ``` 30 | 31 | **org.anish.spark.etl.hive.Constants.scala** : Configuration values stored as Strings in an object. Can be made externally configurable later. 32 | 33 | **org.anish.spark.etl.hive.HiveSetup.scala** : Creates the Hive tables and loads the initial data. 34 | 35 | **org.anish.spark.etl.hive.LoadToHive.scala** : Does incremental loads to Hive. Also has a function that performs an update-else-insert (upsert) on the whole data set in a Hive table. 36 | 37 | **org.anish.spark.etl.hive.DemoRunner.scala** : Runs a demo that loads initial data to Hive and then one increment. All sources are taken from the appropriate folders in the data/* directory. This requires being run from an edge node with Hive and Spark clients running and connected to a Hive Metastore and Spark server. 38 | 39 | 40 | **org.anish.spark.etl.ProcessDataTest.scala** : Test class covering the utility methods defined in the ProcessData and LoadToHive objects. 41 | 42 | ### Avro Outputs: 43 | For analyses that produce a single number or a list of numbers, such as the day with the most birthdays, the month with the fewest birthdays, and the years with the most signups, the output from the provided sample is in SampleOutput.txt, along with data frames truncated at 500 records. 44 | 45 | All queries that produce a dataset as output are saved as Avro files in the folder _spark-warehouse/_. This can be recreated by executing _java -jar target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar_ 46 | 47 | 48 | ### Running the project 49 | 1. Run _mvn clean install_ to build the project 50 | 2. Scala tests run automatically as part of the build 51 | 3. The build completes successfully 52 | 4. Run _java -jar target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar_ to produce the analysis results. This also shows the following outputs: 53 | - Most birthdays are on: 1 day(s) 54 | - Least birthdays are on: 11 month(s) 55 | 5. Continuation of output: 56 | - Email providers with more than 10K members 57 | - Posts by email providers 58 | - Year(s) with max sign ups: 2015. 59 | - Class C IP address frequency by 1st octet 60 | 6. Continuation of output: 61 | - Frequency of IP addresses based on the first 3 octets (truncated) 62 | 7. Continuation of output: 63 | - Number of referrals by members 64 | 65 | ### Hive related Demo 66 | For loading incremental data to Hive tables: 67 | This creates a table in Hive with the already existing data and loads the data that is already present. 68 | 69 | Increment Load: Loads increment data, updating rows that are already present based on member_id and appending rows that are not. (New members are added; data for existing members is updated.) For the sample data I have not partitioned or bucketed the table, since the frequency of incoming increments, the data size, and the query pattern are not known.
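
The heart of the increment load is `LoadToHive.upsert`: the master table and the increment are outer-joined on the unique key, and for every column the increment's value is preferred over the master's via `coalesce`. Below is a condensed, self-contained sketch of that idea; the function name `upsertSketch` and the simplified signature are illustrative only, not part of the project API.
```
import org.apache.spark.sql.functions.{coalesce, col}
import org.apache.spark.sql.{DataFrame, SparkSession}

// Illustrative sketch of the coalesce-based upsert implemented in LoadToHive.upsert.
def upsertSketch(spark: SparkSession, master: DataFrame, increment: DataFrame, key: String): DataFrame = {
  import spark.implicits._
  // Suffix the increment's columns so both sides survive the join unambiguously.
  val inc = increment.toDF(increment.columns.map(_ + "_i"): _*)
  // A full outer join keeps unmatched rows from both sides (new members and untouched old members).
  val joined = master.as("m").join(inc.as("i"), $"m.$key" === $"i.${key}_i", "outer")
  // For each column, take the increment's value when present, otherwise fall back to the old value.
  master.columns.foldLeft(joined) { (df, c) =>
    df.withColumn(c + "_new", coalesce(col(c + "_i"), col(c)))
      .drop(c)
      .drop(c + "_i")
      .withColumnRenamed(c + "_new", c)
  }
}
```
The merged frame is then written back over the same table with `mode("overwrite").saveAsTable(...)`, so the whole table is rewritten on every increment; this is simple and acceptable while the table stays small and unpartitioned, as noted above.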
70 | 71 | This assumes that Hive metastore is up and running. Also HiveServer2 should be running and hive client jars present. This should ideally be run from an 'edge node' of a cluster. I've tested it in Spark Local, and not on cluster mode. 72 | ``` 73 | java -cp target/spark2-etl-examples-1.0-SNAPSHOT-jar-with-dependencies.jar org.anish.spark.etl.hive.DemoRunner 74 | ``` 75 | 76 | 77 | ### Submitting to Spark Standalone 78 | ``` 79 | spark-submit --class org.anish.spark.etl.ProcessData --master local[4] \ 80 | --jars $(find '<***lib directory with spark jars***>' -name '*.jar' | xargs echo | tr ' ' ',') \ 81 | --packages com.databricks:spark-avro_2.11:3.1.0 \ 82 | spark2-etl-examples-1.0-SNAPSHOT.jar 83 | ``` 84 | 85 | Currently the source is coded to take from local as _data/all_data/_ 86 | To read from HDFS, the path should be appropriately given. Eg - _hdfs://data/all_data/_ 87 | It would automatically take HDFS path if HDFS is running on the same node. 88 | 89 | Submitting from "edge nodes" (Yarn Client Mode) 90 | ``` 91 | spark-submit --class org.anish.spark.etl.ProcessData --master yarn-client \ 92 | --jars $(find '<***lib directory with spark jars***>' -name '*.jar' | xargs echo | tr ' ' ',') \ 93 | --packages com.databricks:spark-avro_2.11:3.1.0 \ 94 | spark2-etl-examples-1.0-SNAPSHOT.jar 95 | ``` 96 | 97 | ### Use for educational purposes 98 | If you are trying to run these examples to understand Spark, and you need data, kindly have a look at the 'data' branch 99 | 100 | ___ 101 | -------------------------------------------------------------------------------- /src/test/scala/org/anish/spark/etl/ProcessDataTest.scala: -------------------------------------------------------------------------------- 1 | package org.anish.spark.etl 2 | 3 | import java.io.File 4 | 5 | import org.anish.hackerearth.mastglobal.ProcessData 6 | import org.anish.hackerearth.mastglobal.hive.LoadToHive 7 | import org.anish.spark.SparkTestUtils 8 | import org.apache.commons.io.FileUtils 9 | import org.apache.spark.SparkConf 10 | import org.apache.spark.sql.{DataFrame, SparkSession} 11 | 12 | //import org.junit.runner.RunWith 13 | //import org.scalatest.junit.JUnitRunner 14 | import org.scalatest.{BeforeAndAfter, FlatSpec, Matchers} 15 | 16 | /** 17 | * Created by anish on 24/01/17. 
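 * The suite builds a local[3] SparkSession in before {} and reads its fixture CSV from src/test/resources/input_data,
 * so it needs no external Spark or Hive services; expected results live under src/test/resources/expectedOutputs.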
18 | */ 19 | //@RunWith(classOf[JUnitRunner]) 20 | class ProcessDataTest extends FlatSpec with Matchers with BeforeAndAfter { 21 | var spark: SparkSession = _ 22 | var source_data: DataFrame = _ 23 | before { 24 | val sparkConf = new SparkConf 25 | sparkConf.set("spark.sql.crossJoin.enabled", "true") 26 | if (!sparkConf.contains("spark.master")) { 27 | sparkConf.setMaster("local[3]") 28 | } 29 | if (!sparkConf.contains("spark.app.name")) { 30 | sparkConf.setAppName("UnitTest-" + getClass.getName) 31 | } 32 | spark = SparkSession 33 | .builder() 34 | .config(sparkConf) 35 | .getOrCreate() 36 | 37 | val data = spark.read 38 | .option("header", "true") 39 | .option("inferSchema", "true") 40 | .csv(SparkTestUtils.getResourcePath("/input_data/")) 41 | source_data = data.toDF(data.columns.map(x => x.trim): _*) 42 | } 43 | behavior of "Process Data" 44 | it should "getMostFrequentBday should return the most frequent day" in { 45 | val mostFreqBday = ProcessData.getMostFrequentBday(source_data) 46 | mostFreqBday shouldBe List(23, 24) 47 | } 48 | 49 | it should "getLeastFrequentBmonth should return the least frequent month" in { 50 | val mostFreqBday = ProcessData.getLeastFrequentBmonth(source_data) 51 | mostFreqBday shouldBe List(3) 52 | } 53 | 54 | it should "clean email address" in { 55 | val cleanEmails = ProcessData.cleanEmails(source_data) 56 | val tempPath = "tmp_unitTestTemp_" + System.currentTimeMillis() 57 | cleanEmails 58 | .select("corrected_email") 59 | .write.format("com.databricks.spark.avro") 60 | .save(tempPath) 61 | 62 | val actualData = spark.read.format("com.databricks.spark.avro").load(tempPath) 63 | val expectedData = spark.read.option("header", "true").csv(SparkTestUtils.getResourcePath("/expectedOutputs/cleanedEmails")) 64 | 65 | // Check if the two DF are equal 66 | SparkTestUtils.dfEquals(actualData, expectedData) 67 | // Check and make sure that the output was generated. 
And then delete it 68 | val errorFile = new File(tempPath) 69 | errorFile.exists() shouldBe true 70 | if (errorFile.isDirectory) { 71 | FileUtils.deleteDirectory(errorFile) 72 | errorFile.exists() shouldBe false 73 | } 74 | } 75 | 76 | it should "find the year with max signups" in { 77 | val yearWithMaxSignUps = ProcessData.yearWithMaxSignUps(source_data) 78 | yearWithMaxSignUps shouldBe List(2016) 79 | } 80 | 81 | it should "find the max referrals in given set" in { 82 | val maxReferralsActual = ProcessData.maxReferrals(source_data) 83 | val maxReferralsExpected = spark.createDataFrame( 84 | Seq( 85 | ("wyohl", 3L), 86 | ("Panda", 2L) 87 | )).toDF("referred_by_name", "no_of_people_referred") 88 | 89 | SparkTestUtils.dfEquals(maxReferralsActual, maxReferralsExpected) 90 | } 91 | 92 | it should "get post by provider" in { 93 | val postsByProviderActual = ProcessData.getPostsByProvider(ProcessData.cleanEmails(source_data)) 94 | val postsByProviderExpected = spark.createDataFrame( 95 | Seq( 96 | ("Gmail", 14L), 97 | ("Hotmail", 12L), 98 | ("Yahoo", 0L) 99 | )).toDF("provider", "posts_count") 100 | 101 | SparkTestUtils.dfEquals(postsByProviderActual, postsByProviderExpected) 102 | } 103 | 104 | it should "list TLD with more than 10K members" in { 105 | val moreThan10KActual = ProcessData.findMoreThan10KTld(ProcessData.cleanEmails(source_data)) 106 | val moreThan10KExpected = spark.createDataFrame( 107 | Seq(("", 0L)) 108 | ).toDF("tld", "tld_count") 109 | SparkTestUtils.dfEquals(moreThan10KActual, moreThan10KExpected, onlySchema = true) 110 | } 111 | 112 | it should "List edu and gov email ids" in { 113 | val eduGovIdsActual = ProcessData.findEduGovEmailIds(ProcessData.cleanEmails(source_data)) 114 | val eduGovIdsExpected = spark.createDataFrame( 115 | Seq(("", 0L)) 116 | ).toDF("provider", "eduGov_Count") 117 | SparkTestUtils.dfEquals(eduGovIdsActual, eduGovIdsExpected, onlySchema = true) 118 | } 119 | 120 | it should "count occurence of class C IP" in { 121 | val classCipActual = ProcessData.classCipByFirstOctet(spark, source_data) 122 | val classCipExpected = spark.createDataFrame( 123 | Seq(("193.x.x.x", 1L)) 124 | ).toDF("ipClassC", "count_octet1") 125 | SparkTestUtils.dfEquals(classCipActual, classCipExpected, onlySchema = true) 126 | } 127 | 128 | it should "count occurence of IP by first 3 octets" in { 129 | val ipBy3OctetsActual = ProcessData.ipAddressFreqBy3Octets(source_data) 130 | val ipBy3OctetsExpected = spark.createDataFrame( 131 | Seq(("67.71.23.x", 16L), 132 | ("95.14.204.x", 1L), 133 | ("108.209.248.x", 1L), 134 | ("95.14.204.x", 1L), 135 | ("190.230.223.x", 1L), 136 | ("95.14.204.x", 1L), 137 | ("193.92.228.x", 1L), 138 | ("187.180.176.x", 1L) 139 | )).toDF("octet13", "occurrence") 140 | SparkTestUtils.dfEquals(ipBy3OctetsActual, ipBy3OctetsExpected, onlySchema = true) 141 | } 142 | 143 | behavior of "LoadToHive class" 144 | it should "Do an upsert, i.e. 
update instead of only append when new data arrives" in { 145 | val oldData = spark.createDataFrame( 146 | Seq( 147 | ("1", "Data"), 148 | ("2", "AnotherData"), 149 | ("3", "JustAnotherOldData") 150 | )).toDF("id", "data") 151 | 152 | val newData = spark.createDataFrame( 153 | Seq( 154 | ("1", "UpdatedData"), 155 | ("4", "NewData"), 156 | ("5", "AnotherNewData") 157 | )).toDF("id", "data") 158 | 159 | val expectedMergedData = spark.createDataFrame( 160 | Seq( 161 | ("1", "UpdatedData"), 162 | ("2", "AnotherData"), 163 | ("3", "JustAnotherOldData"), 164 | ("4", "NewData"), 165 | ("5", "AnotherNewData") 166 | )).toDF("id", "data") 167 | 168 | val actualMergedData = LoadToHive.upsert(spark, oldData, newData, "id") 169 | 170 | SparkTestUtils.dfEquals(actualMergedData, expectedMergedData) 171 | } 172 | } 173 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 3 | 4.0.0 4 | org.anish.spark 5 | spark2-etl-examples 6 | 1.0-SNAPSHOT 7 | 2008 8 | 9 | 2.0.0 10 | 2.11 11 | 2.11.6 12 | 13 | 14 | 15 | 16 | scala-tools.org 17 | Scala-Tools Maven2 Repository 18 | http://scala-tools.org/repo-releases 19 | 20 | 21 | 22 | 23 | 24 | scala-tools.org 25 | Scala-Tools Maven2 Repository 26 | http://scala-tools.org/repo-releases 27 | 28 | 29 | 30 | 31 | 32 | org.apache.spark 33 | spark-core_${scala.tools.version} 34 | ${spark.version} 35 | 36 | 37 | 38 | 39 | org.apache.spark 40 | spark-sql_${scala.tools.version} 41 | ${spark.version} 42 | 43 | 44 | 45 | com.databricks 46 | spark-avro_${scala.tools.version} 47 | 3.1.0 48 | 49 | 50 | 51 | 52 | org.apache.spark 53 | spark-hive_${scala.tools.version} 54 | ${spark.version} 55 | 56 | 57 | org.scala-lang 58 | scala-library 59 | ${scala.version} 60 | 61 | 62 | junit 63 | junit 64 | 4.4 65 | test 66 | 67 | 68 | org.scala-tools.testing 69 | specs 70 | 1.6.2.2_1.5.0 71 | test 72 | 73 | 74 | org.scalatest 75 | scalatest_${scala.tools.version} 76 | 2.2.5 77 | 78 | 79 | 80 | 81 | src/main/scala 82 | src/test/scala 83 | 84 | 85 | org.scala-tools 86 | maven-scala-plugin 87 | 88 | 89 | 90 | compile 91 | testCompile 92 | 93 | 94 | 95 | 96 | ${scala.version} 97 | 98 | -target:jvm-1.5 99 | 100 | 101 | 102 | 103 | org.apache.maven.plugins 104 | maven-eclipse-plugin 105 | 106 | true 107 | 108 | ch.epfl.lamp.sdt.core.scalabuilder 109 | 110 | 111 | ch.epfl.lamp.sdt.core.scalanature 112 | 113 | 114 | org.eclipse.jdt.launching.JRE_CONTAINER 115 | ch.epfl.lamp.sdt.launching.SCALA_CONTAINER 116 | 117 | 118 | 119 | 120 | org.apache.maven.plugins 121 | maven-surefire-plugin 122 | 2.19.1 123 | 124 | false 125 | 126 | 127 | 128 | 129 | 130 | org.scalatest 131 | scalatest-maven-plugin 132 | 1.0 133 | 134 | ${project.build.directory}/surefire-reports 135 | . 
136 | WDF TestSuite.txt 137 | 138 | 139 | 140 | test 141 | 142 | test 143 | 144 | 145 | 146 | 147 | 148 | 149 | 150 | 151 | 152 | org.scala-tools 153 | maven-scala-plugin 154 | 155 | ${scala.version} 156 | 157 | 158 | 159 | 160 | 161 | -------------------------------------------------------------------------------- /src/main/scala/org/anish/spark/etl/ProcessData.scala: -------------------------------------------------------------------------------- 1 | package org.anish.hackerearth.mastglobal 2 | 3 | import org.apache.spark.sql.functions._ 4 | import org.apache.spark.sql.types.{IntegerType, LongType} 5 | import org.apache.spark.sql.{DataFrame, Dataset, Row, SaveMode, SparkSession} 6 | import org.apache.spark.{SparkConf, sql} 7 | import org.joda.time.format.DateTimeFormat 8 | 9 | /** 10 | * Class to process data to get findings 11 | * 12 | * Created by anish on 24/01/17. 13 | */ 14 | object ProcessData { 15 | 16 | implicit class Regex(sc: StringContext) { 17 | def r = new util.matching.Regex(sc.parts.mkString, sc.parts.tail.map(_ => "x"): _*) 18 | } 19 | 20 | /** 21 | * This function works with the data in local system as well. 22 | * It thus reads from a folder (which can be HDFS/S3 path as well) 23 | * This can be modified to read from Hive as well. 24 | * 25 | * @param args 26 | */ 27 | def main(args: Array[String]): Unit = { 28 | val spark = getSparkSession 29 | val data_df = spark.read 30 | .option("header", "true") 31 | .option("inferSchema", "true") 32 | .csv("data/allData/") 33 | 34 | // Remove extra space in column header names and cache the source 35 | val source_df = data_df.toDF(data_df.columns.map(x => x.trim): _*) 36 | .cache 37 | 38 | 39 | val mostFrequentBday: List[Int] = getMostFrequentBday(source_df) 40 | println("Most birthdays are on: " + mostFrequentBday.mkString(",") + " day(s)") 41 | 42 | val leastFrequentBmonth: List[Int] = getLeastFrequentBmonth(source_df) 43 | println("Least birthdays are on: " + leastFrequentBmonth.mkString(",") + " month(s)") 44 | 45 | 46 | // Work with emails 47 | val emailCorrected_df: DataFrame = cleanEmails(source_df) 48 | .cache() 49 | 50 | val eduGovCount_df: Dataset[Row] = findEduGovEmailIds(emailCorrected_df) 51 | println("Email id from government and educational TLDs : ") 52 | eduGovCount_df.show(500, truncate = false) 53 | 54 | val moreThan10KTld: Dataset[Row] = findMoreThan10KTld(emailCorrected_df) 55 | println("Email providers with more than 10K : ") 56 | moreThan10KTld.show() 57 | 58 | val providerGrp_df: Dataset[Row] = getPostsByProvider(emailCorrected_df) 59 | println("Posts by email providers: ") 60 | providerGrp_df.show() 61 | 62 | emailCorrected_df.unpersist 63 | 64 | val year_maxJoined: List[Int] = yearWithMaxSignUps(source_df) 65 | // List because 2 years can experience exact same number of signups 66 | println("Year(s) with max sign ups: " + year_maxJoined.mkString(",") + ".") 67 | 68 | // Find class C ip address 69 | val classCip_df: DataFrame = classCipByFirstOctet(spark, source_df) 70 | println("Class C IP address frequency by 1st octet:") 71 | classCip_df.show(50) 72 | 73 | val ip_occurBy3octets: DataFrame = ipAddressFreqBy3Octets(source_df) 74 | println("Frequency of IP address based on first 3 octets") 75 | ip_occurBy3octets.show(500) 76 | 77 | val max_referral: DataFrame = maxReferrals(source_df) 78 | println("Number of referral by members: ") 79 | max_referral.show(500) 80 | 81 | // Save to Hive 82 | saveDFsToHive(eduGovCount_df, moreThan10KTld, providerGrp_df, classCip_df, ip_occurBy3octets, max_referral) 83 | 84 | 
// All done, now Unpersist the sourceDF 85 | source_df.unpersist 86 | } 87 | 88 | 89 | /** 90 | * Save output data to Hive tables. 91 | */ 92 | def saveDFsToHive(eduGovCount_df: Dataset[Row], moreThan10KTld: Dataset[Row], providerGrp_df: Dataset[Row], classCip_df: DataFrame, ip_occurBy3octets: DataFrame, max_referral: DataFrame): Unit = { 93 | eduGovCount_df 94 | .write 95 | .mode(SaveMode.Overwrite) 96 | .format("com.databricks.spark.avro") 97 | .saveAsTable("eduGovCount_df") 98 | moreThan10KTld 99 | .write 100 | .mode(SaveMode.Overwrite) 101 | .format("com.databricks.spark.avro") 102 | .saveAsTable("moreThan10KTld") 103 | providerGrp_df 104 | .write 105 | .mode(SaveMode.Overwrite) 106 | .format("com.databricks.spark.avro") 107 | .saveAsTable("providerGrp_df") 108 | classCip_df 109 | .write 110 | .mode(SaveMode.Overwrite) 111 | .format("com.databricks.spark.avro") 112 | .saveAsTable("classCip_df") 113 | ip_occurBy3octets 114 | .write 115 | .mode(SaveMode.Overwrite) 116 | .format("com.databricks.spark.avro") 117 | .saveAsTable("ip_occurBy3octets") 118 | max_referral 119 | .write 120 | .mode(SaveMode.Overwrite) 121 | .format("com.databricks.spark.avro") 122 | .saveAsTable("max_referral") 123 | } 124 | 125 | /** 126 | * Find ipAdress occurences by first 3 octets 127 | * @param source_df 128 | * @return 129 | */ 130 | def ipAddressFreqBy3Octets(source_df: DataFrame): DataFrame = { 131 | val ip_occurBy3octets = source_df 132 | .select("ip_address") 133 | .withColumn("octet13", concat(substring_index(col("ip_address"), ".", 3), lit(".x"))) 134 | .filter("octet13 <> '.x'") 135 | .groupBy("octet13") 136 | .agg(count("*").alias("occurrence")) 137 | .sort(desc("occurrence")) 138 | .toDF() 139 | ip_occurBy3octets 140 | } 141 | 142 | /** 143 | * Count hits from class C IP addresses 144 | * @param spark 145 | * @param source_df 146 | * @return 147 | */ 148 | def classCipByFirstOctet(spark: SparkSession, source_df: DataFrame): DataFrame = { 149 | import spark.implicits._ 150 | val classCip_df = source_df 151 | .select("ip_address") 152 | .withColumn("octet1", split(col("ip_address"), "\\.")(0)) 153 | .filter($"octet1" >= 192 && $"octet1" <= 223) 154 | .withColumn("ipClassC", concat(col("octet1"), lit(".x.x.x"))) 155 | .groupBy("ipClassC") 156 | .agg(count("*").alias("count_octet1")) 157 | .sort(desc("count_octet1")) 158 | .toDF() 159 | classCip_df 160 | } 161 | 162 | /** 163 | * Count total number of posts by Email Provider 164 | * @param emailCorrected_df 165 | * @return 166 | */ 167 | def getPostsByProvider(emailCorrected_df: DataFrame): Dataset[Row] = { 168 | val providerGrp_df = emailCorrected_df 169 | .filter("provider in ('Gmail', 'Yahoo', 'Hotmail')") 170 | .groupBy("provider") 171 | .agg(sum("posts").cast(LongType).alias("posts_count")) 172 | .sort(desc("posts_count")) 173 | providerGrp_df 174 | } 175 | 176 | /** 177 | * Emails of TLDs of more than 10K occurences 178 | * @param emailCorrected_df 179 | * @return 180 | */ 181 | def findMoreThan10KTld(emailCorrected_df: DataFrame): Dataset[Row] = { 182 | val moreThan10KTld = emailCorrected_df 183 | .groupBy("tld") 184 | .agg(count("*").alias("tld_count")) 185 | .filter("tld_count > 10000") 186 | .sort(desc("tld_count")) 187 | moreThan10KTld 188 | } 189 | 190 | /** 191 | * Count of members from edu and gov email ids 192 | * @param emailCorrected_df 193 | * @return 194 | */ 195 | def findEduGovEmailIds(emailCorrected_df: DataFrame): Dataset[Row] = { 196 | val eduGovCount_df = emailCorrected_df 197 | .filter("tld like '%gov%' OR tld like '%edu%'") 
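      // tld holds the full email domain (set in cleanEmails); this LIKE match is broad, so the next filter drops domains that merely contain 'edu'/'gov' as a substring (classified as provider 'Others').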
198 | .filter("provider <> 'Others'") // Filters out edu.*.* OR edubs.ch-> This is not edu 199 | .groupBy("provider") 200 | .agg(count("*").alias("eduGov_Count")) 201 | .sort(desc("eduGov_Count")) 202 | eduGovCount_df 203 | } 204 | 205 | /** 206 | * Count of referrals by members 207 | * @param source_df 208 | * @return 209 | */ 210 | def maxReferrals(source_df: DataFrame): DataFrame = { 211 | val rootFiltered_df = source_df 212 | .filter("referred_by <> 0 ") 213 | .select("member_id", "name", "referred_by") 214 | val referrers = source_df 215 | .select("member_id", "name") 216 | .withColumnRenamed("member_id", "referer_id") 217 | .withColumnRenamed("name", "referred_by_name") 218 | 219 | val refJoined_df = rootFiltered_df 220 | .join(referrers, rootFiltered_df("referred_by") === referrers("referer_id")) 221 | .drop("referer_id") 222 | 223 | val refferedGroups_df = refJoined_df 224 | .groupBy("referred_by_name") 225 | .agg(count("*").alias("no_of_people_referred")) 226 | .select("referred_by_name", "no_of_people_referred") 227 | .sort(desc("no_of_people_referred")) 228 | .toDF() 229 | 230 | refferedGroups_df 231 | } 232 | 233 | /** 234 | * Year(s) which had max signups 235 | * @param source_df 236 | * @return A List of Years which experienced the max signups 237 | */ 238 | def yearWithMaxSignUps(source_df: DataFrame): List[Int] = { 239 | val sql_yearFromEpoch = udf((epoch: Long) => { 240 | DateTimeFormat.forPattern("YYYY").print(epoch * 1000) 241 | }) 242 | 243 | val yrJoinedGrp_df = source_df 244 | .select("joined") 245 | .withColumn("year_joined", sql_yearFromEpoch(col("joined")).cast(IntegerType)) 246 | .groupBy("year_joined") 247 | .agg(count("*").alias("year_joined_count")) 248 | .select("year_joined", "year_joined_count") 249 | .cache 250 | 251 | val maxJoined = yrJoinedGrp_df 252 | .agg(max("year_joined_count").alias("max_year_joined_count")) 253 | .collect 254 | .head.get(0).toString 255 | 256 | val year_maxJoined = yrJoinedGrp_df 257 | .filter("year_joined_count = " + maxJoined) 258 | .select("year_joined") 259 | .collect() 260 | 261 | val asScalaList = year_maxJoined.map(x => x.getAs[Int]("year_joined")).toList 262 | 263 | yrJoinedGrp_df.unpersist 264 | asScalaList 265 | } 266 | 267 | /** 268 | * Clean Junk email addresses 269 | * @param source_df 270 | * @return 271 | */ 272 | def cleanEmails(source_df: DataFrame): sql.DataFrame = { 273 | val sql_emailCorrector = udf((email: String) => { 274 | email match { 275 | case r"(.*@)${id}.*@(.*)${dom}" => // Remove everything between multiple @ 276 | id + dom 277 | case x => 278 | if (x.contains("@") && x.split("@").length == 2) { 279 | val y = x.split("@")(1) 280 | if (!y.contains(".")) 281 | "invalid" 282 | else 283 | x // has 1 @ and dots 284 | } 285 | else "invalid" 286 | } 287 | }) 288 | 289 | val sql_tld = udf((email: String) => { 290 | if (email.contains("@")) 291 | email.split("@")(1) 292 | else 293 | email 294 | }) 295 | 296 | val sql_provider = udf((tld: String) => { 297 | if (tld.contains("gmail")) 298 | "Gmail" 299 | else if (tld.contains("yahoo")) 300 | "Yahoo" 301 | else if (tld.contains("hotmail")) 302 | "Hotmail" 303 | else if (tld.endsWith(".edu")) 304 | ".edu" 305 | else if (tld.contains(".edu.")) 306 | tld.substring(tld.indexOf(".edu.")) 307 | else if (tld.endsWith(".gov")) 308 | ".gov" 309 | else if (tld.contains(".gov.")) 310 | tld.substring(tld.indexOf(".gov.")) 311 | else { 312 | "Others" 313 | } 314 | }) 315 | 316 | val emailCorrected_df = source_df 317 | .select("email", "posts") 318 | 
.withColumn("corrected_email", sql_emailCorrector(col("email"))) 319 | .filter("corrected_email <> 'invalid'") 320 | .withColumn("tld", sql_tld(col("corrected_email"))) 321 | .withColumn("provider", sql_provider(col("tld"))) 322 | 323 | emailCorrected_df 324 | } 325 | 326 | /** 327 | * Find the least frequent birth month among all users 328 | * @param source_df 329 | * @return 330 | */ 331 | def getLeastFrequentBmonth(source_df: DataFrame): List[Int] = { 332 | // Find the least frequent birth month 333 | val bday_monthGrp = source_df 334 | .filter("bday_day >= 1 AND bday_day <= 31 AND bday_month >= 1 AND bday_month <= 12 AND bday_year > 0") // Filter out invalid values 335 | .groupBy("bday_month") 336 | .agg(count("*").alias("count_bday_month")) 337 | .select("bday_month", "count_bday_month") 338 | .cache 339 | 340 | val minCountBmonth = bday_monthGrp 341 | .agg(min("count_bday_month").alias("leastFrequentBmonth")) 342 | .collect 343 | .head.get(0).toString 344 | 345 | val leastFrequentBmonth = bday_monthGrp 346 | .filter("count_bday_month = " + minCountBmonth) // This is because there can be multiple dates having the same count 347 | .select("bday_month") 348 | .collect() 349 | 350 | val asScalaList = leastFrequentBmonth.map(x => x.getAs[Int]("bday_month")).toList 351 | 352 | bday_monthGrp.unpersist 353 | asScalaList 354 | } 355 | 356 | /** 357 | * Most frequent Birthdays of members 358 | * @param source_df 359 | * @return 360 | */ 361 | def getMostFrequentBday(source_df: DataFrame): List[Int] = { 362 | // Find the most frequent birthday 363 | val bday_dayGrp = source_df 364 | .filter("bday_day >= 1 AND bday_day <= 31 AND bday_month >= 1 AND bday_month <= 12 AND bday_year > 0") // Filter out invalid values 365 | .groupBy("bday_day") 366 | .agg(count("*").alias("count_bday_day")) 367 | .select("bday_day", "count_bday_day") 368 | .cache 369 | 370 | val maxCountBday = bday_dayGrp 371 | .agg(max("count_bday_day").alias("mostFrequentBday")) 372 | .collect 373 | .head.get(0).toString 374 | 375 | val mostFrequentBday = bday_dayGrp 376 | .filter("count_bday_day = " + maxCountBday) // This is because there can be multiple dates having the same count 377 | .select("bday_day") 378 | .toDF() 379 | .collect() 380 | val asScalaList = mostFrequentBday.map(x => x.getAs[Int]("bday_day")).toList 381 | 382 | bday_dayGrp.unpersist 383 | asScalaList 384 | } 385 | 386 | /** 387 | * Returns the SparkSession object. Manages the configs for the spark session 388 | * @return 389 | */ 390 | def getSparkSession: SparkSession = { 391 | val sparkConf = new SparkConf 392 | sparkConf.set("spark.sql.crossJoin.enabled", "true") 393 | if (!sparkConf.contains("spark.master")) { 394 | sparkConf.setMaster("local[3]") 395 | } 396 | if (!sparkConf.contains("spark.app.name")) { 397 | sparkConf.setAppName("MastGlobalDataProcessing-" + getClass.getName) 398 | } 399 | SparkSession 400 | .builder() 401 | .config(sparkConf) 402 | // .enableHiveSupport() 403 | .getOrCreate() 404 | } 405 | } --------------------------------------------------------------------------------