├── .idea
│   ├── .gitignore
│   ├── Spark_Example_Project.iml
│   ├── codeStyles
│   │   ├── Project.xml
│   │   └── codeStyleConfig.xml
│   ├── compiler.xml
│   ├── jarRepositories.xml
│   ├── misc.xml
│   ├── scala_compiler.xml
│   ├── uiDesigner.xml
│   └── vcs.xml
├── README.md
├── Spark_Example_project.iml
├── data
│   └── data.md
├── pom.xml
└── src
    └── main
        └── scala
            └── com
                └── example
                    └── App.scala
--------------------------------------------------------------------------------
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Data Engineering Hands On: Apache Spark

## Problem Description
Web server logs contain a record of every event registered/logged by the website. This data holds a lot of insights about **website visitors**, **user behavior**, **crawlers accessing the site**, **business metrics**, and **security issues**, among others.
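
A typical entry looks like the following (illustrative values; this is the combined log format that the parser in App.scala expects — client IP, timestamp, request, status code, response size, referer, and user agent):

```
203.0.113.42 - - [01/Oct/2021:10:15:32 +0000] "GET /index.html HTTP/1.1" 200 4523 "https://example.com/" "Mozilla/5.0 (iPhone; CPU iPhone OS 14_6 like Mac OS X)"
```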

In this **Bootcamp**, we will learn to use **Apache Spark** to extract useful insights from a sample **web server log**.

## Downloading the Dataset
Download the sample **web server log** dataset by clicking here.

## Project Setup
To run this project locally, follow the steps below:
1) Open the project in IntelliJ IDEA or any other IDE. (Download IntelliJ IDEA Community Edition from here.)
2) Install the Scala plugin from Settings. (For reference)
3) Place the dataset in the data directory (data/access.log).
4) Run _mvn clean install_ in the terminal to download all dependencies and build the jar. IntelliJ users can skip this step, since the IDE automatically downloads the required dependencies and builds the project.

Once you run the main class, it reads data/access.log, parses it with Apache Spark, and writes the output files to the data/logdata/ directory in Parquet format. It also prints the hourly trend table on the console.
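
If you want to sanity-check the output after a run, a minimal sketch is shown below. The path and column names come from App.scala; the object name `InspectOutput` is just for illustration and is not part of the project:

```scala
import org.apache.spark.sql.SparkSession

// Hypothetical helper (not part of the project) to sanity-check the job output.
object InspectOutput {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().appName("inspect").master("local[*]").getOrCreate()

    val logs = spark.read.parquet("data/logdata")   // directory written by App.processdata
    logs.printSchema()                              // ip, datetime, requesttype, path, status, PageSize, referer, agent
    logs.groupBy("status").count().show()           // distribution of HTTP status codes

    spark.stop()
  }
}
```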

## Interactively Running the Code in the Scala Shell
1) Set up the shell from the run configuration.\
![alt text](https://i.ibb.co/SyQy2mC/Screenshot-2021-10-01-at-10-10-41-AM.png)\

2) Select Scala REPL.\
![alt text](https://i.ibb.co/GTz1bsN/Screenshot-2021-10-01-at-10-02-20-AM.png)\
![alt text](https://i.ibb.co/cL3m0pD/Screenshot-2021-10-01-at-10-01-42-AM.png)

3) Run the Scala REPL and execute the code with _CTRL + ALT + X_ on Windows and Linux, or _CONTROL + COMMAND + X_ on macOS.

For reference, please visit.

## Frequently Asked Questions
### What is the Scala shell?
It is an interactive way to run Scala code, similar to the Python shell where Python statements are run interactively.

### What is a Parquet file?
Parquet is a highly compressed, columnar file format that is optimized for high-speed reads and is widely used in enterprise data warehouses. Further reading about Apache Parquet.
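
For example, once the job has written data/logdata/, you can explore it from the REPL; because Parquet is columnar, selecting only the columns you need avoids scanning the rest. A minimal sketch, assuming a SparkSession named `spark` is available in the shell (create one as in App.scala if not):

```scala
import org.apache.spark.sql.functions.col

val logs = spark.read.parquet("data/logdata")
// Only "path" and "status" are read from disk -- a benefit of the columnar layout.
logs.filter(col("status") === 404)
    .select("path", "status")
    .show(20, false)
```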

## You're all set. VyuWing is Happy to Help!

For doubts on the project and to learn more, get in touch with our team: info@vyuwinglearning.com
--------------------------------------------------------------------------------
/data/data.md:
--------------------------------------------------------------------------------
### Download the dataset from here, and place it in the /data directory.
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.example</groupId>
    <artifactId>Spark_Example_project</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <scala.tools.version>2.11</scala.tools.version>
        <scala.version>2.11.1</scala.version>
        <spark.version>2.4.5</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-reflect</artifactId>
            <version>${scala.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.tools.version}</artifactId>
            <version>${spark.version}</version>
            <scope>compile</scope>
        </dependency>
        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.tools.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
        </dependency>
        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.5.3</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <addClasspath>true</addClasspath>
                            <mainClass>com.example.App</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/scala/com/example/App.scala:
--------------------------------------------------------------------------------
package com.example

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.joda.time.DateTime
import org.joda.time.format.DateTimeFormat

import java.sql.Timestamp

object App {

  case class Log(ip: String, datetime: String, requesttype: String, path: String,
                 status: Integer, PageSize: String, referer: String, agent: String)

  // Read the raw access log, parse each line with a regex for the combined
  // log format, and persist the parsed records to Parquet.
  def processdata(spark: SparkSession, fileLoc: String): Unit = {
    // Matches: IP - - [timestamp] "METHOD path" status size "referer" "agent"
    val pattern = "([0-9.]+) - - \\[([^\\]]+)\\]\\s?+\\\"([A-Z]+)\\s?+([^\\\"]+)\\\"\\s?+([0-9]+)\\s?+([0-9]+)\\s?+\\\"([^\\\"]+)\\\"\\s?+\\\"([^\\\"]+)\\\"".r

    val rdd = spark.sparkContext.textFile(fileLoc, 30) // read with 30 partitions
    val parsed = rdd.flatMap { line =>
      // Lines that do not match the pattern are silently dropped.
      pattern.findFirstMatchIn(line).map { m =>
        Log(m.group(1), m.group(2), m.group(3), m.group(4),
          Integer.parseInt(m.group(5)), m.group(6), m.group(7), m.group(8))
      }
    }

    val df = spark.createDataFrame(parsed)
    df.coalesce(3).write.option("compression", "none").mode("overwrite").parquet("data/logdata/")
  }

  // Flag bot traffic and mobile user agents; a request counts as Android/iPhone
  // only when it has not already been classified as a bot.
  def botsAndMobileFeatures(df: Dataset[Row], spark: SparkSession): DataFrame = {
    df.withColumn("isBot", when(lower(col("agent")).contains("bot"), true).otherwise(false))
      .withColumn("isAndroid", when(col("isBot") === false && col("agent").contains("Android"), true).otherwise(false))
      .withColumn("isIphone", when(col("isBot") === false && col("agent").contains("iPhone"), true).otherwise(false))
  }

  // Parse a log timestamp such as "01/Oct/2021:10:15:32 +0000" into a java.sql.Timestamp.
  def parsedatetime(datetime: String): Timestamp = {
    val dtFmt = DateTimeFormat.forPattern("dd/MMM/yyyy:HH:mm:ss Z")
    val dt = DateTime.parse(datetime, dtFmt)
    new Timestamp(dt.getMillis)
  }

  // Derive calendar features (hour, date, day of week) from the raw datetime string.
  def processDatetimeFeatures(df: Dataset[Row], spark: SparkSession): Dataset[Row] = {
    val parsedate = udf((dt: String) => parsedatetime(dt))
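    // The UDF ships parsedatetime to the executors and applies it row by row,
    // replacing the raw string with a proper Timestamp column so that the
    // built-in hour/to_date/dayofweek functions below can operate on it.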
    df.withColumn("datetime", parsedate(col("datetime")))
      .withColumn("hour", hour(col("datetime")))
      .withColumn("date", to_date(col("datetime")))
      .withColumn("dayofWeek", dayofweek(col("datetime")))
  }

  def main(args: Array[String]): Unit = {
    // Local SparkSession used to build and run the code.
    val spark = SparkSession.builder().appName("test").master("local[4]").getOrCreate()

    // For Windows users:
    /*
     * System.setProperty("hadoop.home.dir", "")
     * val spark = SparkSession.builder().appName("test").config("spark.driver.memory", "6g")
     *   .config("spark.testing.memory", "1g").master("local[*]").getOrCreate()
     */

    processdata(spark, "data/access.log")

    val df = spark.read.parquet("data/logdata")
    df.select("agent").show(50, false)

    var df1 = botsAndMobileFeatures(df, spark)
    df1.show()
    df1 = processDatetimeFeatures(df1, spark)
    df1.show()

    // Spark SQL API: register a temp view and query the hourly trend.
    df1.createOrReplaceTempView("logdata")

    val df2 = spark.sql("select date, hour, count(1) from logdata group by date, hour order by date, hour")
    df2.show()

    spark.stop()
  }
}
--------------------------------------------------------------------------------