├── .idea
│   ├── .gitignore
│   ├── Spark_Example_Project.iml
│   ├── codeStyles
│   │   ├── Project.xml
│   │   └── codeStyleConfig.xml
│   ├── compiler.xml
│   ├── jarRepositories.xml
│   ├── misc.xml
│   ├── scala_compiler.xml
│   ├── uiDesigner.xml
│   └── vcs.xml
├── README.md
├── Spark_Example_project.iml
├── data
│   └── data.md
├── pom.xml
└── src
    └── main
        └── scala
            └── com
                └── example
                    └── App.scala
/.idea/.gitignore:
--------------------------------------------------------------------------------
# Default ignored files
/shelf/
/workspace.xml
--------------------------------------------------------------------------------
/.idea/Spark_Example_Project.iml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/codeStyles/Project.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/codeStyles/codeStyleConfig.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/compiler.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/jarRepositories.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/misc.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/scala_compiler.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/uiDesigner.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/.idea/vcs.xml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Data Engineering Hands-On: Apache Spark

## Problem Description
Web server logs record every event that was registered/logged with the website. This data carries a lot of information about **website visitors**, their **behavior**, **crawlers accessing the site**, **business insights**, **security issues**, etc.

In this **Bootcamp**, we will learn to use **Apache Spark** to extract useful insights from a sample **Web Server Log**.

## Downloading the Dataset
Download the sample **Web Server Log** dataset by clicking here.

## Project Setup
To run this project locally, follow these steps:
1) Open it in IntelliJ or any other IDE. (Download the IntelliJ Community Edition from here.)
2) Install the Scala plugin from Settings. (For reference)
3) Place the dataset in the data directory (data/access.log).
4) Run _mvn clean install_ in the terminal to download all dependencies and build the jar. IntelliJ users can skip this step, as the IDE automatically downloads the requisite dependencies and creates the build.

Once you run the main class, it reads the data/access.log file, parses it with Apache Spark, writes the output to the data/logdata/ directory in Parquet format, and prints the hourly trend table on the console; a sketch of that query follows.
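The hourly trend is computed through Spark's SQL API. The sketch below is lifted from src/main/scala/com/example/App.scala; it assumes `spark` is an active SparkSession and `df1` is the enriched DataFrame built in that file:

```scala
// Register the enriched DataFrame as a temporary view, then aggregate per date and hour.
df1.createOrReplaceTempView("logdata")
val hourlyTrend = spark.sql(
  "select date, hour, count(1) from logdata group by date, hour order by date, hour")
hourlyTrend.show() // the hourly trend table printed on the console
```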

## Interactively Running the Code in the Scala Shell
1) Set up the shell from the run configuration.

2) Select Scala REPL.


3) Run the Scala REPL and execute the code with _CTRL + ALT + X_ on Windows and Linux, or _CONTROL + COMMAND + X_ on macOS.

For reference, please visit.
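Once the REPL is up, you can explore the generated output interactively. A minimal sketch, assuming a previous run has already written the Parquet files under data/logdata:

```scala
// `spark` is the SparkSession available in the REPL session.
val df = spark.read.parquet("data/logdata")
df.printSchema()                     // columns parsed from the access log
df.groupBy("status").count().show() // e.g. the distribution of HTTP status codes
```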

## Frequently Asked Questions
### What is the Scala shell?
It is an interactive way to run Scala code, similar to the Python shell, where statements are evaluated one at a time and results are printed immediately.
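For example, evaluating an expression prints its type and value right away:

```scala
scala> val doubled = List(1, 2, 3).map(_ * 2)
doubled: List[Int] = List(2, 4, 6)
```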

### What is a Parquet file?
Parquet is a highly compressed, columnar file format optimized for high-speed reads; it is prominently used in enterprise data warehouses. Further reading: Apache Parquet.
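In Spark, writing and reading Parquet each take a single line. This sketch mirrors what processdata in App.scala does, assuming `df` is any DataFrame:

```scala
df.write.mode("overwrite").parquet("data/logdata/") // columnar, compressed on disk
val back = spark.read.parquet("data/logdata/")      // the schema travels with the file
```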
38 |
39 |
40 | ## You're all set. VyuWing is Happy to Help!
41 |
42 | For doubts on the project and to learn more, get in touch with our team : info@vyuwinglearning.com
43 |
--------------------------------------------------------------------------------
/Spark_Example_project.iml:
--------------------------------------------------------------------------------
(XML content not captured)
--------------------------------------------------------------------------------
/data/data.md:
--------------------------------------------------------------------------------
### Download the dataset from here, and place it in the /data directory.
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>com.example</groupId>
    <artifactId>Spark_Example_project</artifactId>
    <version>1.0-SNAPSHOT</version>

    <properties>
        <!-- The two "8" values in the dump are taken to be the Java source/target levels. -->
        <maven.compiler.source>8</maven.compiler.source>
        <maven.compiler.target>8</maven.compiler.target>
        <scala.tools.version>2.11</scala.tools.version>
        <scala.version>2.11.1</scala.version>
        <spark.version>2.4.5</spark.version>
    </properties>

    <dependencies>
        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-library</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.scala-lang</groupId>
            <artifactId>scala-reflect</artifactId>
            <version>${scala.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-sql_${scala.tools.version}</artifactId>
            <version>${spark.version}</version>
            <scope>compile</scope>
        </dependency>

        <dependency>
            <groupId>org.apache.spark</groupId>
            <artifactId>spark-core_${scala.tools.version}</artifactId>
            <version>${spark.version}</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-core</artifactId>
            <version>1.2.1</version>
        </dependency>

        <dependency>
            <groupId>org.apache.hadoop</groupId>
            <artifactId>hadoop-hdfs</artifactId>
            <version>3.0.0</version>
            <scope>provided</scope>
        </dependency>
    </dependencies>

    <build>
        <sourceDirectory>src/main/scala</sourceDirectory>
        <testSourceDirectory>src/test/scala</testSourceDirectory>
        <plugins>
            <plugin>
                <groupId>net.alchim31.maven</groupId>
                <artifactId>scala-maven-plugin</artifactId>
                <version>4.5.3</version>
                <executions>
                    <execution>
                        <id>scala-compile-first</id>
                        <phase>process-resources</phase>
                        <goals>
                            <goal>add-source</goal>
                            <goal>compile</goal>
                        </goals>
                    </execution>
                    <execution>
                        <id>scala-test-compile</id>
                        <phase>process-test-resources</phase>
                        <goals>
                            <goal>testCompile</goal>
                        </goals>
                    </execution>
                </executions>
            </plugin>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-jar-plugin</artifactId>
                <version>3.2.0</version>
                <configuration>
                    <archive>
                        <manifest>
                            <!-- The bare "true" in the dump is taken to be addClasspath. -->
                            <addClasspath>true</addClasspath>
                            <mainClass>com.example.App</mainClass>
                        </manifest>
                    </archive>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
--------------------------------------------------------------------------------
/src/main/scala/com/example/App.scala:
--------------------------------------------------------------------------------
package com.example

import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.sql.functions._
import org.joda.time.DateTime
import org.joda.time.format.DateTimeFormat

import java.sql.Timestamp

object App {

  // One parsed line of the access log.
  case class Log(ip: String, datetime: String, requesttype: String, path: String,
                 status: Integer, PageSize: String, referer: String, agent: String)

  // Read the raw log, parse each line with a regex, and write the result as Parquet.
  def processdata(spark: SparkSession, fileLoc: String) = {
    val rdd = spark.sparkContext.textFile(fileLoc, 30)
    val rdd1 = rdd.map(line => {
      // Combined log format: ip - - [datetime] "METHOD path" status size "referer" "agent"
      val pattern = "([0-9.]+) - - \\[([^\\]]+)\\]\\s?+\\\"([A-Z]+)\\s?+([^\\\"]+)\\\"\\s?+([0-9]+)\\s?+([0-9]+)\\s?+\\\"([^\\\"]+)\\\"\\s?+\\\"([^\\\"]+)\\\"".r
      val groups = pattern.findAllIn(line)
      if (groups.hasNext) {
        Log(groups.group(1), groups.group(2), groups.group(3), groups.group(4),
          Integer.parseInt(groups.group(5)), groups.group(6), groups.group(7), groups.group(8))
      } else
        null // lines that do not match the pattern are dropped below
    }).filter(x => x != null)

    val df = spark.createDataFrame(rdd1)
    df.coalesce(3).write.option("compression", "none").mode("overwrite").parquet("data/logdata/")
  }

  // Flag bot traffic and, for non-bots, Android and iPhone user agents.
  def botsAndMobileFeatures(df: Dataset[Row], spark: SparkSession): DataFrame = {
    df.withColumn("isBot", when(lower(col("agent")).contains("bot"), true).otherwise(false))
      .withColumn("isAndroid", when(col("isBot") === false && col("agent").contains("Android"), true).otherwise(false))
      .withColumn("isIphone", when(col("isBot") === false && col("agent").contains("iPhone"), true).otherwise(false))
  }

  // Parse the log's datetime string (dd/MMM/yyyy:HH:mm:ss Z) into a Timestamp.
  def parsedatetime(datetime: String): Timestamp = {
    val dtFmt = DateTimeFormat.forPattern("dd/MMM/yyyy:HH:mm:ss Z")
    val dt = DateTime.parse(datetime, dtFmt)
    new Timestamp(dt.getMillis)
  }

  // Derive hour, date, and day-of-week columns from the parsed timestamp.
  def processDatetimeFeatures(df: Dataset[Row], spark: SparkSession): Dataset[Row] = {
    val parsedate = udf((dt: String) => parsedatetime(dt))
    df.withColumn("datetime", parsedate(col("datetime")))
      .withColumn("hour", hour(col("datetime")))
      .withColumn("date", to_date(col("datetime")))
      .withColumn("dayofWeek", dayofweek(col("datetime")))
  }

  def main(args: Array[String]): Unit = {
    // Use this spark variable as the Spark session to build and run the code locally.
    val spark = SparkSession.builder().appName("test").master("local[4]").getOrCreate()

    // For Windows users:
    /**
     * System.setProperty("hadoop.home.dir", "")
     * val spark = SparkSession.builder().appName("test").config("spark.driver.memory", "6g")
     *   .config("spark.testing.memory", "1g").master("local[*]").getOrCreate()
     */
    processdata(spark, "data/access.log")

    // Read the Parquet output back and enrich it with derived columns.
    val df = spark.read.parquet("data/logdata")
    df.select("agent").show(50, false)
    var df1 = botsAndMobileFeatures(df, spark)
    df1.show()
    df1 = processDatetimeFeatures(df1, spark)
    df1.show()

    /**
     * SQL API of Spark
     */
    df1.createOrReplaceTempView("logdata")

    // Hourly trend: request counts per date and hour.
    val df2 = spark.sql("select date, hour, count(1) from logdata group by date, hour order by date, hour")
    df2.show()

    println("hello")

    // Release local Spark resources when done.
    spark.stop()
  }
}
--------------------------------------------------------------------------------