├── .gitignore
├── README.md
├── build.sbt
├── config
│   ├── test_linux
│   │   ├── application.conf
│   │   └── log4j.properties
│   └── test_windows
│       ├── application.conf
│       └── log4j.properties
├── insight_data
│   ├── patients.csv
│   ├── products.csv
│   ├── products_suppliers.csv
│   └── supplier.csv
├── problem_scenarios
│   ├── patient_data.md
│   ├── products.md
│   └── rdd_operations.md
├── project
│   ├── assembly.sbt
│   └── build.properties
├── scripts
│   ├── create_products_parquet_table_in_hive.sh
│   └── create_products_table_in_hive.sh
├── src
│   └── main
│       └── scala
│           └── com
│               └── jwk
│                   └── development
│                       └── big_data_insights
│                           └── scala
│                               └── products
│                                   ├── driver
│                                   │   └── run_problem_scenario_part_One.scala
│                                   └── problem_scenario
│                                       └── part_One.scala
└── wiki_data
    └── screenshots
        └── how_to_set_spark_master_to_local_in _intellij.PNG
/.gitignore:
--------------------------------------------------------------------------------
1 | RemoteSystemsTempFiles/
2 | Servers/
3 | target/
4 | logs/
5 | .metadata/
6 | bin/
7 | tmp/
8 | *.tmp
9 | *.bak
10 | *.swp
11 | *~.nib
12 | local.properties
13 | .settings/
14 | .loadpath
15 | .recommenders
16 | .idea/
17 | .project
18 | classes/
19 | .classpath
20 | .iml
21 | *_SUCCESS*
22 | *.crc
23 |
24 | # External tool builders
25 | .externalToolBuilders/
26 |
27 | # Locally stored "Eclipse launch configurations"
28 | *.launch
29 |
30 | # PyDev specific (Python IDE for Eclipse)
31 | *.pydevproject
32 |
33 | # CDT-specific (C/C++ Development Tooling)
34 | .cproject
35 |
36 | # Java annotation processor (APT)
37 | .factorypath
38 |
39 | # PDT-specific (PHP Development Tools)
40 | .buildpath
41 |
42 | # sbteclipse plugin
43 | .target
44 |
45 | # Tern plugin
46 | .tern-project
47 |
48 | # TeXlipse plugin
49 | .texlipse
50 |
51 | # STS (Spring Tool Suite)
52 | .springBeans
53 |
54 | # Code Recommenders
55 | .recommenders/
56 |
57 | # Scala IDE specific (Scala & Java development for Eclipse)
58 | .cache-main
59 | .scala_dependencies
60 | .worksheet
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # big-data-insights-scala
2 | Personal solutions to big data problem scenarios using Scala.
3 |
4 | ## Problem Scenarios
5 |
6 | ### 1. [Product Data for a pen company](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/products.md)
7 |
8 | ### 2. [Patient Data](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/patient_data.md)
9 |
10 | ### 3. [RDD (resilient distributed dataset) Operations](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/rdd_operations.md)
11 |
12 |
13 | ## Troubleshooting
14 | 1. If the following error occurs when running an application: *A master URL must be set in your configuration*
15 | ```
16 | Exception in thread "main" java.lang.ExceptionInInitializerError
17 | at com.jwk.development.big_data_insights.scala.products.driver.problem_scenario_1.main(problem_scenario_1.scala)
18 | Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
19 | ```
20 |
21 | Solution:
22 |
23 | Add the following VM option to your run configurations
24 | ```
25 | -Dspark.master=local
26 | ```
27 | [How to set spark master to local in intellij](https://github.com/jwkimani/big-data-insights-scala/blob/master/wiki_data/screenshots/how_to_set_spark_master_to_local_in%20_intellij.PNG)
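
As an alternative to the VM option (not how this repository's driver is set up), the master can also be set programmatically when building the `SparkSession`; a minimal sketch:

```
import org.apache.spark.sql.SparkSession

// local[*] runs Spark locally using all available cores;
// this replaces the -Dspark.master=local VM option.
val spark = SparkSession.builder
  .appName("big-data-insights-scala")
  .master("local[*]")
  .getOrCreate()
```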
28 |
29 |
--------------------------------------------------------------------------------
/build.sbt:
--------------------------------------------------------------------------------
1 | name := "big-data-insights-scala"
2 |
3 | version := "1.0"
4 |
5 | scalaVersion := "2.11.8"
6 |
7 | libraryDependencies ++= Seq(
8 | "org.apache.hadoop" % "hadoop-client" % "2.7.3",
9 | ("org.apache.spark" % "spark-core_2.11" % "2.1.0"),
10 | ("org.apache.spark" % "spark-sql_2.11" % "2.1.0"),
11 | "org.apache.spark" % "spark-hive_2.11" % "2.1.0",
12 | "com.databricks" % "spark-avro_2.11" % "3.2.0",
13 | "com.databricks" % "spark-csv_2.10" % "1.3.0",
14 | "org.scala-lang" % "scala-library" % "2.11.8",
15 | "org.scala-lang" % "scala-reflect" % "2.11.8",
16 | "com.typesafe" % "config" % "1.3.1",
17 | "org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1",
18 | "org.apache.logging.log4j" % "log4j-core" % "2.8.1",
19 | "org.apache.kafka" %% "kafka" % "0.9.0.2.3.4.51-1"
20 |
21 | )
22 | //use external repositories
23 | resolvers += "HortonWorksRepo" at "http://repo.hortonworks.com/content/repositories/releases/"
24 |
25 | parallelExecution in test := false
26 |
27 |
28 | initialCommands := "import org.test._"
29 |
30 | //clean operations
31 | cleanFiles += baseDirectory { base => base / "build" }.value
32 | cleanFiles += baseDirectory { base => base / "metastore_db" }.value
33 |
34 | //assembly-settings
--------------------------------------------------------------------------------
/config/test_linux/application.conf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/config/test_linux/application.conf
--------------------------------------------------------------------------------
/config/test_linux/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set root logger level to ERROR and its only appender to A1.
2 | log4j.rootLogger=ERROR, A1
3 | # If we get chained appenders, this stops the message being written multiple times
4 | log4j.additivity.org.apache=false
5 | log4j.additivity.xdasLogger=false
6 | # A1 is set to be a ConsoleAppender.
7 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
8 | log4j.appender.A1.Target=System.out
9 | # A1 uses PatternLayout.
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
12 |
13 |
--------------------------------------------------------------------------------
/config/test_windows/application.conf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/config/test_windows/application.conf
--------------------------------------------------------------------------------
/config/test_windows/log4j.properties:
--------------------------------------------------------------------------------
1 | # Set root logger level to ERROR and its only appender to A1.
2 | log4j.rootLogger=ERROR, A1
3 | # If we get chained appenders, this stops the message being written multiple times
4 | log4j.additivity.org.apache=false
5 | log4j.additivity.xdasLogger=false
6 | # A1 is set to be a ConsoleAppender.
7 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
8 | log4j.appender.A1.Target=System.out
9 | # A1 uses PatternLayout.
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout
11 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n
12 |
13 |
--------------------------------------------------------------------------------
/insight_data/patients.csv:
--------------------------------------------------------------------------------
1 | patientID,name ,address ,dateOfBirth,lastVisitDate
2 | 1001 ,Homer Simpson ,"123 Blue St.,Los Angeles, CA 12345" ,1989-12-31 ,2017-01-21
3 | 1002 ,Peter Griffin ,"234 Brown St., San Fransisco, CA 23456",1950-01-30 ,2015-04-18
4 | 1003 ,Hubert J. Fansworth,"546 Red Dr., Sacramento, CA 54678" ,1978-08-21 ,2017-02-14
5 | 1004 ,Marge Simpson ,"123 Blue St.,Los Angeles, CA 12345" ,1990-03-18 ,2016-02-15
6 | 1005 ,Bender Rodriguez ,"127 Brown St., Charlotte, NC 28223" ,1986-12-31 ,2013-12-14
7 | 1006 ,Turanga Leela ,"128 Brown St., Charlotte, NC 28223" ,1978-08-21 ,2012-09-15
8 |
--------------------------------------------------------------------------------
/insight_data/products.csv:
--------------------------------------------------------------------------------
1 | productID productCode name quantity price supplierid
2 | 1001 PEN Pen Red 5000 1.23 501
3 | 1002 PEN Pen Blue 8001 1.25 501
4 | 1003 PEN Pen Black 2000 1.25 501
5 | 1004 PEC Pencil 2B 10000 0.48 502
6 | 1005 PEC Pencil 2H 8000 0.49 502
7 | 1006 PEC Pencil HB 0 9999.99 502
8 | 2001 PEC Pencil 3B 500 0.52 501
9 | 2002 PEC Pencil 4B 200 0.62 501
10 | 2003 PEC Pencil 5B 100 0.73 501
11 | 2004 PEC Pencil 6B 500 0.47 502
12 |
--------------------------------------------------------------------------------
/insight_data/products_suppliers.csv:
--------------------------------------------------------------------------------
1 | productID,supplierID
2 | 2001 ,501
3 | 2002 ,501
4 | 2003 ,501
5 | 2004 ,502
6 | 2001 ,503
7 |
--------------------------------------------------------------------------------
/insight_data/supplier.csv:
--------------------------------------------------------------------------------
1 | supplierID,name ,phone
2 | 501 ,ABC Traders,88881111
3 | 502 ,XYZ Company,88882222
4 | 503 ,QQ Corp ,88883333
5 | 504 ,DEG LLC ,88884444
6 | 505 ,FGH Limited,88885555
7 |
--------------------------------------------------------------------------------
/problem_scenarios/patient_data.md:
--------------------------------------------------------------------------------
1 | # Problem Scenario 3
2 |
3 | __Problem:__ **
4 |
5 | __Package name:__ **
6 |
7 | __Driver/Main class:__ **
8 |
9 | You have been given the following file containing patient data:
10 | [patients.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/patients.csv)
11 |
12 | Accomplish the following activities (a loading and query sketch follows the list):
13 |
14 | 1. Find all the patients whose lastVisitDate is between '2012-09-15' and the current date
15 |
16 | ```
17 |
18 | ```
19 |
20 | 2. Find all the patients who were born in 1990
21 |
22 | ```
23 |
24 | ```
25 | 3. Find the age of each patient
26 |
27 | ```
28 |
29 | ```
30 |
31 | 4. List patients whose last visit was more than 60 days ago
32 |
33 | ```
34 |
35 | ```
36 |
37 | 5. Select patients 18 years old or younger
38 |
39 | ```
40 |
41 | ```
42 |
43 |
44 |
45 |
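No solution module exists for this scenario in the repository yet. A rough sketch of loading the file and answering the first two queries with SparkSQL (the view name and date handling are illustrative; the raw fields contain trailing spaces, hence the TRIM calls):

```
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder.appName("patients_application").getOrCreate()

// patients.csv is comma-delimited with a header row
val patientDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("insight_data/patients.csv")
patientDF.createOrReplaceTempView("patients")

// 1. Patients whose lastVisitDate is between '2012-09-15' and the current date
spark.sql(
  """SELECT * FROM patients
    |WHERE TO_DATE(TRIM(lastVisitDate)) BETWEEN '2012-09-15' AND CURRENT_DATE""".stripMargin).show()

// 2. Patients born in 1990
spark.sql("SELECT * FROM patients WHERE YEAR(TO_DATE(TRIM(dateOfBirth))) = 1990").show()
```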
--------------------------------------------------------------------------------
/problem_scenarios/products.md:
--------------------------------------------------------------------------------
1 | # Problem Scenario 1
2 |
3 | __Problem:__ *Given csv files with product information from a pen company, provide some insights using big data technologies*
4 |
5 | __Package name:__ *[com.jwk.development.big_data_insights.scala.products](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products)*
6 |
7 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/driver)*
8 |
9 | __Solution Package__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario)*
10 |
11 |
12 | This problem has three parts; hence the solutions are broken into three different modules in the package:
13 | 1. [Part One](#part-one)
14 |
15 | 2. [Part Two](#part-two)
16 |
17 | 3. [Part Three](#part-three)
18 |
19 | ## Part One
20 |
21 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver](https://github.com/jwkimani/big-data-insights-scala/blob/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/driver/run_problem_scenario_part_One.scala)*
22 |
23 | __Solution Package__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario](https://github.com/jwkimani/big-data-insights-scala/blob/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario/part_One.scala)*
24 |
25 | You have the following tab delimited csv file [products.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/products.csv)
26 |
27 | Using Spark and SparkSQL perform the following tasks:
28 | 1. Load the csv file to a dataframe
29 |
30 | Using schema:
31 | ```
32 | val schema =
33 | StructType(
34 | Array(
35 | StructField("productID", IntegerType, false),
36 | StructField("productCode", StringType, false),
37 | StructField("name", StringType, false),
38 | StructField("quantity", IntegerType, false),
39 | StructField("price", FloatType, false)
40 | )
41 | )
42 | ```
43 | Approach:
44 | ```
45 | val productDF = sqlContext.read.format("com.databricks.spark.csv").option("delimiter","\t").option("header","true").option("inferSchema", "false").schema(schema).load(path)
46 | ```
47 |
48 | 2. Create a global temporary view named `products` from the dataframe
49 |
50 | `productDF.createGlobalTempView("products")`
51 |
52 | 3. Using the global temporary view, perform the tasks below (queries reference it as `global_temp.products`)
53 |
54 | 4. Select and show all the records with quantity >= 5000 and name starts with 'Pen'
55 | ```
56 | +---------+-----------+---------+--------+-----+
57 | |productID|productCode| name|quantity|price|
58 | +---------+-----------+---------+--------+-----+
59 | | 1001| PEN| Pen Red| 5000| 1.23|
60 | | 1002| PEN| Pen Blue| 8001| 1.25|
61 | | 1004| PEC|Pencil 2B| 10000| 0.48|
62 | | 1005| PEC|Pencil 2H| 8000| 0.49|
63 | +---------+-----------+---------+--------+-----+
64 | ```
65 |
66 | 5. Select and show all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'
67 | ```
68 | +---------+-----------+---------+--------+-----+
69 | |productID|productCode| name|quantity|price|
70 | +---------+-----------+---------+--------+-----+
71 | | 1001| PEN| Pen Red| 5000| 1.23|
72 | | 1004| PEC|Pencil 2B| 10000| 0.48|
73 | | 1005| PEC|Pencil 2H| 8000| 0.49|
74 | +---------+-----------+---------+--------+-----+
75 | ```
76 |
77 | 6. Select and show all the records that do not have both quantity >= 5000 and a name starting with 'Pen'
78 | ```
79 | +---------+-----------+---------+--------+-------+
80 | |productID|productCode| name|quantity| price|
81 | +---------+-----------+---------+--------+-------+
82 | | 1003| PEN|Pen Black| 2000| 1.25|
83 | | 1006| PEC|Pencil HB| 0|9999.99|
84 | | 2001| PEC|Pencil 3B| 500| 0.52|
85 | | 2002| PEC|Pencil 4B| 200| 0.62|
86 | | 2003| PEC|Pencil 5B| 100| 0.73|
87 | | 2004| PEC|Pencil 6B| 500| 0.47|
88 | +---------+-----------+---------+--------+-------+
89 | ```
90 |
91 | 7. Select and show all the products whose name is 'Pen Red' or 'Pen Black'
92 | ```
93 | +---------+-----------+---------+--------+-----+
94 | |productID|productCode| name|quantity|price|
95 | +---------+-----------+---------+--------+-----+
96 | | 1001| PEN| Pen Red| 5000| 1.23|
97 | | 1003| PEN|Pen Black| 2000| 1.25|
98 | +---------+-----------+---------+--------+-----+
99 | ```
100 | 8. Select and show all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000
101 | ```
102 | +---------+-----------+---------+--------+-----+
103 | |productID|productCode| name|quantity|price|
104 | +---------+-----------+---------+--------+-----+
105 | | 1003| PEN|Pen Black| 2000| 1.25|
106 | +---------+-----------+---------+--------+-----+
107 | ```
108 |
109 | 9. Select all the products whose product code is null
110 |
111 | ```
112 | +---------+-----------+----+--------+-----+
113 | |productID|productCode|name|quantity|price|
114 | +---------+-----------+----+--------+-----+
115 | +---------+-----------+----+--------+-----+
116 | ```
117 |
118 | 10. Select all the products whose name starts with 'Pen'; results should be ordered by price in descending order.
119 |
120 | ```
121 | +---------+-----------+---------+--------+-------+
122 | |productID|productCode| name|quantity| price|
123 | +---------+-----------+---------+--------+-------+
124 | | 1006| PEC|Pencil HB| 0|9999.99|
125 | | 1003| PEN|Pen Black| 2000| 1.25|
126 | | 1002| PEN| Pen Blue| 8001| 1.25|
127 | | 1001| PEN| Pen Red| 5000| 1.23|
128 | | 2003| PEC|Pencil 5B| 100| 0.73|
129 | | 2002| PEC|Pencil 4B| 200| 0.62|
130 | | 2001| PEC|Pencil 3B| 500| 0.52|
131 | | 1005| PEC|Pencil 2H| 8000| 0.49|
132 | | 1004| PEC|Pencil 2B| 10000| 0.48|
133 | | 2004| PEC|Pencil 6B| 500| 0.47|
134 | +---------+-----------+---------+--------+-------+
135 | ```
136 |
137 | 11. Select all the products whose name starts with 'Pen'; results should be ordered by price in descending order and quantity in ascending order.
138 |
139 | ```
140 | +---------+-----------+---------+--------+-------+
141 | |productID|productCode| name|quantity| price|
142 | +---------+-----------+---------+--------+-------+
143 | | 1006| PEC|Pencil HB| 0|9999.99|
144 | | 1003| PEN|Pen Black| 2000| 1.25|
145 | | 1002| PEN| Pen Blue| 8001| 1.25|
146 | | 1001| PEN| Pen Red| 5000| 1.23|
147 | | 2003| PEC|Pencil 5B| 100| 0.73|
148 | | 2002| PEC|Pencil 4B| 200| 0.62|
149 | | 2001| PEC|Pencil 3B| 500| 0.52|
150 | | 1005| PEC|Pencil 2H| 8000| 0.49|
151 | | 1004| PEC|Pencil 2B| 10000| 0.48|
152 | | 2004| PEC|Pencil 6B| 500| 0.47|
153 | +---------+-----------+---------+--------+-------+
154 | ```
155 |
156 | 12. Select top 2 products by price
157 |
158 | ```
159 | +---------+-----------+---------+--------+-------+
160 | |productID|productCode| name|quantity| price|
161 | +---------+-----------+---------+--------+-------+
162 | | 1006| PEC|Pencil HB| 0|9999.99|
163 | | 1002| PEN| Pen Blue| 8001| 1.25|
164 | +---------+-----------+---------+--------+-------+
165 | ```
166 |
167 |
168 | 13. Select all the columns from product table with the output headers as below.
169 | ```
170 | productID AS ID
171 | code AS Code
172 | name AS Description
173 | price AS 'Unit Price'
174 | ```
175 | ```
176 |
177 | ```
178 |
179 | 14. Select code and name concatenated, separated by '-', with the header name ProductDescription
180 |
181 | ```
182 | ```
183 |
184 | 15. Select all distinct prices.
185 |
186 | ```
187 | ```
188 |
189 | 16. Select distinct price and name combination
190 |
191 | ```
192 | ```
193 |
194 | 17. Select all price data sorted by the combination of code and productID
195 |
196 | ```
197 | ```
198 |
199 | 18. Count the number of products.
200 |
201 | ```
202 | ```
203 |
204 | 19. Count the number of products for each code
205 |
206 | ```
207 | ```
208 |
209 |
210 | 20. Select the maximum, minimum, average, standard deviation, and total quantity
211 |
212 | 21. Select the minimum and maximum price for each product code.
213 |
214 | 22. Select the maximum, minimum, average, standard deviation, and total quantity for each product code; make sure the average and standard deviation have at most two decimal places.
215 |
216 | 23. Select each product code and its average price, only where the product count is greater than or equal to 3.
217 |
218 | 24. Select the maximum, minimum, average, and total price of the products for each code. Also produce the same across all the products. (A sketch covering these aggregations follows this list.)
219 |
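A rough sketch (not part of the repository's solution code) of how the per-code aggregations in tasks 20-23 could be expressed with SparkSQL, assuming the `global_temp.products` view created in `part_One.scala`:

```
// Tasks 20-22: aggregates per product code (ROUND keeps two decimal places)
spark.sql(
  """SELECT productCode,
    |       MAX(price)              AS max_price,
    |       MIN(price)              AS min_price,
    |       ROUND(AVG(price), 2)    AS avg_price,
    |       ROUND(STDDEV(price), 2) AS stddev_price,
    |       SUM(quantity)           AS total_quantity
    |FROM global_temp.products
    |GROUP BY productCode""".stripMargin).show()

// Task 23: product code and average price where the product count is >= 3
spark.sql(
  """SELECT productCode, ROUND(AVG(price), 2) AS avg_price
    |FROM global_temp.products
    |GROUP BY productCode
    |HAVING COUNT(*) >= 3""".stripMargin).show()
```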
220 |
221 | ## Part Two
222 | __Package name:__ *[com.jwk.development.big_data_insights.scala.products]()*
223 |
224 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver]()*
225 |
226 | __Solution Package__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario]()*
227 |
228 | You have been provided two additional files:
229 |
230 | 1. [suppliers.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/supplier.csv)
231 |
232 | 2. [products_suppliers.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/products_suppliers.csv)
233 |
234 |
235 | Now accomplish the following queries.
236 |
237 | 1. Using SparkSQL, select each product, its price, and its supplier name where the product price is less than 0.6
238 |
239 |
240 | 2. It is possible that the same product can be supplied by multiple suppliers. Now find each product and its price according to
241 | each supplier.
242 |
243 | 3. Find all the supplier names who supply 'Pencil 3B'
244 |
245 | 4. Find all the products which are supplied by ABC Traders (a join sketch for these queries follows this list)
246 |
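A rough sketch of how these joins could be written with SparkSQL, assuming the three csv files have been registered as temporary views named `products`, `suppliers`, and `products_suppliers` (view names are illustrative; column names follow the csv headers):

```
// 1. Product name, price and supplier name where the product price is less than 0.6
spark.sql(
  """SELECT p.name AS product, p.price, s.name AS supplier
    |FROM products p
    |JOIN products_suppliers ps ON p.productID = ps.productID
    |JOIN suppliers s ON ps.supplierID = s.supplierID
    |WHERE p.price < 0.6""".stripMargin).show()

// 3. Supplier names who supply 'Pencil 3B'
spark.sql(
  """SELECT DISTINCT s.name AS supplier
    |FROM suppliers s
    |JOIN products_suppliers ps ON s.supplierID = ps.supplierID
    |JOIN products p ON ps.productID = p.productID
    |WHERE p.name = 'Pencil 3B'""".stripMargin).show()
```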
247 |
248 | ## Part Three
249 | 1. Create a Hive ORC table using SparkSQL
250 |
251 | 2. Load the data into the Hive table.
252 |
253 | 3. Create a Hive Parquet table using SparkSQL and load data into it (a condensed sketch follows).
254 |
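A condensed sketch of the calls used by `part_One.writeDataFrameToHiveTable` to back these tables, assuming `productDF` holds the loaded products dataframe:

```
import org.apache.spark.sql.SaveMode

// Persist the dataframe as an ORC-backed table and a Parquet-backed table
productDF.write.mode(SaveMode.Overwrite).format("orc").saveAsTable("product_orc_table")
productDF.write.mode(SaveMode.Overwrite).format("parquet").saveAsTable("product_parquet_table")
```

The shell scripts under `scripts/` appear intended to expose the resulting data as external Hive tables.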
255 |
256 | ## Developer Notes:
257 | Add the following VM options to set the Spark master and the Hadoop home directory:
258 | ```
259 | -Dspark.master=local
260 | -Dhadoop.home.dir=C:\hadoop-2.7.4
261 | ```
262 |
--------------------------------------------------------------------------------
/problem_scenarios/rdd_operations.md:
--------------------------------------------------------------------------------
1 | # Problem Scenario: RDD Operations
2 |
3 | __Problem:__ **
4 |
5 | __Package name:__ **
6 |
7 | __Driver/Main class:__ **
8 |
9 | __solution:__**
10 |
11 |
12 | 1. You have been given the below code snippet:
13 |
14 | ```
15 | val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
16 |
17 | val b = a.keyBy(_.length)
18 |
19 | val c = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
20 |
21 | val d = c.keyBy(_.length)
22 |
23 | ```
24 |
25 | Write a correct code snippet for operation1 which will produce the desired output, shown below (a candidate snippet follows the output).
26 |
27 | ```
28 | Array[(Int, (String, String))] =
29 |
30 | (6,(salmon,rabbit))
31 |
32 | (3,(dog,dog)), (3,(dog,cat)), (3,(dog,gnu)),
33 | ```
34 |
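The intended operation is not stated in this file; assuming the output is meant to come from pairing the two keyed RDDs by word length, a join over `b` and `d` produces tuples of this shape:

```
// b and d are RDD[(Int, String)]; join pairs entries with equal keys (word lengths),
// yielding an RDD[(Int, (String, String))] like the output above.
val operation1 = b.join(d)
operation1.collect().foreach(println)
```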
35 |
--------------------------------------------------------------------------------
/project/assembly.sbt:
--------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 | resolvers += Resolver.url("bintray-sbt-plugins", url("http://dl.bintray.com/sbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
--------------------------------------------------------------------------------
/project/build.properties:
--------------------------------------------------------------------------------
1 | sbt.version=0.13.15
2 |
--------------------------------------------------------------------------------
/scripts/create_products_parquet_table_in_hive.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | hive -e "
4 | CREATE EXTERNAL TABLE products_parquet (productid int, code string, name string, quantity int, price float)
5 | STORED AS parquet
6 | LOCATION '/user/hive/warehouse/product_parquet_table'
7 | ;"
8 |
9 | # verify the table contents:
10 | #hive -e "select * from products_parquet;"
--------------------------------------------------------------------------------
/scripts/create_products_table_in_hive.sh:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 |
3 | hive -e "
4 | CREATE EXTERNAL TABLE products_orc (productid int, code string, name string, quantity int, price float)
5 | STORED AS orc
6 | LOCATION '/user/hive/warehouse/product_orc_table'
7 | ;"
8 |
9 | # verify the table contents:
10 | #hive -e "select * from products_orc;"
--------------------------------------------------------------------------------
/src/main/scala/com/jwk/development/big_data_insights/scala/products/driver/run_problem_scenario_part_One.scala:
--------------------------------------------------------------------------------
1 | package com.jwk.development.big_data_insights.scala.products.driver
2 |
3 | import java.util.Date
4 |
5 | import com.jwk.development.big_data_insights.scala.products.problem_scenario.part_One
6 | import org.apache.spark.sql.SparkSession
7 |
8 | object run_problem_scenario_part_One {
9 | val spark: SparkSession = SparkSession.builder.getOrCreate()
10 |
11 | def main(args: Array[String]): Unit = {
12 |
13 | //signal start message
14 | println("Start " + this.getClass.getName() + " : " + new Date())
15 |
16 | try {
17 | val problemPart = new part_One
18 | problemPart.part_One_Solution("insight_data/products.csv")
19 | } catch {
20 | case ex: Exception => {
21 | println(this.getClass.getName() + ". Error during program run. Root cause: " + ex.getMessage())
22 | }
23 | }
24 |
25 | //signal end message
26 | println("End " + this.getClass.getName() + " : " + new Date())
27 | }
28 | }
29 |
--------------------------------------------------------------------------------
/src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario/part_One.scala:
--------------------------------------------------------------------------------
1 | package com.jwk.development.big_data_insights.scala.products.problem_scenario
2 |
3 | import org.apache.spark.sql.types._
4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession}
5 |
6 | class part_One {
7 | val spark: SparkSession = SparkSession.builder.appName("products_application").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").getOrCreate()
8 | val sparkContext = spark.sparkContext
10 |   val sqlContext = spark.sqlContext
10 |
11 |
12 | /**
13 | * Solution to part one of products problem scenario
14 | *
15 | * @param path file path to products.csv file
16 | */
17 | def part_One_Solution(path: String): Unit = {
18 | //val tab_delimited_Header= "productID\tproductCode\tname\tquantity\tprice\tsupplierid"
19 | //val comma_delimited_Header= "productID,productCode,name,quantity,price,supplierid"
20 |
21 | //define schema of csv file
22 | val schema =
23 | StructType(
24 | Array(
25 | StructField("productID", IntegerType, false),
26 | StructField("productCode", StringType, false),
27 | StructField("name", StringType, false),
28 | StructField("quantity", IntegerType, false),
29 | StructField("price", FloatType, false)
30 | )
31 | )
32 |
33 | //read csv file from directory path using schema
34 | val productDF = sqlContext.read.format("com.databricks.spark.csv").option("delimiter", "\t").option("header", "true").option("inferSchema", "false").schema(schema).load(path)
35 | //show first 10 records in the dataframe
36 | productDF.show(10)
37 |
38 | // Register the DataFrame as a global temporary view
39 | val tempTableName = "products"
40 | productDF.createGlobalTempView(tempTableName)
41 | val globalTempViewName = s"global_temp.$tempTableName"
42 |
43 | //import apache spark sql
44 | import org.apache.spark.sql._
45 |
46 | //The following answers PART ONE questions of the problem scenario.
47 | //1. Select all the records with quantity >= 5000 and name starts with 'Pen'
48 | println("SELECTING: all the records with quantity >= 5000 and name starts with 'Pen'")
49 | val results1 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE quantity >= 5000 AND name LIKE 'Pen%'")
50 | println("SHOWING: all the records with quantity >= 5000 and name starts with 'Pen'")
51 | results1.show()
52 |
53 | //2. Select all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'
54 | println("SELECTING: all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'")
55 | val results2 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE quantity >= 5000 AND price < 1.24 AND name LIKE 'Pen%'")
56 | println("SHOWING: all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'")
57 | results2.show()
58 |
59 |     //3. Select all the records that do not have both quantity >= 5000 and a name starting with 'Pen'
60 |     println("SELECTING: all the records that do not have both quantity >= 5000 and a name starting with 'Pen'")
61 |     val results3 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE NOT (quantity >= 5000 AND name LIKE 'Pen%')")
62 |     println("SHOWING: all the records that do not have both quantity >= 5000 and a name starting with 'Pen'")
63 | results3.show()
64 |
65 | //4. Select all the products which name is 'Pen Red', 'Pen Black'
66 | println("SELECTING: all the products which name is 'Pen Red', 'Pen Black'")
67 | val results4 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name IN ('Pen Red', 'Pen Black')")
68 | println("SHOWING: all the products which name is 'Pen Red', 'Pen Black'")
69 | results4.show()
70 |
71 | //5. Select all the products which has price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000
72 | println("SELECTING : all the products which has price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000")
73 | val results5 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE (price BETWEEN 1 AND 2) AND (quantity BETWEEN 1000 AND 2000)")
74 | println("SHOWING: all the products which has price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000")
75 | results5.show()
76 |
77 | //Select all the products which has product code as null
78 | println("SELECTING : all the products which has product code as null")
79 | val results6 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE productCode IS NULL")
80 | println("SHOWING: all the products which has product code as null")
81 | results6.show()
82 |
83 |     //Select all the products whose name starts with Pen; results ordered by price in descending order.
84 |     println("SELECTING : all the products whose name starts with Pen, ordered by price descending")
85 |     val results7 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name LIKE 'Pen%' ORDER BY price DESC")
86 |     println("SHOWING: all the products whose name starts with Pen, ordered by price descending")
87 | results7.show()
88 |
89 |     //Select all the products whose name starts with Pen; results ordered by price descending and quantity ascending.
90 |     println("SELECTING : all the products whose name starts with Pen, ordered by price descending and quantity ascending")
91 |     val results8 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name LIKE 'Pen%' ORDER BY price DESC, quantity ASC")
92 |     println("SHOWING: all the products whose name starts with Pen, ordered by price descending and quantity ascending")
93 | results8.show()
94 |
95 | //Select top 2 products by price
96 | println("SELECTING : top 2 products by price")
97 | val results9 = spark.sql(s"SELECT * FROM $globalTempViewName ORDER BY price DESC LIMIT 2")
98 | println("SHOWING: top 2 products by price")
99 | results9.show()
100 |
101 |     //Select all the columns from product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS Unit_Price
102 |     println("SELECTING : all the columns from product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS Unit_Price")
103 |     val results10 = spark.sql(s"SELECT productID AS ID, productCode AS Code, name AS Description, price AS Unit_Price FROM $globalTempViewName")
104 |     println("SHOWING: all the columns from product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS Unit_Price")
105 | results10.show()
106 |
107 |     //Select code and name both separated by '-' and the header name should be ProductDescription
108 |     println("SELECTING : code and name both separated by '-' with header name ProductDescription")
109 |     val results11 = spark.sql(s"SELECT CONCAT(productCode,'-',name) AS ProductDescription FROM $globalTempViewName")
110 |     println("SHOWING: code and name both separated by '-' with header name ProductDescription")
111 | results11.show()
112 |
113 | //Select all distinct prices
114 | println("SELECTING : all distinct prices")
115 | val results12 = spark.sql(s"SELECT DISTINCT price AS Distinct_Price FROM $globalTempViewName")
116 | println("SHOWING: all distinct prices")
117 | results12.show()
118 |
119 | //Select distinct price and name combination
120 | println("SELECTING : distinct price and name combination")
121 | val results13 = spark.sql(s"SELECT DISTINCT price, name FROM $globalTempViewName")
122 | println("SHOWING: distinct price and name combination")
123 | results13.show()
124 |
125 |     //Select all price data sorted by both code and productID combination
126 |     println("SELECTING : all price data sorted by both code and productID combination")
127 |     val results15 = spark.sql(s"SELECT * FROM $globalTempViewName ORDER BY productCode, productID")
128 |     println("SHOWING: all price data sorted by both code and productID combination")
129 | results15.show()
130 |
131 |
132 |     //Count the number of products for each code
133 |     println("SELECTING : count of products for each code")
134 |     val results16 = spark.sql(s"SELECT productCode, COUNT(*) AS product_count FROM $globalTempViewName GROUP BY productCode")
135 |     println("SHOWING: count of products for each code")
136 | results16.show()
137 |
138 |     //save dataframe to hive table in orc format
139 |     writeDataFrameToHiveTable(productDF, SaveMode.Overwrite, "orc", "product_orc_table")
140 |
141 |     //save dataframe to hive table in parquet format
142 | writeDataFrameToHiveTable(productDF, SaveMode.Overwrite, "parquet", "product_parquet_table")
143 |
144 | }
145 |
146 | def writeDataFrameToHiveTable(inputDF: DataFrame, saveMode: SaveMode, dataFormat: String, hiveTableName: String) = {
147 | println(s"Starting to write dataframe to hive table with the following data format $dataFormat and hive table name: $hiveTableName")
148 | //match cases: json, parquet, jdbc, orc, libsvm, csv, text
149 | dataFormat match {
150 | case "json" => inputDF.write.mode(saveMode).format("json").saveAsTable(hiveTableName)
151 | case "parquet" => inputDF.write.mode(saveMode).format("parquet").saveAsTable(hiveTableName)
152 | case "jdbc" => inputDF.write.mode(saveMode).format("jdbc").saveAsTable(hiveTableName)
153 | case "orc" => inputDF.write.mode(saveMode).format("orc").saveAsTable(hiveTableName)
154 | case "csv" => inputDF.write.mode(saveMode).format("libsvm").saveAsTable(hiveTableName)
155 | case "text" => inputDF.write.mode(saveMode).format("text").saveAsTable(hiveTableName)
156 | case "libsvm" => inputDF.write.mode(saveMode).format("libsvm").saveAsTable(hiveTableName)
157 |       case _ => println("Invalid dataFormat. Allowed formats are: json, parquet, jdbc, orc, csv, text or libsvm") // the default, catch-all
158 | }
159 |
160 | println(s"End write dataframe to hive table with the following table name $dataFormat and hive table name: $hiveTableName")
161 |
162 | }
163 |
164 |
165 | }
166 |
--------------------------------------------------------------------------------
/wiki_data/screenshots/how_to_set_spark_master_to_local_in _intellij.PNG:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/wiki_data/screenshots/how_to_set_spark_master_to_local_in _intellij.PNG
--------------------------------------------------------------------------------