├── .gitignore ├── README.md ├── build.sbt ├── config ├── test_linux │ ├── application.conf │ └── log4j.properties └── test_windows │ ├── application.conf │ └── log4j.properties ├── insight_data ├── patients.csv ├── products.csv ├── products_suppliers.csv └── supplier.csv ├── problem_scenarios ├── patient_data.md ├── products.md └── rdd_operations.md ├── project ├── assembly.sbt └── build.properties ├── scripts ├── create_products_parquet_table_in_hive.sh └── create_products_table_in_hive.sh ├── src └── main │ └── scala │ └── com │ └── jwk │ └── development │ └── big_data_insights │ └── scala │ └── products │ ├── driver │ └── run_problem_scenario_part_One.scala │ └── problem_scenario │ └── part_One.scala └── wiki_data └── screenshots └── how_to_set_spark_master_to_local_in _intellij.PNG /.gitignore: -------------------------------------------------------------------------------- 1 | RemoteSystemsTempFiles/ 2 | Servers/ 3 | target/ 4 | logs/ 5 | .metadata/ 6 | bin/ 7 | tmp/ 8 | *.tmp 9 | *.bak 10 | *.swp 11 | *~.nib 12 | local.properties 13 | .settings/ 14 | .loadpath 15 | .recommenders 16 | .idea/ 17 | .project 18 | classes/ 19 | .classpath 20 | .iml 21 | *_SUCCESS* 22 | *.crc 23 | 24 | # External tool builders 25 | .externalToolBuilders/ 26 | 27 | # Locally stored "Eclipse launch configurations" 28 | *.launch 29 | 30 | # PyDev specific (Python IDE for Eclipse) 31 | *.pydevproject 32 | 33 | # CDT-specific (C/C++ Development Tooling) 34 | .cproject 35 | 36 | # Java annotation processor (APT) 37 | .factorypath 38 | 39 | # PDT-specific (PHP Development Tools) 40 | .buildpath 41 | 42 | # sbteclipse plugin 43 | .target 44 | 45 | # Tern plugin 46 | .tern-project 47 | 48 | # TeXlipse plugin 49 | .texlipse 50 | 51 | # STS (Spring Tool Suite) 52 | .springBeans 53 | 54 | # Code Recommenders 55 | .recommenders/ 56 | 57 | # Scala IDE specific (Scala & Java development for Eclipse) 58 | .cache-main 59 | .scala_dependencies 60 | .worksheet 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # big-data-insights-scala 2 | personal solutions to big data problem scenarios using scala 3 | 4 | ## Problem Scenarios 5 | 6 | ### 1. [Product Data for a pen company](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/products.md) 7 | 8 | ### 2. [Patient Data](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/patient_data.md) 9 | 10 | ### 3. [RDD (resilient distributed dataset) Operations](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/rdd_operations.md) 11 | 12 | 13 | ## Troubleshooting 14 | 1. 
When running applications, if the error below occurs: *A master URL must be set in your configuration*
15 | ```
16 | Exception in thread "main" java.lang.ExceptionInInitializerError
17 | at com.jwk.development.big_data_insights.scala.products.driver.problem_scenario_1.main(problem_scenario_1.scala)
18 | Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
19 | ```
20 | 
21 | Solution:
22 | 
23 | Add the following VM option to your run configuration:
24 | ```
25 | -Dspark.master=local
26 | ```
27 | [How to set spark master to local in intellij](https://github.com/jwkimani/big-data-insights-scala/blob/master/wiki_data/screenshots/how_to_set_spark_master_to_local_in%20_intellij.PNG)
28 | 
29 | 
-------------------------------------------------------------------------------- /build.sbt: --------------------------------------------------------------------------------
1 | name := "big-data-insights-scala"
2 | 
3 | version := "1.0"
4 | 
5 | scalaVersion := "2.11.8"
6 | 
7 | libraryDependencies ++= Seq(
8 |   "org.apache.hadoop" % "hadoop-client" % "2.7.3",
9 |   ("org.apache.spark" % "spark-core_2.11" % "2.1.0"),
10 |   ("org.apache.spark" % "spark-sql_2.11" % "2.1.0"),
11 |   "org.apache.spark" % "spark-hive_2.11" % "2.1.0",
12 |   "com.databricks" % "spark-avro_2.11" % "3.2.0",
13 |   "com.databricks" % "spark-csv_2.11" % "1.3.0",
14 |   "org.scala-lang" % "scala-library" % "2.11.8",
15 |   "org.scala-lang" % "scala-reflect" % "2.11.8",
16 |   "com.typesafe" % "config" % "1.3.1",
17 |   "org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1",
18 |   "org.apache.logging.log4j" % "log4j-core" % "2.8.1",
19 |   "org.apache.kafka" %% "kafka" % "0.9.0.2.3.4.51-1"
20 | 
21 | )
22 | //use external repositories
23 | resolvers += "HortonWorksRepo" at "http://repo.hortonworks.com/content/repositories/releases/"
24 | 
25 | parallelExecution in test := false
26 | 
27 | 
28 | initialCommands := "import org.test._"
29 | 
30 | //clean operations
31 | cleanFiles += baseDirectory { base => base / "build" }.value
32 | cleanFiles += baseDirectory { base => base / "metastore_db" }.value
33 | 
34 | //assembly-settings
-------------------------------------------------------------------------------- /config/test_linux/application.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/config/test_linux/application.conf
-------------------------------------------------------------------------------- /config/test_linux/log4j.properties: --------------------------------------------------------------------------------
1 | # Set root logger level to ERROR and its only appender to A1.
2 | log4j.rootLogger=ERROR, A1
3 | # If we get chained appenders, this stops the message being written multiple times
4 | log4j.additivity.org.apache=false
5 | log4j.additivity.xdasLogger=false
6 | # A1 is set to be a ConsoleAppender.
7 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
8 | log4j.appender.A1.Target=System.out
9 | # A1 uses PatternLayout.
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 12 | 13 | -------------------------------------------------------------------------------- /config/test_windows/application.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/config/test_windows/application.conf -------------------------------------------------------------------------------- /config/test_windows/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=ERROR, A1 3 | # If we get chained appenders, this stops the message being written multiple times 4 | log4j.additivity.org.apache=false 5 | log4j.additivity.xdasLogger=false 6 | # A1 is set to be a ConsoleAppender. 7 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 8 | log4j.appender.stdout.Target=System.out 9 | # A1 uses PatternLayout. 10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 12 | 13 | -------------------------------------------------------------------------------- /insight_data/patients.csv: -------------------------------------------------------------------------------- 1 | patientID,name ,address ,dateOfBirth,lastVisitDate 2 | 1001 ,Homer Simpson ,"123 Blue St.,Los Angeles, CA 12345" ,1989-12-31 ,2017-01-21 3 | 1002 ,Peter Griffin ,"234 Brown St., San Fransisco, CA 23456",1950-01-30 ,2015-04-18 4 | 1003 ,Hubert J. Fansworth,"546 Red Dr., Sacramento, CA 54678" ,1978-08-21 ,2017-02-14 5 | 1004 ,Marge Simpson ,"123 Blue St.,Los Angeles, CA 12345" ,1990-03-18 ,2016-02-15 6 | 1005 ,Bender Rodriguez ,"127 Brown St., Charlotte, NC 28223" ,1986-12-31 ,2013-12-14 7 | 1006 ,Turanga Leela ,"128 Brown St., Charlotte, NC 28223" ,1978-08-21 ,2012-09-15 8 | -------------------------------------------------------------------------------- /insight_data/products.csv: -------------------------------------------------------------------------------- 1 | productID productCode name quantity price supplierid 2 | 1001 PEN Pen Red 5000 1.23 501 3 | 1002 PEN Pen Blue 8001 1.25 501 4 | 1003 PEN Pen Black 2000 1.25 501 5 | 1004 PEC Pencil 2B 10000 0.48 502 6 | 1005 PEC Pencil 2H 8000 0.49 502 7 | 1006 PEC Pencil HB 0 9999.99 502 8 | 2001 PEC Pencil 3B 500 0.52 501 9 | 2002 PEC Pencil 4B 200 0.62 501 10 | 2003 PEC Pencil 5B 100 0.73 501 11 | 2004 PEC Pencil 6B 500 0.47 502 12 | -------------------------------------------------------------------------------- /insight_data/products_suppliers.csv: -------------------------------------------------------------------------------- 1 | productID,supplierID 2 | 2001 ,501 3 | 2002 ,501 4 | 2003 ,501 5 | 2004 ,502 6 | 2001 ,503 7 | -------------------------------------------------------------------------------- /insight_data/supplier.csv: -------------------------------------------------------------------------------- 1 | supplierID,name ,phone 2 | 501 ,ABC Traders,88881111 3 | 502 ,XYZ Company,88882222 4 | 503 ,QQ Corp ,88883333 5 | 504 ,DEG LLC ,88884444 6 | 505 ,FGH Limited,88885555 7 | -------------------------------------------------------------------------------- /problem_scenarios/patient_data.md: -------------------------------------------------------------------------------- 1 | #Problem Scenario 3 2 | 3 
| __Problem:__ **
4 | 
5 | __Package name:__ **
6 | 
7 | __Driver/Main class:__ **
8 | 
9 | You have been given the following file containing patient data:
10 | [patients.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/patients.csv)
11 | 
12 | Accomplish the following activities (a sketch of possible queries appears at the end of this file):
13 | 
14 | 1. Find all the patients whose lastVisitDate is between '2012-09-15' and the current date
15 | 
16 | ```
17 | 
18 | ```
19 | 
20 | 2. Find all the patients who were born in 1990
21 | 
22 | ```
23 | 
24 | ```
25 | 3. Find the age of each patient
26 | 
27 | ```
28 | 
29 | ```
30 | 
31 | 4. List patients whose last visit was more than 60 days ago
32 | 
33 | ```
34 | 
35 | ```
36 | 
37 | 5. Select patients who are 18 years old or younger
38 | 
39 | ```
40 | 
41 | ```
42 | 
43 | 
44 | 
45 | 
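The sketch below shows one possible way to answer the activities above with SparkSQL. It is illustrative only: the session setup, the temporary view name `patients`, and the way the padded headers and date strings in the CSV are handled are all assumptions, not part of the project code.

```
import org.apache.spark.sql.SparkSession

// Illustrative sketch: view name, session handling and date/whitespace parsing are assumptions.
val spark = SparkSession.builder.appName("patients_sketch").getOrCreate()

val patientsDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("insight_data/patients.csv")
patientsDF.createOrReplaceTempView("patients")

// 1. patients whose lastVisitDate falls between '2012-09-15' and today
spark.sql("SELECT * FROM patients WHERE lastVisitDate BETWEEN '2012-09-15' AND current_date()").show()

// 2. patients born in 1990
spark.sql("SELECT * FROM patients WHERE year(dateOfBirth) = 1990").show()

// 3. approximate age of each patient, in whole years
spark.sql("SELECT *, floor(datediff(current_date(), dateOfBirth) / 365.25) AS age FROM patients").show()

// 4. patients whose last visit was more than 60 days ago
spark.sql("SELECT * FROM patients WHERE datediff(current_date(), lastVisitDate) > 60").show()

// 5. patients aged 18 or younger
spark.sql("SELECT * FROM patients WHERE datediff(current_date(), dateOfBirth) / 365.25 <= 18").show()
```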
-------------------------------------------------------------------------------- /problem_scenarios/products.md: --------------------------------------------------------------------------------
1 | # Problem Scenario 1
2 | 
3 | __Problem:__ *Given csv files with product information from a pen company, provide some insights using big data technologies*
4 | 
5 | __Package name:__ *[com.jwk.development.big_data_insights.scala.products](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products)*
6 | 
7 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/driver)*
8 | 
9 | __Solution Package:__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario)*
10 | 
11 | ## Parts
12 | This problem has three parts, so the solutions are broken into three modules in the package:
13 | 1. [Part One](#part1)
14 | 
15 | 2. [Part Two](#part2)
16 | 
17 | 3. [Part Three](#part3)
18 | 
19 | ## Part One
20 | 
21 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver](https://github.com/jwkimani/big-data-insights-scala/blob/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/driver/run_problem_scenario_part_One.scala)*
22 | 
23 | __Solution Package:__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario](https://github.com/jwkimani/big-data-insights-scala/blob/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario/part_One.scala)*
24 | 
25 | You have the following tab-delimited csv file: [products.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/products.csv)
26 | 
27 | Using Spark and SparkSQL, perform the following tasks:
28 | 1. Load the csv file into a dataframe
29 | 
30 | Using the schema:
31 | ```
32 | val schema =
33 |   StructType(
34 |     Array(
35 |       StructField("productID", IntegerType, false),
36 |       StructField("productCode", StringType, false),
37 |       StructField("name", StringType, false),
38 |       StructField("quantity", IntegerType, false),
39 |       StructField("price", FloatType, false)
40 |     )
41 |   )
42 | ```
43 | Approach:
44 | ```
45 | val productDF = sqlContext.read.format("com.databricks.spark.csv").option("delimiter","\t").option("header","true").option("inferSchema", "false").schema(schema).load(path)
46 | ```
47 | 
48 | 2. Create a global temporary view named `products` from the dataframe
49 | 
50 | `productDF.createGlobalTempView("products")`
51 | 
52 | 3. Using the global temporary view, perform the tasks below
53 | 
54 | 4. Select and show all the records with quantity >= 5000 and a name starting with 'Pen'
55 | ```
56 | +---------+-----------+---------+--------+-----+
57 | |productID|productCode| name|quantity|price|
58 | +---------+-----------+---------+--------+-----+
59 | | 1001| PEN| Pen Red| 5000| 1.23|
60 | | 1002| PEN| Pen Blue| 8001| 1.25|
61 | | 1004| PEC|Pencil 2B| 10000| 0.48|
62 | | 1005| PEC|Pencil 2H| 8000| 0.49|
63 | +---------+-----------+---------+--------+-----+
64 | ```
65 | 
66 | 5. Select and show all the records with quantity >= 5000, price less than 1.24 and a name starting with 'Pen'
67 | ```
68 | +---------+-----------+---------+--------+-----+
69 | |productID|productCode| name|quantity|price|
70 | +---------+-----------+---------+--------+-----+
71 | | 1001| PEN| Pen Red| 5000| 1.23|
72 | | 1004| PEC|Pencil 2B| 10000| 0.48|
73 | | 1005| PEC|Pencil 2H| 8000| 0.49|
74 | +---------+-----------+---------+--------+-----+
75 | ```
76 | 
77 | 6. Select and show all the records that do not have both quantity >= 5000 and a name starting with 'Pen'
78 | ```
79 | +---------+-----------+---------+--------+-------+
80 | |productID|productCode| name|quantity| price|
81 | +---------+-----------+---------+--------+-------+
82 | | 1003| PEN|Pen Black| 2000| 1.25|
83 | | 1006| PEC|Pencil HB| 0|9999.99|
84 | | 2001| PEC|Pencil 3B| 500| 0.52|
85 | | 2002| PEC|Pencil 4B| 200| 0.62|
86 | | 2003| PEC|Pencil 5B| 100| 0.73|
87 | | 2004| PEC|Pencil 6B| 500| 0.47|
88 | +---------+-----------+---------+--------+-------+
89 | ```
90 | 
91 | 7. Select and show all the products whose name is 'Pen Red' or 'Pen Black'
92 | ```
93 | +---------+-----------+---------+--------+-----+
94 | |productID|productCode| name|quantity|price|
95 | +---------+-----------+---------+--------+-----+
96 | | 1001| PEN| Pen Red| 5000| 1.23|
97 | | 1003| PEN|Pen Black| 2000| 1.25|
98 | +---------+-----------+---------+--------+-----+
99 | ```
100 | 8. Select and show all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000
101 | ```
102 | +---------+-----------+---------+--------+-----+
103 | |productID|productCode| name|quantity|price|
104 | +---------+-----------+---------+--------+-----+
105 | | 1003| PEN|Pen Black| 2000| 1.25|
106 | +---------+-----------+---------+--------+-----+
107 | ```
108 | 
109 | 9. Select all the products whose product code is null
110 | 
111 | ```
112 | +---------+-----------+----+--------+-----+
113 | |productID|productCode|name|quantity|price|
114 | +---------+-----------+----+--------+-----+
115 | +---------+-----------+----+--------+-----+
116 | ```
117 | 
118 | 10. Select all the products whose name starts with 'Pen'; order the results by price in descending order.
119 | 
120 | ```
121 | +---------+-----------+---------+--------+-------+
122 | |productID|productCode| name|quantity| price|
123 | +---------+-----------+---------+--------+-------+
124 | | 1006| PEC|Pencil HB| 0|9999.99|
125 | | 1003| PEN|Pen Black| 2000| 1.25|
126 | | 1002| PEN| Pen Blue| 8001| 1.25|
127 | | 1001| PEN| Pen Red| 5000| 1.23|
128 | | 2003| PEC|Pencil 5B| 100| 0.73|
129 | | 2002| PEC|Pencil 4B| 200| 0.62|
130 | | 2001| PEC|Pencil 3B| 500| 0.52|
131 | | 1005| PEC|Pencil 2H| 8000| 0.49|
132 | | 1004| PEC|Pencil 2B| 10000| 0.48|
133 | | 2004| PEC|Pencil 6B| 500| 0.47|
134 | +---------+-----------+---------+--------+-------+
135 | ```
136 | 
137 | 11. Select all the products whose name starts with 'Pen'; order the results by price in descending order and quantity in ascending order.
138 | 
139 | ```
140 | +---------+-----------+---------+--------+-------+
141 | |productID|productCode| name|quantity| price|
142 | +---------+-----------+---------+--------+-------+
143 | | 1006| PEC|Pencil HB| 0|9999.99|
144 | | 1003| PEN|Pen Black| 2000| 1.25|
145 | | 1002| PEN| Pen Blue| 8001| 1.25|
146 | | 1001| PEN| Pen Red| 5000| 1.23|
147 | | 2003| PEC|Pencil 5B| 100| 0.73|
148 | | 2002| PEC|Pencil 4B| 200| 0.62|
149 | | 2001| PEC|Pencil 3B| 500| 0.52|
150 | | 1005| PEC|Pencil 2H| 8000| 0.49|
151 | | 1004| PEC|Pencil 2B| 10000| 0.48|
152 | | 2004| PEC|Pencil 6B| 500| 0.47|
153 | +---------+-----------+---------+--------+-------+
154 | ```
155 | 
156 | 12. Select the top 2 products by price
157 | 
158 | ```
159 | +---------+-----------+---------+--------+-------+
160 | |productID|productCode| name|quantity| price|
161 | +---------+-----------+---------+--------+-------+
162 | | 1006| PEC|Pencil HB| 0|9999.99|
163 | | 1002| PEN| Pen Blue| 8001| 1.25|
164 | +---------+-----------+---------+--------+-------+
165 | ```
166 | 
167 | 
168 | 13. Select all the columns from the product table with the output header as below (a sketch of possible queries for tasks 13-24 appears after this list).
169 | `
170 | productID AS ID
171 | code AS Code
172 | name AS Description
173 | price AS 'Unit Price'
174 | `
175 | ```
176 | 
177 | ```
178 | 
179 | 14. Select code and name, both separated by '-', and the header name should be ProductDescription
180 | 
181 | ```
182 | ```
183 | 
184 | 15. Select all distinct prices.
185 | 
186 | ```
187 | ```
188 | 
189 | 16. Select distinct price and name combinations
190 | 
191 | ```
192 | ```
193 | 
194 | 17. Select all price data sorted by the code and productID combination
195 | 
196 | ```
197 | ```
198 | 
199 | 18. Count the number of products.
200 | 
201 | ```
202 | ```
203 | 
204 | 19. Count the number of products for each code
205 | 
206 | ```
207 | ```
208 | 
209 | 
210 | 20. Select the maximum, minimum, average, standard deviation, and total quantity
211 | 
212 | 21. Select the minimum and maximum price for each product code.
213 | 
214 | 22. Select the maximum, minimum, average, standard deviation, and total quantity for each product code; make sure the average and standard deviation show at most two decimal places.
215 | 
216 | 23. Select each product code and its average price, but only where the product count is greater than or equal to 3
217 | 
218 | 24. Select the maximum, minimum, average and total of all the products for each code. Also produce the same across all the products.
219 | 
220 | 
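Tasks 13-24 do not yet have sample answers checked in. The sketch below shows one possible set of SparkSQL statements for them; it assumes the `spark` session and the global temporary view created in part_One.scala (global temporary views are queried through the `global_temp` database), and the aliases, rounding and `ROLLUP` choices are illustrative rather than the project's own solution.

```
// Hedged sketch for tasks 13-24; assumes the global temp view from part_One.scala.
// 13. rename the output columns
spark.sql("SELECT productID AS ID, productCode AS Code, name AS Description, price AS `Unit Price` FROM global_temp.products").show()

// 14. code and name joined with '-'
spark.sql("SELECT concat(productCode, '-', name) AS ProductDescription FROM global_temp.products").show()

// 15. and 16. distinct prices, then distinct (price, name) pairs
spark.sql("SELECT DISTINCT price FROM global_temp.products").show()
spark.sql("SELECT DISTINCT price, name FROM global_temp.products").show()

// 17. price data sorted by code, then productID
spark.sql("SELECT productCode, productID, price FROM global_temp.products ORDER BY productCode, productID").show()

// 18. and 19. overall product count, then count per code
spark.sql("SELECT count(*) AS product_count FROM global_temp.products").show()
spark.sql("SELECT productCode, count(*) AS product_count FROM global_temp.products GROUP BY productCode").show()

// 20.-22. aggregates on quantity and price, overall and per code, rounded to two decimals
spark.sql("SELECT max(quantity), min(quantity), round(avg(quantity), 2), round(stddev(quantity), 2), sum(quantity) FROM global_temp.products").show()
spark.sql("SELECT productCode, min(price), max(price) FROM global_temp.products GROUP BY productCode").show()
spark.sql("SELECT productCode, max(quantity), min(quantity), round(avg(quantity), 2), round(stddev(quantity), 2), sum(quantity) FROM global_temp.products GROUP BY productCode").show()

// 23. average price only for codes with at least three products
spark.sql("SELECT productCode, avg(price) AS avg_price FROM global_temp.products GROUP BY productCode HAVING count(*) >= 3").show()

// 24. per-code aggregates plus an all-products row (ROLLUP adds the grand-total row)
spark.sql("SELECT productCode, max(price), min(price), avg(price), sum(price) FROM global_temp.products GROUP BY productCode WITH ROLLUP").show()
```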
221 | ## Part Two
222 | __Package name:__ *[com.jwk.development.big_data_insights.scala.products]()*
223 | 
224 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver]()*
225 | 
226 | __Solution Package:__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario]()*
227 | 
228 | You have been provided two additional files:
229 | 
230 | 1. [suppliers.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/supplier.csv)
231 | 
232 | 2. [products_suppliers.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/products_suppliers.csv)
233 | 
234 | 
235 | Now answer the following queries.
236 | 
237 | 1. Select each product, its price, and its supplier name where the product price is less than 0.6, using SparkSQL
238 | 
239 | 
240 | 2. It is possible that the same product is supplied by multiple suppliers. Find each product and its price according to
241 | each supplier.
242 | 
243 | 3. Find the names of all the suppliers who supply 'Pencil 3B'
244 | 
245 | 4. Find all the products that are supplied by ABC Traders
246 | 
247 | 
248 | ## Part Three
249 | 1. Create a Hive ORC table using SparkSQL
250 | 
251 | 2. Load the data into the Hive table.
252 | 
253 | 3. Create a Hive Parquet table using SparkSQL and load the data into it.
254 | 
255 | 
256 | ## Developer Notes:
257 | Add the following VM options to set the Spark master:
258 | ```
259 | -Dspark.master=local
260 | -Dhadoop.home.dir=C:\hadoop-2.7.4
261 | ```
262 | 
-------------------------------------------------------------------------------- /problem_scenarios/rdd_operations.md: --------------------------------------------------------------------------------
1 | # Problem Scenario: RDD Operations
2 | 
3 | __Problem:__ **
4 | 
5 | __Package name:__ **
6 | 
7 | __Driver/Main class:__ **
8 | 
9 | __Solution:__ **
10 | 
11 | 
12 | 1. You have been given the code snippet below
13 | 
14 | ```
15 | val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
16 | 
17 | val b = a.keyBy(_.length)
18 | 
19 | val c = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
20 | 
21 | val d = c.keyBy(_.length)
22 | 
23 | ```
24 | 
25 | Write a correct code snippet for operation1 which will produce the desired output, shown below (see the sketch after the output).
26 | 
27 | ```
28 | Array[(Int, (String, String))] =
29 | 
30 | (6,(salmon,rabbit))
31 | 
32 | (3,(dog,dog)), (3,(dog,cat)), (3,(dog,gnu)),
33 | ```
34 | 
35 | 
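The desired output pairs values from `b` and `d` that share the same key (the word length), which is exactly what an inner `join` on keyed RDDs produces. A plausible `operation1`, assuming the `sc` SparkContext and the RDDs defined above, is:

```
// Plausible operation1: inner join of the two keyed RDDs on word length.
// b.join(d) yields an RDD[(Int, (String, String))]; collect materialises it as the Array shown above.
val operation1 = b.join(d)
operation1.collect
```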
-------------------------------------------------------------------------------- /project/assembly.sbt: --------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 | resolvers += Resolver.url("bintray-sbt-plugins", url("http://dl.bintray.com/sbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
-------------------------------------------------------------------------------- /project/build.properties: --------------------------------------------------------------------------------
1 | sbt.version=0.13.15
2 | 
-------------------------------------------------------------------------------- /scripts/create_products_parquet_table_in_hive.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | hive -e \
4 | "
5 | CREATE EXTERNAL TABLE products_parquet (productid int, code string, name string, quantity int, price float)
6 | STORED AS parquet
7 | LOCATION '/user/hive/warehouse/product_parquet_table'
8 | ;"
9 | 
10 | #"select * from product_parquet_table;"
-------------------------------------------------------------------------------- /scripts/create_products_table_in_hive.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | hive -e \
4 | "
5 | CREATE EXTERNAL TABLE products_orc (productid int, code string, name string, quantity int, price float)
6 | STORED AS orc
7 | LOCATION '/user/hive/warehouse/product_orc_table'
8 | ;"
9 | 
10 | #select * from product_orc_table
-------------------------------------------------------------------------------- /src/main/scala/com/jwk/development/big_data_insights/scala/products/driver/run_problem_scenario_part_One.scala: --------------------------------------------------------------------------------
1 | package com.jwk.development.big_data_insights.scala.products.driver
2 | 
3 | import java.util.Date
4 | 
5 | import com.jwk.development.big_data_insights.scala.products.problem_scenario.part_One
6 | import org.apache.spark.sql.SparkSession
7 | 
8 | object run_problem_scenario_part_One {
9 | val spark: SparkSession = SparkSession.builder.getOrCreate()
10 | 
11 | def main(args: Array[String]): Unit = {
12 | 
13 | //signal start message
14 | println("Start " + this.getClass.getName() + " : " + new Date())
15 | 
16 | try {
17 | val problemPart = new part_One
18 | problemPart.part_One_Solution("insight_data/products.csv")
19 | } catch {
20 | case ex: Exception => {
21 | println(this.getClass.getName() + ". Error during program run. 
Root cause: " + ex.getMessage()) 22 | } 23 | } 24 | 25 | //signal end message 26 | println("End " + this.getClass.getName() + " : " + new Date()) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario/part_One.scala: -------------------------------------------------------------------------------- 1 | package com.jwk.development.big_data_insights.scala.products.problem_scenario 2 | 3 | import org.apache.spark.sql.types._ 4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 5 | 6 | class part_One { 7 | val spark: SparkSession = SparkSession.builder.appName("products_application").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").getOrCreate() 8 | val sparkContext = spark.sparkContext 9 | val sqlContext = new org.apache.spark.sql.SQLContext(sparkContext) 10 | 11 | 12 | /** 13 | * Solution to part one of products problem scenario 14 | * 15 | * @param path file path to products.csv file 16 | */ 17 | def part_One_Solution(path: String): Unit = { 18 | //val tab_delimited_Header= "productID\tproductCode\tname\tquantity\tprice\tsupplierid" 19 | //val comma_delimited_Header= "productID,productCode,name,quantity,price,supplierid" 20 | 21 | //define schema of csv file 22 | val schema = 23 | StructType( 24 | Array( 25 | StructField("productID", IntegerType, false), 26 | StructField("productCode", StringType, false), 27 | StructField("name", StringType, false), 28 | StructField("quantity", IntegerType, false), 29 | StructField("price", FloatType, false) 30 | ) 31 | ) 32 | 33 | //read csv file from directory path using schema 34 | val productDF = sqlContext.read.format("com.databricks.spark.csv").option("delimiter", "\t").option("header", "true").option("inferSchema", "false").schema(schema).load(path) 35 | //show first 10 records in the dataframe 36 | productDF.show(10) 37 | 38 | // Register the DataFrame as a global temporary view 39 | val tempTableName = "products" 40 | productDF.createGlobalTempView(tempTableName) 41 | val globalTempViewName = s"global_temp.$tempTableName" 42 | 43 | //import apache spark sql 44 | import org.apache.spark.sql._ 45 | 46 | //The following answers PART ONE questions of the problem scenario. 47 | //1. Select all the records with quantity >= 5000 and name starts with 'Pen' 48 | println("SELECTING: all the records with quantity >= 5000 and name starts with 'Pen'") 49 | val results1 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE quantity >= 5000 AND name LIKE 'Pen%'") 50 | println("SHOWING: all the records with quantity >= 5000 and name starts with 'Pen'") 51 | results1.show() 52 | 53 | //2. Select all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen' 54 | println("SELECTING: all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'") 55 | val results2 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE quantity >= 5000 AND price < 1.24 AND name LIKE 'Pen%'") 56 | println("SHOWING: all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'") 57 | results2.show() 58 | 59 | //3. 
Select all the records that do not have both quantity >= 5000 and a name starting with 'Pen'
60 | println("SELECTING: all the records that do not have both quantity >= 5000 and a name starting with 'Pen'")
61 | val results3 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE NOT (quantity >= 5000 AND name LIKE 'Pen%')")
62 | println("SHOWING: all the records that do not have both quantity >= 5000 and a name starting with 'Pen'")
63 | results3.show()
64 | 
65 | //4. Select all the products whose name is 'Pen Red' or 'Pen Black'
66 | println("SELECTING: all the products whose name is 'Pen Red' or 'Pen Black'")
67 | val results4 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name IN ('Pen Red', 'Pen Black')")
68 | println("SHOWING: all the products whose name is 'Pen Red' or 'Pen Black'")
69 | results4.show()
70 | 
71 | //5. Select all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000
72 | println("SELECTING : all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000")
73 | val results5 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE (price BETWEEN 1 AND 2) AND (quantity BETWEEN 1000 AND 2000)")
74 | println("SHOWING: all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000")
75 | results5.show()
76 | 
77 | //Select all the products whose product code is null
78 | println("SELECTING : all the products whose product code is null")
79 | val results6 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE productCode IS NULL")
80 | println("SHOWING: all the products whose product code is null")
81 | results6.show()
82 | 
83 | //Select all the products whose name starts with Pen; order the results by price in descending order.
84 | println("SELECTING : all the products whose name starts with Pen, ordered by price in descending order")
85 | val results7 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name LIKE 'Pen%' ORDER BY price DESC")
86 | println("SHOWING: all the products whose name starts with Pen, ordered by price in descending order")
87 | results7.show()
88 | 
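// For reference, the same query as results7 can also be expressed with the DataFrame API
// instead of SQL. The snippet below is an illustrative alternative, assuming
// `import org.apache.spark.sql.functions.col` is in scope:
//   productDF.filter(col("name").startsWith("Pen")).orderBy(col("price").desc).show()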
89 | //Select all the products whose name starts with Pen; order the results by price in descending order and quantity in ascending order.
90 | println("SELECTING : all the products whose name starts with Pen, ordered by price descending and quantity ascending")
91 | val results8 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name LIKE 'Pen%' ORDER BY price DESC, quantity ASC")
92 | println("SHOWING: all the products whose name starts with Pen, ordered by price descending and quantity ascending")
93 | results8.show()
94 | 
95 | //Select top 2 products by price
96 | println("SELECTING : top 2 products by price")
97 | val results9 = spark.sql(s"SELECT * FROM $globalTempViewName ORDER BY price DESC LIMIT 2")
98 | println("SHOWING: top 2 products by price")
99 | results9.show()
100 | 
101 | //Select all the columns from the product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS 'Unit Price'
102 | println("SELECTING : all the columns from the product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS 'Unit Price'")
103 | val results10 = spark.sql(s"SELECT productID AS ID, productCode AS Code, name AS Description, price AS Unit_Price FROM $globalTempViewName")
104 | println("SHOWING: all the columns from the product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS 'Unit Price'")
105 | results10.show()
106 | 
107 | //Select code and name, separated by '-', with the header name ProductDescription
108 | println("SELECTING : code and name, separated by '-', with the header name ProductDescription")
109 | val results11 = spark.sql(s"SELECT CONCAT(productCode,'-',name) AS ProductDescription FROM $globalTempViewName")
110 | println("SHOWING: code and name, separated by '-', with the header name ProductDescription")
111 | results11.show()
112 | 
113 | //Select all distinct prices
114 | println("SELECTING : all distinct prices")
115 | val results12 = spark.sql(s"SELECT DISTINCT price AS Distinct_Price FROM $globalTempViewName")
116 | println("SHOWING: all distinct prices")
117 | results12.show()
118 | 
119 | //Select distinct price and name combinations
120 | println("SELECTING : distinct price and name combinations")
121 | val results13 = spark.sql(s"SELECT DISTINCT price, name FROM $globalTempViewName")
122 | println("SHOWING: distinct price and name combinations")
123 | results13.show()
124 | 
125 | //Select all price data sorted by the code and productID combination
126 | println("SELECTING : all price data sorted by the code and productID combination")
127 | val results15 = spark.sql(s"SELECT * FROM $globalTempViewName ORDER BY productCode, productID")
128 | println("SHOWING: all price data sorted by the code and productID combination")
129 | results15.show()
130 | 
131 | 
132 | //Count the number of products for each code
133 | println("SELECTING : count of products for each code")
134 | val results16 = spark.sql(s"SELECT productCode, COUNT(*) AS product_count FROM $globalTempViewName GROUP BY productCode")
135 | println("SHOWING: count of products for each code")
136 | results16.show()
137 | 
138 | //save dataframe to hive table in orc format
139 | writeDataFrameToHiveTable(productDF, SaveMode.Overwrite, "orc", "product_orc_table")
140 | 
141 | //save dataframe to hive table in parquet format
142 | writeDataFrameToHiveTable(productDF, SaveMode.Overwrite, "parquet", "product_parquet_table")
143 | 
144 | }
145 | 
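/**
 * Writes the given dataframe to a Hive table using the requested storage format.
 *
 * @param inputDF       the dataframe to persist
 * @param saveMode      Spark SaveMode to use (e.g. SaveMode.Overwrite)
 * @param dataFormat    one of: json, parquet, jdbc, orc, csv, text, libsvm
 * @param hiveTableName name of the Hive table to write to
 */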
146 | def writeDataFrameToHiveTable(inputDF: DataFrame, saveMode: SaveMode, dataFormat: String, hiveTableName: String) = {
147 | println(s"Starting to write dataframe to hive table with the following data format $dataFormat and hive table name: $hiveTableName")
148 | //match cases: json, parquet, jdbc, orc, libsvm, csv, text
149 | dataFormat match {
150 | case "json" => inputDF.write.mode(saveMode).format("json").saveAsTable(hiveTableName)
151 | case "parquet" => inputDF.write.mode(saveMode).format("parquet").saveAsTable(hiveTableName)
152 | case "jdbc" => inputDF.write.mode(saveMode).format("jdbc").saveAsTable(hiveTableName)
153 | case "orc" => inputDF.write.mode(saveMode).format("orc").saveAsTable(hiveTableName)
154 | case "csv" => inputDF.write.mode(saveMode).format("csv").saveAsTable(hiveTableName)
155 | case "text" => inputDF.write.mode(saveMode).format("text").saveAsTable(hiveTableName)
156 | case "libsvm" => inputDF.write.mode(saveMode).format("libsvm").saveAsTable(hiveTableName)
157 | case _ => println("Invalid dataFormat. Allowed formats are: json, parquet, jdbc, orc, csv, text or libsvm") // the default, catch-all
158 | }
159 | 
160 | println(s"End write dataframe to hive table with the following data format $dataFormat and hive table name: $hiveTableName")
161 | 
162 | }
163 | 
164 | 
165 | }
166 | 
-------------------------------------------------------------------------------- /wiki_data/screenshots/how_to_set_spark_master_to_local_in _intellij.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/wiki_data/screenshots/how_to_set_spark_master_to_local_in _intellij.PNG --------------------------------------------------------------------------------