├── .gitignore ├── README.md ├── build.sbt ├── config ├── test_linux │ ├── application.conf │ └── log4j.properties └── test_windows │ ├── application.conf │ └── log4j.properties ├── insight_data ├── patients.csv ├── products.csv ├── products_suppliers.csv └── supplier.csv ├── problem_scenarios ├── patient_data.md ├── products.md └── rdd_operations.md ├── project ├── assembly.sbt └── build.properties ├── scripts ├── create_products_parquet_table_in_hive.sh └── create_products_table_in_hive.sh ├── src └── main │ └── scala │ └── com │ └── jwk │ └── development │ └── big_data_insights │ └── scala │ └── products │ ├── driver │ └── run_problem_scenario_part_One.scala │ └── problem_scenario │ └── part_One.scala └── wiki_data └── screenshots └── how_to_set_spark_master_to_local_in _intellij.PNG /.gitignore: -------------------------------------------------------------------------------- 1 | RemoteSystemsTempFiles/ 2 | Servers/ 3 | target/ 4 | logs/ 5 | .metadata/ 6 | bin/ 7 | tmp/ 8 | *.tmp 9 | *.bak 10 | *.swp 11 | *~.nib 12 | local.properties 13 | .settings/ 14 | .loadpath 15 | .recommenders 16 | .idea/ 17 | .project 18 | classes/ 19 | .classpath 20 | .iml 21 | *_SUCCESS* 22 | *.crc 23 | 24 | # External tool builders 25 | .externalToolBuilders/ 26 | 27 | # Locally stored "Eclipse launch configurations" 28 | *.launch 29 | 30 | # PyDev specific (Python IDE for Eclipse) 31 | *.pydevproject 32 | 33 | # CDT-specific (C/C++ Development Tooling) 34 | .cproject 35 | 36 | # Java annotation processor (APT) 37 | .factorypath 38 | 39 | # PDT-specific (PHP Development Tools) 40 | .buildpath 41 | 42 | # sbteclipse plugin 43 | .target 44 | 45 | # Tern plugin 46 | .tern-project 47 | 48 | # TeXlipse plugin 49 | .texlipse 50 | 51 | # STS (Spring Tool Suite) 52 | .springBeans 53 | 54 | # Code Recommenders 55 | .recommenders/ 56 | 57 | # Scala IDE specific (Scala & Java development for Eclipse) 58 | .cache-main 59 | .scala_dependencies 60 | .worksheet 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # big-data-insights-scala 2 | personal solutions to big data problem scenarios using scala 3 | 4 | ## Problem Scenarios 5 | 6 | ### 1. [Product Data for a pen company](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/products.md) 7 | 8 | ### 2. [Patient Data](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/patient_data.md) 9 | 10 | ### 3. [RDD (resilient distributed dataset) Operations](https://github.com/jwkimani/big-data-insights-scala/blob/master/problem_scenarios/rdd_operations.md) 11 | 12 | 13 | ## Troubleshooting 14 | 1. 
When running applications, if the error below occurs: *A master URL must be set in your configuration*
15 | ```
16 | Exception in thread "main" java.lang.ExceptionInInitializerError
17 | at com.jwk.development.big_data_insights.scala.products.driver.problem_scenario_1.main(problem_scenario_1.scala)
18 | Caused by: org.apache.spark.SparkException: A master URL must be set in your configuration
19 | ```
20 | 
21 | Solution:
22 | 
23 | Add the following VM option to your run configuration:
24 | ```
25 | -Dspark.master=local
26 | ```
27 | [How to set spark master to local in intellij](https://github.com/jwkimani/big-data-insights-scala/blob/master/wiki_data/screenshots/how_to_set_spark_master_to_local_in%20_intellij.PNG)
28 | 
29 | 
-------------------------------------------------------------------------------- /build.sbt: --------------------------------------------------------------------------------
1 | name := "big-data-insights-scala"
2 | 
3 | version := "1.0"
4 | 
5 | scalaVersion := "2.11.8"
6 | 
7 | libraryDependencies ++= Seq(
8 |   "org.apache.hadoop" % "hadoop-client" % "2.7.3",
9 |   ("org.apache.spark" % "spark-core_2.11" % "2.1.0"),
10 |   ("org.apache.spark" % "spark-sql_2.11" % "2.1.0"),
11 |   "org.apache.spark" % "spark-hive_2.11" % "2.1.0",
12 |   "com.databricks" % "spark-avro_2.11" % "3.2.0",
13 |   "com.databricks" % "spark-csv_2.11" % "1.3.0",
14 |   "org.scala-lang" % "scala-library" % "2.11.8",
15 |   "org.scala-lang" % "scala-reflect" % "2.11.8",
16 |   "com.typesafe" % "config" % "1.3.1",
17 |   "org.apache.logging.log4j" %% "log4j-api-scala" % "2.8.1",
18 |   "org.apache.logging.log4j" % "log4j-core" % "2.8.1",
19 |   "org.apache.kafka" %% "kafka" % "0.9.0.2.3.4.51-1"
20 | 
21 | )
22 | //use external repositories
23 | resolvers += "HortonWorksRepo" at "http://repo.hortonworks.com/content/repositories/releases/"
24 | 
25 | parallelExecution in test := false
26 | 
27 | 
28 | initialCommands := "import org.test._"
29 | 
30 | //clean operations
31 | cleanFiles += baseDirectory { base => base / "build" }.value
32 | cleanFiles += baseDirectory { base => base / "metastore_db" }.value
33 | 
34 | //assembly-settings
-------------------------------------------------------------------------------- /config/test_linux/application.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/config/test_linux/application.conf
-------------------------------------------------------------------------------- /config/test_linux/log4j.properties: --------------------------------------------------------------------------------
1 | # Set root logger level to ERROR and its only appender to A1.
2 | log4j.rootLogger=ERROR, A1
3 | # If we get chained appenders, this stops the message being written multiple times
4 | log4j.additivity.org.apache=false
5 | log4j.additivity.xdasLogger=false
6 | # A1 is set to be a ConsoleAppender.
7 | log4j.appender.A1=org.apache.log4j.ConsoleAppender
8 | log4j.appender.A1.Target=System.out
9 | # A1 uses PatternLayout.
10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 12 | 13 | -------------------------------------------------------------------------------- /config/test_windows/application.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/config/test_windows/application.conf -------------------------------------------------------------------------------- /config/test_windows/log4j.properties: -------------------------------------------------------------------------------- 1 | # Set root logger level to DEBUG and its only appender to A1. 2 | log4j.rootLogger=ERROR, A1 3 | # If we get chained appenders, this stops the message being written multiple times 4 | log4j.additivity.org.apache=false 5 | log4j.additivity.xdasLogger=false 6 | # A1 is set to be a ConsoleAppender. 7 | log4j.appender.A1=org.apache.log4j.ConsoleAppender 8 | log4j.appender.stdout.Target=System.out 9 | # A1 uses PatternLayout. 10 | log4j.appender.A1.layout=org.apache.log4j.PatternLayout 11 | log4j.appender.A1.layout.ConversionPattern=%-4r [%t] %-5p %c %x - %m%n 12 | 13 | -------------------------------------------------------------------------------- /insight_data/patients.csv: -------------------------------------------------------------------------------- 1 | patientID,name ,address ,dateOfBirth,lastVisitDate 2 | 1001 ,Homer Simpson ,"123 Blue St.,Los Angeles, CA 12345" ,1989-12-31 ,2017-01-21 3 | 1002 ,Peter Griffin ,"234 Brown St., San Fransisco, CA 23456",1950-01-30 ,2015-04-18 4 | 1003 ,Hubert J. Fansworth,"546 Red Dr., Sacramento, CA 54678" ,1978-08-21 ,2017-02-14 5 | 1004 ,Marge Simpson ,"123 Blue St.,Los Angeles, CA 12345" ,1990-03-18 ,2016-02-15 6 | 1005 ,Bender Rodriguez ,"127 Brown St., Charlotte, NC 28223" ,1986-12-31 ,2013-12-14 7 | 1006 ,Turanga Leela ,"128 Brown St., Charlotte, NC 28223" ,1978-08-21 ,2012-09-15 8 | -------------------------------------------------------------------------------- /insight_data/products.csv: -------------------------------------------------------------------------------- 1 | productID productCode name quantity price supplierid 2 | 1001 PEN Pen Red 5000 1.23 501 3 | 1002 PEN Pen Blue 8001 1.25 501 4 | 1003 PEN Pen Black 2000 1.25 501 5 | 1004 PEC Pencil 2B 10000 0.48 502 6 | 1005 PEC Pencil 2H 8000 0.49 502 7 | 1006 PEC Pencil HB 0 9999.99 502 8 | 2001 PEC Pencil 3B 500 0.52 501 9 | 2002 PEC Pencil 4B 200 0.62 501 10 | 2003 PEC Pencil 5B 100 0.73 501 11 | 2004 PEC Pencil 6B 500 0.47 502 12 | -------------------------------------------------------------------------------- /insight_data/products_suppliers.csv: -------------------------------------------------------------------------------- 1 | productID,supplierID 2 | 2001 ,501 3 | 2002 ,501 4 | 2003 ,501 5 | 2004 ,502 6 | 2001 ,503 7 | -------------------------------------------------------------------------------- /insight_data/supplier.csv: -------------------------------------------------------------------------------- 1 | supplierID,name ,phone 2 | 501 ,ABC Traders,88881111 3 | 502 ,XYZ Company,88882222 4 | 503 ,QQ Corp ,88883333 5 | 504 ,DEG LLC ,88884444 6 | 505 ,FGH Limited,88885555 7 | -------------------------------------------------------------------------------- /problem_scenarios/patient_data.md: -------------------------------------------------------------------------------- 1 | #Problem Scenario 3 2 | 3 
| __Problem:__ **
4 | 
5 | __Package name:__ **
6 | 
7 | __Driver/Main class:__ **
8 | 
9 | You have been given the following file containing patient data:
10 | [patients.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/patients.csv)
11 | 
12 | Accomplish the following activities (a sketch of possible queries appears at the end of this file):
13 | 
14 | 1. Find all the patients whose lastVisitDate is between '2012-09-15' and the current date
15 | 
16 | ```
17 | 
18 | ```
19 | 
20 | 2. Find all the patients who were born in 1990
21 | 
22 | ```
23 | 
24 | ```
25 | 3. Find the age of each patient
26 | 
27 | ```
28 | 
29 | ```
30 | 
31 | 4. List patients whose last visit was more than 60 days ago
32 | 
33 | ```
34 | 
35 | ```
36 | 
37 | 5. Select patients who are 18 years old or younger
38 | 
39 | ```
40 | 
41 | ```
42 | 
43 | 
44 | 
45 | 
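The sketch below shows one possible way to answer the activities above with SparkSQL. It is illustrative only: the session setup, the temporary view name `patients`, and the way the padded headers and date strings in the CSV are handled are all assumptions, not part of the project code.

```
import org.apache.spark.sql.SparkSession

// Illustrative sketch: view name, session handling and date/whitespace parsing are assumptions.
val spark = SparkSession.builder.appName("patients_sketch").getOrCreate()

val patientsDF = spark.read
  .option("header", "true")
  .option("inferSchema", "true")
  .csv("insight_data/patients.csv")
patientsDF.createOrReplaceTempView("patients")

// 1. patients whose lastVisitDate falls between '2012-09-15' and today
spark.sql("SELECT * FROM patients WHERE lastVisitDate BETWEEN '2012-09-15' AND current_date()").show()

// 2. patients born in 1990
spark.sql("SELECT * FROM patients WHERE year(dateOfBirth) = 1990").show()

// 3. approximate age of each patient, in whole years
spark.sql("SELECT *, floor(datediff(current_date(), dateOfBirth) / 365.25) AS age FROM patients").show()

// 4. patients whose last visit was more than 60 days ago
spark.sql("SELECT * FROM patients WHERE datediff(current_date(), lastVisitDate) > 60").show()

// 5. patients aged 18 or younger
spark.sql("SELECT * FROM patients WHERE datediff(current_date(), dateOfBirth) / 365.25 <= 18").show()
```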
-------------------------------------------------------------------------------- /problem_scenarios/products.md: --------------------------------------------------------------------------------
1 | # Problem Scenario 1
2 | 
3 | __Problem:__ *Given csv files with product information from a pen company, provide some insights using big data technologies*
4 | 
5 | __Package name:__ *[com.jwk.development.big_data_insights.scala.products](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products)*
6 | 
7 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/driver)*
8 | 
9 | __Solution Package:__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario](https://github.com/jwkimani/big-data-insights-scala/tree/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario)*
10 | 
11 | ## Parts
12 | This problem has three parts, so the solutions are broken into three modules in the package:
13 | 1. [Part One](#part1)
14 | 
15 | 2. [Part Two](#part2)
16 | 
17 | 3. [Part Three](#part3)
18 | 
19 | ## Part One
20 | 
21 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver](https://github.com/jwkimani/big-data-insights-scala/blob/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/driver/run_problem_scenario_part_One.scala)*
22 | 
23 | __Solution Package:__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario](https://github.com/jwkimani/big-data-insights-scala/blob/master/src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario/part_One.scala)*
24 | 
25 | You have the following tab-delimited csv file: [products.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/products.csv)
26 | 
27 | Using Spark and SparkSQL, perform the following tasks:
28 | 1. Load the csv file into a dataframe
29 | 
30 | Using the schema:
31 | ```
32 | val schema =
33 |   StructType(
34 |     Array(
35 |       StructField("productID", IntegerType, false),
36 |       StructField("productCode", StringType, false),
37 |       StructField("name", StringType, false),
38 |       StructField("quantity", IntegerType, false),
39 |       StructField("price", FloatType, false)
40 |     )
41 |   )
42 | ```
43 | Approach:
44 | ```
45 | val productDF = sqlContext.read.format("com.databricks.spark.csv").option("delimiter","\t").option("header","true").option("inferSchema", "false").schema(schema).load(path)
46 | ```
47 | 
48 | 2. Create a global temporary view named `products` from the dataframe
49 | 
50 | `productDF.createGlobalTempView("products")`
51 | 
52 | 3. Using the global temporary view, perform the tasks below
53 | 
54 | 4. Select and show all the records with quantity >= 5000 and a name starting with 'Pen'
55 | ```
56 | +---------+-----------+---------+--------+-----+
57 | |productID|productCode| name|quantity|price|
58 | +---------+-----------+---------+--------+-----+
59 | | 1001| PEN| Pen Red| 5000| 1.23|
60 | | 1002| PEN| Pen Blue| 8001| 1.25|
61 | | 1004| PEC|Pencil 2B| 10000| 0.48|
62 | | 1005| PEC|Pencil 2H| 8000| 0.49|
63 | +---------+-----------+---------+--------+-----+
64 | ```
65 | 
66 | 5. Select and show all the records with quantity >= 5000, price less than 1.24 and a name starting with 'Pen'
67 | ```
68 | +---------+-----------+---------+--------+-----+
69 | |productID|productCode| name|quantity|price|
70 | +---------+-----------+---------+--------+-----+
71 | | 1001| PEN| Pen Red| 5000| 1.23|
72 | | 1004| PEC|Pencil 2B| 10000| 0.48|
73 | | 1005| PEC|Pencil 2H| 8000| 0.49|
74 | +---------+-----------+---------+--------+-----+
75 | ```
76 | 
77 | 6. Select and show all the records that do not have both quantity >= 5000 and a name starting with 'Pen'
78 | ```
79 | +---------+-----------+---------+--------+-------+
80 | |productID|productCode| name|quantity| price|
81 | +---------+-----------+---------+--------+-------+
82 | | 1003| PEN|Pen Black| 2000| 1.25|
83 | | 1006| PEC|Pencil HB| 0|9999.99|
84 | | 2001| PEC|Pencil 3B| 500| 0.52|
85 | | 2002| PEC|Pencil 4B| 200| 0.62|
86 | | 2003| PEC|Pencil 5B| 100| 0.73|
87 | | 2004| PEC|Pencil 6B| 500| 0.47|
88 | +---------+-----------+---------+--------+-------+
89 | ```
90 | 
91 | 7. Select and show all the products whose name is 'Pen Red' or 'Pen Black'
92 | ```
93 | +---------+-----------+---------+--------+-----+
94 | |productID|productCode| name|quantity|price|
95 | +---------+-----------+---------+--------+-----+
96 | | 1001| PEN| Pen Red| 5000| 1.23|
97 | | 1003| PEN|Pen Black| 2000| 1.25|
98 | +---------+-----------+---------+--------+-----+
99 | ```
100 | 8. Select and show all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000
101 | ```
102 | +---------+-----------+---------+--------+-----+
103 | |productID|productCode| name|quantity|price|
104 | +---------+-----------+---------+--------+-----+
105 | | 1003| PEN|Pen Black| 2000| 1.25|
106 | +---------+-----------+---------+--------+-----+
107 | ```
108 | 
109 | 9. Select all the products whose product code is null
110 | 
111 | ```
112 | +---------+-----------+----+--------+-----+
113 | |productID|productCode|name|quantity|price|
114 | +---------+-----------+----+--------+-----+
115 | +---------+-----------+----+--------+-----+
116 | ```
117 | 
118 | 10. Select all the products whose name starts with 'Pen'; order the results by price in descending order.
119 | 
120 | ```
121 | +---------+-----------+---------+--------+-------+
122 | |productID|productCode| name|quantity| price|
123 | +---------+-----------+---------+--------+-------+
124 | | 1006| PEC|Pencil HB| 0|9999.99|
125 | | 1003| PEN|Pen Black| 2000| 1.25|
126 | | 1002| PEN| Pen Blue| 8001| 1.25|
127 | | 1001| PEN| Pen Red| 5000| 1.23|
128 | | 2003| PEC|Pencil 5B| 100| 0.73|
129 | | 2002| PEC|Pencil 4B| 200| 0.62|
130 | | 2001| PEC|Pencil 3B| 500| 0.52|
131 | | 1005| PEC|Pencil 2H| 8000| 0.49|
132 | | 1004| PEC|Pencil 2B| 10000| 0.48|
133 | | 2004| PEC|Pencil 6B| 500| 0.47|
134 | +---------+-----------+---------+--------+-------+
135 | ```
136 | 
137 | 11. Select all the products whose name starts with 'Pen'; order the results by price in descending order and quantity in ascending order.
138 | 
139 | ```
140 | +---------+-----------+---------+--------+-------+
141 | |productID|productCode| name|quantity| price|
142 | +---------+-----------+---------+--------+-------+
143 | | 1006| PEC|Pencil HB| 0|9999.99|
144 | | 1003| PEN|Pen Black| 2000| 1.25|
145 | | 1002| PEN| Pen Blue| 8001| 1.25|
146 | | 1001| PEN| Pen Red| 5000| 1.23|
147 | | 2003| PEC|Pencil 5B| 100| 0.73|
148 | | 2002| PEC|Pencil 4B| 200| 0.62|
149 | | 2001| PEC|Pencil 3B| 500| 0.52|
150 | | 1005| PEC|Pencil 2H| 8000| 0.49|
151 | | 1004| PEC|Pencil 2B| 10000| 0.48|
152 | | 2004| PEC|Pencil 6B| 500| 0.47|
153 | +---------+-----------+---------+--------+-------+
154 | ```
155 | 
156 | 12. Select the top 2 products by price
157 | 
158 | ```
159 | +---------+-----------+---------+--------+-------+
160 | |productID|productCode| name|quantity| price|
161 | +---------+-----------+---------+--------+-------+
162 | | 1006| PEC|Pencil HB| 0|9999.99|
163 | | 1002| PEN| Pen Blue| 8001| 1.25|
164 | +---------+-----------+---------+--------+-------+
165 | ```
166 | 
167 | 
168 | 13. Select all the columns from the product table with the output header as below (a sketch of possible queries for tasks 13-24 appears after this list).
169 | `
170 | productID AS ID
171 | code AS Code
172 | name AS Description
173 | price AS 'Unit Price'
174 | `
175 | ```
176 | 
177 | ```
178 | 
179 | 14. Select code and name, both separated by '-', and the header name should be ProductDescription
180 | 
181 | ```
182 | ```
183 | 
184 | 15. Select all distinct prices.
185 | 
186 | ```
187 | ```
188 | 
189 | 16. Select distinct price and name combinations
190 | 
191 | ```
192 | ```
193 | 
194 | 17. Select all price data sorted by the code and productID combination
195 | 
196 | ```
197 | ```
198 | 
199 | 18. Count the number of products.
200 | 
201 | ```
202 | ```
203 | 
204 | 19. Count the number of products for each code
205 | 
206 | ```
207 | ```
208 | 
209 | 
210 | 20. Select the maximum, minimum, average, standard deviation, and total quantity
211 | 
212 | 21. Select the minimum and maximum price for each product code.
213 | 
214 | 22. Select the maximum, minimum, average, standard deviation, and total quantity for each product code; make sure the average and standard deviation show at most two decimal places.
215 | 
216 | 23. Select each product code and its average price, but only where the product count is greater than or equal to 3
217 | 
218 | 24. Select the maximum, minimum, average and total of all the products for each code. Also produce the same across all the products.
219 | 
220 | 
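Tasks 13-24 do not yet have sample answers checked in. The sketch below shows one possible set of SparkSQL statements for them; it assumes the `spark` session and the global temporary view created in part_One.scala (global temporary views are queried through the `global_temp` database), and the aliases, rounding and `ROLLUP` choices are illustrative rather than the project's own solution.

```
// Hedged sketch for tasks 13-24; assumes the global temp view from part_One.scala.
// 13. rename the output columns
spark.sql("SELECT productID AS ID, productCode AS Code, name AS Description, price AS `Unit Price` FROM global_temp.products").show()

// 14. code and name joined with '-'
spark.sql("SELECT concat(productCode, '-', name) AS ProductDescription FROM global_temp.products").show()

// 15. and 16. distinct prices, then distinct (price, name) pairs
spark.sql("SELECT DISTINCT price FROM global_temp.products").show()
spark.sql("SELECT DISTINCT price, name FROM global_temp.products").show()

// 17. price data sorted by code, then productID
spark.sql("SELECT productCode, productID, price FROM global_temp.products ORDER BY productCode, productID").show()

// 18. and 19. overall product count, then count per code
spark.sql("SELECT count(*) AS product_count FROM global_temp.products").show()
spark.sql("SELECT productCode, count(*) AS product_count FROM global_temp.products GROUP BY productCode").show()

// 20.-22. aggregates on quantity and price, overall and per code, rounded to two decimals
spark.sql("SELECT max(quantity), min(quantity), round(avg(quantity), 2), round(stddev(quantity), 2), sum(quantity) FROM global_temp.products").show()
spark.sql("SELECT productCode, min(price), max(price) FROM global_temp.products GROUP BY productCode").show()
spark.sql("SELECT productCode, max(quantity), min(quantity), round(avg(quantity), 2), round(stddev(quantity), 2), sum(quantity) FROM global_temp.products GROUP BY productCode").show()

// 23. average price only for codes with at least three products
spark.sql("SELECT productCode, avg(price) AS avg_price FROM global_temp.products GROUP BY productCode HAVING count(*) >= 3").show()

// 24. per-code aggregates plus an all-products row (ROLLUP adds the grand-total row)
spark.sql("SELECT productCode, max(price), min(price), avg(price), sum(price) FROM global_temp.products GROUP BY productCode WITH ROLLUP").show()
```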
221 | ## Part Two
222 | __Package name:__ *[com.jwk.development.big_data_insights.scala.products]()*
223 | 
224 | __Driver/Main class:__ *[com.jwk.development.big_data_insights.scala.products.driver]()*
225 | 
226 | __Solution Package:__ *[com.jwk.development.big_data_insights.scala.products.problem_scenario]()*
227 | 
228 | You have been provided two additional files:
229 | 
230 | 1. [suppliers.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/supplier.csv)
231 | 
232 | 2. [products_suppliers.csv](https://github.com/jwkimani/big-data-insights-scala/blob/master/insight_data/products_suppliers.csv)
233 | 
234 | 
235 | Now answer the following queries.
236 | 
237 | 1. Select each product, its price, and its supplier name where the product price is less than 0.6, using SparkSQL
238 | 
239 | 
240 | 2. It is possible that the same product is supplied by multiple suppliers. Find each product and its price according to
241 | each supplier.
242 | 
243 | 3. Find the names of all the suppliers who supply 'Pencil 3B'
244 | 
245 | 4. Find all the products that are supplied by ABC Traders
246 | 
247 | 
248 | ## Part Three
249 | 1. Create a Hive ORC table using SparkSQL
250 | 
251 | 2. Load the data into the Hive table.
252 | 
253 | 3. Create a Hive Parquet table using SparkSQL and load the data into it.
254 | 
255 | 
256 | ## Developer Notes:
257 | Add the following VM options to set the Spark master:
258 | ```
259 | -Dspark.master=local
260 | -Dhadoop.home.dir=C:\hadoop-2.7.4
261 | ```
262 | 
-------------------------------------------------------------------------------- /problem_scenarios/rdd_operations.md: --------------------------------------------------------------------------------
1 | # Problem Scenario: RDD Operations
2 | 
3 | __Problem:__ **
4 | 
5 | __Package name:__ **
6 | 
7 | __Driver/Main class:__ **
8 | 
9 | __Solution:__ **
10 | 
11 | 
12 | 1. You have been given the code snippet below
13 | 
14 | ```
15 | val a = sc.parallelize(List("dog", "salmon", "salmon", "rat", "elephant"), 3)
16 | 
17 | val b = a.keyBy(_.length)
18 | 
19 | val c = sc.parallelize(List("dog", "cat", "gnu", "salmon", "rabbit", "turkey", "wolf", "bear", "bee"), 3)
20 | 
21 | val d = c.keyBy(_.length)
22 | 
23 | ```
24 | 
25 | Write a correct code snippet for operation1 which will produce the desired output, shown below (see the sketch after the output).
26 | 
27 | ```
28 | Array[(Int, (String, String))] =
29 | 
30 | (6,(salmon,rabbit))
31 | 
32 | (3,(dog,dog)), (3,(dog,cat)), (3,(dog,gnu)),
33 | ```
34 | 
35 | 
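The desired output pairs values from `b` and `d` that share the same key (the word length), which is exactly what an inner `join` on keyed RDDs produces. A plausible `operation1`, assuming the `sc` SparkContext and the RDDs defined above, is:

```
// Plausible operation1: inner join of the two keyed RDDs on word length.
// b.join(d) yields an RDD[(Int, (String, String))]; collect materialises it as the Array shown above.
val operation1 = b.join(d)
operation1.collect
```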
-------------------------------------------------------------------------------- /project/assembly.sbt: --------------------------------------------------------------------------------
1 | addSbtPlugin("com.eed3si9n" % "sbt-assembly" % "0.12.0")
2 | resolvers += Resolver.url("bintray-sbt-plugins", url("http://dl.bintray.com/sbt/sbt-plugin-releases"))(Resolver.ivyStylePatterns)
-------------------------------------------------------------------------------- /project/build.properties: --------------------------------------------------------------------------------
1 | sbt.version=0.13.15
2 | 
-------------------------------------------------------------------------------- /scripts/create_products_parquet_table_in_hive.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | hive -e \
4 | "
5 | CREATE EXTERNAL TABLE products_parquet (productid int, code string, name string, quantity int, price float)
6 | STORED AS parquet
7 | LOCATION '/user/hive/warehouse/product_parquet_table'
8 | ;"
9 | 
10 | #"select * from product_parquet_table;"
-------------------------------------------------------------------------------- /scripts/create_products_table_in_hive.sh: --------------------------------------------------------------------------------
1 | #!/usr/bin/env bash
2 | 
3 | hive -e \
4 | "
5 | CREATE EXTERNAL TABLE products_orc (productid int, code string, name string, quantity int, price float)
6 | STORED AS orc
7 | LOCATION '/user/hive/warehouse/product_orc_table'
8 | ;"
9 | 
10 | #select * from product_orc_table
-------------------------------------------------------------------------------- /src/main/scala/com/jwk/development/big_data_insights/scala/products/driver/run_problem_scenario_part_One.scala: --------------------------------------------------------------------------------
1 | package com.jwk.development.big_data_insights.scala.products.driver
2 | 
3 | import java.util.Date
4 | 
5 | import com.jwk.development.big_data_insights.scala.products.problem_scenario.part_One
6 | import org.apache.spark.sql.SparkSession
7 | 
8 | object run_problem_scenario_part_One {
9 | val spark: SparkSession = SparkSession.builder.getOrCreate()
10 | 
11 | def main(args: Array[String]): Unit = {
12 | 
13 | //signal start message
14 | println("Start " + this.getClass.getName() + " : " + new Date())
15 | 
16 | try {
17 | val problemPart = new part_One
18 | problemPart.part_One_Solution("insight_data/products.csv")
19 | } catch {
20 | case ex: Exception => {
21 | println(this.getClass.getName() + ". Error during program run. 
Root cause: " + ex.getMessage()) 22 | } 23 | } 24 | 25 | //signal end message 26 | println("End " + this.getClass.getName() + " : " + new Date()) 27 | } 28 | } 29 | -------------------------------------------------------------------------------- /src/main/scala/com/jwk/development/big_data_insights/scala/products/problem_scenario/part_One.scala: -------------------------------------------------------------------------------- 1 | package com.jwk.development.big_data_insights.scala.products.problem_scenario 2 | 3 | import org.apache.spark.sql.types._ 4 | import org.apache.spark.sql.{DataFrame, SaveMode, SparkSession} 5 | 6 | class part_One { 7 | val spark: SparkSession = SparkSession.builder.appName("products_application").config("spark.serializer", "org.apache.spark.serializer.KryoSerializer").getOrCreate() 8 | val sparkContext = spark.sparkContext 9 | val sqlContext = new org.apache.spark.sql.SQLContext(sparkContext) 10 | 11 | 12 | /** 13 | * Solution to part one of products problem scenario 14 | * 15 | * @param path file path to products.csv file 16 | */ 17 | def part_One_Solution(path: String): Unit = { 18 | //val tab_delimited_Header= "productID\tproductCode\tname\tquantity\tprice\tsupplierid" 19 | //val comma_delimited_Header= "productID,productCode,name,quantity,price,supplierid" 20 | 21 | //define schema of csv file 22 | val schema = 23 | StructType( 24 | Array( 25 | StructField("productID", IntegerType, false), 26 | StructField("productCode", StringType, false), 27 | StructField("name", StringType, false), 28 | StructField("quantity", IntegerType, false), 29 | StructField("price", FloatType, false) 30 | ) 31 | ) 32 | 33 | //read csv file from directory path using schema 34 | val productDF = sqlContext.read.format("com.databricks.spark.csv").option("delimiter", "\t").option("header", "true").option("inferSchema", "false").schema(schema).load(path) 35 | //show first 10 records in the dataframe 36 | productDF.show(10) 37 | 38 | // Register the DataFrame as a global temporary view 39 | val tempTableName = "products" 40 | productDF.createGlobalTempView(tempTableName) 41 | val globalTempViewName = s"global_temp.$tempTableName" 42 | 43 | //import apache spark sql 44 | import org.apache.spark.sql._ 45 | 46 | //The following answers PART ONE questions of the problem scenario. 47 | //1. Select all the records with quantity >= 5000 and name starts with 'Pen' 48 | println("SELECTING: all the records with quantity >= 5000 and name starts with 'Pen'") 49 | val results1 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE quantity >= 5000 AND name LIKE 'Pen%'") 50 | println("SHOWING: all the records with quantity >= 5000 and name starts with 'Pen'") 51 | results1.show() 52 | 53 | //2. Select all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen' 54 | println("SELECTING: all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'") 55 | val results2 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE quantity >= 5000 AND price < 1.24 AND name LIKE 'Pen%'") 56 | println("SHOWING: all the records with quantity >= 5000, price is less than 1.24 and name starts with 'Pen'") 57 | results2.show() 58 | 59 | //3. 
Select all the records that do not have both quantity >= 5000 and a name starting with 'Pen'
60 | println("SELECTING: all the records that do not have both quantity >= 5000 and a name starting with 'Pen'")
61 | val results3 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE NOT (quantity >= 5000 AND name LIKE 'Pen%')")
62 | println("SHOWING: all the records that do not have both quantity >= 5000 and a name starting with 'Pen'")
63 | results3.show()
64 | 
65 | //4. Select all the products whose name is 'Pen Red' or 'Pen Black'
66 | println("SELECTING: all the products whose name is 'Pen Red' or 'Pen Black'")
67 | val results4 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name IN ('Pen Red', 'Pen Black')")
68 | println("SHOWING: all the products whose name is 'Pen Red' or 'Pen Black'")
69 | results4.show()
70 | 
71 | //5. Select all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000
72 | println("SELECTING : all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000")
73 | val results5 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE (price BETWEEN 1 AND 2) AND (quantity BETWEEN 1000 AND 2000)")
74 | println("SHOWING: all the products with price BETWEEN 1.0 AND 2.0 AND quantity BETWEEN 1000 AND 2000")
75 | results5.show()
76 | 
77 | //Select all the products whose product code is null
78 | println("SELECTING : all the products whose product code is null")
79 | val results6 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE productCode IS NULL")
80 | println("SHOWING: all the products whose product code is null")
81 | results6.show()
82 | 
83 | //Select all the products whose name starts with Pen; order the results by price in descending order.
84 | println("SELECTING : all the products whose name starts with Pen, ordered by price in descending order")
85 | val results7 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name LIKE 'Pen%' ORDER BY price DESC")
86 | println("SHOWING: all the products whose name starts with Pen, ordered by price in descending order")
87 | results7.show()
88 | 
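// For reference, the same query as results7 can also be expressed with the DataFrame API
// instead of SQL. The snippet below is an illustrative alternative, assuming
// `import org.apache.spark.sql.functions.col` is in scope:
//   productDF.filter(col("name").startsWith("Pen")).orderBy(col("price").desc).show()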
89 | //Select all the products whose name starts with Pen; order the results by price in descending order and quantity in ascending order.
90 | println("SELECTING : all the products whose name starts with Pen, ordered by price descending and quantity ascending")
91 | val results8 = spark.sql(s"SELECT * FROM $globalTempViewName WHERE name LIKE 'Pen%' ORDER BY price DESC, quantity ASC")
92 | println("SHOWING: all the products whose name starts with Pen, ordered by price descending and quantity ascending")
93 | results8.show()
94 | 
95 | //Select top 2 products by price
96 | println("SELECTING : top 2 products by price")
97 | val results9 = spark.sql(s"SELECT * FROM $globalTempViewName ORDER BY price DESC LIMIT 2")
98 | println("SHOWING: top 2 products by price")
99 | results9.show()
100 | 
101 | //Select all the columns from the product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS 'Unit Price'
102 | println("SELECTING : all the columns from the product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS 'Unit Price'")
103 | val results10 = spark.sql(s"SELECT productID AS ID, productCode AS Code, name AS Description, price AS Unit_Price FROM $globalTempViewName")
104 | println("SHOWING: all the columns from the product table with output headers: productID AS ID, productCode AS Code, name AS Description, price AS 'Unit Price'")
105 | results10.show()
106 | 
107 | //Select code and name, separated by '-', with the header name ProductDescription
108 | println("SELECTING : code and name, separated by '-', with the header name ProductDescription")
109 | val results11 = spark.sql(s"SELECT CONCAT(productCode,'-',name) AS ProductDescription FROM $globalTempViewName")
110 | println("SHOWING: code and name, separated by '-', with the header name ProductDescription")
111 | results11.show()
112 | 
113 | //Select all distinct prices
114 | println("SELECTING : all distinct prices")
115 | val results12 = spark.sql(s"SELECT DISTINCT price AS Distinct_Price FROM $globalTempViewName")
116 | println("SHOWING: all distinct prices")
117 | results12.show()
118 | 
119 | //Select distinct price and name combinations
120 | println("SELECTING : distinct price and name combinations")
121 | val results13 = spark.sql(s"SELECT DISTINCT price, name FROM $globalTempViewName")
122 | println("SHOWING: distinct price and name combinations")
123 | results13.show()
124 | 
125 | //Select all price data sorted by the code and productID combination
126 | println("SELECTING : all price data sorted by the code and productID combination")
127 | val results15 = spark.sql(s"SELECT * FROM $globalTempViewName ORDER BY productCode, productID")
128 | println("SHOWING: all price data sorted by the code and productID combination")
129 | results15.show()
130 | 
131 | 
132 | //Count the number of products for each code
133 | println("SELECTING : count of products for each code")
134 | val results16 = spark.sql(s"SELECT productCode, COUNT(*) AS product_count FROM $globalTempViewName GROUP BY productCode")
135 | println("SHOWING: count of products for each code")
136 | results16.show()
137 | 
138 | //save dataframe to hive table in orc format
139 | writeDataFrameToHiveTable(productDF, SaveMode.Overwrite, "orc", "product_orc_table")
140 | 
141 | //save dataframe to hive table in parquet format
142 | writeDataFrameToHiveTable(productDF, SaveMode.Overwrite, "parquet", "product_parquet_table")
143 | 
144 | }
145 | 
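/**
 * Writes the given dataframe to a Hive table using the requested storage format.
 *
 * @param inputDF       the dataframe to persist
 * @param saveMode      Spark SaveMode to use (e.g. SaveMode.Overwrite)
 * @param dataFormat    one of: json, parquet, jdbc, orc, csv, text, libsvm
 * @param hiveTableName name of the Hive table to write to
 */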
146 | def writeDataFrameToHiveTable(inputDF: DataFrame, saveMode: SaveMode, dataFormat: String, hiveTableName: String) = {
147 | println(s"Starting to write dataframe to hive table with the following data format $dataFormat and hive table name: $hiveTableName")
148 | //match cases: json, parquet, jdbc, orc, libsvm, csv, text
149 | dataFormat match {
150 | case "json" => inputDF.write.mode(saveMode).format("json").saveAsTable(hiveTableName)
151 | case "parquet" => inputDF.write.mode(saveMode).format("parquet").saveAsTable(hiveTableName)
152 | case "jdbc" => inputDF.write.mode(saveMode).format("jdbc").saveAsTable(hiveTableName)
153 | case "orc" => inputDF.write.mode(saveMode).format("orc").saveAsTable(hiveTableName)
154 | case "csv" => inputDF.write.mode(saveMode).format("csv").saveAsTable(hiveTableName)
155 | case "text" => inputDF.write.mode(saveMode).format("text").saveAsTable(hiveTableName)
156 | case "libsvm" => inputDF.write.mode(saveMode).format("libsvm").saveAsTable(hiveTableName)
157 | case _ => println("Invalid dataFormat. Allowed formats are: json, parquet, jdbc, orc, csv, text or libsvm") // the default, catch-all
158 | }
159 | 
160 | println(s"End write dataframe to hive table with the following data format $dataFormat and hive table name: $hiveTableName")
161 | 
162 | }
163 | 
164 | 
165 | }
166 | 
-------------------------------------------------------------------------------- /wiki_data/screenshots/how_to_set_spark_master_to_local_in _intellij.PNG: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/jwkimani/big-data-insights-scala/e999dffab10c4fa8f5e716da461ae14e86965c33/wiki_data/screenshots/how_to_set_spark_master_to_local_in _intellij.PNG --------------------------------------------------------------------------------