├── DataFrame Solutions
│   ├── 1158. Market Analysis I (Medium).txt
│   ├── 1212. Team Scores in Football Tournament (Medium).txt
│   ├── 1355. Activity Participants (Medium).txt
│   ├── 1445. Apples & Oranges (Medium).txt
│   ├── 1596. The Most Frequently Ordered Products for Each Customer (Medium).txt
│   ├── 175. Combine Two Tables (Easy).txt
│   ├── 181. Employees Earning More Than Their Managers (Easy).txt
│   ├── 182. Duplicate Emails (Easy).txt
│   ├── 183. Customers Who Never Order (Easy).txt
│   ├── 1907. Count Salary Categories (Medium).txt
│   ├── 1934. Confirmation Rate (Medium).txt
│   ├── 196. Delete Duplicate Emails (Easy).txt
│   ├── 197. Rising Temperature (Easy).txt
│   ├── 1988. Find Cutoff Score for Each School (Medium).txt
│   ├── 2051. The Category of Each Member in the Store (Medium).txt
│   ├── 511. Game Play Analysis I (Easy).txt
│   ├── 512. Game Play Analysis II (Easy).txt
│   ├── 577. Employee Bonus (Easy).txt
│   ├── 584. Find Customer Referee (Easy).txt
│   ├── 586. Customer Placing the Largest Number of Orders (Easy).txt
│   └── 595. Big Countries (Easy).txt
├── README.md
├── apache_spark.png
├── leetcode.png
├── plus.png
└── postgresql_dump_file
    └── leetcodedb.sql

/DataFrame Solutions/1158. Market Analysis I (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM users_1158) AS users"
2 |
3 | val usersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM orders_1158) AS orders"
6 |
7 | val ordersDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val query = "(SELECT * FROM items_1158) AS items"
10 |
11 | val itemsDF = spark.read.jdbc(url, query, connectionProperties)
12 |
13 | val orders2019DF = ordersDF.where(year($"order_date")===2019).groupBy($"buyer_id").agg(count($"order_id").as("orders_in_2019"))
14 |
15 | val resultDF = usersDF.as("u").join(orders2019DF.as("o"),$"u.user_id"===$"o.buyer_id","left_outer").select($"u.user_id",$"u.join_date",coalesce($"o.orders_in_2019",lit(0)).as("orders_in_2019"))
16 |
17 | resultDF.show
18 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1212. Team Scores in Football Tournament (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM teams_1212) AS teams"
2 |
3 | val teamsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM matches_1212) AS matches"
6 |
7 | val matchesDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val tiedMatchesDF = matchesDF.where($"guest_goals" === $"host_goals").select($"guest_team",$"host_team",$"guest_goals",$"host_goals")
10 |
11 | val unionDF = matchesDF.select($"host_team",$"guest_team",$"host_goals",$"guest_goals").union(tiedMatchesDF)
12 |
13 | val reportDF = unionDF.withColumn("winner",when($"host_goals">$"guest_goals",$"host_team").when($"host_goals"<$"guest_goals",$"guest_team").otherwise($"host_team")).withColumn("points",when($"host_goals"===$"guest_goals",lit(1)).otherwise(lit(3))).select($"winner",$"points").groupBy($"winner").agg(sum($"points").as("num_points"))
14 |
15 | val resultDF = teamsDF.as("t").join(reportDF.as("r"),$"t.team_id"===$"r.winner","left_outer").select($"t.team_id",$"t.team_name",coalesce($"r.num_points",lit(0)).as("num_points")).orderBy($"num_points".desc,$"t.team_id")
16 |
17 | resultDF.show
18 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1355. Activity Participants (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM friends_1355) AS friends"
2 |
3 | val friendsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val groupedDF = friendsDF.groupBy($"activity").agg(count($"id").as("cnt"))
6 |
7 | val resultDF = groupedDF.withColumn("min_count",min($"cnt").over()).withColumn("max_count",max($"cnt").over()).where($"cnt"=!=$"max_count" && $"cnt"=!=$"min_count").select($"activity")
8 |
9 | resultDF.show
10 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1445. Apples & Oranges (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM sales_1445) AS sales"
2 |
3 | val salesDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val applesDF = salesDF.where($"fruit"==="apples")
6 |
7 | val orangesDF = salesDF.where($"fruit"==="oranges")
8 |
9 | val resultDF = applesDF.as("a").join(orangesDF.as("o"),$"a.sale_date"===$"o.sale_date","full_outer").select($"a.sale_date",($"a.sold_num"-$"o.sold_num").as("diff"))
10 |
11 | resultDF.show
12 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1596. The Most Frequently Ordered Products for Each Customer (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM customers_1596) AS customers"
2 |
3 | val customersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM orders_1596) AS orders"
6 |
7 | val ordersDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val query = "(SELECT * FROM products_1596) AS products"
10 |
11 | val productsDF = spark.read.jdbc(url, query, connectionProperties)
12 |
13 |
14 | val groupedDF = ordersDF.groupBy($"customer_id",$"product_id").agg(count(lit(1)).as("cnt"))
15 |
16 | import org.apache.spark.sql.expressions.Window
17 |
18 | val windowSpec = Window.partitionBy("customer_id").orderBy(desc("cnt"))
19 |
20 | val rankedDF = groupedDF.withColumn("rank",dense_rank().over(windowSpec)).where($"rank"===lit(1)).select($"customer_id",$"product_id")
21 |
22 | val resultDF = rankedDF.as("r").join(productsDF.as("p"),$"p.product_id"===$"r.product_id").selectExpr("r.*","product_name")
23 |
24 | resultDF.show
25 |
--------------------------------------------------------------------------------
/DataFrame Solutions/175. Combine Two Tables (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM person_175) AS person"
2 | val personDF = spark.read.jdbc(url, query, connectionProperties)
3 |
4 | val query = "(SELECT * FROM address_175) AS address"
5 | val addressDF = spark.read.jdbc(url, query, connectionProperties)
6 |
7 | val joinCondition = personDF.col("personid") === addressDF.col("personid")
8 | val joinedDF = personDF.join(addressDF,joinCondition,"left_outer")
9 |
10 | joinedDF.show
11 |
--------------------------------------------------------------------------------
/DataFrame Solutions/181. Employees Earning More Than Their Managers (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM employee_181) AS employee"
2 |
3 | val employeeDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val joinedDF = employeeDF.as("emp").join(employeeDF.as("mgr"),$"emp.manager_id"===$"mgr.id" && $"emp.salary" > $"mgr.salary","inner").select($"emp.name")
6 |
7 | joinedDF.show
8 |
--------------------------------------------------------------------------------
/DataFrame Solutions/182. Duplicate Emails (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM person_182) AS person"
2 |
3 | val personDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val duplicateDF = personDF.groupBy(col("email")).agg(count(col("id")).as("cnt")).where($"cnt">1).select(col("email"))
6 |
7 | duplicateDF.show
8 |
9 |
--------------------------------------------------------------------------------
/DataFrame Solutions/183. Customers Who Never Order (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM customers_183) AS customers"
2 |
3 | val customerDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM orders_183) AS orders"
6 |
7 | val orderDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | // Direct Method
10 |
11 | val joinedDF = customerDF.as("c").join(orderDF.as("o"),$"c.id"===$"o.customer_id","left_anti").select($"c.name")
12 |
13 | // Conventional Method
14 |
15 | val joinedDF = customerDF.as("c").join(orderDF.as("o"),$"c.id"===$"o.customer_id","left_outer").where($"o.id".isNull).select($"c.name")
16 |
17 | joinedDF.show
18 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1907. Count Salary Categories (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM accounts_1907) AS accounts"
2 |
3 | val accountsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val categorizedDF = accountsDF.withColumn("category",when($"income"<20000,"Low Salary").when($"income">=20000 && $"income"<=50000,"Average Salary").otherwise("High Salary"))
6 |
7 | val groupedDF = categorizedDF.groupBy($"category").agg(count($"account_id").as("accounts_count"))
8 |
9 | val categoryDF = Seq("Low Salary","Average Salary","High Salary").toDF("category")
10 |
11 | val resultDF = categoryDF.as("c")
12 | .join(groupedDF.as("g"),$"c.category"===$"g.category","left_outer")
13 | .select($"c.category",coalesce($"g.accounts_count",lit(0)).as("accounts_count"))
14 |
15 | resultDF.show
16 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1934. Confirmation Rate (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM signups_1934) AS signups"
2 |
3 | val signupsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM confirmations_1934) AS confirmations"
6 |
7 | val confirmationsDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | import org.apache.spark.sql.expressions.Window
10 |
11 | val w = Window.partitionBy("user_id")
12 |
13 | val crDF = confirmationsDF.withColumn("confirmation_count",count(when($"action"===lit("confirmed"),lit(1)).otherwise(null)).over(w)).withColumn("total_count",count($"action").over(w)).select($"user_id",round($"confirmation_count"/$"total_count",2).as("confirmation_rate"))
14 |
15 | val resultDF = signupsDF.as("s").join(crDF.as("c"),$"s.user_id"===$"c.user_id","left_outer").select($"s.user_id",coalesce($"confirmation_rate",lit(0)).as("confirmation_rate")).distinct
16 |
17 | resultDF.show
--------------------------------------------------------------------------------
/DataFrame Solutions/196. Delete Duplicate Emails (Easy).txt:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql.expressions.Window
2 |
3 | val query = "(SELECT * FROM person_196) AS person"
4 |
5 | val personDF = spark.read.jdbc(url, query, connectionProperties)
6 |
7 | val w = Window.partitionBy("email")
8 |
9 | val distinctPersonDF = personDF.withColumn("min",min($"id").over(w)).where($"min"===$"id").drop($"min")
10 |
11 | distinctPersonDF.show
12 |
--------------------------------------------------------------------------------
/DataFrame Solutions/197. Rising Temperature (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM weather_197) AS weather"
2 |
3 | val weatherDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = weatherDF.as("w1")
6 | .join(weatherDF.as("w2"),$"w2.record_date"+1===$"w1.record_date" && $"w2.temperature"<$"w1.temperature","inner")
7 | .select($"w1.id")
8 |
9 | resultDF.show
10 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1988. Find Cutoff Score for Each School (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM school_1988) AS school"
2 |
3 | val schoolDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM exam_1988) AS exam"
6 |
7 | val examDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val resultDF = schoolDF.as("s").join(examDF.as("e"),$"s.capacity">=$"e.student_count","left_outer").groupBy($"s.school_id").agg(coalesce(min($"e.score"),lit(-1)).as("score"))
10 |
11 | resultDF.show
12 |
--------------------------------------------------------------------------------
/DataFrame Solutions/2051. The Category of Each Member in the Store (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM members_2051) AS members"
2 |
3 | val membersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM visits_2051) AS visits"
6 |
7 | val visitsDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val query = "(SELECT * FROM purchases_2051) AS purchases"
10 |
11 | val purchasesDF = spark.read.jdbc(url, query, connectionProperties)
12 |
13 | import org.apache.spark.sql.expressions.Window
14 |
15 | val w = Window.partitionBy("member_id")
16 |
17 | val joinedDF = visitsDF.as("v").join(purchasesDF.as("p"),$"v.visit_id"===$"p.visit_id","left_outer").withColumn("purchase_count",count(when($"p.visit_id".isNotNull,lit(1))).over(w)).withColumn("total_count",count($"v.visit_id").over(w)).select($"member_id",($"purchase_count"*lit(100)/$"total_count").as("conversion")).distinct
18 |
19 | val conDF = joinedDF.withColumn("category",when($"conversion">=80,"Diamond").when($"conversion">=50 && $"conversion"<80,"Gold").when($"conversion"<50,"Silver"))
20 |
21 | val resultDF = membersDF.as("m").join(conDF.as("c"),$"m.member_id"===$"c.member_id","left_outer").select($"m.member_id",$"m.name",coalesce($"c.category",lit("Bronze")).as("category"))
22 |
23 | resultDF.show
24 |
--------------------------------------------------------------------------------
/DataFrame Solutions/511. Game Play Analysis I (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM activity_511) AS activity"
2 |
3 | val activityDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val firstloginDF = activityDF
6 | .groupBy($"player_id")
7 | .agg(min($"event_date").as("first_login"))
8 |
9 |
10 | firstloginDF.show
11 |
--------------------------------------------------------------------------------
/DataFrame Solutions/512. Game Play Analysis II (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM activity_511) AS activity"
2 |
3 | val activityDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val firstloginDF = activityDF
6 | .groupBy($"player_id")
7 | .agg(min($"event_date").as("first_login"))
8 |
9 | val firstDeviceDF = activityDF.as("act")
10 | .join(firstloginDF.as("fl"),$"act.player_id"===$"fl.player_id" && $"act.event_date"===$"fl.first_login","inner")
11 | .select($"act.player_id",$"act.device_id")
12 |
13 | firstDeviceDF.show
14 |
--------------------------------------------------------------------------------
/DataFrame Solutions/577. Employee Bonus (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM employee_577) AS employee"
2 |
3 | val employeeDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM bonus_577) AS bonus"
6 |
7 | val bonusDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val resultDF = employeeDF.as("emp")
10 | .join(bonusDF.as("bn"),$"emp.empId"===$"bn.empId","left_outer")
11 | .where($"bn.bonus".isNull || $"bn.bonus"<1000)
12 | .select($"emp.name",$"bn.bonus")
13 | resultDF.show
14 |
--------------------------------------------------------------------------------
/DataFrame Solutions/584. Find Customer Referee (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM customer_584) AS customer"
2 |
3 | val customerDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = customerDF.where($"reference_id".isNull || $"reference_id" =!= 2).select($"name")
6 |
7 | resultDF.show
8 |
--------------------------------------------------------------------------------
/DataFrame Solutions/586. Customer Placing the Largest Number of Orders (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM orders_586) AS orders"
2 |
3 | val ordersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = ordersDF.groupBy($"customer_number").agg(count($"order_number").as("cnt")).orderBy(desc("cnt")).select($"customer_number").limit(1)
6 |
7 | resultDF.show
8 |
--------------------------------------------------------------------------------
/DataFrame Solutions/595. Big Countries (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM world_595) AS world"
2 |
3 | val worldDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = worldDF.where($"population" >= 25000000 || $"area" >= 3000000).select($"name",$"population",$"area")
6 |
7 | resultDF.show
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Solutions + Leetcode SQL Questions
2 |
3 |
4 |
5 |
6 |
7 |
8 | #### Want to practice & solve some complex questions using Spark?
9 | - Then nothing is better than solving some Leetcode questions. Wondering how? Read along & you will get a fair idea.
10 | - Now, execute Spark Dataframe/Dataset/SQL/RDD code on Leetcode SQL Questions.
11 |
12 | #### Problem statements of all questions, including Leetcode premium questions :
13 |
14 | - https://www.jiakaobo.com/leetcode
15 | - https://leetcode.ca
16 |
17 | #### Repository Contains :
18 | - Spark Dataframe/Dataset/SQL/RDD Solutions to Leetcode Questions
19 | - PostgreSQL Dump File (leetcodedb.sql)
20 |
21 | #### Get Started :
22 |
23 | - This guide assumes that you already have a PostgreSQL database & Apache Spark installed.
24 | - Load the dump file into your local PostgreSQL setup.
25 | - The [Leetcode-Sql](https://github.com/cM2908/leetcode-sql) repository contains all the information needed to load the PostgreSQL dump file (which contains the tables for all Leetcode SQL questions) into your local PostgreSQL setup. A quick way to verify the load from the spark-shell is sketched below.
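Once the JDBC connection from the next section is configured, an optional sanity check is to list the loaded tables through Spark. This is a minimal sketch; it assumes the `url` and `connectionProperties` values defined in the next section, and queries PostgreSQL's built-in `pg_tables` view. Each table is named after its question number, e.g. `employee_181`:
```
scala> val tablesQuery = "(SELECT tablename FROM pg_tables WHERE schemaname = 'public') AS t"
scala> val tablesDF = spark.read.jdbc(url, tablesQuery, connectionProperties)
scala> tablesDF.show(false)  // expect names such as employee_181, users_1158, ...
```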
26 |
27 | #### Integrate Apache Spark with PostgreSQL database :
28 |
29 | - Download the PostgreSQL JDBC Connector JAR (select the appropriate version of the JAR according to your PostgreSQL setup)
30 |
31 | - Add the PostgreSQL JDBC Connector JAR to the "spark/jars" directory. (This step makes the JAR directly available on the classpath when starting the spark-shell.)
32 |
33 | - Start Spark-Shell
34 | ```
35 | user@my-machine:~$ spark-shell
36 | ```
37 |
38 | - Check the classpath in the Spark shell (optional)
39 | ```
40 | scala> import java.lang.ClassLoader
41 | scala> val cl = ClassLoader.getSystemClassLoader
42 | scala> cl.asInstanceOf[java.net.URLClassLoader].getURLs.foreach(println)
43 | ```
44 |
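Note: on Java 9 and later the system class loader is no longer a `URLClassLoader`, so the cast above throws a `ClassCastException`. A version-independent alternative is to load the driver class directly; a `ClassNotFoundException` here means the JAR is not on the classpath:
```
scala> Class.forName("org.postgresql.Driver")
```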
45 | - Connection code (replace with your own credentials)
46 | ```
47 | scala> val url = "jdbc:postgresql://localhost:5432/postgres?user=&password="
48 | scala> import java.util.Properties
49 | scala> val connectionProperties = new Properties()
50 | scala> connectionProperties.setProperty("Driver", "org.postgresql.Driver")
51 | ```
52 |
53 | - Example
54 | ```
55 | scala> val query = "(SELECT * FROM employee_181) AS employee"
56 | scala> val employeeDF = spark.read.jdbc(url, query, connectionProperties)
57 | scala> val joinedDF = employeeDF.as("emp")
58 | .join(employeeDF.as("mgr"),$"emp.manager_id"===$"mgr.id" && $"emp.salary" > $"mgr.salary","inner")
59 | .select($"emp.name")
60 | scala> joinedDF.show
61 | ```
62 |
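The repository also covers Spark SQL solutions; any DataFrame loaded this way can be queried with plain SQL by registering it as a temporary view. A minimal sketch, reusing the `employeeDF` from the example above:
```
scala> employeeDF.createOrReplaceTempView("employee")
scala> spark.sql("SELECT emp.name FROM employee emp JOIN employee mgr ON emp.manager_id = mgr.id WHERE emp.salary > mgr.salary").show
```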
63 | #### Want to Contribute :
64 |
65 | - Contribute by providing a solution to any question in any (or all) of these dialects (Spark DataFrame, Spark DataSet, Spark RDD, Spark SQL)
66 | - Fork the repository
67 | - Create a solution file with a proper name (e.g. "175. Combine Two Tables (Easy).txt")
68 | - Create a Pull Request
69 | - After review, I'll merge it into the main repository.
70 | - Congratulations, you've contributed something to the data community.
71 |
--------------------------------------------------------------------------------
/apache_spark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/apache_spark.png
--------------------------------------------------------------------------------
/leetcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/leetcode.png
--------------------------------------------------------------------------------
/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/plus.png
--------------------------------------------------------------------------------
/postgresql_dump_file/leetcodedb.sql:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/postgresql_dump_file/leetcodedb.sql
--------------------------------------------------------------------------------