├── DataFrame Solutions
│   ├── 1158. Market Analysis I (Medium).txt
│   ├── 1212. Team Scores in Football Tournament (Medium).txt
│   ├── 1355. Activity Participants (Medium).txt
│   ├── 1445. Apples & Oranges (Medium).txt
│   ├── 1596. The Most Frequently Ordered Products for Each Customer (Medium).txt
│   ├── 175. Combine Two Tables (Easy).txt
│   ├── 181. Employees Earning More Than Their Managers (Easy).txt
│   ├── 182. Duplicate Emails (Easy).txt
│   ├── 183. Customers Who Never Order (Easy).txt
│   ├── 1907. Count Salary Categories (Medium).txt
│   ├── 1934. Confirmation Rate (Medium).txt
│   ├── 196. Delete Duplicate Emails (Easy).txt
│   ├── 197. Rising Temperature (Easy).txt
│   ├── 1988. Find Cutoff Score for Each School (Medium).txt
│   ├── 2051. The Category of Each Member in the Store (Medium).txt
│   ├── 511. Game Play Analysis I (Easy).txt
│   ├── 512. Game Play Analysis II (Easy).txt
│   ├── 577. Employee Bonus (Easy).txt
│   ├── 584. Find Customer Referee (Easy).txt
│   ├── 586. Customer Placing the Largest Number of Orders (Easy).txt
│   └── 595. Big Countries (Easy).txt
├── README.md
├── apache_spark.png
├── leetcode.png
├── plus.png
└── postgresql_dump_file
    └── leetcodedb.sql

/DataFrame Solutions/1158. Market Analysis I (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM users_1158) AS users"
2 |
3 | val usersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM orders_1158) AS orders"
6 |
7 | val ordersDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val query = "(SELECT * FROM items_1158) AS items"
10 |
11 | val itemsDF = spark.read.jdbc(url, query, connectionProperties)
12 |
13 | val orders2019DF = ordersDF.where(year($"order_date")===2019).groupBy($"buyer_id").agg(count($"order_id").as("orders_in_2019"))
14 |
15 | val resultDF = usersDF.as("u").join(orders2019DF.as("o"),$"u.user_id"===$"o.buyer_id","left_outer").select($"u.user_id",$"u.join_date",coalesce($"o.orders_in_2019",lit(0)).as("orders_in_2019"))
16 |
17 | resultDF.show
18 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1212. Team Scores in Football Tournament (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM teams_1212) AS teams"
2 |
3 | val teamsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM matches_1212) AS matches"
6 |
7 | val matchesDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val tiedMatchesDF = matchesDF.where($"guest_goals" === $"host_goals").select($"guest_team",$"host_team",$"guest_goals",$"host_goals")
10 |
11 | val unionDF = matchesDF.select($"host_team",$"guest_team",$"host_goals",$"guest_goals").union(tiedMatchesDF)
12 |
13 | val reportDF = unionDF.withColumn("winner",when($"host_goals">$"guest_goals",$"host_team").when($"host_goals"<$"guest_goals",$"guest_team").otherwise($"host_team")).withColumn("points",when($"host_goals"===$"guest_goals",lit(1)).otherwise(lit(3))).select($"winner",$"points").groupBy($"winner").agg(sum($"points").as("num_points"))
14 |
15 | val resultDF = teamsDF.as("t").join(reportDF.as("r"),$"t.team_id"===$"r.winner","left_outer").select($"t.team_id",$"t.team_name",coalesce($"r.num_points",lit(0)).as("num_points")).orderBy($"num_points".desc,$"t.team_id")
16 |
17 | resultDF.show
18 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1355. Activity Participants (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM friends_1355) AS friends"
2 |
3 | val friendsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val groupedDF = friendsDF.groupBy($"activity").agg(count($"id").as("cnt"))
6 |
7 | val resultDF = groupedDF.withColumn("min_count",min($"cnt").over()).withColumn("max_count",max($"cnt").over()).where($"cnt"=!=$"max_count" && $"cnt"=!=$"min_count").select($"activity")
8 |
9 | resultDF.show
10 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1445. Apples & Oranges (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM sales_1445) AS sales"
2 |
3 | val salesDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val applesDF = salesDF.where($"fruit"==="apples")
6 |
7 | val orangesDF = salesDF.where($"fruit"==="oranges")
8 |
9 | val resultDF = applesDF.as("a").join(orangesDF.as("o"),$"a.sale_date"===$"o.sale_date","full_outer").select($"a.sale_date",($"a.sold_num"-$"o.sold_num").as("diff"))
10 |
11 | resultDF.show
12 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1596. The Most Frequently Ordered Products for Each Customer (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM customers_1596) AS customers"
2 |
3 | val customersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM orders_1596) AS orders"
6 |
7 | val ordersDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val query = "(SELECT * FROM products_1596) AS products"
10 |
11 | val productsDF = spark.read.jdbc(url, query, connectionProperties)
12 |
13 |
14 | val groupedDF = ordersDF.groupBy($"customer_id",$"product_id").agg(count(lit(1)).as("cnt"))
15 |
16 | import org.apache.spark.sql.expressions.Window
17 |
18 | val windowSpec = Window.partitionBy("customer_id").orderBy(desc("cnt"))
19 |
20 | val rankedDF = groupedDF.withColumn("rank",dense_rank().over(windowSpec)).where($"rank"===lit(1)).select($"customer_id",$"product_id")
21 |
22 | val resultDF = rankedDF.as("r").join(productsDF.as("p"),$"p.product_id"===$"r.product_id").selectExpr("r.*","product_name")
23 |
24 | resultDF.show
25 |
--------------------------------------------------------------------------------
/DataFrame Solutions/175. Combine Two Tables (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM person_175) AS person"
2 | val personDF = spark.read.jdbc(url, query, connectionProperties)
3 |
4 | val query = "(SELECT * FROM address_175) AS address"
5 | val addressDF = spark.read.jdbc(url, query, connectionProperties)
6 |
7 | val joinCondition = personDF.col("personid") === addressDF.col("personid")
8 | val joinedDF = personDF.join(addressDF,joinCondition,"left_outer")
9 |
10 | joinedDF.show
11 |
--------------------------------------------------------------------------------
/DataFrame Solutions/181. Employees Earning More Than Their Managers (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM employee_181) AS employee"
2 |
3 | val employeeDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val joinedDF = employeeDF.as("emp").join(employeeDF.as("mgr"),$"emp.manager_id"===$"mgr.id" && $"emp.salary" > $"mgr.salary","inner").select($"emp.name")
6 |
7 | joinedDF.show
8 |
--------------------------------------------------------------------------------
/DataFrame Solutions/182. Duplicate Emails (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM person_182) AS person"
2 |
3 | val personDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val duplicateDF = personDF.groupBy(col("email")).agg(count(col("id")).as("cnt")).where($"cnt">1).select(col("email"))
6 |
7 | duplicateDF.show
8 |
9 |
--------------------------------------------------------------------------------
/DataFrame Solutions/183. Customers Who Never Order (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM customers_183) AS customers"
2 |
3 | val customerDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM orders_183) AS orders"
6 |
7 | val orderDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | // Direct Method
10 |
11 | val joinedDF = customerDF.as("c").join(orderDF.as("o"),$"c.id"===$"o.customer_id","left_anti").select($"c.name")
12 |
13 | // Conventional Method
14 |
15 | val joinedDF = customerDF.as("c").join(orderDF.as("o"),$"c.id"===$"o.customer_id","left_outer").where($"o.id".isNull).select($"c.name")
16 |
17 | joinedDF.show
18 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1907. Count Salary Categories (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM accounts_1907) AS accounts"
2 |
3 | val accountsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val categorizedDF = accountsDF.withColumn("category",when($"income"<20000,"Low Salary").when($"income">=20000 && $"income"<=50000,"Average Salary").otherwise("High Salary"))
6 |
7 | val groupedDF = categorizedDF.groupBy($"category").agg(count($"account_id").as("accounts_count"))
8 |
9 | val categoryDF = Seq("Low Salary","Average Salary","High Salary").toDF("category")
10 |
11 | val resultDF = categoryDF.as("c")
12 | .join(groupedDF.as("g"),$"c.category"===$"g.category","left_outer")
13 | .select($"c.category",coalesce($"g.accounts_count",lit(0)).as("accounts_count"))
14 |
15 | resultDF.show
16 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1934. Confirmation Rate (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM signups_1934) AS signups"
2 |
3 | val signupsDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM confirmations_1934) AS confirmations"
6 |
7 | val confirmationsDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | import org.apache.spark.sql.expressions.Window
10 |
11 | val w = Window.partitionBy("user_id")
12 |
13 | val crDF = confirmationsDF.withColumn("confirmation_count",count(when($"action"===lit("confirmed"),lit(1)).otherwise(null)).over(w)).withColumn("total_count",count($"action").over(w)).select($"user_id",round($"confirmation_count"/$"total_count",2).as("confirmation_rate"))
14 |
15 | val resultDF = signupsDF.as("s").join(crDF.as("c"),$"s.user_id"===$"c.user_id","left_outer").select($"s.user_id",coalesce($"confirmation_rate",lit(0)).as("confirmation_rate")).distinct
16 |
17 | resultDF.show
--------------------------------------------------------------------------------
/DataFrame Solutions/196. Delete Duplicate Emails (Easy).txt:
--------------------------------------------------------------------------------
1 | import org.apache.spark.sql.expressions.Window
2 |
3 | val query = "(SELECT * FROM person_196) AS person"
4 |
5 | val personDF = spark.read.jdbc(url, query, connectionProperties)
6 |
7 | val w = Window.partitionBy("email")
8 |
9 | val distinctPersonDF = personDF.withColumn("min",min($"id").over(w)).where($"min"===$"id").drop($"min")
10 |
11 | distinctPersonDF.show
12 |
--------------------------------------------------------------------------------
/DataFrame Solutions/197. Rising Temperature (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM weather_197) AS weather"
2 |
3 | val weatherDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = weatherDF.as("w1")
6 | .join(weatherDF.as("w2"),$"w2.record_date"+1===$"w1.record_date" && $"w2.temperature"<$"w1.temperature","inner")
7 | .select($"w1.id")
8 |
9 | resultDF.show
10 |
--------------------------------------------------------------------------------
/DataFrame Solutions/1988. Find Cutoff Score for Each School (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM school_1988) AS school"
2 |
3 | val schoolDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM exam_1988) AS exam"
6 |
7 | val examDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val resultDF = schoolDF.as("s").join(examDF.as("e"),$"s.capacity">=$"e.student_count","left_outer").groupBy($"s.school_id").agg(coalesce(min($"e.score"),lit(-1)).as("score"))
10 |
11 | resultDF.show
12 |
--------------------------------------------------------------------------------
/DataFrame Solutions/2051. The Category of Each Member in the Store (Medium).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM members_2051) AS members"
2 |
3 | val membersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM visits_2051) AS visits"
6 |
7 | val visitsDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val query = "(SELECT * FROM purchases_2051) AS purchases"
10 |
11 | val purchasesDF = spark.read.jdbc(url, query, connectionProperties)
12 |
13 | import org.apache.spark.sql.expressions.Window
14 |
15 | val w = Window.partitionBy("member_id")
16 |
17 | val joinedDF = visitsDF.as("v").join(purchasesDF.as("p"),$"v.visit_id"===$"p.visit_id","left_outer").withColumn("purchase_count",count(when($"p.visit_id".isNotNull,lit(1))).over(w)).withColumn("total_count",count($"v.visit_id").over(w)).select($"member_id",($"purchase_count"*lit(100)/$"total_count").as("conversion")).distinct
18 |
19 | val conDF = joinedDF.withColumn("category",when($"conversion">=80,"Diamond").when($"conversion">=50 && $"conversion"<80,"Gold").when($"conversion"<50,"Silver"))
20 |
21 | val resultDF = membersDF.as("m").join(conDF.as("c"),$"m.member_id"===$"c.member_id","left_outer").select($"m.member_id",$"m.name",coalesce($"c.category",lit("Bronze")).as("category"))
22 |
23 | resultDF.show
24 |
--------------------------------------------------------------------------------
/DataFrame Solutions/511. Game Play Analysis I (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM activity_511) AS activity"
2 |
3 | val activityDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val firstloginDF = activityDF
6 | .groupBy($"player_id")
7 | .agg(min($"event_date").as("first_login"))
8 |
9 |
10 | firstloginDF.show
11 |
--------------------------------------------------------------------------------
/DataFrame Solutions/512. Game Play Analysis II (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM activity_511) AS activity"
2 |
3 | val activityDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val firstloginDF = activityDF
6 | .groupBy($"player_id")
7 | .agg(min($"event_date").as("first_login"))
8 |
9 | val firstDeviceDF = activityDF.as("act")
10 | .join(firstloginDF.as("fl"),$"act.player_id"===$"fl.player_id" && $"act.event_date"===$"fl.first_login","inner")
11 | .select($"act.player_id",$"act.device_id")
12 |
13 | firstDeviceDF.show
14 |
--------------------------------------------------------------------------------
/DataFrame Solutions/577. Employee Bonus (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM employee_577) AS employee"
2 |
3 | val employeeDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val query = "(SELECT * FROM bonus_577) AS bonus"
6 |
7 | val bonusDF = spark.read.jdbc(url, query, connectionProperties)
8 |
9 | val resultDF = employeeDF.as("emp")
10 | .join(bonusDF.as("bn"),$"emp.empId"===$"bn.empId","left_outer")
11 | .where($"bn.bonus".isNull || $"bn.bonus"<1000)
12 | .select($"emp.name",$"bn.bonus")
13 | resultDF.show
14 |
--------------------------------------------------------------------------------
/DataFrame Solutions/584. Find Customer Referee (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM customer_584) AS customer"
2 |
3 | val customerDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = customerDF.where($"reference_id".isNull || $"reference_id" =!= 2).select($"name")
6 |
7 | resultDF.show
8 |
--------------------------------------------------------------------------------
/DataFrame Solutions/586. Customer Placing the Largest Number of Orders (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM orders_586) AS orders"
2 |
3 | val ordersDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = ordersDF.groupBy($"customer_number").agg(count($"order_number").as("cnt")).orderBy(desc("cnt")).select($"customer_number").limit(1)
6 |
7 | resultDF.show
8 |
--------------------------------------------------------------------------------
/DataFrame Solutions/595. Big Countries (Easy).txt:
--------------------------------------------------------------------------------
1 | val query = "(SELECT * FROM world_595) AS world"
2 |
3 | val worldDF = spark.read.jdbc(url, query, connectionProperties)
4 |
5 | val resultDF = worldDF.where($"population" >= 25000000 || $"area" >= 3000000).select($"name",$"population",$"area")
6 |
7 | resultDF.show
8 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Spark Solutions + Leetcode SQL Questions
2 |
3 |
4 |
5 |
6 |
7 |
8 | #### Want to practice & solve some complex questions using Spark?
9 | - Then nothing is better than solving some Leetcode questions. Wondering how? Read along & you will get a fair idea.
10 | - Now, execute Spark Dataframe/Dataset/SQL/RDD code on Leetcode SQL Questions.
11 |
12 | #### Problem statements of all questions, including Leetcode premium questions :
13 |
14 | - https://www.jiakaobo.com/leetcode
15 | - https://leetcode.ca
16 |
17 | #### Repository Contains :
18 | - Spark Dataframe/Dataset/SQL/RDD Solutions to Leetcode Questions
19 | - PostgreSQL Dump File (leetcodedb.sql)
20 |
21 | #### Get Started :
22 |
23 | - This guide assumes that you already have a PostgreSQL database & Apache Spark installed.
24 | - Load the dump file into your local PostgreSQL setup.
25 | - The [Leetcode-Sql](https://github.com/cM2908/leetcode-sql) repository contains all the information needed to load the PostgreSQL dump file (which contains the tables for all Leetcode SQL questions) into your local PostgreSQL setup. A quick way to verify the load from the spark-shell is sketched below.
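Once the JDBC connection from the next section is configured, an optional sanity check is to list the loaded tables through Spark. This is a minimal sketch; it assumes the `url` and `connectionProperties` values defined in the next section, and queries PostgreSQL's built-in `pg_tables` view. Each table is named after its question number, e.g. `employee_181`:
```
scala> val tablesQuery = "(SELECT tablename FROM pg_tables WHERE schemaname = 'public') AS t"
scala> val tablesDF = spark.read.jdbc(url, tablesQuery, connectionProperties)
scala> tablesDF.show(false)  // expect names such as employee_181, users_1158, ...
```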
26 |
27 | #### Integrate Apache Spark with PostgreSQL database :
28 |
29 | - Download the PostgreSQL JDBC Connector JAR (select the appropriate version of the JAR according to your PostgreSQL setup)
30 |
31 | - Add the PostgreSQL JDBC Connector JAR to the "spark/jars" directory. (This step makes the JAR directly available on the classpath when starting the spark-shell.)
32 |
33 | - Start Spark-Shell
34 | ```
35 | user@my-machine:~$ spark-shell
36 | ```
37 |
38 | - Check the classpath in the Spark shell (optional)
39 | ```
40 | scala> import java.lang.ClassLoader
41 | scala> val cl = ClassLoader.getSystemClassLoader
42 | scala> cl.asInstanceOf[java.net.URLClassLoader].getURLs.foreach(println)
43 | ```
44 |
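Note: on Java 9 and later the system class loader is no longer a `URLClassLoader`, so the cast above throws a `ClassCastException`. A version-independent alternative is to load the driver class directly; a `ClassNotFoundException` here means the JAR is not on the classpath:
```
scala> Class.forName("org.postgresql.Driver")
```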
45 | - Connection code (replace with your own credentials)
46 | ```
47 | scala> val url = "jdbc:postgresql://localhost:5432/postgres?user=&password="
48 | scala> import java.util.Properties
49 | scala> val connectionProperties = new Properties()
50 | scala> connectionProperties.setProperty("Driver", "org.postgresql.Driver")
51 | ```
52 |
53 | - Example
54 | ```
55 | scala> val query = "(SELECT * FROM employee_181) AS employee"
56 | scala> val employeeDF = spark.read.jdbc(url, query, connectionProperties)
57 | scala> val joinedDF = employeeDF.as("emp")
58 | .join(employeeDF.as("mgr"),$"emp.manager_id"===$"mgr.id" && $"emp.salary" > $"mgr.salary","inner")
59 | .select($"emp.name")
60 | scala> joinedDF.show
61 | ```
62 |
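The repository also covers Spark SQL solutions; any DataFrame loaded this way can be queried with plain SQL by registering it as a temporary view. A minimal sketch, reusing the `employeeDF` from the example above:
```
scala> employeeDF.createOrReplaceTempView("employee")
scala> spark.sql("SELECT emp.name FROM employee emp JOIN employee mgr ON emp.manager_id = mgr.id WHERE emp.salary > mgr.salary").show
```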
63 | #### Want to Contribute :
64 |
65 | - Contribute by providing a solution to any question in any (or all) of these dialects (Spark DataFrame, Spark DataSet, Spark RDD, Spark SQL)
66 | - Fork the repository
67 | - Create a solution file with a proper name (e.g. "175. Combine Two Tables (Easy).txt")
68 | - Create a Pull Request
69 | - After review, I'll merge it into the main repository.
70 | - Congratulations, you've contributed something to the data community.
71 |
--------------------------------------------------------------------------------
/apache_spark.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/apache_spark.png
--------------------------------------------------------------------------------
/leetcode.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/leetcode.png
--------------------------------------------------------------------------------
/plus.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/plus.png
--------------------------------------------------------------------------------
/postgresql_dump_file/leetcodedb.sql:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/cM2908/leetcode-spark/7edf4b6224edb78dac3cf6b13ea057ab9dce2f76/postgresql_dump_file/leetcodedb.sql
--------------------------------------------------------------------------------