├── Problem 6
│   ├── problem6.sql
│   ├── students.sql
│   ├── Students.csv
│   ├── README.md
│   └── problem6.ipynb
├── Problem 7
│   ├── problem7.sql
│   ├── transaction.sql
│   ├── README.md
│   ├── transaction.csv
│   └── problem7.ipynb
├── Problem 2
│   ├── department.csv
│   ├── department.sql
│   ├── problem2.sql
│   ├── README.md
│   ├── problem2_2.ipynb
│   └── problem2_1.ipynb
├── Problem 5
│   ├── problem5.sql
│   ├── station.sql
│   ├── README.md
│   ├── problem5.ipynb
│   └── stations.csv
├── Problem 1
│   ├── problem1.sql
│   ├── employee_table.sql
│   ├── README.md
│   ├── employee.csv
│   ├── employee.json
│   └── problem1.ipynb
├── Problem 3
│   ├── problem3.sql
│   ├── station.sql
│   ├── README.md
│   ├── problem3.ipynb
│   └── stations.csv
├── Problem 0
│   ├── employee_salary.sql
│   ├── problem0.sql
│   ├── README.md
│   └── employee_salary.csv
├── Problem 4
│   ├── station.sql
│   ├── problem4.sql
│   ├── README.md
│   ├── problem4.ipynb
│   └── stations.csv
├── Problem 9
│   ├── user_type.csv
│   ├── README.md
│   ├── user_info.csv
│   ├── problem9.sql
│   └── download_facts.csv
├── Problem 8
│   ├── problem8.sql
│   ├── user.csv
│   ├── README.md
│   ├── ride_log.csv
│   └── problem8.ipynb
└── README.md
/Problem 6/problem6.sql: -------------------------------------------------------------------------------- 1 | SELECT name 2 | FROM public.students 3 | WHERE marks > 75 4 | ORDER BY right(name,3),ID; -------------------------------------------------------------------------------- /Problem 2/department.csv: -------------------------------------------------------------------------------- 1 | department_id,department_name 2 | 1005,Sales 3 | 1002,Finance 4 | 1004,Purchase 5 | 1001,Operations 6 | 1006,Marketing 7 | 1003,Technology -------------------------------------------------------------------------------- /Problem 7/problem7.sql: -------------------------------------------------------------------------------- 1 | SELECT DISTINCT(a1.user_id) 2 | FROM transaction a1 3 | JOIN transaction a2 ON a1.user_id=a2.user_id 4 | AND a1.id <> a2.id 5 | AND DATEDIFF(a2.created_at,a1.created_at) BETWEEN 0 AND 7 6 | ORDER BY a1.user_id; -------------------------------------------------------------------------------- /Problem 5/problem5.sql: -------------------------------------------------------------------------------- 1 | -- Query the list of CITY names starting with vowels (i.e., a, e, i, o, or u) from STATION. Your result cannot contain duplicates.
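-- LEFT(CITY,1) in the query below takes the first character of the city name.
-- If the data could also contain lower-case names, a safer variant (an assumption, not required by this dataset) would be UPPER(LEFT(CITY,1)) IN ('A','E','I','O','U').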
2 | 3 | SELECT DISTINCT(CITY) FROM STATION WHERE LEFT(CITY,1) IN ('A','E','I','O','U'); -------------------------------------------------------------------------------- /Problem 1/problem1.sql: -------------------------------------------------------------------------------- 1 | -- Active: 1675109399578@@127.0.0.1@5432@postgres 2 | SELECT id, first_name, last_name, MAX(salary) AS MaxSalary, department_id 3 | FROM public.employee 4 | GROUP BY id, first_name, last_name, department_id 5 | ORDER BY id -------------------------------------------------------------------------------- /Problem 3/problem3.sql: -------------------------------------------------------------------------------- 1 | --Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table. 2 | 3 | SELECT count(city) as citycount, count(distinct(city)) as distinctcitycount,(count(city) - count(distinct(city))) as diffbetweenboth 4 | FROM public.station; -------------------------------------------------------------------------------- /Problem 7/transaction.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE `transaction` ( 2 | `id` int NOT NULL, 3 | `user_id` int DEFAULT NULL, 4 | `item` varchar(45) DEFAULT NULL, 5 | `created_at` date DEFAULT NULL, 6 | `revenue` int DEFAULT NULL, 7 | PRIMARY KEY (`id`) 8 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci -------------------------------------------------------------------------------- /Problem 6/students.sql: -------------------------------------------------------------------------------- 1 | -- Table: public.students 2 | 3 | -- DROP TABLE IF EXISTS public.students; 4 | 5 | CREATE TABLE IF NOT EXISTS public.students 6 | ( 7 | ID bigint, 8 | Name character varying(100) COLLATE pg_catalog."default", 9 | Marks bigint 10 | ) 11 | 12 | TABLESPACE pg_default; 13 | 14 | ALTER TABLE IF EXISTS public.students 15 | OWNER to postgres; -------------------------------------------------------------------------------- /Problem 2/department.sql: -------------------------------------------------------------------------------- 1 | -- Table: public.department 2 | 3 | -- DROP TABLE IF EXISTS public.department; 4 | 5 | CREATE TABLE IF NOT EXISTS public.department 6 | ( 7 | department_id bigint, 8 | department_name character varying(100) COLLATE pg_catalog."default" 9 | ) 10 | 11 | TABLESPACE pg_default; 12 | 13 | ALTER TABLE IF EXISTS public.department 14 | OWNER to postgres; -------------------------------------------------------------------------------- /Problem 0/employee_salary.sql: -------------------------------------------------------------------------------- 1 | CREATE TABLE IF NOT EXISTS public.employee_salary 2 | ( 3 | id bigint, 4 | first_name character varying(100) COLLATE pg_catalog."default", 5 | last_name character varying(100) COLLATE pg_catalog."default", 6 | salary bigint, 7 | department_id bigint 8 | ) 9 | 10 | TABLESPACE pg_default; 11 | 12 | ALTER TABLE IF EXISTS public.employee_salary 13 | OWNER to postgres; -------------------------------------------------------------------------------- /Problem 3/station.sql: -------------------------------------------------------------------------------- 1 | -- Table: public.station 2 | 3 | -- DROP TABLE IF EXISTS public.station; 4 | 5 | CREATE TABLE IF NOT EXISTS public.station 6 | ( 7 | id bigint, 8 | city character varying(100) COLLATE pg_catalog."default", 9 | state character varying(100) COLLATE
pg_catalog."default", 10 | lattitude numeric(20,10), 11 | longtitude numeric(20,10) 12 | ) 13 | 14 | TABLESPACE pg_default; 15 | 16 | ALTER TABLE IF EXISTS public.station 17 | OWNER to postgres; -------------------------------------------------------------------------------- /Problem 4/station.sql: -------------------------------------------------------------------------------- 1 | -- Table: public.station 2 | 3 | -- DROP TABLE IF EXISTS public.station; 4 | 5 | CREATE TABLE IF NOT EXISTS public.station 6 | ( 7 | id bigint, 8 | city character varying(100) COLLATE pg_catalog."default", 9 | state character varying(100) COLLATE pg_catalog."default", 10 | lattitude numeric(20,10), 11 | longtitude numeric(20,10) 12 | ) 13 | 14 | TABLESPACE pg_default; 15 | 16 | ALTER TABLE IF EXISTS public.station 17 | OWNER to postgres; -------------------------------------------------------------------------------- /Problem 5/station.sql: -------------------------------------------------------------------------------- 1 | -- Table: public.station 2 | 3 | -- DROP TABLE IF EXISTS public.station; 4 | 5 | CREATE TABLE IF NOT EXISTS public.station 6 | ( 7 | id bigint, 8 | city character varying(100) COLLATE pg_catalog."default", 9 | state character varying(100) COLLATE pg_catalog."default", 10 | lattitude numeric(20,10), 11 | longtitude numeric(20,10) 12 | ) 13 | 14 | TABLESPACE pg_default; 15 | 16 | ALTER TABLE IF EXISTS public.station 17 | OWNER to postgres; -------------------------------------------------------------------------------- /Problem 1/employee_table.sql: -------------------------------------------------------------------------------- 1 | -- Table: public.employee 2 | 3 | -- DROP TABLE IF EXISTS public.employee; 4 | 5 | CREATE TABLE IF NOT EXISTS public.employee 6 | ( 7 | id bigint, 8 | first_name character varying(100) COLLATE pg_catalog."default", 9 | last_name character varying(100) COLLATE pg_catalog."default", 10 | salary bigint, 11 | department_id bigint 12 | ) 13 | 14 | TABLESPACE pg_default; 15 | 16 | ALTER TABLE IF EXISTS public.employee 17 | OWNER to postgres; -------------------------------------------------------------------------------- /Problem 6/Students.csv: -------------------------------------------------------------------------------- 1 | ID,Name,Marks 2 | 19,Samantha,87 3 | 21,Julia,96 4 | 11,Britney,95 5 | 32,Kristeen,100 6 | 12,Dyana,55 7 | 13,Jenny,66 8 | 14,Christene,88 9 | 15,Meera,24 10 | 16,Priya,76 11 | 17,Priyanka,77 12 | 18,Paige,74 13 | 19,Jane,64 14 | 21,Belvet,78 15 | 31,Scarlet,80 16 | 41,Salma,81 17 | 51,Amanda,34 18 | 61,Heraldo,94 19 | 71,Stuart,99 20 | 81,Aamina,77 21 | 76,Amina,89 22 | 91,Vivek,84 23 | 17,Evil,79 24 | 16,Devil,76 25 | 34,Fanny,75 26 | 38,Danny,75 -------------------------------------------------------------------------------- /Problem 4/problem4.sql: -------------------------------------------------------------------------------- 1 | --Query the two cities in STATION with the shortest and longest CITY names, as well as their respective lengths (i.e.: number of characters in the name). If there is more than one smallest or largest city, choose the one that comes first when ordered alphabetically. 2 | 3 | SELECT q1.city, q1.citylength 4 | FROM 5 | (SELECT CITY,LENGTH(CITY) as citylength, RANK() OVER (PARTITION BY LENGTH(CITY) ORDER BY LENGTH(CITY),CITY) as actualrank 6 | FROM STATION) q1 7 | WHERE q1. 
actualrank = 1 8 | AND (q1.citylength = (SELECT MIN(LENGTH(CITY)) FROM STATION) 9 | OR q1.citylength = (SELECT MAX(LENGTH(CITY)) FROM STATION)); 10 | -------------------------------------------------------------------------------- /Problem 0/problem0.sql: -------------------------------------------------------------------------------- 1 | -- 1. List all the employees whose salary is more than 100K 2 | 3 | SELECT id, first_name, last_name, salary, department_id 4 | FROM public.employee_salary 5 | WHERE salary > 100000 ; 6 | 7 | -- 2. Provide distinct department id 8 | 9 | SELECT DISTINCT department_id 10 | FROM public.employee_salary ; 11 | 12 | -- 3. Provide first and last name of employees 13 | 14 | SELECT first_name, last_name 15 | FROM public.employee_salary ; 16 | 17 | -- 4. Provide all the details of the employees whose last name is 'Johnson' 18 | 19 | SELECT id, first_name, last_name, salary, department_id 20 | FROM public.employee_salary 21 | WHERE last_name = 'Johnson' ; -------------------------------------------------------------------------------- /Problem 9/user_type.csv: -------------------------------------------------------------------------------- 1 | acc_id,paying_customer 2 | 700,no 3 | 701,no 4 | 702,no 5 | 703,no 6 | 704,no 7 | 705,no 8 | 706,no 9 | 707,no 10 | 708,no 11 | 709,no 12 | 710,no 13 | 711,no 14 | 712,no 15 | 713,no 16 | 714,no 17 | 715,no 18 | 716,no 19 | 717,no 20 | 718,no 21 | 719,no 22 | 720,no 23 | 721,no 24 | 722,no 25 | 723,no 26 | 724,no 27 | 725,yes 28 | 726,yes 29 | 727,yes 30 | 728,yes 31 | 729,yes 32 | 730,yes 33 | 731,yes 34 | 732,yes 35 | 733,yes 36 | 734,yes 37 | 735,yes 38 | 736,yes 39 | 737,yes 40 | 738,yes 41 | 739,yes 42 | 740,yes 43 | 741,yes 44 | 742,yes 45 | 743,yes 46 | 744,yes 47 | 745,yes 48 | 746,yes 49 | 747,yes 50 | 748,yes 51 | 749,yes 52 | 750,yes -------------------------------------------------------------------------------- /Problem 8/problem8.sql: -------------------------------------------------------------------------------- 1 | --For Top 10 highest travelled users 2 | SELECT q.user_id, q.name, q.total 3 | FROM 4 | ( select user_id 5 | ,name 6 | , sum(distance) as total 7 | , RANK() OVER (ORDER BY sum(distance) DESC) as actualrank 8 | from DATAENG.ride_log as log 9 | LEFT OUTER JOIN DATAENG.user as users 10 | ON log.user_id = users.id 11 | GROUP BY user_id, name 12 | ORDER BY sum(distance) DESC) as q 13 | WHERE q.actualrank <= 10 14 | 15 | 16 | --For Top 10 least travelled users 17 | SELECT q.user_id, q.name, q.total 18 | FROM 19 | ( select user_id 20 | ,name 21 | , sum(distance) as total 22 | , RANK() OVER (ORDER BY sum(distance)) as actualrank 23 | from DATAENG.ride_log as log 24 | LEFT OUTER JOIN DATAENG.user as users 25 | ON log.user_id = users.id 26 | GROUP BY user_id, name 27 | ORDER BY sum(distance)) as q 28 | WHERE q.actualrank <= 10 -------------------------------------------------------------------------------- /Problem 2/problem2.sql: -------------------------------------------------------------------------------- 1 | -- We have an employees table in which we have employee details with salary and department id of the employees. We have one more table in which we have department id and department name. 2 | -- Provide the below queries 3 | -- 1. Use both tables and list all the employees working in the marketing department, ordered from highest to lowest salary.
4 | 5 | SELECT first_name, last_name, salary 6 | FROM public.employee_salary as emp 7 | LEFT OUTER JOIN public.department as department 8 | ON emp.department_id = department.department_id 9 | WHERE department.department_name = 'Marketing' 10 | ORDER BY salary DESC; 11 | 12 | -- 2. Provide the count of employees in each department with the department name. 13 | 14 | SELECT department.department_name, count(emp.id) as count_of_employee 15 | FROM public.department as department 16 | LEFT OUTER JOIN public.employee_salary as emp 17 | ON emp.department_id = department.department_id 18 | GROUP BY department.department_name; 19 | -------------------------------------------------------------------------------- /Problem 8/user.csv: -------------------------------------------------------------------------------- 1 | id,name 2 | 1,Dustin Smith 3 | 2,Jay Ramirez 4 | 3,Joseph Cooke 5 | 4,Melinda Young 6 | 5,Sean Parker 7 | 6,Ian Foster 8 | 7,Christopher Schmitt 9 | 8,Patrick Gutierrez 10 | 9,Dennis Douglas 11 | 10,Brenda Morris 12 | 11,Jeffery Hernandez 13 | 12,David Rice 14 | 13,Charles Foster 15 | 14,Keith Perez DVM 16 | 15,Dean Cuevas 17 | 16,Melissa Bishop 18 | 17,Alexander Howell 19 | 18,Austin Robertson 20 | 19,Sherri Mcdaniel 21 | 20,Nancy Nguyen 22 | 21,Melody Ball 23 | 22,Christopher Stokes 24 | 23,Joseph Hamilton 25 | 24,Kevin Fischer 26 | 25,Crystal Berg 27 | 26,Barbara Larson 28 | 27,Jacqueline Heath 29 | 28,Eric Gardner 30 | 29,Daniel Kennedy 31 | 30,Kaylee Sims 32 | 31,Shannon Green 33 | 32,Stacy Collins 34 | 33,Donna Ortiz 35 | 34,Jennifer Simmons 36 | 35,Michael Gill 37 | 36,Alyssa Shaw 38 | 37,Destiny Clark 39 | 38,Thomas Lara 40 | 39,Mark Diaz 41 | 40,Stacy Bryant 42 | 41,Howard Rose 43 | 42,Brian Schwartz 44 | 43,Kimberly Potter 45 | 44,Cassidy Ryan 46 | 45,Benjamin Mcbride 47 | 46,Elizabeth Ward 48 | 47,Christina Price 49 | 48,Pamela Cox 50 | 49,Jessica Peterson 51 | 50,Michael Nelson -------------------------------------------------------------------------------- /Problem 8/README.md: -------------------------------------------------------------------------------- 1 | # Problem 8 -> Top distance travelled 2 | 3 | Find the top 10 users that have traveled the least distance. Output their id, name and total distance traveled. 4 | 5 | Problem Difficulty Level : Medium 6 | 7 | Data Structure 8 | ride_log 9 | 10 | - id 11 | - user_id 12 | - distance 13 | 14 | user 15 | 16 | - id 17 | - name 18 | 19 | image 20 | 21 | Data for the ride_log and user tables 22 | 23 | [In CSV Format](ride_log.csv)
24 | [In CSV Format](user.csv) 25 | 26 | ## Solving using PySpark 27 | 28 | In Spark, we will solve this problem in two ways 29 | 1. Using PySpark Functions 30 | 2. Using Spark SQL 31 | 32 | Use the notebook below for the solution 33 | 34 | [Problem Solution](problem8.ipynb) 35 | 36 | ## Solving using MySQL 37 | 38 | In MySQL, we will load the data from CSV using the MySQL import functionality and then solve this problem. 39 | 40 | Output Query 41 | 42 | [Problem Solution](problem8.sql) 43 | 44 | Please also follow the blog below to understand this problem 45 | -------------------------------------------------------------------------------- /Problem 5/README.md: -------------------------------------------------------------------------------- 1 | # Problem 5 -> CITY names starting with vowels 2 | 3 | Query the list of CITY names starting with vowels (i.e., a, e, i, o, or u) from STATION. Your result cannot contain duplicates. 4 | The STATION table is described as follows: 5 | 6 | Problem Difficulty Level : Easy 7 | 8 | Data Structure 9 | 10 | - ID 11 | - City 12 | - State 13 | - Latitude 14 | - Longitude 15 | 16 | image 17 | 18 | Data for station table 19 | 20 | [In CSV Format](stations.csv) 21 | 22 | ## Solving using PySpark 23 | 24 | In Spark, we will solve this problem in two ways 25 | 1. Using PySpark Functions 26 | 2. Using Spark SQL 27 | 28 | Use the notebook below for the solution 29 | 30 | [Problem Solution](problem5.ipynb) 31 | 32 | ## Solving using PostgreSQL 33 | 34 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve this problem. 35 | 36 | Output Query 37 | 38 | [Problem Solution](problem5.sql) 39 | 40 | Please also follow the blog below to understand this problem 41 | -------------------------------------------------------------------------------- /Problem 3/README.md: -------------------------------------------------------------------------------- 1 | # Problem 3 -> Difference between total number of cities and distinct cities 2 | 3 | Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table. 4 | The STATION table is described as follows: 5 | 6 | Problem Difficulty Level : Easy 7 | 8 | Data Structure 9 | 10 | - ID 11 | - City 12 | - State 13 | - Latitude 14 | - Longitude 15 | 16 | image 17 | 18 | Data for station table 19 | 20 | [In CSV Format](stations.csv) 21 | 22 | ## Solving using PySpark 23 | 24 | In Spark, we will solve this problem in two ways 25 | 1. Using PySpark Functions 26 | 2. Using Spark SQL 27 | 28 | Use the notebook below for the solution 29 | 30 | [Problem Solution](problem3.ipynb) 31 | 32 | ## Solving using PostgreSQL 33 | 34 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve this problem. 35 | 36 | Output Query 37 | 38 | [Problem Solution](problem3.sql) 39 | 40 | Please also follow the blog below to understand this problem 41 | -------------------------------------------------------------------------------- /Problem 7/README.md: -------------------------------------------------------------------------------- 1 | # Problem 7 -> Returning active users 2 | 3 | Write a query that identifies returning active users. A returning active user is a user who has made a second purchase within 7 days of any other of their purchases. Output a list of user_ids of these returning active users.
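For example, in the sample data user 109 has purchases on 2020-03-02 and 2020-03-03; those fall within 7 days of each other, so user 109 counts as a returning active user.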
4 | 5 | Problem Difficulty Level : Medium 6 | 7 | Data Structure 8 | 9 | - id 10 | - user_id 11 | - item 12 | - created_at 13 | - revenue 14 | 15 | image 16 | 17 | Data for transaction table 18 | 19 | [In CSV Format](transaction.csv) 20 | 21 | ## Solving using PySpark 22 | 23 | In Spark, we will solve this problem in two ways 24 | 1. Using PySpark Functions 25 | 2. Using Spark SQL 26 | 27 | Use the notebook below for the solution 28 | 29 | [Problem Solution](problem7.ipynb) 30 | 31 | ## Solving using MySQL 32 | 33 | In MySQL, we will load the data from CSV using the MySQL import functionality and then solve this problem. 34 | 35 | Output Query 36 | 37 | [Problem Solution](problem7.sql) 38 | 39 | Please also follow the blog below to understand this problem 40 | -------------------------------------------------------------------------------- /Problem 0/README.md: -------------------------------------------------------------------------------- 1 | # Problem 0 -> Employee Salary more than 100K 2 | 3 | We have a table with employees and their salaries. Write queries to solve the below problems 4 | 1. List all the employees whose salary is more than 100K 5 | 2. Provide distinct department id 6 | 3. Provide first and last name of employees 7 | 4. Provide all the details of the employees whose last name is 'Johnson' 8 | 9 | Problem Difficulty Level : Easy 10 | 11 | Data Structure 12 | 13 | image 14 | 15 | 16 | Data for this problem 17 | 18 | [In CSV Format](employee_salary.csv) 19 | 20 | ## Solving using PySpark 21 | 22 | In Spark, we will solve this problem in two ways 23 | 1. Using PySpark Functions 24 | 2. Using Spark SQL 25 | 26 | Use the notebook below for the solution 27 | 28 | [Problem Solution](problem0.ipynb) 29 | 30 | ## Solving using PostgreSQL 31 | 32 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve this problem. 33 | 34 | Output Query 35 | 36 | [Problem Solution](problem0.sql) 37 | 38 | Please also follow the blog below to understand this problem 39 | -------------------------------------------------------------------------------- /Problem 1/README.md: -------------------------------------------------------------------------------- 1 | # Problem 1 -> Employee With his Latest Salary 2 | 3 | We have a table with employees and their salaries; however, some of the records are old and contain outdated salary information. Find the current salary of each employee assuming that salaries increase each year. Output their id, first name, last name, department ID, and current salary. Order your list by employee ID in ascending order. 4 | 5 | Problem Difficulty Level : Medium 6 | 7 | Data Structure 8 | 9 | image 10 | 11 | 12 | Data for this problem 13 | 14 | [In CSV Format](employee.csv) 15 | 16 | [In JSON Format](employee.json) 17 | 18 | ## Solving using PySpark 19 | 20 | In Spark, we will solve this problem in two ways 21 | 1. Using PySpark Functions 22 | 2. Using Spark SQL 23 | 24 | Use the notebook below for the solution 25 | 26 | [Problem Solution](problem1.ipynb) 27 | 28 | ## Solving using PostgreSQL 29 | 30 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve this problem.
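A quick sketch of the idea behind the solution file linked below: since salaries only increase, the current salary is just the maximum salary recorded per employee.

```
SELECT id, first_name, last_name, MAX(salary) AS current_salary, department_id
FROM public.employee
GROUP BY id, first_name, last_name, department_id
ORDER BY id;
```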
31 | 32 | Output Query 33 | 34 | [Problem Solution](problem1.sql) 35 | 36 | Please also follow the blog below to understand this problem 37 | -------------------------------------------------------------------------------- /Problem 9/README.md: -------------------------------------------------------------------------------- 1 | # Problem 9 -> Premium vs Freemium 2 | 3 | Find the total number of downloads for paying and non-paying users by date. Include only records where non-paying customers have more downloads than paying customers. The output should be sorted by earliest date first and contain three columns: date, non-paying downloads, paying downloads. 4 | 5 | Problem Difficulty Level : Hard 6 | 7 | Data Structure 8 | user_info 9 | 10 | - user_id 11 | - acc_id 12 | 13 | user_type 14 | 15 | - acc_id 16 | - paying_customer 17 | 18 | download_facts 19 | 20 | - date 21 | - user_id 22 | - downloads 23 | 24 | 25 | Data for the user_info, user_type and download_facts tables 26 | 27 | [User data CSV Format](user_info.csv)
28 | [User Type data CSV Format](user_type.csv)
29 | [Download facts data CSV Format](download_facts.csv) 30 | 31 | ## Solving using PySpark 32 | 33 | In Spark, we will solve this problem in two ways 34 | 1. Using PySpark Functions 35 | 2. Using Spark SQL 36 | 37 | Use the notebook below for the solution 38 | 39 | [Problem Solution](problem9.ipynb) 40 | 41 | ## Solving using MySQL 42 | 43 | In MySQL, we will load the data from CSV using the MySQL import functionality and then solve this problem. 44 | 45 | Output Query 46 | 47 | [Problem Solution](problem9.sql) 48 | 49 | Please also follow the blog below to understand this problem 50 | -------------------------------------------------------------------------------- /Problem 9/user_info.csv: -------------------------------------------------------------------------------- 1 | user_id,acc_id 2 | 0,1 3 | 1,716 4 | 2,749 5 | 3,713 6 | 4,744 7 | 5,726 8 | 6,706 9 | 7,750 10 | 8,732 11 | 9,706 12 | 10,729 13 | 11,748 14 | 12,731 15 | 13,739 16 | 14,740 17 | 15,705 18 | 16,706 19 | 17,701 20 | 18,746 21 | 19,726 22 | 20,748 23 | 21,701 24 | 22,707 25 | 23,710 26 | 24,702 27 | 25,720 28 | 26,730 29 | 27,721 30 | 28,733 31 | 29,732 32 | 30,729 33 | 31,716 34 | 32,722 35 | 33,745 36 | 34,737 37 | 35,730 38 | 36,729 39 | 37,723 40 | 38,710 41 | 39,707 42 | 40,737 43 | 41,717 44 | 42,741 45 | 43,718 46 | 44,736 47 | 45,720 48 | 46,743 49 | 47,707 50 | 48,721 51 | 49,748 52 | 50,715 53 | 51,709 54 | 52,732 55 | 53,732 56 | 54,712 57 | 55,701 58 | 56,721 59 | 57,744 60 | 58,724 61 | 59,727 62 | 60,743 63 | 61,744 64 | 62,717 65 | 63,723 66 | 64,713 67 | 65,706 68 | 66,731 69 | 67,722 70 | 68,744 71 | 69,705 72 | 70,703 73 | 71,725 74 | 72,740 75 | 73,713 76 | 74,732 77 | 75,720 78 | 76,709 79 | 77,739 80 | 78,703 81 | 79,732 82 | 80,728 83 | 81,737 84 | 82,711 85 | 83,745 86 | 84,734 87 | 85,723 88 | 86,718 89 | 87,702 90 | 88,718 91 | 89,744 92 | 90,710 93 | 91,727 94 | 92,739 95 | 93,728 96 | 94,740 97 | 95,744 98 | 96,737 99 | 97,726 100 | 98,722 101 | 99,727 102 | 100,712 -------------------------------------------------------------------------------- /Problem 4/README.md: -------------------------------------------------------------------------------- 1 | # Problem 4 -> Get Shortest and Longest City Name 2 | 3 | Query the two cities in STATION with the shortest and longest CITY names, as well as their respective lengths (i.e.: number of characters in the name). If there is more than one smallest or largest city, choose the one that comes first when ordered alphabetically. 4 | The STATION table is described as follows: 5 | 6 | Problem Difficulty Level : Hard 7 | 8 | Data Structure 9 | 10 | - ID 11 | - City 12 | - State 13 | - Latitude 14 | - Longitude 15 | 16 | image 17 | 18 | Data for station table 19 | 20 | [In CSV Format](stations.csv) 21 | 22 | Sample Input 23 | 24 | For example, CITY has four entries: DEF, ABC, PQRS and WXY. 25 | 26 | Sample Output 27 | 28 | `````````` 29 | ABC 3 30 | PQRS 4 31 | `````````` 32 | 33 | ## Solving using PySpark 34 | 35 | In Spark, we will solve this problem in two ways 36 | 1. Using PySpark Functions 37 | 2. Using Spark SQL 38 | 39 | Use the notebook below for the solution 40 | 41 | [Problem Solution](problem4.ipynb) 42 | 43 | ## Solving using PostgreSQL 44 | 45 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve this problem.
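As an alternative sketch (the solution file linked below uses a RANK() window function instead), the same result can be produced with two ordered LIMIT queries, assuming the public.station table from station.sql:

```
(SELECT city, LENGTH(city) AS citylength FROM public.station ORDER BY LENGTH(city) ASC, city LIMIT 1)
UNION ALL
(SELECT city, LENGTH(city) AS citylength FROM public.station ORDER BY LENGTH(city) DESC, city LIMIT 1);
```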
46 | 47 | Output Query 48 | 49 | [Problem Solution](problem4.sql) 50 | 51 | Please also follow the blog below to understand this problem 52 | -------------------------------------------------------------------------------- /Problem 9/problem9.sql: -------------------------------------------------------------------------------- 1 | SELECT paying_customer.date,nonpaying_download,paying_download 2 | FROM 3 | ( select acc.paying_customer 4 | ,download.date, SUM(download.downloads) as paying_download 5 | FROM user_info as usr 6 | LEFT OUTER JOIN user_type as acc 7 | ON usr.acc_id = acc.acc_id 8 | LEFT OUTER JOIN download_facts as download 9 | ON usr.user_id = download.user_id 10 | WHERE paying_customer = 'yes' 11 | GROUP BY acc.paying_customer,download.date ) as paying_customer 12 | LEFT OUTER JOIN 13 | ( select acc.paying_customer 14 | ,download.date, SUM(download.downloads) as nonpaying_download 15 | FROM user_info as usr 16 | LEFT OUTER JOIN user_type as acc 17 | ON usr.acc_id = acc.acc_id 18 | LEFT OUTER JOIN download_facts as download 19 | ON usr.user_id = download.user_id 20 | WHERE paying_customer = 'no' 21 | GROUP BY acc.paying_customer,download.date) as non_paying_customer 22 | ON paying_customer.date = non_paying_customer.date 23 | WHERE nonpaying_download > paying_download 24 | ORDER BY paying_customer.date 25 | 26 | ---- 27 | 28 | SELECT date, non_paying, 29 | paying 30 | FROM 31 | (SELECT date, sum(CASE 32 | WHEN paying_customer = 'yes' THEN downloads 33 | END) AS paying, 34 | sum(CASE 35 | WHEN paying_customer = 'no' THEN downloads 36 | END) AS non_paying 37 | FROM user_info a 38 | INNER JOIN user_type b ON a.acc_id = b.acc_id 39 | INNER JOIN download_facts c ON a.user_id=c.user_id 40 | GROUP BY date 41 | ORDER BY date) t 42 | WHERE (non_paying - paying) >0 43 | ORDER BY t.date ASC -------------------------------------------------------------------------------- /Problem 6/README.md: -------------------------------------------------------------------------------- 1 | # Problem 6 -> Students more than 75 Marks 2 | 3 | Query the Name of any student in STUDENTS who scored higher than 75 Marks. Order your output by the last three characters of each name. If two or more students both have names ending in the same last three characters (i.e.: Bobby, Robby, etc.), secondary sort them by ascending ID. 4 | 5 | Problem Difficulty Level : Medium 6 | 7 | Data Structure 8 | 9 | - ID 10 | - Name 11 | - Marks 12 | 13 | image 14 | 15 | Data for students table 16 | 17 | [In CSV Format](Students.csv) 18 | 19 | ## Sample Input 20 | 21 | ``` 22 | 1 Ashley 81 23 | 2 Samantha 75 24 | 3 Julia 76 25 | 4 Belvet 84 26 | ``` 27 | 28 | ## Sample Output 29 | 30 | ``` 31 | Ashley 32 | Julia 33 | Belvet 34 | ``` 35 | 36 | ## Explanation 37 | 38 | Only Ashley, Julia, and Belvet have Marks > 75. If you look at the last three characters of each of their names, there are no duplicates and 'ley' < 'lia' < 'vet'. 39 | 40 | ## Solving using PySpark 41 | 42 | In Spark, we will solve this problem in two ways 43 | 1. Using PySpark Functions 44 | 2. Using Spark SQL 45 | 46 | Use the notebook below for the solution 47 | 48 | [Problem Solution](problem6.ipynb) 49 | 50 | ## Solving using PostgreSQL 51 | 52 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve this problem.
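A minimal sketch of the ordering trick used in the solution file linked below: RIGHT(name, 3) extracts the last three characters for the primary sort, and id breaks ties between names with the same ending.

```
SELECT name
FROM public.students
WHERE marks > 75
ORDER BY RIGHT(name, 3), id;
```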
53 | 54 | Output Query 55 | 56 | [Problem Solution](problem6.sql) 57 | 58 | Please also follow the blog below to understand this problem 59 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Data Engineering Problems 2 | Data Engineering Problems with Solutions 3 | 4 | image 5 | 6 | Here, we are solving all the Data Engineering problems using the below methods 7 | 1. Solving problem using PySpark 8 | 1. Using PySpark Functions 9 | 2. Using Spark SQL 10 | 2. Solving problem using SQL (PostgreSQL or MySQL) 11 | 12 | Please find the list of all the problems 13 | 14 | 0. Problem0 -> [Get Employee with salary more than 100K](Problem%200/README.md) 15 | 1. Problem1 -> [Get Max Salary for each Employee](Problem%201/README.md) 16 | 2. Problem2 -> [Get Salary of all employees in Marketing department](Problem%202/README.md) 17 | 3. Problem3 -> [Find diff between count of cities and distinct count of cities](Problem%203/README.md) 18 | 4. Problem4 -> [Get Shortest and Longest City Name](Problem%204/README.md) 19 | 5. Problem5 -> [CITY names starting with vowels](Problem%205/README.md) 20 | 6. Problem6 -> [Students more than 75 Marks](Problem%206/README.md) 21 | 7. Problem7 -> [Returning active users](Problem%207/README.md) 22 | 8. Problem8 -> [Top distance travelled](Problem%208/README.md) 23 | 9. Problem9 -> [Premium vs Freemium](Problem%209/README.md) 24 | 25 | 26 | Also find the below blog for understanding all the data engineering problems 27 | 28 | https://developershome.blog/category/data-engineering/problem-solving/ 29 | 30 | Also find the below YouTube channel for understanding all the data engineering problems and learning new concepts of data engineering.
31 | 32 | https://www.youtube.com/@developershomeIn 33 | -------------------------------------------------------------------------------- /Problem 8/ride_log.csv: -------------------------------------------------------------------------------- 1 | id,user_id,distance 2 | 101,8,93 3 | 102,40,56 4 | 103,28,83 5 | 104,33,83 6 | 105,1,87 7 | 106,32,49 8 | 107,3,5 9 | 108,23,37 10 | 109,31,62 11 | 110,1,35 12 | 111,41,89 13 | 112,19,64 14 | 113,49,57 15 | 114,28,68 16 | 115,48,94 17 | 116,50,89 18 | 117,48,29 19 | 118,13,16 20 | 119,24,58 21 | 120,25,19 22 | 121,39,13 23 | 122,36,10 24 | 123,37,38 25 | 124,32,76 26 | 125,34,61 27 | 126,37,10 28 | 127,11,61 29 | 128,47,35 30 | 129,46,17 31 | 130,15,8 32 | 131,11,36 33 | 132,31,24 34 | 133,7,96 35 | 134,34,64 36 | 135,2,75 37 | 136,45,11 38 | 137,48,58 39 | 138,15,92 40 | 139,47,88 41 | 140,18,27 42 | 141,34,67 43 | 142,47,70 44 | 143,24,52 45 | 144,26,98 46 | 145,20,45 47 | 146,27,60 48 | 147,26,94 49 | 148,10,90 50 | 149,12,63 51 | 150,9,43 52 | 151,36,18 53 | 152,12,11 54 | 153,44,76 55 | 154,9,93 56 | 155,14,82 57 | 156,28,26 58 | 157,39,68 59 | 158,5,92 60 | 159,46,91 61 | 160,14,66 62 | 161,8,47 63 | 162,44,52 64 | 163,21,81 65 | 164,11,69 66 | 165,38,82 67 | 166,23,42 68 | 167,34,85 69 | 168,12,30 70 | 169,43,85 71 | 170,20,30 72 | 171,20,50 73 | 172,25,74 74 | 173,25,96 75 | 174,8,74 76 | 175,50,46 77 | 176,43,77 78 | 177,11,40 79 | 178,17,90 80 | 179,1,78 81 | 180,20,25 82 | 181,27,31 83 | 182,17,91 84 | 183,8,29 85 | 184,42,85 86 | 185,43,95 87 | 186,17,24 88 | 187,15,42 89 | 188,47,37 90 | 189,9,15 91 | 190,42,71 92 | 191,43,9 93 | 192,12,53 94 | 193,49,73 95 | 194,25,50 96 | 195,32,85 97 | 196,9,55 98 | 197,47,98 99 | 198,43,9 100 | 199,14,66 101 | 200,2,39 -------------------------------------------------------------------------------- /Problem 2/README.md: -------------------------------------------------------------------------------- 1 | # Problem 2 -> Employees From Marketing Department with Salary 2 | 3 | We have an employees table in which we have employee details with salary and department id of the employees. We have one more table in which we have department id and department name. 4 | Provide the below queries 5 | 1. Use both tables and list all the employees working in the marketing department, ordered from highest to lowest salary. 6 | 2. Provide the count of employees in each department with the department name. 7 | 8 | Problem Difficulty Level : Easy 9 | 10 | Data Structure 11 | 12 | Employee table 13 | 14 | image 15 | 16 | Department table 17 | 18 | image 19 | 20 | Data for employee salary table 21 | 22 | [In CSV Format](../Problem%200/employee_salary.csv) 23 | 24 | Data for department table 25 | 26 | [In CSV Format](department.csv) 27 | 28 | ## Solving using PySpark 29 | 30 | In Spark, we will solve this problem in two ways 31 | 1. Using PySpark Functions 32 | 2. Using Spark SQL 33 | 34 | Use the notebooks below for the solution 35 | 36 | [Problem Solution First Part](problem2_1.ipynb)
37 | [Problem Solution Second Part](problem2_2.ipynb) 38 | 39 | ## Solving using PostgreSQL 40 | 41 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve this problem. 42 | 43 | Output Query 44 | 45 | [Problem Solution](problem2.sql) 46 | 47 | Please also follow the blog below to understand this problem 48 | -------------------------------------------------------------------------------- /Problem 9/download_facts.csv: -------------------------------------------------------------------------------- 1 | date,user_id,downloads 2 | 24/8/2020,1,6 3 | 22/8/2020,2,6 4 | 18/8/2020,3,2 5 | 24/8/2020,4,4 6 | 19/8/2020,5,7 7 | 21/8/2020,6,3 8 | 24/8/2020,7,1 9 | 24/8/2020,8,8 10 | 17/8/2020,9,5 11 | 16/8/2020,10,4 12 | 22/8/2020,11,8 13 | 19/8/2020,12,6 14 | 15/8/2020,13,3 15 | 21/8/2020,14,0 16 | 24/8/2020,15,0 17 | 15/8/2020,16,5 18 | 18/8/2020,17,5 19 | 23/8/2020,18,8 20 | 15/8/2020,19,6 21 | 25/8/2020,20,4 22 | 16/8/2020,21,1 23 | 25/8/2020,22,4 24 | 22/8/2020,23,7 25 | 21/8/2020,24,4 26 | 25/8/2020,25,5 27 | 23/8/2020,26,6 28 | 19/8/2020,27,9 29 | 24/8/2020,28,3 30 | 20/8/2020,29,0 31 | 25/8/2020,30,8 32 | 20/8/2020,31,5 33 | 21/8/2020,32,8 34 | 15/8/2020,33,6 35 | 24/8/2020,34,4 36 | 25/8/2020,35,1 37 | 24/8/2020,36,7 38 | 17/8/2020,37,8 39 | 16/8/2020,38,8 40 | 17/8/2020,39,1 41 | 20/8/2020,40,8 42 | 18/8/2020,41,3 43 | 16/8/2020,42,0 44 | 23/8/2020,43,9 45 | 25/8/2020,44,9 46 | 16/8/2020,45,2 47 | 15/8/2020,46,2 48 | 21/8/2020,47,1 49 | 21/8/2020,48,4 50 | 22/8/2020,49,8 51 | 17/8/2020,50,6 52 | 21/8/2020,51,4 53 | 20/8/2020,52,7 54 | 16/8/2020,53,7 55 | 20/8/2020,54,6 56 | 20/8/2020,55,0 57 | 21/8/2020,56,8 58 | 18/8/2020,57,5 59 | 17/8/2020,58,2 60 | 24/8/2020,59,3 61 | 20/8/2020,60,7 62 | 22/8/2020,61,8 63 | 15/8/2020,62,6 64 | 23/8/2020,63,3 65 | 17/8/2020,64,4 66 | 16/8/2020,65,4 67 | 16/8/2020,66,3 68 | 19/8/2020,67,1 69 | 18/8/2020,68,2 70 | 17/8/2020,69,4 71 | 22/8/2020,70,7 72 | 20/8/2020,71,6 73 | 15/8/2020,72,2 74 | 17/8/2020,73,7 75 | 22/8/2020,74,1 76 | 17/8/2020,75,8 77 | 19/8/2020,76,0 78 | 25/8/2020,77,1 79 | 25/8/2020,78,0 80 | 17/8/2020,79,8 81 | 23/8/2020,80,7 82 | 24/8/2020,81,2 83 | 21/8/2020,82,0 84 | 24/8/2020,83,4 85 | 21/8/2020,84,0 86 | 25/8/2020,85,7 87 | 22/8/2020,86,1 88 | 20/8/2020,87,2 89 | 19/8/2020,88,3 90 | 22/8/2020,89,8 91 | 24/8/2020,90,0 92 | 22/8/2020,91,9 93 | 25/8/2020,92,7 94 | 25/8/2020,93,0 95 | 17/8/2020,94,1 96 | 23/8/2020,95,2 97 | 24/8/2020,96,3 98 | 21/8/2020,97,8 99 | 24/8/2020,98,0 100 | 21/8/2020,99,9 101 | 25/8/2020,100,7 -------------------------------------------------------------------------------- /Problem 0/employee_salary.csv: -------------------------------------------------------------------------------- 1 | id,first_name,last_name,salary,department_id 2 | 45,Kevin,Duncan,45210,1003 3 | 25,Pamela,Matthews,57944,1005 4 | 48,Robert,Lynch,117960,1004 5 | 34,Justin,Dunn,67992,1003 6 | 62,Dale,Hayes,97662,1005 7 | 1,Todd,Wilson,110000,1006 8 | 61,Ryan,Brown,120000,1003 9 | 21,Stephen,Berry,123617,1002 10 | 13,Julie,Sanchez,210000,1001 11 | 55,Michael,Morris,106799,1005 12 | 44,Trevor,Carter,38670,1001 13 | 73,William,Preston,155225,1003 14 | 39,Linda,Clark,186781,1002 15 | 10,Sean,Crawford,190000,1006 16 | 30,Stephen,Smith,194791,1001 17 | 75,Julia,Ramos,105000,1006 18 | 59,Kevin,Robinson,100924,1005 19 | 69,Ernest,Peterson,115993,1005 20 | 65,Deborah,Martin,67389,1004 21 | 63,Richard,Sanford,136083,1001 22 | 29,Jason,Olsen,51937,1006 23 | 11,Kevin,Townsend,166861,1002 24 |
43,Joseph,Rogers,22800,1005 25 | 32,Eric,Zimmerman,83093,1006 26 | 6,Natasha,Swanson,90000,1005 27 | 3,Kelly,Rosario,42689,1002 28 | 16,Briana,Rivas,151668,1005 29 | 38,Nicole,Lewis,114079,1001 30 | 42,Traci,Williams,180000,1003 31 | 49,Amber,Harding,77764,1002 32 | 26,Allison,Johnson,128782,1001 33 | 74,Richard,Cole,180361,1003 34 | 23,Angela,Williams,100875,1004 35 | 19,Michael,Ramsey,63159,1003 36 | 28,Alexis,Beck,12260,1005 37 | 64,Danielle,Williams,120000,1006 38 | 51,Theresa,Everett,31404,1002 39 | 58,Edward,Sharp,41077,1005 40 | 36,Jesus,Ward,36078,1005 41 | 5,Sherry,Golden,44101,1002 42 | 9,Christy,Mitchell,150000,1001 43 | 35,John,Ball,47795,1004 44 | 54,Wesley,Tucker,90221,1005 45 | 20,Cody,Gonzalez,112809,1004 46 | 57,Patricia,Harmon,147417,1005 47 | 24,William,Flores,142674,1003 48 | 60,Charles,Pearson,173317,1004 49 | 17,Jason,Burnett,42525,1006 50 | 7,Diane,Gordon,74591,1002 51 | 15,Anthony,Valdez,96898,1001 52 | 41,John,George,21642,1001 53 | 71,Kristine,Casey,67651,1003 54 | 12,Joshua,Johnson,123082,1004 55 | 68,Antonio,Carpenter,83684,1002 56 | 47,Kimberly,Dean,71416,1003 57 | 37,Philip,Gillespie,36424,1006 58 | 31,Kimberly,Brooks,95327,1003 59 | 27,Anthony,Ball,34386,1003 60 | 40,Colleen,Carrillo,147723,1004 61 | 70,Karen,Fernandez,101238,1003 62 | 4,Patricia,Powell,170000,1004 63 | 22,Brittany,Scott,162537,1002 64 | 8,Mercedes,Rodriguez,61048,1005 65 | 67,Tyler,Green,111085,1002 66 | 52,Kara,Smith,192838,1004 67 | 46,Joshua,Ewing,73088,1003 68 | 18,Jeffrey,Harris,20000,1002 69 | 56,Rachael,Williams,103585,1002 70 | 50,Victoria,Wilson,176620,1002 71 | 14,John,Coleman,152434,1001 72 | 72,Christine,Frye,137244,1004 73 | 2,Justin,Simon,130000,1005 74 | 53,Teresa,Cohen,98860,1001 75 | 66,Dustin,Bush,47567,1004 76 | 33,Peter,Holt,69945,1002 77 | -------------------------------------------------------------------------------- /Problem 1/employee.csv: -------------------------------------------------------------------------------- 1 | "id","first_name","last_name","salary","department_id" 2 | 1,Todd,Wilson,110000,1006 3 | 1,Todd,Wilson,106119,1006 4 | 2,Justin,Simon,128922,1005 5 | 2,Justin,Simon,130000,1005 6 | 3,Kelly,Rosario,42689,1002 7 | 4,Patricia,Powell,162825,1004 8 | 4,Patricia,Powell,170000,1004 9 | 5,Sherry,Golden,44101,1002 10 | 6,Natasha,Swanson,79632,1005 11 | 6,Natasha,Swanson,90000,1005 12 | 7,Diane,Gordon,74591,1002 13 | 8,Mercedes,Rodriguez,61048,1005 14 | 9,Christy,Mitchell,137236,1001 15 | 9,Christy,Mitchell,140000,1001 16 | 9,Christy,Mitchell,150000,1001 17 | 10,Sean,Crawford,182065,1006 18 | 10,Sean,Crawford,190000,1006 19 | 11,Kevin,Townsend,166861,1002 20 | 12,Joshua,Johnson,123082,1004 21 | 13,Julie,Sanchez,185663,1001 22 | 13,Julie,Sanchez,200000,1001 23 | 13,Julie,Sanchez,210000,1001 24 | 14,John,Coleman,152434,1001 25 | 15,Anthony,Valdez,96898,1001 26 | 16,Briana,Rivas,151668,1005 27 | 17,Jason,Burnett,42525,1006 28 | 18,Jeffrey,Harris,14491,1002 29 | 18,Jeffrey,Harris,20000,1002 30 | 19,Michael,Ramsey,63159,1003 31 | 20,Cody,Gonzalez,112809,1004 32 | 21,Stephen,Berry,123617,1002 33 | 22,Brittany,Scott,162537,1002 34 | 23,Angela,Williams,100875,1004 35 | 24,William,Flores,142674,1003 36 | 25,Pamela,Matthews,57944,1005 37 | 26,Allison,Johnson,128782,1001 38 | 27,Anthony,Ball,34386,1003 39 | 28,Alexis,Beck,12260,1005 40 | 29,Jason,Olsen,51937,1006 41 | 30,Stephen,Smith,194791,1001 42 | 31,Kimberly,Brooks,95327,1003 43 | 32,Eric,Zimmerman,83093,1006 44 | 33,Peter,Holt,69945,1002 45 | 34,Justin,Dunn,67992,1003 46 | 35,John,Ball,47795,1004 47 | 
36,Jesus,Ward,36078,1005 48 | 37,Philip,Gillespie,36424,1006 49 | 38,Nicole,Lewis,114079,1001 50 | 39,Linda,Clark,186781,1002 51 | 40,Colleen,Carrillo,147723,1004 52 | 41,John,George,21642,1001 53 | 42,Traci,Williams,138892,1003 54 | 42,Traci,Williams,150000,1003 55 | 42,Traci,Williams,160000,1003 56 | 42,Traci,Williams,180000,1003 57 | 43,Joseph,Rogers,22800,1005 58 | 44,Trevor,Carter,38670,1001 59 | 45,Kevin,Duncan,45210,1003 60 | 46,Joshua,Ewing,73088,1003 61 | 47,Kimberly,Dean,71416,1003 62 | 48,Robert,Lynch,117960,1004 63 | 49,Amber,Harding,77764,1002 64 | 50,Victoria,Wilson,176620,1002 65 | 51,Theresa,Everett,31404,1002 66 | 52,Kara,Smith,192838,1004 67 | 53,Teresa,Cohen,98860,1001 68 | 54,Wesley,Tucker,90221,1005 69 | 55,Michael,Morris,106799,1005 70 | 56,Rachael,Williams,103585,1002 71 | 57,Patricia,Harmon,147417,1005 72 | 58,Edward,Sharp,41077,1005 73 | 59,Kevin,Robinson,100924,1005 74 | 60,Charles,Pearson,173317,1004 75 | 61,Ryan,Brown,110225,1003 76 | 61,Ryan,Brown,120000,1003 77 | 62,Dale,Hayes,97662,1005 78 | 63,Richard,Sanford,136083,1001 79 | 64,Danielle,Williams,98655,1006 80 | 64,Danielle,Williams,110000,1006 81 | 64,Danielle,Williams,120000,1006 82 | 65,Deborah,Martin,67389,1004 83 | 66,Dustin,Bush,47567,1004 84 | 67,Tyler,Green,111085,1002 85 | 68,Antonio,Carpenter,83684,1002 86 | 69,Ernest,Peterson,115993,1005 87 | 70,Karen,Fernandez,101238,1003 88 | 71,Kristine,Casey,67651,1003 89 | 72,Christine,Frye,137244,1004 90 | 73,William,Preston,155225,1003 91 | 74,Richard,Cole,180361,1003 92 | 75,Julia,Ramos,61398,1006 93 | 75,Julia,Ramos,70000,1006 94 | 75,Julia,Ramos,83000,1006 95 | 75,Julia,Ramos,90000,1006 96 | 75,Julia,Ramos,105000,1006 97 | -------------------------------------------------------------------------------- /Problem 7/transaction.csv: -------------------------------------------------------------------------------- 1 | id,user_id,item,created_at,revenue 2 | 1,109,milk,2020-03-03,123 3 | 2,139,biscuit,2020-03-18,421 4 | 3,120,milk,2020-03-18,176 5 | 4,108,banana,2020-03-18,862 6 | 5,130,milk,2020-03-28,333 7 | 6,103,bread,2020-03-29,862 8 | 7,122,banana,2020-03-07,952 9 | 8,125,bread,2020-03-13,317 10 | 9,139,bread,2020-03-30,929 11 | 10,141,banana,2020-03-17,812 12 | 11,116,bread,2020-03-31,226 13 | 12,128,bread,2020-03-04,112 14 | 13,146,biscuit,2020-03-04,362 15 | 14,119,banana,2020-03-28,127 16 | 15,142,bread,2020-03-09,503 17 | 16,122,bread,2020-03-06,593 18 | 17,128,biscuit,2020-03-24,160 19 | 18,112,banana,2020-03-24,262 20 | 19,149,banana,2020-03-29,382 21 | 20,100,banana,2020-03-18,599 22 | 21,130,milk,2020-03-16,604 23 | 22,103,milk,2020-03-31,290 24 | 23,112,banana,2020-03-23,523 25 | 24,102,bread,2020-03-25,325 26 | 25,120,biscuit,2020-03-21,858 27 | 26,109,bread,2020-03-22,432 28 | 27,101,milk,2020-03-01,449 29 | 28,138,milk,2020-03-19,961 30 | 29,100,milk,2020-03-29,410 31 | 30,129,milk,2020-03-02,771 32 | 31,123,milk,2020-03-31,434 33 | 32,104,biscuit,2020-03-31,957 34 | 33,110,bread,2020-03-13,210 35 | 34,143,bread,2020-03-27,870 36 | 35,130,milk,2020-03-12,176 37 | 36,128,milk,2020-03-28,498 38 | 37,133,banana,2020-03-21,837 39 | 38,150,banana,2020-03-20,927 40 | 39,120,milk,2020-03-27,793 41 | 40,109,bread,2020-03-02,362 42 | 41,110,bread,2020-03-13,262 43 | 42,140,milk,2020-03-09,468 44 | 43,112,banana,2020-03-04,381 45 | 44,117,biscuit,2020-03-19,831 46 | 45,137,banana,2020-03-23,490 47 | 46,130,bread,2020-03-09,149 48 | 47,133,bread,2020-03-08,658 49 | 48,143,milk,2020-03-11,317 50 | 49,111,biscuit,2020-03-23,204 51 | 
50,150,banana,2020-03-04,299 52 | 51,131,bread,2020-03-10,155 53 | 52,140,biscuit,2020-03-17,810 54 | 53,147,banana,2020-03-22,702 55 | 54,119,biscuit,2020-03-15,355 56 | 55,116,milk,2020-03-12,468 57 | 56,141,milk,2020-03-14,254 58 | 57,143,bread,2020-03-16,647 59 | 58,105,bread,2020-03-21,562 60 | 59,149,biscuit,2020-03-11,827 61 | 60,117,banana,2020-03-22,249 62 | 61,150,banana,2020-03-21,450 63 | 62,134,bread,2020-03-08,981 64 | 63,133,banana,2020-03-26,353 65 | 64,127,milk,2020-03-27,300 66 | 65,101,milk,2020-03-26,740 67 | 66,137,biscuit,2020-03-12,473 68 | 67,113,biscuit,2020-03-21,278 69 | 68,141,bread,2020-03-21,118 70 | 69,112,biscuit,2020-03-14,334 71 | 70,118,milk,2020-03-30,603 72 | 71,111,milk,2020-03-19,205 73 | 72,146,biscuit,2020-03-13,599 74 | 73,148,banana,2020-03-14,530 75 | 74,100,banana,2020-03-13,175 76 | 75,105,banana,2020-03-05,815 77 | 76,129,milk,2020-03-02,489 78 | 77,121,milk,2020-03-16,476 79 | 78,117,bread,2020-03-11,270 80 | 79,133,milk,2020-03-12,446 81 | 80,124,bread,2020-03-31,937 82 | 81,145,bread,2020-03-07,821 83 | 82,105,banana,2020-03-09,972 84 | 83,131,milk,2020-03-09,808 85 | 84,114,biscuit,2020-03-31,202 86 | 85,120,milk,2020-03-06,898 87 | 86,130,milk,2020-03-06,581 88 | 87,141,biscuit,2020-03-11,749 89 | 88,147,bread,2020-03-14,262 90 | 89,118,milk,2020-03-15,735 91 | 90,136,biscuit,2020-03-22,410 92 | 91,132,bread,2020-03-06,161 93 | 92,137,biscuit,2020-03-31,427 94 | 93,107,bread,2020-03-01,701 95 | 94,111,biscuit,2020-03-18,218 96 | 95,100,bread,2020-03-07,410 97 | 96,106,milk,2020-03-21,379 98 | 97,114,banana,2020-03-25,705 99 | 98,110,bread,2020-03-27,225 100 | 99,130,milk,2020-03-16,494 101 | 100,117,bread,2020-03-10,209 -------------------------------------------------------------------------------- /Problem 6/problem6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve problems two ways\n", 9 | "1. First using PySpark function \n", 10 | "2. Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First Load all the required library and also Start Spark Session\n", 21 | "# Load all the required library\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). 
For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/14 14:24:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#Start Spark Session\n", 49 | "spark = SparkSession.builder.appName(\"problem6\").getOrCreate()\n", 50 | "sqlContext = SparkSession(spark)\n", 51 | "#Dont Show warning only error\n", 52 | "spark.sparkContext.setLogLevel(\"ERROR\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 4, 58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "#Load CSV file into DataFrame\n", 63 | "studentdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"Students.csv\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 5, 69 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "root\n", 77 | " |-- ID: integer (nullable = true)\n", 78 | " |-- Name: string (nullable = true)\n", 79 | " |-- Marks: integer (nullable = true)\n", 80 | "\n" 81 | ] 82 | } 83 | ], 84 | "source": [ 85 | "#Check Schema of DataFrame\n", 86 | "studentdf.printSchema()" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 6, 92 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "+---+---------+-----+\n", 100 | "| ID| Name|Marks|\n", 101 | "+---+---------+-----+\n", 102 | "| 19| Samantha| 87|\n", 103 | "| 21| Julia| 96|\n", 104 | "| 11| Britney| 95|\n", 105 | "| 32| Kristeen| 100|\n", 106 | "| 12| Dyana| 55|\n", 107 | "| 13| Jenny| 66|\n", 108 | "| 14|Christene| 88|\n", 109 | "| 15| Meera| 24|\n", 110 | "| 16| Priya| 76|\n", 111 | "| 17| Priyanka| 77|\n", 112 | "| 18| Paige| 74|\n", 113 | "| 19| Jane| 64|\n", 114 | "| 21| Belvet| 78|\n", 115 | "| 31| Scarlet| 80|\n", 116 | "| 41| Salma| 81|\n", 117 | "| 51| Amanda| 34|\n", 118 | "| 61| Heraldo| 94|\n", 119 | "| 71| Stuart| 99|\n", 120 | "| 81| Aamina| 77|\n", 121 | "| 76| Amina| 89|\n", 122 | "+---+---------+-----+\n", 123 | "only showing top 20 rows\n", 124 | "\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "#Check sample Data \n", 130 | "studentdf.show()" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 14, 136 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171", 137 | "metadata": {}, 138 | "outputs": [ 139 | { 140 | "name": "stdout", 141 | "output_type": "stream", 142 | "text": [ 143 | "+---------+\n", 144 | "| Name|\n", 145 | "+---------+\n", 146 | "| Stuart|\n", 147 | "| Kristeen|\n", 148 | "|Christene|\n", 149 | "| Amina|\n", 150 | "| Aamina|\n", 151 | "| Priya|\n", 152 | "| Heraldo|\n", 153 | "| Scarlet|\n", 154 | "| Julia|\n", 155 | "| Salma|\n", 156 | "| Britney|\n", 157 | "| Priyanka|\n", 158 | "| Samantha|\n", 159 | "| Vivek|\n", 160 | "| Belvet|\n", 161 | "| Devil|\n", 162 | "| Evil|\n", 163 | "+---------+\n", 164 | "\n" 165 | ] 166 | } 167 | ], 168 | "source": [ 169 | "#Solving Problem using PySpark \n", 170 | "#Filter with Marks > 75 and then order by last 3 chars and ID\n", 171 | "from pyspark.sql.functions import expr\n", 172 | "studentdf.select(\"Name\").where(\"Marks > 75\").orderBy(expr(\"RIGHT(Name,3)\"),\"ID\").show(n=100)" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 13, 178 | "id": 
"c28f990b-7e88-4c88-bd36-ca17a83544c1", 179 | "metadata": {}, 180 | "outputs": [], 181 | "source": [ 182 | "# Now we are solving Same problem using Spark SQL \n", 183 | "# Creating Temp Table or HIVE table\n", 184 | "stationdf.createOrReplaceTempView(\"tmpStudent\")" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": 15, 190 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 191 | "metadata": {}, 192 | "outputs": [ 193 | { 194 | "name": "stdout", 195 | "output_type": "stream", 196 | "text": [ 197 | "+---------+\n", 198 | "| Name|\n", 199 | "+---------+\n", 200 | "| Stuart|\n", 201 | "| Kristeen|\n", 202 | "|Christene|\n", 203 | "| Amina|\n", 204 | "| Aamina|\n", 205 | "| Priya|\n", 206 | "| Heraldo|\n", 207 | "| Scarlet|\n", 208 | "| Julia|\n", 209 | "| Salma|\n", 210 | "| Britney|\n", 211 | "| Priyanka|\n", 212 | "| Samantha|\n", 213 | "| Vivek|\n", 214 | "| Belvet|\n", 215 | "| Devil|\n", 216 | "| Evil|\n", 217 | "+---------+\n", 218 | "\n" 219 | ] 220 | } 221 | ], 222 | "source": [ 223 | "# Now we have SQL Table and we can write SQL Query on top of that \n", 224 | "# For example by Select on table \n", 225 | "sqlContext.sql(\"SELECT Name \\\n", 226 | " FROM tmpStudent \\\n", 227 | " WHERE Marks > 75 \\\n", 228 | " ORDER BY right(Name,3),ID\").show()" 229 | ] 230 | } 231 | ], 232 | "metadata": { 233 | "kernelspec": { 234 | "display_name": "Python 3 (ipykernel)", 235 | "language": "python", 236 | "name": "python3" 237 | }, 238 | "language_info": { 239 | "codemirror_mode": { 240 | "name": "ipython", 241 | "version": 3 242 | }, 243 | "file_extension": ".py", 244 | "mimetype": "text/x-python", 245 | "name": "python", 246 | "nbconvert_exporter": "python", 247 | "pygments_lexer": "ipython3", 248 | "version": "3.8.13" 249 | } 250 | }, 251 | "nbformat": 4, 252 | "nbformat_minor": 5 253 | } 254 | -------------------------------------------------------------------------------- /Problem 7/problem7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve problems two ways\n", 9 | "1. First using PySpark function \n", 10 | "2. 
Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First Load all the required library and also Start Spark Session\n", 21 | "# Load all the required library\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/15 10:13:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#Start Spark Session\n", 49 | "spark = SparkSession.builder.appName(\"problem7\").getOrCreate()\n", 50 | "sqlContext = SparkSession(spark)\n", 51 | "#Dont Show warning only error\n", 52 | "spark.sparkContext.setLogLevel(\"ERROR\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | " \r" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "#Load CSV file into DataFrame\n", 71 | "transactiondf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"transaction.csv\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "root\n", 85 | " |-- id: integer (nullable = true)\n", 86 | " |-- user_id: integer (nullable = true)\n", 87 | " |-- item: string (nullable = true)\n", 88 | " |-- created_at: string (nullable = true)\n", 89 | " |-- revenue: integer (nullable = true)\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "#Check Schema of DataFrame\n", 96 | "transactiondf.printSchema()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 10, 102 | "id": "c9ba5185-8682-4b49-88b8-9391cd0c2dac", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "from pyspark.sql.functions import col\n", 107 | "transactiondf = transactiondf.withColumn(\"created_at\",col(\"created_at\").cast(\"date\"))" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 12, 113 | "id": "59514ce5-8584-4b67-9cff-934e9287f818", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "root\n", 121 | " |-- id: 
integer (nullable = true)\n", 122 | " |-- user_id: integer (nullable = true)\n", 123 | " |-- item: string (nullable = true)\n", 124 | " |-- created_at: date (nullable = true)\n", 125 | " |-- revenue: integer (nullable = true)\n", 126 | "\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "transactiondf.printSchema()" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 13, 137 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "+---+-------+-------+----------+-------+\n", 145 | "| id|user_id| item|created_at|revenue|\n", 146 | "+---+-------+-------+----------+-------+\n", 147 | "| 1| 109| milk|2020-03-03| 123|\n", 148 | "| 2| 139|biscuit|2020-03-18| 421|\n", 149 | "| 3| 120| milk|2020-03-18| 176|\n", 150 | "| 4| 108| banana|2020-03-18| 862|\n", 151 | "| 5| 130| milk|2020-03-28| 333|\n", 152 | "| 6| 103| bread|2020-03-29| 862|\n", 153 | "| 7| 122| banana|2020-03-07| 952|\n", 154 | "| 8| 125| bread|2020-03-13| 317|\n", 155 | "| 9| 139| bread|2020-03-30| 929|\n", 156 | "| 10| 141| banana|2020-03-17| 812|\n", 157 | "| 11| 116| bread|2020-03-31| 226|\n", 158 | "| 12| 128| bread|2020-03-04| 112|\n", 159 | "| 13| 146|biscuit|2020-03-04| 362|\n", 160 | "| 14| 119| banana|2020-03-28| 127|\n", 161 | "| 15| 142| bread|2020-03-09| 503|\n", 162 | "| 16| 122| bread|2020-03-06| 593|\n", 163 | "| 17| 128|biscuit|2020-03-24| 160|\n", 164 | "| 18| 112| banana|2020-03-24| 262|\n", 165 | "| 19| 149| banana|2020-03-29| 382|\n", 166 | "| 20| 100| banana|2020-03-18| 599|\n", 167 | "+---+-------+-------+----------+-------+\n", 168 | "only showing top 20 rows\n", 169 | "\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "# Check sample data \n", 175 | "transactiondf.show()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 14, 181 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 182 | "metadata": {}, 183 | "outputs": [], 184 | "source": [ 185 | "# Now we solve the same problem using Spark SQL \n", 186 | "# Create a temp view (queryable like a Hive table)\n", 187 | "transactiondf.createOrReplaceTempView(\"tmpTransaction\")" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 16, 193 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 194 | "metadata": {}, 195 | "outputs": [ 196 | { 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "+-------+\n", 201 | "|user_id|\n", 202 | "+-------+\n", 203 | "| 100|\n", 204 | "| 103|\n", 205 | "| 105|\n", 206 | "| 109|\n", 207 | "| 110|\n", 208 | "| 111|\n", 209 | "| 112|\n", 210 | "| 114|\n", 211 | "| 117|\n", 212 | "| 120|\n", 213 | "| 122|\n", 214 | "| 128|\n", 215 | "| 129|\n", 216 | "| 130|\n", 217 | "| 131|\n", 218 | "| 133|\n", 219 | "| 141|\n", 220 | "| 143|\n", 221 | "| 150|\n", 222 | "+-------+\n", 223 | "\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Now we have a SQL view and we can write SQL queries on top of it \n", 229 | "# Self-join the transactions and keep users with another purchase within 7 days \n", 230 | "sqlContext.sql(\"SELECT DISTINCT(a1.user_id) \\\n", 231 | " FROM tmpTransaction a1 \\\n", 232 | " JOIN tmpTransaction a2 ON a1.user_id=a2.user_id \\\n", 233 | " AND a1.id <> a2.id \\\n", 234 | " AND DATEDIFF(a2.created_at,a1.created_at) BETWEEN 0 AND 7 \\\n", 235 | " ORDER BY a1.user_id;\").show()" 236 | ] 237 | },
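A DataFrame-API version of the same self-join, since the markdown cell at the top of this notebook promises a PySpark-function solution as well. This is a minimal sketch assuming the transactiondf defined earlier in this notebook; col, datediff, and Column.between are standard PySpark, and the repeat_users name is purely illustrative:

from pyspark.sql.functions import col, datediff

a1 = transactiondf.alias("a1")
a2 = transactiondf.alias("a2")

# Keep users that have another transaction (different id) 0-7 days after one of theirs
repeat_users = (
    a1.join(a2, (col("a1.user_id") == col("a2.user_id"))
                & (col("a1.id") != col("a2.id"))
                & (datediff(col("a2.created_at"), col("a1.created_at")).between(0, 7)))
      .select(col("a1.user_id"))
      .distinct()
      .orderBy("user_id")
)
repeat_users.show()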
238 | { 239 | "cell_type": "code", 240 | "execution_count": null, 241 | "id": "e55eb16a-fb5c-42b6-9f7c-feb1ff9c2945", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [] 245 | } 246 | ], 247 | "metadata": { 248 | "kernelspec": { 249 | "display_name": "Python 3 (ipykernel)", 250 | "language": "python", 251 | "name": "python3" 252 | }, 253 | "language_info": { 254 | "codemirror_mode": { 255 | "name": "ipython", 256 | "version": 3 257 | }, 258 | "file_extension": ".py", 259 | "mimetype": "text/x-python", 260 | "name": "python", 261 | "nbconvert_exporter": "python", 262 | "pygments_lexer": "ipython3", 263 | "version": "3.8.13" 264 | } 265 | }, 266 | "nbformat": 4, 267 | "nbformat_minor": 5 268 | } 269 | -------------------------------------------------------------------------------- /Problem 4/problem4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve the problem in two ways:\n", 9 | "1. First using PySpark functions \n", 10 | "2. Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First, load all the required libraries and start the Spark session\n", 21 | "# Load all the required libraries\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/09 22:16:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#Start Spark Session\n", 49 | "spark = SparkSession.builder.appName(\"problem4\").getOrCreate()\n", 50 | "sqlContext = SparkSession(spark)\n", 51 | "#Dont Show warning only error\n", 52 | "spark.sparkContext.setLogLevel(\"ERROR\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | " \r" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "#Load CSV file into DataFrame\n", 71 | "stationdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"station.csv\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "root\n", 85 | " |-- ID: integer (nullable = true)\n", 86 | " |-- City: string (nullable = true)\n", 87 | " |-- State: string (nullable = true)\n", 88 | " |-- Lattitude: double (nullable = true)\n", 89 | " |-- Longitude: double (nullable = true)\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "#Check Schema of DataFrame\n", 96 | "stationdf.printSchema()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "+---+-----------+-----+-----------+-----------+\n", 110 | "| ID| City|State| Lattitude| Longitude|\n", 111 | "+---+-----------+-----+-----------+-----------+\n", 112 | "|478| Tipton| IN|33.54792701|97.94286036|\n", 113 | "|619| Arlington| CO|75.17993079|92.94615894|\n", 114 | "|711| Turner| AR|50.24380534|101.4580163|\n", 115 | "|839| Slidell| LA|85.32270304|151.8743276|\n", 116 | "|411| Negreet| LA| 98.9707194|105.3376115|\n", 117 | "|588| Glencoe| KY|46.38739244|136.0427027|\n", 118 | "|665| Chelsea| IA|98.72210937|59.68913002|\n", 119 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n", 120 | "|811| Dorrance| KS|102.0888316|121.5614372|\n", 121 | "|698| Albany| CA|49.75112765|80.21211317|\n", 122 | "|325| Monument| KS|70.52300953|141.7680413|\n", 123 | "|414| Manchester| MD|73.51580724|37.14602869|\n", 124 | "|113| Prescott| IA|39.93234421|65.79327823|\n", 125 | "|971|Graettinger| IA|94.66283665|150.3826243|\n", 126 | "|266| Cahone| CO|116.2321963| 127.009554|\n", 127 | "|617| Sturgis| MS|36.45673517|126.1690696|\n", 128 | "|495| Upperco| MD|114.2157413|29.63104758|\n", 129 | "|473| Highwood| IL|27.25445814|150.9227402|\n", 130 | "|959| Waipahu| HI|106.4460526|33.91451792|\n", 131 | "|438| Bowdon| GA|88.98111013|78.49025241|\n", 132 | "+---+-----------+-----+-----------+-----------+\n", 133 | "only showing top 20 rows\n", 134 | "\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "#Check sample Data \n", 140 | "stationdf.show()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "# Now we are solving Same problem using Spark SQL \n", 151 | "# Creating Temp Table or HIVE table\n", 152 | "stationdf.createOrReplaceTempView(\"tmpStation\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | 
"id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "+---+-----------+-----+-----------+-----------+\n", 166 | "| ID| City|State| Lattitude| Longitude|\n", 167 | "+---+-----------+-----+-----------+-----------+\n", 168 | "|478| Tipton| IN|33.54792701|97.94286036|\n", 169 | "|619| Arlington| CO|75.17993079|92.94615894|\n", 170 | "|711| Turner| AR|50.24380534|101.4580163|\n", 171 | "|839| Slidell| LA|85.32270304|151.8743276|\n", 172 | "|411| Negreet| LA| 98.9707194|105.3376115|\n", 173 | "|588| Glencoe| KY|46.38739244|136.0427027|\n", 174 | "|665| Chelsea| IA|98.72210937|59.68913002|\n", 175 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n", 176 | "|811| Dorrance| KS|102.0888316|121.5614372|\n", 177 | "|698| Albany| CA|49.75112765|80.21211317|\n", 178 | "|325| Monument| KS|70.52300953|141.7680413|\n", 179 | "|414| Manchester| MD|73.51580724|37.14602869|\n", 180 | "|113| Prescott| IA|39.93234421|65.79327823|\n", 181 | "|971|Graettinger| IA|94.66283665|150.3826243|\n", 182 | "|266| Cahone| CO|116.2321963| 127.009554|\n", 183 | "|617| Sturgis| MS|36.45673517|126.1690696|\n", 184 | "|495| Upperco| MD|114.2157413|29.63104758|\n", 185 | "|473| Highwood| IL|27.25445814|150.9227402|\n", 186 | "|959| Waipahu| HI|106.4460526|33.91451792|\n", 187 | "|438| Bowdon| GA|88.98111013|78.49025241|\n", 188 | "+---+-----------+-----+-----------+-----------+\n", 189 | "only showing top 20 rows\n", 190 | "\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "# Now we have SQL Table and we can write SQL Query on top of that \n", 196 | "# For example by Select on table \n", 197 | "sqlContext.sql(\"SELECT * FROM tmpStation\").show()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 9, 203 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24", 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stderr", 208 | "output_type": "stream", 209 | "text": [ 210 | " \r" 211 | ] 212 | }, 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "+--------------+----------+\n", 218 | "| city|citylength|\n", 219 | "+--------------+----------+\n", 220 | "| Amo| 3|\n", 221 | "|Fredericksburg| 14|\n", 222 | "+--------------+----------+\n", 223 | "\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "# Now we will write query to get max salary for each employee \n", 229 | "# so we will use SQL Group by and SQL Order by functions \n", 230 | "sqlContext.sql(\"SELECT q1.city, q1.citylength FROM \\\n", 231 | " (SELECT CITY,LENGTH(CITY) as citylength, RANK() OVER (PARTITION BY LENGTH(CITY) ORDER BY LENGTH(CITY),CITY) as actualrank \\\n", 232 | " FROM tmpStation) q1 \\\n", 233 | " WHERE q1. 
239 | { 240 | "cell_type": "code", 241 | "execution_count": null, 242 | "id": "3eeb1534-4da5-427f-9103-1a7bb847170e", 243 | "metadata": {}, 244 | "outputs": [], 245 | "source": [] 246 | } 247 | ], 248 | "metadata": { 249 | "kernelspec": { 250 | "display_name": "Python 3 (ipykernel)", 251 | "language": "python", 252 | "name": "python3" 253 | }, 254 | "language_info": { 255 | "codemirror_mode": { 256 | "name": "ipython", 257 | "version": 3 258 | }, 259 | "file_extension": ".py", 260 | "mimetype": "text/x-python", 261 | "name": "python", 262 | "nbconvert_exporter": "python", 263 | "pygments_lexer": "ipython3", 264 | "version": "3.8.13" 265 | } 266 | }, 267 | "nbformat": 4, 268 | "nbformat_minor": 5 269 | } 270 | -------------------------------------------------------------------------------- /Problem 3/problem3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve the problem in two ways:\n", 9 | "1. First using PySpark functions \n", 10 | "2. Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First, load all the required libraries and start the Spark session\n", 21 | "# Load all the required libraries\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/09 10:33:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#Start Spark Session\n", 49 | "spark = SparkSession.builder.appName(\"problem3\").getOrCreate()\n", 50 | "sqlContext = SparkSession(spark)\n", 51 | "#Dont Show warning only error\n", 52 | "spark.sparkContext.setLogLevel(\"ERROR\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | " \r" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "#Load CSV file into DataFrame\n", 71 | "stationdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"station.csv\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "root\n", 85 | " |-- ID: integer (nullable = true)\n", 86 | " |-- City: string (nullable = true)\n", 87 | " |-- State: string (nullable = true)\n", 88 | " |-- Lattitude: double (nullable = true)\n", 89 | " |-- Longitude: double (nullable = true)\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "#Check Schema of DataFrame\n", 96 | "stationdf.printSchema()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "+---+-----------+-----+-----------+-----------+\n", 110 | "| ID| City|State| Lattitude| Longitude|\n", 111 | "+---+-----------+-----+-----------+-----------+\n", 112 | "|478| Tipton| IN|33.54792701|97.94286036|\n", 113 | "|619| Arlington| CO|75.17993079|92.94615894|\n", 114 | "|711| Turner| AR|50.24380534|101.4580163|\n", 115 | "|839| Slidell| LA|85.32270304|151.8743276|\n", 116 | "|411| Negreet| LA| 98.9707194|105.3376115|\n", 117 | "|588| Glencoe| KY|46.38739244|136.0427027|\n", 118 | "|665| Chelsea| IA|98.72210937|59.68913002|\n", 119 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n", 120 | "|811| Dorrance| KS|102.0888316|121.5614372|\n", 121 | "|698| Albany| CA|49.75112765|80.21211317|\n", 122 | "|325| Monument| KS|70.52300953|141.7680413|\n", 123 | "|414| Manchester| MD|73.51580724|37.14602869|\n", 124 | "|113| Prescott| IA|39.93234421|65.79327823|\n", 125 | "|971|Graettinger| IA|94.66283665|150.3826243|\n", 126 | "|266| Cahone| CO|116.2321963| 127.009554|\n", 127 | "|617| Sturgis| MS|36.45673517|126.1690696|\n", 128 | "|495| Upperco| MD|114.2157413|29.63104758|\n", 129 | "|473| Highwood| IL|27.25445814|150.9227402|\n", 130 | "|959| Waipahu| HI|106.4460526|33.91451792|\n", 131 | "|438| Bowdon| GA|88.98111013|78.49025241|\n", 132 | "+---+-----------+-----+-----------+-----------+\n", 133 | "only showing top 20 rows\n", 134 | "\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "#Check sample Data \n", 140 | "stationdf.show()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 9, 146 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stderr", 151 | "output_type": "stream", 152 | "text": [ 153 | "[Stage 6:> (0 + 1) / 1]\r" 154 | ] 155 | }, 156 | { 157 | "name": "stdout", 158 | "output_type": "stream", 159 | "text": [ 160 | "+------------------------------------+\n", 161 | 
"|(count(City) - count(DISTINCT City))|\n", 162 | "+------------------------------------+\n", 163 | "| 3|\n", 164 | "+------------------------------------+\n", 165 | "\n" 166 | ] 167 | }, 168 | { 169 | "name": "stderr", 170 | "output_type": "stream", 171 | "text": [ 172 | " \r" 173 | ] 174 | } 175 | ], 176 | "source": [ 177 | "#Solving Problem using PySpark \n", 178 | "# ind the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table. \n", 179 | "from pyspark.sql.functions import countDistinct\n", 180 | "from pyspark.sql.functions import count\n", 181 | "stationdf.select(count(\"City\") - countDistinct(\"City\")).show()" 182 | ] 183 | }, 184 | { 185 | "cell_type": "code", 186 | "execution_count": 10, 187 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 188 | "metadata": {}, 189 | "outputs": [], 190 | "source": [ 191 | "# Now we are solving Same problem using Spark SQL \n", 192 | "# Creating Temp Table or HIVE table\n", 193 | "stationdf.createOrReplaceTempView(\"tmpStation\")" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 11, 199 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "+---+-----------+-----+-----------+-----------+\n", 207 | "| ID| City|State| Lattitude| Longitude|\n", 208 | "+---+-----------+-----+-----------+-----------+\n", 209 | "|478| Tipton| IN|33.54792701|97.94286036|\n", 210 | "|619| Arlington| CO|75.17993079|92.94615894|\n", 211 | "|711| Turner| AR|50.24380534|101.4580163|\n", 212 | "|839| Slidell| LA|85.32270304|151.8743276|\n", 213 | "|411| Negreet| LA| 98.9707194|105.3376115|\n", 214 | "|588| Glencoe| KY|46.38739244|136.0427027|\n", 215 | "|665| Chelsea| IA|98.72210937|59.68913002|\n", 216 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n", 217 | "|811| Dorrance| KS|102.0888316|121.5614372|\n", 218 | "|698| Albany| CA|49.75112765|80.21211317|\n", 219 | "|325| Monument| KS|70.52300953|141.7680413|\n", 220 | "|414| Manchester| MD|73.51580724|37.14602869|\n", 221 | "|113| Prescott| IA|39.93234421|65.79327823|\n", 222 | "|971|Graettinger| IA|94.66283665|150.3826243|\n", 223 | "|266| Cahone| CO|116.2321963| 127.009554|\n", 224 | "|617| Sturgis| MS|36.45673517|126.1690696|\n", 225 | "|495| Upperco| MD|114.2157413|29.63104758|\n", 226 | "|473| Highwood| IL|27.25445814|150.9227402|\n", 227 | "|959| Waipahu| HI|106.4460526|33.91451792|\n", 228 | "|438| Bowdon| GA|88.98111013|78.49025241|\n", 229 | "+---+-----------+-----+-----------+-----------+\n", 230 | "only showing top 20 rows\n", 231 | "\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "# Now we have SQL Table and we can write SQL Query on top of that \n", 237 | "# For example by Select on table \n", 238 | "sqlContext.sql(\"SELECT * FROM tmpStation\").show()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 12, 244 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stderr", 249 | "output_type": "stream", 250 | "text": [ 251 | "[Stage 13:> (0 + 1) / 1]\r" 252 | ] 253 | }, 254 | { 255 | "name": "stdout", 256 | "output_type": "stream", 257 | "text": [ 258 | "+---------+-----------------+---------------+\n", 259 | "|citycount|distinctcitycount|diffbetweenboth|\n", 260 | "+---------+-----------------+---------------+\n", 261 | "| 282| 279| 3|\n", 262 | "+---------+-----------------+---------------+\n", 263 | "\n" 264 | ] 
283 | { 284 | "cell_type": "code", 285 | "execution_count": null, 286 | "id": "3eeb1534-4da5-427f-9103-1a7bb847170e", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [] 290 | } 291 | ], 292 | "metadata": { 293 | "kernelspec": { 294 | "display_name": "Python 3 (ipykernel)", 295 | "language": "python", 296 | "name": "python3" 297 | }, 298 | "language_info": { 299 | "codemirror_mode": { 300 | "name": "ipython", 301 | "version": 3 302 | }, 303 | "file_extension": ".py", 304 | "mimetype": "text/x-python", 305 | "name": "python", 306 | "nbconvert_exporter": "python", 307 | "pygments_lexer": "ipython3", 308 | "version": "3.8.13" 309 | } 310 | }, 311 | "nbformat": 4, 312 | "nbformat_minor": 5 313 | } 314 | -------------------------------------------------------------------------------- /Problem 5/problem5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve the problem in two ways:\n", 9 | "1. First using PySpark functions \n", 10 | "2. Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First, load all the required libraries and start the Spark session\n", 21 | "# Load all the required libraries\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/09 11:10:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#Start Spark Session\n", 49 | "spark = SparkSession.builder.appName(\"problem5\").getOrCreate()\n", 50 | "sqlContext = SparkSession(spark)\n", 51 | "#Dont Show warning only error\n", 52 | "spark.sparkContext.setLogLevel(\"ERROR\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | " \r" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "#Load CSV file into DataFrame\n", 71 | "stationdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"station.csv\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "root\n", 85 | " |-- ID: integer (nullable = true)\n", 86 | " |-- City: string (nullable = true)\n", 87 | " |-- State: string (nullable = true)\n", 88 | " |-- Lattitude: double (nullable = true)\n", 89 | " |-- Longitude: double (nullable = true)\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "#Check Schema of DataFrame\n", 96 | "stationdf.printSchema()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "+---+-----------+-----+-----------+-----------+\n", 110 | "| ID| City|State| Lattitude| Longitude|\n", 111 | "+---+-----------+-----+-----------+-----------+\n", 112 | "|478| Tipton| IN|33.54792701|97.94286036|\n", 113 | "|619| Arlington| CO|75.17993079|92.94615894|\n", 114 | "|711| Turner| AR|50.24380534|101.4580163|\n", 115 | "|839| Slidell| LA|85.32270304|151.8743276|\n", 116 | "|411| Negreet| LA| 98.9707194|105.3376115|\n", 117 | "|588| Glencoe| KY|46.38739244|136.0427027|\n", 118 | "|665| Chelsea| IA|98.72210937|59.68913002|\n", 119 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n", 120 | "|811| Dorrance| KS|102.0888316|121.5614372|\n", 121 | "|698| Albany| CA|49.75112765|80.21211317|\n", 122 | "|325| Monument| KS|70.52300953|141.7680413|\n", 123 | "|414| Manchester| MD|73.51580724|37.14602869|\n", 124 | "|113| Prescott| IA|39.93234421|65.79327823|\n", 125 | "|971|Graettinger| IA|94.66283665|150.3826243|\n", 126 | "|266| Cahone| CO|116.2321963| 127.009554|\n", 127 | "|617| Sturgis| MS|36.45673517|126.1690696|\n", 128 | "|495| Upperco| MD|114.2157413|29.63104758|\n", 129 | "|473| Highwood| IL|27.25445814|150.9227402|\n", 130 | "|959| Waipahu| HI|106.4460526|33.91451792|\n", 131 | "|438| Bowdon| GA|88.98111013|78.49025241|\n", 132 | "+---+-----------+-----+-----------+-----------+\n", 133 | "only showing top 20 rows\n", 134 | "\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "#Check sample Data \n", 140 | "stationdf.show()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 17, 146 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "+-------------+\n", 154 | "| City|\n", 155 | "+-------------+\n", 156 | "| Arlington|\n", 157 | "| Albany|\n", 158 | "| Upperco|\n", 159 | "| Aguanga|\n", 160 | "| Odin|\n", 161 | "| 
Algonac|\n", 162 | "| Onaway|\n", 163 | "| Irvington|\n", 164 | "| Arrowsmith|\n", 165 | "| Udall|\n", 166 | "| Oakfield|\n", 167 | "| Elkton|\n", 168 | "| Amo|\n", 169 | "| Alanson|\n", 170 | "| Eleele|\n", 171 | "| Auburn|\n", 172 | "| Oconee|\n", 173 | "| Amazonia|\n", 174 | "|Andersonville|\n", 175 | "| Eros|\n", 176 | "| Arkadelphia|\n", 177 | "| Eriline|\n", 178 | "| Edgewater|\n", 179 | "| Eastlake|\n", 180 | "| Addison|\n", 181 | "| Everton|\n", 182 | "| Eustis|\n", 183 | "| Arispe|\n", 184 | "| Ottertail|\n", 185 | "| Ermine|\n", 186 | "| Albion|\n", 187 | "| Athens|\n", 188 | "| Eufaula|\n", 189 | "| Andover|\n", 190 | "| Osborne|\n", 191 | "| Oshtemo|\n", 192 | "+-------------+\n", 193 | "\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "#Solving Problem using PySpark \n", 199 | "# ind the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table. \n", 200 | "stationdf.select(\"City\").where(\"Left(City,1) IN ('A','E','I','O','U')\").show(n=100)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 6, 206 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 207 | "metadata": {}, 208 | "outputs": [], 209 | "source": [ 210 | "# Now we are solving Same problem using Spark SQL \n", 211 | "# Creating Temp Table or HIVE table\n", 212 | "stationdf.createOrReplaceTempView(\"tmpStation\")" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 7, 218 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 219 | "metadata": {}, 220 | "outputs": [ 221 | { 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "+---+-----------+-----+-----------+-----------+\n", 226 | "| ID| City|State| Lattitude| Longitude|\n", 227 | "+---+-----------+-----+-----------+-----------+\n", 228 | "|478| Tipton| IN|33.54792701|97.94286036|\n", 229 | "|619| Arlington| CO|75.17993079|92.94615894|\n", 230 | "|711| Turner| AR|50.24380534|101.4580163|\n", 231 | "|839| Slidell| LA|85.32270304|151.8743276|\n", 232 | "|411| Negreet| LA| 98.9707194|105.3376115|\n", 233 | "|588| Glencoe| KY|46.38739244|136.0427027|\n", 234 | "|665| Chelsea| IA|98.72210937|59.68913002|\n", 235 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n", 236 | "|811| Dorrance| KS|102.0888316|121.5614372|\n", 237 | "|698| Albany| CA|49.75112765|80.21211317|\n", 238 | "|325| Monument| KS|70.52300953|141.7680413|\n", 239 | "|414| Manchester| MD|73.51580724|37.14602869|\n", 240 | "|113| Prescott| IA|39.93234421|65.79327823|\n", 241 | "|971|Graettinger| IA|94.66283665|150.3826243|\n", 242 | "|266| Cahone| CO|116.2321963| 127.009554|\n", 243 | "|617| Sturgis| MS|36.45673517|126.1690696|\n", 244 | "|495| Upperco| MD|114.2157413|29.63104758|\n", 245 | "|473| Highwood| IL|27.25445814|150.9227402|\n", 246 | "|959| Waipahu| HI|106.4460526|33.91451792|\n", 247 | "|438| Bowdon| GA|88.98111013|78.49025241|\n", 248 | "+---+-----------+-----+-----------+-----------+\n", 249 | "only showing top 20 rows\n", 250 | "\n" 251 | ] 252 | } 253 | ], 254 | "source": [ 255 | "# Now we have SQL Table and we can write SQL Query on top of that \n", 256 | "# For example by Select on table \n", 257 | "sqlContext.sql(\"SELECT * FROM tmpStation\").show()" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 9, 263 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24", 264 | "metadata": {}, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "+-------------+\n", 271 | "| CITY|\n", 272 | 
"+-------------+\n", 273 | "| Auburn|\n", 274 | "|Andersonville|\n", 275 | "| Eastlake|\n", 276 | "| Albany|\n", 277 | "| Aguanga|\n", 278 | "| Onaway|\n", 279 | "| Andover|\n", 280 | "| Algonac|\n", 281 | "| Amazonia|\n", 282 | "| Arkadelphia|\n", 283 | "| Arispe|\n", 284 | "| Eustis|\n", 285 | "| Udall|\n", 286 | "| Athens|\n", 287 | "| Ottertail|\n", 288 | "| Upperco|\n", 289 | "| Ermine|\n", 290 | "| Eufaula|\n", 291 | "| Alanson|\n", 292 | "| Arlington|\n", 293 | "| Arrowsmith|\n", 294 | "| Oshtemo|\n", 295 | "| Irvington|\n", 296 | "| Elkton|\n", 297 | "| Eleele|\n", 298 | "| Oconee|\n", 299 | "| Oakfield|\n", 300 | "| Amo|\n", 301 | "| Addison|\n", 302 | "| Albion|\n", 303 | "| Everton|\n", 304 | "| Osborne|\n", 305 | "| Eriline|\n", 306 | "| Edgewater|\n", 307 | "| Eros|\n", 308 | "| Odin|\n", 309 | "+-------------+\n", 310 | "\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "# Now we will write query to get max salary for each employee \n", 316 | "# so we will use SQL Group by and SQL Order by functions \n", 317 | "sqlContext.sql(\"SELECT DISTINCT(CITY) FROM tmpStation WHERE LEFT(CITY,1) IN ('A','E','I','O','U')\").show(n=100)" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3 (ipykernel)", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.8.13" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 5 342 | } 343 | -------------------------------------------------------------------------------- /Problem 3/stations.csv: -------------------------------------------------------------------------------- 1 | ID,City,State,Lattitude,Longitude 2 | 478,Tipton,IN,33.54792701,97.94286036 3 | 619,Arlington,CO,75.17993079,92.94615894 4 | 711,Turner,AR,50.24380534,101.4580163 5 | 839,Slidell,LA,85.32270304,151.8743276 6 | 411,Negreet,LA,98.9707194,105.3376115 7 | 588,Glencoe,KY,46.38739244,136.0427027 8 | 665,Chelsea,IA,98.72210937,59.68913002 9 | 733,Pelahatchie,MS,38.58161595,28.11950703 10 | 811,Dorrance,KS,102.0888316,121.5614372 11 | 698,Albany,CA,49.75112765,80.21211317 12 | 325,Monument,KS,70.52300953,141.7680413 13 | 414,Manchester,MD,73.51580724,37.14602869 14 | 113,Prescott,IA,39.93234421,65.79327823 15 | 971,Graettinger,IA,94.66283665,150.3826243 16 | 266,Cahone,CO,116.2321963,127.009554 17 | 617,Sturgis,MS,36.45673517,126.1690696 18 | 495,Upperco,MD,114.2157413,29.63104758 19 | 473,Highwood,IL,27.25445814,150.9227402 20 | 959,Waipahu,HI,106.4460526,33.91451792 21 | 438,Bowdon,GA,88.98111013,78.49025241 22 | 571,Tyler,MN,133.3521233,58.63273833 23 | 92,Watkins,CO,83.27433063,96.73732305 24 | 399,Republic,MI,75.42182,130.1266717 25 | 426,Millville,CA,32.55838209,145.7434609 26 | 844,Aguanga,CA,79.89165657,65.93959251 27 | 606,Morenci,AZ,104.8964262,110.2033978 28 | 833,Hoskinston,KY,65.7515349,65.67937265 29 | 843,Talbert,KY,39.85947921,58.84999769 30 | 166,Mccomb,MS,74.04169376,42.63374681 31 | 339,Kirk,CO,141.097397,136.3312671 32 | 909,Carlock,IL,117.3209611,84.80244659 33 | 829,Seward,IL,72.41930917,90.20890209 34 | 766,Gustine,CA,111.0875596,140.8338617 35 | 392,Delano,CA,126.3467998,91.50161746 36 | 555,Westphalia,MI,32.76641637,143.8050085 37 | 728,Roy,MT,41.31187761,51.56467929 38 | 656,Pattonsburg,MO,138.100334,32.10804024 
39 | 394,Centertown,MO,133.9733513,93.17246374 40 | 366,Norvell,MI,125.3431567,93.75245864 41 | 96,Raymondville,MO,70.68239168,148.4444084 42 | 977,Odin,IL,53.48858773,115.7934363 43 | 741,Jemison,AL,62.10307108,25.71260581 44 | 323,Barrigada,GU,60.60716473,147.5296125 45 | 3,Hesperia,CA,106.0569286,71.11876711 46 | 814,Wickliffe,KY,80.29965735,46.12993489 47 | 375,Culdesac,ID,47.8418268,78.06551236 48 | 467,Roselawn,IN,87.70708169,51.74506986 49 | 647,Portland,AR,83.92116818,44.80555694 50 | 250,Hampden,MA,76.39074308,26.48368838 51 | 547,Sandborn,IN,55.94680767,93.85315475 52 | 701,Seaton,IL,128.2287955,78.43005628 53 | 197,Milledgeville,IL,90.98811028,113.2748504 54 | 679,Gretna,LA,75.26293787,142.5762285 55 | 403,Zionsville,IN,57.79181464,36.493866 56 | 482,Jolon,CA,66.65054378,52.95528769 57 | 252,Childs,MD,92.7594351,104.0155475 58 | 600,Shreveport,LA,136.2310159,38.50207291 59 | 14,Forest,MS,120.283076,50.22883356 60 | 260,Sizerock,KY,116.0212592,112.7471971 61 | 753,Algonac,MI,118.7398038,80.14671114 62 | 174,Onaway,MI,108.606587,55.75945692 63 | 263,Irvington,IL,96.70474244,68.28719181 64 | 253,Winsted,MN,68.82384939,72.51511422 65 | 557,Woodbury,GA,102.5472386,93.37553932 66 | 897,Samantha,AL,75.2235845,35.94479192 67 | 98,Hackleburg,AL,119.5607105,120.6244819 68 | 423,Soldier,KS,77.30051697,152.6019439 69 | 361,Arrowsmith,IL,28.00318693,109.3395101 70 | 409,Columbus,GA,67.33892289,46.61622653 71 | 312,Bentonville,AR,36.9528472,78.06843628 72 | 854,Kirkland,AZ,86.41004231,57.99523843 73 | 735,Wilton,ME,56.57944083,157.1906205 74 | 608,Busby,MT,104.0894472,29.83035109 75 | 122,Robertsdale,AL,97.7213689,85.3747551 76 | 93,Dale,IN,69.59335022,34.41552119 77 | 67,Reeds,MO,30.78888129,42.50211311 78 | 906,Hayfork,CA,35.2971959,116.6698147 79 | 34,Mcbrides,MI,74.05708403,35.68248542 80 | 401,Tennessee,IL,55.49838117,155.6455992 81 | 536,Henderson,IA,77.92417249,77.90662876 82 | 953,Udall,KS,112.6844799,59.95863388 83 | 614,Benedict,KS,138.4990456,95.71978969 84 | 998,Oakfield,ME,47.65762321,132.2118817 85 | 805,Tamms,IL,59.86766645,75.05164447 86 | 235,Haubstadt,IN,27.98898068,32.08170842 87 | 820,Chokio,MN,81.36073326,134.232113 88 | 650,Clancy,MT,45.82996854,164.378675 89 | 324,Norwood,MN,144.4891504,34.88529336 90 | 442,Elkton,MD,103.2547878,156.7289171 91 | 633,Bertha,MN,39.94889028,105.3111577 92 | 109,Bridgeport,MI,50.68988119,79.90137859 93 | 780,Cherry,IL,68.29708467,46.70383506 94 | 492,Regina,KY,131.5515912,90.23826291 95 | 965,Griffin,GA,38.74146904,151.7182093 96 | 337,Mascotte,FL,121.4608708,146.1675503 97 | 259,Baldwin,MD,81.73572165,40.4397386 98 | 955,Netawaka,KS,109.2057274,119.7404946 99 | 886,Pony,MT,99.25831292,162.8777336 100 | 200,Franklin,LA,82.24062794,31.77872725 101 | 384,Amo,IN,103.5871398,159.4306474 102 | 518,Vulcan,MO,108.6087788,91.56138944 103 | 161,Alanson,MI,90.6531996,72.11952297 104 | 486,Delta,LA,136.5385281,49.73086766 105 | 406,Carver,MN,45.89251104,122.069681 106 | 940,Paron,AR,59.13834287,104.3412062 107 | 237,Winchester,ID,38.37033443,80.0549859 108 | 465,Jerome,AZ,121.7110583,34.40610397 109 | 570,Greenview,CA,80.50000412,57.58800404 110 | 278,Cromwell,MN,128.8462234,53.51254061 111 | 927,Quinter,KS,59.58257004,25.36132152 112 | 59,Whitewater,MO,82.71809743,71.42607696 113 | 291,Clarkdale,AZ,58.19417297,73.94789938 114 | 668,Rockton,IL,116.1223935,86.83833004 115 | 682,Pheba,MS,90.94560988,127.3003694 116 | 775,Eleele,HI,80.90971236,152.5215045 117 | 527,Auburn,IA,95.48926949,137.0748386 118 | 190,Oconee,GA,92.56220722,119.477431 119 | 
232,Grandville,MI,38.85256239,70.13776289 120 | 405,Susanville,CA,128.2498724,80.31679475 121 | 273,Rosie,AR,72.75896875,161.9173483 122 | 813,Verona,MO,109.6602903,152.6449499 123 | 444,Richland,GA,105.4709117,113.0379774 124 | 899,Fremont,MI,54.47132153,150.8233711 125 | 738,Philipsburg,MT,95.95531865,72.24442365 126 | 215,Kensett,IA,55.72295385,139.5524526 127 | 377,Koleen,IN,137.5485615,110.5110324 128 | 727,Winslow,IL,113.1328079,38.71450096 129 | 363,Reasnor,IA,41.59710148,162.564183 130 | 888,Bono,AR,133.276314,150.4963257 131 | 784,Biggsville,IL,85.92578701,138.7463469 132 | 695,Amazonia,MO,45.78566304,148.2013846 133 | 609,Marysville,MI,85.76134731,132.8724084 134 | 649,Pengilly,MN,25.07352606,154.0642918 135 | 383,Newbury,MA,128.3982315,85.17470023 136 | 44,Kismet,KS,99.82252766,156.5035829 137 | 433,Canton,ME,98.73035759,105.973446 138 | 474,Grayslake,IL,61.30374218,33.05923131 139 | 990,Bison,KS,132.2279842,74.89290079 140 | 502,Bellevue,KY,127.4330424,121.7488466 141 | 327,Ridgway,CO,77.43818081,110.2668422 142 | 228,Rydal,GA,35.68357838,78.82337343 143 | 642,Lynnville,KY,25.40836031,146.4916272 144 | 885,Deerfield,MO,40.213664,35.9386994 145 | 539,Montreal,MO,129.2453575,127.3259318 146 | 202,Hope,MN,140.3641688,43.72901978 147 | 521,Gowrie,IA,130.2024387,127.9825354 148 | 938,Andersonville,GA,141.3126586,72.53178686 149 | 528,Crouseville,ME,36.5185121,81.54481624 150 | 331,Cranks,KY,55.60911109,27.28471229 151 | 944,Ledyard,CT,134.5468125,143.8149657 152 | 949,Norway,ME,83.89130493,88.40746773 153 | 88,Eros,LA,95.16264172,58.31349033 154 | 878,Rantoul,KS,31.80492935,118.6160845 155 | 17,Fredericktown,MO,105.5334784,112.6890911 156 | 447,Arkadelphia,AR,98.62295228,49.57501146 157 | 351,Fredericksburg,IN,44.51203489,78.05797739 158 | 774,Manchester,IA,129.6682154,123.2967519 159 | 963,Eriline,KY,93.61747947,65.43902104 160 | 643,Wellington,KY,100.4511347,31.68760835 161 | 777,Edgewater,MD,130.0676569,72.29080719 162 | 15,Ducor,CA,140.8633607,102.039339 163 | 910,Salem,KY,86.97524724,113.9609797 164 | 612,Sturdivant,MO,93.84076298,86.38850955 165 | 537,Hagatna,GU,97.17321584,151.8086289 166 | 510,Eastlake,MI,134.0938535,38.78212913 167 | 354,Larkspur,CA,107.0529696,65.97363083 168 | 983,Patriot,IN,82.63795084,46.08354932 169 | 799,Corriganville,MD,141.383789,153.6500914 170 | 581,Carlos,MN,114.9060173,66.2810487 171 | 825,Addison,MI,96.36953674,142.4105732 172 | 526,Tarzana,CA,135.8603987,81.30731303 173 | 176,Grapevine,AR,92.36589225,84.54293686 174 | 994,Kanorado,KS,65.42078424,85.72249232 175 | 704,Climax,MI,127.3563782,107.0542747 176 | 582,Curdsville,KY,84.78749012,150.4842247 177 | 884,Southport,CT,59.09336238,63.13052144 178 | 196,Compton,IL,106.617993,99.40704162 179 | 605,Notasulga,AL,66.84426322,115.6864036 180 | 430,Rumsey,KY,70.6921152,50.2122756 181 | 234,Rogers,CT,140.4723914,33.18335673 182 | 702,Everton,MO,119.0469849,51.48512967 183 | 662,Skanee,MI,70.1724149,129.5593113 184 | 171,Springerville,AZ,124.6882036,150.6628287 185 | 615,Libertytown,MD,144.5783185,111.9744225 186 | 336,Dumont,MN,57.0124315,129.3675605 187 | 315,Ravenna,KY,79.15467169,106.252172 188 | 505,Williams,AZ,73.48100913,111.7413889 189 | 842,Decatur,MI,63.31154085,161.4235787 190 | 982,Holbrook,AZ,134.8838521,103.8569792 191 | 868,Sherrill,AR,79.96440727,152.2197289 192 | 554,Brownsdale,MN,52.42646664,50.79836304 193 | 199,Linden,MI,53.41116218,32.62422206 194 | 453,Sedgwick,AR,68.93334418,75.29418595 195 | 326,Rocheport,MO,114.163159,64.48216553 196 | 638,Clovis,CA,92.43965299,138.0751933 197 
| 156,Heyburn,ID,82.08611195,121.0459768 198 | 861,Peabody,KS,75.41614816,152.2100746 199 | 428,Randall,KS,47.99772806,135.6275983 200 | 677,Hayesville,IA,119.9881564,42.12719349 201 | 183,Jordan,MN,68.74638928,35.46228503 202 | 242,Macy,IN,138.694477,152.3694449 203 | 621,Flowood,MS,64.88877035,149.2064111 204 | 180,Napoleon,IN,32.03325626,160.2402958 205 | 853,Coldwater,KS,47.50617517,26.31002645 206 | 105,Weldon,CA,134.0156771,118.9609382 207 | 357,Yellville,AR,35.68710434,42.24658664 208 | 920,Eustis,FL,42.73630964,39.48336091 209 | 355,Weldona,CO,32.96727204,58.44917695 210 | 501,Tefft,IN,93.21527074,150.0159946 211 | 834,Bayville,ME,106.7349403,143.4078424 212 | 255,Brighton,IL,107.6050821,32.84882058 213 | 595,Grimes,IA,42.05019623,74.73314913 214 | 709,Nubieber,CA,132.9033933,49.27761205 215 | 16,Beaufort,MO,71.77418064,85.65741838 216 | 231,Arispe,IA,31.11149635,137.7968198 217 | 891,Humeston,IA,74.51222394,122.4246326 218 | 757,Lakeville,CT,59.86867012,94.98860174 219 | 506,Firebrick,KY,49.99183934,95.03900712 220 | 583,Channing,MI,117.1645417,56.95124478 221 | 504,Melber,KY,37.24884854,55.53335159 222 | 901,Manchester,MN,71.02098012,84.00752922 223 | 586,Ottertail,MN,100.0240382,44.34165481 224 | 95,Dupo,IL,41.28342297,29.03342929 225 | 524,Montrose,CA,136.4765033,119.373558 226 | 716,Schleswig,IA,119.2539069,51.88108538 227 | 904,Ermine,KY,119.6401426,62.79812627 228 | 740,Siler,KY,137.0193079,117.2464806 229 | 57,Clifton,AZ,30.15463898,135.7025933 230 | 155,Casco,MI,138.5984073,109.0728819 231 | 755,Sturgis,MI,117.392421,135.3989883 232 | 287,Madisonville,LA,112.2163874,53.04603619 233 | 435,Albion,IN,44.25844944,121.8753316 234 | 672,Lismore,MN,58.87142971,103.8693391 235 | 572,Athens,IN,75.32104008,120.7983748 236 | 890,Eufaula,AL,140.2958283,103.0868213 237 | 119,Wildie,KY,69.65812987,111.8552379 238 | 540,Mosca,CO,89.20441335,141.4811419 239 | 678,Bennington,IN,35.52107321,26.80362207 240 | 208,Lottie,LA,109.8672979,82.76650144 241 | 512,Garland,ME,108.7311062,134.3750565 242 | 352,Clutier,IA,61.1888319,127.0339038 243 | 948,Lupton,MI,139.9255926,53.36397181 244 | 503,Northfield,MN,61.00207775,37.15335522 245 | 288,Daleville,AL,121.8865105,136.1704398 246 | 479,Cuba,MO,63.71916114,87.64843313 247 | 826,Norris,MT,47.18550342,37.25727353 248 | 651,Clopton,AL,40.77104358,84.70678339 249 | 143,Renville,MN,142.1513936,99.43201313 250 | 102,Kirksville,MO,140.0030631,143.8709979 251 | 69,Kingsland,AR,78.22487634,85.13857667 252 | 181,Fairview,KS,80.27133556,164.5798928 253 | 175,Lydia,LA,41.78237386,39.53037919 254 | 80,Bridgton,ME,93.2257481,140.415464 255 | 596,Brownstown,IL,48.65218811,63.22095723 256 | 301,Monona,IA,144.1294884,81.57803996 257 | 987,Hartland,MI,136.2638918,107.738067 258 | 973,Andover,CT,51.74018501,52.53230369 259 | 981,Lakota,IA,56.15413675,92.38612569 260 | 110,Mesick,MI,82.12446036,108.5283528 261 | 396,Dryden,MI,69.80182523,47.7436689 262 | 637,Beverly,KY,57.75450094,126.8958422 263 | 801,Pocahontas,IL,109.6304686,83.23109494 264 | 130,Hayneville,AL,109.7380661,157.4686782 265 | 345,Yoder,IN,83.49946581,143.715826 266 | 851,Gatewood,MO,76.31562733,145.668333 267 | 489,Madden,MS,81.34223218,99.37998257 268 | 223,Losantville,IN,112.5187171,106.7760547 269 | 538,Cheswold,DE,31.93743733,59.34689519 270 | 329,Caseville,MI,102.9200706,98.4033735 271 | 815,Pomona,MO,52.33346818,50.28222507 272 | 789,Hopkinsville,KY,27.31872893,47.94652919 273 | 269,Jack,AL,49.93703023,85.62817326 274 | 969,Dixie,GA,27.21713791,36.47378899 275 | 
271,Hillside,CO,99.26558164,68.84352684 276 | 667,Hawarden,IA,90.96161545,46.93255602 277 | 350,Cannonsburg,MI,91.03351667,120.6696799 278 | 49,Osborne,KS,70.36168327,139.7111654 279 | 404,Farmington,IL,91.7144044,72.0223174 280 | 23,Honolulu,HI,110.101955,139.7437776 281 | 1,Pfeifer,KS,37.44478047,65.68491252 282 | 127,Oshtemo,MI,100.3702957,135.9503227 283 | 657,Gridley,KS,118.1450367,55.80178454 -------------------------------------------------------------------------------- /Problem 4/stations.csv: -------------------------------------------------------------------------------- 1 | ID,City,State,Lattitude,Longitude 2 | 478,Tipton,IN,33.54792701,97.94286036 3 | 619,Arlington,CO,75.17993079,92.94615894 4 | 711,Turner,AR,50.24380534,101.4580163 5 | 839,Slidell,LA,85.32270304,151.8743276 6 | 411,Negreet,LA,98.9707194,105.3376115 7 | 588,Glencoe,KY,46.38739244,136.0427027 8 | 665,Chelsea,IA,98.72210937,59.68913002 9 | 733,Pelahatchie,MS,38.58161595,28.11950703 10 | 811,Dorrance,KS,102.0888316,121.5614372 11 | 698,Albany,CA,49.75112765,80.21211317 12 | 325,Monument,KS,70.52300953,141.7680413 13 | 414,Manchester,MD,73.51580724,37.14602869 14 | 113,Prescott,IA,39.93234421,65.79327823 15 | 971,Graettinger,IA,94.66283665,150.3826243 16 | 266,Cahone,CO,116.2321963,127.009554 17 | 617,Sturgis,MS,36.45673517,126.1690696 18 | 495,Upperco,MD,114.2157413,29.63104758 19 | 473,Highwood,IL,27.25445814,150.9227402 20 | 959,Waipahu,HI,106.4460526,33.91451792 21 | 438,Bowdon,GA,88.98111013,78.49025241 22 | 571,Tyler,MN,133.3521233,58.63273833 23 | 92,Watkins,CO,83.27433063,96.73732305 24 | 399,Republic,MI,75.42182,130.1266717 25 | 426,Millville,CA,32.55838209,145.7434609 26 | 844,Aguanga,CA,79.89165657,65.93959251 27 | 606,Morenci,AZ,104.8964262,110.2033978 28 | 833,Hoskinston,KY,65.7515349,65.67937265 29 | 843,Talbert,KY,39.85947921,58.84999769 30 | 166,Mccomb,MS,74.04169376,42.63374681 31 | 339,Kirk,CO,141.097397,136.3312671 32 | 909,Carlock,IL,117.3209611,84.80244659 33 | 829,Seward,IL,72.41930917,90.20890209 34 | 766,Gustine,CA,111.0875596,140.8338617 35 | 392,Delano,CA,126.3467998,91.50161746 36 | 555,Westphalia,MI,32.76641637,143.8050085 37 | 728,Roy,MT,41.31187761,51.56467929 38 | 656,Pattonsburg,MO,138.100334,32.10804024 39 | 394,Centertown,MO,133.9733513,93.17246374 40 | 366,Norvell,MI,125.3431567,93.75245864 41 | 96,Raymondville,MO,70.68239168,148.4444084 42 | 977,Odin,IL,53.48858773,115.7934363 43 | 741,Jemison,AL,62.10307108,25.71260581 44 | 323,Barrigada,GU,60.60716473,147.5296125 45 | 3,Hesperia,CA,106.0569286,71.11876711 46 | 814,Wickliffe,KY,80.29965735,46.12993489 47 | 375,Culdesac,ID,47.8418268,78.06551236 48 | 467,Roselawn,IN,87.70708169,51.74506986 49 | 647,Portland,AR,83.92116818,44.80555694 50 | 250,Hampden,MA,76.39074308,26.48368838 51 | 547,Sandborn,IN,55.94680767,93.85315475 52 | 701,Seaton,IL,128.2287955,78.43005628 53 | 197,Milledgeville,IL,90.98811028,113.2748504 54 | 679,Gretna,LA,75.26293787,142.5762285 55 | 403,Zionsville,IN,57.79181464,36.493866 56 | 482,Jolon,CA,66.65054378,52.95528769 57 | 252,Childs,MD,92.7594351,104.0155475 58 | 600,Shreveport,LA,136.2310159,38.50207291 59 | 14,Forest,MS,120.283076,50.22883356 60 | 260,Sizerock,KY,116.0212592,112.7471971 61 | 753,Algonac,MI,118.7398038,80.14671114 62 | 174,Onaway,MI,108.606587,55.75945692 63 | 263,Irvington,IL,96.70474244,68.28719181 64 | 253,Winsted,MN,68.82384939,72.51511422 65 | 557,Woodbury,GA,102.5472386,93.37553932 66 | 897,Samantha,AL,75.2235845,35.94479192 67 | 98,Hackleburg,AL,119.5607105,120.6244819 68 | 
423,Soldier,KS,77.30051697,152.6019439 69 | 361,Arrowsmith,IL,28.00318693,109.3395101 70 | 409,Columbus,GA,67.33892289,46.61622653 71 | 312,Bentonville,AR,36.9528472,78.06843628 72 | 854,Kirkland,AZ,86.41004231,57.99523843 73 | 735,Wilton,ME,56.57944083,157.1906205 74 | 608,Busby,MT,104.0894472,29.83035109 75 | 122,Robertsdale,AL,97.7213689,85.3747551 76 | 93,Dale,IN,69.59335022,34.41552119 77 | 67,Reeds,MO,30.78888129,42.50211311 78 | 906,Hayfork,CA,35.2971959,116.6698147 79 | 34,Mcbrides,MI,74.05708403,35.68248542 80 | 401,Tennessee,IL,55.49838117,155.6455992 81 | 536,Henderson,IA,77.92417249,77.90662876 82 | 953,Udall,KS,112.6844799,59.95863388 83 | 614,Benedict,KS,138.4990456,95.71978969 84 | 998,Oakfield,ME,47.65762321,132.2118817 85 | 805,Tamms,IL,59.86766645,75.05164447 86 | 235,Haubstadt,IN,27.98898068,32.08170842 87 | 820,Chokio,MN,81.36073326,134.232113 88 | 650,Clancy,MT,45.82996854,164.378675 89 | 324,Norwood,MN,144.4891504,34.88529336 90 | 442,Elkton,MD,103.2547878,156.7289171 91 | 633,Bertha,MN,39.94889028,105.3111577 92 | 109,Bridgeport,MI,50.68988119,79.90137859 93 | 780,Cherry,IL,68.29708467,46.70383506 94 | 492,Regina,KY,131.5515912,90.23826291 95 | 965,Griffin,GA,38.74146904,151.7182093 96 | 337,Mascotte,FL,121.4608708,146.1675503 97 | 259,Baldwin,MD,81.73572165,40.4397386 98 | 955,Netawaka,KS,109.2057274,119.7404946 99 | 886,Pony,MT,99.25831292,162.8777336 100 | 200,Franklin,LA,82.24062794,31.77872725 101 | 384,Amo,IN,103.5871398,159.4306474 102 | 518,Vulcan,MO,108.6087788,91.56138944 103 | 161,Alanson,MI,90.6531996,72.11952297 104 | 486,Delta,LA,136.5385281,49.73086766 105 | 406,Carver,MN,45.89251104,122.069681 106 | 940,Paron,AR,59.13834287,104.3412062 107 | 237,Winchester,ID,38.37033443,80.0549859 108 | 465,Jerome,AZ,121.7110583,34.40610397 109 | 570,Greenview,CA,80.50000412,57.58800404 110 | 278,Cromwell,MN,128.8462234,53.51254061 111 | 927,Quinter,KS,59.58257004,25.36132152 112 | 59,Whitewater,MO,82.71809743,71.42607696 113 | 291,Clarkdale,AZ,58.19417297,73.94789938 114 | 668,Rockton,IL,116.1223935,86.83833004 115 | 682,Pheba,MS,90.94560988,127.3003694 116 | 775,Eleele,HI,80.90971236,152.5215045 117 | 527,Auburn,IA,95.48926949,137.0748386 118 | 190,Oconee,GA,92.56220722,119.477431 119 | 232,Grandville,MI,38.85256239,70.13776289 120 | 405,Susanville,CA,128.2498724,80.31679475 121 | 273,Rosie,AR,72.75896875,161.9173483 122 | 813,Verona,MO,109.6602903,152.6449499 123 | 444,Richland,GA,105.4709117,113.0379774 124 | 899,Fremont,MI,54.47132153,150.8233711 125 | 738,Philipsburg,MT,95.95531865,72.24442365 126 | 215,Kensett,IA,55.72295385,139.5524526 127 | 377,Koleen,IN,137.5485615,110.5110324 128 | 727,Winslow,IL,113.1328079,38.71450096 129 | 363,Reasnor,IA,41.59710148,162.564183 130 | 888,Bono,AR,133.276314,150.4963257 131 | 784,Biggsville,IL,85.92578701,138.7463469 132 | 695,Amazonia,MO,45.78566304,148.2013846 133 | 609,Marysville,MI,85.76134731,132.8724084 134 | 649,Pengilly,MN,25.07352606,154.0642918 135 | 383,Newbury,MA,128.3982315,85.17470023 136 | 44,Kismet,KS,99.82252766,156.5035829 137 | 433,Canton,ME,98.73035759,105.973446 138 | 474,Grayslake,IL,61.30374218,33.05923131 139 | 990,Bison,KS,132.2279842,74.89290079 140 | 502,Bellevue,KY,127.4330424,121.7488466 141 | 327,Ridgway,CO,77.43818081,110.2668422 142 | 228,Rydal,GA,35.68357838,78.82337343 143 | 642,Lynnville,KY,25.40836031,146.4916272 144 | 885,Deerfield,MO,40.213664,35.9386994 145 | 539,Montreal,MO,129.2453575,127.3259318 146 | 202,Hope,MN,140.3641688,43.72901978 147 | 521,Gowrie,IA,130.2024387,127.9825354 
148 | 938,Andersonville,GA,141.3126586,72.53178686 149 | 528,Crouseville,ME,36.5185121,81.54481624 150 | 331,Cranks,KY,55.60911109,27.28471229 151 | 944,Ledyard,CT,134.5468125,143.8149657 152 | 949,Norway,ME,83.89130493,88.40746773 153 | 88,Eros,LA,95.16264172,58.31349033 154 | 878,Rantoul,KS,31.80492935,118.6160845 155 | 17,Fredericktown,MO,105.5334784,112.6890911 156 | 447,Arkadelphia,AR,98.62295228,49.57501146 157 | 351,Fredericksburg,IN,44.51203489,78.05797739 158 | 774,Manchester,IA,129.6682154,123.2967519 159 | 963,Eriline,KY,93.61747947,65.43902104 160 | 643,Wellington,KY,100.4511347,31.68760835 161 | 777,Edgewater,MD,130.0676569,72.29080719 162 | 15,Ducor,CA,140.8633607,102.039339 163 | 910,Salem,KY,86.97524724,113.9609797 164 | 612,Sturdivant,MO,93.84076298,86.38850955 165 | 537,Hagatna,GU,97.17321584,151.8086289 166 | 510,Eastlake,MI,134.0938535,38.78212913 167 | 354,Larkspur,CA,107.0529696,65.97363083 168 | 983,Patriot,IN,82.63795084,46.08354932 169 | 799,Corriganville,MD,141.383789,153.6500914 170 | 581,Carlos,MN,114.9060173,66.2810487 171 | 825,Addison,MI,96.36953674,142.4105732 172 | 526,Tarzana,CA,135.8603987,81.30731303 173 | 176,Grapevine,AR,92.36589225,84.54293686 174 | 994,Kanorado,KS,65.42078424,85.72249232 175 | 704,Climax,MI,127.3563782,107.0542747 176 | 582,Curdsville,KY,84.78749012,150.4842247 177 | 884,Southport,CT,59.09336238,63.13052144 178 | 196,Compton,IL,106.617993,99.40704162 179 | 605,Notasulga,AL,66.84426322,115.6864036 180 | 430,Rumsey,KY,70.6921152,50.2122756 181 | 234,Rogers,CT,140.4723914,33.18335673 182 | 702,Everton,MO,119.0469849,51.48512967 183 | 662,Skanee,MI,70.1724149,129.5593113 184 | 171,Springerville,AZ,124.6882036,150.6628287 185 | 615,Libertytown,MD,144.5783185,111.9744225 186 | 336,Dumont,MN,57.0124315,129.3675605 187 | 315,Ravenna,KY,79.15467169,106.252172 188 | 505,Williams,AZ,73.48100913,111.7413889 189 | 842,Decatur,MI,63.31154085,161.4235787 190 | 982,Holbrook,AZ,134.8838521,103.8569792 191 | 868,Sherrill,AR,79.96440727,152.2197289 192 | 554,Brownsdale,MN,52.42646664,50.79836304 193 | 199,Linden,MI,53.41116218,32.62422206 194 | 453,Sedgwick,AR,68.93334418,75.29418595 195 | 326,Rocheport,MO,114.163159,64.48216553 196 | 638,Clovis,CA,92.43965299,138.0751933 197 | 156,Heyburn,ID,82.08611195,121.0459768 198 | 861,Peabody,KS,75.41614816,152.2100746 199 | 428,Randall,KS,47.99772806,135.6275983 200 | 677,Hayesville,IA,119.9881564,42.12719349 201 | 183,Jordan,MN,68.74638928,35.46228503 202 | 242,Macy,IN,138.694477,152.3694449 203 | 621,Flowood,MS,64.88877035,149.2064111 204 | 180,Napoleon,IN,32.03325626,160.2402958 205 | 853,Coldwater,KS,47.50617517,26.31002645 206 | 105,Weldon,CA,134.0156771,118.9609382 207 | 357,Yellville,AR,35.68710434,42.24658664 208 | 920,Eustis,FL,42.73630964,39.48336091 209 | 355,Weldona,CO,32.96727204,58.44917695 210 | 501,Tefft,IN,93.21527074,150.0159946 211 | 834,Bayville,ME,106.7349403,143.4078424 212 | 255,Brighton,IL,107.6050821,32.84882058 213 | 595,Grimes,IA,42.05019623,74.73314913 214 | 709,Nubieber,CA,132.9033933,49.27761205 215 | 16,Beaufort,MO,71.77418064,85.65741838 216 | 231,Arispe,IA,31.11149635,137.7968198 217 | 891,Humeston,IA,74.51222394,122.4246326 218 | 757,Lakeville,CT,59.86867012,94.98860174 219 | 506,Firebrick,KY,49.99183934,95.03900712 220 | 583,Channing,MI,117.1645417,56.95124478 221 | 504,Melber,KY,37.24884854,55.53335159 222 | 901,Manchester,MN,71.02098012,84.00752922 223 | 586,Ottertail,MN,100.0240382,44.34165481 224 | 95,Dupo,IL,41.28342297,29.03342929 225 | 
524,Montrose,CA,136.4765033,119.373558 226 | 716,Schleswig,IA,119.2539069,51.88108538 227 | 904,Ermine,KY,119.6401426,62.79812627 228 | 740,Siler,KY,137.0193079,117.2464806 229 | 57,Clifton,AZ,30.15463898,135.7025933 230 | 155,Casco,MI,138.5984073,109.0728819 231 | 755,Sturgis,MI,117.392421,135.3989883 232 | 287,Madisonville,LA,112.2163874,53.04603619 233 | 435,Albion,IN,44.25844944,121.8753316 234 | 672,Lismore,MN,58.87142971,103.8693391 235 | 572,Athens,IN,75.32104008,120.7983748 236 | 890,Eufaula,AL,140.2958283,103.0868213 237 | 119,Wildie,KY,69.65812987,111.8552379 238 | 540,Mosca,CO,89.20441335,141.4811419 239 | 678,Bennington,IN,35.52107321,26.80362207 240 | 208,Lottie,LA,109.8672979,82.76650144 241 | 512,Garland,ME,108.7311062,134.3750565 242 | 352,Clutier,IA,61.1888319,127.0339038 243 | 948,Lupton,MI,139.9255926,53.36397181 244 | 503,Northfield,MN,61.00207775,37.15335522 245 | 288,Daleville,AL,121.8865105,136.1704398 246 | 479,Cuba,MO,63.71916114,87.64843313 247 | 826,Norris,MT,47.18550342,37.25727353 248 | 651,Clopton,AL,40.77104358,84.70678339 249 | 143,Renville,MN,142.1513936,99.43201313 250 | 102,Kirksville,MO,140.0030631,143.8709979 251 | 69,Kingsland,AR,78.22487634,85.13857667 252 | 181,Fairview,KS,80.27133556,164.5798928 253 | 175,Lydia,LA,41.78237386,39.53037919 254 | 80,Bridgton,ME,93.2257481,140.415464 255 | 596,Brownstown,IL,48.65218811,63.22095723 256 | 301,Monona,IA,144.1294884,81.57803996 257 | 987,Hartland,MI,136.2638918,107.738067 258 | 973,Andover,CT,51.74018501,52.53230369 259 | 981,Lakota,IA,56.15413675,92.38612569 260 | 110,Mesick,MI,82.12446036,108.5283528 261 | 396,Dryden,MI,69.80182523,47.7436689 262 | 637,Beverly,KY,57.75450094,126.8958422 263 | 801,Pocahontas,IL,109.6304686,83.23109494 264 | 130,Hayneville,AL,109.7380661,157.4686782 265 | 345,Yoder,IN,83.49946581,143.715826 266 | 851,Gatewood,MO,76.31562733,145.668333 267 | 489,Madden,MS,81.34223218,99.37998257 268 | 223,Losantville,IN,112.5187171,106.7760547 269 | 538,Cheswold,DE,31.93743733,59.34689519 270 | 329,Caseville,MI,102.9200706,98.4033735 271 | 815,Pomona,MO,52.33346818,50.28222507 272 | 789,Hopkinsville,KY,27.31872893,47.94652919 273 | 269,Jack,AL,49.93703023,85.62817326 274 | 969,Dixie,GA,27.21713791,36.47378899 275 | 271,Hillside,CO,99.26558164,68.84352684 276 | 667,Hawarden,IA,90.96161545,46.93255602 277 | 350,Cannonsburg,MI,91.03351667,120.6696799 278 | 49,Osborne,KS,70.36168327,139.7111654 279 | 404,Farmington,IL,91.7144044,72.0223174 280 | 23,Honolulu,HI,110.101955,139.7437776 281 | 1,Pfeifer,KS,37.44478047,65.68491252 282 | 127,Oshtemo,MI,100.3702957,135.9503227 283 | 657,Gridley,KS,118.1450367,55.80178454 -------------------------------------------------------------------------------- /Problem 5/stations.csv: -------------------------------------------------------------------------------- 1 | ID,City,State,Lattitude,Longitude 2 | 478,Tipton,IN,33.54792701,97.94286036 3 | 619,Arlington,CO,75.17993079,92.94615894 4 | 711,Turner,AR,50.24380534,101.4580163 5 | 839,Slidell,LA,85.32270304,151.8743276 6 | 411,Negreet,LA,98.9707194,105.3376115 7 | 588,Glencoe,KY,46.38739244,136.0427027 8 | 665,Chelsea,IA,98.72210937,59.68913002 9 | 733,Pelahatchie,MS,38.58161595,28.11950703 10 | 811,Dorrance,KS,102.0888316,121.5614372 11 | 698,Albany,CA,49.75112765,80.21211317 12 | 325,Monument,KS,70.52300953,141.7680413 13 | 414,Manchester,MD,73.51580724,37.14602869 14 | 113,Prescott,IA,39.93234421,65.79327823 15 | 971,Graettinger,IA,94.66283665,150.3826243 16 | 266,Cahone,CO,116.2321963,127.009554 17 | 
617,Sturgis,MS,36.45673517,126.1690696 18 | 495,Upperco,MD,114.2157413,29.63104758 19 | 473,Highwood,IL,27.25445814,150.9227402 20 | 959,Waipahu,HI,106.4460526,33.91451792 21 | 438,Bowdon,GA,88.98111013,78.49025241 22 | 571,Tyler,MN,133.3521233,58.63273833 23 | 92,Watkins,CO,83.27433063,96.73732305 24 | 399,Republic,MI,75.42182,130.1266717 25 | 426,Millville,CA,32.55838209,145.7434609 26 | 844,Aguanga,CA,79.89165657,65.93959251 27 | 606,Morenci,AZ,104.8964262,110.2033978 28 | 833,Hoskinston,KY,65.7515349,65.67937265 29 | 843,Talbert,KY,39.85947921,58.84999769 30 | 166,Mccomb,MS,74.04169376,42.63374681 31 | 339,Kirk,CO,141.097397,136.3312671 32 | 909,Carlock,IL,117.3209611,84.80244659 33 | 829,Seward,IL,72.41930917,90.20890209 34 | 766,Gustine,CA,111.0875596,140.8338617 35 | 392,Delano,CA,126.3467998,91.50161746 36 | 555,Westphalia,MI,32.76641637,143.8050085 37 | 728,Roy,MT,41.31187761,51.56467929 38 | 656,Pattonsburg,MO,138.100334,32.10804024 39 | 394,Centertown,MO,133.9733513,93.17246374 40 | 366,Norvell,MI,125.3431567,93.75245864 41 | 96,Raymondville,MO,70.68239168,148.4444084 42 | 977,Odin,IL,53.48858773,115.7934363 43 | 741,Jemison,AL,62.10307108,25.71260581 44 | 323,Barrigada,GU,60.60716473,147.5296125 45 | 3,Hesperia,CA,106.0569286,71.11876711 46 | 814,Wickliffe,KY,80.29965735,46.12993489 47 | 375,Culdesac,ID,47.8418268,78.06551236 48 | 467,Roselawn,IN,87.70708169,51.74506986 49 | 647,Portland,AR,83.92116818,44.80555694 50 | 250,Hampden,MA,76.39074308,26.48368838 51 | 547,Sandborn,IN,55.94680767,93.85315475 52 | 701,Seaton,IL,128.2287955,78.43005628 53 | 197,Milledgeville,IL,90.98811028,113.2748504 54 | 679,Gretna,LA,75.26293787,142.5762285 55 | 403,Zionsville,IN,57.79181464,36.493866 56 | 482,Jolon,CA,66.65054378,52.95528769 57 | 252,Childs,MD,92.7594351,104.0155475 58 | 600,Shreveport,LA,136.2310159,38.50207291 59 | 14,Forest,MS,120.283076,50.22883356 60 | 260,Sizerock,KY,116.0212592,112.7471971 61 | 753,Algonac,MI,118.7398038,80.14671114 62 | 174,Onaway,MI,108.606587,55.75945692 63 | 263,Irvington,IL,96.70474244,68.28719181 64 | 253,Winsted,MN,68.82384939,72.51511422 65 | 557,Woodbury,GA,102.5472386,93.37553932 66 | 897,Samantha,AL,75.2235845,35.94479192 67 | 98,Hackleburg,AL,119.5607105,120.6244819 68 | 423,Soldier,KS,77.30051697,152.6019439 69 | 361,Arrowsmith,IL,28.00318693,109.3395101 70 | 409,Columbus,GA,67.33892289,46.61622653 71 | 312,Bentonville,AR,36.9528472,78.06843628 72 | 854,Kirkland,AZ,86.41004231,57.99523843 73 | 735,Wilton,ME,56.57944083,157.1906205 74 | 608,Busby,MT,104.0894472,29.83035109 75 | 122,Robertsdale,AL,97.7213689,85.3747551 76 | 93,Dale,IN,69.59335022,34.41552119 77 | 67,Reeds,MO,30.78888129,42.50211311 78 | 906,Hayfork,CA,35.2971959,116.6698147 79 | 34,Mcbrides,MI,74.05708403,35.68248542 80 | 401,Tennessee,IL,55.49838117,155.6455992 81 | 536,Henderson,IA,77.92417249,77.90662876 82 | 953,Udall,KS,112.6844799,59.95863388 83 | 614,Benedict,KS,138.4990456,95.71978969 84 | 998,Oakfield,ME,47.65762321,132.2118817 85 | 805,Tamms,IL,59.86766645,75.05164447 86 | 235,Haubstadt,IN,27.98898068,32.08170842 87 | 820,Chokio,MN,81.36073326,134.232113 88 | 650,Clancy,MT,45.82996854,164.378675 89 | 324,Norwood,MN,144.4891504,34.88529336 90 | 442,Elkton,MD,103.2547878,156.7289171 91 | 633,Bertha,MN,39.94889028,105.3111577 92 | 109,Bridgeport,MI,50.68988119,79.90137859 93 | 780,Cherry,IL,68.29708467,46.70383506 94 | 492,Regina,KY,131.5515912,90.23826291 95 | 965,Griffin,GA,38.74146904,151.7182093 96 | 337,Mascotte,FL,121.4608708,146.1675503 97 | 
259,Baldwin,MD,81.73572165,40.4397386 98 | 955,Netawaka,KS,109.2057274,119.7404946 99 | 886,Pony,MT,99.25831292,162.8777336 100 | 200,Franklin,LA,82.24062794,31.77872725 101 | 384,Amo,IN,103.5871398,159.4306474 102 | 518,Vulcan,MO,108.6087788,91.56138944 103 | 161,Alanson,MI,90.6531996,72.11952297 104 | 486,Delta,LA,136.5385281,49.73086766 105 | 406,Carver,MN,45.89251104,122.069681 106 | 940,Paron,AR,59.13834287,104.3412062 107 | 237,Winchester,ID,38.37033443,80.0549859 108 | 465,Jerome,AZ,121.7110583,34.40610397 109 | 570,Greenview,CA,80.50000412,57.58800404 110 | 278,Cromwell,MN,128.8462234,53.51254061 111 | 927,Quinter,KS,59.58257004,25.36132152 112 | 59,Whitewater,MO,82.71809743,71.42607696 113 | 291,Clarkdale,AZ,58.19417297,73.94789938 114 | 668,Rockton,IL,116.1223935,86.83833004 115 | 682,Pheba,MS,90.94560988,127.3003694 116 | 775,Eleele,HI,80.90971236,152.5215045 117 | 527,Auburn,IA,95.48926949,137.0748386 118 | 190,Oconee,GA,92.56220722,119.477431 119 | 232,Grandville,MI,38.85256239,70.13776289 120 | 405,Susanville,CA,128.2498724,80.31679475 121 | 273,Rosie,AR,72.75896875,161.9173483 122 | 813,Verona,MO,109.6602903,152.6449499 123 | 444,Richland,GA,105.4709117,113.0379774 124 | 899,Fremont,MI,54.47132153,150.8233711 125 | 738,Philipsburg,MT,95.95531865,72.24442365 126 | 215,Kensett,IA,55.72295385,139.5524526 127 | 377,Koleen,IN,137.5485615,110.5110324 128 | 727,Winslow,IL,113.1328079,38.71450096 129 | 363,Reasnor,IA,41.59710148,162.564183 130 | 888,Bono,AR,133.276314,150.4963257 131 | 784,Biggsville,IL,85.92578701,138.7463469 132 | 695,Amazonia,MO,45.78566304,148.2013846 133 | 609,Marysville,MI,85.76134731,132.8724084 134 | 649,Pengilly,MN,25.07352606,154.0642918 135 | 383,Newbury,MA,128.3982315,85.17470023 136 | 44,Kismet,KS,99.82252766,156.5035829 137 | 433,Canton,ME,98.73035759,105.973446 138 | 474,Grayslake,IL,61.30374218,33.05923131 139 | 990,Bison,KS,132.2279842,74.89290079 140 | 502,Bellevue,KY,127.4330424,121.7488466 141 | 327,Ridgway,CO,77.43818081,110.2668422 142 | 228,Rydal,GA,35.68357838,78.82337343 143 | 642,Lynnville,KY,25.40836031,146.4916272 144 | 885,Deerfield,MO,40.213664,35.9386994 145 | 539,Montreal,MO,129.2453575,127.3259318 146 | 202,Hope,MN,140.3641688,43.72901978 147 | 521,Gowrie,IA,130.2024387,127.9825354 148 | 938,Andersonville,GA,141.3126586,72.53178686 149 | 528,Crouseville,ME,36.5185121,81.54481624 150 | 331,Cranks,KY,55.60911109,27.28471229 151 | 944,Ledyard,CT,134.5468125,143.8149657 152 | 949,Norway,ME,83.89130493,88.40746773 153 | 88,Eros,LA,95.16264172,58.31349033 154 | 878,Rantoul,KS,31.80492935,118.6160845 155 | 17,Fredericktown,MO,105.5334784,112.6890911 156 | 447,Arkadelphia,AR,98.62295228,49.57501146 157 | 351,Fredericksburg,IN,44.51203489,78.05797739 158 | 774,Manchester,IA,129.6682154,123.2967519 159 | 963,Eriline,KY,93.61747947,65.43902104 160 | 643,Wellington,KY,100.4511347,31.68760835 161 | 777,Edgewater,MD,130.0676569,72.29080719 162 | 15,Ducor,CA,140.8633607,102.039339 163 | 910,Salem,KY,86.97524724,113.9609797 164 | 612,Sturdivant,MO,93.84076298,86.38850955 165 | 537,Hagatna,GU,97.17321584,151.8086289 166 | 510,Eastlake,MI,134.0938535,38.78212913 167 | 354,Larkspur,CA,107.0529696,65.97363083 168 | 983,Patriot,IN,82.63795084,46.08354932 169 | 799,Corriganville,MD,141.383789,153.6500914 170 | 581,Carlos,MN,114.9060173,66.2810487 171 | 825,Addison,MI,96.36953674,142.4105732 172 | 526,Tarzana,CA,135.8603987,81.30731303 173 | 176,Grapevine,AR,92.36589225,84.54293686 174 | 994,Kanorado,KS,65.42078424,85.72249232 175 | 
704,Climax,MI,127.3563782,107.0542747 176 | 582,Curdsville,KY,84.78749012,150.4842247 177 | 884,Southport,CT,59.09336238,63.13052144 178 | 196,Compton,IL,106.617993,99.40704162 179 | 605,Notasulga,AL,66.84426322,115.6864036 180 | 430,Rumsey,KY,70.6921152,50.2122756 181 | 234,Rogers,CT,140.4723914,33.18335673 182 | 702,Everton,MO,119.0469849,51.48512967 183 | 662,Skanee,MI,70.1724149,129.5593113 184 | 171,Springerville,AZ,124.6882036,150.6628287 185 | 615,Libertytown,MD,144.5783185,111.9744225 186 | 336,Dumont,MN,57.0124315,129.3675605 187 | 315,Ravenna,KY,79.15467169,106.252172 188 | 505,Williams,AZ,73.48100913,111.7413889 189 | 842,Decatur,MI,63.31154085,161.4235787 190 | 982,Holbrook,AZ,134.8838521,103.8569792 191 | 868,Sherrill,AR,79.96440727,152.2197289 192 | 554,Brownsdale,MN,52.42646664,50.79836304 193 | 199,Linden,MI,53.41116218,32.62422206 194 | 453,Sedgwick,AR,68.93334418,75.29418595 195 | 326,Rocheport,MO,114.163159,64.48216553 196 | 638,Clovis,CA,92.43965299,138.0751933 197 | 156,Heyburn,ID,82.08611195,121.0459768 198 | 861,Peabody,KS,75.41614816,152.2100746 199 | 428,Randall,KS,47.99772806,135.6275983 200 | 677,Hayesville,IA,119.9881564,42.12719349 201 | 183,Jordan,MN,68.74638928,35.46228503 202 | 242,Macy,IN,138.694477,152.3694449 203 | 621,Flowood,MS,64.88877035,149.2064111 204 | 180,Napoleon,IN,32.03325626,160.2402958 205 | 853,Coldwater,KS,47.50617517,26.31002645 206 | 105,Weldon,CA,134.0156771,118.9609382 207 | 357,Yellville,AR,35.68710434,42.24658664 208 | 920,Eustis,FL,42.73630964,39.48336091 209 | 355,Weldona,CO,32.96727204,58.44917695 210 | 501,Tefft,IN,93.21527074,150.0159946 211 | 834,Bayville,ME,106.7349403,143.4078424 212 | 255,Brighton,IL,107.6050821,32.84882058 213 | 595,Grimes,IA,42.05019623,74.73314913 214 | 709,Nubieber,CA,132.9033933,49.27761205 215 | 16,Beaufort,MO,71.77418064,85.65741838 216 | 231,Arispe,IA,31.11149635,137.7968198 217 | 891,Humeston,IA,74.51222394,122.4246326 218 | 757,Lakeville,CT,59.86867012,94.98860174 219 | 506,Firebrick,KY,49.99183934,95.03900712 220 | 583,Channing,MI,117.1645417,56.95124478 221 | 504,Melber,KY,37.24884854,55.53335159 222 | 901,Manchester,MN,71.02098012,84.00752922 223 | 586,Ottertail,MN,100.0240382,44.34165481 224 | 95,Dupo,IL,41.28342297,29.03342929 225 | 524,Montrose,CA,136.4765033,119.373558 226 | 716,Schleswig,IA,119.2539069,51.88108538 227 | 904,Ermine,KY,119.6401426,62.79812627 228 | 740,Siler,KY,137.0193079,117.2464806 229 | 57,Clifton,AZ,30.15463898,135.7025933 230 | 155,Casco,MI,138.5984073,109.0728819 231 | 755,Sturgis,MI,117.392421,135.3989883 232 | 287,Madisonville,LA,112.2163874,53.04603619 233 | 435,Albion,IN,44.25844944,121.8753316 234 | 672,Lismore,MN,58.87142971,103.8693391 235 | 572,Athens,IN,75.32104008,120.7983748 236 | 890,Eufaula,AL,140.2958283,103.0868213 237 | 119,Wildie,KY,69.65812987,111.8552379 238 | 540,Mosca,CO,89.20441335,141.4811419 239 | 678,Bennington,IN,35.52107321,26.80362207 240 | 208,Lottie,LA,109.8672979,82.76650144 241 | 512,Garland,ME,108.7311062,134.3750565 242 | 352,Clutier,IA,61.1888319,127.0339038 243 | 948,Lupton,MI,139.9255926,53.36397181 244 | 503,Northfield,MN,61.00207775,37.15335522 245 | 288,Daleville,AL,121.8865105,136.1704398 246 | 479,Cuba,MO,63.71916114,87.64843313 247 | 826,Norris,MT,47.18550342,37.25727353 248 | 651,Clopton,AL,40.77104358,84.70678339 249 | 143,Renville,MN,142.1513936,99.43201313 250 | 102,Kirksville,MO,140.0030631,143.8709979 251 | 69,Kingsland,AR,78.22487634,85.13857667 252 | 181,Fairview,KS,80.27133556,164.5798928 253 | 
175,Lydia,LA,41.78237386,39.53037919 254 | 80,Bridgton,ME,93.2257481,140.415464 255 | 596,Brownstown,IL,48.65218811,63.22095723 256 | 301,Monona,IA,144.1294884,81.57803996 257 | 987,Hartland,MI,136.2638918,107.738067 258 | 973,Andover,CT,51.74018501,52.53230369 259 | 981,Lakota,IA,56.15413675,92.38612569 260 | 110,Mesick,MI,82.12446036,108.5283528 261 | 396,Dryden,MI,69.80182523,47.7436689 262 | 637,Beverly,KY,57.75450094,126.8958422 263 | 801,Pocahontas,IL,109.6304686,83.23109494 264 | 130,Hayneville,AL,109.7380661,157.4686782 265 | 345,Yoder,IN,83.49946581,143.715826 266 | 851,Gatewood,MO,76.31562733,145.668333 267 | 489,Madden,MS,81.34223218,99.37998257 268 | 223,Losantville,IN,112.5187171,106.7760547 269 | 538,Cheswold,DE,31.93743733,59.34689519 270 | 329,Caseville,MI,102.9200706,98.4033735 271 | 815,Pomona,MO,52.33346818,50.28222507 272 | 789,Hopkinsville,KY,27.31872893,47.94652919 273 | 269,Jack,AL,49.93703023,85.62817326 274 | 969,Dixie,GA,27.21713791,36.47378899 275 | 271,Hillside,CO,99.26558164,68.84352684 276 | 667,Hawarden,IA,90.96161545,46.93255602 277 | 350,Cannonsburg,MI,91.03351667,120.6696799 278 | 49,Osborne,KS,70.36168327,139.7111654 279 | 404,Farmington,IL,91.7144044,72.0223174 280 | 23,Honolulu,HI,110.101955,139.7437776 281 | 1,Pfeifer,KS,37.44478047,65.68491252 282 | 127,Oshtemo,MI,100.3702957,135.9503227 283 | 657,Gridley,KS,118.1450367,55.80178454 -------------------------------------------------------------------------------- /Problem 8/problem8.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve problems two ways\n", 9 | "1. First using PySpark function \n", 10 | "2. Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First Load all the required library and also Start Spark Session\n", 21 | "# Load all the required library\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/23 22:25:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 44 | "23/02/23 22:25:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. 
Attempting port 4041.\n", 45 | "23/02/23 22:25:40 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n" 46 | ] 47 | } 48 | ], 49 | "source": [ 50 | "#Start Spark Session\n", 51 | "spark = SparkSession.builder.appName(\"problem8\").getOrCreate()\n", 52 | "sqlContext = SparkSession(spark)\n", 53 | "#Dont Show warning only error\n", 54 | "spark.sparkContext.setLogLevel(\"ERROR\")" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "name": "stderr", 65 | "output_type": "stream", 66 | "text": [ 67 | " \r" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "#Load CSV file into DataFrame\n", 73 | "ridelogdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"ride_log.csv\")" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 4, 79 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 80 | "metadata": {}, 81 | "outputs": [ 82 | { 83 | "name": "stdout", 84 | "output_type": "stream", 85 | "text": [ 86 | "root\n", 87 | " |-- id: integer (nullable = true)\n", 88 | " |-- user_id: integer (nullable = true)\n", 89 | " |-- distance: integer (nullable = true)\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "#Check Schema of DataFrame\n", 96 | "ridelogdf.printSchema()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "id": "693b0edd-852f-46de-b983-81357b95ad36", 103 | "metadata": {}, 104 | "outputs": [], 105 | "source": [ 106 | "#Load CSV file into DataFrame\n", 107 | "userdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"user.csv\")" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "id": "e9d65b9d-fe5a-4ed3-91b4-c9246551cce9", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "name": "stdout", 118 | "output_type": "stream", 119 | "text": [ 120 | "root\n", 121 | " |-- id: integer (nullable = true)\n", 122 | " |-- name: string (nullable = true)\n", 123 | "\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "#Check Schema of DataFrame\n", 129 | "userdf.printSchema()" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 10, 135 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "# Now we are solving Same problem using Spark SQL \n", 140 | "# Creating Temp Table or HIVE table\n", 141 | "ridelogdf.createOrReplaceTempView(\"tmpRidelog\")\n", 142 | "userdf.createOrReplaceTempView(\"tmpUser\")" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 11, 148 | "id": "e55eb16a-fb5c-42b6-9f7c-feb1ff9c2945", 149 | "metadata": {}, 150 | "outputs": [ 151 | { 152 | "name": "stdout", 153 | "output_type": "stream", 154 | "text": [ 155 | "+---+-------+--------+\n", 156 | "| id|user_id|distance|\n", 157 | "+---+-------+--------+\n", 158 | "|101| 8| 93|\n", 159 | "|102| 40| 56|\n", 160 | "|103| 28| 83|\n", 161 | "|104| 33| 83|\n", 162 | "|105| 1| 87|\n", 163 | "|106| 32| 49|\n", 164 | "|107| 3| 5|\n", 165 | "|108| 23| 37|\n", 166 | "|109| 31| 62|\n", 167 | "|110| 1| 35|\n", 168 | "|111| 41| 89|\n", 169 | "|112| 19| 64|\n", 170 | "|113| 49| 57|\n", 171 | "|114| 28| 68|\n", 172 | "|115| 48| 94|\n", 173 | "|116| 50| 89|\n", 174 | "|117| 48| 29|\n", 175 | "|118| 13| 16|\n", 176 | "|119| 24| 58|\n", 177 | "|120| 25| 19|\n", 178 | "+---+-------+--------+\n", 179 | "only showing top 20 
rows\n", 180 | "\n" 181 | ] 182 | } 183 | ], 184 | "source": [ 185 | "sqlContext.sql(\"SELECT * FROM tmpRidelog\").show()" 186 | ] 187 | }, 188 | { 189 | "cell_type": "code", 190 | "execution_count": 12, 191 | "id": "845b2bd0-a09a-45ca-84f4-0e3593ad9026", 192 | "metadata": {}, 193 | "outputs": [ 194 | { 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "+---+-------------------+\n", 199 | "| id| name|\n", 200 | "+---+-------------------+\n", 201 | "| 1| Dustin Smith|\n", 202 | "| 2| Jay Ramirez|\n", 203 | "| 3| Joseph Cooke|\n", 204 | "| 4| Melinda Young|\n", 205 | "| 5| Sean Parker|\n", 206 | "| 6| Ian Foster|\n", 207 | "| 7|Christopher Schmitt|\n", 208 | "| 8| Patrick Gutierrez|\n", 209 | "| 9| Dennis Douglas|\n", 210 | "| 10| Brenda Morris|\n", 211 | "| 11| Jeffery Hernandez|\n", 212 | "| 12| David Rice|\n", 213 | "| 13| Charles Foster|\n", 214 | "| 14| Keith Perez DVM|\n", 215 | "| 15| Dean Cuevas|\n", 216 | "| 16| Melissa Bishop|\n", 217 | "| 17| Alexander Howell|\n", 218 | "| 18| Austin Robertson|\n", 219 | "| 19| Sherri Mcdaniel|\n", 220 | "| 20| Nancy Nguyen|\n", 221 | "+---+-------------------+\n", 222 | "only showing top 20 rows\n", 223 | "\n" 224 | ] 225 | } 226 | ], 227 | "source": [ 228 | "sqlContext.sql(\"SELECT * FROM tmpUser\").show()" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 32, 234 | "id": "d4a77a98-b4e8-4c5d-bb66-c8d1d17bc95e", 235 | "metadata": {}, 236 | "outputs": [ 237 | { 238 | "name": "stdout", 239 | "output_type": "stream", 240 | "text": [ 241 | "+-------+-------------------+-----+----------+\n", 242 | "|user_id| name|total|actualrank|\n", 243 | "+-------+-------------------+-----+----------+\n", 244 | "| 3| Joseph Cooke| 5| 1|\n", 245 | "| 45| Benjamin Mcbride| 11| 2|\n", 246 | "| 13| Charles Foster| 16| 3|\n", 247 | "| 18| Austin Robertson| 27| 4|\n", 248 | "| 36| Alyssa Shaw| 28| 5|\n", 249 | "| 37| Destiny Clark| 48| 6|\n", 250 | "| 40| Stacy Bryant| 56| 7|\n", 251 | "| 19| Sherri Mcdaniel| 64| 8|\n", 252 | "| 23| Joseph Hamilton| 79| 9|\n", 253 | "| 21| Melody Ball| 81| 10|\n", 254 | "| 39| Mark Diaz| 81| 10|\n", 255 | "| 38| Thomas Lara| 82| 12|\n", 256 | "| 33| Donna Ortiz| 83| 13|\n", 257 | "| 31| Shannon Green| 86| 14|\n", 258 | "| 41| Howard Rose| 89| 15|\n", 259 | "| 10| Brenda Morris| 90| 16|\n", 260 | "| 27| Jacqueline Heath| 91| 17|\n", 261 | "| 5| Sean Parker| 92| 18|\n", 262 | "| 7|Christopher Schmitt| 96| 19|\n", 263 | "| 46| Elizabeth Ward| 108| 20|\n", 264 | "+-------+-------------------+-----+----------+\n", 265 | "only showing top 20 rows\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "sqlContext.sql(\"SELECT user_id \\\n", 272 | " , name \\\n", 273 | " , sum(distance) as total\\\n", 274 | " , RANK() OVER (ORDER BY sum(distance)) as actualrank \\\n", 275 | " FROM tmpRidelog as log \\\n", 276 | " LEFT OUTER JOIN tmpUser as users \\\n", 277 | " ON log.user_id = users.id \\\n", 278 | " GROUP BY user_id, name\").show()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 27, 284 | "id": "b3654e1a-6d81-418c-b16c-c605d480fde9", 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "+-------+----------------+-----+\n", 292 | "|user_id| name|total|\n", 293 | "+-------+----------------+-----+\n", 294 | "| 3| Joseph Cooke| 5|\n", 295 | "| 45|Benjamin Mcbride| 11|\n", 296 | "| 13| Charles Foster| 16|\n", 297 | "| 18|Austin Robertson| 27|\n", 298 | "| 36| Alyssa Shaw| 28|\n", 299 | 
"| 37| Destiny Clark| 48|\n", 300 | "| 40| Stacy Bryant| 56|\n", 301 | "| 19| Sherri Mcdaniel| 64|\n", 302 | "| 23| Joseph Hamilton| 79|\n", 303 | "| 39| Mark Diaz| 81|\n", 304 | "| 21| Melody Ball| 81|\n", 305 | "+-------+----------------+-----+\n", 306 | "\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "sqlContext.sql(\"SELECT q.user_id, q.name, q.total \\\n", 312 | " FROM \\\n", 313 | " ( \\\n", 314 | " SELECT user_id \\\n", 315 | " , name \\\n", 316 | " , sum(distance) as total\\\n", 317 | " , RANK() OVER (ORDER BY sum(distance)) as actualrank \\\n", 318 | " FROM tmpRidelog as log \\\n", 319 | " LEFT OUTER JOIN tmpUser as users \\\n", 320 | " ON log.user_id = users.id \\\n", 321 | " GROUP BY user_id, name ) as q \\\n", 322 | " WHERE q.actualrank <= 10\").show()" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": 28, 328 | "id": "b40d4481-7dfc-4cfc-a8f1-2d11e092dc2b", 329 | "metadata": {}, 330 | "outputs": [ 331 | { 332 | "name": "stdout", 333 | "output_type": "stream", 334 | "text": [ 335 | "+-------+-----------------+-----+\n", 336 | "|user_id| name|total|\n", 337 | "+-------+-----------------+-----+\n", 338 | "| 47| Christina Price| 328|\n", 339 | "| 34| Jennifer Simmons| 277|\n", 340 | "| 43| Kimberly Potter| 275|\n", 341 | "| 8|Patrick Gutierrez| 243|\n", 342 | "| 25| Crystal Berg| 239|\n", 343 | "| 14| Keith Perez DVM| 214|\n", 344 | "| 32| Stacy Collins| 210|\n", 345 | "| 11|Jeffery Hernandez| 206|\n", 346 | "| 9| Dennis Douglas| 206|\n", 347 | "| 17| Alexander Howell| 205|\n", 348 | "+-------+-----------------+-----+\n", 349 | "\n" 350 | ] 351 | } 352 | ], 353 | "source": [ 354 | "sqlContext.sql(\"SELECT q.user_id, q.name, q.total \\\n", 355 | " FROM \\\n", 356 | " ( \\\n", 357 | " SELECT user_id \\\n", 358 | " , name \\\n", 359 | " , sum(distance) as total\\\n", 360 | " , RANK() OVER (ORDER BY sum(distance) DESC) as actualrank \\\n", 361 | " FROM tmpRidelog as log \\\n", 362 | " LEFT OUTER JOIN tmpUser as users \\\n", 363 | " ON log.user_id = users.id \\\n", 364 | " GROUP BY user_id, name ) as q \\\n", 365 | " WHERE q.actualrank <= 10\").show()" 366 | ] 367 | } 368 | ], 369 | "metadata": { 370 | "kernelspec": { 371 | "display_name": "Python 3 (ipykernel)", 372 | "language": "python", 373 | "name": "python3" 374 | }, 375 | "language_info": { 376 | "codemirror_mode": { 377 | "name": "ipython", 378 | "version": 3 379 | }, 380 | "file_extension": ".py", 381 | "mimetype": "text/x-python", 382 | "name": "python", 383 | "nbconvert_exporter": "python", 384 | "pygments_lexer": "ipython3", 385 | "version": "3.8.13" 386 | } 387 | }, 388 | "nbformat": 4, 389 | "nbformat_minor": 5 390 | } 391 | -------------------------------------------------------------------------------- /Problem 1/employee.json: -------------------------------------------------------------------------------- 1 | { 2 | "columns": [ 3 | "id", 4 | "first_name", 5 | "last_name", 6 | "salary", 7 | "department_id" 8 | ], 9 | "data": [ 10 | [ 11 | 1, 12 | "Todd", 13 | "Wilson", 14 | 110000, 15 | 1006 16 | ], 17 | [ 18 | 1, 19 | "Todd", 20 | "Wilson", 21 | 106119, 22 | 1006 23 | ], 24 | [ 25 | 2, 26 | "Justin", 27 | "Simon", 28 | 128922, 29 | 1005 30 | ], 31 | [ 32 | 2, 33 | "Justin", 34 | "Simon", 35 | 130000, 36 | 1005 37 | ], 38 | [ 39 | 3, 40 | "Kelly", 41 | "Rosario", 42 | 42689, 43 | 1002 44 | ], 45 | [ 46 | 4, 47 | "Patricia", 48 | "Powell", 49 | 162825, 50 | 1004 51 | ], 52 | [ 53 | 4, 54 | "Patricia", 55 | "Powell", 56 | 170000, 57 | 1004 58 | ], 59 | [ 60 | 5, 61 | "Sherry", 62 | 
"Golden", 63 | 44101, 64 | 1002 65 | ], 66 | [ 67 | 6, 68 | "Natasha", 69 | "Swanson", 70 | 79632, 71 | 1005 72 | ], 73 | [ 74 | 6, 75 | "Natasha", 76 | "Swanson", 77 | 90000, 78 | 1005 79 | ], 80 | [ 81 | 7, 82 | "Diane", 83 | "Gordon", 84 | 74591, 85 | 1002 86 | ], 87 | [ 88 | 8, 89 | "Mercedes", 90 | "Rodriguez", 91 | 61048, 92 | 1005 93 | ], 94 | [ 95 | 9, 96 | "Christy", 97 | "Mitchell", 98 | 137236, 99 | 1001 100 | ], 101 | [ 102 | 9, 103 | "Christy", 104 | "Mitchell", 105 | 140000, 106 | 1001 107 | ], 108 | [ 109 | 9, 110 | "Christy", 111 | "Mitchell", 112 | 150000, 113 | 1001 114 | ], 115 | [ 116 | 10, 117 | "Sean", 118 | "Crawford", 119 | 182065, 120 | 1006 121 | ], 122 | [ 123 | 10, 124 | "Sean", 125 | "Crawford", 126 | 190000, 127 | 1006 128 | ], 129 | [ 130 | 11, 131 | "Kevin", 132 | "Townsend", 133 | 166861, 134 | 1002 135 | ], 136 | [ 137 | 12, 138 | "Joshua", 139 | "Johnson", 140 | 123082, 141 | 1004 142 | ], 143 | [ 144 | 13, 145 | "Julie", 146 | "Sanchez", 147 | 185663, 148 | 1001 149 | ], 150 | [ 151 | 13, 152 | "Julie", 153 | "Sanchez", 154 | 200000, 155 | 1001 156 | ], 157 | [ 158 | 13, 159 | "Julie", 160 | "Sanchez", 161 | 210000, 162 | 1001 163 | ], 164 | [ 165 | 14, 166 | "John", 167 | "Coleman", 168 | 152434, 169 | 1001 170 | ], 171 | [ 172 | 15, 173 | "Anthony", 174 | "Valdez", 175 | 96898, 176 | 1001 177 | ], 178 | [ 179 | 16, 180 | "Briana", 181 | "Rivas", 182 | 151668, 183 | 1005 184 | ], 185 | [ 186 | 17, 187 | "Jason", 188 | "Burnett", 189 | 42525, 190 | 1006 191 | ], 192 | [ 193 | 18, 194 | "Jeffrey", 195 | "Harris", 196 | 14491, 197 | 1002 198 | ], 199 | [ 200 | 18, 201 | "Jeffrey", 202 | "Harris", 203 | 20000, 204 | 1002 205 | ], 206 | [ 207 | 19, 208 | "Michael", 209 | "Ramsey", 210 | 63159, 211 | 1003 212 | ], 213 | [ 214 | 20, 215 | "Cody", 216 | "Gonzalez", 217 | 112809, 218 | 1004 219 | ], 220 | [ 221 | 21, 222 | "Stephen", 223 | "Berry", 224 | 123617, 225 | 1002 226 | ], 227 | [ 228 | 22, 229 | "Brittany", 230 | "Scott", 231 | 162537, 232 | 1002 233 | ], 234 | [ 235 | 23, 236 | "Angela", 237 | "Williams", 238 | 100875, 239 | 1004 240 | ], 241 | [ 242 | 24, 243 | "William", 244 | "Flores", 245 | 142674, 246 | 1003 247 | ], 248 | [ 249 | 25, 250 | "Pamela", 251 | "Matthews", 252 | 57944, 253 | 1005 254 | ], 255 | [ 256 | 26, 257 | "Allison", 258 | "Johnson", 259 | 128782, 260 | 1001 261 | ], 262 | [ 263 | 27, 264 | "Anthony", 265 | "Ball", 266 | 34386, 267 | 1003 268 | ], 269 | [ 270 | 28, 271 | "Alexis", 272 | "Beck", 273 | 12260, 274 | 1005 275 | ], 276 | [ 277 | 29, 278 | "Jason", 279 | "Olsen", 280 | 51937, 281 | 1006 282 | ], 283 | [ 284 | 30, 285 | "Stephen", 286 | "Smith", 287 | 194791, 288 | 1001 289 | ], 290 | [ 291 | 31, 292 | "Kimberly", 293 | "Brooks", 294 | 95327, 295 | 1003 296 | ], 297 | [ 298 | 32, 299 | "Eric", 300 | "Zimmerman", 301 | 83093, 302 | 1006 303 | ], 304 | [ 305 | 33, 306 | "Peter", 307 | "Holt", 308 | 69945, 309 | 1002 310 | ], 311 | [ 312 | 34, 313 | "Justin", 314 | "Dunn", 315 | 67992, 316 | 1003 317 | ], 318 | [ 319 | 35, 320 | "John", 321 | "Ball", 322 | 47795, 323 | 1004 324 | ], 325 | [ 326 | 36, 327 | "Jesus", 328 | "Ward", 329 | 36078, 330 | 1005 331 | ], 332 | [ 333 | 37, 334 | "Philip", 335 | "Gillespie", 336 | 36424, 337 | 1006 338 | ], 339 | [ 340 | 38, 341 | "Nicole", 342 | "Lewis", 343 | 114079, 344 | 1001 345 | ], 346 | [ 347 | 39, 348 | "Linda", 349 | "Clark", 350 | 186781, 351 | 1002 352 | ], 353 | [ 354 | 40, 355 | "Colleen", 356 | "Carrillo", 357 | 147723, 358 | 1004 359 | ], 360 | [ 361 | 41, 362 | 
"John", 363 | "George", 364 | 21642, 365 | 1001 366 | ], 367 | [ 368 | 42, 369 | "Traci", 370 | "Williams", 371 | 138892, 372 | 1003 373 | ], 374 | [ 375 | 42, 376 | "Traci", 377 | "Williams", 378 | 150000, 379 | 1003 380 | ], 381 | [ 382 | 42, 383 | "Traci", 384 | "Williams", 385 | 160000, 386 | 1003 387 | ], 388 | [ 389 | 42, 390 | "Traci", 391 | "Williams", 392 | 180000, 393 | 1003 394 | ], 395 | [ 396 | 43, 397 | "Joseph", 398 | "Rogers", 399 | 22800, 400 | 1005 401 | ], 402 | [ 403 | 44, 404 | "Trevor", 405 | "Carter", 406 | 38670, 407 | 1001 408 | ], 409 | [ 410 | 45, 411 | "Kevin", 412 | "Duncan", 413 | 45210, 414 | 1003 415 | ], 416 | [ 417 | 46, 418 | "Joshua", 419 | "Ewing", 420 | 73088, 421 | 1003 422 | ], 423 | [ 424 | 47, 425 | "Kimberly", 426 | "Dean", 427 | 71416, 428 | 1003 429 | ], 430 | [ 431 | 48, 432 | "Robert", 433 | "Lynch", 434 | 117960, 435 | 1004 436 | ], 437 | [ 438 | 49, 439 | "Amber", 440 | "Harding", 441 | 77764, 442 | 1002 443 | ], 444 | [ 445 | 50, 446 | "Victoria", 447 | "Wilson", 448 | 176620, 449 | 1002 450 | ], 451 | [ 452 | 51, 453 | "Theresa", 454 | "Everett", 455 | 31404, 456 | 1002 457 | ], 458 | [ 459 | 52, 460 | "Kara", 461 | "Smith", 462 | 192838, 463 | 1004 464 | ], 465 | [ 466 | 53, 467 | "Teresa", 468 | "Cohen", 469 | 98860, 470 | 1001 471 | ], 472 | [ 473 | 54, 474 | "Wesley", 475 | "Tucker", 476 | 90221, 477 | 1005 478 | ], 479 | [ 480 | 55, 481 | "Michael", 482 | "Morris", 483 | 106799, 484 | 1005 485 | ], 486 | [ 487 | 56, 488 | "Rachael", 489 | "Williams", 490 | 103585, 491 | 1002 492 | ], 493 | [ 494 | 57, 495 | "Patricia", 496 | "Harmon", 497 | 147417, 498 | 1005 499 | ], 500 | [ 501 | 58, 502 | "Edward", 503 | "Sharp", 504 | 41077, 505 | 1005 506 | ], 507 | [ 508 | 59, 509 | "Kevin", 510 | "Robinson", 511 | 100924, 512 | 1005 513 | ], 514 | [ 515 | 60, 516 | "Charles", 517 | "Pearson", 518 | 173317, 519 | 1004 520 | ], 521 | [ 522 | 61, 523 | "Ryan", 524 | "Brown", 525 | 110225, 526 | 1003 527 | ], 528 | [ 529 | 61, 530 | "Ryan", 531 | "Brown", 532 | 120000, 533 | 1003 534 | ], 535 | [ 536 | 62, 537 | "Dale", 538 | "Hayes", 539 | 97662, 540 | 1005 541 | ], 542 | [ 543 | 63, 544 | "Richard", 545 | "Sanford", 546 | 136083, 547 | 1001 548 | ], 549 | [ 550 | 64, 551 | "Danielle", 552 | "Williams", 553 | 98655, 554 | 1006 555 | ], 556 | [ 557 | 64, 558 | "Danielle", 559 | "Williams", 560 | 110000, 561 | 1006 562 | ], 563 | [ 564 | 64, 565 | "Danielle", 566 | "Williams", 567 | 120000, 568 | 1006 569 | ], 570 | [ 571 | 65, 572 | "Deborah", 573 | "Martin", 574 | 67389, 575 | 1004 576 | ], 577 | [ 578 | 66, 579 | "Dustin", 580 | "Bush", 581 | 47567, 582 | 1004 583 | ], 584 | [ 585 | 67, 586 | "Tyler", 587 | "Green", 588 | 111085, 589 | 1002 590 | ], 591 | [ 592 | 68, 593 | "Antonio", 594 | "Carpenter", 595 | 83684, 596 | 1002 597 | ], 598 | [ 599 | 69, 600 | "Ernest", 601 | "Peterson", 602 | 115993, 603 | 1005 604 | ], 605 | [ 606 | 70, 607 | "Karen", 608 | "Fernandez", 609 | 101238, 610 | 1003 611 | ], 612 | [ 613 | 71, 614 | "Kristine", 615 | "Casey", 616 | 67651, 617 | 1003 618 | ], 619 | [ 620 | 72, 621 | "Christine", 622 | "Frye", 623 | 137244, 624 | 1004 625 | ], 626 | [ 627 | 73, 628 | "William", 629 | "Preston", 630 | 155225, 631 | 1003 632 | ], 633 | [ 634 | 74, 635 | "Richard", 636 | "Cole", 637 | 180361, 638 | 1003 639 | ], 640 | [ 641 | 75, 642 | "Julia", 643 | "Ramos", 644 | 61398, 645 | 1006 646 | ], 647 | [ 648 | 75, 649 | "Julia", 650 | "Ramos", 651 | 70000, 652 | 1006 653 | ], 654 | [ 655 | 75, 656 | "Julia", 657 | "Ramos", 658 | 
83000, 659 | 1006 660 | ], 661 | [ 662 | 75, 663 | "Julia", 664 | "Ramos", 665 | 90000, 666 | 1006 667 | ], 668 | [ 669 | 75, 670 | "Julia", 671 | "Ramos", 672 | 105000, 673 | 1006 674 | ] 675 | ] 676 | } -------------------------------------------------------------------------------- /Problem 2/problem2_2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve problems two ways\n", 9 | "1. First using PySpark function \n", 10 | "2. Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First Load all the required library and also Start Spark Session\n", 21 | "# Load all the required library\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/08 11:29:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 44 | "23/02/08 11:29:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. 
Attempting port 4041.\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "#Start Spark Session\n", 50 | "spark = SparkSession.builder.appName(\"problem2\").getOrCreate()\n", 51 | "sqlContext = SparkSession(spark)\n", 52 | "#Don't show warnings, only errors\n", 53 | "spark.sparkContext.setLogLevel(\"ERROR\")" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 60 | "metadata": {}, 61 | "outputs": [ 62 | { 63 | "name": "stderr", 64 | "output_type": "stream", 65 | "text": [ 66 | " \r" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "#Load CSV file into DataFrame\n", 72 | "employeedf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"employee_salary.csv\")" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "name": "stdout", 83 | "output_type": "stream", 84 | "text": [ 85 | "root\n", 86 | " |-- id: integer (nullable = true)\n", 87 | " |-- first_name: string (nullable = true)\n", 88 | " |-- last_name: string (nullable = true)\n", 89 | " |-- salary: integer (nullable = true)\n", 90 | " |-- department_id: integer (nullable = true)\n", 91 | "\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "#Check Schema of DataFrame\n", 97 | "employeedf.printSchema()" 98 | ] 99 | }, 100 | { 101 | "cell_type": "code", 102 | "execution_count": 5, 103 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 104 | "metadata": {}, 105 | "outputs": [ 106 | { 107 | "name": "stdout", 108 | "output_type": "stream", 109 | "text": [ 110 | "+---+----------+---------+------+-------------+\n", 111 | "| id|first_name|last_name|salary|department_id|\n", 112 | "+---+----------+---------+------+-------------+\n", 113 | "| 45| Kevin| Duncan| 45210| 1003|\n", 114 | "| 25| Pamela| Matthews| 57944| 1005|\n", 115 | "| 48| Robert| Lynch|117960| 1004|\n", 116 | "| 34| Justin| Dunn| 67992| 1003|\n", 117 | "| 62| Dale| Hayes| 97662| 1005|\n", 118 | "| 1| Todd| Wilson|110000| 1006|\n", 119 | "| 61| Ryan| Brown|120000| 1003|\n", 120 | "| 21| Stephen| Berry|123617| 1002|\n", 121 | "| 13| Julie| Sanchez|210000| 1001|\n", 122 | "| 55| Michael| Morris|106799| 1005|\n", 123 | "| 44| Trevor| Carter| 38670| 1001|\n", 124 | "| 73| William| Preston|155225| 1003|\n", 125 | "| 39| Linda| Clark|186781| 1002|\n", 126 | "| 10| Sean| Crawford|190000| 1006|\n", 127 | "| 30| Stephen| Smith|194791| 1001|\n", 128 | "| 75| Julia| Ramos|105000| 1006|\n", 129 | "| 59| Kevin| Robinson|100924| 1005|\n", 130 | "| 69| Ernest| Peterson|115993| 1005|\n", 131 | "| 65| Deborah| Martin| 67389| 1004|\n", 132 | "| 63| Richard| Sanford|136083| 1001|\n", 133 | "+---+----------+---------+------+-------------+\n", 134 | "only showing top 20 rows\n", 135 | "\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "#Check sample Data \n", 141 | "employeedf.show()" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 6, 147 | "id": "c6b4f318-0d5f-4be1-b9df-7fe6b3b008dd", 148 | "metadata": {}, 149 | "outputs": [], 150 | "source": [ 151 | "#Load CSV file into DataFrame\n", 152 | "departmentdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"department.csv\")" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 7, 158 | "id": "f4c4435b-dbdd-4890-9c0c-5b6e680005d4", 159 | "metadata": {}, 160 | "outputs": [ 161 | { 162 | "name": "stdout", 163 | "output_type": "stream", 164 | 
"text": [ 165 | "root\n", 166 | " |-- department_id: integer (nullable = true)\n", 167 | " |-- department_name: string (nullable = true)\n", 168 | "\n" 169 | ] 170 | } 171 | ], 172 | "source": [ 173 | "#Check Schema of DataFrame\n", 174 | "departmentdf.printSchema()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 8, 180 | "id": "296c262a-a858-46a2-9bb3-38d212b52daf", 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "+-------------+---------------+\n", 188 | "|department_id|department_name|\n", 189 | "+-------------+---------------+\n", 190 | "| 1005| Sales|\n", 191 | "| 1002| Finanace|\n", 192 | "| 1004| Purchase|\n", 193 | "| 1001| Operations|\n", 194 | "| 1006| Marketing|\n", 195 | "| 1003| Technoogy|\n", 196 | "+-------------+---------------+\n", 197 | "\n" 198 | ] 199 | } 200 | ], 201 | "source": [ 202 | "#Check sample Data \n", 203 | "departmentdf.show()" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 10, 209 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171", 210 | "metadata": {}, 211 | "outputs": [ 212 | { 213 | "name": "stdout", 214 | "output_type": "stream", 215 | "text": [ 216 | "+---+----------+---------+------+-------------+-------------+---------------+\n", 217 | "| id|first_name|last_name|salary|department_id|department_id|department_name|\n", 218 | "+---+----------+---------+------+-------------+-------------+---------------+\n", 219 | "| 45| Kevin| Duncan| 45210| 1003| 1003| Technoogy|\n", 220 | "| 25| Pamela| Matthews| 57944| 1005| 1005| Sales|\n", 221 | "| 48| Robert| Lynch|117960| 1004| 1004| Purchase|\n", 222 | "| 34| Justin| Dunn| 67992| 1003| 1003| Technoogy|\n", 223 | "| 62| Dale| Hayes| 97662| 1005| 1005| Sales|\n", 224 | "| 1| Todd| Wilson|110000| 1006| 1006| Marketing|\n", 225 | "| 61| Ryan| Brown|120000| 1003| 1003| Technoogy|\n", 226 | "| 21| Stephen| Berry|123617| 1002| 1002| Finanace|\n", 227 | "| 13| Julie| Sanchez|210000| 1001| 1001| Operations|\n", 228 | "| 55| Michael| Morris|106799| 1005| 1005| Sales|\n", 229 | "| 44| Trevor| Carter| 38670| 1001| 1001| Operations|\n", 230 | "| 73| William| Preston|155225| 1003| 1003| Technoogy|\n", 231 | "| 39| Linda| Clark|186781| 1002| 1002| Finanace|\n", 232 | "| 10| Sean| Crawford|190000| 1006| 1006| Marketing|\n", 233 | "| 30| Stephen| Smith|194791| 1001| 1001| Operations|\n", 234 | "| 75| Julia| Ramos|105000| 1006| 1006| Marketing|\n", 235 | "| 59| Kevin| Robinson|100924| 1005| 1005| Sales|\n", 236 | "| 69| Ernest| Peterson|115993| 1005| 1005| Sales|\n", 237 | "| 65| Deborah| Martin| 67389| 1004| 1004| Purchase|\n", 238 | "| 63| Richard| Sanford|136083| 1001| 1001| Operations|\n", 239 | "+---+----------+---------+------+-------------+-------------+---------------+\n", 240 | "only showing top 20 rows\n", 241 | "\n" 242 | ] 243 | } 244 | ], 245 | "source": [ 246 | "#Solving Problem using PySpark \n", 247 | "# 2. Provide count of employees in each department with department name. 
\n", 248 | "\n", 249 | "joineddf = departmentdf.join(employeedf, employeedf.department_id == departmentdf.department_id,\"left\")\n", 250 | "joineddf.show()" 251 | ] 252 | }, 253 | { 254 | "cell_type": "code", 255 | "execution_count": 12, 256 | "id": "79d16d14-c013-416c-9552-d95e0900d4f8", 257 | "metadata": {}, 258 | "outputs": [ 259 | { 260 | "name": "stdout", 261 | "output_type": "stream", 262 | "text": [ 263 | "+---------------+-----+\n", 264 | "|department_name|count|\n", 265 | "+---------------+-----+\n", 266 | "| Purchase| 12|\n", 267 | "| Sales| 15|\n", 268 | "| Finanace| 15|\n", 269 | "| Technoogy| 14|\n", 270 | "| Marketing| 8|\n", 271 | "| Operations| 11|\n", 272 | "+---------------+-----+\n", 273 | "\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "joineddf.groupBy(\"department_name\").count().show()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": 13, 284 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 285 | "metadata": {}, 286 | "outputs": [], 287 | "source": [ 288 | "# Now we are solving Same problem using Spark SQL \n", 289 | "# Creating Temp Table or HIVE table\n", 290 | "employeedf.createOrReplaceTempView(\"tmpEmployee\")\n", 291 | "departmentdf.createOrReplaceTempView(\"tmpDepartment\")" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 14, 297 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "+---+----------+---------+------+-------------+\n", 305 | "| id|first_name|last_name|salary|department_id|\n", 306 | "+---+----------+---------+------+-------------+\n", 307 | "| 45| Kevin| Duncan| 45210| 1003|\n", 308 | "| 25| Pamela| Matthews| 57944| 1005|\n", 309 | "| 48| Robert| Lynch|117960| 1004|\n", 310 | "| 34| Justin| Dunn| 67992| 1003|\n", 311 | "| 62| Dale| Hayes| 97662| 1005|\n", 312 | "| 1| Todd| Wilson|110000| 1006|\n", 313 | "| 61| Ryan| Brown|120000| 1003|\n", 314 | "| 21| Stephen| Berry|123617| 1002|\n", 315 | "| 13| Julie| Sanchez|210000| 1001|\n", 316 | "| 55| Michael| Morris|106799| 1005|\n", 317 | "| 44| Trevor| Carter| 38670| 1001|\n", 318 | "| 73| William| Preston|155225| 1003|\n", 319 | "| 39| Linda| Clark|186781| 1002|\n", 320 | "| 10| Sean| Crawford|190000| 1006|\n", 321 | "| 30| Stephen| Smith|194791| 1001|\n", 322 | "| 75| Julia| Ramos|105000| 1006|\n", 323 | "| 59| Kevin| Robinson|100924| 1005|\n", 324 | "| 69| Ernest| Peterson|115993| 1005|\n", 325 | "| 65| Deborah| Martin| 67389| 1004|\n", 326 | "| 63| Richard| Sanford|136083| 1001|\n", 327 | "+---+----------+---------+------+-------------+\n", 328 | "only showing top 20 rows\n", 329 | "\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "# Now we have SQL Table and we can write SQL Query on top of that \n", 335 | "# For example by Select on table \n", 336 | "sqlContext.sql(\"SELECT * FROM tmpEmployee\").show()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 15, 342 | "id": "d4ac25f9-cd26-44dc-9852-ee0fbae70fd1", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "name": "stdout", 347 | "output_type": "stream", 348 | "text": [ 349 | "+-------------+---------------+\n", 350 | "|department_id|department_name|\n", 351 | "+-------------+---------------+\n", 352 | "| 1005| Sales|\n", 353 | "| 1002| Finanace|\n", 354 | "| 1004| Purchase|\n", 355 | "| 1001| Operations|\n", 356 | "| 1006| Marketing|\n", 357 | "| 1003| Technoogy|\n", 358 | "+-------------+---------------+\n", 359 | "\n" 360 | ] 
361 | } 362 | ], 363 | "source": [ 364 | "sqlContext.sql(\"SELECT * FROM tmpDepartment\").show()" 365 | ] 366 | }, 367 | { 368 | "cell_type": "code", 369 | "execution_count": 16, 370 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24", 371 | "metadata": {}, 372 | "outputs": [ 373 | { 374 | "name": "stdout", 375 | "output_type": "stream", 376 | "text": [ 377 | "+---------------+-----------------+\n", 378 | "|department_name|count_of_employee|\n", 379 | "+---------------+-----------------+\n", 380 | "| Purchase| 12|\n", 381 | "| Sales| 15|\n", 382 | "| Finanace| 15|\n", 383 | "| Technoogy| 14|\n", 384 | "| Marketing| 8|\n", 385 | "| Operations| 11|\n", 386 | "+---------------+-----------------+\n", 387 | "\n" 388 | ] 389 | } 390 | ], 391 | "source": [ 392 | "# Now we will write query to get the count of employees in each department \n", 393 | "# so we will use SQL Group by with a left outer join \n", 394 | "sqlContext.sql(\"SELECT department.department_name, count(*) as count_of_employee \\\n", 395 | " FROM tmpDepartment as department \\\n", 396 | " LEFT OUTER JOIN tmpEmployee as emp \\\n", 397 | " ON emp.department_id = department.department_id \\\n", 398 | " GROUP BY department.department_name\").show(n=100)\n", 399 | "\n" 400 | ] 401 | } 402 | ], 403 | "metadata": { 404 | "kernelspec": { 405 | "display_name": "Python 3 (ipykernel)", 406 | "language": "python", 407 | "name": "python3" 408 | }, 409 | "language_info": { 410 | "codemirror_mode": { 411 | "name": "ipython", 412 | "version": 3 413 | }, 414 | "file_extension": ".py", 415 | "mimetype": "text/x-python", 416 | "name": "python", 417 | "nbconvert_exporter": "python", 418 | "pygments_lexer": "ipython3", 419 | "version": "3.8.13" 420 | } 421 | }, 422 | "nbformat": 4, 423 | "nbformat_minor": 5 424 | } 425 | -------------------------------------------------------------------------------- /Problem 2/problem2_1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 | "metadata": {}, 7 | "source": [ 8 | "Here, we will solve problems two ways\n", 9 | "1. First using PySpark function \n", 10 | "2. 
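One caveat on problem2_2.ipynb above: because the join condition references department_id through both DataFrames, the joined output carries two identical department_id columns, and because the notebook left-joins from the department side, count(*) would still report 1 for a department with no employees at all. A minimal sketch of both fixes, assuming the employeedf and departmentdf DataFrames loaded in that notebook (the alias count_of_employee mirrors the SQL cell):

from pyspark.sql import functions as F

# Joining on the column name instead of df1.col == df2.col keeps a single
# department_id column in the result
joineddf = departmentdf.join(employeedf, on="department_id", how="left")

# F.count("id") counts only matched employee rows, so an empty department
# shows 0 rather than the 1 that count(*) reports for its all-NULL row
joineddf.groupBy("department_name") \
    .agg(F.count("id").alias("count_of_employee")) \
    .show()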
Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First Load all the required library and also Start Spark Session\n", 21 | "# Load all the required library\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/08 11:06:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#Start Spark Session\n", 49 | "spark = SparkSession.builder.appName(\"problem2\").getOrCreate()\n", 50 | "sqlContext = SparkSession(spark)\n", 51 | "#Dont Show warning only error\n", 52 | "spark.sparkContext.setLogLevel(\"ERROR\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 59 | "metadata": {}, 60 | "outputs": [ 61 | { 62 | "name": "stderr", 63 | "output_type": "stream", 64 | "text": [ 65 | " \r" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "#Load CSV file into DataFrame\n", 71 | "employeedf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"employee_salary.csv\")" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 4, 77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 78 | "metadata": {}, 79 | "outputs": [ 80 | { 81 | "name": "stdout", 82 | "output_type": "stream", 83 | "text": [ 84 | "root\n", 85 | " |-- id: integer (nullable = true)\n", 86 | " |-- first_name: string (nullable = true)\n", 87 | " |-- last_name: string (nullable = true)\n", 88 | " |-- salary: integer (nullable = true)\n", 89 | " |-- department_id: integer (nullable = true)\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "#Check Schema of DataFrame\n", 96 | "employeedf.printSchema()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 5, 102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "+---+----------+---------+------+-------------+\n", 110 | "| id|first_name|last_name|salary|department_id|\n", 111 | "+---+----------+---------+------+-------------+\n", 112 | "| 45| Kevin| Duncan| 45210| 1003|\n", 113 | "| 25| Pamela| Matthews| 57944| 1005|\n", 114 | "| 48| Robert| Lynch|117960| 1004|\n", 115 | "| 34| Justin| Dunn| 67992| 1003|\n", 116 | "| 62| 
Dale| Hayes| 97662| 1005|\n", 117 | "| 1| Todd| Wilson|110000| 1006|\n", 118 | "| 61| Ryan| Brown|120000| 1003|\n", 119 | "| 21| Stephen| Berry|123617| 1002|\n", 120 | "| 13| Julie| Sanchez|210000| 1001|\n", 121 | "| 55| Michael| Morris|106799| 1005|\n", 122 | "| 44| Trevor| Carter| 38670| 1001|\n", 123 | "| 73| William| Preston|155225| 1003|\n", 124 | "| 39| Linda| Clark|186781| 1002|\n", 125 | "| 10| Sean| Crawford|190000| 1006|\n", 126 | "| 30| Stephen| Smith|194791| 1001|\n", 127 | "| 75| Julia| Ramos|105000| 1006|\n", 128 | "| 59| Kevin| Robinson|100924| 1005|\n", 129 | "| 69| Ernest| Peterson|115993| 1005|\n", 130 | "| 65| Deborah| Martin| 67389| 1004|\n", 131 | "| 63| Richard| Sanford|136083| 1001|\n", 132 | "+---+----------+---------+------+-------------+\n", 133 | "only showing top 20 rows\n", 134 | "\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "#Check sample Data \n", 140 | "employeedf.show()" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 6, 146 | "id": "c6b4f318-0d5f-4be1-b9df-7fe6b3b008dd", 147 | "metadata": {}, 148 | "outputs": [], 149 | "source": [ 150 | "#Load CSV file into DataFrame\n", 151 | "departmentdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"department.csv\")" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 8, 157 | "id": "f4c4435b-dbdd-4890-9c0c-5b6e680005d4", 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "root\n", 165 | " |-- department_id: integer (nullable = true)\n", 166 | " |-- department_name: string (nullable = true)\n", 167 | "\n" 168 | ] 169 | } 170 | ], 171 | "source": [ 172 | "#Check Schema of DataFrame\n", 173 | "departmentdf.printSchema()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 9, 179 | "id": "296c262a-a858-46a2-9bb3-38d212b52daf", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "+-------------+---------------+\n", 187 | "|department_id|department_name|\n", 188 | "+-------------+---------------+\n", 189 | "| 1005| Sales|\n", 190 | "| 1002| Finanace|\n", 191 | "| 1004| Purchase|\n", 192 | "| 1001| Operations|\n", 193 | "| 1006| Marketing|\n", 194 | "| 1003| Technoogy|\n", 195 | "+-------------+---------------+\n", 196 | "\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "#Check sample Data \n", 202 | "departmentdf.show()" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 16, 208 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "name": "stdout", 213 | "output_type": "stream", 214 | "text": [ 215 | "+---+----------+---------+------+-------------+-------------+---------------+\n", 216 | "| id|first_name|last_name|salary|department_id|department_id|department_name|\n", 217 | "+---+----------+---------+------+-------------+-------------+---------------+\n", 218 | "| 45| Kevin| Duncan| 45210| 1003| 1003| Technoogy|\n", 219 | "| 25| Pamela| Matthews| 57944| 1005| 1005| Sales|\n", 220 | "| 48| Robert| Lynch|117960| 1004| 1004| Purchase|\n", 221 | "| 34| Justin| Dunn| 67992| 1003| 1003| Technoogy|\n", 222 | "| 62| Dale| Hayes| 97662| 1005| 1005| Sales|\n", 223 | "| 1| Todd| Wilson|110000| 1006| 1006| Marketing|\n", 224 | "| 61| Ryan| Brown|120000| 1003| 1003| Technoogy|\n", 225 | "| 21| Stephen| Berry|123617| 1002| 1002| Finanace|\n", 226 | "| 13| Julie| 
Sanchez|210000| 1001| 1001| Operations|\n", 227 | "| 55| Michael| Morris|106799| 1005| 1005| Sales|\n", 228 | "| 44| Trevor| Carter| 38670| 1001| 1001| Operations|\n", 229 | "| 73| William| Preston|155225| 1003| 1003| Technoogy|\n", 230 | "| 39| Linda| Clark|186781| 1002| 1002| Finanace|\n", 231 | "| 10| Sean| Crawford|190000| 1006| 1006| Marketing|\n", 232 | "| 30| Stephen| Smith|194791| 1001| 1001| Operations|\n", 233 | "| 75| Julia| Ramos|105000| 1006| 1006| Marketing|\n", 234 | "| 59| Kevin| Robinson|100924| 1005| 1005| Sales|\n", 235 | "| 69| Ernest| Peterson|115993| 1005| 1005| Sales|\n", 236 | "| 65| Deborah| Martin| 67389| 1004| 1004| Purchase|\n", 237 | "| 63| Richard| Sanford|136083| 1001| 1001| Operations|\n", 238 | "+---+----------+---------+------+-------------+-------------+---------------+\n", 239 | "only showing top 20 rows\n", 240 | "\n" 241 | ] 242 | } 243 | ], 244 | "source": [ 245 | "#Solving Problem using PySpark \n", 246 | "# 1. Use both tables and list all the employees working in the Marketing department, from highest to lowest salary. \n", 247 | "\n", 248 | "joineddf = employeedf.join(departmentdf, employeedf.department_id == departmentdf.department_id,\"left\")\n", 249 | "joineddf.show()" 250 | ] 251 | }, 252 | { 253 | "cell_type": "code", 254 | "execution_count": 20, 255 | "id": "79d16d14-c013-416c-9552-d95e0900d4f8", 256 | "metadata": {}, 257 | "outputs": [ 258 | { 259 | "name": "stdout", 260 | "output_type": "stream", 261 | "text": [ 262 | "+----------+---------+------+\n", 263 | "|first_name|last_name|salary|\n", 264 | "+----------+---------+------+\n", 265 | "| Sean| Crawford|190000|\n", 266 | "| Danielle| Williams|120000|\n", 267 | "| Todd| Wilson|110000|\n", 268 | "| Julia| Ramos|105000|\n", 269 | "| Eric|Zimmerman| 83093|\n", 270 | "| Jason| Olsen| 51937|\n", 271 | "| Jason| Burnett| 42525|\n", 272 | "| Philip|Gillespie| 36424|\n", 273 | "+----------+---------+------+\n", 274 | "\n" 275 | ] 276 | } 277 | ], 278 | "source": [ 279 | "from pyspark.sql.functions import desc\n", 280 | "joineddf.select(\"first_name\",\"last_name\",\"salary\").where(\"department_name='Marketing'\").orderBy(desc(\"salary\")).show()" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 21, 286 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 287 | "metadata": {}, 288 | "outputs": [], 289 | "source": [ 290 | "# Now we are solving the same problem using Spark SQL \n", 291 | "# Creating Temp Table or HIVE table\n", 292 | "employeedf.createOrReplaceTempView(\"tmpEmployee\")\n", 293 | "departmentdf.createOrReplaceTempView(\"tmpDepartment\")" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 22, 299 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "+---+----------+---------+------+-------------+\n", 307 | "| id|first_name|last_name|salary|department_id|\n", 308 | "+---+----------+---------+------+-------------+\n", 309 | "| 45| Kevin| Duncan| 45210| 1003|\n", 310 | "| 25| Pamela| Matthews| 57944| 1005|\n", 311 | "| 48| Robert| Lynch|117960| 1004|\n", 312 | "| 34| Justin| Dunn| 67992| 1003|\n", 313 | "| 62| Dale| Hayes| 97662| 1005|\n", 314 | "| 1| Todd| Wilson|110000| 1006|\n", 315 | "| 61| Ryan| Brown|120000| 1003|\n", 316 | "| 21| Stephen| Berry|123617| 1002|\n", 317 | "| 13| Julie| Sanchez|210000| 1001|\n", 318 | "| 55| Michael| Morris|106799| 1005|\n", 319 | "| 44| Trevor| Carter| 38670| 1001|\n", 320 | "| 73|
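Note that the joined output above carries two identical department_id columns, one from each input, because the join condition is written as an expression. If that duplication is unwanted, joining on the column name deduplicates the key. A sketch (same data, same join semantics):

# Joining on the column name keeps a single department_id column
joineddf = employeedf.join(departmentdf, on="department_id", how="left")
joineddf.printSchema()  # department_id now appears only once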
William| Preston|155225| 1003|\n", 321 | "| 39| Linda| Clark|186781| 1002|\n", 322 | "| 10| Sean| Crawford|190000| 1006|\n", 323 | "| 30| Stephen| Smith|194791| 1001|\n", 324 | "| 75| Julia| Ramos|105000| 1006|\n", 325 | "| 59| Kevin| Robinson|100924| 1005|\n", 326 | "| 69| Ernest| Peterson|115993| 1005|\n", 327 | "| 65| Deborah| Martin| 67389| 1004|\n", 328 | "| 63| Richard| Sanford|136083| 1001|\n", 329 | "+---+----------+---------+------+-------------+\n", 330 | "only showing top 20 rows\n", 331 | "\n" 332 | ] 333 | } 334 | ], 335 | "source": [ 336 | "# Now we have SQL tables and we can write SQL queries on top of them \n", 337 | "# For example, a simple SELECT on the table \n", 338 | "sqlContext.sql(\"SELECT * FROM tmpEmployee\").show()" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 23, 344 | "id": "d4ac25f9-cd26-44dc-9852-ee0fbae70fd1", 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "+-------------+---------------+\n", 352 | "|department_id|department_name|\n", 353 | "+-------------+---------------+\n", 354 | "| 1005| Sales|\n", 355 | "| 1002| Finanace|\n", 356 | "| 1004| Purchase|\n", 357 | "| 1001| Operations|\n", 358 | "| 1006| Marketing|\n", 359 | "| 1003| Technoogy|\n", 360 | "+-------------+---------------+\n", 361 | "\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "sqlContext.sql(\"SELECT * FROM tmpDepartment\").show()" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 24, 372 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24", 373 | "metadata": {}, 374 | "outputs": [ 375 | { 376 | "name": "stdout", 377 | "output_type": "stream", 378 | "text": [ 379 | "+----------+---------+------+\n", 380 | "|first_name|last_name|salary|\n", 381 | "+----------+---------+------+\n", 382 | "| Sean| Crawford|190000|\n", 383 | "| Danielle| Williams|120000|\n", 384 | "| Todd| Wilson|110000|\n", 385 | "| Julia| Ramos|105000|\n", 386 | "| Eric|Zimmerman| 83093|\n", 387 | "| Jason| Olsen| 51937|\n", 388 | "| Jason| Burnett| 42525|\n", 389 | "| Philip|Gillespie| 36424|\n", 390 | "+----------+---------+------+\n", 391 | "\n" 392 | ] 393 | } 394 | ], 395 | "source": [ 396 | "# Now we will write the query for the actual problem: employees in Marketing, \n", 397 | "# so we will use a SQL LEFT OUTER JOIN with a WHERE filter and ORDER BY \n", 398 | "sqlContext.sql(\"SELECT first_name, last_name, salary \\\n", 399 | " FROM tmpEmployee as emp \\\n", 400 | " LEFT OUTER JOIN tmpDepartment as department \\\n", 401 | " ON emp.department_id = department.department_id \\\n", 402 | " WHERE department.department_name = 'Marketing' \\\n", 403 | " ORDER BY salary DESC\").show(n=100)\n", 404 | "\n" 405 | ] 406 | } 407 | ], 408 | "metadata": { 409 | "kernelspec": { 410 | "display_name": "Python 3 (ipykernel)", 411 | "language": "python", 412 | "name": "python3" 413 | }, 414 | "language_info": { 415 | "codemirror_mode": { 416 | "name": "ipython", 417 | "version": 3 418 | }, 419 | "file_extension": ".py", 420 | "mimetype": "text/x-python", 421 | "name": "python", 422 | "nbconvert_exporter": "python", 423 | "pygments_lexer": "ipython3", 424 | "version": "3.8.13" 425 | } 426 | }, 427 | "nbformat": 4, 428 | "nbformat_minor": 5 429 | } 430 | -------------------------------------------------------------------------------- /Problem 1/problem1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c", 6 |
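One aside on session handling in these notebooks: since Spark 2.x the SparkSession returned by the builder exposes .sql() directly, so the sqlContext = SparkSession(spark) wrapper is unnecessary. The final Marketing query issued straight on the session, as a sketch (assumes the temp views created above):

spark.sql("""
    SELECT first_name, last_name, salary
    FROM tmpEmployee AS emp
    LEFT OUTER JOIN tmpDepartment AS department
        ON emp.department_id = department.department_id
    WHERE department.department_name = 'Marketing'
    ORDER BY salary DESC
""").show(n=100)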
"metadata": {}, 7 | "source": [ 8 | "Here, we will solve problems two ways\n", 9 | "1. First using PySpark function \n", 10 | "2. Second using Spark SQL" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "# First Load all the required library and also Start Spark Session\n", 21 | "# Load all the required library\n", 22 | "from pyspark.sql import SparkSession" 23 | ] 24 | }, 25 | { 26 | "cell_type": "code", 27 | "execution_count": 2, 28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09", 29 | "metadata": {}, 30 | "outputs": [ 31 | { 32 | "name": "stderr", 33 | "output_type": "stream", 34 | "text": [ 35 | "WARNING: An illegal reflective access operation has occurred\n", 36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n", 37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n", 38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n", 39 | "WARNING: All illegal access operations will be denied in a future release\n", 40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n", 41 | "Setting default log level to \"WARN\".\n", 42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 43 | "23/02/03 10:13:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 44 | ] 45 | } 46 | ], 47 | "source": [ 48 | "#Start Spark Session\n", 49 | "spark = SparkSession.builder.appName(\"problem1\").getOrCreate()\n", 50 | "sqlContext = SparkSession(spark)\n", 51 | "#Dont Show warning only error\n", 52 | "spark.sparkContext.setLogLevel(\"ERROR\")" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 17, 58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68", 59 | "metadata": {}, 60 | "outputs": [], 61 | "source": [ 62 | "#Load CSV file into DataFrame\n", 63 | "employeedf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"employee.csv\")" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 18, 69 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030", 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "root\n", 77 | " |-- id: integer (nullable = true)\n", 78 | " |-- first_name: string (nullable = true)\n", 79 | " |-- last_name: string (nullable = true)\n", 80 | " |-- salary: integer (nullable = true)\n", 81 | " |-- department_id: integer (nullable = true)\n", 82 | "\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "#Check Schema of DataFrame\n", 88 | "employeedf.printSchema()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 19, 94 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+---+----------+---------+------+-------------+\n", 102 | "| id|first_name|last_name|salary|department_id|\n", 103 | "+---+----------+---------+------+-------------+\n", 104 | "| 1| Todd| Wilson|110000| 1006|\n", 105 | "| 1| Todd| Wilson|106119| 1006|\n", 106 | "| 2| Justin| Simon|128922| 1005|\n", 107 | "| 2| Justin| Simon|130000| 1005|\n", 108 | "| 3| 
Kelly| Rosario| 42689| 1002|\n", 109 | "| 4| Patricia| Powell|162825| 1004|\n", 110 | "| 4| Patricia| Powell|170000| 1004|\n", 111 | "| 5| Sherry| Golden| 44101| 1002|\n", 112 | "| 6| Natasha| Swanson| 79632| 1005|\n", 113 | "| 6| Natasha| Swanson| 90000| 1005|\n", 114 | "| 7| Diane| Gordon| 74591| 1002|\n", 115 | "| 8| Mercedes|Rodriguez| 61048| 1005|\n", 116 | "| 9| Christy| Mitchell|137236| 1001|\n", 117 | "| 9| Christy| Mitchell|140000| 1001|\n", 118 | "| 9| Christy| Mitchell|150000| 1001|\n", 119 | "| 10| Sean| Crawford|182065| 1006|\n", 120 | "| 10| Sean| Crawford|190000| 1006|\n", 121 | "| 11| Kevin| Townsend|166861| 1002|\n", 122 | "| 12| Joshua| Johnson|123082| 1004|\n", 123 | "| 13| Julie| Sanchez|185663| 1001|\n", 124 | "+---+----------+---------+------+-------------+\n", 125 | "only showing top 20 rows\n", 126 | "\n" 127 | ] 128 | } 129 | ], 130 | "source": [ 131 | "#Check sample Data \n", 132 | "employeedf.show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 20, 138 | "id": "c6b4f318-0d5f-4be1-b9df-7fe6b3b008dd", 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "data": { 143 | "text/plain": [ 144 | "95" 145 | ] 146 | }, 147 | "execution_count": 20, 148 | "metadata": {}, 149 | "output_type": "execute_result" 150 | } 151 | ], 152 | "source": [ 153 | "#Checking number of rows in dataframe\n", 154 | "employeedf.count()" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 28, 160 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171", 161 | "metadata": {}, 162 | "outputs": [ 163 | { 164 | "name": "stdout", 165 | "output_type": "stream", 166 | "text": [ 167 | "+---+----------+---------+-------------+------+\n", 168 | "| id|first_name|last_name|department_id|salary|\n", 169 | "+---+----------+---------+-------------+------+\n", 170 | "| 1| Todd| Wilson| 1006|110000|\n", 171 | "| 1| Todd| Wilson| 1006|106119|\n", 172 | "| 2| Justin| Simon| 1005|128922|\n", 173 | "| 2| Justin| Simon| 1005|130000|\n", 174 | "| 3| Kelly| Rosario| 1002| 42689|\n", 175 | "| 4| Patricia| Powell| 1004|170000|\n", 176 | "| 4| Patricia| Powell| 1004|162825|\n", 177 | "| 5| Sherry| Golden| 1002| 44101|\n", 178 | "| 6| Natasha| Swanson| 1005| 79632|\n", 179 | "| 6| Natasha| Swanson| 1005| 90000|\n", 180 | "| 7| Diane| Gordon| 1002| 74591|\n", 181 | "| 8| Mercedes|Rodriguez| 1005| 61048|\n", 182 | "| 9| Christy| Mitchell| 1001|140000|\n", 183 | "| 9| Christy| Mitchell| 1001|150000|\n", 184 | "| 9| Christy| Mitchell| 1001|137236|\n", 185 | "| 10| Sean| Crawford| 1006|182065|\n", 186 | "| 10| Sean| Crawford| 1006|190000|\n", 187 | "| 11| Kevin| Townsend| 1002|166861|\n", 188 | "| 12| Joshua| Johnson| 1004|123082|\n", 189 | "| 13| Julie| Sanchez| 1001|185663|\n", 190 | "+---+----------+---------+-------------+------+\n", 191 | "only showing top 20 rows\n", 192 | "\n" 193 | ] 194 | } 195 | ], 196 | "source": [ 197 | "#Solving Problem using PySpark \n", 198 | "# 1. We need to print latest salary of each employee\n", 199 | "# 2. We also need their id, first name, lastname, department id and latest salary \n", 200 | "# 3. 
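The count of 95 deserves a second look: as the grouped result later in the notebook shows, there are only 75 distinct employees, so 20 of the rows are additional salary records for existing ids. A quick sketch to quantify that (illustrative; assumes id identifies an employee):

total_rows = employeedf.count()                          # 95
unique_ids = employeedf.select("id").distinct().count()  # 75
print(total_rows - unique_ids, "extra salary records")   # 20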
We also want to order by it by id \n", 201 | "\n", 202 | "# On a first step we are just getting all the columns and doing order by \n", 203 | "\n", 204 | "employeedf.select(\"id\",\"first_name\",\"last_name\",\"department_id\",\"salary\").orderBy(\"id\").show()\n" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 33, 210 | "id": "0e256a4b-450f-4846-ba97-1e51812d590e", 211 | "metadata": {}, 212 | "outputs": [ 213 | { 214 | "name": "stdout", 215 | "output_type": "stream", 216 | "text": [ 217 | "+---+----------+---------+-------------+-----------+\n", 218 | "| id|first_name|last_name|department_id|max(salary)|\n", 219 | "+---+----------+---------+-------------+-----------+\n", 220 | "| 1| Todd| Wilson| 1006| 110000|\n", 221 | "| 2| Justin| Simon| 1005| 130000|\n", 222 | "| 3| Kelly| Rosario| 1002| 42689|\n", 223 | "| 4| Patricia| Powell| 1004| 170000|\n", 224 | "| 5| Sherry| Golden| 1002| 44101|\n", 225 | "| 6| Natasha| Swanson| 1005| 90000|\n", 226 | "| 7| Diane| Gordon| 1002| 74591|\n", 227 | "| 8| Mercedes|Rodriguez| 1005| 61048|\n", 228 | "| 9| Christy| Mitchell| 1001| 150000|\n", 229 | "| 10| Sean| Crawford| 1006| 190000|\n", 230 | "| 11| Kevin| Townsend| 1002| 166861|\n", 231 | "| 12| Joshua| Johnson| 1004| 123082|\n", 232 | "| 13| Julie| Sanchez| 1001| 210000|\n", 233 | "| 14| John| Coleman| 1001| 152434|\n", 234 | "| 15| Anthony| Valdez| 1001| 96898|\n", 235 | "| 16| Briana| Rivas| 1005| 151668|\n", 236 | "| 17| Jason| Burnett| 1006| 42525|\n", 237 | "| 18| Jeffrey| Harris| 1002| 20000|\n", 238 | "| 19| Michael| Ramsey| 1003| 63159|\n", 239 | "| 20| Cody| Gonzalez| 1004| 112809|\n", 240 | "| 21| Stephen| Berry| 1002| 123617|\n", 241 | "| 22| Brittany| Scott| 1002| 162537|\n", 242 | "| 23| Angela| Williams| 1004| 100875|\n", 243 | "| 24| William| Flores| 1003| 142674|\n", 244 | "| 25| Pamela| Matthews| 1005| 57944|\n", 245 | "| 26| Allison| Johnson| 1001| 128782|\n", 246 | "| 27| Anthony| Ball| 1003| 34386|\n", 247 | "| 28| Alexis| Beck| 1005| 12260|\n", 248 | "| 29| Jason| Olsen| 1006| 51937|\n", 249 | "| 30| Stephen| Smith| 1001| 194791|\n", 250 | "| 31| Kimberly| Brooks| 1003| 95327|\n", 251 | "| 32| Eric|Zimmerman| 1006| 83093|\n", 252 | "| 33| Peter| Holt| 1002| 69945|\n", 253 | "| 34| Justin| Dunn| 1003| 67992|\n", 254 | "| 35| John| Ball| 1004| 47795|\n", 255 | "| 36| Jesus| Ward| 1005| 36078|\n", 256 | "| 37| Philip|Gillespie| 1006| 36424|\n", 257 | "| 38| Nicole| Lewis| 1001| 114079|\n", 258 | "| 39| Linda| Clark| 1002| 186781|\n", 259 | "| 40| Colleen| Carrillo| 1004| 147723|\n", 260 | "| 41| John| George| 1001| 21642|\n", 261 | "| 42| Traci| Williams| 1003| 180000|\n", 262 | "| 43| Joseph| Rogers| 1005| 22800|\n", 263 | "| 44| Trevor| Carter| 1001| 38670|\n", 264 | "| 45| Kevin| Duncan| 1003| 45210|\n", 265 | "| 46| Joshua| Ewing| 1003| 73088|\n", 266 | "| 47| Kimberly| Dean| 1003| 71416|\n", 267 | "| 48| Robert| Lynch| 1004| 117960|\n", 268 | "| 49| Amber| Harding| 1002| 77764|\n", 269 | "| 50| Victoria| Wilson| 1002| 176620|\n", 270 | "| 51| Theresa| Everett| 1002| 31404|\n", 271 | "| 52| Kara| Smith| 1004| 192838|\n", 272 | "| 53| Teresa| Cohen| 1001| 98860|\n", 273 | "| 54| Wesley| Tucker| 1005| 90221|\n", 274 | "| 55| Michael| Morris| 1005| 106799|\n", 275 | "| 56| Rachael| Williams| 1002| 103585|\n", 276 | "| 57| Patricia| Harmon| 1005| 147417|\n", 277 | "| 58| Edward| Sharp| 1005| 41077|\n", 278 | "| 59| Kevin| Robinson| 1005| 100924|\n", 279 | "| 60| Charles| Pearson| 1004| 173317|\n", 280 | "| 61| Ryan| Brown| 1003| 120000|\n", 281 | "| 
62| Dale| Hayes| 1005| 97662|\n", 282 | "| 63| Richard| Sanford| 1001| 136083|\n", 283 | "| 64| Danielle| Williams| 1006| 120000|\n", 284 | "| 65| Deborah| Martin| 1004| 67389|\n", 285 | "| 66| Dustin| Bush| 1004| 47567|\n", 286 | "| 67| Tyler| Green| 1002| 111085|\n", 287 | "| 68| Antonio|Carpenter| 1002| 83684|\n", 288 | "| 69| Ernest| Peterson| 1005| 115993|\n", 289 | "| 70| Karen|Fernandez| 1003| 101238|\n", 290 | "| 71| Kristine| Casey| 1003| 67651|\n", 291 | "| 72| Christine| Frye| 1004| 137244|\n", 292 | "| 73| William| Preston| 1003| 155225|\n", 293 | "| 74| Richard| Cole| 1003| 180361|\n", 294 | "| 75| Julia| Ramos| 1006| 105000|\n", 295 | "+---+----------+---------+-------------+-----------+\n", 296 | "\n" 297 | ] 298 | } 299 | ], 300 | "source": [ 301 | "# Now we will use group by function and get max salary for each employee \n", 302 | "employeedf.groupBy(\"id\",\"first_name\",\"last_name\",\"department_id\").max(\"salary\").orderBy(\"id\").show(n=100)\n", 303 | "# We can also store result into dataframe\n", 304 | "finaldf = employeedf.groupBy(\"id\",\"first_name\",\"last_name\",\"department_id\").max(\"salary\").orderBy(\"id\")" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 34, 310 | "id": "bd049fee-18f9-48b0-a935-885614e744d3", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "data": { 315 | "text/plain": [ 316 | "75" 317 | ] 318 | }, 319 | "execution_count": 34, 320 | "metadata": {}, 321 | "output_type": "execute_result" 322 | } 323 | ], 324 | "source": [ 325 | "# Final result into final dataframe\n", 326 | "finaldf.count()" 327 | ] 328 | }, 329 | { 330 | "cell_type": "code", 331 | "execution_count": 35, 332 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1", 333 | "metadata": {}, 334 | "outputs": [], 335 | "source": [ 336 | "# Now we are solving Same problem using Spark SQL \n", 337 | "# Creating Temp Table or HIVE table\n", 338 | "employeedf.createOrReplaceTempView(\"tmpEmployee\")" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 36, 344 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c", 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "name": "stdout", 349 | "output_type": "stream", 350 | "text": [ 351 | "+---+----------+---------+------+-------------+\n", 352 | "| id|first_name|last_name|salary|department_id|\n", 353 | "+---+----------+---------+------+-------------+\n", 354 | "| 1| Todd| Wilson|110000| 1006|\n", 355 | "| 1| Todd| Wilson|106119| 1006|\n", 356 | "| 2| Justin| Simon|128922| 1005|\n", 357 | "| 2| Justin| Simon|130000| 1005|\n", 358 | "| 3| Kelly| Rosario| 42689| 1002|\n", 359 | "| 4| Patricia| Powell|162825| 1004|\n", 360 | "| 4| Patricia| Powell|170000| 1004|\n", 361 | "| 5| Sherry| Golden| 44101| 1002|\n", 362 | "| 6| Natasha| Swanson| 79632| 1005|\n", 363 | "| 6| Natasha| Swanson| 90000| 1005|\n", 364 | "| 7| Diane| Gordon| 74591| 1002|\n", 365 | "| 8| Mercedes|Rodriguez| 61048| 1005|\n", 366 | "| 9| Christy| Mitchell|137236| 1001|\n", 367 | "| 9| Christy| Mitchell|140000| 1001|\n", 368 | "| 9| Christy| Mitchell|150000| 1001|\n", 369 | "| 10| Sean| Crawford|182065| 1006|\n", 370 | "| 10| Sean| Crawford|190000| 1006|\n", 371 | "| 11| Kevin| Townsend|166861| 1002|\n", 372 | "| 12| Joshua| Johnson|123082| 1004|\n", 373 | "| 13| Julie| Sanchez|185663| 1001|\n", 374 | "+---+----------+---------+------+-------------+\n", 375 | "only showing top 20 rows\n", 376 | "\n" 377 | ] 378 | } 379 | ], 380 | "source": [ 381 | "# Now we have SQL Table and we can write SQL Query on top of that \n", 382 | "# 
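Two possible refinements of the groupBy above, offered as sketches rather than cells from the original notebook. First, .agg() with .alias() replaces the awkward max(salary) column name. Second, a window function keeps whole rows instead of collapsing them, which helps if more per-employee columns were needed; note the data has no date column, so "latest" is proxied by the highest salary here, exactly as in the groupBy version:

from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Variant 1: same groupBy, but with a readable output column name
finaldf = (employeedf
           .groupBy("id", "first_name", "last_name", "department_id")
           .agg(F.max("salary").alias("latest_salary"))
           .orderBy("id"))

# Variant 2: rank each employee's salary records, keep the top one per id
w = Window.partitionBy("id").orderBy(F.desc("salary"))
latest = (employeedf
          .withColumn("rn", F.row_number().over(w))
          .where("rn = 1")
          .drop("rn")
          .orderBy("id"))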
For example by Select on table \n", 383 | "sqlContext.sql(\"SELECT * FROM tmpEmployee\").show()" 384 | ] 385 | }, 386 | { 387 | "cell_type": "code", 388 | "execution_count": 38, 389 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24", 390 | "metadata": {}, 391 | "outputs": [ 392 | { 393 | "name": "stdout", 394 | "output_type": "stream", 395 | "text": [ 396 | "+---+----------+---------+-----------+-------------+\n", 397 | "| id|first_name|last_name|LatesSalary|department_id|\n", 398 | "+---+----------+---------+-----------+-------------+\n", 399 | "| 1| Todd| Wilson| 110000| 1006|\n", 400 | "| 2| Justin| Simon| 130000| 1005|\n", 401 | "| 3| Kelly| Rosario| 42689| 1002|\n", 402 | "| 4| Patricia| Powell| 170000| 1004|\n", 403 | "| 5| Sherry| Golden| 44101| 1002|\n", 404 | "| 6| Natasha| Swanson| 90000| 1005|\n", 405 | "| 7| Diane| Gordon| 74591| 1002|\n", 406 | "| 8| Mercedes|Rodriguez| 61048| 1005|\n", 407 | "| 9| Christy| Mitchell| 150000| 1001|\n", 408 | "| 10| Sean| Crawford| 190000| 1006|\n", 409 | "| 11| Kevin| Townsend| 166861| 1002|\n", 410 | "| 12| Joshua| Johnson| 123082| 1004|\n", 411 | "| 13| Julie| Sanchez| 210000| 1001|\n", 412 | "| 14| John| Coleman| 152434| 1001|\n", 413 | "| 15| Anthony| Valdez| 96898| 1001|\n", 414 | "| 16| Briana| Rivas| 151668| 1005|\n", 415 | "| 17| Jason| Burnett| 42525| 1006|\n", 416 | "| 18| Jeffrey| Harris| 20000| 1002|\n", 417 | "| 19| Michael| Ramsey| 63159| 1003|\n", 418 | "| 20| Cody| Gonzalez| 112809| 1004|\n", 419 | "| 21| Stephen| Berry| 123617| 1002|\n", 420 | "| 22| Brittany| Scott| 162537| 1002|\n", 421 | "| 23| Angela| Williams| 100875| 1004|\n", 422 | "| 24| William| Flores| 142674| 1003|\n", 423 | "| 25| Pamela| Matthews| 57944| 1005|\n", 424 | "| 26| Allison| Johnson| 128782| 1001|\n", 425 | "| 27| Anthony| Ball| 34386| 1003|\n", 426 | "| 28| Alexis| Beck| 12260| 1005|\n", 427 | "| 29| Jason| Olsen| 51937| 1006|\n", 428 | "| 30| Stephen| Smith| 194791| 1001|\n", 429 | "| 31| Kimberly| Brooks| 95327| 1003|\n", 430 | "| 32| Eric|Zimmerman| 83093| 1006|\n", 431 | "| 33| Peter| Holt| 69945| 1002|\n", 432 | "| 34| Justin| Dunn| 67992| 1003|\n", 433 | "| 35| John| Ball| 47795| 1004|\n", 434 | "| 36| Jesus| Ward| 36078| 1005|\n", 435 | "| 37| Philip|Gillespie| 36424| 1006|\n", 436 | "| 38| Nicole| Lewis| 114079| 1001|\n", 437 | "| 39| Linda| Clark| 186781| 1002|\n", 438 | "| 40| Colleen| Carrillo| 147723| 1004|\n", 439 | "| 41| John| George| 21642| 1001|\n", 440 | "| 42| Traci| Williams| 180000| 1003|\n", 441 | "| 43| Joseph| Rogers| 22800| 1005|\n", 442 | "| 44| Trevor| Carter| 38670| 1001|\n", 443 | "| 45| Kevin| Duncan| 45210| 1003|\n", 444 | "| 46| Joshua| Ewing| 73088| 1003|\n", 445 | "| 47| Kimberly| Dean| 71416| 1003|\n", 446 | "| 48| Robert| Lynch| 117960| 1004|\n", 447 | "| 49| Amber| Harding| 77764| 1002|\n", 448 | "| 50| Victoria| Wilson| 176620| 1002|\n", 449 | "| 51| Theresa| Everett| 31404| 1002|\n", 450 | "| 52| Kara| Smith| 192838| 1004|\n", 451 | "| 53| Teresa| Cohen| 98860| 1001|\n", 452 | "| 54| Wesley| Tucker| 90221| 1005|\n", 453 | "| 55| Michael| Morris| 106799| 1005|\n", 454 | "| 56| Rachael| Williams| 103585| 1002|\n", 455 | "| 57| Patricia| Harmon| 147417| 1005|\n", 456 | "| 58| Edward| Sharp| 41077| 1005|\n", 457 | "| 59| Kevin| Robinson| 100924| 1005|\n", 458 | "| 60| Charles| Pearson| 173317| 1004|\n", 459 | "| 61| Ryan| Brown| 120000| 1003|\n", 460 | "| 62| Dale| Hayes| 97662| 1005|\n", 461 | "| 63| Richard| Sanford| 136083| 1001|\n", 462 | "| 64| Danielle| Williams| 120000| 1006|\n", 463 | "| 65| Deborah| Martin| 67389| 
1004|\n", 464 | "| 66| Dustin| Bush| 47567| 1004|\n", 465 | "| 67| Tyler| Green| 111085| 1002|\n", 466 | "| 68| Antonio|Carpenter| 83684| 1002|\n", 467 | "| 69| Ernest| Peterson| 115993| 1005|\n", 468 | "| 70| Karen|Fernandez| 101238| 1003|\n", 469 | "| 71| Kristine| Casey| 67651| 1003|\n", 470 | "| 72| Christine| Frye| 137244| 1004|\n", 471 | "| 73| William| Preston| 155225| 1003|\n", 472 | "| 74| Richard| Cole| 180361| 1003|\n", 473 | "| 75| Julia| Ramos| 105000| 1006|\n", 474 | "+---+----------+---------+-----------+-------------+\n", 475 | "\n" 476 | ] 477 | } 478 | ], 479 | "source": [ 480 | "# Now we will write query to get max salary for each employee \n", 481 | "# so we will use SQL Group by and SQL Order by functions \n", 482 | "sqlContext.sql(\"SELECT id,first_name,last_name,MAX(salary) AS LatesSalary,department_id \\\n", 483 | " FROM tmpEmployee \\\n", 484 | " GROUP BY id,first_name,last_name,department_id \\\n", 485 | " ORDER BY id\").show(n=100)" 486 | ] 487 | } 488 | ], 489 | "metadata": { 490 | "kernelspec": { 491 | "display_name": "Python 3 (ipykernel)", 492 | "language": "python", 493 | "name": "python3" 494 | }, 495 | "language_info": { 496 | "codemirror_mode": { 497 | "name": "ipython", 498 | "version": 3 499 | }, 500 | "file_extension": ".py", 501 | "mimetype": "text/x-python", 502 | "name": "python", 503 | "nbconvert_exporter": "python", 504 | "pygments_lexer": "ipython3", 505 | "version": "3.8.13" 506 | } 507 | }, 508 | "nbformat": 4, 509 | "nbformat_minor": 5 510 | } 511 | --------------------------------------------------------------------------------