├── Problem 0
│   ├── employee_salary.sql
│   ├── problem0.sql
│   ├── README.md
│   └── employee_salary.csv
├── Problem 1
│   ├── problem1.sql
│   ├── employee_table.sql
│   ├── README.md
│   ├── employee.csv
│   ├── employee.json
│   └── problem1.ipynb
├── Problem 2
│   ├── department.csv
│   ├── department.sql
│   ├── problem2.sql
│   ├── README.md
│   ├── problem2_2.ipynb
│   └── problem2_1.ipynb
├── Problem 3
│   ├── problem3.sql
│   ├── station.sql
│   ├── README.md
│   ├── problem3.ipynb
│   └── stations.csv
├── Problem 4
│   ├── station.sql
│   ├── problem4.sql
│   ├── README.md
│   ├── problem4.ipynb
│   └── stations.csv
├── Problem 5
│   ├── problem5.sql
│   ├── station.sql
│   ├── README.md
│   ├── problem5.ipynb
│   └── stations.csv
├── Problem 6
│   ├── problem6.sql
│   ├── students.sql
│   ├── Students.csv
│   ├── README.md
│   └── problem6.ipynb
├── Problem 7
│   ├── problem7.sql
│   ├── transaction.sql
│   ├── README.md
│   ├── transaction.csv
│   └── problem7.ipynb
├── Problem 8
│   ├── problem8.sql
│   ├── user.csv
│   ├── README.md
│   ├── ride_log.csv
│   └── problem8.ipynb
├── Problem 9
│   ├── user_type.csv
│   ├── README.md
│   ├── user_info.csv
│   ├── problem9.sql
│   └── download_facts.csv
└── README.md
/Problem 6/problem6.sql:
--------------------------------------------------------------------------------
1 | -- Names of students scoring more than 75 marks, ordered by the
2 | -- last three characters of the name, then by ID.
3 | SELECT name
4 | FROM public.students
5 | WHERE marks > 75
6 | ORDER BY RIGHT(name, 3), id;
--------------------------------------------------------------------------------
/Problem 2/department.csv:
--------------------------------------------------------------------------------
1 | department_id,department_name
2 | 1005,Sales
3 | 1002,Finance
4 | 1004,Purchase
5 | 1001,Operations
6 | 1006,Marketing
7 | 1003,Technology
--------------------------------------------------------------------------------
/Problem 7/problem7.sql:
--------------------------------------------------------------------------------
1 | -- Returning active users: a second purchase within 7 days of any other purchase by the same user.
2 | SELECT DISTINCT a1.user_id
3 | FROM transaction a1
4 | JOIN transaction a2 ON a1.user_id = a2.user_id
5 | AND a1.id <> a2.id
6 | AND DATEDIFF(a2.created_at, a1.created_at) BETWEEN 0 AND 7
7 | ORDER BY a1.user_id;
--------------------------------------------------------------------------------
/Problem 5/problem5.sql:
--------------------------------------------------------------------------------
1 | -- Query the list of CITY names starting with vowels (i.e., a, e, i, o, or u) from STATION. Your result cannot contain duplicates.
2 |
3 | SELECT DISTINCT CITY FROM STATION WHERE LEFT(CITY, 1) IN ('A','E','I','O','U');
--------------------------------------------------------------------------------
/Problem 1/problem1.sql:
--------------------------------------------------------------------------------
1 | -- Current (highest) salary for each employee, ordered by employee ID.
2 | SELECT id, first_name, last_name, MAX(salary) AS MaxSalary, department_id
3 | FROM public.employee
4 | GROUP BY id, first_name, last_name, department_id
5 | ORDER BY id;
--------------------------------------------------------------------------------
/Problem 3/problem3.sql:
--------------------------------------------------------------------------------
1 | --Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table.
2 |
3 | SELECT COUNT(city) AS citycount, COUNT(DISTINCT city) AS distinctcitycount, (COUNT(city) - COUNT(DISTINCT city)) AS diffbetweenboth
4 | FROM public.station;
--------------------------------------------------------------------------------
/Problem 7/transaction.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE `transaction` (
2 | `id` int NOT NULL,
3 | `user_id` int DEFAULT NULL,
4 | `item` varchar(45) DEFAULT NULL,
5 | `created_at` date DEFAULT NULL,
6 | `revenue` int DEFAULT NULL,
7 | PRIMARY KEY (`id`)
8 | ) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_0900_ai_ci;
--------------------------------------------------------------------------------
/Problem 6/students.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.students
2 |
3 | -- DROP TABLE IF EXISTS public.students;
4 |
5 | CREATE TABLE IF NOT EXISTS public.students
6 | (
7 | ID bigint,
8 | Name character varying(100) COLLATE pg_catalog."default",
9 | Marks bigint
10 | )
11 |
12 | TABLESPACE pg_default;
13 |
14 | ALTER TABLE IF EXISTS public.students
15 | OWNER to postgres;
--------------------------------------------------------------------------------
/Problem 2/department.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.department
2 |
3 | -- DROP TABLE IF EXISTS public.department;
4 |
5 | CREATE TABLE IF NOT EXISTS public.department
6 | (
7 | department_id bigint,
8 | department_name character varying(100) COLLATE pg_catalog."default"
9 | )
10 |
11 | TABLESPACE pg_default;
12 |
13 | ALTER TABLE IF EXISTS public.department
14 | OWNER to postgres;
--------------------------------------------------------------------------------
/Problem 0/employee_salary.sql:
--------------------------------------------------------------------------------
1 | CREATE TABLE IF NOT EXISTS public.employee_salary
2 | (
3 | id bigint,
4 | first_name character varying(100) COLLATE pg_catalog."default",
5 | last_name character varying(100) COLLATE pg_catalog."default",
6 | salary bigint,
7 | department_id bigint
8 | )
9 |
10 | TABLESPACE pg_default;
11 |
12 | ALTER TABLE IF EXISTS public.employee_salary
13 | OWNER to postgres;
--------------------------------------------------------------------------------
/Problem 3/station.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.station
2 |
3 | -- DROP TABLE IF EXISTS public.station;
4 |
5 | CREATE TABLE IF NOT EXISTS public.station
6 | (
7 | id bigint,
8 | city character varying(100) COLLATE pg_catalog."default",
9 | state character varying(100) COLLATE pg_catalog."default",
10 | latitude numeric(20,10),
11 | longitude numeric(20,10)
12 | )
13 |
14 | TABLESPACE pg_default;
15 |
16 | ALTER TABLE IF EXISTS public.station
17 | OWNER to postgres;
--------------------------------------------------------------------------------
/Problem 4/station.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.station
2 |
3 | -- DROP TABLE IF EXISTS public.station;
4 |
5 | CREATE TABLE IF NOT EXISTS public.station
6 | (
7 | id bigint,
8 | city character varying(100) COLLATE pg_catalog."default",
9 | state character varying(100) COLLATE pg_catalog."default",
10 | latitude numeric(20,10),
11 | longitude numeric(20,10)
12 | )
13 |
14 | TABLESPACE pg_default;
15 |
16 | ALTER TABLE IF EXISTS public.station
17 | OWNER to postgres;
--------------------------------------------------------------------------------
/Problem 5/station.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.station
2 |
3 | -- DROP TABLE IF EXISTS public.station;
4 |
5 | CREATE TABLE IF NOT EXISTS public.station
6 | (
7 | id bigint,
8 | city character varying(100) COLLATE pg_catalog."default",
9 | state character varying(100) COLLATE pg_catalog."default",
10 | latitude numeric(20,10),
11 | longitude numeric(20,10)
12 | )
13 |
14 | TABLESPACE pg_default;
15 |
16 | ALTER TABLE IF EXISTS public.station
17 | OWNER to postgres;
--------------------------------------------------------------------------------
/Problem 1/employee_table.sql:
--------------------------------------------------------------------------------
1 | -- Table: public.employee
2 |
3 | -- DROP TABLE IF EXISTS public.employee;
4 |
5 | CREATE TABLE IF NOT EXISTS public.employee
6 | (
7 | id bigint,
8 | first_name character varying(100) COLLATE pg_catalog."default",
9 | last_name character varying(100) COLLATE pg_catalog."default",
10 | salary bigint,
11 | department_id bigint
12 | )
13 |
14 | TABLESPACE pg_default;
15 |
16 | ALTER TABLE IF EXISTS public.employee
17 | OWNER to postgres;
--------------------------------------------------------------------------------
/Problem 6/Students.csv:
--------------------------------------------------------------------------------
1 | ID,Name,Marks
2 | 19,Samantha,87
3 | 21,Julia,96
4 | 11,Britney,95
5 | 32,Kristeen,100
6 | 12,Dyana,55
7 | 13,Jenny,66
8 | 14,Christene,88
9 | 15,Meera,24
10 | 16,Priya,76
11 | 17,Priyanka,77
12 | 18,Paige,74
13 | 19,Jane,64
14 | 21,Belvet,78
15 | 31,Scarlet,80
16 | 41,Salma,81
17 | 51,Amanda,34
18 | 61,Heraldo,94
19 | 71,Stuart,99
20 | 81,Aamina,77
21 | 76,Amina,89
22 | 91,Vivek,84
23 | 17,Evil,79
24 | 16,Devil,76
25 | 34,Fanny,75
26 | 38,Danny,75
--------------------------------------------------------------------------------
/Problem 4/problem4.sql:
--------------------------------------------------------------------------------
1 | -- Query the two cities in STATION with the shortest and longest CITY names, as well as their respective lengths (i.e.: number of characters in the name). If there is more than one smallest or largest city, choose the one that comes first when ordered alphabetically.
2 |
3 | SELECT q1.city, q1.citylength
4 | FROM
5 | (SELECT CITY, LENGTH(CITY) AS citylength, RANK() OVER (PARTITION BY LENGTH(CITY) ORDER BY CITY) AS actualrank
6 | FROM STATION) q1
7 | WHERE q1.actualrank = 1
8 | AND (q1.citylength = (SELECT MIN(LENGTH(CITY)) FROM STATION)
9 | OR q1.citylength = (SELECT MAX(LENGTH(CITY)) FROM STATION));
10 |
--------------------------------------------------------------------------------
/Problem 0/problem0.sql:
--------------------------------------------------------------------------------
1 | -- 1. List all the employees whose salary is more than 100K
2 |
3 | SELECT id, first_name, last_name, salary, department_id
4 | FROM public.employee_salary
5 | WHERE salary > 100000 ;
6 |
7 | -- 2. Provide distinct department id
8 |
9 | SELECT DISTINCT department_id
10 | FROM public.employee_salary ;
11 |
12 | -- 3. Provide first and last name of employees
13 |
14 | SELECT first_name, last_name
15 | FROM public.employee_salary ;
16 |
17 | -- 4. Provide all the details of the employees whose last name is 'Johnson'
18 |
19 | SELECT id, first_name, last_name, salary, department_id
20 | FROM public.employee_salary
21 | WHERE last_name = 'Johnson' ;
--------------------------------------------------------------------------------
/Problem 9/user_type.csv:
--------------------------------------------------------------------------------
1 | acc_id,paying_customer
2 | 700,no
3 | 701,no
4 | 702,no
5 | 703,no
6 | 704,no
7 | 705,no
8 | 706,no
9 | 707,no
10 | 708,no
11 | 709,no
12 | 710,no
13 | 711,no
14 | 712,no
15 | 713,no
16 | 714,no
17 | 715,no
18 | 716,no
19 | 717,no
20 | 718,no
21 | 719,no
22 | 720,no
23 | 721,no
24 | 722,no
25 | 723,no
26 | 724,no
27 | 725,yes
28 | 726,yes
29 | 727,yes
30 | 728,yes
31 | 729,yes
32 | 730,yes
33 | 731,yes
34 | 732,yes
35 | 733,yes
36 | 734,yes
37 | 735,yes
38 | 736,yes
39 | 737,yes
40 | 738,yes
41 | 739,yes
42 | 740,yes
43 | 741,yes
44 | 742,yes
45 | 743,yes
46 | 744,yes
47 | 745,yes
48 | 746,yes
49 | 747,yes
50 | 748,yes
51 | 749,yes
52 | 750,yes
--------------------------------------------------------------------------------
/Problem 8/problem8.sql:
--------------------------------------------------------------------------------
1 | -- Top 10 users with the highest total distance travelled
2 | SELECT q.user_id, q.name, q.total
3 | FROM
4 | ( select user_id
5 | ,name
6 | , sum(distance) as total
7 | , RANK() OVER (ORDER BY sum(distance) DESC) as actualrank
8 | from DATAENG.ride_log as log
9 | LEFT OUTER JOIN DATAENG.user as users
10 | ON log.user_id = users.id
11 | GROUP BY user_id, name
12 | ORDER BY sum(distance) DESC) as q
13 | WHERE q.actualrank <= 10;
14 |
15 |
16 | -- Top 10 users with the least total distance travelled
17 | SELECT q.user_id, q.name, q.total
18 | FROM
19 | ( select user_id
20 | ,name
21 | , sum(distance) as total
22 | , RANK() OVER (ORDER BY sum(distance)) as actualrank
23 | from DATAENG.ride_log as log
24 | LEFT OUTER JOIN DATAENG.user as users
25 | ON log.user_id = users.id
26 | GROUP BY user_id, name
27 | ORDER BY sum(distance)) as q
28 | WHERE q.actualrank <= 10;
--------------------------------------------------------------------------------
/Problem 2/problem2.sql:
--------------------------------------------------------------------------------
1 | -- We have an employees table with employee details including salary and department id, and a department table with department id and department name.
2 | -- Provide the queries below:
3 | -- 1. Use both tables to list all the employees working in the Marketing department, ordered from highest to lowest salary.
4 |
5 | SELECT first_name, last_name, salary
6 | FROM public.employee_salary as emp
7 | LEFT OUTER JOIN public.department as department
8 | ON emp.department_id = department.department_id
9 | WHERE department.department_name = 'Marketing'
10 | ORDER BY salary DESC;
11 |
12 | -- 2. Provide the count of employees in each department along with the department name.
13 |
14 | SELECT department.department_name, count(emp.department_id) as count_of_employee
15 | FROM public.department as department
16 | LEFT OUTER JOIN public.employee_salary as emp
17 | ON emp.department_id = department.department_id
18 | GROUP BY department.department_name;
19 |
--------------------------------------------------------------------------------
/Problem 8/user.csv:
--------------------------------------------------------------------------------
1 | id,name
2 | 1,Dustin Smith
3 | 2,Jay Ramirez
4 | 3,Joseph Cooke
5 | 4,Melinda Young
6 | 5,Sean Parker
7 | 6,Ian Foster
8 | 7,Christopher Schmitt
9 | 8,Patrick Gutierrez
10 | 9,Dennis Douglas
11 | 10,Brenda Morris
12 | 11,Jeffery Hernandez
13 | 12,David Rice
14 | 13,Charles Foster
15 | 14,Keith Perez DVM
16 | 15,Dean Cuevas
17 | 16,Melissa Bishop
18 | 17,Alexander Howell
19 | 18,Austin Robertson
20 | 19,Sherri Mcdaniel
21 | 20,Nancy Nguyen
22 | 21,Melody Ball
23 | 22,Christopher Stokes
24 | 23,Joseph Hamilton
25 | 24,Kevin Fischer
26 | 25,Crystal Berg
27 | 26,Barbara Larson
28 | 27,Jacqueline Heath
29 | 28,Eric Gardner
30 | 29,Daniel Kennedy
31 | 30,Kaylee Sims
32 | 31,Shannon Green
33 | 32,Stacy Collins
34 | 33,Donna Ortiz
35 | 34,Jennifer Simmons
36 | 35,Michael Gill
37 | 36,Alyssa Shaw
38 | 37,Destiny Clark
39 | 38,Thomas Lara
40 | 39,Mark Diaz
41 | 40,Stacy Bryant
42 | 41,Howard Rose
43 | 42,Brian Schwartz
44 | 43,Kimberly Potter
45 | 44,Cassidy Ryan
46 | 45,Benjamin Mcbride
47 | 46,Elizabeth Ward
48 | 47,Christina Price
49 | 48,Pamela Cox
50 | 49,Jessica Peterson
51 | 50,Michael Nelson
--------------------------------------------------------------------------------
/Problem 8/README.md:
--------------------------------------------------------------------------------
1 | # Problem 8 -> Top distance travelled
2 |
3 | Find the top 10 users that have travelled the least distance. Output their id, name, and total distance travelled.
4 |
5 | Problem Difficulty Level : Medium
6 |
7 | Data Structure
8 | ride_log
9 |
10 | - id
11 | - user_id
12 | - distance
13 |
14 | user
15 |
16 | - id
17 | - name
18 |
19 |
20 |
21 | Data for the ride_log and user tables
22 |
23 | [In CSV Format](ride_log.csv)
24 | [In CSV Format](user.csv)
25 |
26 | ## Solving using PySpark
27 |
28 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
29 | 1. Using PySpark Functions
30 | 2. Using Spark SQL
31 |
32 | Use the notebook below for the solution.
33 |
34 | [Problem Solution First Part](problem8.ipynb)
35 |
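As a quick reference, here is a minimal PySpark sketch of the same idea (assuming a local SparkSession and the two CSVs in the working directory; note that `limit(10)` cuts ties at the tenth place, unlike the `RANK()` used in the SQL solution):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem8").getOrCreate()
rides = spark.read.option("header", "true").option("inferSchema", "true").csv("ride_log.csv")
users = spark.read.option("header", "true").option("inferSchema", "true").csv("user.csv")

# Sum distance per user, attach names, and keep the ten smallest totals.
totals = rides.groupBy("user_id").agg(F.sum("distance").alias("total"))
(totals.join(users, totals.user_id == users.id, "left")
       .select("user_id", "name", "total")
       .orderBy("total")
       .limit(10)
       .show())
```
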
36 | ## Solving using MySQL
37 |
38 | In MySQL, we will load the data from CSV using the MySQL import functionality and then solve the problem.
39 |
40 | Output Query
41 |
42 | [Problem Solution](problem8.sql)
43 |
44 | Please also follow the blog below to understand this problem.
45 |
--------------------------------------------------------------------------------
/Problem 5/README.md:
--------------------------------------------------------------------------------
1 | # Problem 5 -> CITY names starting with vowels
2 |
3 | Query the list of CITY names starting with vowels (i.e., a, e, i, o, or u) from STATION. Your result cannot contain duplicates.
4 | The STATION table is described in the Data Structure section below.
5 |
6 | Problem Difficulty Level : Easy
7 |
8 | Data Structure
9 |
10 | - ID
11 | - City
12 | - State
13 | - Latitude
14 | - Longitude
15 |
16 |
17 |
18 | Data for station table
19 |
20 | [In CSV Format](stations.csv)
21 |
22 | ## Solving using PySpark
23 |
24 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
25 | 1. Using PySpark Functions
26 | 2. Using Spark SQL
27 |
28 | Use the notebook below for the solution.
29 |
30 | [Problem Solution First Part](problem5.ipynb)
31 |
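A minimal PySpark sketch of the filter (assuming a local SparkSession, `stations.csv` in the working directory, and a `city` column as in the DDL):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem5").getOrCreate()
stations = spark.read.option("header", "true").option("inferSchema", "true").csv("stations.csv")

# Keep cities whose first letter (uppercased) is a vowel, then de-duplicate.
(stations.select("city")
         .where(F.upper(F.substring("city", 1, 1)).isin("A", "E", "I", "O", "U"))
         .distinct()
         .show())
```
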
32 | ## Solving using PostgreSQL
33 |
34 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve the problem.
35 |
36 | Output Query
37 |
38 | [Problem Solution](problem5.sql)
39 |
40 | Please also follow the blog below to understand this problem.
41 |
--------------------------------------------------------------------------------
/Problem 3/README.md:
--------------------------------------------------------------------------------
1 | # Problem 3 -> Difference between total number of cities and distinct cities
2 |
3 | Find the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table.
4 | The STATION table is described in the Data Structure section below.
5 |
6 | Problem Difficulty Level : Easy
7 |
8 | Data Structure
9 |
10 | - ID
11 | - City
12 | - State
13 | - Latitude
14 | - Longitude
15 |
16 |
17 |
18 | Data for station table
19 |
20 | [In CSV Format](stations.csv)
21 |
22 | ## Solving using PySpark
23 |
24 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
25 | 1. Using PySpark Functions
26 | 2. Using Spark SQL
27 |
28 | Use the notebook below for the solution.
29 |
30 | [Problem Solution First Part](problem3.ipynb)
31 |
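A minimal PySpark sketch of the aggregation (assuming a local SparkSession and a `city` column as in the DDL):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem3").getOrCreate()
stations = spark.read.option("header", "true").option("inferSchema", "true").csv("stations.csv")

# Count all cities and distinct cities in one pass; the gap is the answer.
stations.select(
    F.count("city").alias("citycount"),
    F.countDistinct("city").alias("distinctcitycount"),
    (F.count("city") - F.countDistinct("city")).alias("diffbetweenboth"),
).show()
```
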
32 | ## Solving using PostgreSQL
33 |
34 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve the problem.
35 |
36 | Output Query
37 |
38 | [Problem Solution](problem3.sql)
39 |
40 | Please also follow the blog below to understand this problem.
41 |
--------------------------------------------------------------------------------
/Problem 7/README.md:
--------------------------------------------------------------------------------
1 | # Problem 7 -> Returning active users
2 |
3 | Write a query that identifies returning active users. A returning active user is a user that has made a second purchase within 7 days of any other of their purchases. Output a list of user_ids of these returning active users.
4 |
5 | Problem Difficulty Level : Medium
6 |
7 | Data Structure
8 |
9 | - id
10 | - user_id
11 | - item
12 | - created_at
13 | - revenue
14 |
15 |
16 |
17 | Data for transaction table
18 |
19 | [In CSV Format](transaction.csv)
20 |
21 | ## Solving using PySpark
22 |
23 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
24 | 1. Using PySpark Functions
25 | 2. Using Spark SQL
26 |
27 | Use the notebook below for the solution.
28 |
29 | [Problem Solution First Part](problem7.ipynb)
30 |
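A minimal PySpark sketch of the self-join (assuming a local SparkSession and `transaction.csv` in the working directory):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem7").getOrCreate()
t = (spark.read.option("header", "true").option("inferSchema", "true")
     .csv("transaction.csv")
     .withColumn("created_at", F.col("created_at").cast("date")))

# Pair each purchase with any other purchase by the same user made
# 0-7 days later, then keep the distinct user ids.
a, b = t.alias("a"), t.alias("b")
(a.join(b, (F.col("a.user_id") == F.col("b.user_id"))
          & (F.col("a.id") != F.col("b.id"))
          & F.datediff(F.col("b.created_at"), F.col("a.created_at")).between(0, 7))
   .select(F.col("a.user_id"))
   .distinct()
   .orderBy("user_id")
   .show())
```
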
31 | ## Solving using MySQL
32 |
33 | In MySQL, we will load the data from CSV using the MySQL import functionality and then solve the problem.
34 |
35 | Output Query
36 |
37 | [Problem Solution](problem7.sql)
38 |
39 | Please also follow the blog below to understand this problem.
40 |
--------------------------------------------------------------------------------
/Problem 0/README.md:
--------------------------------------------------------------------------------
1 | # Problem 0 -> Employee Salary more than 100K
2 |
3 | We have a table with employees and their salaries. Write queries to solve the problems below:
4 | 1. List all the employees whose salary is more than 100K
5 | 2. Provide distinct department ids
6 | 3. Provide first and last names of employees
7 | 4. Provide all the details of the employees whose last name is 'Johnson'
8 |
9 | Problem Difficulty Level : Easy
10 |
11 | Data Structure
12 |
13 |
14 |
15 |
16 | Data for this problem
17 |
18 | [In CSV Format](employee_salary.csv)
19 |
20 | ## Solving using PySpark
21 |
22 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
23 | 1. Using PySpark Functions
24 | 2. Using Spark SQL
25 |
26 | Use the notebook below for the solution.
27 |
28 | [Problem Solution](problem0.ipynb)
29 |
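A minimal PySpark sketch of the four queries (assuming a local SparkSession and `employee_salary.csv` in the working directory):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem0").getOrCreate()
emp = spark.read.option("header", "true").option("inferSchema", "true").csv("employee_salary.csv")

emp.where(F.col("salary") > 100000).show()          # 1. salary above 100K
emp.select("department_id").distinct().show()       # 2. distinct department ids
emp.select("first_name", "last_name").show()        # 3. first and last names
emp.where(F.col("last_name") == "Johnson").show()   # 4. employees named Johnson
```
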
30 | ## Solving using PostgreSQL
31 |
32 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve the problem.
33 |
34 | Output Query
35 |
36 | [Problem Solution](problem0.sql)
37 |
38 | Please also follow the blog below to understand this problem.
39 |
--------------------------------------------------------------------------------
/Problem 1/README.md:
--------------------------------------------------------------------------------
1 | # Problem 1 -> Employee With Latest Salary
2 |
3 | We have a table with employees and their salaries, however, some of the records are old and contain outdated salary information. Find the current salary of each employee assuming that salaries increase each year. Output their id, first name, last name, department ID, and current salary. Order your list by employee ID in ascending order.
4 |
5 | Problem Difficulty Level : Medium
6 |
7 | Data Structure
8 |
9 |
10 |
11 |
12 | Data for this problem
13 |
14 | [In CSV Format](employee.csv)
15 |
16 | [In JSON Format](employee.json)
17 |
18 | ## Solving using PySpark
19 |
20 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
21 | 1. Using PySpark Functions
22 | 2. Using Spark SQL
23 |
24 | Use the notebook below for the solution.
25 |
26 | [Problem Solution](problem1.ipynb)
27 |
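A minimal PySpark sketch of the aggregation (assuming a local SparkSession and `employee.csv` in the working directory):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem1").getOrCreate()
emp = spark.read.option("header", "true").option("inferSchema", "true").csv("employee.csv")

# Since salaries only increase, the latest salary is the max per employee.
(emp.groupBy("id", "first_name", "last_name", "department_id")
    .agg(F.max("salary").alias("current_salary"))
    .orderBy("id")
    .show())
```
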
28 | ## Solving using PostgreSQL
29 |
30 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve the problem.
31 |
32 | Output Query
33 |
34 | [Problem Solution](problem1.sql)
35 |
36 | Please also follow the blog below to understand this problem.
37 |
--------------------------------------------------------------------------------
/Problem 9/README.md:
--------------------------------------------------------------------------------
1 | # Problem 9 -> Premium vs Freemium
2 |
3 | Find the total number of downloads for paying and non-paying users by date. Include only records where non-paying customers have more downloads than paying customers. The output should be sorted by earliest date first and contain 3 columns: date, non-paying downloads, and paying downloads.
4 |
5 | Problem Difficulty Level : Hard
6 |
7 | Data Structure
8 | user_info
9 |
10 | - user_id
11 | - acc_id
12 |
13 | user_type
14 |
15 | - acc_id
16 | - paying_customer
17 |
18 | download_facts
19 |
20 | - date
21 | - user_id
22 | - downloads
23 |
24 |
25 | Data for the user_info, user_type, and download_facts tables
26 |
27 | [User data CSV Format](user_info.csv)
28 | [User Type data CSV Format](user_type.csv)
29 | [Download facts data CSV Format](download_facts.csv)
30 |
31 | ## Solving using PySpark
32 |
33 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
34 | 1. Using PySpark Functions
35 | 2. Using Spark SQL
36 |
37 | Use the notebook below for the solution.
38 |
39 | [Problem Solution First Part](problem9.ipynb)
40 |
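A minimal PySpark sketch of the conditional-aggregation approach (assuming a local SparkSession and the three CSVs in the working directory; the `d/M/yyyy` date parse matches the sample data):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem9").getOrCreate()
reader = spark.read.option("header", "true").option("inferSchema", "true")
info = reader.csv("user_info.csv")
utype = reader.csv("user_type.csv")
facts = reader.csv("download_facts.csv")

# One row per date with paying and non-paying download sums, then filter.
daily = (info.join(utype, "acc_id")
             .join(facts, "user_id")
             .withColumn("date", F.to_date("date", "d/M/yyyy"))
             .groupBy("date")
             .agg(F.sum(F.when(F.col("paying_customer") == "no", F.col("downloads"))).alias("non_paying"),
                  F.sum(F.when(F.col("paying_customer") == "yes", F.col("downloads"))).alias("paying")))
daily.where(F.col("non_paying") > F.col("paying")).orderBy("date").show()
```
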
41 | ## Solving using MySQL
42 |
43 | In MySQL, we will load the data from CSV using the MySQL import functionality and then solve the problem.
44 |
45 | Output Query
46 |
47 | [Problem Solution](problem9.sql)
48 |
49 | Please also follow the blog below to understand this problem.
50 |
--------------------------------------------------------------------------------
/Problem 9/user_info.csv:
--------------------------------------------------------------------------------
1 | user_id,acc_id
2 | 0,1
3 | 1,716
4 | 2,749
5 | 3,713
6 | 4,744
7 | 5,726
8 | 6,706
9 | 7,750
10 | 8,732
11 | 9,706
12 | 10,729
13 | 11,748
14 | 12,731
15 | 13,739
16 | 14,740
17 | 15,705
18 | 16,706
19 | 17,701
20 | 18,746
21 | 19,726
22 | 20,748
23 | 21,701
24 | 22,707
25 | 23,710
26 | 24,702
27 | 25,720
28 | 26,730
29 | 27,721
30 | 28,733
31 | 29,732
32 | 30,729
33 | 31,716
34 | 32,722
35 | 33,745
36 | 34,737
37 | 35,730
38 | 36,729
39 | 37,723
40 | 38,710
41 | 39,707
42 | 40,737
43 | 41,717
44 | 42,741
45 | 43,718
46 | 44,736
47 | 45,720
48 | 46,743
49 | 47,707
50 | 48,721
51 | 49,748
52 | 50,715
53 | 51,709
54 | 52,732
55 | 53,732
56 | 54,712
57 | 55,701
58 | 56,721
59 | 57,744
60 | 58,724
61 | 59,727
62 | 60,743
63 | 61,744
64 | 62,717
65 | 63,723
66 | 64,713
67 | 65,706
68 | 66,731
69 | 67,722
70 | 68,744
71 | 69,705
72 | 70,703
73 | 71,725
74 | 72,740
75 | 73,713
76 | 74,732
77 | 75,720
78 | 76,709
79 | 77,739
80 | 78,703
81 | 79,732
82 | 80,728
83 | 81,737
84 | 82,711
85 | 83,745
86 | 84,734
87 | 85,723
88 | 86,718
89 | 87,702
90 | 88,718
91 | 89,744
92 | 90,710
93 | 91,727
94 | 92,739
95 | 93,728
96 | 94,740
97 | 95,744
98 | 96,737
99 | 97,726
100 | 98,722
101 | 99,727
102 | 100,712
--------------------------------------------------------------------------------
/Problem 4/README.md:
--------------------------------------------------------------------------------
1 | # Problem 4 -> Get Shortest and Longest City Name
2 |
3 | Query the two cities in STATION with the shortest and longest CITY names, as well as their respective lengths (i.e.: number of characters in the name). If there is more than one smallest or largest city, choose the one that comes first when ordered alphabetically.
4 | The STATION table is described in the Data Structure section below.
5 |
6 | Problem Difficulty Level : Hard
7 |
8 | Data Structure
9 |
10 | - ID
11 | - City
12 | - State
13 | - Latitude
14 | - Longitude
15 |
16 |
17 |
18 | Data for station table
19 |
20 | [In CSV Format](stations.csv)
21 |
22 | Sample Input
23 |
24 | For example, CITY has four entries: DEF, ABC, PQRS and WXY.
25 |
26 | Sample Output
27 |
28 | ```
29 | ABC 3
30 | PQRS 4
31 | ```
32 |
33 | ## Solving using PySpark
34 |
35 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook link below):
36 | 1. Using PySpark Functions
37 | 2. Using Spark SQL
38 |
39 | Use the notebook below for the solution.
40 |
41 | [Problem Solution First Part](problem4.ipynb)
42 |
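A minimal PySpark sketch (assuming a local SparkSession and a `city` column as in the DDL): sort by name length, break ties alphabetically, and take one row from each end.

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem4").getOrCreate()
stations = spark.read.option("header", "true").option("inferSchema", "true").csv("stations.csv")

# Alphabetically-first city among the shortest names, then among the longest.
named = stations.select("city", F.length("city").alias("citylength"))
shortest = named.orderBy(F.col("citylength").asc(), F.col("city").asc()).limit(1)
longest = named.orderBy(F.col("citylength").desc(), F.col("city").asc()).limit(1)
shortest.union(longest).show()
```
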
43 | ## Solving using PostgreSQL
44 |
45 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve the problem.
46 |
47 | Output Query
48 |
49 | [Problem Solution](problem4.sql)
50 |
51 | Please also follow the blog below to understand this problem.
52 |
--------------------------------------------------------------------------------
/Problem 9/problem9.sql:
--------------------------------------------------------------------------------
1 | SELECT paying_customer.date, nonpaying_download, paying_download
2 | FROM
3 | ( select acc.paying_customer
4 | ,download.date, SUM(download.downloads) as paying_download
5 | FROM user_info as usr
6 | LEFT OUTER JOIN user_type as acc
7 | ON usr.acc_id = acc.acc_id
8 | LEFT OUTER JOIN download_facts as download
9 | ON usr.user_id = download.user_id
10 | WHERE paying_customer = 'yes'
11 | GROUP BY acc.paying_customer,download.date ) as paying_customer
12 | LEFT OUTER JOIN
13 | ( select acc.paying_customer
14 | ,download.date, SUM(download.downloads) as nonpaying_download
15 | FROM user_info as usr
16 | LEFT OUTER JOIN user_type as acc
17 | ON usr.acc_id = acc.acc_id
18 | LEFT OUTER JOIN download_facts as download
19 | ON usr.user_id = download.user_id
20 | WHERE paying_customer = 'no'
21 | GROUP BY acc.paying_customer,download.date) as non_paying_customer
22 | ON paying_customer.date = non_paying_customer.date
23 | WHERE nonpaying_download > paying_download
24 | ORDER BY paying_customer.date;
25 |
26 | -- Alternative: conditional aggregation in a single pass
27 |
28 | SELECT date, non_paying,
29 | paying
30 | FROM
31 | (SELECT date, sum(CASE
32 | WHEN paying_customer = 'yes' THEN downloads
33 | END) AS paying,
34 | sum(CASE
35 | WHEN paying_customer = 'no' THEN downloads
36 | END) AS non_paying
37 | FROM user_info a
38 | INNER JOIN user_type b ON a.acc_id = b.acc_id
39 | INNER JOIN download_facts c ON a.user_id=c.user_id
40 | GROUP BY date
41 | ORDER BY date) t
42 | WHERE (non_paying - paying) >0
43 | ORDER BY t.date ASC;
--------------------------------------------------------------------------------
/Problem 6/README.md:
--------------------------------------------------------------------------------
1 | # Problem 6 -> Students more than 75 Marks
2 |
3 | Query the Name of any student in STUDENTS who scored higher than 75 Marks. Order your output by the last three characters of each name. If two or more students both have names ending in the same last three characters (i.e.: Bobby, Robby, etc.), secondary sort them by ascending ID.
4 |
5 | Problem Difficulty Level : Medium
6 |
7 | Data Structure
8 |
9 | - ID
10 | - Name
11 | - Marks
12 |
13 |
14 |
15 | Data for students table
16 |
17 | [In CSV Format](Students.csv)
18 |
19 | ## Sample Input
20 |
21 | ```
22 | 1 Ashley 81
23 | 2 Samantha 75
24 | 3 Julia 76
25 | 4 Belvet 84
26 | ```
27 |
28 | ## Sample Output
29 |
30 | ```
31 | Ashley
32 | Julia
33 | Belvet
34 | ```
35 |
36 | ## Explanation
37 |
38 | Only Ashley, Julia, and Belvet have Marks > 75. If you look at the last three characters of each of their names, there are no duplicates and 'ley' < 'lia' < 'vet'.
39 |
40 | ## Solving using PySpark
41 |
42 | In Spark, we will solve this problem in two ways:
43 | 1. Using PySpark Functions
44 | 2. Using Spark SQL
45 |
46 | Use the notebook below for the solution.
47 |
48 | [Problem Solution First Part](problem6.ipynb)
49 |
50 | ## Solving using PostgreSQL
51 |
52 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve the problem.
53 |
54 | Output Query
55 |
56 | [Problem Solution](problem6.sql)
57 |
58 | Please also follow the blog below to understand this problem.
59 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Data Engineering Problems
2 | Data Engineering Problems with Solutions
3 |
4 |
5 |
6 | Here, we are solving all the Data Engineering problems using the methods below:
7 | 1. Solving problem using PySpark
8 | 1. Using PySpark Functions
9 | 2. Using Spark SQL
10 | 2. Solving problem using SQL (PostgreSQL or MySQL)
11 |
12 | Please find the list of all the problems below.
13 |
14 | 0. Problem0 -> [Get Employee with salary more than 100K](Problem%200/README.md)
15 | 1. Problem1 -> [Get Max Salary for each Employee](Problem%201/README.md)
16 | 2. Problem2 -> [Get Salary of all employees in Marketing department](Problem%202/README.md)
17 | 3. Problem3 -> [Find difference between count of cities and distinct count of cities](Problem%203/README.md)
18 | 4. Problem4 -> [Get Shortest and Longest City Name](Problem%204/README.md)
19 | 5. Problem5 -> [CITY names starting with vowels](Problem%205/README.md)
20 | 6. Problem6 -> [Students more than 75 Marks](Problem%206/README.md)
21 | 7. Problem7 -> [Returning active users](Problem%207/README.md)
22 | 8. Problem8 -> [Top distance travelled](Problem%208/README.md)
23 | 9. Problem9 -> [Premium vs Freemium](Problem%209/README.md)
24 |
25 |
26 | Also see the blog below for walkthroughs of all the data engineering problems:
27 |
28 | https://developershome.blog/category/data-engineering/problem-solving/
29 |
30 | Also see the YouTube channel below for walkthroughs of the problems and other data engineering concepts:
31 |
32 | https://www.youtube.com/@developershomeIn
33 |
--------------------------------------------------------------------------------
/Problem 8/ride_log.csv:
--------------------------------------------------------------------------------
1 | id,user_id,distance
2 | 101,8,93
3 | 102,40,56
4 | 103,28,83
5 | 104,33,83
6 | 105,1,87
7 | 106,32,49
8 | 107,3,5
9 | 108,23,37
10 | 109,31,62
11 | 110,1,35
12 | 111,41,89
13 | 112,19,64
14 | 113,49,57
15 | 114,28,68
16 | 115,48,94
17 | 116,50,89
18 | 117,48,29
19 | 118,13,16
20 | 119,24,58
21 | 120,25,19
22 | 121,39,13
23 | 122,36,10
24 | 123,37,38
25 | 124,32,76
26 | 125,34,61
27 | 126,37,10
28 | 127,11,61
29 | 128,47,35
30 | 129,46,17
31 | 130,15,8
32 | 131,11,36
33 | 132,31,24
34 | 133,7,96
35 | 134,34,64
36 | 135,2,75
37 | 136,45,11
38 | 137,48,58
39 | 138,15,92
40 | 139,47,88
41 | 140,18,27
42 | 141,34,67
43 | 142,47,70
44 | 143,24,52
45 | 144,26,98
46 | 145,20,45
47 | 146,27,60
48 | 147,26,94
49 | 148,10,90
50 | 149,12,63
51 | 150,9,43
52 | 151,36,18
53 | 152,12,11
54 | 153,44,76
55 | 154,9,93
56 | 155,14,82
57 | 156,28,26
58 | 157,39,68
59 | 158,5,92
60 | 159,46,91
61 | 160,14,66
62 | 161,8,47
63 | 162,44,52
64 | 163,21,81
65 | 164,11,69
66 | 165,38,82
67 | 166,23,42
68 | 167,34,85
69 | 168,12,30
70 | 169,43,85
71 | 170,20,30
72 | 171,20,50
73 | 172,25,74
74 | 173,25,96
75 | 174,8,74
76 | 175,50,46
77 | 176,43,77
78 | 177,11,40
79 | 178,17,90
80 | 179,1,78
81 | 180,20,25
82 | 181,27,31
83 | 182,17,91
84 | 183,8,29
85 | 184,42,85
86 | 185,43,95
87 | 186,17,24
88 | 187,15,42
89 | 188,47,37
90 | 189,9,15
91 | 190,42,71
92 | 191,43,9
93 | 192,12,53
94 | 193,49,73
95 | 194,25,50
96 | 195,32,85
97 | 196,9,55
98 | 197,47,98
99 | 198,43,9
100 | 199,14,66
101 | 200,2,39
--------------------------------------------------------------------------------
/Problem 2/README.md:
--------------------------------------------------------------------------------
1 | # Problem 2 -> Employees From Marketing Department with Salary
2 |
3 | We have an employees table with employee details including salary and department id, and another table with department id and department name.
4 | Provide the queries below:
5 | 1. Use both tables to list all the employees working in the Marketing department, ordered from highest to lowest salary.
6 | 2. Provide the count of employees in each department along with the department name.
7 |
8 | Problem Difficulty Level : Easy
9 |
10 | Data Structure
11 |
12 | Employee table
13 |
14 |
15 |
16 | Department table
17 |
18 |
19 |
20 | Data for employee salary table
21 |
22 | [In CSV Format](../Problem%200/employee_salary.csv)
23 |
24 | Data for department table
25 |
26 | [In CSV Format](department.csv)
27 |
28 | ## Solving using PySpark
29 |
30 | In Spark, we will solve this problem in two ways (a minimal sketch follows the notebook links below):
31 | 1. Using PySpark Functions
32 | 2. Using Spark SQL
33 |
34 | Use the notebooks below for the solution.
35 |
36 | [Problem Solution First Part](problem2_1.ipynb)
37 | [Problem Solution Second Part](problem2_2.ipynb)
38 |
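A minimal PySpark sketch of both queries (assuming a local SparkSession; the employee CSV path points at Problem 0's data, as linked above):

```
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("problem2").getOrCreate()
reader = spark.read.option("header", "true").option("inferSchema", "true")
emp = reader.csv("../Problem 0/employee_salary.csv")
dept = reader.csv("department.csv")

# 1. Marketing employees, highest salary first.
(emp.join(dept, "department_id")
    .where(F.col("department_name") == "Marketing")
    .select("first_name", "last_name", "salary")
    .orderBy(F.col("salary").desc())
    .show())

# 2. Employee count per department; the left join keeps empty departments at 0.
(dept.join(emp, "department_id", "left")
     .groupBy("department_name")
     .agg(F.count("id").alias("count_of_employee"))
     .show())
```
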
39 | ## Solving using PostgreSQL
40 |
41 | In PostgreSQL, we will load the data from CSV using the PostgreSQL import functionality and then solve the problem.
42 |
43 | Output Query
44 |
45 | [Problem Solution](problem2.sql)
46 |
47 | Please also follow the blog below to understand this problem.
48 |
--------------------------------------------------------------------------------
/Problem 9/download_facts.csv:
--------------------------------------------------------------------------------
1 | date,user_id,downloads
2 | 24/8/2020,1,6
3 | 22/8/2020,2,6
4 | 18/8/2020,3,2
5 | 24/8/2020,4,4
6 | 19/8/2020,5,7
7 | 21/8/2020,6,3
8 | 24/8/2020,7,1
9 | 24/8/2020,8,8
10 | 17/8/2020,9,5
11 | 16/8/2020,10,4
12 | 22/8/2020,11,8
13 | 19/8/2020,12,6
14 | 15/8/2020,13,3
15 | 21/8/2020,14,0
16 | 24/8/2020,15,0
17 | 15/8/2020,16,5
18 | 18/8/2020,17,5
19 | 23/8/2020,18,8
20 | 15/8/2020,19,6
21 | 25/8/2020,20,4
22 | 16/8/2020,21,1
23 | 25/8/2020,22,4
24 | 22/8/2020,23,7
25 | 21/8/2020,24,4
26 | 25/8/2020,25,5
27 | 23/8/2020,26,6
28 | 19/8/2020,27,9
29 | 24/8/2020,28,3
30 | 20/8/2020,29,0
31 | 25/8/2020,30,8
32 | 20/8/2020,31,5
33 | 21/8/2020,32,8
34 | 15/8/2020,33,6
35 | 24/8/2020,34,4
36 | 25/8/2020,35,1
37 | 24/8/2020,36,7
38 | 17/8/2020,37,8
39 | 16/8/2020,38,8
40 | 17/8/2020,39,1
41 | 20/8/2020,40,8
42 | 18/8/2020,41,3
43 | 16/8/2020,42,0
44 | 23/8/2020,43,9
45 | 25/8/2020,44,9
46 | 16/8/2020,45,2
47 | 15/8/2020,46,2
48 | 21/8/2020,47,1
49 | 21/8/2020,48,4
50 | 22/8/2020,49,8
51 | 17/8/2020,50,6
52 | 21/8/2020,51,4
53 | 20/8/2020,52,7
54 | 16/8/2020,53,7
55 | 20/8/2020,54,6
56 | 20/8/2020,55,0
57 | 21/8/2020,56,8
58 | 18/8/2020,57,5
59 | 17/8/2020,58,2
60 | 24/8/2020,59,3
61 | 20/8/2020,60,7
62 | 22/8/2020,61,8
63 | 15/8/2020,62,6
64 | 23/8/2020,63,3
65 | 17/8/2020,64,4
66 | 16/8/2020,65,4
67 | 16/8/2020,66,3
68 | 19/8/2020,67,1
69 | 18/8/2020,68,2
70 | 17/8/2020,69,4
71 | 22/8/2020,70,7
72 | 20/8/2020,71,6
73 | 15/8/2020,72,2
74 | 17/8/2020,73,7
75 | 22/8/2020,74,1
76 | 17/8/2020,75,8
77 | 19/8/2020,76,0
78 | 25/8/2020,77,1
79 | 25/8/2020,78,0
80 | 17/8/2020,79,8
81 | 23/8/2020,80,7
82 | 24/8/2020,81,2
83 | 21/8/2020,82,0
84 | 24/8/2020,83,4
85 | 21/8/2020,84,0
86 | 25/8/2020,85,7
87 | 22/8/2020,86,1
88 | 20/8/2020,87,2
89 | 19/8/2020,88,3
90 | 22/8/2020,89,8
91 | 24/8/2020,90,0
92 | 22/8/2020,91,9
93 | 25/8/2020,92,7
94 | 25/8/2020,93,0
95 | 17/8/2020,94,1
96 | 23/8/2020,95,2
97 | 24/8/2020,96,3
98 | 21/8/2020,97,8
99 | 24/8/2020,98,0
100 | 21/8/2020,99,9
101 | 25/8/2020,100,7
--------------------------------------------------------------------------------
/Problem 0/employee_salary.csv:
--------------------------------------------------------------------------------
1 | id,first_name,last_name,salary,department_id
2 | 45,Kevin,Duncan,45210,1003
3 | 25,Pamela,Matthews,57944,1005
4 | 48,Robert,Lynch,117960,1004
5 | 34,Justin,Dunn,67992,1003
6 | 62,Dale,Hayes,97662,1005
7 | 1,Todd,Wilson,110000,1006
8 | 61,Ryan,Brown,120000,1003
9 | 21,Stephen,Berry,123617,1002
10 | 13,Julie,Sanchez,210000,1001
11 | 55,Michael,Morris,106799,1005
12 | 44,Trevor,Carter,38670,1001
13 | 73,William,Preston,155225,1003
14 | 39,Linda,Clark,186781,1002
15 | 10,Sean,Crawford,190000,1006
16 | 30,Stephen,Smith,194791,1001
17 | 75,Julia,Ramos,105000,1006
18 | 59,Kevin,Robinson,100924,1005
19 | 69,Ernest,Peterson,115993,1005
20 | 65,Deborah,Martin,67389,1004
21 | 63,Richard,Sanford,136083,1001
22 | 29,Jason,Olsen,51937,1006
23 | 11,Kevin,Townsend,166861,1002
24 | 43,Joseph,Rogers,22800,1005
25 | 32,Eric,Zimmerman,83093,1006
26 | 6,Natasha,Swanson,90000,1005
27 | 3,Kelly,Rosario,42689,1002
28 | 16,Briana,Rivas,151668,1005
29 | 38,Nicole,Lewis,114079,1001
30 | 42,Traci,Williams,180000,1003
31 | 49,Amber,Harding,77764,1002
32 | 26,Allison,Johnson,128782,1001
33 | 74,Richard,Cole,180361,1003
34 | 23,Angela,Williams,100875,1004
35 | 19,Michael,Ramsey,63159,1003
36 | 28,Alexis,Beck,12260,1005
37 | 64,Danielle,Williams,120000,1006
38 | 51,Theresa,Everett,31404,1002
39 | 58,Edward,Sharp,41077,1005
40 | 36,Jesus,Ward,36078,1005
41 | 5,Sherry,Golden,44101,1002
42 | 9,Christy,Mitchell,150000,1001
43 | 35,John,Ball,47795,1004
44 | 54,Wesley,Tucker,90221,1005
45 | 20,Cody,Gonzalez,112809,1004
46 | 57,Patricia,Harmon,147417,1005
47 | 24,William,Flores,142674,1003
48 | 60,Charles,Pearson,173317,1004
49 | 17,Jason,Burnett,42525,1006
50 | 7,Diane,Gordon,74591,1002
51 | 15,Anthony,Valdez,96898,1001
52 | 41,John,George,21642,1001
53 | 71,Kristine,Casey,67651,1003
54 | 12,Joshua,Johnson,123082,1004
55 | 68,Antonio,Carpenter,83684,1002
56 | 47,Kimberly,Dean,71416,1003
57 | 37,Philip,Gillespie,36424,1006
58 | 31,Kimberly,Brooks,95327,1003
59 | 27,Anthony,Ball,34386,1003
60 | 40,Colleen,Carrillo,147723,1004
61 | 70,Karen,Fernandez,101238,1003
62 | 4,Patricia,Powell,170000,1004
63 | 22,Brittany,Scott,162537,1002
64 | 8,Mercedes,Rodriguez,61048,1005
65 | 67,Tyler,Green,111085,1002
66 | 52,Kara,Smith,192838,1004
67 | 46,Joshua,Ewing,73088,1003
68 | 18,Jeffrey,Harris,20000,1002
69 | 56,Rachael,Williams,103585,1002
70 | 50,Victoria,Wilson,176620,1002
71 | 14,John,Coleman,152434,1001
72 | 72,Christine,Frye,137244,1004
73 | 2,Justin,Simon,130000,1005
74 | 53,Teresa,Cohen,98860,1001
75 | 66,Dustin,Bush,47567,1004
76 | 33,Peter,Holt,69945,1002
77 |
--------------------------------------------------------------------------------
/Problem 1/employee.csv:
--------------------------------------------------------------------------------
1 | "id","first_name","last_name","salary","department_id"
2 | 1,Todd,Wilson,110000,1006
3 | 1,Todd,Wilson,106119,1006
4 | 2,Justin,Simon,128922,1005
5 | 2,Justin,Simon,130000,1005
6 | 3,Kelly,Rosario,42689,1002
7 | 4,Patricia,Powell,162825,1004
8 | 4,Patricia,Powell,170000,1004
9 | 5,Sherry,Golden,44101,1002
10 | 6,Natasha,Swanson,79632,1005
11 | 6,Natasha,Swanson,90000,1005
12 | 7,Diane,Gordon,74591,1002
13 | 8,Mercedes,Rodriguez,61048,1005
14 | 9,Christy,Mitchell,137236,1001
15 | 9,Christy,Mitchell,140000,1001
16 | 9,Christy,Mitchell,150000,1001
17 | 10,Sean,Crawford,182065,1006
18 | 10,Sean,Crawford,190000,1006
19 | 11,Kevin,Townsend,166861,1002
20 | 12,Joshua,Johnson,123082,1004
21 | 13,Julie,Sanchez,185663,1001
22 | 13,Julie,Sanchez,200000,1001
23 | 13,Julie,Sanchez,210000,1001
24 | 14,John,Coleman,152434,1001
25 | 15,Anthony,Valdez,96898,1001
26 | 16,Briana,Rivas,151668,1005
27 | 17,Jason,Burnett,42525,1006
28 | 18,Jeffrey,Harris,14491,1002
29 | 18,Jeffrey,Harris,20000,1002
30 | 19,Michael,Ramsey,63159,1003
31 | 20,Cody,Gonzalez,112809,1004
32 | 21,Stephen,Berry,123617,1002
33 | 22,Brittany,Scott,162537,1002
34 | 23,Angela,Williams,100875,1004
35 | 24,William,Flores,142674,1003
36 | 25,Pamela,Matthews,57944,1005
37 | 26,Allison,Johnson,128782,1001
38 | 27,Anthony,Ball,34386,1003
39 | 28,Alexis,Beck,12260,1005
40 | 29,Jason,Olsen,51937,1006
41 | 30,Stephen,Smith,194791,1001
42 | 31,Kimberly,Brooks,95327,1003
43 | 32,Eric,Zimmerman,83093,1006
44 | 33,Peter,Holt,69945,1002
45 | 34,Justin,Dunn,67992,1003
46 | 35,John,Ball,47795,1004
47 | 36,Jesus,Ward,36078,1005
48 | 37,Philip,Gillespie,36424,1006
49 | 38,Nicole,Lewis,114079,1001
50 | 39,Linda,Clark,186781,1002
51 | 40,Colleen,Carrillo,147723,1004
52 | 41,John,George,21642,1001
53 | 42,Traci,Williams,138892,1003
54 | 42,Traci,Williams,150000,1003
55 | 42,Traci,Williams,160000,1003
56 | 42,Traci,Williams,180000,1003
57 | 43,Joseph,Rogers,22800,1005
58 | 44,Trevor,Carter,38670,1001
59 | 45,Kevin,Duncan,45210,1003
60 | 46,Joshua,Ewing,73088,1003
61 | 47,Kimberly,Dean,71416,1003
62 | 48,Robert,Lynch,117960,1004
63 | 49,Amber,Harding,77764,1002
64 | 50,Victoria,Wilson,176620,1002
65 | 51,Theresa,Everett,31404,1002
66 | 52,Kara,Smith,192838,1004
67 | 53,Teresa,Cohen,98860,1001
68 | 54,Wesley,Tucker,90221,1005
69 | 55,Michael,Morris,106799,1005
70 | 56,Rachael,Williams,103585,1002
71 | 57,Patricia,Harmon,147417,1005
72 | 58,Edward,Sharp,41077,1005
73 | 59,Kevin,Robinson,100924,1005
74 | 60,Charles,Pearson,173317,1004
75 | 61,Ryan,Brown,110225,1003
76 | 61,Ryan,Brown,120000,1003
77 | 62,Dale,Hayes,97662,1005
78 | 63,Richard,Sanford,136083,1001
79 | 64,Danielle,Williams,98655,1006
80 | 64,Danielle,Williams,110000,1006
81 | 64,Danielle,Williams,120000,1006
82 | 65,Deborah,Martin,67389,1004
83 | 66,Dustin,Bush,47567,1004
84 | 67,Tyler,Green,111085,1002
85 | 68,Antonio,Carpenter,83684,1002
86 | 69,Ernest,Peterson,115993,1005
87 | 70,Karen,Fernandez,101238,1003
88 | 71,Kristine,Casey,67651,1003
89 | 72,Christine,Frye,137244,1004
90 | 73,William,Preston,155225,1003
91 | 74,Richard,Cole,180361,1003
92 | 75,Julia,Ramos,61398,1006
93 | 75,Julia,Ramos,70000,1006
94 | 75,Julia,Ramos,83000,1006
95 | 75,Julia,Ramos,90000,1006
96 | 75,Julia,Ramos,105000,1006
97 |
--------------------------------------------------------------------------------
/Problem 7/transaction.csv:
--------------------------------------------------------------------------------
1 | id,user_id,item,created_at,revenue
2 | 1,109,milk,2020-03-03,123
3 | 2,139,biscuit,2020-03-18,421
4 | 3,120,milk,2020-03-18,176
5 | 4,108,banana,2020-03-18,862
6 | 5,130,milk,2020-03-28,333
7 | 6,103,bread,2020-03-29,862
8 | 7,122,banana,2020-03-07,952
9 | 8,125,bread,2020-03-13,317
10 | 9,139,bread,2020-03-30,929
11 | 10,141,banana,2020-03-17,812
12 | 11,116,bread,2020-03-31,226
13 | 12,128,bread,2020-03-04,112
14 | 13,146,biscuit,2020-03-04,362
15 | 14,119,banana,2020-03-28,127
16 | 15,142,bread,2020-03-09,503
17 | 16,122,bread,2020-03-06,593
18 | 17,128,biscuit,2020-03-24,160
19 | 18,112,banana,2020-03-24,262
20 | 19,149,banana,2020-03-29,382
21 | 20,100,banana,2020-03-18,599
22 | 21,130,milk,2020-03-16,604
23 | 22,103,milk,2020-03-31,290
24 | 23,112,banana,2020-03-23,523
25 | 24,102,bread,2020-03-25,325
26 | 25,120,biscuit,2020-03-21,858
27 | 26,109,bread,2020-03-22,432
28 | 27,101,milk,2020-03-01,449
29 | 28,138,milk,2020-03-19,961
30 | 29,100,milk,2020-03-29,410
31 | 30,129,milk,2020-03-02,771
32 | 31,123,milk,2020-03-31,434
33 | 32,104,biscuit,2020-03-31,957
34 | 33,110,bread,2020-03-13,210
35 | 34,143,bread,2020-03-27,870
36 | 35,130,milk,2020-03-12,176
37 | 36,128,milk,2020-03-28,498
38 | 37,133,banana,2020-03-21,837
39 | 38,150,banana,2020-03-20,927
40 | 39,120,milk,2020-03-27,793
41 | 40,109,bread,2020-03-02,362
42 | 41,110,bread,2020-03-13,262
43 | 42,140,milk,2020-03-09,468
44 | 43,112,banana,2020-03-04,381
45 | 44,117,biscuit,2020-03-19,831
46 | 45,137,banana,2020-03-23,490
47 | 46,130,bread,2020-03-09,149
48 | 47,133,bread,2020-03-08,658
49 | 48,143,milk,2020-03-11,317
50 | 49,111,biscuit,2020-03-23,204
51 | 50,150,banana,2020-03-04,299
52 | 51,131,bread,2020-03-10,155
53 | 52,140,biscuit,2020-03-17,810
54 | 53,147,banana,2020-03-22,702
55 | 54,119,biscuit,2020-03-15,355
56 | 55,116,milk,2020-03-12,468
57 | 56,141,milk,2020-03-14,254
58 | 57,143,bread,2020-03-16,647
59 | 58,105,bread,2020-03-21,562
60 | 59,149,biscuit,2020-03-11,827
61 | 60,117,banana,2020-03-22,249
62 | 61,150,banana,2020-03-21,450
63 | 62,134,bread,2020-03-08,981
64 | 63,133,banana,2020-03-26,353
65 | 64,127,milk,2020-03-27,300
66 | 65,101,milk,2020-03-26,740
67 | 66,137,biscuit,2020-03-12,473
68 | 67,113,biscuit,2020-03-21,278
69 | 68,141,bread,2020-03-21,118
70 | 69,112,biscuit,2020-03-14,334
71 | 70,118,milk,2020-03-30,603
72 | 71,111,milk,2020-03-19,205
73 | 72,146,biscuit,2020-03-13,599
74 | 73,148,banana,2020-03-14,530
75 | 74,100,banana,2020-03-13,175
76 | 75,105,banana,2020-03-05,815
77 | 76,129,milk,2020-03-02,489
78 | 77,121,milk,2020-03-16,476
79 | 78,117,bread,2020-03-11,270
80 | 79,133,milk,2020-03-12,446
81 | 80,124,bread,2020-03-31,937
82 | 81,145,bread,2020-03-07,821
83 | 82,105,banana,2020-03-09,972
84 | 83,131,milk,2020-03-09,808
85 | 84,114,biscuit,2020-03-31,202
86 | 85,120,milk,2020-03-06,898
87 | 86,130,milk,2020-03-06,581
88 | 87,141,biscuit,2020-03-11,749
89 | 88,147,bread,2020-03-14,262
90 | 89,118,milk,2020-03-15,735
91 | 90,136,biscuit,2020-03-22,410
92 | 91,132,bread,2020-03-06,161
93 | 92,137,biscuit,2020-03-31,427
94 | 93,107,bread,2020-03-01,701
95 | 94,111,biscuit,2020-03-18,218
96 | 95,100,bread,2020-03-07,410
97 | 96,106,milk,2020-03-21,379
98 | 97,114,banana,2020-03-25,705
99 | 98,110,bread,2020-03-27,225
100 | 99,130,milk,2020-03-16,494
101 | 100,117,bread,2020-03-10,209
--------------------------------------------------------------------------------
/Problem 6/problem6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/14 14:24:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "#Start Spark Session\n",
49 | "spark = SparkSession.builder.appName(\"problem6\").getOrCreate()\n",
50 | "sqlContext = SparkSession(spark)\n",
51 | "#Dont Show warning only error\n",
52 | "spark.sparkContext.setLogLevel(\"ERROR\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 4,
58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "#Load CSV file into DataFrame\n",
63 | "studentdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"students.csv\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 5,
69 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "root\n",
77 | " |-- ID: integer (nullable = true)\n",
78 | " |-- Name: string (nullable = true)\n",
79 | " |-- Marks: integer (nullable = true)\n",
80 | "\n"
81 | ]
82 | }
83 | ],
84 | "source": [
85 | "#Check Schema of DataFrame\n",
86 | "studentdf.printSchema()"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 6,
92 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "+---+---------+-----+\n",
100 | "| ID| Name|Marks|\n",
101 | "+---+---------+-----+\n",
102 | "| 19| Samantha| 87|\n",
103 | "| 21| Julia| 96|\n",
104 | "| 11| Britney| 95|\n",
105 | "| 32| Kristeen| 100|\n",
106 | "| 12| Dyana| 55|\n",
107 | "| 13| Jenny| 66|\n",
108 | "| 14|Christene| 88|\n",
109 | "| 15| Meera| 24|\n",
110 | "| 16| Priya| 76|\n",
111 | "| 17| Priyanka| 77|\n",
112 | "| 18| Paige| 74|\n",
113 | "| 19| Jane| 64|\n",
114 | "| 21| Belvet| 78|\n",
115 | "| 31| Scarlet| 80|\n",
116 | "| 41| Salma| 81|\n",
117 | "| 51| Amanda| 34|\n",
118 | "| 61| Heraldo| 94|\n",
119 | "| 71| Stuart| 99|\n",
120 | "| 81| Aamina| 77|\n",
121 | "| 76| Amina| 89|\n",
122 | "+---+---------+-----+\n",
123 | "only showing top 20 rows\n",
124 | "\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "#Check sample Data \n",
130 | "studentdf.show()"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 14,
136 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171",
137 | "metadata": {},
138 | "outputs": [
139 | {
140 | "name": "stdout",
141 | "output_type": "stream",
142 | "text": [
143 | "+---------+\n",
144 | "| Name|\n",
145 | "+---------+\n",
146 | "| Stuart|\n",
147 | "| Kristeen|\n",
148 | "|Christene|\n",
149 | "| Amina|\n",
150 | "| Aamina|\n",
151 | "| Priya|\n",
152 | "| Heraldo|\n",
153 | "| Scarlet|\n",
154 | "| Julia|\n",
155 | "| Salma|\n",
156 | "| Britney|\n",
157 | "| Priyanka|\n",
158 | "| Samantha|\n",
159 | "| Vivek|\n",
160 | "| Belvet|\n",
161 | "| Devil|\n",
162 | "| Evil|\n",
163 | "+---------+\n",
164 | "\n"
165 | ]
166 | }
167 | ],
168 | "source": [
169 | "#Solving Problem using PySpark \n",
170 | "#Filter with Markes > 75 and then order by last 3 char and ID\n",
171 | "from pyspark.sql.functions import expr\n",
172 | "studentdf.select(\"Name\").where(\"Marks > 75\").orderBy(expr(\"RIGHT(Name,3)\"),\"ID\").show(n=100)"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 13,
178 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
179 | "metadata": {},
180 | "outputs": [],
181 | "source": [
182 | "# Now we are solving Same problem using Spark SQL \n",
183 | "# Creating Temp Table or HIVE table\n",
184 | "stationdf.createOrReplaceTempView(\"tmpStudent\")"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": 15,
190 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
191 | "metadata": {},
192 | "outputs": [
193 | {
194 | "name": "stdout",
195 | "output_type": "stream",
196 | "text": [
197 | "+---------+\n",
198 | "| Name|\n",
199 | "+---------+\n",
200 | "| Stuart|\n",
201 | "| Kristeen|\n",
202 | "|Christene|\n",
203 | "| Amina|\n",
204 | "| Aamina|\n",
205 | "| Priya|\n",
206 | "| Heraldo|\n",
207 | "| Scarlet|\n",
208 | "| Julia|\n",
209 | "| Salma|\n",
210 | "| Britney|\n",
211 | "| Priyanka|\n",
212 | "| Samantha|\n",
213 | "| Vivek|\n",
214 | "| Belvet|\n",
215 | "| Devil|\n",
216 | "| Evil|\n",
217 | "+---------+\n",
218 | "\n"
219 | ]
220 | }
221 | ],
222 | "source": [
223 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
224 | "# For example by Select on table \n",
225 | "sqlContext.sql(\"SELECT Name \\\n",
226 | " FROM tmpStudent \\\n",
227 | " WHERE Marks > 75 \\\n",
228 | " ORDER BY right(Name,3),ID\").show()"
229 | ]
230 | }
231 | ],
232 | "metadata": {
233 | "kernelspec": {
234 | "display_name": "Python 3 (ipykernel)",
235 | "language": "python",
236 | "name": "python3"
237 | },
238 | "language_info": {
239 | "codemirror_mode": {
240 | "name": "ipython",
241 | "version": 3
242 | },
243 | "file_extension": ".py",
244 | "mimetype": "text/x-python",
245 | "name": "python",
246 | "nbconvert_exporter": "python",
247 | "pygments_lexer": "ipython3",
248 | "version": "3.8.13"
249 | }
250 | },
251 | "nbformat": 4,
252 | "nbformat_minor": 5
253 | }
254 |
--------------------------------------------------------------------------------
/Problem 7/problem7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/15 10:13:52 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "#Start Spark Session\n",
49 | "spark = SparkSession.builder.appName(\"problem7\").getOrCreate()\n",
50 | "sqlContext = SparkSession(spark)\n",
51 | "#Dont Show warning only error\n",
52 | "spark.sparkContext.setLogLevel(\"ERROR\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stderr",
63 | "output_type": "stream",
64 | "text": [
65 | " \r"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "#Load CSV file into DataFrame\n",
71 | "transactiondf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"transaction.csv\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "root\n",
85 | " |-- id: integer (nullable = true)\n",
86 | " |-- user_id: integer (nullable = true)\n",
87 | " |-- item: string (nullable = true)\n",
88 | " |-- created_at: string (nullable = true)\n",
89 | " |-- revenue: integer (nullable = true)\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "#Check Schema of DataFrame\n",
96 | "transactiondf.printSchema()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 10,
102 | "id": "c9ba5185-8682-4b49-88b8-9391cd0c2dac",
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "from pyspark.sql.functions import col\n",
107 | "transactiondf = transactiondf.withColumn(\"created_at\",col(\"created_at\").cast(\"date\"))"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 12,
113 | "id": "59514ce5-8584-4b67-9cff-934e9287f818",
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "root\n",
121 | " |-- id: integer (nullable = true)\n",
122 | " |-- user_id: integer (nullable = true)\n",
123 | " |-- item: string (nullable = true)\n",
124 | " |-- created_at: date (nullable = true)\n",
125 | " |-- revenue: integer (nullable = true)\n",
126 | "\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "transactiondf.printSchema()"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 13,
137 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "+---+-------+-------+----------+-------+\n",
145 | "| id|user_id| item|created_at|revenue|\n",
146 | "+---+-------+-------+----------+-------+\n",
147 | "| 1| 109| milk|2020-03-03| 123|\n",
148 | "| 2| 139|biscuit|2020-03-18| 421|\n",
149 | "| 3| 120| milk|2020-03-18| 176|\n",
150 | "| 4| 108| banana|2020-03-18| 862|\n",
151 | "| 5| 130| milk|2020-03-28| 333|\n",
152 | "| 6| 103| bread|2020-03-29| 862|\n",
153 | "| 7| 122| banana|2020-03-07| 952|\n",
154 | "| 8| 125| bread|2020-03-13| 317|\n",
155 | "| 9| 139| bread|2020-03-30| 929|\n",
156 | "| 10| 141| banana|2020-03-17| 812|\n",
157 | "| 11| 116| bread|2020-03-31| 226|\n",
158 | "| 12| 128| bread|2020-03-04| 112|\n",
159 | "| 13| 146|biscuit|2020-03-04| 362|\n",
160 | "| 14| 119| banana|2020-03-28| 127|\n",
161 | "| 15| 142| bread|2020-03-09| 503|\n",
162 | "| 16| 122| bread|2020-03-06| 593|\n",
163 | "| 17| 128|biscuit|2020-03-24| 160|\n",
164 | "| 18| 112| banana|2020-03-24| 262|\n",
165 | "| 19| 149| banana|2020-03-29| 382|\n",
166 | "| 20| 100| banana|2020-03-18| 599|\n",
167 | "+---+-------+-------+----------+-------+\n",
168 | "only showing top 20 rows\n",
169 | "\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "#Check sample Data \n",
175 | "transactiondf.show()"
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "execution_count": 14,
181 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
182 | "metadata": {},
183 | "outputs": [],
184 | "source": [
185 | "# Now we are solving Same problem using Spark SQL \n",
186 | "# Creating Temp Table or HIVE table\n",
187 | "transactiondf.createOrReplaceTempView(\"tmpTransaction\")"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 16,
193 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
194 | "metadata": {},
195 | "outputs": [
196 | {
197 | "name": "stdout",
198 | "output_type": "stream",
199 | "text": [
200 | "+-------+\n",
201 | "|user_id|\n",
202 | "+-------+\n",
203 | "| 100|\n",
204 | "| 103|\n",
205 | "| 105|\n",
206 | "| 109|\n",
207 | "| 110|\n",
208 | "| 111|\n",
209 | "| 112|\n",
210 | "| 114|\n",
211 | "| 117|\n",
212 | "| 120|\n",
213 | "| 122|\n",
214 | "| 128|\n",
215 | "| 129|\n",
216 | "| 130|\n",
217 | "| 131|\n",
218 | "| 133|\n",
219 | "| 141|\n",
220 | "| 143|\n",
221 | "| 150|\n",
222 | "+-------+\n",
223 | "\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
229 | "# For example by Select on table \n",
230 | "sqlContext.sql(\"SELECT DISTINCT(a1.user_id) \\\n",
231 | " FROM tmpTransaction a1 \\\n",
232 | " JOIN tmpTransaction a2 ON a1.user_id=a2.user_id \\\n",
233 | " AND a1.id <> a2.id \\\n",
234 | " AND DATEDIFF(a2.created_at,a1.created_at) BETWEEN 0 AND 7 \\\n",
235 | " ORDER BY a1.user_id;\").show()"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": null,
241 | "id": "e55eb16a-fb5c-42b6-9f7c-feb1ff9c2945",
242 | "metadata": {},
243 | "outputs": [],
244 | "source": []
245 | }
246 | ],
247 | "metadata": {
248 | "kernelspec": {
249 | "display_name": "Python 3 (ipykernel)",
250 | "language": "python",
251 | "name": "python3"
252 | },
253 | "language_info": {
254 | "codemirror_mode": {
255 | "name": "ipython",
256 | "version": 3
257 | },
258 | "file_extension": ".py",
259 | "mimetype": "text/x-python",
260 | "name": "python",
261 | "nbconvert_exporter": "python",
262 | "pygments_lexer": "ipython3",
263 | "version": "3.8.13"
264 | }
265 | },
266 | "nbformat": 4,
267 | "nbformat_minor": 5
268 | }
269 |
--------------------------------------------------------------------------------
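Note on Problem 7: the self-join pairs every transaction with each other transaction by the same user (a1.id <> a2.id), and DATEDIFF(a2.created_at, a1.created_at) BETWEEN 0 AND 7 keeps pairs at most a week apart, so the DISTINCT user_id list contains exactly the users with more than one transaction inside some 7-day window. A minimal DataFrame-API sketch of the same join, reusing the transactiondf DataFrame loaded in the notebook:

    from pyspark.sql.functions import col, datediff

    a1 = transactiondf.alias("a1")
    a2 = transactiondf.alias("a2")

    # Pair distinct transactions of the same user that are 0-7 days apart,
    # then keep each qualifying user once, ordered by user_id.
    (a1.join(a2,
             (col("a1.user_id") == col("a2.user_id")) &
             (col("a1.id") != col("a2.id")) &
             (datediff(col("a2.created_at"), col("a1.created_at")).between(0, 7)))
       .select(col("a1.user_id").alias("user_id"))
       .distinct()
       .orderBy("user_id")
       .show())
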
/Problem 4/problem4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/09 22:16:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "#Start Spark Session\n",
49 | "spark = SparkSession.builder.appName(\"problem4\").getOrCreate()\n",
50 | "sqlContext = SparkSession(spark)\n",
51 | "#Dont Show warning only error\n",
52 | "spark.sparkContext.setLogLevel(\"ERROR\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stderr",
63 | "output_type": "stream",
64 | "text": [
65 | " \r"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "#Load CSV file into DataFrame\n",
71 | "stationdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"station.csv\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "root\n",
85 | " |-- ID: integer (nullable = true)\n",
86 | " |-- City: string (nullable = true)\n",
87 | " |-- State: string (nullable = true)\n",
88 | " |-- Lattitude: double (nullable = true)\n",
89 | " |-- Longitude: double (nullable = true)\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "#Check Schema of DataFrame\n",
96 | "stationdf.printSchema()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "+---+-----------+-----+-----------+-----------+\n",
110 | "| ID| City|State| Lattitude| Longitude|\n",
111 | "+---+-----------+-----+-----------+-----------+\n",
112 | "|478| Tipton| IN|33.54792701|97.94286036|\n",
113 | "|619| Arlington| CO|75.17993079|92.94615894|\n",
114 | "|711| Turner| AR|50.24380534|101.4580163|\n",
115 | "|839| Slidell| LA|85.32270304|151.8743276|\n",
116 | "|411| Negreet| LA| 98.9707194|105.3376115|\n",
117 | "|588| Glencoe| KY|46.38739244|136.0427027|\n",
118 | "|665| Chelsea| IA|98.72210937|59.68913002|\n",
119 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n",
120 | "|811| Dorrance| KS|102.0888316|121.5614372|\n",
121 | "|698| Albany| CA|49.75112765|80.21211317|\n",
122 | "|325| Monument| KS|70.52300953|141.7680413|\n",
123 | "|414| Manchester| MD|73.51580724|37.14602869|\n",
124 | "|113| Prescott| IA|39.93234421|65.79327823|\n",
125 | "|971|Graettinger| IA|94.66283665|150.3826243|\n",
126 | "|266| Cahone| CO|116.2321963| 127.009554|\n",
127 | "|617| Sturgis| MS|36.45673517|126.1690696|\n",
128 | "|495| Upperco| MD|114.2157413|29.63104758|\n",
129 | "|473| Highwood| IL|27.25445814|150.9227402|\n",
130 | "|959| Waipahu| HI|106.4460526|33.91451792|\n",
131 | "|438| Bowdon| GA|88.98111013|78.49025241|\n",
132 | "+---+-----------+-----+-----------+-----------+\n",
133 | "only showing top 20 rows\n",
134 | "\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "#Check sample Data \n",
140 | "stationdf.show()"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 6,
146 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "# Now we are solving Same problem using Spark SQL \n",
151 | "# Creating Temp Table or HIVE table\n",
152 | "stationdf.createOrReplaceTempView(\"tmpStation\")"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 7,
158 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "+---+-----------+-----+-----------+-----------+\n",
166 | "| ID| City|State| Lattitude| Longitude|\n",
167 | "+---+-----------+-----+-----------+-----------+\n",
168 | "|478| Tipton| IN|33.54792701|97.94286036|\n",
169 | "|619| Arlington| CO|75.17993079|92.94615894|\n",
170 | "|711| Turner| AR|50.24380534|101.4580163|\n",
171 | "|839| Slidell| LA|85.32270304|151.8743276|\n",
172 | "|411| Negreet| LA| 98.9707194|105.3376115|\n",
173 | "|588| Glencoe| KY|46.38739244|136.0427027|\n",
174 | "|665| Chelsea| IA|98.72210937|59.68913002|\n",
175 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n",
176 | "|811| Dorrance| KS|102.0888316|121.5614372|\n",
177 | "|698| Albany| CA|49.75112765|80.21211317|\n",
178 | "|325| Monument| KS|70.52300953|141.7680413|\n",
179 | "|414| Manchester| MD|73.51580724|37.14602869|\n",
180 | "|113| Prescott| IA|39.93234421|65.79327823|\n",
181 | "|971|Graettinger| IA|94.66283665|150.3826243|\n",
182 | "|266| Cahone| CO|116.2321963| 127.009554|\n",
183 | "|617| Sturgis| MS|36.45673517|126.1690696|\n",
184 | "|495| Upperco| MD|114.2157413|29.63104758|\n",
185 | "|473| Highwood| IL|27.25445814|150.9227402|\n",
186 | "|959| Waipahu| HI|106.4460526|33.91451792|\n",
187 | "|438| Bowdon| GA|88.98111013|78.49025241|\n",
188 | "+---+-----------+-----+-----------+-----------+\n",
189 | "only showing top 20 rows\n",
190 | "\n"
191 | ]
192 | }
193 | ],
194 | "source": [
195 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
196 | "# For example by Select on table \n",
197 | "sqlContext.sql(\"SELECT * FROM tmpStation\").show()"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 9,
203 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24",
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "name": "stderr",
208 | "output_type": "stream",
209 | "text": [
210 | " \r"
211 | ]
212 | },
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "+--------------+----------+\n",
218 | "| city|citylength|\n",
219 | "+--------------+----------+\n",
220 | "| Amo| 3|\n",
221 | "|Fredericksburg| 14|\n",
222 | "+--------------+----------+\n",
223 | "\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "# Now we will write query to get max salary for each employee \n",
229 | "# so we will use SQL Group by and SQL Order by functions \n",
230 | "sqlContext.sql(\"SELECT q1.city, q1.citylength FROM \\\n",
231 | " (SELECT CITY,LENGTH(CITY) as citylength, RANK() OVER (PARTITION BY LENGTH(CITY) ORDER BY LENGTH(CITY),CITY) as actualrank \\\n",
232 | " FROM tmpStation) q1 \\\n",
233 | " WHERE q1. actualrank = 1 \\\n",
234 | " AND q1.citylength = (SELECT MIN(LENGTH(CITY)) FROM tmpStation) \\\n",
235 | " OR q1.citylength = (SELECT MAX(LENGTH(CITY)) FROM tmpStation)\").show(n=100)\n",
236 | "\n"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": null,
242 | "id": "3eeb1534-4da5-427f-9103-1a7bb847170e",
243 | "metadata": {},
244 | "outputs": [],
245 | "source": []
246 | }
247 | ],
248 | "metadata": {
249 | "kernelspec": {
250 | "display_name": "Python 3 (ipykernel)",
251 | "language": "python",
252 | "name": "python3"
253 | },
254 | "language_info": {
255 | "codemirror_mode": {
256 | "name": "ipython",
257 | "version": 3
258 | },
259 | "file_extension": ".py",
260 | "mimetype": "text/x-python",
261 | "name": "python",
262 | "nbconvert_exporter": "python",
263 | "pygments_lexer": "ipython3",
264 | "version": "3.8.13"
265 | }
266 | },
267 | "nbformat": 4,
268 | "nbformat_minor": 5
269 | }
270 |
--------------------------------------------------------------------------------
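Note on Problem 4: RANK() is partitioned by LENGTH(CITY) and effectively ordered by city name within each partition, so rank 1 is the alphabetically first city of each length; the outer filter then keeps only the minimum-length and maximum-length partitions. The same answer can be obtained without a window function by sorting twice; a minimal sketch using the stationdf DataFrame from the notebook:

    from pyspark.sql.functions import col, length

    cities = stationdf.select(col("City").alias("city"),
                              length("City").alias("citylength"))

    # Shortest city name, ties broken alphabetically (same tie-break as the SQL).
    cities.orderBy(col("citylength").asc(), col("city").asc()).show(1)
    # Longest city name, with the same alphabetical tie-break.
    cities.orderBy(col("citylength").desc(), col("city").asc()).show(1)
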
/Problem 3/problem3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/09 10:33:16 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "#Start Spark Session\n",
49 | "spark = SparkSession.builder.appName(\"problem3\").getOrCreate()\n",
50 | "sqlContext = SparkSession(spark)\n",
51 | "#Dont Show warning only error\n",
52 | "spark.sparkContext.setLogLevel(\"ERROR\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stderr",
63 | "output_type": "stream",
64 | "text": [
65 | " \r"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "#Load CSV file into DataFrame\n",
71 | "stationdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"station.csv\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "root\n",
85 | " |-- ID: integer (nullable = true)\n",
86 | " |-- City: string (nullable = true)\n",
87 | " |-- State: string (nullable = true)\n",
88 | " |-- Lattitude: double (nullable = true)\n",
89 | " |-- Longitude: double (nullable = true)\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "#Check Schema of DataFrame\n",
96 | "stationdf.printSchema()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "+---+-----------+-----+-----------+-----------+\n",
110 | "| ID| City|State| Lattitude| Longitude|\n",
111 | "+---+-----------+-----+-----------+-----------+\n",
112 | "|478| Tipton| IN|33.54792701|97.94286036|\n",
113 | "|619| Arlington| CO|75.17993079|92.94615894|\n",
114 | "|711| Turner| AR|50.24380534|101.4580163|\n",
115 | "|839| Slidell| LA|85.32270304|151.8743276|\n",
116 | "|411| Negreet| LA| 98.9707194|105.3376115|\n",
117 | "|588| Glencoe| KY|46.38739244|136.0427027|\n",
118 | "|665| Chelsea| IA|98.72210937|59.68913002|\n",
119 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n",
120 | "|811| Dorrance| KS|102.0888316|121.5614372|\n",
121 | "|698| Albany| CA|49.75112765|80.21211317|\n",
122 | "|325| Monument| KS|70.52300953|141.7680413|\n",
123 | "|414| Manchester| MD|73.51580724|37.14602869|\n",
124 | "|113| Prescott| IA|39.93234421|65.79327823|\n",
125 | "|971|Graettinger| IA|94.66283665|150.3826243|\n",
126 | "|266| Cahone| CO|116.2321963| 127.009554|\n",
127 | "|617| Sturgis| MS|36.45673517|126.1690696|\n",
128 | "|495| Upperco| MD|114.2157413|29.63104758|\n",
129 | "|473| Highwood| IL|27.25445814|150.9227402|\n",
130 | "|959| Waipahu| HI|106.4460526|33.91451792|\n",
131 | "|438| Bowdon| GA|88.98111013|78.49025241|\n",
132 | "+---+-----------+-----+-----------+-----------+\n",
133 | "only showing top 20 rows\n",
134 | "\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "#Check sample Data \n",
140 | "stationdf.show()"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 9,
146 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171",
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stderr",
151 | "output_type": "stream",
152 | "text": [
153 | "[Stage 6:> (0 + 1) / 1]\r"
154 | ]
155 | },
156 | {
157 | "name": "stdout",
158 | "output_type": "stream",
159 | "text": [
160 | "+------------------------------------+\n",
161 | "|(count(City) - count(DISTINCT City))|\n",
162 | "+------------------------------------+\n",
163 | "| 3|\n",
164 | "+------------------------------------+\n",
165 | "\n"
166 | ]
167 | },
168 | {
169 | "name": "stderr",
170 | "output_type": "stream",
171 | "text": [
172 | " \r"
173 | ]
174 | }
175 | ],
176 | "source": [
177 | "#Solving Problem using PySpark \n",
178 | "# ind the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table. \n",
179 | "from pyspark.sql.functions import countDistinct\n",
180 | "from pyspark.sql.functions import count\n",
181 | "stationdf.select(count(\"City\") - countDistinct(\"City\")).show()"
182 | ]
183 | },
184 | {
185 | "cell_type": "code",
186 | "execution_count": 10,
187 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
188 | "metadata": {},
189 | "outputs": [],
190 | "source": [
191 | "# Now we are solving Same problem using Spark SQL \n",
192 | "# Creating Temp Table or HIVE table\n",
193 | "stationdf.createOrReplaceTempView(\"tmpStation\")"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 11,
199 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "+---+-----------+-----+-----------+-----------+\n",
207 | "| ID| City|State| Lattitude| Longitude|\n",
208 | "+---+-----------+-----+-----------+-----------+\n",
209 | "|478| Tipton| IN|33.54792701|97.94286036|\n",
210 | "|619| Arlington| CO|75.17993079|92.94615894|\n",
211 | "|711| Turner| AR|50.24380534|101.4580163|\n",
212 | "|839| Slidell| LA|85.32270304|151.8743276|\n",
213 | "|411| Negreet| LA| 98.9707194|105.3376115|\n",
214 | "|588| Glencoe| KY|46.38739244|136.0427027|\n",
215 | "|665| Chelsea| IA|98.72210937|59.68913002|\n",
216 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n",
217 | "|811| Dorrance| KS|102.0888316|121.5614372|\n",
218 | "|698| Albany| CA|49.75112765|80.21211317|\n",
219 | "|325| Monument| KS|70.52300953|141.7680413|\n",
220 | "|414| Manchester| MD|73.51580724|37.14602869|\n",
221 | "|113| Prescott| IA|39.93234421|65.79327823|\n",
222 | "|971|Graettinger| IA|94.66283665|150.3826243|\n",
223 | "|266| Cahone| CO|116.2321963| 127.009554|\n",
224 | "|617| Sturgis| MS|36.45673517|126.1690696|\n",
225 | "|495| Upperco| MD|114.2157413|29.63104758|\n",
226 | "|473| Highwood| IL|27.25445814|150.9227402|\n",
227 | "|959| Waipahu| HI|106.4460526|33.91451792|\n",
228 | "|438| Bowdon| GA|88.98111013|78.49025241|\n",
229 | "+---+-----------+-----+-----------+-----------+\n",
230 | "only showing top 20 rows\n",
231 | "\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
237 | "# For example by Select on table \n",
238 | "sqlContext.sql(\"SELECT * FROM tmpStation\").show()"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 12,
244 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24",
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stderr",
249 | "output_type": "stream",
250 | "text": [
251 | "[Stage 13:> (0 + 1) / 1]\r"
252 | ]
253 | },
254 | {
255 | "name": "stdout",
256 | "output_type": "stream",
257 | "text": [
258 | "+---------+-----------------+---------------+\n",
259 | "|citycount|distinctcitycount|diffbetweenboth|\n",
260 | "+---------+-----------------+---------------+\n",
261 | "| 282| 279| 3|\n",
262 | "+---------+-----------------+---------------+\n",
263 | "\n"
264 | ]
265 | },
266 | {
267 | "name": "stderr",
268 | "output_type": "stream",
269 | "text": [
270 | " \r"
271 | ]
272 | }
273 | ],
274 | "source": [
275 | "# Now we will write query to get max salary for each employee \n",
276 | "# so we will use SQL Group by and SQL Order by functions \n",
277 | "sqlContext.sql(\"SELECT count(city) as citycount, count(distinct(city)) as distinctcitycount \\\n",
278 | " ,(count(city) - count(distinct(city))) as diffbetweenboth \\\n",
279 | " FROM tmpStation\").show(n=100)\n",
280 | "\n"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": null,
286 | "id": "3eeb1534-4da5-427f-9103-1a7bb847170e",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": []
290 | }
291 | ],
292 | "metadata": {
293 | "kernelspec": {
294 | "display_name": "Python 3 (ipykernel)",
295 | "language": "python",
296 | "name": "python3"
297 | },
298 | "language_info": {
299 | "codemirror_mode": {
300 | "name": "ipython",
301 | "version": 3
302 | },
303 | "file_extension": ".py",
304 | "mimetype": "text/x-python",
305 | "name": "python",
306 | "nbconvert_exporter": "python",
307 | "pygments_lexer": "ipython3",
308 | "version": "3.8.13"
309 | }
310 | },
311 | "nbformat": 4,
312 | "nbformat_minor": 5
313 | }
314 |
--------------------------------------------------------------------------------
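Note on Problem 3: count(City) counts every non-null row while count(DISTINCT City) counts unique names, so the difference (3 here: 282 - 279) is the number of duplicate city entries. The PySpark cell can label its output like the SQL cell by aliasing each aggregate; a minimal sketch using stationdf from the notebook:

    from pyspark.sql.functions import count, countDistinct

    # Same aggregation as the SQL cell, with matching column names.
    (stationdf.select(
        count("City").alias("citycount"),
        countDistinct("City").alias("distinctcitycount"),
        (count("City") - countDistinct("City")).alias("diffbetweenboth"))
      .show())
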
/Problem 5/problem5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/09 11:10:57 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "#Start Spark Session\n",
49 | "spark = SparkSession.builder.appName(\"problem5\").getOrCreate()\n",
50 | "sqlContext = SparkSession(spark)\n",
51 | "#Dont Show warning only error\n",
52 | "spark.sparkContext.setLogLevel(\"ERROR\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stderr",
63 | "output_type": "stream",
64 | "text": [
65 | " \r"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "#Load CSV file into DataFrame\n",
71 | "stationdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"station.csv\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "root\n",
85 | " |-- ID: integer (nullable = true)\n",
86 | " |-- City: string (nullable = true)\n",
87 | " |-- State: string (nullable = true)\n",
88 | " |-- Lattitude: double (nullable = true)\n",
89 | " |-- Longitude: double (nullable = true)\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "#Check Schema of DataFrame\n",
96 | "stationdf.printSchema()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "+---+-----------+-----+-----------+-----------+\n",
110 | "| ID| City|State| Lattitude| Longitude|\n",
111 | "+---+-----------+-----+-----------+-----------+\n",
112 | "|478| Tipton| IN|33.54792701|97.94286036|\n",
113 | "|619| Arlington| CO|75.17993079|92.94615894|\n",
114 | "|711| Turner| AR|50.24380534|101.4580163|\n",
115 | "|839| Slidell| LA|85.32270304|151.8743276|\n",
116 | "|411| Negreet| LA| 98.9707194|105.3376115|\n",
117 | "|588| Glencoe| KY|46.38739244|136.0427027|\n",
118 | "|665| Chelsea| IA|98.72210937|59.68913002|\n",
119 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n",
120 | "|811| Dorrance| KS|102.0888316|121.5614372|\n",
121 | "|698| Albany| CA|49.75112765|80.21211317|\n",
122 | "|325| Monument| KS|70.52300953|141.7680413|\n",
123 | "|414| Manchester| MD|73.51580724|37.14602869|\n",
124 | "|113| Prescott| IA|39.93234421|65.79327823|\n",
125 | "|971|Graettinger| IA|94.66283665|150.3826243|\n",
126 | "|266| Cahone| CO|116.2321963| 127.009554|\n",
127 | "|617| Sturgis| MS|36.45673517|126.1690696|\n",
128 | "|495| Upperco| MD|114.2157413|29.63104758|\n",
129 | "|473| Highwood| IL|27.25445814|150.9227402|\n",
130 | "|959| Waipahu| HI|106.4460526|33.91451792|\n",
131 | "|438| Bowdon| GA|88.98111013|78.49025241|\n",
132 | "+---+-----------+-----+-----------+-----------+\n",
133 | "only showing top 20 rows\n",
134 | "\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "#Check sample Data \n",
140 | "stationdf.show()"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 17,
146 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171",
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "+-------------+\n",
154 | "| City|\n",
155 | "+-------------+\n",
156 | "| Arlington|\n",
157 | "| Albany|\n",
158 | "| Upperco|\n",
159 | "| Aguanga|\n",
160 | "| Odin|\n",
161 | "| Algonac|\n",
162 | "| Onaway|\n",
163 | "| Irvington|\n",
164 | "| Arrowsmith|\n",
165 | "| Udall|\n",
166 | "| Oakfield|\n",
167 | "| Elkton|\n",
168 | "| Amo|\n",
169 | "| Alanson|\n",
170 | "| Eleele|\n",
171 | "| Auburn|\n",
172 | "| Oconee|\n",
173 | "| Amazonia|\n",
174 | "|Andersonville|\n",
175 | "| Eros|\n",
176 | "| Arkadelphia|\n",
177 | "| Eriline|\n",
178 | "| Edgewater|\n",
179 | "| Eastlake|\n",
180 | "| Addison|\n",
181 | "| Everton|\n",
182 | "| Eustis|\n",
183 | "| Arispe|\n",
184 | "| Ottertail|\n",
185 | "| Ermine|\n",
186 | "| Albion|\n",
187 | "| Athens|\n",
188 | "| Eufaula|\n",
189 | "| Andover|\n",
190 | "| Osborne|\n",
191 | "| Oshtemo|\n",
192 | "+-------------+\n",
193 | "\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "#Solving Problem using PySpark \n",
199 | "# ind the difference between the total number of CITY entries in the table and the number of distinct CITY entries in the table. \n",
200 | "stationdf.select(\"City\").where(\"Left(City,1) IN ('A','E','I','O','U')\").show(n=100)"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 6,
206 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
207 | "metadata": {},
208 | "outputs": [],
209 | "source": [
210 | "# Now we are solving Same problem using Spark SQL \n",
211 | "# Creating Temp Table or HIVE table\n",
212 | "stationdf.createOrReplaceTempView(\"tmpStation\")"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": 7,
218 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
219 | "metadata": {},
220 | "outputs": [
221 | {
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "+---+-----------+-----+-----------+-----------+\n",
226 | "| ID| City|State| Lattitude| Longitude|\n",
227 | "+---+-----------+-----+-----------+-----------+\n",
228 | "|478| Tipton| IN|33.54792701|97.94286036|\n",
229 | "|619| Arlington| CO|75.17993079|92.94615894|\n",
230 | "|711| Turner| AR|50.24380534|101.4580163|\n",
231 | "|839| Slidell| LA|85.32270304|151.8743276|\n",
232 | "|411| Negreet| LA| 98.9707194|105.3376115|\n",
233 | "|588| Glencoe| KY|46.38739244|136.0427027|\n",
234 | "|665| Chelsea| IA|98.72210937|59.68913002|\n",
235 | "|733|Pelahatchie| MS|38.58161595|28.11950703|\n",
236 | "|811| Dorrance| KS|102.0888316|121.5614372|\n",
237 | "|698| Albany| CA|49.75112765|80.21211317|\n",
238 | "|325| Monument| KS|70.52300953|141.7680413|\n",
239 | "|414| Manchester| MD|73.51580724|37.14602869|\n",
240 | "|113| Prescott| IA|39.93234421|65.79327823|\n",
241 | "|971|Graettinger| IA|94.66283665|150.3826243|\n",
242 | "|266| Cahone| CO|116.2321963| 127.009554|\n",
243 | "|617| Sturgis| MS|36.45673517|126.1690696|\n",
244 | "|495| Upperco| MD|114.2157413|29.63104758|\n",
245 | "|473| Highwood| IL|27.25445814|150.9227402|\n",
246 | "|959| Waipahu| HI|106.4460526|33.91451792|\n",
247 | "|438| Bowdon| GA|88.98111013|78.49025241|\n",
248 | "+---+-----------+-----+-----------+-----------+\n",
249 | "only showing top 20 rows\n",
250 | "\n"
251 | ]
252 | }
253 | ],
254 | "source": [
255 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
256 | "# For example by Select on table \n",
257 | "sqlContext.sql(\"SELECT * FROM tmpStation\").show()"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 9,
263 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24",
264 | "metadata": {},
265 | "outputs": [
266 | {
267 | "name": "stdout",
268 | "output_type": "stream",
269 | "text": [
270 | "+-------------+\n",
271 | "| CITY|\n",
272 | "+-------------+\n",
273 | "| Auburn|\n",
274 | "|Andersonville|\n",
275 | "| Eastlake|\n",
276 | "| Albany|\n",
277 | "| Aguanga|\n",
278 | "| Onaway|\n",
279 | "| Andover|\n",
280 | "| Algonac|\n",
281 | "| Amazonia|\n",
282 | "| Arkadelphia|\n",
283 | "| Arispe|\n",
284 | "| Eustis|\n",
285 | "| Udall|\n",
286 | "| Athens|\n",
287 | "| Ottertail|\n",
288 | "| Upperco|\n",
289 | "| Ermine|\n",
290 | "| Eufaula|\n",
291 | "| Alanson|\n",
292 | "| Arlington|\n",
293 | "| Arrowsmith|\n",
294 | "| Oshtemo|\n",
295 | "| Irvington|\n",
296 | "| Elkton|\n",
297 | "| Eleele|\n",
298 | "| Oconee|\n",
299 | "| Oakfield|\n",
300 | "| Amo|\n",
301 | "| Addison|\n",
302 | "| Albion|\n",
303 | "| Everton|\n",
304 | "| Osborne|\n",
305 | "| Eriline|\n",
306 | "| Edgewater|\n",
307 | "| Eros|\n",
308 | "| Odin|\n",
309 | "+-------------+\n",
310 | "\n"
311 | ]
312 | }
313 | ],
314 | "source": [
315 | "# Now we will write query to get max salary for each employee \n",
316 | "# so we will use SQL Group by and SQL Order by functions \n",
317 | "sqlContext.sql(\"SELECT DISTINCT(CITY) FROM tmpStation WHERE LEFT(CITY,1) IN ('A','E','I','O','U')\").show(n=100)"
318 | ]
319 | }
320 | ],
321 | "metadata": {
322 | "kernelspec": {
323 | "display_name": "Python 3 (ipykernel)",
324 | "language": "python",
325 | "name": "python3"
326 | },
327 | "language_info": {
328 | "codemirror_mode": {
329 | "name": "ipython",
330 | "version": 3
331 | },
332 | "file_extension": ".py",
333 | "mimetype": "text/x-python",
334 | "name": "python",
335 | "nbconvert_exporter": "python",
336 | "pygments_lexer": "ipython3",
337 | "version": "3.8.13"
338 | }
339 | },
340 | "nbformat": 4,
341 | "nbformat_minor": 5
342 | }
343 |
--------------------------------------------------------------------------------
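Note on Problem 5: the PySpark cell filters on LEFT(City,1) but never deduplicates, while the SQL cell applies DISTINCT, so the two listings can differ whenever a vowel-initial city appears more than once. A minimal DataFrame-API sketch that matches the SQL output, using stationdf from the notebook:

    from pyspark.sql.functions import col, substring

    # DISTINCT city names whose first letter is a vowel.
    (stationdf
        .where(substring(col("City"), 1, 1).isin("A", "E", "I", "O", "U"))
        .select("City")
        .distinct()
        .show(n=100))
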
/Problem 3/stations.csv:
--------------------------------------------------------------------------------
1 | ID,City,State,Lattitude,Longitude
2 | 478,Tipton,IN,33.54792701,97.94286036
3 | 619,Arlington,CO,75.17993079,92.94615894
4 | 711,Turner,AR,50.24380534,101.4580163
5 | 839,Slidell,LA,85.32270304,151.8743276
6 | 411,Negreet,LA,98.9707194,105.3376115
7 | 588,Glencoe,KY,46.38739244,136.0427027
8 | 665,Chelsea,IA,98.72210937,59.68913002
9 | 733,Pelahatchie,MS,38.58161595,28.11950703
10 | 811,Dorrance,KS,102.0888316,121.5614372
11 | 698,Albany,CA,49.75112765,80.21211317
12 | 325,Monument,KS,70.52300953,141.7680413
13 | 414,Manchester,MD,73.51580724,37.14602869
14 | 113,Prescott,IA,39.93234421,65.79327823
15 | 971,Graettinger,IA,94.66283665,150.3826243
16 | 266,Cahone,CO,116.2321963,127.009554
17 | 617,Sturgis,MS,36.45673517,126.1690696
18 | 495,Upperco,MD,114.2157413,29.63104758
19 | 473,Highwood,IL,27.25445814,150.9227402
20 | 959,Waipahu,HI,106.4460526,33.91451792
21 | 438,Bowdon,GA,88.98111013,78.49025241
22 | 571,Tyler,MN,133.3521233,58.63273833
23 | 92,Watkins,CO,83.27433063,96.73732305
24 | 399,Republic,MI,75.42182,130.1266717
25 | 426,Millville,CA,32.55838209,145.7434609
26 | 844,Aguanga,CA,79.89165657,65.93959251
27 | 606,Morenci,AZ,104.8964262,110.2033978
28 | 833,Hoskinston,KY,65.7515349,65.67937265
29 | 843,Talbert,KY,39.85947921,58.84999769
30 | 166,Mccomb,MS,74.04169376,42.63374681
31 | 339,Kirk,CO,141.097397,136.3312671
32 | 909,Carlock,IL,117.3209611,84.80244659
33 | 829,Seward,IL,72.41930917,90.20890209
34 | 766,Gustine,CA,111.0875596,140.8338617
35 | 392,Delano,CA,126.3467998,91.50161746
36 | 555,Westphalia,MI,32.76641637,143.8050085
37 | 728,Roy,MT,41.31187761,51.56467929
38 | 656,Pattonsburg,MO,138.100334,32.10804024
39 | 394,Centertown,MO,133.9733513,93.17246374
40 | 366,Norvell,MI,125.3431567,93.75245864
41 | 96,Raymondville,MO,70.68239168,148.4444084
42 | 977,Odin,IL,53.48858773,115.7934363
43 | 741,Jemison,AL,62.10307108,25.71260581
44 | 323,Barrigada,GU,60.60716473,147.5296125
45 | 3,Hesperia,CA,106.0569286,71.11876711
46 | 814,Wickliffe,KY,80.29965735,46.12993489
47 | 375,Culdesac,ID,47.8418268,78.06551236
48 | 467,Roselawn,IN,87.70708169,51.74506986
49 | 647,Portland,AR,83.92116818,44.80555694
50 | 250,Hampden,MA,76.39074308,26.48368838
51 | 547,Sandborn,IN,55.94680767,93.85315475
52 | 701,Seaton,IL,128.2287955,78.43005628
53 | 197,Milledgeville,IL,90.98811028,113.2748504
54 | 679,Gretna,LA,75.26293787,142.5762285
55 | 403,Zionsville,IN,57.79181464,36.493866
56 | 482,Jolon,CA,66.65054378,52.95528769
57 | 252,Childs,MD,92.7594351,104.0155475
58 | 600,Shreveport,LA,136.2310159,38.50207291
59 | 14,Forest,MS,120.283076,50.22883356
60 | 260,Sizerock,KY,116.0212592,112.7471971
61 | 753,Algonac,MI,118.7398038,80.14671114
62 | 174,Onaway,MI,108.606587,55.75945692
63 | 263,Irvington,IL,96.70474244,68.28719181
64 | 253,Winsted,MN,68.82384939,72.51511422
65 | 557,Woodbury,GA,102.5472386,93.37553932
66 | 897,Samantha,AL,75.2235845,35.94479192
67 | 98,Hackleburg,AL,119.5607105,120.6244819
68 | 423,Soldier,KS,77.30051697,152.6019439
69 | 361,Arrowsmith,IL,28.00318693,109.3395101
70 | 409,Columbus,GA,67.33892289,46.61622653
71 | 312,Bentonville,AR,36.9528472,78.06843628
72 | 854,Kirkland,AZ,86.41004231,57.99523843
73 | 735,Wilton,ME,56.57944083,157.1906205
74 | 608,Busby,MT,104.0894472,29.83035109
75 | 122,Robertsdale,AL,97.7213689,85.3747551
76 | 93,Dale,IN,69.59335022,34.41552119
77 | 67,Reeds,MO,30.78888129,42.50211311
78 | 906,Hayfork,CA,35.2971959,116.6698147
79 | 34,Mcbrides,MI,74.05708403,35.68248542
80 | 401,Tennessee,IL,55.49838117,155.6455992
81 | 536,Henderson,IA,77.92417249,77.90662876
82 | 953,Udall,KS,112.6844799,59.95863388
83 | 614,Benedict,KS,138.4990456,95.71978969
84 | 998,Oakfield,ME,47.65762321,132.2118817
85 | 805,Tamms,IL,59.86766645,75.05164447
86 | 235,Haubstadt,IN,27.98898068,32.08170842
87 | 820,Chokio,MN,81.36073326,134.232113
88 | 650,Clancy,MT,45.82996854,164.378675
89 | 324,Norwood,MN,144.4891504,34.88529336
90 | 442,Elkton,MD,103.2547878,156.7289171
91 | 633,Bertha,MN,39.94889028,105.3111577
92 | 109,Bridgeport,MI,50.68988119,79.90137859
93 | 780,Cherry,IL,68.29708467,46.70383506
94 | 492,Regina,KY,131.5515912,90.23826291
95 | 965,Griffin,GA,38.74146904,151.7182093
96 | 337,Mascotte,FL,121.4608708,146.1675503
97 | 259,Baldwin,MD,81.73572165,40.4397386
98 | 955,Netawaka,KS,109.2057274,119.7404946
99 | 886,Pony,MT,99.25831292,162.8777336
100 | 200,Franklin,LA,82.24062794,31.77872725
101 | 384,Amo,IN,103.5871398,159.4306474
102 | 518,Vulcan,MO,108.6087788,91.56138944
103 | 161,Alanson,MI,90.6531996,72.11952297
104 | 486,Delta,LA,136.5385281,49.73086766
105 | 406,Carver,MN,45.89251104,122.069681
106 | 940,Paron,AR,59.13834287,104.3412062
107 | 237,Winchester,ID,38.37033443,80.0549859
108 | 465,Jerome,AZ,121.7110583,34.40610397
109 | 570,Greenview,CA,80.50000412,57.58800404
110 | 278,Cromwell,MN,128.8462234,53.51254061
111 | 927,Quinter,KS,59.58257004,25.36132152
112 | 59,Whitewater,MO,82.71809743,71.42607696
113 | 291,Clarkdale,AZ,58.19417297,73.94789938
114 | 668,Rockton,IL,116.1223935,86.83833004
115 | 682,Pheba,MS,90.94560988,127.3003694
116 | 775,Eleele,HI,80.90971236,152.5215045
117 | 527,Auburn,IA,95.48926949,137.0748386
118 | 190,Oconee,GA,92.56220722,119.477431
119 | 232,Grandville,MI,38.85256239,70.13776289
120 | 405,Susanville,CA,128.2498724,80.31679475
121 | 273,Rosie,AR,72.75896875,161.9173483
122 | 813,Verona,MO,109.6602903,152.6449499
123 | 444,Richland,GA,105.4709117,113.0379774
124 | 899,Fremont,MI,54.47132153,150.8233711
125 | 738,Philipsburg,MT,95.95531865,72.24442365
126 | 215,Kensett,IA,55.72295385,139.5524526
127 | 377,Koleen,IN,137.5485615,110.5110324
128 | 727,Winslow,IL,113.1328079,38.71450096
129 | 363,Reasnor,IA,41.59710148,162.564183
130 | 888,Bono,AR,133.276314,150.4963257
131 | 784,Biggsville,IL,85.92578701,138.7463469
132 | 695,Amazonia,MO,45.78566304,148.2013846
133 | 609,Marysville,MI,85.76134731,132.8724084
134 | 649,Pengilly,MN,25.07352606,154.0642918
135 | 383,Newbury,MA,128.3982315,85.17470023
136 | 44,Kismet,KS,99.82252766,156.5035829
137 | 433,Canton,ME,98.73035759,105.973446
138 | 474,Grayslake,IL,61.30374218,33.05923131
139 | 990,Bison,KS,132.2279842,74.89290079
140 | 502,Bellevue,KY,127.4330424,121.7488466
141 | 327,Ridgway,CO,77.43818081,110.2668422
142 | 228,Rydal,GA,35.68357838,78.82337343
143 | 642,Lynnville,KY,25.40836031,146.4916272
144 | 885,Deerfield,MO,40.213664,35.9386994
145 | 539,Montreal,MO,129.2453575,127.3259318
146 | 202,Hope,MN,140.3641688,43.72901978
147 | 521,Gowrie,IA,130.2024387,127.9825354
148 | 938,Andersonville,GA,141.3126586,72.53178686
149 | 528,Crouseville,ME,36.5185121,81.54481624
150 | 331,Cranks,KY,55.60911109,27.28471229
151 | 944,Ledyard,CT,134.5468125,143.8149657
152 | 949,Norway,ME,83.89130493,88.40746773
153 | 88,Eros,LA,95.16264172,58.31349033
154 | 878,Rantoul,KS,31.80492935,118.6160845
155 | 17,Fredericktown,MO,105.5334784,112.6890911
156 | 447,Arkadelphia,AR,98.62295228,49.57501146
157 | 351,Fredericksburg,IN,44.51203489,78.05797739
158 | 774,Manchester,IA,129.6682154,123.2967519
159 | 963,Eriline,KY,93.61747947,65.43902104
160 | 643,Wellington,KY,100.4511347,31.68760835
161 | 777,Edgewater,MD,130.0676569,72.29080719
162 | 15,Ducor,CA,140.8633607,102.039339
163 | 910,Salem,KY,86.97524724,113.9609797
164 | 612,Sturdivant,MO,93.84076298,86.38850955
165 | 537,Hagatna,GU,97.17321584,151.8086289
166 | 510,Eastlake,MI,134.0938535,38.78212913
167 | 354,Larkspur,CA,107.0529696,65.97363083
168 | 983,Patriot,IN,82.63795084,46.08354932
169 | 799,Corriganville,MD,141.383789,153.6500914
170 | 581,Carlos,MN,114.9060173,66.2810487
171 | 825,Addison,MI,96.36953674,142.4105732
172 | 526,Tarzana,CA,135.8603987,81.30731303
173 | 176,Grapevine,AR,92.36589225,84.54293686
174 | 994,Kanorado,KS,65.42078424,85.72249232
175 | 704,Climax,MI,127.3563782,107.0542747
176 | 582,Curdsville,KY,84.78749012,150.4842247
177 | 884,Southport,CT,59.09336238,63.13052144
178 | 196,Compton,IL,106.617993,99.40704162
179 | 605,Notasulga,AL,66.84426322,115.6864036
180 | 430,Rumsey,KY,70.6921152,50.2122756
181 | 234,Rogers,CT,140.4723914,33.18335673
182 | 702,Everton,MO,119.0469849,51.48512967
183 | 662,Skanee,MI,70.1724149,129.5593113
184 | 171,Springerville,AZ,124.6882036,150.6628287
185 | 615,Libertytown,MD,144.5783185,111.9744225
186 | 336,Dumont,MN,57.0124315,129.3675605
187 | 315,Ravenna,KY,79.15467169,106.252172
188 | 505,Williams,AZ,73.48100913,111.7413889
189 | 842,Decatur,MI,63.31154085,161.4235787
190 | 982,Holbrook,AZ,134.8838521,103.8569792
191 | 868,Sherrill,AR,79.96440727,152.2197289
192 | 554,Brownsdale,MN,52.42646664,50.79836304
193 | 199,Linden,MI,53.41116218,32.62422206
194 | 453,Sedgwick,AR,68.93334418,75.29418595
195 | 326,Rocheport,MO,114.163159,64.48216553
196 | 638,Clovis,CA,92.43965299,138.0751933
197 | 156,Heyburn,ID,82.08611195,121.0459768
198 | 861,Peabody,KS,75.41614816,152.2100746
199 | 428,Randall,KS,47.99772806,135.6275983
200 | 677,Hayesville,IA,119.9881564,42.12719349
201 | 183,Jordan,MN,68.74638928,35.46228503
202 | 242,Macy,IN,138.694477,152.3694449
203 | 621,Flowood,MS,64.88877035,149.2064111
204 | 180,Napoleon,IN,32.03325626,160.2402958
205 | 853,Coldwater,KS,47.50617517,26.31002645
206 | 105,Weldon,CA,134.0156771,118.9609382
207 | 357,Yellville,AR,35.68710434,42.24658664
208 | 920,Eustis,FL,42.73630964,39.48336091
209 | 355,Weldona,CO,32.96727204,58.44917695
210 | 501,Tefft,IN,93.21527074,150.0159946
211 | 834,Bayville,ME,106.7349403,143.4078424
212 | 255,Brighton,IL,107.6050821,32.84882058
213 | 595,Grimes,IA,42.05019623,74.73314913
214 | 709,Nubieber,CA,132.9033933,49.27761205
215 | 16,Beaufort,MO,71.77418064,85.65741838
216 | 231,Arispe,IA,31.11149635,137.7968198
217 | 891,Humeston,IA,74.51222394,122.4246326
218 | 757,Lakeville,CT,59.86867012,94.98860174
219 | 506,Firebrick,KY,49.99183934,95.03900712
220 | 583,Channing,MI,117.1645417,56.95124478
221 | 504,Melber,KY,37.24884854,55.53335159
222 | 901,Manchester,MN,71.02098012,84.00752922
223 | 586,Ottertail,MN,100.0240382,44.34165481
224 | 95,Dupo,IL,41.28342297,29.03342929
225 | 524,Montrose,CA,136.4765033,119.373558
226 | 716,Schleswig,IA,119.2539069,51.88108538
227 | 904,Ermine,KY,119.6401426,62.79812627
228 | 740,Siler,KY,137.0193079,117.2464806
229 | 57,Clifton,AZ,30.15463898,135.7025933
230 | 155,Casco,MI,138.5984073,109.0728819
231 | 755,Sturgis,MI,117.392421,135.3989883
232 | 287,Madisonville,LA,112.2163874,53.04603619
233 | 435,Albion,IN,44.25844944,121.8753316
234 | 672,Lismore,MN,58.87142971,103.8693391
235 | 572,Athens,IN,75.32104008,120.7983748
236 | 890,Eufaula,AL,140.2958283,103.0868213
237 | 119,Wildie,KY,69.65812987,111.8552379
238 | 540,Mosca,CO,89.20441335,141.4811419
239 | 678,Bennington,IN,35.52107321,26.80362207
240 | 208,Lottie,LA,109.8672979,82.76650144
241 | 512,Garland,ME,108.7311062,134.3750565
242 | 352,Clutier,IA,61.1888319,127.0339038
243 | 948,Lupton,MI,139.9255926,53.36397181
244 | 503,Northfield,MN,61.00207775,37.15335522
245 | 288,Daleville,AL,121.8865105,136.1704398
246 | 479,Cuba,MO,63.71916114,87.64843313
247 | 826,Norris,MT,47.18550342,37.25727353
248 | 651,Clopton,AL,40.77104358,84.70678339
249 | 143,Renville,MN,142.1513936,99.43201313
250 | 102,Kirksville,MO,140.0030631,143.8709979
251 | 69,Kingsland,AR,78.22487634,85.13857667
252 | 181,Fairview,KS,80.27133556,164.5798928
253 | 175,Lydia,LA,41.78237386,39.53037919
254 | 80,Bridgton,ME,93.2257481,140.415464
255 | 596,Brownstown,IL,48.65218811,63.22095723
256 | 301,Monona,IA,144.1294884,81.57803996
257 | 987,Hartland,MI,136.2638918,107.738067
258 | 973,Andover,CT,51.74018501,52.53230369
259 | 981,Lakota,IA,56.15413675,92.38612569
260 | 110,Mesick,MI,82.12446036,108.5283528
261 | 396,Dryden,MI,69.80182523,47.7436689
262 | 637,Beverly,KY,57.75450094,126.8958422
263 | 801,Pocahontas,IL,109.6304686,83.23109494
264 | 130,Hayneville,AL,109.7380661,157.4686782
265 | 345,Yoder,IN,83.49946581,143.715826
266 | 851,Gatewood,MO,76.31562733,145.668333
267 | 489,Madden,MS,81.34223218,99.37998257
268 | 223,Losantville,IN,112.5187171,106.7760547
269 | 538,Cheswold,DE,31.93743733,59.34689519
270 | 329,Caseville,MI,102.9200706,98.4033735
271 | 815,Pomona,MO,52.33346818,50.28222507
272 | 789,Hopkinsville,KY,27.31872893,47.94652919
273 | 269,Jack,AL,49.93703023,85.62817326
274 | 969,Dixie,GA,27.21713791,36.47378899
275 | 271,Hillside,CO,99.26558164,68.84352684
276 | 667,Hawarden,IA,90.96161545,46.93255602
277 | 350,Cannonsburg,MI,91.03351667,120.6696799
278 | 49,Osborne,KS,70.36168327,139.7111654
279 | 404,Farmington,IL,91.7144044,72.0223174
280 | 23,Honolulu,HI,110.101955,139.7437776
281 | 1,Pfeifer,KS,37.44478047,65.68491252
282 | 127,Oshtemo,MI,100.3702957,135.9503227
283 | 657,Gridley,KS,118.1450367,55.80178454
--------------------------------------------------------------------------------
/Problem 4/stations.csv:
--------------------------------------------------------------------------------
1 | ID,City,State,Lattitude,Longitude
2 | 478,Tipton,IN,33.54792701,97.94286036
3 | 619,Arlington,CO,75.17993079,92.94615894
4 | 711,Turner,AR,50.24380534,101.4580163
5 | 839,Slidell,LA,85.32270304,151.8743276
6 | 411,Negreet,LA,98.9707194,105.3376115
7 | 588,Glencoe,KY,46.38739244,136.0427027
8 | 665,Chelsea,IA,98.72210937,59.68913002
9 | 733,Pelahatchie,MS,38.58161595,28.11950703
10 | 811,Dorrance,KS,102.0888316,121.5614372
11 | 698,Albany,CA,49.75112765,80.21211317
12 | 325,Monument,KS,70.52300953,141.7680413
13 | 414,Manchester,MD,73.51580724,37.14602869
14 | 113,Prescott,IA,39.93234421,65.79327823
15 | 971,Graettinger,IA,94.66283665,150.3826243
16 | 266,Cahone,CO,116.2321963,127.009554
17 | 617,Sturgis,MS,36.45673517,126.1690696
18 | 495,Upperco,MD,114.2157413,29.63104758
19 | 473,Highwood,IL,27.25445814,150.9227402
20 | 959,Waipahu,HI,106.4460526,33.91451792
21 | 438,Bowdon,GA,88.98111013,78.49025241
22 | 571,Tyler,MN,133.3521233,58.63273833
23 | 92,Watkins,CO,83.27433063,96.73732305
24 | 399,Republic,MI,75.42182,130.1266717
25 | 426,Millville,CA,32.55838209,145.7434609
26 | 844,Aguanga,CA,79.89165657,65.93959251
27 | 606,Morenci,AZ,104.8964262,110.2033978
28 | 833,Hoskinston,KY,65.7515349,65.67937265
29 | 843,Talbert,KY,39.85947921,58.84999769
30 | 166,Mccomb,MS,74.04169376,42.63374681
31 | 339,Kirk,CO,141.097397,136.3312671
32 | 909,Carlock,IL,117.3209611,84.80244659
33 | 829,Seward,IL,72.41930917,90.20890209
34 | 766,Gustine,CA,111.0875596,140.8338617
35 | 392,Delano,CA,126.3467998,91.50161746
36 | 555,Westphalia,MI,32.76641637,143.8050085
37 | 728,Roy,MT,41.31187761,51.56467929
38 | 656,Pattonsburg,MO,138.100334,32.10804024
39 | 394,Centertown,MO,133.9733513,93.17246374
40 | 366,Norvell,MI,125.3431567,93.75245864
41 | 96,Raymondville,MO,70.68239168,148.4444084
42 | 977,Odin,IL,53.48858773,115.7934363
43 | 741,Jemison,AL,62.10307108,25.71260581
44 | 323,Barrigada,GU,60.60716473,147.5296125
45 | 3,Hesperia,CA,106.0569286,71.11876711
46 | 814,Wickliffe,KY,80.29965735,46.12993489
47 | 375,Culdesac,ID,47.8418268,78.06551236
48 | 467,Roselawn,IN,87.70708169,51.74506986
49 | 647,Portland,AR,83.92116818,44.80555694
50 | 250,Hampden,MA,76.39074308,26.48368838
51 | 547,Sandborn,IN,55.94680767,93.85315475
52 | 701,Seaton,IL,128.2287955,78.43005628
53 | 197,Milledgeville,IL,90.98811028,113.2748504
54 | 679,Gretna,LA,75.26293787,142.5762285
55 | 403,Zionsville,IN,57.79181464,36.493866
56 | 482,Jolon,CA,66.65054378,52.95528769
57 | 252,Childs,MD,92.7594351,104.0155475
58 | 600,Shreveport,LA,136.2310159,38.50207291
59 | 14,Forest,MS,120.283076,50.22883356
60 | 260,Sizerock,KY,116.0212592,112.7471971
61 | 753,Algonac,MI,118.7398038,80.14671114
62 | 174,Onaway,MI,108.606587,55.75945692
63 | 263,Irvington,IL,96.70474244,68.28719181
64 | 253,Winsted,MN,68.82384939,72.51511422
65 | 557,Woodbury,GA,102.5472386,93.37553932
66 | 897,Samantha,AL,75.2235845,35.94479192
67 | 98,Hackleburg,AL,119.5607105,120.6244819
68 | 423,Soldier,KS,77.30051697,152.6019439
69 | 361,Arrowsmith,IL,28.00318693,109.3395101
70 | 409,Columbus,GA,67.33892289,46.61622653
71 | 312,Bentonville,AR,36.9528472,78.06843628
72 | 854,Kirkland,AZ,86.41004231,57.99523843
73 | 735,Wilton,ME,56.57944083,157.1906205
74 | 608,Busby,MT,104.0894472,29.83035109
75 | 122,Robertsdale,AL,97.7213689,85.3747551
76 | 93,Dale,IN,69.59335022,34.41552119
77 | 67,Reeds,MO,30.78888129,42.50211311
78 | 906,Hayfork,CA,35.2971959,116.6698147
79 | 34,Mcbrides,MI,74.05708403,35.68248542
80 | 401,Tennessee,IL,55.49838117,155.6455992
81 | 536,Henderson,IA,77.92417249,77.90662876
82 | 953,Udall,KS,112.6844799,59.95863388
83 | 614,Benedict,KS,138.4990456,95.71978969
84 | 998,Oakfield,ME,47.65762321,132.2118817
85 | 805,Tamms,IL,59.86766645,75.05164447
86 | 235,Haubstadt,IN,27.98898068,32.08170842
87 | 820,Chokio,MN,81.36073326,134.232113
88 | 650,Clancy,MT,45.82996854,164.378675
89 | 324,Norwood,MN,144.4891504,34.88529336
90 | 442,Elkton,MD,103.2547878,156.7289171
91 | 633,Bertha,MN,39.94889028,105.3111577
92 | 109,Bridgeport,MI,50.68988119,79.90137859
93 | 780,Cherry,IL,68.29708467,46.70383506
94 | 492,Regina,KY,131.5515912,90.23826291
95 | 965,Griffin,GA,38.74146904,151.7182093
96 | 337,Mascotte,FL,121.4608708,146.1675503
97 | 259,Baldwin,MD,81.73572165,40.4397386
98 | 955,Netawaka,KS,109.2057274,119.7404946
99 | 886,Pony,MT,99.25831292,162.8777336
100 | 200,Franklin,LA,82.24062794,31.77872725
101 | 384,Amo,IN,103.5871398,159.4306474
102 | 518,Vulcan,MO,108.6087788,91.56138944
103 | 161,Alanson,MI,90.6531996,72.11952297
104 | 486,Delta,LA,136.5385281,49.73086766
105 | 406,Carver,MN,45.89251104,122.069681
106 | 940,Paron,AR,59.13834287,104.3412062
107 | 237,Winchester,ID,38.37033443,80.0549859
108 | 465,Jerome,AZ,121.7110583,34.40610397
109 | 570,Greenview,CA,80.50000412,57.58800404
110 | 278,Cromwell,MN,128.8462234,53.51254061
111 | 927,Quinter,KS,59.58257004,25.36132152
112 | 59,Whitewater,MO,82.71809743,71.42607696
113 | 291,Clarkdale,AZ,58.19417297,73.94789938
114 | 668,Rockton,IL,116.1223935,86.83833004
115 | 682,Pheba,MS,90.94560988,127.3003694
116 | 775,Eleele,HI,80.90971236,152.5215045
117 | 527,Auburn,IA,95.48926949,137.0748386
118 | 190,Oconee,GA,92.56220722,119.477431
119 | 232,Grandville,MI,38.85256239,70.13776289
120 | 405,Susanville,CA,128.2498724,80.31679475
121 | 273,Rosie,AR,72.75896875,161.9173483
122 | 813,Verona,MO,109.6602903,152.6449499
123 | 444,Richland,GA,105.4709117,113.0379774
124 | 899,Fremont,MI,54.47132153,150.8233711
125 | 738,Philipsburg,MT,95.95531865,72.24442365
126 | 215,Kensett,IA,55.72295385,139.5524526
127 | 377,Koleen,IN,137.5485615,110.5110324
128 | 727,Winslow,IL,113.1328079,38.71450096
129 | 363,Reasnor,IA,41.59710148,162.564183
130 | 888,Bono,AR,133.276314,150.4963257
131 | 784,Biggsville,IL,85.92578701,138.7463469
132 | 695,Amazonia,MO,45.78566304,148.2013846
133 | 609,Marysville,MI,85.76134731,132.8724084
134 | 649,Pengilly,MN,25.07352606,154.0642918
135 | 383,Newbury,MA,128.3982315,85.17470023
136 | 44,Kismet,KS,99.82252766,156.5035829
137 | 433,Canton,ME,98.73035759,105.973446
138 | 474,Grayslake,IL,61.30374218,33.05923131
139 | 990,Bison,KS,132.2279842,74.89290079
140 | 502,Bellevue,KY,127.4330424,121.7488466
141 | 327,Ridgway,CO,77.43818081,110.2668422
142 | 228,Rydal,GA,35.68357838,78.82337343
143 | 642,Lynnville,KY,25.40836031,146.4916272
144 | 885,Deerfield,MO,40.213664,35.9386994
145 | 539,Montreal,MO,129.2453575,127.3259318
146 | 202,Hope,MN,140.3641688,43.72901978
147 | 521,Gowrie,IA,130.2024387,127.9825354
148 | 938,Andersonville,GA,141.3126586,72.53178686
149 | 528,Crouseville,ME,36.5185121,81.54481624
150 | 331,Cranks,KY,55.60911109,27.28471229
151 | 944,Ledyard,CT,134.5468125,143.8149657
152 | 949,Norway,ME,83.89130493,88.40746773
153 | 88,Eros,LA,95.16264172,58.31349033
154 | 878,Rantoul,KS,31.80492935,118.6160845
155 | 17,Fredericktown,MO,105.5334784,112.6890911
156 | 447,Arkadelphia,AR,98.62295228,49.57501146
157 | 351,Fredericksburg,IN,44.51203489,78.05797739
158 | 774,Manchester,IA,129.6682154,123.2967519
159 | 963,Eriline,KY,93.61747947,65.43902104
160 | 643,Wellington,KY,100.4511347,31.68760835
161 | 777,Edgewater,MD,130.0676569,72.29080719
162 | 15,Ducor,CA,140.8633607,102.039339
163 | 910,Salem,KY,86.97524724,113.9609797
164 | 612,Sturdivant,MO,93.84076298,86.38850955
165 | 537,Hagatna,GU,97.17321584,151.8086289
166 | 510,Eastlake,MI,134.0938535,38.78212913
167 | 354,Larkspur,CA,107.0529696,65.97363083
168 | 983,Patriot,IN,82.63795084,46.08354932
169 | 799,Corriganville,MD,141.383789,153.6500914
170 | 581,Carlos,MN,114.9060173,66.2810487
171 | 825,Addison,MI,96.36953674,142.4105732
172 | 526,Tarzana,CA,135.8603987,81.30731303
173 | 176,Grapevine,AR,92.36589225,84.54293686
174 | 994,Kanorado,KS,65.42078424,85.72249232
175 | 704,Climax,MI,127.3563782,107.0542747
176 | 582,Curdsville,KY,84.78749012,150.4842247
177 | 884,Southport,CT,59.09336238,63.13052144
178 | 196,Compton,IL,106.617993,99.40704162
179 | 605,Notasulga,AL,66.84426322,115.6864036
180 | 430,Rumsey,KY,70.6921152,50.2122756
181 | 234,Rogers,CT,140.4723914,33.18335673
182 | 702,Everton,MO,119.0469849,51.48512967
183 | 662,Skanee,MI,70.1724149,129.5593113
184 | 171,Springerville,AZ,124.6882036,150.6628287
185 | 615,Libertytown,MD,144.5783185,111.9744225
186 | 336,Dumont,MN,57.0124315,129.3675605
187 | 315,Ravenna,KY,79.15467169,106.252172
188 | 505,Williams,AZ,73.48100913,111.7413889
189 | 842,Decatur,MI,63.31154085,161.4235787
190 | 982,Holbrook,AZ,134.8838521,103.8569792
191 | 868,Sherrill,AR,79.96440727,152.2197289
192 | 554,Brownsdale,MN,52.42646664,50.79836304
193 | 199,Linden,MI,53.41116218,32.62422206
194 | 453,Sedgwick,AR,68.93334418,75.29418595
195 | 326,Rocheport,MO,114.163159,64.48216553
196 | 638,Clovis,CA,92.43965299,138.0751933
197 | 156,Heyburn,ID,82.08611195,121.0459768
198 | 861,Peabody,KS,75.41614816,152.2100746
199 | 428,Randall,KS,47.99772806,135.6275983
200 | 677,Hayesville,IA,119.9881564,42.12719349
201 | 183,Jordan,MN,68.74638928,35.46228503
202 | 242,Macy,IN,138.694477,152.3694449
203 | 621,Flowood,MS,64.88877035,149.2064111
204 | 180,Napoleon,IN,32.03325626,160.2402958
205 | 853,Coldwater,KS,47.50617517,26.31002645
206 | 105,Weldon,CA,134.0156771,118.9609382
207 | 357,Yellville,AR,35.68710434,42.24658664
208 | 920,Eustis,FL,42.73630964,39.48336091
209 | 355,Weldona,CO,32.96727204,58.44917695
210 | 501,Tefft,IN,93.21527074,150.0159946
211 | 834,Bayville,ME,106.7349403,143.4078424
212 | 255,Brighton,IL,107.6050821,32.84882058
213 | 595,Grimes,IA,42.05019623,74.73314913
214 | 709,Nubieber,CA,132.9033933,49.27761205
215 | 16,Beaufort,MO,71.77418064,85.65741838
216 | 231,Arispe,IA,31.11149635,137.7968198
217 | 891,Humeston,IA,74.51222394,122.4246326
218 | 757,Lakeville,CT,59.86867012,94.98860174
219 | 506,Firebrick,KY,49.99183934,95.03900712
220 | 583,Channing,MI,117.1645417,56.95124478
221 | 504,Melber,KY,37.24884854,55.53335159
222 | 901,Manchester,MN,71.02098012,84.00752922
223 | 586,Ottertail,MN,100.0240382,44.34165481
224 | 95,Dupo,IL,41.28342297,29.03342929
225 | 524,Montrose,CA,136.4765033,119.373558
226 | 716,Schleswig,IA,119.2539069,51.88108538
227 | 904,Ermine,KY,119.6401426,62.79812627
228 | 740,Siler,KY,137.0193079,117.2464806
229 | 57,Clifton,AZ,30.15463898,135.7025933
230 | 155,Casco,MI,138.5984073,109.0728819
231 | 755,Sturgis,MI,117.392421,135.3989883
232 | 287,Madisonville,LA,112.2163874,53.04603619
233 | 435,Albion,IN,44.25844944,121.8753316
234 | 672,Lismore,MN,58.87142971,103.8693391
235 | 572,Athens,IN,75.32104008,120.7983748
236 | 890,Eufaula,AL,140.2958283,103.0868213
237 | 119,Wildie,KY,69.65812987,111.8552379
238 | 540,Mosca,CO,89.20441335,141.4811419
239 | 678,Bennington,IN,35.52107321,26.80362207
240 | 208,Lottie,LA,109.8672979,82.76650144
241 | 512,Garland,ME,108.7311062,134.3750565
242 | 352,Clutier,IA,61.1888319,127.0339038
243 | 948,Lupton,MI,139.9255926,53.36397181
244 | 503,Northfield,MN,61.00207775,37.15335522
245 | 288,Daleville,AL,121.8865105,136.1704398
246 | 479,Cuba,MO,63.71916114,87.64843313
247 | 826,Norris,MT,47.18550342,37.25727353
248 | 651,Clopton,AL,40.77104358,84.70678339
249 | 143,Renville,MN,142.1513936,99.43201313
250 | 102,Kirksville,MO,140.0030631,143.8709979
251 | 69,Kingsland,AR,78.22487634,85.13857667
252 | 181,Fairview,KS,80.27133556,164.5798928
253 | 175,Lydia,LA,41.78237386,39.53037919
254 | 80,Bridgton,ME,93.2257481,140.415464
255 | 596,Brownstown,IL,48.65218811,63.22095723
256 | 301,Monona,IA,144.1294884,81.57803996
257 | 987,Hartland,MI,136.2638918,107.738067
258 | 973,Andover,CT,51.74018501,52.53230369
259 | 981,Lakota,IA,56.15413675,92.38612569
260 | 110,Mesick,MI,82.12446036,108.5283528
261 | 396,Dryden,MI,69.80182523,47.7436689
262 | 637,Beverly,KY,57.75450094,126.8958422
263 | 801,Pocahontas,IL,109.6304686,83.23109494
264 | 130,Hayneville,AL,109.7380661,157.4686782
265 | 345,Yoder,IN,83.49946581,143.715826
266 | 851,Gatewood,MO,76.31562733,145.668333
267 | 489,Madden,MS,81.34223218,99.37998257
268 | 223,Losantville,IN,112.5187171,106.7760547
269 | 538,Cheswold,DE,31.93743733,59.34689519
270 | 329,Caseville,MI,102.9200706,98.4033735
271 | 815,Pomona,MO,52.33346818,50.28222507
272 | 789,Hopkinsville,KY,27.31872893,47.94652919
273 | 269,Jack,AL,49.93703023,85.62817326
274 | 969,Dixie,GA,27.21713791,36.47378899
275 | 271,Hillside,CO,99.26558164,68.84352684
276 | 667,Hawarden,IA,90.96161545,46.93255602
277 | 350,Cannonsburg,MI,91.03351667,120.6696799
278 | 49,Osborne,KS,70.36168327,139.7111654
279 | 404,Farmington,IL,91.7144044,72.0223174
280 | 23,Honolulu,HI,110.101955,139.7437776
281 | 1,Pfeifer,KS,37.44478047,65.68491252
282 | 127,Oshtemo,MI,100.3702957,135.9503227
283 | 657,Gridley,KS,118.1450367,55.80178454
--------------------------------------------------------------------------------
/Problem 5/stations.csv:
--------------------------------------------------------------------------------
1 | ID,City,State,Lattitude,Longitude
2 | 478,Tipton,IN,33.54792701,97.94286036
3 | 619,Arlington,CO,75.17993079,92.94615894
4 | 711,Turner,AR,50.24380534,101.4580163
5 | 839,Slidell,LA,85.32270304,151.8743276
6 | 411,Negreet,LA,98.9707194,105.3376115
7 | 588,Glencoe,KY,46.38739244,136.0427027
8 | 665,Chelsea,IA,98.72210937,59.68913002
9 | 733,Pelahatchie,MS,38.58161595,28.11950703
10 | 811,Dorrance,KS,102.0888316,121.5614372
11 | 698,Albany,CA,49.75112765,80.21211317
12 | 325,Monument,KS,70.52300953,141.7680413
13 | 414,Manchester,MD,73.51580724,37.14602869
14 | 113,Prescott,IA,39.93234421,65.79327823
15 | 971,Graettinger,IA,94.66283665,150.3826243
16 | 266,Cahone,CO,116.2321963,127.009554
17 | 617,Sturgis,MS,36.45673517,126.1690696
18 | 495,Upperco,MD,114.2157413,29.63104758
19 | 473,Highwood,IL,27.25445814,150.9227402
20 | 959,Waipahu,HI,106.4460526,33.91451792
21 | 438,Bowdon,GA,88.98111013,78.49025241
22 | 571,Tyler,MN,133.3521233,58.63273833
23 | 92,Watkins,CO,83.27433063,96.73732305
24 | 399,Republic,MI,75.42182,130.1266717
25 | 426,Millville,CA,32.55838209,145.7434609
26 | 844,Aguanga,CA,79.89165657,65.93959251
27 | 606,Morenci,AZ,104.8964262,110.2033978
28 | 833,Hoskinston,KY,65.7515349,65.67937265
29 | 843,Talbert,KY,39.85947921,58.84999769
30 | 166,Mccomb,MS,74.04169376,42.63374681
31 | 339,Kirk,CO,141.097397,136.3312671
32 | 909,Carlock,IL,117.3209611,84.80244659
33 | 829,Seward,IL,72.41930917,90.20890209
34 | 766,Gustine,CA,111.0875596,140.8338617
35 | 392,Delano,CA,126.3467998,91.50161746
36 | 555,Westphalia,MI,32.76641637,143.8050085
37 | 728,Roy,MT,41.31187761,51.56467929
38 | 656,Pattonsburg,MO,138.100334,32.10804024
39 | 394,Centertown,MO,133.9733513,93.17246374
40 | 366,Norvell,MI,125.3431567,93.75245864
41 | 96,Raymondville,MO,70.68239168,148.4444084
42 | 977,Odin,IL,53.48858773,115.7934363
43 | 741,Jemison,AL,62.10307108,25.71260581
44 | 323,Barrigada,GU,60.60716473,147.5296125
45 | 3,Hesperia,CA,106.0569286,71.11876711
46 | 814,Wickliffe,KY,80.29965735,46.12993489
47 | 375,Culdesac,ID,47.8418268,78.06551236
48 | 467,Roselawn,IN,87.70708169,51.74506986
49 | 647,Portland,AR,83.92116818,44.80555694
50 | 250,Hampden,MA,76.39074308,26.48368838
51 | 547,Sandborn,IN,55.94680767,93.85315475
52 | 701,Seaton,IL,128.2287955,78.43005628
53 | 197,Milledgeville,IL,90.98811028,113.2748504
54 | 679,Gretna,LA,75.26293787,142.5762285
55 | 403,Zionsville,IN,57.79181464,36.493866
56 | 482,Jolon,CA,66.65054378,52.95528769
57 | 252,Childs,MD,92.7594351,104.0155475
58 | 600,Shreveport,LA,136.2310159,38.50207291
59 | 14,Forest,MS,120.283076,50.22883356
60 | 260,Sizerock,KY,116.0212592,112.7471971
61 | 753,Algonac,MI,118.7398038,80.14671114
62 | 174,Onaway,MI,108.606587,55.75945692
63 | 263,Irvington,IL,96.70474244,68.28719181
64 | 253,Winsted,MN,68.82384939,72.51511422
65 | 557,Woodbury,GA,102.5472386,93.37553932
66 | 897,Samantha,AL,75.2235845,35.94479192
67 | 98,Hackleburg,AL,119.5607105,120.6244819
68 | 423,Soldier,KS,77.30051697,152.6019439
69 | 361,Arrowsmith,IL,28.00318693,109.3395101
70 | 409,Columbus,GA,67.33892289,46.61622653
71 | 312,Bentonville,AR,36.9528472,78.06843628
72 | 854,Kirkland,AZ,86.41004231,57.99523843
73 | 735,Wilton,ME,56.57944083,157.1906205
74 | 608,Busby,MT,104.0894472,29.83035109
75 | 122,Robertsdale,AL,97.7213689,85.3747551
76 | 93,Dale,IN,69.59335022,34.41552119
77 | 67,Reeds,MO,30.78888129,42.50211311
78 | 906,Hayfork,CA,35.2971959,116.6698147
79 | 34,Mcbrides,MI,74.05708403,35.68248542
80 | 401,Tennessee,IL,55.49838117,155.6455992
81 | 536,Henderson,IA,77.92417249,77.90662876
82 | 953,Udall,KS,112.6844799,59.95863388
83 | 614,Benedict,KS,138.4990456,95.71978969
84 | 998,Oakfield,ME,47.65762321,132.2118817
85 | 805,Tamms,IL,59.86766645,75.05164447
86 | 235,Haubstadt,IN,27.98898068,32.08170842
87 | 820,Chokio,MN,81.36073326,134.232113
88 | 650,Clancy,MT,45.82996854,164.378675
89 | 324,Norwood,MN,144.4891504,34.88529336
90 | 442,Elkton,MD,103.2547878,156.7289171
91 | 633,Bertha,MN,39.94889028,105.3111577
92 | 109,Bridgeport,MI,50.68988119,79.90137859
93 | 780,Cherry,IL,68.29708467,46.70383506
94 | 492,Regina,KY,131.5515912,90.23826291
95 | 965,Griffin,GA,38.74146904,151.7182093
96 | 337,Mascotte,FL,121.4608708,146.1675503
97 | 259,Baldwin,MD,81.73572165,40.4397386
98 | 955,Netawaka,KS,109.2057274,119.7404946
99 | 886,Pony,MT,99.25831292,162.8777336
100 | 200,Franklin,LA,82.24062794,31.77872725
101 | 384,Amo,IN,103.5871398,159.4306474
102 | 518,Vulcan,MO,108.6087788,91.56138944
103 | 161,Alanson,MI,90.6531996,72.11952297
104 | 486,Delta,LA,136.5385281,49.73086766
105 | 406,Carver,MN,45.89251104,122.069681
106 | 940,Paron,AR,59.13834287,104.3412062
107 | 237,Winchester,ID,38.37033443,80.0549859
108 | 465,Jerome,AZ,121.7110583,34.40610397
109 | 570,Greenview,CA,80.50000412,57.58800404
110 | 278,Cromwell,MN,128.8462234,53.51254061
111 | 927,Quinter,KS,59.58257004,25.36132152
112 | 59,Whitewater,MO,82.71809743,71.42607696
113 | 291,Clarkdale,AZ,58.19417297,73.94789938
114 | 668,Rockton,IL,116.1223935,86.83833004
115 | 682,Pheba,MS,90.94560988,127.3003694
116 | 775,Eleele,HI,80.90971236,152.5215045
117 | 527,Auburn,IA,95.48926949,137.0748386
118 | 190,Oconee,GA,92.56220722,119.477431
119 | 232,Grandville,MI,38.85256239,70.13776289
120 | 405,Susanville,CA,128.2498724,80.31679475
121 | 273,Rosie,AR,72.75896875,161.9173483
122 | 813,Verona,MO,109.6602903,152.6449499
123 | 444,Richland,GA,105.4709117,113.0379774
124 | 899,Fremont,MI,54.47132153,150.8233711
125 | 738,Philipsburg,MT,95.95531865,72.24442365
126 | 215,Kensett,IA,55.72295385,139.5524526
127 | 377,Koleen,IN,137.5485615,110.5110324
128 | 727,Winslow,IL,113.1328079,38.71450096
129 | 363,Reasnor,IA,41.59710148,162.564183
130 | 888,Bono,AR,133.276314,150.4963257
131 | 784,Biggsville,IL,85.92578701,138.7463469
132 | 695,Amazonia,MO,45.78566304,148.2013846
133 | 609,Marysville,MI,85.76134731,132.8724084
134 | 649,Pengilly,MN,25.07352606,154.0642918
135 | 383,Newbury,MA,128.3982315,85.17470023
136 | 44,Kismet,KS,99.82252766,156.5035829
137 | 433,Canton,ME,98.73035759,105.973446
138 | 474,Grayslake,IL,61.30374218,33.05923131
139 | 990,Bison,KS,132.2279842,74.89290079
140 | 502,Bellevue,KY,127.4330424,121.7488466
141 | 327,Ridgway,CO,77.43818081,110.2668422
142 | 228,Rydal,GA,35.68357838,78.82337343
143 | 642,Lynnville,KY,25.40836031,146.4916272
144 | 885,Deerfield,MO,40.213664,35.9386994
145 | 539,Montreal,MO,129.2453575,127.3259318
146 | 202,Hope,MN,140.3641688,43.72901978
147 | 521,Gowrie,IA,130.2024387,127.9825354
148 | 938,Andersonville,GA,141.3126586,72.53178686
149 | 528,Crouseville,ME,36.5185121,81.54481624
150 | 331,Cranks,KY,55.60911109,27.28471229
151 | 944,Ledyard,CT,134.5468125,143.8149657
152 | 949,Norway,ME,83.89130493,88.40746773
153 | 88,Eros,LA,95.16264172,58.31349033
154 | 878,Rantoul,KS,31.80492935,118.6160845
155 | 17,Fredericktown,MO,105.5334784,112.6890911
156 | 447,Arkadelphia,AR,98.62295228,49.57501146
157 | 351,Fredericksburg,IN,44.51203489,78.05797739
158 | 774,Manchester,IA,129.6682154,123.2967519
159 | 963,Eriline,KY,93.61747947,65.43902104
160 | 643,Wellington,KY,100.4511347,31.68760835
161 | 777,Edgewater,MD,130.0676569,72.29080719
162 | 15,Ducor,CA,140.8633607,102.039339
163 | 910,Salem,KY,86.97524724,113.9609797
164 | 612,Sturdivant,MO,93.84076298,86.38850955
165 | 537,Hagatna,GU,97.17321584,151.8086289
166 | 510,Eastlake,MI,134.0938535,38.78212913
167 | 354,Larkspur,CA,107.0529696,65.97363083
168 | 983,Patriot,IN,82.63795084,46.08354932
169 | 799,Corriganville,MD,141.383789,153.6500914
170 | 581,Carlos,MN,114.9060173,66.2810487
171 | 825,Addison,MI,96.36953674,142.4105732
172 | 526,Tarzana,CA,135.8603987,81.30731303
173 | 176,Grapevine,AR,92.36589225,84.54293686
174 | 994,Kanorado,KS,65.42078424,85.72249232
175 | 704,Climax,MI,127.3563782,107.0542747
176 | 582,Curdsville,KY,84.78749012,150.4842247
177 | 884,Southport,CT,59.09336238,63.13052144
178 | 196,Compton,IL,106.617993,99.40704162
179 | 605,Notasulga,AL,66.84426322,115.6864036
180 | 430,Rumsey,KY,70.6921152,50.2122756
181 | 234,Rogers,CT,140.4723914,33.18335673
182 | 702,Everton,MO,119.0469849,51.48512967
183 | 662,Skanee,MI,70.1724149,129.5593113
184 | 171,Springerville,AZ,124.6882036,150.6628287
185 | 615,Libertytown,MD,144.5783185,111.9744225
186 | 336,Dumont,MN,57.0124315,129.3675605
187 | 315,Ravenna,KY,79.15467169,106.252172
188 | 505,Williams,AZ,73.48100913,111.7413889
189 | 842,Decatur,MI,63.31154085,161.4235787
190 | 982,Holbrook,AZ,134.8838521,103.8569792
191 | 868,Sherrill,AR,79.96440727,152.2197289
192 | 554,Brownsdale,MN,52.42646664,50.79836304
193 | 199,Linden,MI,53.41116218,32.62422206
194 | 453,Sedgwick,AR,68.93334418,75.29418595
195 | 326,Rocheport,MO,114.163159,64.48216553
196 | 638,Clovis,CA,92.43965299,138.0751933
197 | 156,Heyburn,ID,82.08611195,121.0459768
198 | 861,Peabody,KS,75.41614816,152.2100746
199 | 428,Randall,KS,47.99772806,135.6275983
200 | 677,Hayesville,IA,119.9881564,42.12719349
201 | 183,Jordan,MN,68.74638928,35.46228503
202 | 242,Macy,IN,138.694477,152.3694449
203 | 621,Flowood,MS,64.88877035,149.2064111
204 | 180,Napoleon,IN,32.03325626,160.2402958
205 | 853,Coldwater,KS,47.50617517,26.31002645
206 | 105,Weldon,CA,134.0156771,118.9609382
207 | 357,Yellville,AR,35.68710434,42.24658664
208 | 920,Eustis,FL,42.73630964,39.48336091
209 | 355,Weldona,CO,32.96727204,58.44917695
210 | 501,Tefft,IN,93.21527074,150.0159946
211 | 834,Bayville,ME,106.7349403,143.4078424
212 | 255,Brighton,IL,107.6050821,32.84882058
213 | 595,Grimes,IA,42.05019623,74.73314913
214 | 709,Nubieber,CA,132.9033933,49.27761205
215 | 16,Beaufort,MO,71.77418064,85.65741838
216 | 231,Arispe,IA,31.11149635,137.7968198
217 | 891,Humeston,IA,74.51222394,122.4246326
218 | 757,Lakeville,CT,59.86867012,94.98860174
219 | 506,Firebrick,KY,49.99183934,95.03900712
220 | 583,Channing,MI,117.1645417,56.95124478
221 | 504,Melber,KY,37.24884854,55.53335159
222 | 901,Manchester,MN,71.02098012,84.00752922
223 | 586,Ottertail,MN,100.0240382,44.34165481
224 | 95,Dupo,IL,41.28342297,29.03342929
225 | 524,Montrose,CA,136.4765033,119.373558
226 | 716,Schleswig,IA,119.2539069,51.88108538
227 | 904,Ermine,KY,119.6401426,62.79812627
228 | 740,Siler,KY,137.0193079,117.2464806
229 | 57,Clifton,AZ,30.15463898,135.7025933
230 | 155,Casco,MI,138.5984073,109.0728819
231 | 755,Sturgis,MI,117.392421,135.3989883
232 | 287,Madisonville,LA,112.2163874,53.04603619
233 | 435,Albion,IN,44.25844944,121.8753316
234 | 672,Lismore,MN,58.87142971,103.8693391
235 | 572,Athens,IN,75.32104008,120.7983748
236 | 890,Eufaula,AL,140.2958283,103.0868213
237 | 119,Wildie,KY,69.65812987,111.8552379
238 | 540,Mosca,CO,89.20441335,141.4811419
239 | 678,Bennington,IN,35.52107321,26.80362207
240 | 208,Lottie,LA,109.8672979,82.76650144
241 | 512,Garland,ME,108.7311062,134.3750565
242 | 352,Clutier,IA,61.1888319,127.0339038
243 | 948,Lupton,MI,139.9255926,53.36397181
244 | 503,Northfield,MN,61.00207775,37.15335522
245 | 288,Daleville,AL,121.8865105,136.1704398
246 | 479,Cuba,MO,63.71916114,87.64843313
247 | 826,Norris,MT,47.18550342,37.25727353
248 | 651,Clopton,AL,40.77104358,84.70678339
249 | 143,Renville,MN,142.1513936,99.43201313
250 | 102,Kirksville,MO,140.0030631,143.8709979
251 | 69,Kingsland,AR,78.22487634,85.13857667
252 | 181,Fairview,KS,80.27133556,164.5798928
253 | 175,Lydia,LA,41.78237386,39.53037919
254 | 80,Bridgton,ME,93.2257481,140.415464
255 | 596,Brownstown,IL,48.65218811,63.22095723
256 | 301,Monona,IA,144.1294884,81.57803996
257 | 987,Hartland,MI,136.2638918,107.738067
258 | 973,Andover,CT,51.74018501,52.53230369
259 | 981,Lakota,IA,56.15413675,92.38612569
260 | 110,Mesick,MI,82.12446036,108.5283528
261 | 396,Dryden,MI,69.80182523,47.7436689
262 | 637,Beverly,KY,57.75450094,126.8958422
263 | 801,Pocahontas,IL,109.6304686,83.23109494
264 | 130,Hayneville,AL,109.7380661,157.4686782
265 | 345,Yoder,IN,83.49946581,143.715826
266 | 851,Gatewood,MO,76.31562733,145.668333
267 | 489,Madden,MS,81.34223218,99.37998257
268 | 223,Losantville,IN,112.5187171,106.7760547
269 | 538,Cheswold,DE,31.93743733,59.34689519
270 | 329,Caseville,MI,102.9200706,98.4033735
271 | 815,Pomona,MO,52.33346818,50.28222507
272 | 789,Hopkinsville,KY,27.31872893,47.94652919
273 | 269,Jack,AL,49.93703023,85.62817326
274 | 969,Dixie,GA,27.21713791,36.47378899
275 | 271,Hillside,CO,99.26558164,68.84352684
276 | 667,Hawarden,IA,90.96161545,46.93255602
277 | 350,Cannonsburg,MI,91.03351667,120.6696799
278 | 49,Osborne,KS,70.36168327,139.7111654
279 | 404,Farmington,IL,91.7144044,72.0223174
280 | 23,Honolulu,HI,110.101955,139.7437776
281 | 1,Pfeifer,KS,37.44478047,65.68491252
282 | 127,Oshtemo,MI,100.3702957,135.9503227
283 | 657,Gridley,KS,118.1450367,55.80178454
--------------------------------------------------------------------------------
/Problem 8/problem8.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/23 22:25:39 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
44 | "23/02/23 22:25:40 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
45 | "23/02/23 22:25:40 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n"
46 | ]
47 | }
48 | ],
49 | "source": [
50 | "#Start Spark Session\n",
51 | "spark = SparkSession.builder.appName(\"problem8\").getOrCreate()\n",
52 | "sqlContext = SparkSession(spark)\n",
53 | "#Dont Show warning only error\n",
54 | "spark.sparkContext.setLogLevel(\"ERROR\")"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 3,
60 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
61 | "metadata": {},
62 | "outputs": [
63 | {
64 | "name": "stderr",
65 | "output_type": "stream",
66 | "text": [
67 | " \r"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "#Load CSV file into DataFrame\n",
73 | "ridelogdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"ride_log.csv\")"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 4,
79 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
80 | "metadata": {},
81 | "outputs": [
82 | {
83 | "name": "stdout",
84 | "output_type": "stream",
85 | "text": [
86 | "root\n",
87 | " |-- id: integer (nullable = true)\n",
88 | " |-- user_id: integer (nullable = true)\n",
89 | " |-- distance: integer (nullable = true)\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "#Check Schema of DataFrame\n",
96 | "ridelogdf.printSchema()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "id": "693b0edd-852f-46de-b983-81357b95ad36",
103 | "metadata": {},
104 | "outputs": [],
105 | "source": [
106 | "#Load CSV file into DataFrame\n",
107 | "userdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"user.csv\")"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 6,
113 | "id": "e9d65b9d-fe5a-4ed3-91b4-c9246551cce9",
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "name": "stdout",
118 | "output_type": "stream",
119 | "text": [
120 | "root\n",
121 | " |-- id: integer (nullable = true)\n",
122 | " |-- name: string (nullable = true)\n",
123 | "\n"
124 | ]
125 | }
126 | ],
127 | "source": [
128 | "#Check Schema of DataFrame\n",
129 | "userdf.printSchema()"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 10,
135 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "# Now we are solving Same problem using Spark SQL \n",
140 | "# Creating Temp Table or HIVE table\n",
141 | "ridelogdf.createOrReplaceTempView(\"tmpRidelog\")\n",
142 | "userdf.createOrReplaceTempView(\"tmpUser\")"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 11,
148 | "id": "e55eb16a-fb5c-42b6-9f7c-feb1ff9c2945",
149 | "metadata": {},
150 | "outputs": [
151 | {
152 | "name": "stdout",
153 | "output_type": "stream",
154 | "text": [
155 | "+---+-------+--------+\n",
156 | "| id|user_id|distance|\n",
157 | "+---+-------+--------+\n",
158 | "|101| 8| 93|\n",
159 | "|102| 40| 56|\n",
160 | "|103| 28| 83|\n",
161 | "|104| 33| 83|\n",
162 | "|105| 1| 87|\n",
163 | "|106| 32| 49|\n",
164 | "|107| 3| 5|\n",
165 | "|108| 23| 37|\n",
166 | "|109| 31| 62|\n",
167 | "|110| 1| 35|\n",
168 | "|111| 41| 89|\n",
169 | "|112| 19| 64|\n",
170 | "|113| 49| 57|\n",
171 | "|114| 28| 68|\n",
172 | "|115| 48| 94|\n",
173 | "|116| 50| 89|\n",
174 | "|117| 48| 29|\n",
175 | "|118| 13| 16|\n",
176 | "|119| 24| 58|\n",
177 | "|120| 25| 19|\n",
178 | "+---+-------+--------+\n",
179 | "only showing top 20 rows\n",
180 | "\n"
181 | ]
182 | }
183 | ],
184 | "source": [
185 | "sqlContext.sql(\"SELECT * FROM tmpRidelog\").show()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": 12,
191 | "id": "845b2bd0-a09a-45ca-84f4-0e3593ad9026",
192 | "metadata": {},
193 | "outputs": [
194 | {
195 | "name": "stdout",
196 | "output_type": "stream",
197 | "text": [
198 | "+---+-------------------+\n",
199 | "| id| name|\n",
200 | "+---+-------------------+\n",
201 | "| 1| Dustin Smith|\n",
202 | "| 2| Jay Ramirez|\n",
203 | "| 3| Joseph Cooke|\n",
204 | "| 4| Melinda Young|\n",
205 | "| 5| Sean Parker|\n",
206 | "| 6| Ian Foster|\n",
207 | "| 7|Christopher Schmitt|\n",
208 | "| 8| Patrick Gutierrez|\n",
209 | "| 9| Dennis Douglas|\n",
210 | "| 10| Brenda Morris|\n",
211 | "| 11| Jeffery Hernandez|\n",
212 | "| 12| David Rice|\n",
213 | "| 13| Charles Foster|\n",
214 | "| 14| Keith Perez DVM|\n",
215 | "| 15| Dean Cuevas|\n",
216 | "| 16| Melissa Bishop|\n",
217 | "| 17| Alexander Howell|\n",
218 | "| 18| Austin Robertson|\n",
219 | "| 19| Sherri Mcdaniel|\n",
220 | "| 20| Nancy Nguyen|\n",
221 | "+---+-------------------+\n",
222 | "only showing top 20 rows\n",
223 | "\n"
224 | ]
225 | }
226 | ],
227 | "source": [
228 | "sqlContext.sql(\"SELECT * FROM tmpUser\").show()"
229 | ]
230 | },
231 | {
232 | "cell_type": "code",
233 | "execution_count": 32,
234 | "id": "d4a77a98-b4e8-4c5d-bb66-c8d1d17bc95e",
235 | "metadata": {},
236 | "outputs": [
237 | {
238 | "name": "stdout",
239 | "output_type": "stream",
240 | "text": [
241 | "+-------+-------------------+-----+----------+\n",
242 | "|user_id| name|total|actualrank|\n",
243 | "+-------+-------------------+-----+----------+\n",
244 | "| 3| Joseph Cooke| 5| 1|\n",
245 | "| 45| Benjamin Mcbride| 11| 2|\n",
246 | "| 13| Charles Foster| 16| 3|\n",
247 | "| 18| Austin Robertson| 27| 4|\n",
248 | "| 36| Alyssa Shaw| 28| 5|\n",
249 | "| 37| Destiny Clark| 48| 6|\n",
250 | "| 40| Stacy Bryant| 56| 7|\n",
251 | "| 19| Sherri Mcdaniel| 64| 8|\n",
252 | "| 23| Joseph Hamilton| 79| 9|\n",
253 | "| 21| Melody Ball| 81| 10|\n",
254 | "| 39| Mark Diaz| 81| 10|\n",
255 | "| 38| Thomas Lara| 82| 12|\n",
256 | "| 33| Donna Ortiz| 83| 13|\n",
257 | "| 31| Shannon Green| 86| 14|\n",
258 | "| 41| Howard Rose| 89| 15|\n",
259 | "| 10| Brenda Morris| 90| 16|\n",
260 | "| 27| Jacqueline Heath| 91| 17|\n",
261 | "| 5| Sean Parker| 92| 18|\n",
262 | "| 7|Christopher Schmitt| 96| 19|\n",
263 | "| 46| Elizabeth Ward| 108| 20|\n",
264 | "+-------+-------------------+-----+----------+\n",
265 | "only showing top 20 rows\n",
266 | "\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "sqlContext.sql(\"SELECT user_id \\\n",
272 | " , name \\\n",
273 | " , sum(distance) as total\\\n",
274 | " , RANK() OVER (ORDER BY sum(distance)) as actualrank \\\n",
275 | " FROM tmpRidelog as log \\\n",
276 | " LEFT OUTER JOIN tmpUser as users \\\n",
277 | " ON log.user_id = users.id \\\n",
278 | " GROUP BY user_id, name\").show()"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 27,
284 | "id": "b3654e1a-6d81-418c-b16c-c605d480fde9",
285 | "metadata": {},
286 | "outputs": [
287 | {
288 | "name": "stdout",
289 | "output_type": "stream",
290 | "text": [
291 | "+-------+----------------+-----+\n",
292 | "|user_id| name|total|\n",
293 | "+-------+----------------+-----+\n",
294 | "| 3| Joseph Cooke| 5|\n",
295 | "| 45|Benjamin Mcbride| 11|\n",
296 | "| 13| Charles Foster| 16|\n",
297 | "| 18|Austin Robertson| 27|\n",
298 | "| 36| Alyssa Shaw| 28|\n",
299 | "| 37| Destiny Clark| 48|\n",
300 | "| 40| Stacy Bryant| 56|\n",
301 | "| 19| Sherri Mcdaniel| 64|\n",
302 | "| 23| Joseph Hamilton| 79|\n",
303 | "| 39| Mark Diaz| 81|\n",
304 | "| 21| Melody Ball| 81|\n",
305 | "+-------+----------------+-----+\n",
306 | "\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "sqlContext.sql(\"SELECT q.user_id, q.name, q.total \\\n",
312 | " FROM \\\n",
313 | " ( \\\n",
314 | " SELECT user_id \\\n",
315 | " , name \\\n",
316 | " , sum(distance) as total\\\n",
317 | " , RANK() OVER (ORDER BY sum(distance)) as actualrank \\\n",
318 | " FROM tmpRidelog as log \\\n",
319 | " LEFT OUTER JOIN tmpUser as users \\\n",
320 | " ON log.user_id = users.id \\\n",
321 | " GROUP BY user_id, name ) as q \\\n",
322 | " WHERE q.actualrank <= 10\").show()"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": 28,
328 | "id": "b40d4481-7dfc-4cfc-a8f1-2d11e092dc2b",
329 | "metadata": {},
330 | "outputs": [
331 | {
332 | "name": "stdout",
333 | "output_type": "stream",
334 | "text": [
335 | "+-------+-----------------+-----+\n",
336 | "|user_id| name|total|\n",
337 | "+-------+-----------------+-----+\n",
338 | "| 47| Christina Price| 328|\n",
339 | "| 34| Jennifer Simmons| 277|\n",
340 | "| 43| Kimberly Potter| 275|\n",
341 | "| 8|Patrick Gutierrez| 243|\n",
342 | "| 25| Crystal Berg| 239|\n",
343 | "| 14| Keith Perez DVM| 214|\n",
344 | "| 32| Stacy Collins| 210|\n",
345 | "| 11|Jeffery Hernandez| 206|\n",
346 | "| 9| Dennis Douglas| 206|\n",
347 | "| 17| Alexander Howell| 205|\n",
348 | "+-------+-----------------+-----+\n",
349 | "\n"
350 | ]
351 | }
352 | ],
353 | "source": [
354 | "sqlContext.sql(\"SELECT q.user_id, q.name, q.total \\\n",
355 | " FROM \\\n",
356 | " ( \\\n",
357 | " SELECT user_id \\\n",
358 | " , name \\\n",
359 | " , sum(distance) as total\\\n",
360 | " , RANK() OVER (ORDER BY sum(distance) DESC) as actualrank \\\n",
361 | " FROM tmpRidelog as log \\\n",
362 | " LEFT OUTER JOIN tmpUser as users \\\n",
363 | " ON log.user_id = users.id \\\n",
364 | " GROUP BY user_id, name ) as q \\\n",
365 | " WHERE q.actualrank <= 10\").show()"
366 | ]
367 | }
368 | ],
369 | "metadata": {
370 | "kernelspec": {
371 | "display_name": "Python 3 (ipykernel)",
372 | "language": "python",
373 | "name": "python3"
374 | },
375 | "language_info": {
376 | "codemirror_mode": {
377 | "name": "ipython",
378 | "version": 3
379 | },
380 | "file_extension": ".py",
381 | "mimetype": "text/x-python",
382 | "name": "python",
383 | "nbconvert_exporter": "python",
384 | "pygments_lexer": "ipython3",
385 | "version": "3.8.13"
386 | }
387 | },
388 | "nbformat": 4,
389 | "nbformat_minor": 5
390 | }
391 |
--------------------------------------------------------------------------------
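The markdown cell in problem8.ipynb promises both a PySpark-function solution and a Spark SQL one, but only the SQL variant appears in the notebook. A minimal DataFrame-API sketch of the same top-10 ranking, assuming the ridelogdf and userdf frames loaded there:

# Sketch only: DataFrame-API equivalent of the notebook's Spark SQL query,
# assuming ridelogdf and userdf are already loaded as in problem8.ipynb.
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Total distance per rider, keeping the rider's name via a left join.
totals = (
    ridelogdf.join(userdf, ridelogdf.user_id == userdf.id, "left")
             .groupBy("user_id", "name")
             .agg(F.sum("distance").alias("total"))
)

# RANK() OVER (ORDER BY total DESC): ties share a rank and the next rank is
# skipped, as in the ascending output above (two riders at 81 both rank 10,
# then the next rank is 12).
w = Window.orderBy(F.desc("total"))
totals.withColumn("actualrank", F.rank().over(w)) \
      .filter("actualrank <= 10") \
      .show()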
/Problem 1/employee.json:
--------------------------------------------------------------------------------
1 | {
2 | "columns": [
3 | "id",
4 | "first_name",
5 | "last_name",
6 | "salary",
7 | "department_id"
8 | ],
9 | "data": [
10 | [
11 | 1,
12 | "Todd",
13 | "Wilson",
14 | 110000,
15 | 1006
16 | ],
17 | [
18 | 1,
19 | "Todd",
20 | "Wilson",
21 | 106119,
22 | 1006
23 | ],
24 | [
25 | 2,
26 | "Justin",
27 | "Simon",
28 | 128922,
29 | 1005
30 | ],
31 | [
32 | 2,
33 | "Justin",
34 | "Simon",
35 | 130000,
36 | 1005
37 | ],
38 | [
39 | 3,
40 | "Kelly",
41 | "Rosario",
42 | 42689,
43 | 1002
44 | ],
45 | [
46 | 4,
47 | "Patricia",
48 | "Powell",
49 | 162825,
50 | 1004
51 | ],
52 | [
53 | 4,
54 | "Patricia",
55 | "Powell",
56 | 170000,
57 | 1004
58 | ],
59 | [
60 | 5,
61 | "Sherry",
62 | "Golden",
63 | 44101,
64 | 1002
65 | ],
66 | [
67 | 6,
68 | "Natasha",
69 | "Swanson",
70 | 79632,
71 | 1005
72 | ],
73 | [
74 | 6,
75 | "Natasha",
76 | "Swanson",
77 | 90000,
78 | 1005
79 | ],
80 | [
81 | 7,
82 | "Diane",
83 | "Gordon",
84 | 74591,
85 | 1002
86 | ],
87 | [
88 | 8,
89 | "Mercedes",
90 | "Rodriguez",
91 | 61048,
92 | 1005
93 | ],
94 | [
95 | 9,
96 | "Christy",
97 | "Mitchell",
98 | 137236,
99 | 1001
100 | ],
101 | [
102 | 9,
103 | "Christy",
104 | "Mitchell",
105 | 140000,
106 | 1001
107 | ],
108 | [
109 | 9,
110 | "Christy",
111 | "Mitchell",
112 | 150000,
113 | 1001
114 | ],
115 | [
116 | 10,
117 | "Sean",
118 | "Crawford",
119 | 182065,
120 | 1006
121 | ],
122 | [
123 | 10,
124 | "Sean",
125 | "Crawford",
126 | 190000,
127 | 1006
128 | ],
129 | [
130 | 11,
131 | "Kevin",
132 | "Townsend",
133 | 166861,
134 | 1002
135 | ],
136 | [
137 | 12,
138 | "Joshua",
139 | "Johnson",
140 | 123082,
141 | 1004
142 | ],
143 | [
144 | 13,
145 | "Julie",
146 | "Sanchez",
147 | 185663,
148 | 1001
149 | ],
150 | [
151 | 13,
152 | "Julie",
153 | "Sanchez",
154 | 200000,
155 | 1001
156 | ],
157 | [
158 | 13,
159 | "Julie",
160 | "Sanchez",
161 | 210000,
162 | 1001
163 | ],
164 | [
165 | 14,
166 | "John",
167 | "Coleman",
168 | 152434,
169 | 1001
170 | ],
171 | [
172 | 15,
173 | "Anthony",
174 | "Valdez",
175 | 96898,
176 | 1001
177 | ],
178 | [
179 | 16,
180 | "Briana",
181 | "Rivas",
182 | 151668,
183 | 1005
184 | ],
185 | [
186 | 17,
187 | "Jason",
188 | "Burnett",
189 | 42525,
190 | 1006
191 | ],
192 | [
193 | 18,
194 | "Jeffrey",
195 | "Harris",
196 | 14491,
197 | 1002
198 | ],
199 | [
200 | 18,
201 | "Jeffrey",
202 | "Harris",
203 | 20000,
204 | 1002
205 | ],
206 | [
207 | 19,
208 | "Michael",
209 | "Ramsey",
210 | 63159,
211 | 1003
212 | ],
213 | [
214 | 20,
215 | "Cody",
216 | "Gonzalez",
217 | 112809,
218 | 1004
219 | ],
220 | [
221 | 21,
222 | "Stephen",
223 | "Berry",
224 | 123617,
225 | 1002
226 | ],
227 | [
228 | 22,
229 | "Brittany",
230 | "Scott",
231 | 162537,
232 | 1002
233 | ],
234 | [
235 | 23,
236 | "Angela",
237 | "Williams",
238 | 100875,
239 | 1004
240 | ],
241 | [
242 | 24,
243 | "William",
244 | "Flores",
245 | 142674,
246 | 1003
247 | ],
248 | [
249 | 25,
250 | "Pamela",
251 | "Matthews",
252 | 57944,
253 | 1005
254 | ],
255 | [
256 | 26,
257 | "Allison",
258 | "Johnson",
259 | 128782,
260 | 1001
261 | ],
262 | [
263 | 27,
264 | "Anthony",
265 | "Ball",
266 | 34386,
267 | 1003
268 | ],
269 | [
270 | 28,
271 | "Alexis",
272 | "Beck",
273 | 12260,
274 | 1005
275 | ],
276 | [
277 | 29,
278 | "Jason",
279 | "Olsen",
280 | 51937,
281 | 1006
282 | ],
283 | [
284 | 30,
285 | "Stephen",
286 | "Smith",
287 | 194791,
288 | 1001
289 | ],
290 | [
291 | 31,
292 | "Kimberly",
293 | "Brooks",
294 | 95327,
295 | 1003
296 | ],
297 | [
298 | 32,
299 | "Eric",
300 | "Zimmerman",
301 | 83093,
302 | 1006
303 | ],
304 | [
305 | 33,
306 | "Peter",
307 | "Holt",
308 | 69945,
309 | 1002
310 | ],
311 | [
312 | 34,
313 | "Justin",
314 | "Dunn",
315 | 67992,
316 | 1003
317 | ],
318 | [
319 | 35,
320 | "John",
321 | "Ball",
322 | 47795,
323 | 1004
324 | ],
325 | [
326 | 36,
327 | "Jesus",
328 | "Ward",
329 | 36078,
330 | 1005
331 | ],
332 | [
333 | 37,
334 | "Philip",
335 | "Gillespie",
336 | 36424,
337 | 1006
338 | ],
339 | [
340 | 38,
341 | "Nicole",
342 | "Lewis",
343 | 114079,
344 | 1001
345 | ],
346 | [
347 | 39,
348 | "Linda",
349 | "Clark",
350 | 186781,
351 | 1002
352 | ],
353 | [
354 | 40,
355 | "Colleen",
356 | "Carrillo",
357 | 147723,
358 | 1004
359 | ],
360 | [
361 | 41,
362 | "John",
363 | "George",
364 | 21642,
365 | 1001
366 | ],
367 | [
368 | 42,
369 | "Traci",
370 | "Williams",
371 | 138892,
372 | 1003
373 | ],
374 | [
375 | 42,
376 | "Traci",
377 | "Williams",
378 | 150000,
379 | 1003
380 | ],
381 | [
382 | 42,
383 | "Traci",
384 | "Williams",
385 | 160000,
386 | 1003
387 | ],
388 | [
389 | 42,
390 | "Traci",
391 | "Williams",
392 | 180000,
393 | 1003
394 | ],
395 | [
396 | 43,
397 | "Joseph",
398 | "Rogers",
399 | 22800,
400 | 1005
401 | ],
402 | [
403 | 44,
404 | "Trevor",
405 | "Carter",
406 | 38670,
407 | 1001
408 | ],
409 | [
410 | 45,
411 | "Kevin",
412 | "Duncan",
413 | 45210,
414 | 1003
415 | ],
416 | [
417 | 46,
418 | "Joshua",
419 | "Ewing",
420 | 73088,
421 | 1003
422 | ],
423 | [
424 | 47,
425 | "Kimberly",
426 | "Dean",
427 | 71416,
428 | 1003
429 | ],
430 | [
431 | 48,
432 | "Robert",
433 | "Lynch",
434 | 117960,
435 | 1004
436 | ],
437 | [
438 | 49,
439 | "Amber",
440 | "Harding",
441 | 77764,
442 | 1002
443 | ],
444 | [
445 | 50,
446 | "Victoria",
447 | "Wilson",
448 | 176620,
449 | 1002
450 | ],
451 | [
452 | 51,
453 | "Theresa",
454 | "Everett",
455 | 31404,
456 | 1002
457 | ],
458 | [
459 | 52,
460 | "Kara",
461 | "Smith",
462 | 192838,
463 | 1004
464 | ],
465 | [
466 | 53,
467 | "Teresa",
468 | "Cohen",
469 | 98860,
470 | 1001
471 | ],
472 | [
473 | 54,
474 | "Wesley",
475 | "Tucker",
476 | 90221,
477 | 1005
478 | ],
479 | [
480 | 55,
481 | "Michael",
482 | "Morris",
483 | 106799,
484 | 1005
485 | ],
486 | [
487 | 56,
488 | "Rachael",
489 | "Williams",
490 | 103585,
491 | 1002
492 | ],
493 | [
494 | 57,
495 | "Patricia",
496 | "Harmon",
497 | 147417,
498 | 1005
499 | ],
500 | [
501 | 58,
502 | "Edward",
503 | "Sharp",
504 | 41077,
505 | 1005
506 | ],
507 | [
508 | 59,
509 | "Kevin",
510 | "Robinson",
511 | 100924,
512 | 1005
513 | ],
514 | [
515 | 60,
516 | "Charles",
517 | "Pearson",
518 | 173317,
519 | 1004
520 | ],
521 | [
522 | 61,
523 | "Ryan",
524 | "Brown",
525 | 110225,
526 | 1003
527 | ],
528 | [
529 | 61,
530 | "Ryan",
531 | "Brown",
532 | 120000,
533 | 1003
534 | ],
535 | [
536 | 62,
537 | "Dale",
538 | "Hayes",
539 | 97662,
540 | 1005
541 | ],
542 | [
543 | 63,
544 | "Richard",
545 | "Sanford",
546 | 136083,
547 | 1001
548 | ],
549 | [
550 | 64,
551 | "Danielle",
552 | "Williams",
553 | 98655,
554 | 1006
555 | ],
556 | [
557 | 64,
558 | "Danielle",
559 | "Williams",
560 | 110000,
561 | 1006
562 | ],
563 | [
564 | 64,
565 | "Danielle",
566 | "Williams",
567 | 120000,
568 | 1006
569 | ],
570 | [
571 | 65,
572 | "Deborah",
573 | "Martin",
574 | 67389,
575 | 1004
576 | ],
577 | [
578 | 66,
579 | "Dustin",
580 | "Bush",
581 | 47567,
582 | 1004
583 | ],
584 | [
585 | 67,
586 | "Tyler",
587 | "Green",
588 | 111085,
589 | 1002
590 | ],
591 | [
592 | 68,
593 | "Antonio",
594 | "Carpenter",
595 | 83684,
596 | 1002
597 | ],
598 | [
599 | 69,
600 | "Ernest",
601 | "Peterson",
602 | 115993,
603 | 1005
604 | ],
605 | [
606 | 70,
607 | "Karen",
608 | "Fernandez",
609 | 101238,
610 | 1003
611 | ],
612 | [
613 | 71,
614 | "Kristine",
615 | "Casey",
616 | 67651,
617 | 1003
618 | ],
619 | [
620 | 72,
621 | "Christine",
622 | "Frye",
623 | 137244,
624 | 1004
625 | ],
626 | [
627 | 73,
628 | "William",
629 | "Preston",
630 | 155225,
631 | 1003
632 | ],
633 | [
634 | 74,
635 | "Richard",
636 | "Cole",
637 | 180361,
638 | 1003
639 | ],
640 | [
641 | 75,
642 | "Julia",
643 | "Ramos",
644 | 61398,
645 | 1006
646 | ],
647 | [
648 | 75,
649 | "Julia",
650 | "Ramos",
651 | 70000,
652 | 1006
653 | ],
654 | [
655 | 75,
656 | "Julia",
657 | "Ramos",
658 | 83000,
659 | 1006
660 | ],
661 | [
662 | 75,
663 | "Julia",
664 | "Ramos",
665 | 90000,
666 | 1006
667 | ],
668 | [
669 | 75,
670 | "Julia",
671 | "Ramos",
672 | 105000,
673 | 1006
674 | ]
675 | ]
676 | }
--------------------------------------------------------------------------------
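employee.json stores the table in a columns/data layout, with an id repeated once per salary record (for example, id 75 appears five times as Julia Ramos's salary grows from 61398 to 105000). A minimal sketch for loading it, assuming plain Python with pandas rather than the Spark used elsewhere in the repo:

# Sketch only: load the columns/data-oriented JSON into a DataFrame.
import json
import pandas as pd

with open("employee.json") as f:
    obj = json.load(f)

# Each inner list in obj["data"] is one salary record for an employee.
employees = pd.DataFrame(obj["data"], columns=obj["columns"])

# Because ids repeat, collapsing to one row per employee means aggregating,
# e.g. taking the highest recorded salary per id.
max_salary = employees.groupby(
    ["id", "first_name", "last_name", "department_id"], as_index=False
)["salary"].max()
print(max_salary.head())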
/Problem 2/problem2_2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/08 11:29:48 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
44 | "23/02/08 11:29:51 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "#Start Spark Session\n",
50 | "spark = SparkSession.builder.appName(\"problem2\").getOrCreate()\n",
51 | "sqlContext = SparkSession(spark)\n",
52 | "#Dont Show warning only error\n",
53 | "spark.sparkContext.setLogLevel(\"ERROR\")"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
60 | "metadata": {},
61 | "outputs": [
62 | {
63 | "name": "stderr",
64 | "output_type": "stream",
65 | "text": [
66 | " \r"
67 | ]
68 | }
69 | ],
70 | "source": [
71 | "#Load CSV file into DataFrame\n",
72 | "employeedf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"employee_salary.csv\")"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "name": "stdout",
83 | "output_type": "stream",
84 | "text": [
85 | "root\n",
86 | " |-- id: integer (nullable = true)\n",
87 | " |-- first_name: string (nullable = true)\n",
88 | " |-- last_name: string (nullable = true)\n",
89 | " |-- salary: integer (nullable = true)\n",
90 | " |-- department_id: integer (nullable = true)\n",
91 | "\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "#Check Schema of DataFrame\n",
97 | "employeedf.printSchema()"
98 | ]
99 | },
100 | {
101 | "cell_type": "code",
102 | "execution_count": 5,
103 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
104 | "metadata": {},
105 | "outputs": [
106 | {
107 | "name": "stdout",
108 | "output_type": "stream",
109 | "text": [
110 | "+---+----------+---------+------+-------------+\n",
111 | "| id|first_name|last_name|salary|department_id|\n",
112 | "+---+----------+---------+------+-------------+\n",
113 | "| 45| Kevin| Duncan| 45210| 1003|\n",
114 | "| 25| Pamela| Matthews| 57944| 1005|\n",
115 | "| 48| Robert| Lynch|117960| 1004|\n",
116 | "| 34| Justin| Dunn| 67992| 1003|\n",
117 | "| 62| Dale| Hayes| 97662| 1005|\n",
118 | "| 1| Todd| Wilson|110000| 1006|\n",
119 | "| 61| Ryan| Brown|120000| 1003|\n",
120 | "| 21| Stephen| Berry|123617| 1002|\n",
121 | "| 13| Julie| Sanchez|210000| 1001|\n",
122 | "| 55| Michael| Morris|106799| 1005|\n",
123 | "| 44| Trevor| Carter| 38670| 1001|\n",
124 | "| 73| William| Preston|155225| 1003|\n",
125 | "| 39| Linda| Clark|186781| 1002|\n",
126 | "| 10| Sean| Crawford|190000| 1006|\n",
127 | "| 30| Stephen| Smith|194791| 1001|\n",
128 | "| 75| Julia| Ramos|105000| 1006|\n",
129 | "| 59| Kevin| Robinson|100924| 1005|\n",
130 | "| 69| Ernest| Peterson|115993| 1005|\n",
131 | "| 65| Deborah| Martin| 67389| 1004|\n",
132 | "| 63| Richard| Sanford|136083| 1001|\n",
133 | "+---+----------+---------+------+-------------+\n",
134 | "only showing top 20 rows\n",
135 | "\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "#Check sample Data \n",
141 | "employeedf.show()"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 6,
147 | "id": "c6b4f318-0d5f-4be1-b9df-7fe6b3b008dd",
148 | "metadata": {},
149 | "outputs": [],
150 | "source": [
151 | "#Load CSV file into DataFrame\n",
152 | "departmentdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"department.csv\")"
153 | ]
154 | },
155 | {
156 | "cell_type": "code",
157 | "execution_count": 7,
158 | "id": "f4c4435b-dbdd-4890-9c0c-5b6e680005d4",
159 | "metadata": {},
160 | "outputs": [
161 | {
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "root\n",
166 | " |-- department_id: integer (nullable = true)\n",
167 | " |-- department_name: string (nullable = true)\n",
168 | "\n"
169 | ]
170 | }
171 | ],
172 | "source": [
173 | "#Check Schema of DataFrame\n",
174 | "departmentdf.printSchema()"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 8,
180 | "id": "296c262a-a858-46a2-9bb3-38d212b52daf",
181 | "metadata": {},
182 | "outputs": [
183 | {
184 | "name": "stdout",
185 | "output_type": "stream",
186 | "text": [
187 | "+-------------+---------------+\n",
188 | "|department_id|department_name|\n",
189 | "+-------------+---------------+\n",
190 | "| 1005| Sales|\n",
191 | "| 1002| Finanace|\n",
192 | "| 1004| Purchase|\n",
193 | "| 1001| Operations|\n",
194 | "| 1006| Marketing|\n",
195 | "| 1003| Technoogy|\n",
196 | "+-------------+---------------+\n",
197 | "\n"
198 | ]
199 | }
200 | ],
201 | "source": [
202 | "#Check sample Data \n",
203 | "departmentdf.show()"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 10,
209 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171",
210 | "metadata": {},
211 | "outputs": [
212 | {
213 | "name": "stdout",
214 | "output_type": "stream",
215 | "text": [
216 | "+---+----------+---------+------+-------------+-------------+---------------+\n",
217 | "| id|first_name|last_name|salary|department_id|department_id|department_name|\n",
218 | "+---+----------+---------+------+-------------+-------------+---------------+\n",
219 | "| 45| Kevin| Duncan| 45210| 1003| 1003| Technoogy|\n",
220 | "| 25| Pamela| Matthews| 57944| 1005| 1005| Sales|\n",
221 | "| 48| Robert| Lynch|117960| 1004| 1004| Purchase|\n",
222 | "| 34| Justin| Dunn| 67992| 1003| 1003| Technoogy|\n",
223 | "| 62| Dale| Hayes| 97662| 1005| 1005| Sales|\n",
224 | "| 1| Todd| Wilson|110000| 1006| 1006| Marketing|\n",
225 | "| 61| Ryan| Brown|120000| 1003| 1003| Technoogy|\n",
226 | "| 21| Stephen| Berry|123617| 1002| 1002| Finanace|\n",
227 | "| 13| Julie| Sanchez|210000| 1001| 1001| Operations|\n",
228 | "| 55| Michael| Morris|106799| 1005| 1005| Sales|\n",
229 | "| 44| Trevor| Carter| 38670| 1001| 1001| Operations|\n",
230 | "| 73| William| Preston|155225| 1003| 1003| Technoogy|\n",
231 | "| 39| Linda| Clark|186781| 1002| 1002| Finanace|\n",
232 | "| 10| Sean| Crawford|190000| 1006| 1006| Marketing|\n",
233 | "| 30| Stephen| Smith|194791| 1001| 1001| Operations|\n",
234 | "| 75| Julia| Ramos|105000| 1006| 1006| Marketing|\n",
235 | "| 59| Kevin| Robinson|100924| 1005| 1005| Sales|\n",
236 | "| 69| Ernest| Peterson|115993| 1005| 1005| Sales|\n",
237 | "| 65| Deborah| Martin| 67389| 1004| 1004| Purchase|\n",
238 | "| 63| Richard| Sanford|136083| 1001| 1001| Operations|\n",
239 | "+---+----------+---------+------+-------------+-------------+---------------+\n",
240 | "only showing top 20 rows\n",
241 | "\n"
242 | ]
243 | }
244 | ],
245 | "source": [
246 | "#Solving Problem using PySpark \n",
247 | "# 2. Provide count of employees in each departnent with department name. \n",
248 | "\n",
249 | "joineddf = departmentdf.join(employeedf, employeedf.department_id == departmentdf.department_id,\"left\")\n",
250 | "joineddf.show()"
251 | ]
252 | },
253 | {
254 | "cell_type": "code",
255 | "execution_count": 12,
256 | "id": "79d16d14-c013-416c-9552-d95e0900d4f8",
257 | "metadata": {},
258 | "outputs": [
259 | {
260 | "name": "stdout",
261 | "output_type": "stream",
262 | "text": [
263 | "+---------------+-----+\n",
264 | "|department_name|count|\n",
265 | "+---------------+-----+\n",
266 | "| Purchase| 12|\n",
267 | "| Sales| 15|\n",
268 | "| Finanace| 15|\n",
269 | "| Technoogy| 14|\n",
270 | "| Marketing| 8|\n",
271 | "| Operations| 11|\n",
272 | "+---------------+-----+\n",
273 | "\n"
274 | ]
275 | }
276 | ],
277 | "source": [
278 | "joineddf.groupBy(\"department_name\").count().show()"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": 13,
284 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
285 | "metadata": {},
286 | "outputs": [],
287 | "source": [
288 | "# Now we are solving Same problem using Spark SQL \n",
289 | "# Creating Temp Table or HIVE table\n",
290 | "employeedf.createOrReplaceTempView(\"tmpEmployee\")\n",
291 | "departmentdf.createOrReplaceTempView(\"tmpDepartment\")"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 14,
297 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "name": "stdout",
302 | "output_type": "stream",
303 | "text": [
304 | "+---+----------+---------+------+-------------+\n",
305 | "| id|first_name|last_name|salary|department_id|\n",
306 | "+---+----------+---------+------+-------------+\n",
307 | "| 45| Kevin| Duncan| 45210| 1003|\n",
308 | "| 25| Pamela| Matthews| 57944| 1005|\n",
309 | "| 48| Robert| Lynch|117960| 1004|\n",
310 | "| 34| Justin| Dunn| 67992| 1003|\n",
311 | "| 62| Dale| Hayes| 97662| 1005|\n",
312 | "| 1| Todd| Wilson|110000| 1006|\n",
313 | "| 61| Ryan| Brown|120000| 1003|\n",
314 | "| 21| Stephen| Berry|123617| 1002|\n",
315 | "| 13| Julie| Sanchez|210000| 1001|\n",
316 | "| 55| Michael| Morris|106799| 1005|\n",
317 | "| 44| Trevor| Carter| 38670| 1001|\n",
318 | "| 73| William| Preston|155225| 1003|\n",
319 | "| 39| Linda| Clark|186781| 1002|\n",
320 | "| 10| Sean| Crawford|190000| 1006|\n",
321 | "| 30| Stephen| Smith|194791| 1001|\n",
322 | "| 75| Julia| Ramos|105000| 1006|\n",
323 | "| 59| Kevin| Robinson|100924| 1005|\n",
324 | "| 69| Ernest| Peterson|115993| 1005|\n",
325 | "| 65| Deborah| Martin| 67389| 1004|\n",
326 | "| 63| Richard| Sanford|136083| 1001|\n",
327 | "+---+----------+---------+------+-------------+\n",
328 | "only showing top 20 rows\n",
329 | "\n"
330 | ]
331 | }
332 | ],
333 | "source": [
334 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
335 | "# For example by Select on table \n",
336 | "sqlContext.sql(\"SELECT * FROM tmpEmployee\").show()"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 15,
342 | "id": "d4ac25f9-cd26-44dc-9852-ee0fbae70fd1",
343 | "metadata": {},
344 | "outputs": [
345 | {
346 | "name": "stdout",
347 | "output_type": "stream",
348 | "text": [
349 | "+-------------+---------------+\n",
350 | "|department_id|department_name|\n",
351 | "+-------------+---------------+\n",
352 | "| 1005| Sales|\n",
353 | "| 1002| Finanace|\n",
354 | "| 1004| Purchase|\n",
355 | "| 1001| Operations|\n",
356 | "| 1006| Marketing|\n",
357 | "| 1003| Technoogy|\n",
358 | "+-------------+---------------+\n",
359 | "\n"
360 | ]
361 | }
362 | ],
363 | "source": [
364 | "sqlContext.sql(\"SELECT * FROM tmpDepartment\").show()"
365 | ]
366 | },
367 | {
368 | "cell_type": "code",
369 | "execution_count": 16,
370 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24",
371 | "metadata": {},
372 | "outputs": [
373 | {
374 | "name": "stdout",
375 | "output_type": "stream",
376 | "text": [
377 | "+---------------+-----------------+\n",
378 | "|department_name|count_of_employee|\n",
379 | "+---------------+-----------------+\n",
380 | "| Purchase| 12|\n",
381 | "| Sales| 15|\n",
382 | "| Finanace| 15|\n",
383 | "| Technoogy| 14|\n",
384 | "| Marketing| 8|\n",
385 | "| Operations| 11|\n",
386 | "+---------------+-----------------+\n",
387 | "\n"
388 | ]
389 | }
390 | ],
391 | "source": [
392 | "# Now we will write query to get max salary for each employee \n",
393 | "# so we will use SQL Group by and SQL Order by functions \n",
394 | "sqlContext.sql(\"SELECT department.department_name, count(*) as count_of_employee \\\n",
395 | " FROM tmpDepartment as department \\\n",
396 | " LEFT OUTER JOIN tmpEmployee as emp \\\n",
397 | " ON emp.department_id = department.department_id \\\n",
398 | " GROUP BY department.department_name\").show(n=100)\n",
399 | "\n"
400 | ]
401 | }
402 | ],
403 | "metadata": {
404 | "kernelspec": {
405 | "display_name": "Python 3 (ipykernel)",
406 | "language": "python",
407 | "name": "python3"
408 | },
409 | "language_info": {
410 | "codemirror_mode": {
411 | "name": "ipython",
412 | "version": 3
413 | },
414 | "file_extension": ".py",
415 | "mimetype": "text/x-python",
416 | "name": "python",
417 | "nbconvert_exporter": "python",
418 | "pygments_lexer": "ipython3",
419 | "version": "3.8.13"
420 | }
421 | },
422 | "nbformat": 4,
423 | "nbformat_minor": 5
424 | }
425 |
--------------------------------------------------------------------------------
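In problem2_2.ipynb the join condition employeedf.department_id == departmentdf.department_id keeps both department_id columns, which is why the joined output shows that column twice. A minimal sketch, assuming the same employeedf and departmentdf frames, that joins on the column name instead and reproduces the per-department counts:

# Sketch only: joining on the column name keeps a single department_id column.
from pyspark.sql import functions as F

counts = (
    departmentdf.join(employeedf, on="department_id", how="left")
                .groupBy("department_name")
                .agg(F.count("id").alias("count_of_employee"))
)
counts.show()

Counting a column from the employee side (F.count("id")) rather than count(*) also reports 0 for a department with no matching employees, since count(*) would count the unmatched left-side row itself.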
/Problem 2/problem2_1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/08 11:06:10 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "#Start Spark Session\n",
49 | "spark = SparkSession.builder.appName(\"problem2\").getOrCreate()\n",
50 | "sqlContext = SparkSession(spark)\n",
51 | "#Dont Show warning only error\n",
52 | "spark.sparkContext.setLogLevel(\"ERROR\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 3,
58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
59 | "metadata": {},
60 | "outputs": [
61 | {
62 | "name": "stderr",
63 | "output_type": "stream",
64 | "text": [
65 | " \r"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "#Load CSV file into DataFrame\n",
71 | "employeedf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"employee_salary.csv\")"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 4,
77 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
78 | "metadata": {},
79 | "outputs": [
80 | {
81 | "name": "stdout",
82 | "output_type": "stream",
83 | "text": [
84 | "root\n",
85 | " |-- id: integer (nullable = true)\n",
86 | " |-- first_name: string (nullable = true)\n",
87 | " |-- last_name: string (nullable = true)\n",
88 | " |-- salary: integer (nullable = true)\n",
89 | " |-- department_id: integer (nullable = true)\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "#Check Schema of DataFrame\n",
96 | "employeedf.printSchema()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 5,
102 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "+---+----------+---------+------+-------------+\n",
110 | "| id|first_name|last_name|salary|department_id|\n",
111 | "+---+----------+---------+------+-------------+\n",
112 | "| 45| Kevin| Duncan| 45210| 1003|\n",
113 | "| 25| Pamela| Matthews| 57944| 1005|\n",
114 | "| 48| Robert| Lynch|117960| 1004|\n",
115 | "| 34| Justin| Dunn| 67992| 1003|\n",
116 | "| 62| Dale| Hayes| 97662| 1005|\n",
117 | "| 1| Todd| Wilson|110000| 1006|\n",
118 | "| 61| Ryan| Brown|120000| 1003|\n",
119 | "| 21| Stephen| Berry|123617| 1002|\n",
120 | "| 13| Julie| Sanchez|210000| 1001|\n",
121 | "| 55| Michael| Morris|106799| 1005|\n",
122 | "| 44| Trevor| Carter| 38670| 1001|\n",
123 | "| 73| William| Preston|155225| 1003|\n",
124 | "| 39| Linda| Clark|186781| 1002|\n",
125 | "| 10| Sean| Crawford|190000| 1006|\n",
126 | "| 30| Stephen| Smith|194791| 1001|\n",
127 | "| 75| Julia| Ramos|105000| 1006|\n",
128 | "| 59| Kevin| Robinson|100924| 1005|\n",
129 | "| 69| Ernest| Peterson|115993| 1005|\n",
130 | "| 65| Deborah| Martin| 67389| 1004|\n",
131 | "| 63| Richard| Sanford|136083| 1001|\n",
132 | "+---+----------+---------+------+-------------+\n",
133 | "only showing top 20 rows\n",
134 | "\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "#Check sample Data \n",
140 | "employeedf.show()"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 6,
146 | "id": "c6b4f318-0d5f-4be1-b9df-7fe6b3b008dd",
147 | "metadata": {},
148 | "outputs": [],
149 | "source": [
150 | "#Load CSV file into DataFrame\n",
151 | "departmentdf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"department.csv\")"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 8,
157 | "id": "f4c4435b-dbdd-4890-9c0c-5b6e680005d4",
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "name": "stdout",
162 | "output_type": "stream",
163 | "text": [
164 | "root\n",
165 | " |-- department_id: integer (nullable = true)\n",
166 | " |-- department_name: string (nullable = true)\n",
167 | "\n"
168 | ]
169 | }
170 | ],
171 | "source": [
172 | "#Check Schema of DataFrame\n",
173 | "departmentdf.printSchema()"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 9,
179 | "id": "296c262a-a858-46a2-9bb3-38d212b52daf",
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "+-------------+---------------+\n",
187 | "|department_id|department_name|\n",
188 | "+-------------+---------------+\n",
189 | "| 1005| Sales|\n",
190 | "| 1002| Finanace|\n",
191 | "| 1004| Purchase|\n",
192 | "| 1001| Operations|\n",
193 | "| 1006| Marketing|\n",
194 | "| 1003| Technoogy|\n",
195 | "+-------------+---------------+\n",
196 | "\n"
197 | ]
198 | }
199 | ],
200 | "source": [
201 | "#Check sample Data \n",
202 | "departmentdf.show()"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 16,
208 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171",
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "name": "stdout",
213 | "output_type": "stream",
214 | "text": [
215 | "+---+----------+---------+------+-------------+-------------+---------------+\n",
216 | "| id|first_name|last_name|salary|department_id|department_id|department_name|\n",
217 | "+---+----------+---------+------+-------------+-------------+---------------+\n",
218 | "| 45| Kevin| Duncan| 45210| 1003| 1003| Technoogy|\n",
219 | "| 25| Pamela| Matthews| 57944| 1005| 1005| Sales|\n",
220 | "| 48| Robert| Lynch|117960| 1004| 1004| Purchase|\n",
221 | "| 34| Justin| Dunn| 67992| 1003| 1003| Technoogy|\n",
222 | "| 62| Dale| Hayes| 97662| 1005| 1005| Sales|\n",
223 | "| 1| Todd| Wilson|110000| 1006| 1006| Marketing|\n",
224 | "| 61| Ryan| Brown|120000| 1003| 1003| Technoogy|\n",
225 | "| 21| Stephen| Berry|123617| 1002| 1002| Finanace|\n",
226 | "| 13| Julie| Sanchez|210000| 1001| 1001| Operations|\n",
227 | "| 55| Michael| Morris|106799| 1005| 1005| Sales|\n",
228 | "| 44| Trevor| Carter| 38670| 1001| 1001| Operations|\n",
229 | "| 73| William| Preston|155225| 1003| 1003| Technoogy|\n",
230 | "| 39| Linda| Clark|186781| 1002| 1002| Finanace|\n",
231 | "| 10| Sean| Crawford|190000| 1006| 1006| Marketing|\n",
232 | "| 30| Stephen| Smith|194791| 1001| 1001| Operations|\n",
233 | "| 75| Julia| Ramos|105000| 1006| 1006| Marketing|\n",
234 | "| 59| Kevin| Robinson|100924| 1005| 1005| Sales|\n",
235 | "| 69| Ernest| Peterson|115993| 1005| 1005| Sales|\n",
236 | "| 65| Deborah| Martin| 67389| 1004| 1004| Purchase|\n",
237 | "| 63| Richard| Sanford|136083| 1001| 1001| Operations|\n",
238 | "+---+----------+---------+------+-------------+-------------+---------------+\n",
239 | "only showing top 20 rows\n",
240 | "\n"
241 | ]
242 | }
243 | ],
244 | "source": [
245 | "#Solving Problem using PySpark \n",
246 | "# 1. Use this both tables and list all the employees woking in marketing department with highest to lowest salary order. \n",
247 | "\n",
248 | "joineddf = employeedf.join(departmentdf, employeedf.department_id == departmentdf.department_id,\"left\")\n",
249 | "joineddf.show()"
250 | ]
251 | },
252 | {
253 | "cell_type": "code",
254 | "execution_count": 20,
255 | "id": "79d16d14-c013-416c-9552-d95e0900d4f8",
256 | "metadata": {},
257 | "outputs": [
258 | {
259 | "name": "stdout",
260 | "output_type": "stream",
261 | "text": [
262 | "+----------+---------+------+\n",
263 | "|first_name|last_name|salary|\n",
264 | "+----------+---------+------+\n",
265 | "| Sean| Crawford|190000|\n",
266 | "| Danielle| Williams|120000|\n",
267 | "| Todd| Wilson|110000|\n",
268 | "| Julia| Ramos|105000|\n",
269 | "| Eric|Zimmerman| 83093|\n",
270 | "| Jason| Olsen| 51937|\n",
271 | "| Jason| Burnett| 42525|\n",
272 | "| Philip|Gillespie| 36424|\n",
273 | "+----------+---------+------+\n",
274 | "\n"
275 | ]
276 | }
277 | ],
278 | "source": [
279 | "from pyspark.sql.functions import desc\n",
280 | "joineddf.select(\"first_name\",\"last_name\",\"salary\").where(\"department_name='Marketing'\").orderBy(desc(\"salary\")).show()"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 21,
286 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
287 | "metadata": {},
288 | "outputs": [],
289 | "source": [
290 | "# Now we are solving Same problem using Spark SQL \n",
291 | "# Creating Temp Table or HIVE table\n",
292 | "employeedf.createOrReplaceTempView(\"tmpEmployee\")\n",
293 | "departmentdf.createOrReplaceTempView(\"tmpDepartment\")"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 22,
299 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "+---+----------+---------+------+-------------+\n",
307 | "| id|first_name|last_name|salary|department_id|\n",
308 | "+---+----------+---------+------+-------------+\n",
309 | "| 45| Kevin| Duncan| 45210| 1003|\n",
310 | "| 25| Pamela| Matthews| 57944| 1005|\n",
311 | "| 48| Robert| Lynch|117960| 1004|\n",
312 | "| 34| Justin| Dunn| 67992| 1003|\n",
313 | "| 62| Dale| Hayes| 97662| 1005|\n",
314 | "| 1| Todd| Wilson|110000| 1006|\n",
315 | "| 61| Ryan| Brown|120000| 1003|\n",
316 | "| 21| Stephen| Berry|123617| 1002|\n",
317 | "| 13| Julie| Sanchez|210000| 1001|\n",
318 | "| 55| Michael| Morris|106799| 1005|\n",
319 | "| 44| Trevor| Carter| 38670| 1001|\n",
320 | "| 73| William| Preston|155225| 1003|\n",
321 | "| 39| Linda| Clark|186781| 1002|\n",
322 | "| 10| Sean| Crawford|190000| 1006|\n",
323 | "| 30| Stephen| Smith|194791| 1001|\n",
324 | "| 75| Julia| Ramos|105000| 1006|\n",
325 | "| 59| Kevin| Robinson|100924| 1005|\n",
326 | "| 69| Ernest| Peterson|115993| 1005|\n",
327 | "| 65| Deborah| Martin| 67389| 1004|\n",
328 | "| 63| Richard| Sanford|136083| 1001|\n",
329 | "+---+----------+---------+------+-------------+\n",
330 | "only showing top 20 rows\n",
331 | "\n"
332 | ]
333 | }
334 | ],
335 | "source": [
336 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
337 | "# For example by Select on table \n",
338 | "sqlContext.sql(\"SELECT * FROM tmpEmployee\").show()"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 23,
344 | "id": "d4ac25f9-cd26-44dc-9852-ee0fbae70fd1",
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "name": "stdout",
349 | "output_type": "stream",
350 | "text": [
351 | "+-------------+---------------+\n",
352 | "|department_id|department_name|\n",
353 | "+-------------+---------------+\n",
354 | "| 1005| Sales|\n",
355 | "| 1002| Finanace|\n",
356 | "| 1004| Purchase|\n",
357 | "| 1001| Operations|\n",
358 | "| 1006| Marketing|\n",
359 | "| 1003| Technoogy|\n",
360 | "+-------------+---------------+\n",
361 | "\n"
362 | ]
363 | }
364 | ],
365 | "source": [
366 | "sqlContext.sql(\"SELECT * FROM tmpDepartment\").show()"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 24,
372 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24",
373 | "metadata": {},
374 | "outputs": [
375 | {
376 | "name": "stdout",
377 | "output_type": "stream",
378 | "text": [
379 | "+----------+---------+------+\n",
380 | "|first_name|last_name|salary|\n",
381 | "+----------+---------+------+\n",
382 | "| Sean| Crawford|190000|\n",
383 | "| Danielle| Williams|120000|\n",
384 | "| Todd| Wilson|110000|\n",
385 | "| Julia| Ramos|105000|\n",
386 | "| Eric|Zimmerman| 83093|\n",
387 | "| Jason| Olsen| 51937|\n",
388 | "| Jason| Burnett| 42525|\n",
389 | "| Philip|Gillespie| 36424|\n",
390 | "+----------+---------+------+\n",
391 | "\n"
392 | ]
393 | }
394 | ],
395 | "source": [
396 | "# Now we will write query to get max salary for each employee \n",
397 | "# so we will use SQL Group by and SQL Order by functions \n",
398 | "sqlContext.sql(\"SELECT first_name, last_name, salary \\\n",
399 | " FROM tmpEmployee as emp \\\n",
400 | " LEFT OUTER JOIN tmpDepartment as department \\\n",
401 | " ON emp.department_id = department.department_id \\\n",
402 | " WHERE department.department_name = 'Marketing' \\\n",
403 | " ORDER BY salary DESC\").show(n=100)\n",
404 | "\n"
405 | ]
406 | }
407 | ],
408 | "metadata": {
409 | "kernelspec": {
410 | "display_name": "Python 3 (ipykernel)",
411 | "language": "python",
412 | "name": "python3"
413 | },
414 | "language_info": {
415 | "codemirror_mode": {
416 | "name": "ipython",
417 | "version": 3
418 | },
419 | "file_extension": ".py",
420 | "mimetype": "text/x-python",
421 | "name": "python",
422 | "nbconvert_exporter": "python",
423 | "pygments_lexer": "ipython3",
424 | "version": "3.8.13"
425 | }
426 | },
427 | "nbformat": 4,
428 | "nbformat_minor": 5
429 | }
430 |
--------------------------------------------------------------------------------
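A note on the join in the problem2 notebook above: because it joins with an explicit equality condition, the result carries two department_id columns (visible in the joined output). Below is a minimal sketch, assuming the same employee_salary.csv and department.csv files, of joining on the column name instead, so Spark keeps a single department_id column:

from pyspark.sql import SparkSession
from pyspark.sql.functions import desc

spark = SparkSession.builder.appName("problem2_join_sketch").getOrCreate()

# Read both CSVs with headers and inferred schemas, as in the notebook.
employeedf = spark.read.option("header", "true").option("inferSchema", "true").csv("employee_salary.csv")
departmentdf = spark.read.option("header", "true").option("inferSchema", "true").csv("department.csv")

# Passing the join key as a column name (rather than an equality expression)
# lets Spark coalesce the key into one department_id column.
joineddf = employeedf.join(departmentdf, "department_id", "left")

(joineddf
    .select("first_name", "last_name", "salary")
    .where(joineddf.department_name == "Marketing")
    .orderBy(desc("salary"))
    .show())

Joining on the name also avoids ambiguous-column errors if a later select refers to department_id.
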
/Problem 1/problem1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "4328d022-1f8d-442f-921e-d16693058a4c",
6 | "metadata": {},
7 | "source": [
8 | "Here, we will solve problems two ways\n",
9 | "1. First using PySpark function \n",
10 | "2. Second using Spark SQL"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "id": "6d4647c5-df06-4d53-b4b4-66677cc54ed1",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "# First Load all the required library and also Start Spark Session\n",
21 | "# Load all the required library\n",
22 | "from pyspark.sql import SparkSession"
23 | ]
24 | },
25 | {
26 | "cell_type": "code",
27 | "execution_count": 2,
28 | "id": "c0fdceb9-20df-4588-8820-672d48778b09",
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "name": "stderr",
33 | "output_type": "stream",
34 | "text": [
35 | "WARNING: An illegal reflective access operation has occurred\n",
36 | "WARNING: Illegal reflective access by org.apache.spark.unsafe.Platform (file:/opt/spark/jars/spark-unsafe_2.12-3.2.1.jar) to constructor java.nio.DirectByteBuffer(long,int)\n",
37 | "WARNING: Please consider reporting this to the maintainers of org.apache.spark.unsafe.Platform\n",
38 | "WARNING: Use --illegal-access=warn to enable warnings of further illegal reflective access operations\n",
39 | "WARNING: All illegal access operations will be denied in a future release\n",
40 | "Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties\n",
41 | "Setting default log level to \"WARN\".\n",
42 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
43 | "23/02/03 10:13:11 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
44 | ]
45 | }
46 | ],
47 | "source": [
48 | "#Start Spark Session\n",
49 | "spark = SparkSession.builder.appName(\"problem1\").getOrCreate()\n",
50 | "sqlContext = SparkSession(spark)\n",
51 | "#Dont Show warning only error\n",
52 | "spark.sparkContext.setLogLevel(\"ERROR\")"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 17,
58 | "id": "d5ec58af-280e-4eef-a95e-308df1bcbf68",
59 | "metadata": {},
60 | "outputs": [],
61 | "source": [
62 | "#Load CSV file into DataFrame\n",
63 | "employeedf = spark.read.format(\"csv\").option(\"header\",\"true\").option(\"inferSchema\",\"true\").load(\"employee.csv\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 18,
69 | "id": "a6604a74-b1f5-49e5-a593-f35ca2417030",
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "root\n",
77 | " |-- id: integer (nullable = true)\n",
78 | " |-- first_name: string (nullable = true)\n",
79 | " |-- last_name: string (nullable = true)\n",
80 | " |-- salary: integer (nullable = true)\n",
81 | " |-- department_id: integer (nullable = true)\n",
82 | "\n"
83 | ]
84 | }
85 | ],
86 | "source": [
87 | "#Check Schema of DataFrame\n",
88 | "employeedf.printSchema()"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 19,
94 | "id": "47481142-ee32-401e-a481-03b3dd5b80ba",
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "+---+----------+---------+------+-------------+\n",
102 | "| id|first_name|last_name|salary|department_id|\n",
103 | "+---+----------+---------+------+-------------+\n",
104 | "| 1| Todd| Wilson|110000| 1006|\n",
105 | "| 1| Todd| Wilson|106119| 1006|\n",
106 | "| 2| Justin| Simon|128922| 1005|\n",
107 | "| 2| Justin| Simon|130000| 1005|\n",
108 | "| 3| Kelly| Rosario| 42689| 1002|\n",
109 | "| 4| Patricia| Powell|162825| 1004|\n",
110 | "| 4| Patricia| Powell|170000| 1004|\n",
111 | "| 5| Sherry| Golden| 44101| 1002|\n",
112 | "| 6| Natasha| Swanson| 79632| 1005|\n",
113 | "| 6| Natasha| Swanson| 90000| 1005|\n",
114 | "| 7| Diane| Gordon| 74591| 1002|\n",
115 | "| 8| Mercedes|Rodriguez| 61048| 1005|\n",
116 | "| 9| Christy| Mitchell|137236| 1001|\n",
117 | "| 9| Christy| Mitchell|140000| 1001|\n",
118 | "| 9| Christy| Mitchell|150000| 1001|\n",
119 | "| 10| Sean| Crawford|182065| 1006|\n",
120 | "| 10| Sean| Crawford|190000| 1006|\n",
121 | "| 11| Kevin| Townsend|166861| 1002|\n",
122 | "| 12| Joshua| Johnson|123082| 1004|\n",
123 | "| 13| Julie| Sanchez|185663| 1001|\n",
124 | "+---+----------+---------+------+-------------+\n",
125 | "only showing top 20 rows\n",
126 | "\n"
127 | ]
128 | }
129 | ],
130 | "source": [
131 | "#Check sample Data \n",
132 | "employeedf.show()"
133 | ]
134 | },
135 | {
136 | "cell_type": "code",
137 | "execution_count": 20,
138 | "id": "c6b4f318-0d5f-4be1-b9df-7fe6b3b008dd",
139 | "metadata": {},
140 | "outputs": [
141 | {
142 | "data": {
143 | "text/plain": [
144 | "95"
145 | ]
146 | },
147 | "execution_count": 20,
148 | "metadata": {},
149 | "output_type": "execute_result"
150 | }
151 | ],
152 | "source": [
153 | "#Checking number of rows in dataframe\n",
154 | "employeedf.count()"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 28,
160 | "id": "8dc98254-6248-4cd6-af15-bb4b5a832171",
161 | "metadata": {},
162 | "outputs": [
163 | {
164 | "name": "stdout",
165 | "output_type": "stream",
166 | "text": [
167 | "+---+----------+---------+-------------+------+\n",
168 | "| id|first_name|last_name|department_id|salary|\n",
169 | "+---+----------+---------+-------------+------+\n",
170 | "| 1| Todd| Wilson| 1006|110000|\n",
171 | "| 1| Todd| Wilson| 1006|106119|\n",
172 | "| 2| Justin| Simon| 1005|128922|\n",
173 | "| 2| Justin| Simon| 1005|130000|\n",
174 | "| 3| Kelly| Rosario| 1002| 42689|\n",
175 | "| 4| Patricia| Powell| 1004|170000|\n",
176 | "| 4| Patricia| Powell| 1004|162825|\n",
177 | "| 5| Sherry| Golden| 1002| 44101|\n",
178 | "| 6| Natasha| Swanson| 1005| 79632|\n",
179 | "| 6| Natasha| Swanson| 1005| 90000|\n",
180 | "| 7| Diane| Gordon| 1002| 74591|\n",
181 | "| 8| Mercedes|Rodriguez| 1005| 61048|\n",
182 | "| 9| Christy| Mitchell| 1001|140000|\n",
183 | "| 9| Christy| Mitchell| 1001|150000|\n",
184 | "| 9| Christy| Mitchell| 1001|137236|\n",
185 | "| 10| Sean| Crawford| 1006|182065|\n",
186 | "| 10| Sean| Crawford| 1006|190000|\n",
187 | "| 11| Kevin| Townsend| 1002|166861|\n",
188 | "| 12| Joshua| Johnson| 1004|123082|\n",
189 | "| 13| Julie| Sanchez| 1001|185663|\n",
190 | "+---+----------+---------+-------------+------+\n",
191 | "only showing top 20 rows\n",
192 | "\n"
193 | ]
194 | }
195 | ],
196 | "source": [
197 | "#Solving Problem using PySpark \n",
198 | "# 1. We need to print latest salary of each employee\n",
199 | "# 2. We also need their id, first name, lastname, department id and latest salary \n",
200 | "# 3. We also want to order by it by id \n",
201 | "\n",
202 | "# On a first step we are just getting all the columns and doing order by \n",
203 | "\n",
204 | "employeedf.select(\"id\",\"first_name\",\"last_name\",\"department_id\",\"salary\").orderBy(\"id\").show()\n"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 33,
210 | "id": "0e256a4b-450f-4846-ba97-1e51812d590e",
211 | "metadata": {},
212 | "outputs": [
213 | {
214 | "name": "stdout",
215 | "output_type": "stream",
216 | "text": [
217 | "+---+----------+---------+-------------+-----------+\n",
218 | "| id|first_name|last_name|department_id|max(salary)|\n",
219 | "+---+----------+---------+-------------+-----------+\n",
220 | "| 1| Todd| Wilson| 1006| 110000|\n",
221 | "| 2| Justin| Simon| 1005| 130000|\n",
222 | "| 3| Kelly| Rosario| 1002| 42689|\n",
223 | "| 4| Patricia| Powell| 1004| 170000|\n",
224 | "| 5| Sherry| Golden| 1002| 44101|\n",
225 | "| 6| Natasha| Swanson| 1005| 90000|\n",
226 | "| 7| Diane| Gordon| 1002| 74591|\n",
227 | "| 8| Mercedes|Rodriguez| 1005| 61048|\n",
228 | "| 9| Christy| Mitchell| 1001| 150000|\n",
229 | "| 10| Sean| Crawford| 1006| 190000|\n",
230 | "| 11| Kevin| Townsend| 1002| 166861|\n",
231 | "| 12| Joshua| Johnson| 1004| 123082|\n",
232 | "| 13| Julie| Sanchez| 1001| 210000|\n",
233 | "| 14| John| Coleman| 1001| 152434|\n",
234 | "| 15| Anthony| Valdez| 1001| 96898|\n",
235 | "| 16| Briana| Rivas| 1005| 151668|\n",
236 | "| 17| Jason| Burnett| 1006| 42525|\n",
237 | "| 18| Jeffrey| Harris| 1002| 20000|\n",
238 | "| 19| Michael| Ramsey| 1003| 63159|\n",
239 | "| 20| Cody| Gonzalez| 1004| 112809|\n",
240 | "| 21| Stephen| Berry| 1002| 123617|\n",
241 | "| 22| Brittany| Scott| 1002| 162537|\n",
242 | "| 23| Angela| Williams| 1004| 100875|\n",
243 | "| 24| William| Flores| 1003| 142674|\n",
244 | "| 25| Pamela| Matthews| 1005| 57944|\n",
245 | "| 26| Allison| Johnson| 1001| 128782|\n",
246 | "| 27| Anthony| Ball| 1003| 34386|\n",
247 | "| 28| Alexis| Beck| 1005| 12260|\n",
248 | "| 29| Jason| Olsen| 1006| 51937|\n",
249 | "| 30| Stephen| Smith| 1001| 194791|\n",
250 | "| 31| Kimberly| Brooks| 1003| 95327|\n",
251 | "| 32| Eric|Zimmerman| 1006| 83093|\n",
252 | "| 33| Peter| Holt| 1002| 69945|\n",
253 | "| 34| Justin| Dunn| 1003| 67992|\n",
254 | "| 35| John| Ball| 1004| 47795|\n",
255 | "| 36| Jesus| Ward| 1005| 36078|\n",
256 | "| 37| Philip|Gillespie| 1006| 36424|\n",
257 | "| 38| Nicole| Lewis| 1001| 114079|\n",
258 | "| 39| Linda| Clark| 1002| 186781|\n",
259 | "| 40| Colleen| Carrillo| 1004| 147723|\n",
260 | "| 41| John| George| 1001| 21642|\n",
261 | "| 42| Traci| Williams| 1003| 180000|\n",
262 | "| 43| Joseph| Rogers| 1005| 22800|\n",
263 | "| 44| Trevor| Carter| 1001| 38670|\n",
264 | "| 45| Kevin| Duncan| 1003| 45210|\n",
265 | "| 46| Joshua| Ewing| 1003| 73088|\n",
266 | "| 47| Kimberly| Dean| 1003| 71416|\n",
267 | "| 48| Robert| Lynch| 1004| 117960|\n",
268 | "| 49| Amber| Harding| 1002| 77764|\n",
269 | "| 50| Victoria| Wilson| 1002| 176620|\n",
270 | "| 51| Theresa| Everett| 1002| 31404|\n",
271 | "| 52| Kara| Smith| 1004| 192838|\n",
272 | "| 53| Teresa| Cohen| 1001| 98860|\n",
273 | "| 54| Wesley| Tucker| 1005| 90221|\n",
274 | "| 55| Michael| Morris| 1005| 106799|\n",
275 | "| 56| Rachael| Williams| 1002| 103585|\n",
276 | "| 57| Patricia| Harmon| 1005| 147417|\n",
277 | "| 58| Edward| Sharp| 1005| 41077|\n",
278 | "| 59| Kevin| Robinson| 1005| 100924|\n",
279 | "| 60| Charles| Pearson| 1004| 173317|\n",
280 | "| 61| Ryan| Brown| 1003| 120000|\n",
281 | "| 62| Dale| Hayes| 1005| 97662|\n",
282 | "| 63| Richard| Sanford| 1001| 136083|\n",
283 | "| 64| Danielle| Williams| 1006| 120000|\n",
284 | "| 65| Deborah| Martin| 1004| 67389|\n",
285 | "| 66| Dustin| Bush| 1004| 47567|\n",
286 | "| 67| Tyler| Green| 1002| 111085|\n",
287 | "| 68| Antonio|Carpenter| 1002| 83684|\n",
288 | "| 69| Ernest| Peterson| 1005| 115993|\n",
289 | "| 70| Karen|Fernandez| 1003| 101238|\n",
290 | "| 71| Kristine| Casey| 1003| 67651|\n",
291 | "| 72| Christine| Frye| 1004| 137244|\n",
292 | "| 73| William| Preston| 1003| 155225|\n",
293 | "| 74| Richard| Cole| 1003| 180361|\n",
294 | "| 75| Julia| Ramos| 1006| 105000|\n",
295 | "+---+----------+---------+-------------+-----------+\n",
296 | "\n"
297 | ]
298 | }
299 | ],
300 | "source": [
301 | "# Now we will use group by function and get max salary for each employee \n",
302 | "employeedf.groupBy(\"id\",\"first_name\",\"last_name\",\"department_id\").max(\"salary\").orderBy(\"id\").show(n=100)\n",
303 | "# We can also store result into dataframe\n",
304 | "finaldf = employeedf.groupBy(\"id\",\"first_name\",\"last_name\",\"department_id\").max(\"salary\").orderBy(\"id\")"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 34,
310 | "id": "bd049fee-18f9-48b0-a935-885614e744d3",
311 | "metadata": {},
312 | "outputs": [
313 | {
314 | "data": {
315 | "text/plain": [
316 | "75"
317 | ]
318 | },
319 | "execution_count": 34,
320 | "metadata": {},
321 | "output_type": "execute_result"
322 | }
323 | ],
324 | "source": [
325 | "# Final result into final dataframe\n",
326 | "finaldf.count()"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": 35,
332 | "id": "c28f990b-7e88-4c88-bd36-ca17a83544c1",
333 | "metadata": {},
334 | "outputs": [],
335 | "source": [
336 | "# Now we are solving Same problem using Spark SQL \n",
337 | "# Creating Temp Table or HIVE table\n",
338 | "employeedf.createOrReplaceTempView(\"tmpEmployee\")"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 36,
344 | "id": "8a48a300-9f44-4321-a138-942e6f1daf2c",
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "name": "stdout",
349 | "output_type": "stream",
350 | "text": [
351 | "+---+----------+---------+------+-------------+\n",
352 | "| id|first_name|last_name|salary|department_id|\n",
353 | "+---+----------+---------+------+-------------+\n",
354 | "| 1| Todd| Wilson|110000| 1006|\n",
355 | "| 1| Todd| Wilson|106119| 1006|\n",
356 | "| 2| Justin| Simon|128922| 1005|\n",
357 | "| 2| Justin| Simon|130000| 1005|\n",
358 | "| 3| Kelly| Rosario| 42689| 1002|\n",
359 | "| 4| Patricia| Powell|162825| 1004|\n",
360 | "| 4| Patricia| Powell|170000| 1004|\n",
361 | "| 5| Sherry| Golden| 44101| 1002|\n",
362 | "| 6| Natasha| Swanson| 79632| 1005|\n",
363 | "| 6| Natasha| Swanson| 90000| 1005|\n",
364 | "| 7| Diane| Gordon| 74591| 1002|\n",
365 | "| 8| Mercedes|Rodriguez| 61048| 1005|\n",
366 | "| 9| Christy| Mitchell|137236| 1001|\n",
367 | "| 9| Christy| Mitchell|140000| 1001|\n",
368 | "| 9| Christy| Mitchell|150000| 1001|\n",
369 | "| 10| Sean| Crawford|182065| 1006|\n",
370 | "| 10| Sean| Crawford|190000| 1006|\n",
371 | "| 11| Kevin| Townsend|166861| 1002|\n",
372 | "| 12| Joshua| Johnson|123082| 1004|\n",
373 | "| 13| Julie| Sanchez|185663| 1001|\n",
374 | "+---+----------+---------+------+-------------+\n",
375 | "only showing top 20 rows\n",
376 | "\n"
377 | ]
378 | }
379 | ],
380 | "source": [
381 | "# Now we have SQL Table and we can write SQL Query on top of that \n",
382 | "# For example by Select on table \n",
383 | "sqlContext.sql(\"SELECT * FROM tmpEmployee\").show()"
384 | ]
385 | },
386 | {
387 | "cell_type": "code",
388 | "execution_count": 38,
389 | "id": "33554293-3ecb-4c46-8991-be98b4c3ea24",
390 | "metadata": {},
391 | "outputs": [
392 | {
393 | "name": "stdout",
394 | "output_type": "stream",
395 | "text": [
396 | "+---+----------+---------+-----------+-------------+\n",
397 | "| id|first_name|last_name|LatesSalary|department_id|\n",
398 | "+---+----------+---------+-----------+-------------+\n",
399 | "| 1| Todd| Wilson| 110000| 1006|\n",
400 | "| 2| Justin| Simon| 130000| 1005|\n",
401 | "| 3| Kelly| Rosario| 42689| 1002|\n",
402 | "| 4| Patricia| Powell| 170000| 1004|\n",
403 | "| 5| Sherry| Golden| 44101| 1002|\n",
404 | "| 6| Natasha| Swanson| 90000| 1005|\n",
405 | "| 7| Diane| Gordon| 74591| 1002|\n",
406 | "| 8| Mercedes|Rodriguez| 61048| 1005|\n",
407 | "| 9| Christy| Mitchell| 150000| 1001|\n",
408 | "| 10| Sean| Crawford| 190000| 1006|\n",
409 | "| 11| Kevin| Townsend| 166861| 1002|\n",
410 | "| 12| Joshua| Johnson| 123082| 1004|\n",
411 | "| 13| Julie| Sanchez| 210000| 1001|\n",
412 | "| 14| John| Coleman| 152434| 1001|\n",
413 | "| 15| Anthony| Valdez| 96898| 1001|\n",
414 | "| 16| Briana| Rivas| 151668| 1005|\n",
415 | "| 17| Jason| Burnett| 42525| 1006|\n",
416 | "| 18| Jeffrey| Harris| 20000| 1002|\n",
417 | "| 19| Michael| Ramsey| 63159| 1003|\n",
418 | "| 20| Cody| Gonzalez| 112809| 1004|\n",
419 | "| 21| Stephen| Berry| 123617| 1002|\n",
420 | "| 22| Brittany| Scott| 162537| 1002|\n",
421 | "| 23| Angela| Williams| 100875| 1004|\n",
422 | "| 24| William| Flores| 142674| 1003|\n",
423 | "| 25| Pamela| Matthews| 57944| 1005|\n",
424 | "| 26| Allison| Johnson| 128782| 1001|\n",
425 | "| 27| Anthony| Ball| 34386| 1003|\n",
426 | "| 28| Alexis| Beck| 12260| 1005|\n",
427 | "| 29| Jason| Olsen| 51937| 1006|\n",
428 | "| 30| Stephen| Smith| 194791| 1001|\n",
429 | "| 31| Kimberly| Brooks| 95327| 1003|\n",
430 | "| 32| Eric|Zimmerman| 83093| 1006|\n",
431 | "| 33| Peter| Holt| 69945| 1002|\n",
432 | "| 34| Justin| Dunn| 67992| 1003|\n",
433 | "| 35| John| Ball| 47795| 1004|\n",
434 | "| 36| Jesus| Ward| 36078| 1005|\n",
435 | "| 37| Philip|Gillespie| 36424| 1006|\n",
436 | "| 38| Nicole| Lewis| 114079| 1001|\n",
437 | "| 39| Linda| Clark| 186781| 1002|\n",
438 | "| 40| Colleen| Carrillo| 147723| 1004|\n",
439 | "| 41| John| George| 21642| 1001|\n",
440 | "| 42| Traci| Williams| 180000| 1003|\n",
441 | "| 43| Joseph| Rogers| 22800| 1005|\n",
442 | "| 44| Trevor| Carter| 38670| 1001|\n",
443 | "| 45| Kevin| Duncan| 45210| 1003|\n",
444 | "| 46| Joshua| Ewing| 73088| 1003|\n",
445 | "| 47| Kimberly| Dean| 71416| 1003|\n",
446 | "| 48| Robert| Lynch| 117960| 1004|\n",
447 | "| 49| Amber| Harding| 77764| 1002|\n",
448 | "| 50| Victoria| Wilson| 176620| 1002|\n",
449 | "| 51| Theresa| Everett| 31404| 1002|\n",
450 | "| 52| Kara| Smith| 192838| 1004|\n",
451 | "| 53| Teresa| Cohen| 98860| 1001|\n",
452 | "| 54| Wesley| Tucker| 90221| 1005|\n",
453 | "| 55| Michael| Morris| 106799| 1005|\n",
454 | "| 56| Rachael| Williams| 103585| 1002|\n",
455 | "| 57| Patricia| Harmon| 147417| 1005|\n",
456 | "| 58| Edward| Sharp| 41077| 1005|\n",
457 | "| 59| Kevin| Robinson| 100924| 1005|\n",
458 | "| 60| Charles| Pearson| 173317| 1004|\n",
459 | "| 61| Ryan| Brown| 120000| 1003|\n",
460 | "| 62| Dale| Hayes| 97662| 1005|\n",
461 | "| 63| Richard| Sanford| 136083| 1001|\n",
462 | "| 64| Danielle| Williams| 120000| 1006|\n",
463 | "| 65| Deborah| Martin| 67389| 1004|\n",
464 | "| 66| Dustin| Bush| 47567| 1004|\n",
465 | "| 67| Tyler| Green| 111085| 1002|\n",
466 | "| 68| Antonio|Carpenter| 83684| 1002|\n",
467 | "| 69| Ernest| Peterson| 115993| 1005|\n",
468 | "| 70| Karen|Fernandez| 101238| 1003|\n",
469 | "| 71| Kristine| Casey| 67651| 1003|\n",
470 | "| 72| Christine| Frye| 137244| 1004|\n",
471 | "| 73| William| Preston| 155225| 1003|\n",
472 | "| 74| Richard| Cole| 180361| 1003|\n",
473 | "| 75| Julia| Ramos| 105000| 1006|\n",
474 | "+---+----------+---------+-----------+-------------+\n",
475 | "\n"
476 | ]
477 | }
478 | ],
479 | "source": [
480 | "# Now we will write query to get max salary for each employee \n",
481 | "# so we will use SQL Group by and SQL Order by functions \n",
482 | "sqlContext.sql(\"SELECT id,first_name,last_name,MAX(salary) AS LatesSalary,department_id \\\n",
483 | " FROM tmpEmployee \\\n",
484 | " GROUP BY id,first_name,last_name,department_id \\\n",
485 | " ORDER BY id\").show(n=100)"
486 | ]
487 | }
488 | ],
489 | "metadata": {
490 | "kernelspec": {
491 | "display_name": "Python 3 (ipykernel)",
492 | "language": "python",
493 | "name": "python3"
494 | },
495 | "language_info": {
496 | "codemirror_mode": {
497 | "name": "ipython",
498 | "version": 3
499 | },
500 | "file_extension": ".py",
501 | "mimetype": "text/x-python",
502 | "name": "python",
503 | "nbconvert_exporter": "python",
504 | "pygments_lexer": "ipython3",
505 | "version": "3.8.13"
506 | }
507 | },
508 | "nbformat": 4,
509 | "nbformat_minor": 5
510 | }
511 |
--------------------------------------------------------------------------------
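A note on the problem1 notebook above: it takes MAX(salary) per employee, which implicitly assumes the highest salary is the latest (employee.csv has no timestamp column). Below is a minimal sketch, under that same assumption, using a window function instead of GROUP BY, so the full row is kept without listing every column in the grouping key:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, row_number
from pyspark.sql.window import Window

spark = SparkSession.builder.appName("problem1_window_sketch").getOrCreate()

# Read the CSV with a header and inferred schema, as in the notebook.
employeedf = spark.read.option("header", "true").option("inferSchema", "true").csv("employee.csv")

# Rank each employee's salary rows from highest to lowest.
w = Window.partitionBy("id").orderBy(col("salary").desc())

# Keep only the top-ranked row per employee, then drop the helper column.
latestdf = (employeedf
    .withColumn("rn", row_number().over(w))
    .where(col("rn") == 1)
    .drop("rn")
    .orderBy("id"))

latestdf.show(n=100)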