├── .cache-main
├── .classpath
├── .project
├── .settings
│   ├── org.eclipse.jdt.core.prefs
│   └── org.eclipse.m2e.core.prefs
├── Datasets
│   ├── Scenerio25.csv
│   ├── scen.json
│   └── scen20.json
├── README.md
├── Scenerio-1.py
├── Scenerio10.py
├── Scenerio11.py
├── Scenerio12.py
├── Scenerio13.py
├── Scenerio14.py
├── Scenerio15.py
├── Scenerio16.py
├── Scenerio17.py
├── Scenerio18.py
├── Scenerio19.py
├── Scenerio2.py
├── Scenerio20.py
├── Scenerio21.py
├── Scenerio22.py
├── Scenerio23.py
├── Scenerio24.py
├── Scenerio25.py
├── Scenerio26.py
├── Scenerio27.py
├── Scenerio28.py
├── Scenerio29.py
├── Scenerio3.py
├── Scenerio30.ipynb
├── Scenerio31.ipynb
├── Scenerio32.ipynb
├── Scenerio33.ipynb
├── Scenerio34.ipynb
├── Scenerio35.ipynb
├── Scenerio36.ipynb
├── Scenerio4.py
├── Scenerio5.py
├── Scenerio6.py
├── Scenerio7.py
├── Scenerio8.py
├── Scenerio9.py
├── pom.xml
├── src
│   └── pack
│       ├── Scenerio1.scala
│       ├── Scenerio10.scala
│       ├── Scenerio11.scala
│       ├── Scenerio12.scala
│       ├── Scenerio13.scala
│       ├── Scenerio14.scala
│       ├── Scenerio15.scala
│       ├── Scenerio16.scala
│       ├── Scenerio17.scala
│       ├── Scenerio18.scala
│       ├── Scenerio19.scala
│       ├── Scenerio2.scala
│       ├── Scenerio20.scala
│       ├── Scenerio21.scala
│       ├── Scenerio22.scala
│       ├── Scenerio23.scala
│       ├── Scenerio24.scala
│       ├── Scenerio25.scala
│       ├── Scenerio26.scala
│       ├── Scenerio27.scala
│       ├── Scenerio28.scala
│       ├── Scenerio29.scala
│       ├── Scenerio3.scala
│       ├── Scenerio30.scala
│       ├── Scenerio31.scala
│       ├── Scenerio32 Scala.scala
│       ├── Scenerio33.scala
│       ├── Scenerio35.scala
│       ├── Scenerio36.scala
│       ├── Scenerio4.scala
│       ├── Scenerio5.scala
│       ├── Scenerio6.scala
│       ├── Scenerio7.scala
│       ├── Scenerio8.scala
│       └── Scenerio9.scala
└── target
    └── classes
        └── pack
            ├── Scenerio1$$typecreator5$1.class
            ├── Scenerio1$.class
            └── Scenerio1.class
/.cache-main:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/.cache-main
--------------------------------------------------------------------------------
/.classpath:
--------------------------------------------------------------------------------
1 |
2 |
3 |
4 |
5 |
6 |
7 |
8 |
9 |
10 |
11 |
12 |
13 |
14 |
15 |
16 |
17 |
18 |
19 |
20 |
21 |
22 |
--------------------------------------------------------------------------------
/.project:
--------------------------------------------------------------------------------
1 | <?xml version="1.0" encoding="UTF-8"?>
2 | <projectDescription>
3 |   <name>InterviewScenerios</name>
4 |   <comment></comment>
5 |   <projects>
6 |   </projects>
7 |   <buildSpec>
8 |     <buildCommand>
9 |       <name>org.scala-ide.sdt.core.scalabuilder</name>
10 |       <arguments>
11 |       </arguments>
12 |     </buildCommand>
13 |     <buildCommand>
14 |       <name>org.eclipse.m2e.core.maven2Builder</name>
15 |       <arguments>
16 |       </arguments>
17 |     </buildCommand>
18 |   </buildSpec>
19 |   <natures>
20 |     <nature>org.eclipse.m2e.core.maven2Nature</nature>
21 |     <nature>org.scala-ide.sdt.core.scalanature</nature>
22 |     <nature>org.eclipse.jdt.core.javanature</nature>
23 |   </natures>
24 | </projectDescription>
25 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.jdt.core.prefs:
--------------------------------------------------------------------------------
1 | eclipse.preferences.version=1
2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled
3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8
4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve
5 | org.eclipse.jdt.core.compiler.compliance=1.8
6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate
7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate
8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate
9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error
10 | org.eclipse.jdt.core.compiler.problem.enumIdentifier=error
11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning
12 | org.eclipse.jdt.core.compiler.source=1.8
13 |
--------------------------------------------------------------------------------
/.settings/org.eclipse.m2e.core.prefs:
--------------------------------------------------------------------------------
1 | activeProfiles=
2 | eclipse.preferences.version=1
3 | resolveWorkspaceProjects=true
4 | version=1
5 |
--------------------------------------------------------------------------------
/Datasets/Scenerio25.csv:
--------------------------------------------------------------------------------
1 | emp_no,emp_name,dep
2 | 101,Murugan,HealthCare
3 | Invalid Entry,Description: Bad Record Entry
4 | 102,Kannan,Finance
5 | 103,Mani,IT
6 | Connection lost,Description: Poor Connection
7 | 104,Pavan,HR
8 | Bad Record,Description:Corrupt Record
--------------------------------------------------------------------------------
/Datasets/scen.json:
--------------------------------------------------------------------------------
1 | {
2 | "code": 1234,
3 | "commentCount": 5,
4 | "createdAt": "2023-05-30T10:30:00",
5 | "description": "Example description",
6 | "feedsComment": "Example comment",
7 | "id": 1,
8 | "imagePaths": "/path/to/images",
9 | "images": "image1.jpg,image2.jpg,image3.jpg",
10 | "isdeleted": false,
11 | "lat": 123456789,
12 | "likeDislike": {
13 | "dislikes": 10,
14 | "likes": 20,
15 | "userAction": 1
16 | },
17 | "lng": 987654321,
18 | "location": "Example location",
19 | "mediatype": 1,
20 | "msg": "Example message",
21 | "multiMedia": [
22 | {
23 | "createAt": "2023-05-30T12:00:00",
24 | "description": "Media description",
25 | "id": 1001,
26 | "likeCount": 50,
27 | "mediatype": 1,
28 | "name": "Media name",
29 | "place": "Media place",
30 | "url": "https://example.com/media1"
31 | },
32 | {
33 | "createAt": "2023-05-30T13:30:00",
34 | "description": "Another media description",
35 | "id": 1002,
36 | "likeCount": 30,
37 | "mediatype": 2,
38 | "name": "Another media name",
39 | "place": "Another media place",
40 | "url": "https://example.com/media2"
41 | }
42 | ],
43 | "name": "John Doe",
44 | "profilePicture": "/path/to/profile_picture.jpg",
45 | "title": "Example title",
46 | "userId": 123,
47 | "videoUrl": "https://example.com/video",
48 | "totalFeed": 100
49 | }
50 |
--------------------------------------------------------------------------------
/Datasets/scen20.json:
--------------------------------------------------------------------------------
1 | [
2 | {
3 | "code": 123,
4 | "commentCount": 5,
5 | "createAt": "2023-05-30T10:30:00",
6 | "createdAt": "2023-05-30T10:30:00",
7 | "description": "This is a sample description.",
8 | "dislikes": 2,
9 | "feedsComment": "Sample feeds comment",
10 | "id": 1,
11 | "imagePaths": "path/to/images",
12 | "images": "image1.jpg,image2.jpg",
13 | "isdeleted": false,
14 | "lat": 12,
15 | "likeCount": 10,
16 | "likes": 8,
17 | "lng": 34,
18 | "location": "Sample location",
19 | "mediatype": 1,
20 | "msg": "Sample message",
21 | "name": "John Doe",
22 | "place": "Sample place",
23 | "profilePicture": "path/to/profile_picture.jpg",
24 | "title": "Sample title",
25 | "totalFeed": 100,
26 | "url": "http://sampleurl.com",
27 | "userAction": 1,
28 | "userId": 12345,
29 | "videoUrl": "http://samplevideourl.com"
30 | },
31 | {
32 | "code": 456,
33 | "commentCount": 3,
34 | "createAt": "2023-05-29T15:45:00",
35 | "createdAt": "2023-05-29T15:45:00",
36 | "description": "Another sample description.",
37 | "dislikes": 1,
38 | "feedsComment": "Another sample feeds comment",
39 | "id": 2,
40 | "imagePaths": "path/to/images2",
41 | "images": "image3.jpg,image4.jpg",
42 | "isdeleted": true,
43 | "lat": 56,
44 | "likeCount": 20,
45 | "likes": 18,
46 | "lng": 78,
47 | "location": "Another sample location",
48 | "mediatype": 2,
49 | "msg": "Another sample message",
50 | "name": "Jane Smith",
51 | "place": "Another sample place",
52 | "profilePicture": "path/to/profile_picture2.jpg",
53 | "title": "Another sample title",
54 | "totalFeed": 200,
55 | "url": "http://anotherurl.com",
56 | "userAction": 2,
57 | "userId": 67890,
58 | "videoUrl": "http://samplevideourl2.com"
59 | }
60 | ]
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | ## Spark and SQL Interview Scenerio Questions
2 |
3 | ### Table of Contents
4 |
5 | |No| Scenerios |
6 | |--|--------------------------------------------------------------------------|
7 | |1 |[Scenerio-1](#scenerio-1) |
8 | |2 |[Scenerio-2](#scenerio-2) |
9 | |3 |[Scenerio-3](#scenerio-3) |
10 | |4 |[Scenerio-4](#scenerio-4) |
11 | |5 |[Scenerio-5](#scenerio-5) |
12 | |6 |[Scenerio-6](#scenerio-6) |
13 | |7 |[Scenerio-7](#scenerio-7) |
14 | |8 |[Scenerio-8](#scenerio-8) |
15 | |9 |[Scenerio-9](#scenerio-9) |
16 | |10|[Scenerio-10](#scenerio-10) |
17 | |11|[Scenerio-11](#scenerio-11) |
18 | |12|[Scenerio-12](#scenerio-12) |
19 | |13|[Scenerio-13](#scenerio-13) |
20 | |14|[Scenerio-14](#scenerio-14) |
21 | |15|[Scenerio-15](#scenerio-15) |
22 | |16|[Scenerio-16](#scenerio-16) |
23 | |17|[Scenerio-17](#scenerio-17) |
24 | |18|[Scenerio-18](#scenerio-18) |
25 | |19|[Scenerio-19](#scenerio-19) |
26 | |20|[Scenerio-20](#scenerio-20) |
27 | |21|[Scenerio-21](#scenerio-21) |
28 | |22|[Scenerio-22](#scenerio-22) |
29 | |23|[Scenerio-23](#scenerio-23) |
30 | |24|[Scenerio-24](#scenerio-24) |
31 | |25|[Scenerio-25](#scenerio-25) |
32 | |26|[Scenerio-26](#scenerio-26) |
33 | |27|[Scenerio-27](#scenerio-27) |
34 | |28|[Scenerio-28](#scenerio-28) |
35 | |29|[Scenerio-29](#scenerio-29) |
36 | |30|[Scenerio-30](#scenerio-30) |
37 | |31|[Scenerio-31](#scenerio-31) |
38 | |32|[Scenerio-32](#scenerio-32) |
39 | |33|[Scenerio-33](#scenerio-33) |
40 | |34|[Scenerio-34](#scenerio-34) |
41 | |35|[Scenerio-35](#scenerio-35) |
42 | |36|[Scenerio-36](#scenerio-36) |
43 |
44 | ### Scenerio-1
45 | #### Query to get who are getting equal salary
46 | #### Input :-
47 | ```
48 | +--------+---------+--------+------+-------------------+------+
49 | |workerid|firstname|lastname|salary| joiningdate|depart|
50 | +--------+---------+--------+------+-------------------+------+
51 | | 001| Monika| Arora|100000|2014-02-20 09:00:00| HR|
52 | | 002| Niharika| Verma|300000|2014-06-11 09:00:00| Admin|
53 | | 003| Vishal| Singhal|300000|2014-02-20 09:00:00| HR|
54 | | 004| Amitabh| Singh|500000|2014-02-20 09:00:00| Admin|
55 | | 005| Vivek| Bhati|500000|2014-06-11 09:00:00| Admin|
56 | +--------+---------+--------+------+-------------------+------+
57 | ```
58 | #### Expected Output :-
59 | ```
60 | +--------+---------+--------+------+-------------------+------+
61 | |workerid|firstname|lastname|salary| joiningdate|depart|
62 | +--------+---------+--------+------+-------------------+------+
63 | | 002| Niharika| Verma|300000|2014-06-11 09:00:00| Admin|
64 | | 003| Vishal| Singhal|300000|2014-02-20 09:00:00| HR|
65 | | 004| Amitabh| Singh|500000|2014-02-20 09:00:00| Admin|
66 | | 005| Vivek| Bhati|500000|2014-06-11 09:00:00| Admin|
67 | +--------+---------+--------+------+-------------------+------+
68 | ```
69 | #### Solution :-
70 | Scala-Spark -
71 | PySpark -
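A minimal PySpark sketch of one approach (a self-join on salary, assuming the input is already loaded into a dataframe `df` with the columns shown above):
```
from pyspark.sql.functions import col

# Self-join on salary and keep only rows where a different worker has the same salary
equal_salary_df = (
    df.alias("a")
    .join(
        df.alias("b"),
        (col("a.salary") == col("b.salary")) & (col("a.workerid") != col("b.workerid")),
        "inner",
    )
    .select("a.workerid", "a.firstname", "a.lastname", "a.salary", "a.joiningdate", "a.depart")
    .dropDuplicates()
    .orderBy("workerid")
)
equal_salary_df.show()
```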
72 |
73 | **[⬆ Back to Top](#table-of-contents)**
74 |
75 | ### Scenerio-2
76 | #### (Need the dates when the status gets changed like ordered to dispatched)
77 | #### Input :-
78 | ```
79 | +-------+----------+----------+
80 | |orderid|statusdate| status|
81 | +-------+----------+----------+
82 | | 1| 1-Jan| Ordered|
83 | | 1| 2-Jan|dispatched|
84 | | 1| 3-Jan|dispatched|
85 | | 1| 4-Jan| Shipped|
86 | | 1| 5-Jan| Shipped|
87 | | 1| 6-Jan| Delivered|
88 | | 2| 1-Jan| Ordered|
89 | | 2| 2-Jan|dispatched|
90 | | 2| 3-Jan| shipped|
91 | +-------+----------+----------+
92 | ```
93 | #### Expected Output :-
94 | ```
95 | +-------+----------+----------+
96 | |orderid|statusdate| status|
97 | +-------+----------+----------+
98 | | 1| 2-Jan|dispatched|
99 | | 1| 3-Jan|dispatched|
100 | | 2| 2-Jan|dispatched|
101 | +-------+----------+----------+
102 | ```
103 | #### Solution :-
104 | Scala-Spark -
105 | PySpark -
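A minimal PySpark sketch (assuming the input is in a dataframe `df` and that, as in the expected output, the goal is to keep the rows whose status is `dispatched`):
```
from pyspark.sql.functions import col

# Keep only the rows where the order has moved to the 'dispatched' status
dispatched_df = df.filter(col("status") == "dispatched")
dispatched_df.show()
```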
106 |
107 | **[⬆ Back to Top](#table-of-contents)**
108 |
109 | ### Scenerio-3
110 | #### Input :-
111 | ```
112 | +--------+----------+------+
113 | |sensorid| timestamp|values|
114 | +--------+----------+------+
115 | | 1111|2021-01-15| 10|
116 | | 1111|2021-01-16| 15|
117 | | 1111|2021-01-17| 30|
118 | | 1112|2021-01-15| 10|
119 | | 1112|2021-01-15| 20|
120 | | 1112|2021-01-15| 30|
121 | +--------+----------+------+
122 | ```
123 | #### Expected Output :-
124 | ```
125 | +--------+----------+------+
126 | |sensorid| timestamp|values|
127 | +--------+----------+------+
128 | | 1111|2021-01-15| 5|
129 | | 1111|2021-01-16| 15|
130 | | 1112|2021-01-15| 10|
131 | | 1112|2021-01-15| 10|
132 | +--------+----------+------+
133 | ```
134 | #### Solution :-
135 | Scala-Spark - [Click Here]()
136 | PySpark - [Click Here]()
137 | SQL -
138 | ```
139 | SELECT sensorid,
140 | timestamp,
141 | ( newvalues - values ) AS values
142 | FROM (SELECT *,
143 | Lead(values, 1, 0)
144 | OVER(
145 | partition BY sensorid
146 | ORDER BY values) AS newvalues
147 | FROM timetab)
148 | WHERE newvalues != 0
149 | ```
150 | Pandas -
151 | ```
152 | import pandas as pd
153 |
154 | data = [
155 | (1111, "2021-01-15", 10),
156 | (1111, "2021-01-16", 15),
157 | (1111, "2021-01-17", 30),
158 | (1112, "2021-01-15", 10),
159 | (1112, "2021-01-15", 20),
160 | (1112, "2021-01-15", 30),
161 | ]
162 |
163 | df = pd.DataFrame(data, columns=["sensorid", "timestamp", "values"])
164 | print(df)
165 |
166 | df["newvalues"] = df.groupby("sensorid")["values"].shift(-1)
167 | print(df)
168 |
169 | df = df.dropna(subset=["newvalues"])
170 | print(df)
171 |
172 | df["values"] = df["newvalues"] - df["values"]
173 | print(df)
174 |
175 | df = df.drop(columns=["newvalues"])
176 | print(df)
177 | ```
178 |
179 | **[⬆ Back to Top](#table-of-contents)**
180 |
181 | ### Scenerio-4
182 | #### (Write a query to list the unique customer names in the custtab table, along with the number of addresses associated with each customer.)
183 | #### Input :-
184 | ```
185 | +------+-----------+-------+
186 | |custid| custname|address|
187 | +------+-----------+-------+
188 | | 1| Mark Ray| AB|
189 | | 2|Peter Smith| CD|
190 | | 1| Mark Ray| EF|
191 | | 2|Peter Smith| GH|
192 | | 2|Peter Smith| CD|
193 | | 3| Kate| IJ|
194 | +------+-----------+-------+
195 | ```
196 | #### Expected Output :-
197 | ```
198 | +------+-----------+--------+
199 | |custid| custname| address|
200 | +------+-----------+--------+
201 | | 1| Mark Ray|[EF, AB]|
202 | | 2|Peter Smith|[CD, GH]|
203 | | 3| Kate| [IJ]|
204 | +------+-----------+--------+
205 | ```
206 | #### Solution :-
207 | Scala-Spark - [Click Here]()
208 | PySpark - [Click Here]()
209 | SQL -
210 | ```
211 | SELECT custid,
212 | custname,
213 | Collect_set(address) AS address
214 | FROM custtab
215 | GROUP BY custid,
216 | custname
217 | ORDER BY custid
218 | ```
219 | Pandas -
220 | ```
221 | data = [
222 | (1, "Mark Ray", "AB"),
223 | (2, "Peter Smith", "CD"),
224 | (1, "Mark Ray", "EF"),
225 | (2, "Peter Smith", "GH"),
226 | (2, "Peter Smith", "CD"),
227 | (3, "Kate", "IJ"),
228 | ]
229 |
230 | df = pd.DataFrame(data, columns=["custid", "custname", "address"])
231 | print(df)
232 |
233 | finaldf = (
234 | df.groupby(["custid", "custname"])["address"]
235 | .apply(lambda x: list(set(x)))
236 | .reset_index()
237 | )
238 | print(finaldf)
239 | ```
240 |
241 | **[⬆ Back to Top](#table-of-contents)**
242 |
243 | ### Scenerio-5
244 | * Read data from above file into dataframes(df1 and df2).
245 | * Display number of partitions in df1.
246 | * Create a new dataframe df3 from df1, along with a new column salary, and keep it constant 1000
247 | * append df2 and df3, and form df4
248 | * Remove records which have invalid email from df4, emails with @ are considered to be valid.
249 | * Write df4 to a target location, by partitioning on salary.
250 | #### Input :-
251 | ```
252 | +---+----+---+-------------+
253 | | id|name|age| email|
254 | +---+----+---+-------------+
255 | | 1| abc| 31|abc@gmail.com|
256 | | 2| def| 23| defyahoo.com|
257 | | 3| xyz| 26|xyz@gmail.com|
258 | | 4| qwe| 34| qwegmail.com|
259 | | 5| iop| 24|iop@gmail.com|
260 | +---+----+---+-------------+
261 | ```
262 | ```
263 | +---+----+---+---------------+------+
264 | | id|name|age| email|salary|
265 | +---+----+---+---------------+------+
266 | | 11| jkl| 22| abc@gmail.com| 1000|
267 | | 12| vbn| 33| vbn@yahoo.com| 3000|
268 | | 13| wer| 27| wer| 2000|
269 | | 14| zxc| 30| zxc.com| 2000|
270 | | 15| lkj| 29|lkj@outlook.com| 2000|
271 | +---+----+---+---------------+------+
272 | ```
273 | #### Expected Output :-
274 | ```
275 | +---+----+---+---------------+------+
276 | | id|name|age| email|salary|
277 | +---+----+---+---------------+------+
278 | | 1| abc| 31| abc@gmail.com| 1000|
279 | | 3| xyz| 26| xyz@gmail.com| 1000|
280 | | 5| iop| 24| iop@gmail.com| 1000|
281 | | 11| jkl| 22| abc@gmail.com| 1000|
282 | | 12| vbn| 33| vbn@yahoo.com| 3000|
283 | | 15| lkj| 29|lkj@outlook.com| 2000|
284 | +---+----+---+---------------+------+
285 | ```
286 | #### Solution :-
287 | Scala-Spark - [Click Here]()
288 | PySpark - [Click Here]()
289 | Pandas -
290 | ```
291 | import pandas as pd
292 |
293 | # Read data convert into dataframes(df1 and df2).
294 | data1 = [
295 | (1, "abc", 31, "abc@gmail.com"),
296 | (2, "def", 23, "defyahoo.com"),
297 | (3, "xyz", 26, "xyz@gmail.com"),
298 | (4, "qwe", 34, "qwegmail.com"),
299 | (5, "iop", 24, "iop@gmail.com"),
300 | ]
301 |
302 | df1 = pd.DataFrame(data1, columns=["id", "name", "age", "email"])
303 | print(df1)
304 |
305 | data2 = [
306 | (11, "jkl", 22, "abc@gmail.com", 1000),
307 | (12, "vbn", 33, "vbn@yahoo.com", 3000),
308 | (13, "wer", 27, "wer", 2000),
309 | (14, "zxc", 30, "zxc.com", 2000),
310 | (15, "lkj", 29, "lkj@outlook.com", 2000),
311 | ]
312 |
313 | df2 = pd.DataFrame(data2, columns=["id", "name", "age", "email", "salary"])
314 | print(df2)
315 |
316 | # Create a new dataframe df3 from df1, along with a new column salary, and keep it constant 1000
317 | df3 = df1.copy()
318 | df3["salary"] = 1000
319 | print(df3)
320 |
321 | # append df2 and df3, and form df4
322 | df4 = pd.concat([df2, df3])
323 |
324 | df4 = df4.sort_values("id")
325 | print(df4)
326 |
327 | # Remove records which have invalid email from df4, emails with @ are considered to be valid.
328 | finaldf = df4[df4["email"].str.contains("@", na=False)]
329 | print(finaldf)
330 | ```
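A PySpark sketch of the same steps, since the partition count and the partitioned write are Spark-specific (the output path is only an example):
```
from pyspark.sql.functions import col, lit

# Number of partitions in df1
print(df1.rdd.getNumPartitions())

# df3 = df1 plus a constant salary column
df3 = df1.withColumn("salary", lit(1000))

# df4 = df2 appended with df3
df4 = df2.unionByName(df3)

# Keep only valid emails (must contain '@')
df4 = df4.filter(col("email").contains("@"))
df4.show()

# Write partitioned by salary (example output path)
df4.write.mode("overwrite").partitionBy("salary").parquet("/tmp/scenerio5_output")
```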
331 |
332 | **[⬆ Back to Top](#table-of-contents)**
333 |
334 | ### Scenerio-6
335 | #### (For Employee salary greater than 10000 give designation as manager else employee)
336 | #### Input :-
337 | ```
338 | +-----+----+------+
339 | |empid|name|salary|
340 | +-----+----+------+
341 | | 1| a| 10000|
342 | | 2| b| 5000|
343 | | 3| c| 15000|
344 | | 4| d| 25000|
345 | | 5| e| 50000|
346 | | 6| f| 7000|
347 | +-----+----+------+
348 | ```
349 | #### Expected Output :-
350 | ```
351 | +-----+----+------+-----------+
352 | |empid|name|salary|Designation|
353 | +-----+----+------+-----------+
354 | | 1| a| 10000| Employee|
355 | | 2| b| 5000| Employee|
356 | | 3| c| 15000| Manager|
357 | | 4| d| 25000| Manager|
358 | | 5| e| 50000| Manager|
359 | | 6| f| 7000| Employee|
360 | +-----+----+------+-----------+
361 | ```
362 | #### Solution :-
363 | Scala-Spark - [Click Here]()
364 | PySpark - [Click Here]()
365 | SQL -
366 | ```
367 | SELECT *,
368 | CASE
369 | WHEN salary > 10000 THEN
370 | 'Manager'
371 | ELSE 'Employee'
372 | END AS Designation
373 | FROM emptab
374 | ```
375 | Pandas -
376 | ```
377 | import pandas as pd
378 |
379 | data = [
380 | ("1", "a", 10000),
381 | ("2", "b", 5000),
382 | ("3", "c", 15000),
383 | ("4", "d", 25000),
384 | ("5", "e", 50000),
385 | ("6", "f", 7000),
386 | ]
387 |
388 | df = pd.DataFrame(data, columns=["empid", "name", "salary"])
389 | print(df)
390 |
391 |
392 | def emp_desgnination(salary):
393 | return "Manager" if salary > 10000 else "Employee"
394 |
395 |
396 | df["Desgniation"] = df["salary"].apply(emp_desgnination)
397 | print(df)
398 | ```
399 |
400 | **[⬆ Back to Top](#table-of-contents)**
401 |
402 | ### Scenerio-7
403 | #### Input :-
404 | ```
405 | +-------+----------+----+--------+-----+
406 | |sale_id|product_id|year|quantity|price|
407 | +-------+----------+----+--------+-----+
408 | | 1| 100|2010| 25| 5000|
409 | | 2| 100|2011| 16| 5000|
410 | | 3| 100|2012| 8| 5000|
411 | | 4| 200|2010| 10| 9000|
412 | | 5| 200|2011| 15| 9000|
413 | | 6| 200|2012| 20| 7000|
414 | | 7| 300|2010| 20| 7000|
415 | | 8| 300|2011| 18| 7000|
416 | | 9| 300|2012| 20| 7000|
417 | +-------+----------+----+--------+-----+
418 | ```
419 | #### Expected Output :-
420 | ```
421 | +-------+----------+----+--------+-----+
422 | |sale_id|product_id|year|quantity|price|
423 | +-------+----------+----+--------+-----+
424 | | 6| 200|2012| 20| 7000|
425 | | 9| 300|2012| 20| 7000|
426 | | 1| 100|2010| 25| 5000|
427 | | 8| 300|2011| 18| 7000|
428 | +-------+----------+----+--------+-----+
429 | ```
430 | #### Solution :-
431 | Scala-Spark - [Click Here]()
432 | PySpark - [Click Here]()
433 | SQL -
434 | ```
435 | SELECT
436 | *
437 | FROM
438 | (
439 | SELECT
440 | *,
441 | DENSE_RANK() OVER (
442 | PARTITION BY year
443 | ORDER BY
444 | quantity DESC
445 | ) AS rank
446 | FROM
447 | salestab
448 | ) AS rankdf
449 | WHERE
450 | rank = 1
451 | ORDER BY
452 | sale_id
453 | ```
454 | Pandas -
455 | ```
456 | import pandas as pd
457 |
458 | data = [
459 | (1, 100, 2010, 25, 5000),
460 | (2, 100, 2011, 16, 5000),
461 | (3, 100, 2012, 8, 5000),
462 | (4, 200, 2010, 10, 9000),
463 | (5, 200, 2011, 15, 9000),
464 | (6, 200, 2012, 20, 7000),
465 | (7, 300, 2010, 20, 7000),
466 | (8, 300, 2011, 18, 7000),
467 | (9, 300, 2012, 20, 7000),
468 | ]
469 |
470 | df = pd.DataFrame(data, columns=["sale_id", "product_id", "year", "quantity", "price"])
471 | print(df)
472 |
473 | df["rank"] = df.groupby("year")["quantity"].rank(method="dense", ascending=False)
474 | print(df)
475 |
476 | df = df[df["rank"] == 1]
477 | print(df)
478 |
479 | df = df.drop("rank", axis=1).sort_values("sale_id")
480 | print(df)
481 | ```
482 |
483 | **[⬆ Back to Top](#table-of-contents)**
484 |
485 | ### Scenerio-8
486 | #### Input :-
487 | ```
488 | +--------+
489 | | teams|
490 | +--------+
491 | | India|
492 | |Pakistan|
493 | |SriLanka|
494 | +--------+
495 | ```
496 | #### Expected Output :-
497 | ```
498 | +--------------------+
499 | | matches|
500 | +--------------------+
501 | | India Vs Pakistan|
502 | | India Vs SriLanka|
503 | |Pakistan Vs SriLanka|
504 | +--------------------+
505 | ```
506 | #### Solution :-
507 | Scala-Spark -
508 | PySpark -
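A minimal PySpark sketch of one approach (a self-join with a `<` condition so each pairing appears only once, assuming the teams are in a dataframe `df`):
```
from pyspark.sql.functions import col, concat_ws

# Pair every team with every other team exactly once
matches_df = (
    df.alias("a")
    .join(df.alias("b"), col("a.teams") < col("b.teams"))
    .select(concat_ws(" Vs ", col("a.teams"), col("b.teams")).alias("matches"))
)
matches_df.show(truncate=False)
```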
509 |
510 | **[⬆ Back to Top](#table-of-contents)**
511 |
512 | ### Scenerio-9
513 | #### (write spark code, list of name of participants who has rank=1 most number of times)
514 | #### Input :-
515 | ```
516 | +----+---------------+
517 | |name| rank|
518 | +----+---------------+
519 | | a| [1, 1, 1, 3]|
520 | | b| [1, 2, 3, 4]|
521 | | c|[1, 1, 1, 1, 4]|
522 | | d| [3]|
523 | +----+---------------+
524 | ```
525 | #### Expected Output :-
526 | ```
527 | c
528 | ```
529 | #### Solution :-
530 | Scala-Spark -
531 | PySpark -
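A minimal PySpark sketch (assuming `df` holds the input with `rank` as an array column):
```
from pyspark.sql.functions import col, explode

# Explode the rank array, keep only rank 1, count per participant,
# and take the participant with the highest count
top_df = (
    df.select("name", explode("rank").alias("r"))
    .filter(col("r") == 1)
    .groupBy("name")
    .count()
    .orderBy(col("count").desc())
    .limit(1)
)
top_df.show()
```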
532 |
533 | **[⬆ Back to Top](#table-of-contents)**
534 |
535 | ### Scenerio-10
536 | #### Input :-
537 | ```
538 | +-----+-------------+-------------+
539 | |empid|commissionamt|monthlastdate|
540 | +-----+-------------+-------------+
541 | | 1| 300| 31-Jan-2021|
542 | | 1| 400| 28-Feb-2021|
543 | | 1| 200| 31-Mar-2021|
544 | | 2| 1000| 31-Oct-2021|
545 | | 2| 900| 31-Dec-2021|
546 | +-----+-------------+-------------+
547 | ```
548 | #### Expected Output :-
549 | ```
550 | +-----+-------------+-------------+
551 | |empid|commissionamt|monthlastdate|
552 | +-----+-------------+-------------+
553 | | 1| 200| 31-Mar-2021|
554 | | 2| 1000| 31-Oct-2021|
555 | +-----+-------------+-------------+
556 | ```
557 | #### Solution :-
558 | Scala-Spark -
559 | PySpark -
560 |
561 | **[⬆ Back to Top](#table-of-contents)**
562 |
563 | ### Scenerio-11
564 | #### (I have a table called Emp_table, it has 3 columns, Emp name, emp ID , salary
565 | in this I want to get salaries that are >10000 as Grade A, 5000-10000 as grade B and < 5000 as
566 | Grade C, write an SQL query)
567 | #### Input :-
568 | ```
569 | +------+---------------+------+
570 | |emp_id| emp_name|salary|
571 | +------+---------------+------+
572 | | 1| Jhon| 4000|
573 | | 2| Tim David| 12000|
574 | | 3|Json Bhrendroff| 7000|
575 | | 4| Jordon| 8000|
576 | | 5| Green| 14000|
577 | | 6| Brewis| 6000|
578 | +------+---------------+------+
579 | ```
580 | #### Expected Output :-
581 | ```
582 | +------+---------------+------+-----+
583 | |emp_id| emp_name|salary|grade|
584 | +------+---------------+------+-----+
585 | | 1| Jhon| 4000| C|
586 | | 2| Tim David| 12000| A|
587 | | 3|Json Bhrendroff| 7000| B|
588 | | 4| Jordon| 8000| B|
589 | | 5| Green| 14000| A|
590 | | 6| Brewis| 6000| B|
591 | +------+---------------+------+-----+
592 | ```
593 | #### Solution :-
594 | Scala-Spark -
595 | PySpark -
596 |
597 | **[⬆ Back to Top](#table-of-contents)**
598 |
599 | ### Scenerio-12
600 | #### Input :-
601 | ```
602 | +--------------------+----------+
603 | | email| mobile|
604 | +--------------------+----------+
605 | |Renuka1992@gmail.com|9856765434|
606 | |anbu.arasu@gmail.com|9844567788|
607 | +--------------------+----------+
608 | ```
609 | #### Expected Output :-
610 | ```
611 | +--------------------+----------+
612 | | email| mobile|
613 | +--------------------+----------+
614 | |R**********92@gma...|98*****434|
615 | |a**********su@gma...|98*****788|
616 | +--------------------+----------+
617 | ```
618 | #### Solution :-
619 | Scala-Spark -
620 | PySpark -
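A minimal PySpark sketch (the exact masking rule is an assumption: keep the first character and the last two characters of the email's local part, and the first two and last three digits of the mobile number, which is assumed to be stored as a string):
```
from pyspark.sql.functions import col, concat, expr, lit, split, substring

masked_df = df.select(
    # keep the first and last two characters of the local part, mask the middle,
    # then re-attach the domain (masking rule assumed from the expected output)
    concat(
        substring(col("email"), 1, 1),
        lit("**********"),
        expr("substring(split(email, '@')[0], -2, 2)"),
        lit("@"),
        split(col("email"), "@").getItem(1),
    ).alias("email"),
    # keep the first two and last three digits of the mobile number
    concat(
        substring(col("mobile"), 1, 2),
        lit("*****"),
        expr("substring(mobile, -3, 3)"),
    ).alias("mobile"),
)
masked_df.show()
```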
621 |
622 | **[⬆ Back to Top](#table-of-contents)**
623 |
624 | ## Scenerio-13
625 | #### (We have employee id,employee name, department. Need count of every department employees.)
626 | #### Input :-
627 | ```
628 | +------+--------+-----------+
629 | |emp_id|emp_name| dept|
630 | +------+--------+-----------+
631 | | 1| Jhon|Development|
632 | | 2| Tim|Development|
633 | | 3| David| Testing|
634 | | 4| Sam| Testing|
635 | | 5| Green| Testing|
636 | | 6| Miller| Production|
637 | | 7| Brevis| Production|
638 | | 8| Warner| Production|
639 | | 9| Salt| Production|
640 | +------+--------+-----------+
641 | ```
642 | #### Expected Output :-
643 | ```
644 | +-----------+-----+
645 | | dept|total|
646 | +-----------+-----+
647 | |Development| 2|
648 | | Testing| 3|
649 | | Production| 4|
650 | +-----------+-----+
651 | ```
652 | #### Solution :-
653 | Scala-Spark -
654 | PySpark -
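A minimal PySpark sketch (assuming the input is in a dataframe `df`):
```
from pyspark.sql.functions import count

# Count the employees in every department
dept_count_df = df.groupBy("dept").agg(count("emp_id").alias("total"))
dept_count_df.show()
```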
655 |
656 | **[⬆ Back to Top](#table-of-contents)**
657 |
658 | ## Scenerio-14
659 | #### (We need total marks)
660 | #### Input :-
661 | ```
662 | +------+------+------+-------+-----+-------+------+
663 | |rollno| name|telugu|english|maths|science|social|
664 | +------+------+------+-------+-----+-------+------+
665 | |203040|rajesh| 10| 20| 30| 40| 50|
666 | +------+------+------+-------+-----+-------+------+
667 | ```
668 | #### Expected Output :-
669 | ```
670 | +------+------+------+-------+-----+-------+------+-----+
671 | |rollno| name|telugu|english|maths|science|social|total|
672 | +------+------+------+-------+-----+-------+------+-----+
673 | |203040|rajesh| 10| 20| 30| 40| 50| 150|
674 | +------+------+------+-------+-----+-------+------+-----+
675 | ```
676 | #### Solution :-
677 | Scala-Spark - [Click Here]()
678 | PySpark - [Click Here]()
679 | SQL -
680 | ```
681 | select
682 | *,
683 | (
684 | telugu + english + maths + science + social
685 | ) as total
686 | from
687 | markstab
688 |
689 | ```
690 |
691 | **[⬆ Back to Top](#table-of-contents)**
692 |
693 | ## Scenerio-15
694 | #### (Extend and Append list in python and scala)
695 | #### Solution :-
696 | Scala-Spark -
697 | PySpark -
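On the Python side, the difference between the two list methods is:
```
# append adds its argument as a single element; extend adds each element of an iterable
a = [1, 2, 3]
a.append([4, 5])      # -> [1, 2, 3, [4, 5]]
print(a)

b = [1, 2, 3]
b.extend([4, 5])      # -> [1, 2, 3, 4, 5]
print(b)
```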
698 |
699 | **[⬆ Back to Top](#table-of-contents)**
700 |
701 | ## Scenerio-16
702 | #### (Remove duplicates)
703 | #### Input :-
704 | ```
705 | +---+----+-----------+------+
706 | | id|name| dept|salary|
707 | +---+----+-----------+------+
708 | | 1|Jhon| Testing| 5000|
709 | | 2| Tim|Development| 6000|
710 | | 3|Jhon|Development| 5000|
711 | | 4| Sky| Prodcution| 8000|
712 | +---+----+-----------+------+
713 | ```
714 | #### Expected Output :-
715 | ```
716 | +---+----+-----------+------+
717 | | id|name| dept|salary|
718 | +---+----+-----------+------+
719 | | 1|Jhon| Testing| 5000|
720 | | 2| Tim|Development| 6000|
721 | | 4| Sky| Prodcution| 8000|
722 | +---+----+-----------+------+
723 | ```
724 | #### Solution :-
725 | Scala-Spark -
726 | PySpark -
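A minimal PySpark sketch (assuming, as the expected output suggests, that a duplicate is defined by the (name, salary) pair):
```
# Drop rows that repeat the same name and salary, keeping one occurrence
# (which occurrence is kept is not guaranteed by Spark)
deduped_df = df.dropDuplicates(["name", "salary"]).orderBy("id")
deduped_df.show()
```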
727 |
728 | **[⬆ Back to Top](#table-of-contents)**
729 |
730 | ## Scenerio-17
731 | #### (df1 contains the columns Employeeid, Name, Age, State, Country and df2 contains the columns Employeeid, Name, Age, Address. How do you merge df1 and df2 to get the following output: Employeeid, Name, Age, State, Country, Address?)
732 | #### Input :-
733 | ```
734 | +------+-----+---+------+-------+
735 | |emp_id| name|age| state|country|
736 | +------+-----+---+------+-------+
737 | | 1| Tim| 24|Kerala| India|
738 | | 2|Asman| 26|Kerala| India|
739 | +------+-----+---+------+-------+
740 | ```
741 | ```
742 | +------+-----+---+-------+
743 | |emp_id| name|age|address|
744 | +------+-----+---+-------+
745 | | 1| Tim| 24|Comcity|
746 | | 2|Asman| 26|bimcity|
747 | +------+-----+---+-------+
748 | ```
749 | #### Expected Output :-
750 | ```
751 | +------+-----+---+------+-------+-------+
752 | |emp_id| name|age| state|country|address|
753 | +------+-----+---+------+-------+-------+
754 | | 1| Tim| 24|Kerala| India|Comcity|
755 | | 2|Asman| 26|Kerala| India|bimcity|
756 | +------+-----+---+------+-------+-------+
757 | ```
758 | #### Solution :-
759 | Scala-Spark -
760 | PySpark -
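A minimal PySpark sketch (joining the two dataframes on their common key columns):
```
# Join on the common keys and pull the address from df2
merged_df = df1.join(df2, ["emp_id", "name", "age"], "inner")
merged_df.show()
```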
761 |
762 | **[⬆ Back to Top](#table-of-contents)**
763 |
764 | ## Scenerio-18
765 | #### Input :-
766 | ```
767 | +------------------+
768 | | word|
769 | +------------------+
770 | |The Social Dilemma|
771 | +------------------+
772 | ```
773 |
774 | #### Expected Output :-
775 | ```
776 | +------------------+
777 | | reverse word|
778 | +------------------+
779 | |ehT laicoS ammeliD|
780 | +------------------+
781 | ```
782 | #### Solution :-
783 | Scala-Spark -
784 | PySpark -
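A minimal PySpark sketch (assuming Spark 2.4+ for the higher-order `transform` function and a dataframe `df` with the `word` column):
```
from pyspark.sql.functions import expr

# Split the sentence into words, reverse the characters of each word,
# then stitch the words back together in the original order
reversed_df = df.select(
    expr("array_join(transform(split(word, ' '), w -> reverse(w)), ' ')").alias("reverse word")
)
reversed_df.show(truncate=False)
```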
785 |
786 | **[⬆ Back to Top](#table-of-contents)**
787 |
788 | ## Scenerio-19
789 | #### (Flatten the complex dataframe below)
790 | #### Input :-
791 | ```
792 | root
793 | |-- code: long (nullable = true)
794 | |-- commentCount: long (nullable = true)
795 | |-- createdAt: string (nullable = true)
796 | |-- description: string (nullable = true)
797 | |-- feedsComment: string (nullable = true)
798 | |-- id: long (nullable = true)
799 | |-- imagePaths: string (nullable = true)
800 | |-- images: string (nullable = true)
801 | |-- isdeleted: boolean (nullable = true)
802 | |-- lat: long (nullable = true)
803 | |-- likeDislike: struct (nullable = true)
804 | | |-- dislikes: long (nullable = true)
805 | | |-- likes: long (nullable = true)
806 | | |-- userAction: long (nullable = true)
807 | |-- lng: long (nullable = true)
808 | |-- location: string (nullable = true)
809 | |-- mediatype: long (nullable = true)
810 | |-- msg: string (nullable = true)
811 | |-- multiMedia: array (nullable = true)
812 | | |-- element: struct (containsNull = true)
813 | | | |-- createAt: string (nullable = true)
814 | | | |-- description: string (nullable = true)
815 | | | |-- id: long (nullable = true)
816 | | | |-- likeCount: long (nullable = true)
817 | | | |-- mediatype: long (nullable = true)
818 | | | |-- name: string (nullable = true)
819 | | | |-- place: string (nullable = true)
820 | | | |-- url: string (nullable = true)
821 | |-- name: string (nullable = true)
822 | |-- profilePicture: string (nullable = true)
823 | |-- title: string (nullable = true)
824 | |-- totalFeed: long (nullable = true)
825 | |-- userId: long (nullable = true)
826 | |-- videoUrl: string (nullable = true)
827 | ```
828 |
829 | #### Expected Output :-
830 | ```
831 | root
832 | |-- code: long (nullable = true)
833 | |-- commentCount: long (nullable = true)
834 | |-- createdAt: string (nullable = true)
835 | |-- description: string (nullable = true)
836 | |-- feedsComment: string (nullable = true)
837 | |-- id: long (nullable = true)
838 | |-- imagePaths: string (nullable = true)
839 | |-- images: string (nullable = true)
840 | |-- isdeleted: boolean (nullable = true)
841 | |-- lat: long (nullable = true)
842 | |-- lng: long (nullable = true)
843 | |-- location: string (nullable = true)
844 | |-- mediatype: long (nullable = true)
845 | |-- msg: string (nullable = true)
846 | |-- name: string (nullable = true)
847 | |-- profilePicture: string (nullable = true)
848 | |-- title: string (nullable = true)
849 | |-- totalFeed: long (nullable = true)
850 | |-- userId: long (nullable = true)
851 | |-- videoUrl: string (nullable = true)
852 | |-- dislikes: long (nullable = true)
853 | |-- likes: long (nullable = true)
854 | |-- userAction: long (nullable = true)
855 | |-- createAt: string (nullable = true)
856 | |-- likeCount: long (nullable = true)
857 | |-- place: string (nullable = true)
858 | |-- url: string (nullable = true)
859 | ```
860 | #### Solution :-
861 | Dataset -
862 | Scala-Spark -
863 | PySpark -
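A minimal PySpark sketch (assuming the nested JSON is read from `Datasets/scen.json` and an existing SparkSession `spark`; only the nested fields kept in the expected schema are pulled up):
```
from pyspark.sql.functions import col, explode_outer

# Read the nested JSON, then lift the struct fields and the
# exploded array-element fields up to the top level
df = spark.read.option("multiLine", "true").json("Datasets/scen.json")

flat_df = (
    df.withColumn("multiMedia", explode_outer("multiMedia"))
    .select(
        "code", "commentCount", "createdAt", "description", "feedsComment", "id",
        "imagePaths", "images", "isdeleted", "lat", "lng", "location", "mediatype",
        "msg", "name", "profilePicture", "title", "totalFeed", "userId", "videoUrl",
        col("likeDislike.dislikes"), col("likeDislike.likes"), col("likeDislike.userAction"),
        col("multiMedia.createAt"), col("multiMedia.likeCount"),
        col("multiMedia.place"), col("multiMedia.url"),
    )
)
flat_df.printSchema()
```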
864 |
865 | **[⬆ Back to Top](#table-of-contents)**
866 |
867 | ## Scenerio-20
868 | #### (Generate the complex dataframe)
869 | #### Input :-
870 | ```
871 | root
872 | |-- code: long (nullable = true)
873 | |-- commentCount: long (nullable = true)
874 | |-- createAt: string (nullable = true)
875 | |-- createdAt: string (nullable = true)
876 | |-- description: string (nullable = true)
877 | |-- dislikes: long (nullable = true)
878 | |-- feedsComment: string (nullable = true)
879 | |-- id: long (nullable = true)
880 | |-- imagePaths: string (nullable = true)
881 | |-- images: string (nullable = true)
882 | |-- isdeleted: boolean (nullable = true)
883 | |-- lat: long (nullable = true)
884 | |-- likeCount: long (nullable = true)
885 | |-- likes: long (nullable = true)
886 | |-- lng: long (nullable = true)
887 | |-- location: string (nullable = true)
888 | |-- mediatype: long (nullable = true)
889 | |-- msg: string (nullable = true)
890 | |-- name: string (nullable = true)
891 | |-- place: string (nullable = true)
892 | |-- profilePicture: string (nullable = true)
893 | |-- title: string (nullable = true)
894 | |-- totalFeed: long (nullable = true)
895 | |-- url: string (nullable = true)
896 | |-- userAction: long (nullable = true)
897 | |-- userId: long (nullable = true)
898 | |-- videoUrl: string (nullable = true)
899 | ```
900 |
901 | #### Expected Output :-
902 | ```
903 | root
904 | |-- code: long (nullable = true)
905 | |-- commentCount: long (nullable = true)
906 | |-- createdAt: string (nullable = true)
907 | |-- description: string (nullable = true)
908 | |-- feedsComment: string (nullable = true)
909 | |-- id: long (nullable = true)
910 | |-- imagePaths: string (nullable = true)
911 | |-- images: string (nullable = true)
912 | |-- isdeleted: boolean (nullable = true)
913 | |-- lat: long (nullable = true)
914 | |-- likeDislike: struct (nullable = false)
915 | | |-- dislikes: long (nullable = true)
916 | | |-- likes: long (nullable = true)
917 | | |-- userAction: long (nullable = true)
918 | |-- lng: long (nullable = true)
919 | |-- location: string (nullable = true)
920 | |-- mediatype: long (nullable = true)
921 | |-- msg: string (nullable = true)
922 | |-- multiMedia: array (nullable = false)
923 | | |-- element: struct (containsNull = false)
924 | | | |-- createAt: string (nullable = true)
925 | | | |-- description: string (nullable = true)
926 | | | |-- id: long (nullable = true)
927 | | | |-- likeCount: long (nullable = true)
928 | | | |-- mediatype: long (nullable = true)
929 | | | |-- name: string (nullable = true)
930 | | | |-- place: string (nullable = true)
931 | | | |-- url: string (nullable = true)
932 | |-- name: string (nullable = true)
933 | |-- profilePicture: string (nullable = true)
934 | |-- title: string (nullable = true)
935 | |-- userId: long (nullable = true)
936 | |-- videoUrl: string (nullable = true)
937 | |-- totalFeed: long (nullable = true)
938 | ```
939 | #### Solution :-
940 | Dataset -
941 | Scala-Spark -
942 | PySpark -
943 |
944 | **[⬆ Back to Top](#table-of-contents)**
945 |
946 | ## Scenerio-21
947 | #### (Calculate the round-trip distance using Spark or SQL.)
948 | #### Input :-
949 | ```
950 | +----+---+----+
951 | |from| to|dist|
952 | +----+---+----+
953 | | SEA| SF| 300|
954 | | CHI|SEA|2000|
955 | | SF|SEA| 300|
956 | | SEA|CHI|2000|
957 | | SEA|LND| 500|
958 | | LND|SEA| 500|
959 | | LND|CHI|1000|
960 | | CHI|NDL| 180|
961 | +----+---+----+
962 | ```
963 |
964 | #### Expected Output :-
965 | ```
966 | +----+---+--------------+
967 | |from| to|roundtrip_dist|
968 | +----+---+--------------+
969 | | SEA| SF| 600|
970 | | CHI|SEA| 4000|
971 | | LND|SEA| 1000|
972 | +----+---+--------------+
973 |
974 | ```
975 | #### Solution :-
976 | Scala-Spark - [Click Here]()
977 | PySpark - [Click Here]()
978 | SQL -
979 | ```
980 | select
981 | r1.from,
982 | r1.to,
983 | (r1.dist + r2.dist) as round_distance
984 | from
985 | trip r1
986 | join trip r2 on r1.from = r2.to
987 | and r1.to = r2.from
988 | where
989 | r1.from < r1.to
990 | ```
991 |
992 | **[⬆ Back to Top](#table-of-contents)**
993 |
994 | ## Scenerio-22
995 | #### (Cumulative sum)
996 | #### Input :-
997 | ```
998 | +---+------+-----+
999 | |pid| date|price|
1000 | +---+------+-----+
1001 | | 1|26-May| 100|
1002 | | 1|27-May| 200|
1003 | | 1|28-May| 300|
1004 | | 2|29-May| 400|
1005 | | 3|30-May| 500|
1006 | | 3|31-May| 600|
1007 | +---+------+-----+
1008 | ```
1009 |
1010 | #### Expected Output :-
1011 | ```
1012 | +---+------+-----+---------+
1013 | |pid| date|price|new_price|
1014 | +---+------+-----+---------+
1015 | | 1|26-May| 100| 100|
1016 | | 1|27-May| 200| 300|
1017 | | 1|28-May| 300| 600|
1018 | | 2|29-May| 400| 400|
1019 | | 3|30-May| 500| 500|
1020 | | 3|31-May| 600| 1100|
1021 | +---+------+-----+---------+
1022 |
1023 | ```
1024 | #### Solution :-
1025 | Scala-Spark - [Click Here]()
1026 | PySpark - [Click Here]()
1027 | SQL -
1028 | ```
1029 | select
1030 | pid,
1031 | date,
1032 | price,
1033 | sum(price) over (
1034 | partition by pid
1035 | order by
1036 | price
1037 | ) as newprice
1038 | from
1039 | ordertab
1040 |
1041 | ```
1042 |
1043 | **[⬆ Back to Top](#table-of-contents)**
1044 |
1045 | ## Scenerio-23
1046 | #### Input :-
1047 | ```
1048 | +-----------+-----------+
1049 | |customer_id|product_key|
1050 | +-----------+-----------+
1051 | | 1| 5|
1052 | | 2| 6|
1053 | | 3| 5|
1054 | | 3| 6|
1055 | | 1| 6|
1056 | +-----------+-----------+
1057 | ```
1058 | ```
1059 | +-----------+
1060 | |product_key|
1061 | +-----------+
1062 | | 5|
1063 | | 6|
1064 | +-----------+
1065 |
1066 | ```
1067 |
1068 | #### Expected Output :-
1069 | ```
1070 | +-----------+
1071 | |customer_id|
1072 | +-----------+
1073 | | 1|
1074 | | 3|
1075 | +-----------+
1076 |
1077 | ```
1078 | #### Solution :-
1079 | Scala-Spark - [Click Here]()
1080 | PySpark - [Click Here]()
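A minimal PySpark sketch of one approach (assuming the two inputs are loaded as `customer_df` and `products_df`):
```
from pyspark.sql.functions import col, countDistinct

# Total number of distinct products on offer
total_products = products_df.select("product_key").distinct().count()

# Customers who bought every distinct product
result_df = (
    customer_df.groupBy("customer_id")
    .agg(countDistinct("product_key").alias("cnt"))
    .filter(col("cnt") == total_products)
    .select("customer_id")
)
result_df.show()
```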
1081 |
1082 | **[⬆ Back to Top](#table-of-contents)**
1083 |
1084 | ## Scenerio-24
1085 | #### Input :-
1086 | ```
1087 | +------+------------+
1088 | |userid| page|
1089 | +------+------------+
1090 | | 1| home|
1091 | | 1| products|
1092 | | 1| checkout|
1093 | | 1|confirmation|
1094 | | 2| home|
1095 | | 2| products|
1096 | | 2| cart|
1097 | | 2| checkout|
1098 | | 2|confirmation|
1099 | | 2| home|
1100 | | 2| products|
1101 | +------+------------+
1102 |
1103 | ```
1104 |
1105 | #### Expected Output :-
1106 | ```
1107 | +------+--------------------------------------------------------------+
1108 | |userid|pages |
1109 | +------+--------------------------------------------------------------+
1110 | |1 |[home, products, checkout, confirmation] |
1111 | |2 |[home, products, cart, checkout, confirmation, home, products]|
1112 | +------+--------------------------------------------------------------+
1113 |
1114 | ```
1115 | #### Solution :-
1116 | Scala-Spark - [Click Here]()
1117 | PySpark - [Click Here]()
1118 | SQL :-
1119 | ```
1120 | select
1121 | userid,
1122 | collect_list(page) as pages
1123 | from
1124 | testcol
1125 | group by
1126 | userid;
1127 |
1128 | ```
1129 | **[⬆ Back to Top](#table-of-contents)**
1130 |
1131 | ## Scenerio-25
1132 | #### (Consider a file with some bad/corrupt data as shown below. How will you handle those records and load the data into a Spark dataframe?)
1133 | Note - avoid using a filter after reading the file as a DF; try to remove the bad data while reading the file itself.
1134 | #### Input :-
1135 | ```
1136 | emp_no,emp_name,dep
1137 | 101,Murugan,HealthCare
1138 | Invalid Entry,Description: Bad Record Entry
1139 | 102,Kannan,Finance
1140 | 103,Mani,IT
1141 | Connection lost,Description: Poor Connection
1142 | 104,Pavan,HR
1143 | Bad Record,Description:Corrupt Record
1144 | ```
1145 |
1146 | #### Expected Output :-
1147 | ```
1148 | +------+--------+----------+
1149 | |emp_no|emp_name| dep|
1150 | +------+--------+----------+
1151 | | 101| Murugan|HealthCare|
1152 | | 102| Kannan| Finance|
1153 | | 103| Mani| IT|
1154 | | 104| Pavan| HR|
1155 | +------+--------+----------+
1156 |
1157 | ```
1158 | #### Solution :-
1159 | Scala-Spark - [Click Here]()
1160 | PySpark - [Click Here]()
1161 |
1162 | There are three modes available when reading a file in Spark:
1163 |
1164 | * `PERMISSIVE` : This is the default mode. It attempts to parse all the rows in the file, and if it encounters any malformed data or parsing errors, it sets the problematic fields to null and adds a new column called _corrupt_record to store the entire problematic row as a string.
1165 |
1166 | * `DROPMALFORMED` : This mode drops the rows that contain malformed data or cannot be parsed according to the specified schema. It only includes the rows that can be successfully parsed.
1167 |
1168 | * `FAILFAST` : This mode throws an exception and fails immediately if it encounters any malformed data or parsing errors in the file. It does not process any further rows after the first encountered error.
1169 |
1170 | You can specify the desired mode using the `mode` option when reading a file, such as `option("mode", "PERMISSIVE")` or `option("mode", "FAILFAST")`. If the `mode` option is not explicitly set, it defaults to `PERMISSIVE`.
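A minimal read sketch using `DROPMALFORMED` (assuming the file is `Datasets/Scenerio25.csv` and an existing SparkSession `spark`):
```
# Drop the corrupt rows while reading the CSV itself (no filter after the read)
df = (
    spark.read
    .option("header", "true")
    .option("mode", "DROPMALFORMED")
    .schema("emp_no INT, emp_name STRING, dep STRING")
    .csv("Datasets/Scenerio25.csv")
)
df.show()
```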
1171 |
1172 | **[⬆ Back to Top](#table-of-contents)**
1173 |
1174 | ## Scenerio-26
1175 | * Input :-
1176 | ```sh
1177 | +---+----+
1178 | | id|name|
1179 | +---+----+
1180 | | 1| A|
1181 | | 2| B|
1182 | | 3| C|
1183 | | 4| D|
1184 | +---+----+
1185 |
1186 | +---+-----+
1187 | |id1|name1|
1188 | +---+-----+
1189 | | 1| A|
1190 | | 2| B|
1191 | | 4| X|
1192 | | 5| F|
1193 | +---+-----+
1194 | ```
1195 | * Output :-
1196 | ```sh
1197 | +---+-------------+
1198 | | id| comment|
1199 | +---+-------------+
1200 | | 3|new in source|
1201 | | 4| mismatch|
1202 | | 5|new in target|
1203 | +---+-------------+
1204 | ```
1205 | #### Solution :-
1206 | Scala-Spark :- [Click Here]()
1207 | PySpark :- [Click Here]()
1208 | SQL :-
1209 | ```
1210 | select
1211 | id,
1212 | case when name != name1 then 'Mismatch' when name1 is null then 'New in Source' when name is null then 'New in Target' end as comment
1213 | from
1214 | (
1215 | select
1216 | coalesce(id, id1) as id,
1217 | s.name,
1218 | t.name1
1219 | from
1220 | sourcetab s full
1221 | outer join targettab t on s.id = t.id1
1222 | WHERE
1223 | s.name != t.name1
1224 | OR s.name IS NULL
1225 | OR t.name1 IS NULL
1226 | );
1227 |
1228 | ```
1229 |
1230 | **[⬆ Back to Top](#table-of-contents)**
1231 |
1232 | ## Scenerio-27
1233 | * Input :-
1234 | ```sh
1235 | +-----+------+----+
1236 | |empid|salary|year|
1237 | +-----+------+----+
1238 | | 1| 60000|2018|
1239 | | 1| 70000|2019|
1240 | | 1| 80000|2020|
1241 | | 2| 60000|2018|
1242 | | 2| 65000|2019|
1243 | | 2| 65000|2020|
1244 | | 3| 60000|2018|
1245 | | 3| 65000|2019|
1246 | +-----+------+----+
1247 | ```
1248 | * Output :-
1249 | ```sh
1250 | +-----+------+----+-----------+
1251 | |empid|salary|year|incresalary|
1252 | +-----+------+----+-----------+
1253 | | 1| 60000|2018| 0|
1254 | | 1| 70000|2019| 10000|
1255 | | 1| 80000|2020| 10000|
1256 | | 2| 60000|2018| 0|
1257 | | 2| 65000|2019| 5000|
1258 | | 2| 65000|2020| 0|
1259 | | 3| 60000|2018| 0|
1260 | | 3| 65000|2019| 5000|
1261 | +-----+------+----+-----------+
1262 |
1263 | ```
1264 | #### Solution :-
1265 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio27.scala)
1266 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio27.py)
1267 | SQL :-
1268 | ```
1269 | select
1270 | empid,
1271 | salary,
1272 | year,
1273 | coalesce(
1274 | (salary - diff),
1275 | 0
1276 | ) as increment
1277 | from
1278 | (
1279 | select
1280 | *,
1281 | lag(salary, 1) over (
1282 | partition by empid
1283 | order by
1284 | year
1285 | ) as diff
1286 | from
1287 | salarytab
1288 | );
1289 |
1290 | ```
1291 |
1292 | **[⬆ Back to Top](#table-of-contents)**
1293 |
1294 |
1295 | ## Scenerio-28
1296 | * Input :-
1297 | ```sh
1298 | +-----+------+
1299 | |child|parent|
1300 | +-----+------+
1301 | | A| AA|
1302 | | B| BB|
1303 | | C| CC|
1304 | | AA| AAA|
1305 | | BB| BBB|
1306 | | CC| CCC|
1307 | +-----+------+
1308 | ```
1309 | * Output :-
1310 | ```sh
1311 | +-----+------+-----------+
1312 | |child|parent|grandparent|
1313 | +-----+------+-----------+
1314 | | A| AA| AAA|
1315 | | C| CC| CCC|
1316 | | B| BB| BBB|
1317 | +-----+------+-----------+
1318 | ```
1319 | #### Solution :-
1320 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio28.scala)
1321 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio28.py)
1322 |
1323 | **[⬆ Back to Top](#table-of-contents)**
1324 |
1325 |
1326 | ## Scenerio-29
1327 | * Input :-
1328 | ```sh
1329 | +---+
1330 | |col|
1331 | +---+
1332 | | 1|
1333 | | 2|
1334 | | 3|
1335 | +---+
1336 |
1337 | +----+
1338 | |col1|
1339 | +----+
1340 | | 1|
1341 | | 2|
1342 | | 3|
1343 | | 4|
1344 | | 5|
1345 | +----+
1346 | ```
1347 | * Output :-
1348 | ```sh
1349 | +---+
1350 | |col|
1351 | +---+
1352 | | 1|
1353 | | 2|
1354 | | 4|
1355 | | 5|
1356 | +---+
1357 | ```
1358 | #### Solution :-
1359 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio29.scala)
1360 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio29.py)
1361 |
1362 | **[⬆ Back to Top](#table-of-contents)**
1363 |
1364 | ## Scenerio-30
1365 | * Write a SQL query to extract the second-highest salary for each department
1366 | * Input :-
1367 | ```sh
1368 | +------+----+-------+-------+
1369 | |emp_id|name|dept_id| salary|
1370 | +------+----+-------+-------+
1371 | | 1| A| A|1000000|
1372 | | 2| B| A|2500000|
1373 | | 3| C| G| 500000|
1374 | | 4| D| G| 800000|
1375 | | 5| E| W|9000000|
1376 | | 6| F| W|2000000|
1377 | +------+----+-------+-------+
1378 |
1379 | +--------+---------+
1380 | |dept_id1|dept_name|
1381 | +--------+---------+
1382 | | A| AZURE|
1383 | | G| GCP|
1384 | | W| AWS|
1385 | +--------+---------+
1386 | ```
1387 | * Output :-
1388 | ```sh
1389 | +------+----+---------+-------+
1390 | |emp_id|name|dept_name| salary|
1391 | +------+----+---------+-------+
1392 | | 1| A| AZURE|1000000|
1393 | | 6| F| AWS|2000000|
1394 | | 3| C| GCP| 500000|
1395 | +------+----+---------+-------+
1396 | ```
1397 | #### Solution :-
1398 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio30.scala)
1399 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio30.ipynb)
1400 | SQL :-
1401 | ```sh
1402 | WITH jointab AS (
1403 | SELECT df1.emp_id, df1.name, df1.dept_id, df1.salary, df2.dept_name,
1404 | DENSE_RANK() OVER (PARTITION BY df1.dept_id ORDER BY df1.salary DESC) AS row_rank
1405 | FROM df1
1406 | INNER JOIN df2 ON df1.dept_id = df2.dept_id1
1407 | )
1408 | SELECT emp_id,name,dept_name,salary from jointab WHERE row_rank =2;
1409 | ```
1410 | **[⬆ Back to Top](#table-of-contents)**
1411 |
1412 | ## Scenerio-31
1413 | * Input :-
1414 | ```sh
1415 | +----+-----+--------+-----------+
1416 | |col1| col2| col3| col4|
1417 | +----+-----+--------+-----------+
1418 | | m1|m1,m2|m1,m2,m3|m1,m2,m3,m4|
1419 | +----+-----+--------+-----------+
1420 | ```
1421 | * Output :-
1422 | ```sh
1423 | +-----------+
1424 | | col|
1425 | +-----------+
1426 | | m1|
1427 | | m1,m2|
1428 | | m1,m2,m3|
1429 | |m1,m2,m3,m4|
1430 | | |
1431 | +-----------+
1432 | ```
1433 | #### Solution :-
1434 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio31.scala)
1435 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio31.ipynb)
1436 | SQL :-
1437 | ```sh
1438 | select
1439 | explode(
1440 | split(col, '-')
1441 | )
1442 | from
1443 | (
1444 | select
1445 | concat(
1446 | col1, '-', col2, '-', col3, '-', col4
1447 | ) as col
1448 | from
1449 | mtab
1450 | );
1451 |
1452 | ```
1453 | **[⬆ Back to Top](#table-of-contents)**
1454 |
1455 | ## Scenerio-32
1456 | * Input :-
1457 | ```sh
1458 | +-------+-------------------+
1459 | |food_id| food_item|
1460 | +-------+-------------------+
1461 | | 1| Veg Biryani|
1462 | | 2| Veg Fried Rice|
1463 | | 3| Kaju Fried Rice|
1464 | | 4| Chicken Biryani|
1465 | | 5|Chicken Dum Biryani|
1466 | | 6| Prawns Biryani|
1467 | | 7| Fish Birayani|
1468 | +-------+-------------------+
1469 |
1470 | +-------+------+
1471 | |food_id|rating|
1472 | +-------+------+
1473 | | 1| 5|
1474 | | 2| 3|
1475 | | 3| 4|
1476 | | 4| 4|
1477 | | 5| 5|
1478 | | 6| 4|
1479 | | 7| 4|
1480 | +-------+------+
1481 | ```
1482 | * Output :-
1483 | ```sh
1484 | +-------+-------------------+------+---------------+
1485 | |food_id| food_item|rating|stats(out of 5)|
1486 | +-------+-------------------+------+---------------+
1487 | | 1| Veg Biryani| 5| *****|
1488 | | 2| Veg Fried Rice| 3| ***|
1489 | | 3| Kaju Fried Rice| 4| ****|
1490 | | 4| Chicken Biryani| 4| ****|
1491 | | 5|Chicken Dum Biryani| 5| *****|
1492 | | 6| Prawns Biryani| 4| ****|
1493 | | 7| Fish Birayani| 4| ****|
1494 | +-------+-------------------+------+---------------+
1495 | ```
1496 | #### Solution :-
1497 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio32%20Scala.scala)
1498 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio32.ipynb)
1499 | SQL :-
1500 | ```sh
1501 | select
1502 | foodtab.food_id,
1503 | foodtab.food_item,
1504 | ratingtab.rating,
1505 | repeat('*', ratingtab.rating) as stars
1506 | from
1507 | foodtab
1508 | inner join ratingtab on foodtab.food_id = ratingtab.food_id
1509 | order by
1510 | foodtab.food_id;
1511 | ```
1512 | **[⬆ Back to Top](#table-of-contents)**
1513 |
1514 | ## Scenerio-33
1515 | * Write a query to print the maximum number of discounted tours any one family can choose.
1516 | * Input :-
1517 | ```sh
1518 | +--------------------+--------------+-----------+
1519 | | id| name|family_size|
1520 | +--------------------+--------------+-----------+
1521 | |c00dac11bde74750b...| Alex Thomas| 9|
1522 | |eb6f2d3426694667a...| Chris Gray| 2|
1523 | |3f7b5b8e835d4e1c8...| Emily Johnson| 4|
1524 | |9a345b079d9f4d3ca...| Michael Brown| 6|
1525 | |e0a5f57516024de2a...|Jessica Wilson| 3|
1526 | +--------------------+--------------+-----------+
1527 |
1528 | +--------------------+------------+--------+--------+
1529 | | id| name|min_size|max_size|
1530 | +--------------------+------------+--------+--------+
1531 | |023fd23615bd4ff4b...| Bolivia| 2| 4|
1532 | |be247f73de0f4b2d8...|Cook Islands| 4| 8|
1533 | |3e85ab80a6f84ef3b...| Brazil| 4| 7|
1534 | |e571e164152c4f7c8...| Australia| 5| 9|
1535 | |f35a7bb7d44342f7a...| Canada| 3| 5|
1536 | |a1b5a4b5fc5f46f89...| Japan| 10| 12|
1537 | +--------------------+------------+--------+--------+
1538 | ```
1539 | * Output :-
1540 | ```sh
1541 | +-------------+-------------------+
1542 | | name|number_of_countries|
1543 | +-------------+-------------------+
1544 | |Emily Johnson| 4|
1545 | +-------------+-------------------+
1546 | ```
1547 | #### Solution :-
1548 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio33.scala)
1549 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio33.ipynb)
1550 | SQL :-
1551 | ```sh
1552 | select max(number_of_countries) from (select f.name,count(*) as number_of_countries from family f inner join country c on f.family_size between c.min_size and c.max_size group by f.name);
1553 | ```
1554 | **[⬆ Back to Top](#table-of-contents)**
1555 |
1556 | ## Scenerio-34
1557 | * Input :-
1558 | ```sh
1559 | +-----------+------+---+------+
1560 | |customer_id| name|age|gender|
1561 | +-----------+------+---+------+
1562 | | 1| Alice| 25| F|
1563 | | 2| Bob| 40| M|
1564 | | 3| Raj| 46| M|
1565 | | 4| Sekar| 66| M|
1566 | | 5| Jhon| 47| M|
1567 | | 6|Timoty| 28| M|
1568 | | 7| Brad| 90| M|
1569 | | 8| Rita| 34| F|
1570 | +-----------+------+---+------+
1571 | ```
1572 | * Output :-
1573 | ```sh
1574 | +---------+-----+
1575 | |age_group|count|
1576 | +---------+-----+
1577 | | 19-35| 3|
1578 | | 36-50| 3|
1579 | | 51+| 2|
1580 | +---------+-----+
1581 | ```
1582 | #### Solution :-
1583 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio34.scala)
1584 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio34.ipynb)
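A minimal PySpark sketch (assuming the customers are in a dataframe `df`; the bucket boundaries are taken from the expected output):
```
from pyspark.sql.functions import col, when

# Bucket customers into age groups, then count each group
# (ages below 19 would also fall into "51+" with this simple otherwise,
#  which is fine for the sample data)
age_group_df = (
    df.withColumn(
        "age_group",
        when((col("age") >= 19) & (col("age") <= 35), "19-35")
        .when((col("age") >= 36) & (col("age") <= 50), "36-50")
        .otherwise("51+"),
    )
    .groupBy("age_group")
    .count()
    .orderBy("age_group")
)
age_group_df.show()
```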
1585 |
1586 | **[⬆ Back to Top](#table-of-contents)**
1587 |
1588 | ## Scenerio-35
1589 | Question (IBM)
1590 | * Create a new dataframe df1 with the given values
1591 | * Count the null entries in the dataframe
1592 | * Remove the null entries and store them in a new dataframe df2
1593 | * Create a new dataframe df3 with the given values and join the two dataframes df1 & df2
1594 | * Fill the null values with the mean age of all students
1595 | * Filter the students who are 18 years old or older
1596 | #### Solution :-
1597 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio35.scala)
1598 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio35.ipynb)
1599 |
1600 | **[⬆ Back to Top](#table-of-contents)**
1601 |
1602 |
1603 | ## Scenerio-36
1604 | * Input :-
1605 | ```sh
1606 | +----------+----------+
1607 | | sell_date| product|
1608 | +----------+----------+
1609 | |2020-05-30| Headphone|
1610 | |2020-06-01| Pencil|
1611 | |2020-06-02| Mask|
1612 | |2020-05-30|Basketball|
1613 | |2020-06-01| Book|
1614 | |2020-06-02| Mask|
1615 | |2020-05-30| T-Shirt|
1616 | +----------+----------+
1617 | ```
1618 | * Output :-
1619 | ```sh
1620 | +----------+--------------------+---------+
1621 | | sell_date| products|null_sell|
1622 | +----------+--------------------+---------+
1623 | |2020-05-30|[T-Shirt, Basketb...| 3|
1624 | |2020-06-01| [Pencil, Book]| 2|
1625 | |2020-06-02| [Mask]| 1|
1626 | +----------+--------------------+---------+
1627 | ```
1628 | #### Solution :-
1629 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio36.scala)
1630 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio36.ipynb)
1631 |
1632 | SQL :-
1633 | ```sh
1634 | select sell_date,(collect_set(product)) as products,size(collect_set(product)) as num_sell from products group by sell_date;
1635 | ```
1636 | **[⬆ Back to Top](#table-of-contents)**
1637 |
1638 |
1639 |
1640 |
1641 |
1642 |
1643 |
1644 |
1645 |
1646 |
--------------------------------------------------------------------------------
/Scenerio-1.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf,SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-1")
6 | sc = SparkContext(conf=conf)
7 | sc.setLogLevel("ERROR")
8 | spark = SparkSession.builder.getOrCreate()
9 |
10 | data = [("001", "Monika", "Arora", 100000, "2014-02-20 09:00:00", "HR"),("002", "Niharika", "Verma", 300000, "2014-06-11 09:00:00", "Admin"),("003", "Vishal", "Singhal", 300000, "2014-02-20 09:00:00", "HR"),("004", "Amitabh", "Singh", 500000, "2014-02-20 09:00:00", "Admin"),("005", "Vivek", "Bhati", 500000, "2014-06-11 09:00:00", "Admin")]
11 | myschema = ["workerid","firstname","lastname","salary","joiningdate","depart"]
12 | df = spark.createDataFrame(data,schema=myschema)
13 | df.show()
14 | #Through SQL
15 | df.createOrReplaceTempView("worktab")
16 | spark.sql("select a.workerid,a.firstname,a.lastname,a.salary,a.joiningdate,a.depart from worktab a, worktab b where a.salary=b.salary and a.workerid !=b.workerid").show()
17 |
18 | #Through Spark DSL
19 | finaldf = df.alias("a").join(df.alias("b"), (col("a.salary") == col("b.salary")) & (col("a.workerid") != col("b.workerid")), "inner").select(col("a.workerid"), col("a.firstname"), col("a.lastname"), col("a.salary"), col("a.joiningdate"), col("a.depart")).show()
20 |
--------------------------------------------------------------------------------
/Scenerio10.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-10")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [
12 | (1, 300, "31-Jan-2021"),
13 | (1, 400, "28-Feb-2021"),
14 | (1, 200, "31-Mar-2021"),
15 | (2, 1000, "31-Oct-2021"),
16 | (2, 900, "31-Dec-2021")
17 | ]
18 | df = spark.createDataFrame(data, ["empid", "commissionamt", "monthlastdate"])
19 | df.show()
20 |
21 | maxdatedf = df.groupBy(col("empid").alias("empid1")).agg(max("monthlastdate").alias("maxdate"))
22 | maxdatedf.show()
23 |
24 | joindf = df.join(maxdatedf, (df["empid"] == maxdatedf["empid1"]) & (df["monthlastdate"] == maxdatedf["maxdate"]),
25 | "inner").drop("empid1", "maxdate")
26 | joindf.show()
27 |
--------------------------------------------------------------------------------
/Scenerio11.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-11")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [
12 | (1, "Jhon", 4000),
13 | (2, "Tim David", 12000),
14 | (3, "Json Bhrendroff", 7000),
15 | (4, "Jordon", 8000),
16 | (5, "Green", 14000),
17 | (6, "Brewis", 6000)
18 | ]
19 | df = spark.createDataFrame(data, ["emp_id", "emp_name", "salary"])
20 | df.show()
21 |
22 | # Through SQL
23 | df.createOrReplaceTempView("emptab")
24 | spark.sql(
25 | "select *,case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end as grade from emptab ").show()
26 |
27 | # Through DSL
28 | finaldf = df.withColumn("grade", expr(
29 | "case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end")).show()
30 |
--------------------------------------------------------------------------------
/Scenerio12.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio12")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 |
12 | # UDF to mask an email: keep the first character (index 0), insert '*' characters, and keep the substring from index 8 to the end
13 | def mask_email(email):
14 | return (email[0] + "**********" + email[8:])
15 |
16 | # UDF to mask a mobile number: keep the first two digits (indexes 0-1) and the last three digits, and mask the middle
17 | def mask_mobile(mobile):
18 | return (mobile[0:2] + "*****" + mobile[-3:])
19 |
20 |
21 | df = spark.createDataFrame([("Renuka1992@gmail.com", "9856765434"), ("anbu.arasu@gmail.com", "9844567788")], ["email", "mobile"])
22 | df.show()
23 |
24 | maskeddf = df.withColumn("email",udf(mask_email)(df.email)).withColumn("mobile",udf(mask_mobile)(df.mobile))
25 | maskeddf.show()
26 |
--------------------------------------------------------------------------------
/Scenerio13.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql import *
4 | from pyspark.sql.types import *
5 | from pyspark.sql.functions import *
6 | from pyspark.sql.window import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio13")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | data = [(1, "Jhon", "Development"),
14 | (2, "Tim", "Development"),
15 | (3, "David", "Testing"),
16 | (4, "Sam", "Testing"),
17 | (5, "Green", "Testing"),
18 | (6, "Miller", "Production"),
19 | (7, "Brevis", "Production"),
20 | (8, "Warner", "Production"),
21 | (9, "Salt", "Production")]
22 | df = spark.createDataFrame(data, ["emp_id", "emp_name", "dept"])
23 | df.show()
24 |
25 | # Through SQL
26 | df.createOrReplaceTempView("emptab")
27 | spark.sql("SELECT dept, COUNT(*) AS total FROM emptab GROUP BY dept").show()
28 |
29 | # Through DSL
30 | finaldf = df.groupBy(col("dept")).agg(count("*").alias("total")).show()
31 |
--------------------------------------------------------------------------------
/Scenerio14.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql import *
4 | from pyspark.sql.types import *
5 | from pyspark.sql.functions import *
6 | from pyspark.sql.window import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio14")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | data = [
14 | (203040, "rajesh", 10, 20, 30, 40, 50)
15 | ]
16 |
17 | df = spark.createDataFrame(data, ["rollno", "name", "telugu", "english", "maths", "science", "social"])
18 | df.show()
19 |
20 | # Through SQL
21 | df.createOrReplaceTempView("marks")
22 | spark.sql("select *, (telugu+english+maths+science+social) as total from marks").show()
23 |
24 | # Through DSL
25 | finaldf = df.withColumn("total", expr("telugu+english+maths+science+social")).show()
26 |
--------------------------------------------------------------------------------
/Scenerio15.py:
--------------------------------------------------------------------------------
1 | l1 = [2, 3, 4, 5]
2 | l2 = [6, 7, 8, 9]
3 | # append adds l2 as a single nested element, so l1 becomes [2, 3, 4, 5, [6, 7, 8, 9]]
4 | l1.append(l2)
5 | print(l1)
6 |
7 | # extend adds each element of l2 individually to the end of l1
8 | l1.extend(l2)
9 | print(l1)
10 |
--------------------------------------------------------------------------------
/Scenerio16.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql import *
4 | from pyspark.sql.types import *
5 | from pyspark.sql.functions import *
6 | from pyspark.sql.window import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio16")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 | data = [(1, "Jhon", "Testing", 5000),
13 | (2, "Tim", "Development", 6000),
14 | (3, "Jhon", "Development", 5000),
15 | (4, "Sky", "Prodcution", 8000)]
16 | df = spark.createDataFrame(data, ["id", "name", "dept", "salary"])
17 | df.show()
18 |
19 | finaldf = df.dropDuplicates(["name"]).orderBy("id")
20 | finaldf.show()
21 |
--------------------------------------------------------------------------------
/Scenerio17.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql import *
4 | from pyspark.sql.types import *
5 | from pyspark.sql.functions import *
6 | from pyspark.sql.window import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio17")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 | data = [(1, "Tim", 24, "Kerala", "India"),
13 | (2, "Asman", 26, "Kerala", "India")]
14 | df1 = spark.createDataFrame(data, ["emp_id", "name", "age", "state", "country"])
15 | df1.show()
16 |
17 | data2 = [(1, "Tim", 24, "Comcity"),
18 | (2, "Asman", 26, "bimcity")]
19 | df2 = spark.createDataFrame(data2, ["emp_id", "name", "age", "address"])
20 | df2.show()
21 |
22 | findf = df1.join(df2, ["emp_id", "name", "age"], "outer")
23 | findf.show()
24 |
--------------------------------------------------------------------------------
/Scenerio18.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql import *
4 | from pyspark.sql.types import *
5 | from pyspark.sql.functions import *
6 | from pyspark.sql.window import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio18")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | # Create input DataFrame
14 | inputdf = spark.createDataFrame([("The Social Dilemma",)], ["word"])
15 | inputdf.show()
16 |
17 | # Define UDF for reversing words
18 | def reverse_sentence(sentence):
19 | return " ".join([word[::-1] for word in sentence.split(" ")])
20 |
21 | # Register UDF
22 | reverse_udf = udf(reverse_sentence, StringType())
23 |
24 | # Apply UDF to input DataFrame
25 | outputdf = inputdf.withColumn("reverse word", reverse_udf("word")).drop("word")
26 | outputdf.show()
--------------------------------------------------------------------------------
/Scenerio19.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio19")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 |
12 | df = spark.read.format("json").option("multiline", "true").load("dbfs:/FileStore/scen.json")
13 | df.printSchema()
14 | finaldf = df.withColumn("multiMedia", explode(col("multiMedia"))).withColumn("dislikes",
15 | expr("likeDislike.dislikes")).withColumn(
16 | "likes", expr("likeDislike.likes")).withColumn("userAction", expr("likeDislike.userAction")).withColumn("createAt",
17 | expr(
18 | "multiMedia.createAt")).withColumn(
19 | "description", expr("multiMedia.description")).withColumn("id", expr("multiMedia.id")).withColumn("likeCount", expr(
20 | "multiMedia.likeCount")).withColumn("mediatype", expr("multiMedia.mediatype")).withColumn("name", expr(
21 | "multiMedia.name")).withColumn("place", expr("multiMedia.place")).withColumn("url", expr("multiMedia.url")).drop(
22 | "likeDislike", "multiMedia")
23 | print("flat Schema")
24 | finaldf.printSchema()
25 | finaldf.show()
26 |
--------------------------------------------------------------------------------
/Scenerio2.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf,SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-2")
6 | sc = SparkContext(conf=conf)
7 | sc.setLogLevel("ERROR")
8 | spark = SparkSession.builder.getOrCreate()
9 | data = [
10 | (1, "1-Jan", "Ordered"),
11 | (1, "2-Jan", "dispatched"),
12 | (1, "3-Jan", "dispatched"),
13 | (1, "4-Jan", "Shipped"),
14 | (1, "5-Jan", "Shipped"),
15 | (1, "6-Jan", "Delivered"),
16 | (2, "1-Jan", "Ordered"),
17 | (2, "2-Jan", "dispatched"),
18 | (2, "3-Jan", "shipped")]
19 | myschema = ["orderid","statusdate","status"]
20 | df = spark.createDataFrame(data,schema=myschema)
21 | df.show()
22 | #Through SQL
23 | df.createOrReplaceTempView("ordertab")
24 | spark.sql("select * from ordertab where status = 'dispatched' and orderid in(select orderid from ordertab where status = 'Ordered')").show()
25 |
26 | #Through DSL
27 | result = df.filter(
28 | (col("status") == "dispatched") &
29 | (col("orderid").isin(
30 | *[row[0] for row in df.filter(col("status") == "Ordered").select("orderid").collect()]
31 | ))
32 | )
33 | result.show()
34 |
--------------------------------------------------------------------------------
/Scenerio20.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio20")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 |
12 | df = spark.read.format("json").option("multiline", "true").load(
13 | "dbfs:/FileStore/flatjson/part-00000-tid-3675309499584050336-b8650962-dec3-4fe4-a204-c914090f019e-21-1-c000.json")
14 | df.printSchema()
15 | compdf = df.select(
16 | col("code"),
17 | col("commentCount"),
18 | col("createdAt"),
19 | col("description"),
20 | col("feedsComment"),
21 | col("id"),
22 | col("imagePaths"),
23 | col("images"),
24 | col("isdeleted"),
25 | col("lat"),
26 | struct(col("dislikes"), col("likes"), col("userAction")).alias("likeDislike"),
27 | col("lng"),
28 | col("location"),
29 | col("mediatype"),
30 | col("msg"),
31 | array(
32 | struct(
33 | col("createAt"),
34 | col("description"),
35 | col("id"),
36 | col("likeCount"),
37 | col("mediatype"),
38 | col("name"),
39 | col("place"),
40 | col("url")
41 | ).alias("element")
42 | ).alias("multiMedia"),
43 | col("name"),
44 | col("profilePicture"),
45 | col("title"),
46 | col("userId"),
47 | col("videoUrl"),
48 | col("totalFeed")
49 | )
50 |
51 | compdf.printSchema()
52 |
--------------------------------------------------------------------------------
/Scenerio21.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql import *
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 | from pyspark import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio21")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | data = [
14 | ("SEA", "SF", 300),
15 | ("CHI", "SEA", 2000),
16 | ("SF", "SEA", 300),
17 | ("SEA", "CHI", 2000),
18 | ("SEA", "LND", 500),
19 | ("LND", "SEA", 500),
20 | ("LND", "CHI", 1000),
21 | ("CHI", "NDL", 180)]
22 | df = spark.createDataFrame(data, ["from", "to", "dist"])
23 | df.show()
24 |
25 | # Through SQL
26 | df.createOrReplaceTempView("trip")
27 | spark.sql("""SELECT r1.from, r1.to, (r1.dist + r2.dist) AS roundtrip_dist
28 | FROM trip r1
29 | JOIN trip r2 ON r1.from = r2.to AND r1.to = r2.from
30 | WHERE r1.from < r1.to
31 | """).show()
32 |
33 | # Through DSL
34 | finaldf = df.alias("r1").join(df.alias("r2"),
35 | (col("r1.from") == col("r2.to")) & (col("r1.to") == col("r2.from"))).where(
36 | col("r1.from") < col("r1.to")).select(col("r1.from"), col("r1.to"),
37 | (col("r1.dist") + col("r2.dist")).alias("roundtrip_dist"))
38 |
39 | finaldf.show()
40 |
--------------------------------------------------------------------------------
/Scenerio22.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql import *
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 | from pyspark import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio22")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | data = [(1, "26-May", 100),
14 | (1, "27-May", 200),
15 | (1, "28-May", 300),
16 | (2, "29-May", 400),
17 | (3, "30-May", 500),
18 | (3, "31-May", 600)]
19 | df = spark.createDataFrame(data, ["pid", "date", "price"])
20 | df.show()
21 | # Through SQL
22 | df.createOrReplaceTempView("ordertab")
23 | spark.sql("select pid,date,price, sum(price) over(partition by(pid) order by(price)) as new_price from ordertab").show()
24 | # Through DSL
25 | wn = Window.partitionBy("pid").orderBy("price")
26 | finaldf = df.withColumn("new_price", sum("price").
27 | over(wn)).show()
28 |
--------------------------------------------------------------------------------
/Scenerio23.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql import *
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 | from pyspark import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio23")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | data = [(1, 5), (2, 6), (3, 5), (3, 6), (1, 6)]
14 | df = spark.createDataFrame(data, ["customer_id", "product_key"])
15 | df.show()
16 | data2 = [(5,), (6,)]
17 | df2 = spark.createDataFrame(data2, ["product_key"])
18 | df2.show()
19 | finaldf = df.join(df2, ["product_key"], "inner").drop("product_key").distinct().filter(col("customer_id") != 2)
20 | finaldf.show()
21 |
--------------------------------------------------------------------------------
/Scenerio24.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql import *
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 | from pyspark import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio24")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | data = [
14 | (1, "home"),
15 | (1, "products"),
16 | (1, "checkout"),
17 | (1, "confirmation"),
18 | (2, "home"),
19 | (2, "products"),
20 | (2, "cart"),
21 | (2, "checkout"),
22 | (2, "confirmation"),
23 | (2, "home"),
24 | (2, "products")]
25 | df = spark.createDataFrame(data, ["userid", "page"])
26 | df.show()
27 | # Through SQL
28 | df.createOrReplaceTempView("pagetab")
29 | spark.sql("select userid, collect_list(page) as pages from pagetab group by userid").show()
30 |
31 | # Through DSL
32 | finaldf = df.groupBy("userid").agg(collect_list("page").alias("pages"))
33 | finaldf.show(truncate=False)
34 |
--------------------------------------------------------------------------------
/Scenerio25.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | from pyspark.sql import *
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 | from pyspark import *
7 |
8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio25")
9 | sc = SparkContext(conf=conf)
10 | sc.setLogLevel("ERROR")
11 | spark = SparkSession.builder.getOrCreate()
12 |
13 | df = spark.read.format("csv").option("header", "true") \
14 | .option("mode", "DROPMALFORMED") \
15 | .load("D:/BigData/Datasets/Scenerio25.csv")
16 | df.show()
17 |
--------------------------------------------------------------------------------
/Scenerio26.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf
2 | from pyspark import SparkContext
3 | from pyspark import *
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql import *
6 | from pyspark.sql.types import *
7 | from pyspark.sql.functions import *
8 | from pyspark.sql.window import *
9 |
10 | conf = SparkConf().setMaster("local[*]").setAppName("test")
11 | sc = SparkContext(conf=conf)
12 | sc.setLogLevel("ERROR")
13 |
14 | spark = SparkSession.builder.getOrCreate()
15 |
16 | sourcedata = [
17 | (1, "A"),
18 | (2, "B"),
19 | (3, "C"),
20 | (4, "D")]
21 | mysourceschema = ["id","name"]
22 | sourcedf = spark.createDataFrame(sourcedata,schema=mysourceschema)
23 | sourcedf.show()
24 |
25 | targetdata = [
26 | (1, "A"),
27 | (2, "B"),
28 | (4, "X"),
29 | (5, "F")]
30 | mytargetschema = ["id1","name1"]
31 | targetdf = spark.createDataFrame(targetdata,schema=mytargetschema)
32 | targetdf.show()
33 |
34 | #--------------------------Through SQL
35 |
36 | sourcedf.createOrReplaceTempView("sourcetab")
37 | targetdf.createOrReplaceTempView("targettab")
38 |
39 | print("=================Through SQL==========================")
40 | spark.sql("""SELECT COALESCE(s.id, t.id1) AS id,
41 | CASE
42 | WHEN s.name IS NULL THEN 'new in target'
43 | WHEN t.name1 IS NULL THEN 'new in source'
44 | WHEN s.name != t.name1 THEN 'mismatch'
45 | END AS comment
46 | FROM sourcetab s
47 | FULL OUTER JOIN targettab t ON s.id = t.id1
48 | WHERE s.name != t.name1 OR s.name IS NULL OR t.name1 IS NULL
49 | """).show()
50 |
51 | print("==================Through DSL===============================")
52 | #--------------------------Through DSL
53 | # Join the source and target dataframes on id
54 |
55 | joindf = sourcedf.join(targetdf, sourcedf["id"]==targetdf["id1"],"outer")
56 | joindf.show()
57 |
58 | # Keep the rows where the names differ or either name is null
59 |
60 | fildf = joindf.filter((col("name") != col("name1")) | col("name").isNull() | col("name1").isNull())
61 | fildf.show()
62 |
63 | # coalesce picks the first non-null value between id and id1
64 |
65 | filnulldf = fildf.withColumn("id",coalesce(col("id"),col("id1"))).drop("id1")
66 | filnulldf.show()
67 |
68 | finaldf = filnulldf.withColumn("comment",expr("case when name is null then 'new in target' when name1 is null then 'new in source' when name != name1 then 'mismatch' end")).drop("name","name1")
69 | finaldf.show()
70 |
71 |
72 |
--------------------------------------------------------------------------------
/Scenerio27.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf
2 | from pyspark import SparkContext
3 | from pyspark import *
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql import *
6 | from pyspark.sql.types import *
7 | from pyspark.sql.functions import *
8 | from pyspark.sql.window import *
9 |
10 | conf = SparkConf().setMaster("local[*]").setAppName("test")
11 | sc = SparkContext(conf=conf)
12 | sc.setLogLevel("ERROR")
13 |
14 | spark = SparkSession.builder.getOrCreate()
15 |
16 | data = [(1,60000,2018),(1,70000,2019),(1,80000,2020),(2,60000,2018),(2,65000,2019),(2,65000,2020),(3,60000,2018),(3,65000,2019)]
17 |
18 | df = spark.createDataFrame(data,["empid","salary","year"])
19 |
20 | df.show()
21 |
22 | wn = Window.partitionBy("empid").orderBy("year")
23 |
24 | lagdf = df.withColumn("diff",lag("salary",1).over(wn))
25 | lagdf.show()
26 |
27 | finaldf = lagdf.withColumn("incresalary",expr("salary - diff")).drop("diff").na.fill(0).orderBy("empid","year")
28 |
29 | finaldf.show()
--------------------------------------------------------------------------------
/Scenerio28.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf
2 | from pyspark import SparkContext
3 | from pyspark import *
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql import *
6 | from pyspark.sql.types import *
7 | from pyspark.sql.functions import *
8 | from pyspark.sql.window import *
9 |
10 | conf = SparkConf().setMaster("local[*]").setAppName("test")
11 | sc = SparkContext(conf=conf)
12 | sc.setLogLevel("ERROR")
13 |
14 | spark = SparkSession.builder.getOrCreate()
15 |
16 | data = [("A", "AA"), ("B", "BB"), ("C", "CC"), ("AA", "AAA"), ("BB", "BBB"), ("CC", "CCC")]
17 |
18 | df = spark.createDataFrame(data, ["child", "parent"])
19 | df.show()
20 |
21 | joindf = df.alias("a").join(df.alias("b"), col("a.child") == col("b.parent")).select(
22 | col("a.child").alias("child_a"),
23 | col("a.parent").alias("parent_a"),
24 | col("b.child").alias("child_b"),
25 | col("b.parent").alias("parent_b")
26 | )
27 | joindf.show()
28 |
29 | findf = joindf.withColumnRenamed("child_a", "parent").withColumnRenamed("parent_a", "grandparent").withColumnRenamed(
30 | "child_b", "child").drop("parent_b").select("child", "parent", "grandparent")
31 |
32 | findf.show()
33 |
34 | # another way
35 |
36 | df2 = df.withColumnRenamed("child", "child1").withColumnRenamed("parent", "parent1")
37 | df2.show()
38 |
39 | secondjoindf = df.join(df2, col("parent") == col("child1"), "inner")
40 | secondjoindf.show()
41 |
42 | finaldf = secondjoindf.withColumnRenamed("parent1", "grandparent").drop("child1")
43 | finaldf.show()
44 |
--------------------------------------------------------------------------------
/Scenerio29.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf
2 | from pyspark import SparkContext
3 | from pyspark import *
4 | from pyspark.sql import SparkSession
5 | from pyspark.sql import *
6 | from pyspark.sql.types import *
7 | from pyspark.sql.functions import *
8 | from pyspark.sql.window import *
9 |
10 | conf = SparkConf().setMaster("local[*]").setAppName("test")
11 | sc = SparkContext(conf=conf)
12 | sc.setLogLevel("ERROR")
13 |
14 | spark = SparkSession.builder.getOrCreate()
15 |
16 | data1 = [(1,), (2,), (3,)]
17 |
18 | df1 = spark.createDataFrame(data1, ["col"])
19 | df1.show()
20 |
21 | data2 = [(1,), (2,), (3,), (4,), (5,)]
22 |
23 | df2 = spark.createDataFrame(data2, ["col1"])
24 | df2.show()
25 |
26 | maxdf = df1.agg(max("col").alias("max"))
27 | maxdf.show()
28 |
29 | maxsalary = maxdf.select(col("max")).first()[0]
30 |
31 | joindf = df1.join(df2, df1["col"] == df2["col1"], "outer").drop("col")
32 | joindf.show()
33 |
34 | finaldf = joindf.filter(col("col1") != maxsalary).withColumnRenamed("col1", "col").orderBy("col")
35 | finaldf.show()
36 |
--------------------------------------------------------------------------------
/Scenerio3.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-3")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [(1111, "2021-01-15", 10),
12 | (1111, "2021-01-16", 15),
13 | (1111, "2021-01-17", 30),
14 | (1112, "2021-01-15", 10),
15 | (1112, "2021-01-15", 20),
16 | (1112, "2021-01-15", 30)]
17 |
18 | myschema = ["sensorid", "timestamp", "values"]
19 |
20 | df = spark.createDataFrame(data, schema=myschema)
21 | df.show()
22 |
23 | d1 = Window.partitionBy("sensorid").orderBy("values")
24 |
25 | finaldf = df.withColumn("nextvalues", lead("values", 1).over(d1)) \
26 | .filter(col("nextvalues").isNotNull()) \
27 | .withColumn("values", expr("nextvalues-values")) \
28 | .drop("nextvalues") \
29 | .orderBy(col("sensorid")).show()
30 |
--------------------------------------------------------------------------------
/Scenerio30.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "8148ca11-3054-40c9-b01a-24dc8169bd4d",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [],
11 | "source": [
12 | "from pyspark import *\n",
13 | "from pyspark import SparkConf, SparkContext\n",
14 | "from pyspark.sql import *\n",
15 | "from pyspark.sql import SparkSession\n",
16 | "from pyspark.sql.functions import *\n",
17 | "from pyspark.sql.types import *\n",
18 | "from pyspark.sql.window import *\n",
19 | "\n",
20 | "conf = SparkConf().setMaster(\"local[*]\").setAppName(\"test\")\n",
21 | "sc = SparkContext(conf=conf)\n",
22 | "sc.setLogLevel(\"ERROR\")\n",
23 | "\n",
24 | "spark = SparkSession.builder.getOrCreate()"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 2,
30 | "id": "2eae0587-e373-4a4f-a0fa-dd1653df168f",
31 | "metadata": {
32 | "tags": []
33 | },
34 | "outputs": [
35 | {
36 | "name": "stderr",
37 | "output_type": "stream",
38 | "text": [
39 | "d:\\bigdata\\pyspark\\python37\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. Expected 192 from C header, got 216 from PyObject\n",
40 | " return f(*args, **kwds)\n"
41 | ]
42 | },
43 | {
44 | "name": "stdout",
45 | "output_type": "stream",
46 | "text": [
47 | "+------+----+-------+-------+\n",
48 | "|emp_id|name|dept_id| salary|\n",
49 | "+------+----+-------+-------+\n",
50 | "| 1| A| A|1000000|\n",
51 | "| 2| B| A|2500000|\n",
52 | "| 3| C| G| 500000|\n",
53 | "| 4| D| G| 800000|\n",
54 | "| 5| E| W|9000000|\n",
55 | "| 6| F| W|2000000|\n",
56 | "+------+----+-------+-------+\n",
57 | "\n",
58 | "+--------+---------+\n",
59 | "|dept_id1|dept_name|\n",
60 | "+--------+---------+\n",
61 | "| A| AZURE|\n",
62 | "| G| GCP|\n",
63 | "| W| AWS|\n",
64 | "+--------+---------+\n",
65 | "\n"
66 | ]
67 | }
68 | ],
69 | "source": [
70 | "data1 = [\n",
71 | " (1, \"A\", \"A\", 1000000),\n",
72 | " (2, \"B\", \"A\", 2500000),\n",
73 | " (3, \"C\", \"G\", 500000),\n",
74 | " (4, \"D\", \"G\", 800000),\n",
75 | " (5, \"E\", \"W\", 9000000),\n",
76 | " (6, \"F\", \"W\", 2000000),\n",
77 | "]\n",
78 | "df1 = spark.createDataFrame(data1, [\"emp_id\", \"name\", \"dept_id\", \"salary\"])\n",
79 | "df1.show()\n",
80 | "\n",
81 | "data2 = [(\"A\", \"AZURE\"), (\"G\", \"GCP\"), (\"W\", \"AWS\")]\n",
82 | "df2 = spark.createDataFrame(data2, [\"dept_id1\", \"dept_name\"])\n",
83 | "df2.show()"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 3,
89 | "id": "d975d88d-3db6-40ab-9e3e-b43269f98188",
90 | "metadata": {
91 | "tags": []
92 | },
93 | "outputs": [
94 | {
95 | "name": "stdout",
96 | "output_type": "stream",
97 | "text": [
98 | "+------+----+-------+-------+---------+\n",
99 | "|emp_id|name|dept_id| salary|dept_name|\n",
100 | "+------+----+-------+-------+---------+\n",
101 | "| 1| A| A|1000000| AZURE|\n",
102 | "| 2| B| A|2500000| AZURE|\n",
103 | "| 5| E| W|9000000| AWS|\n",
104 | "| 6| F| W|2000000| AWS|\n",
105 | "| 3| C| G| 500000| GCP|\n",
106 | "| 4| D| G| 800000| GCP|\n",
107 | "+------+----+-------+-------+---------+\n",
108 | "\n"
109 | ]
110 | }
111 | ],
112 | "source": [
113 | "joindf = df1.join(df2, df1[\"dept_id\"] == df2[\"dept_id1\"], \"inner\").drop(\"dept_id1\")\n",
114 | "joindf.show()"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 5,
120 | "id": "8533dbae-3872-40b2-8931-5c96c832853c",
121 | "metadata": {
122 | "tags": []
123 | },
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "+------+----+-------+-------+---------+----+\n",
130 | "|emp_id|name|dept_id| salary|dept_name|rank|\n",
131 | "+------+----+-------+-------+---------+----+\n",
132 | "| 2| B| A|2500000| AZURE| 1|\n",
133 | "| 1| A| A|1000000| AZURE| 2|\n",
134 | "| 5| E| W|9000000| AWS| 1|\n",
135 | "| 6| F| W|2000000| AWS| 2|\n",
136 | "| 4| D| G| 800000| GCP| 1|\n",
137 | "| 3| C| G| 500000| GCP| 2|\n",
138 | "+------+----+-------+-------+---------+----+\n",
139 | "\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "wn = Window.partitionBy(\"dept_id\").orderBy(col(\"salary\").desc())\n",
145 | "\n",
146 | "rankdf = joindf.withColumn(\"rank\", dense_rank().over(wn))\n",
147 | "rankdf.show()"
148 | ]
149 | },
150 | {
151 | "cell_type": "code",
152 | "execution_count": 6,
153 | "id": "a37624d0-1513-4f73-ab36-4b09c1613adc",
154 | "metadata": {
155 | "tags": []
156 | },
157 | "outputs": [
158 | {
159 | "name": "stdout",
160 | "output_type": "stream",
161 | "text": [
162 | "+------+----+---------+-------+\n",
163 | "|emp_id|name|dept_name| salary|\n",
164 | "+------+----+---------+-------+\n",
165 | "| 1| A| AZURE|1000000|\n",
166 | "| 6| F| AWS|2000000|\n",
167 | "| 3| C| GCP| 500000|\n",
168 | "+------+----+---------+-------+\n",
169 | "\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "finaldf = (\n",
175 | " rankdf.filter(col(\"rank\") == 2)\n",
176 | " .drop(\"rank\")\n",
177 | " .select(\"emp_id\", \"name\", \"dept_name\", \"salary\")\n",
178 | ")\n",
179 | "finaldf.show()"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": null,
185 | "id": "0bc4c5b1-46b9-4837-b841-244f81f8816a",
186 | "metadata": {},
187 | "outputs": [],
188 | "source": []
189 | }
190 | ],
191 | "metadata": {
192 | "kernelspec": {
193 | "display_name": "Python 3 (ipykernel)",
194 | "language": "python",
195 | "name": "python3"
196 | },
197 | "language_info": {
198 | "codemirror_mode": {
199 | "name": "ipython",
200 | "version": 3
201 | },
202 | "file_extension": ".py",
203 | "mimetype": "text/x-python",
204 | "name": "python",
205 | "nbconvert_exporter": "python",
206 | "pygments_lexer": "ipython3",
207 | "version": "3.7.0"
208 | }
209 | },
210 | "nbformat": 4,
211 | "nbformat_minor": 5
212 | }
213 |
--------------------------------------------------------------------------------
/Scenerio31.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 7,
6 | "id": "d20891a9-7e5c-440e-be32-92e3ea3b5632",
7 | "metadata": {
8 | "tags": []
9 | },
10 | "outputs": [],
11 | "source": [
12 | "from pyspark import *\n",
13 | "from pyspark import SparkConf, SparkContext\n",
14 | "from pyspark.sql import *\n",
15 | "from pyspark.sql import SparkSession\n",
16 | "from pyspark.sql.functions import *\n",
17 | "from pyspark.sql.types import *\n",
18 | "from pyspark.sql.window import *\n",
19 | "\n",
20 | "conf = SparkConf().setMaster(\"local[*]\").setAppName(\"test\")\n",
21 | "sc = SparkContext(conf=conf)\n",
22 | "sc.setLogLevel(\"ERROR\")\n",
23 | "\n",
24 | "spark = SparkSession.builder.getOrCreate()"
25 | ]
26 | },
27 | {
28 | "cell_type": "code",
29 | "execution_count": 8,
30 | "id": "a4821839-667b-4aa0-9f40-e6f944e1d5fb",
31 | "metadata": {
32 | "tags": []
33 | },
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "+----+-----+--------+-----------+\n",
40 | "|col1| col2| col3| col4|\n",
41 | "+----+-----+--------+-----------+\n",
42 | "| m1|m1,m2|m1,m2,m3|m1,m2,m3,m4|\n",
43 | "+----+-----+--------+-----------+\n",
44 | "\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "# creating the dataframe\n",
50 | "\n",
51 | "data = [(\"m1\", \"m1,m2\", \"m1,m2,m3\", \"m1,m2,m3,m4\")]\n",
52 | "\n",
53 | "df = spark.createDataFrame(data, [\"col1\", \"col2\", \"col3\", \"col4\"])\n",
54 | "df.show()"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 14,
60 | "id": "0249aa2d-81a9-4247-a0fd-b8ee74fb6fcf",
61 | "metadata": {
62 | "tags": []
63 | },
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "+--------------------+\n",
70 | "| col|\n",
71 | "+--------------------+\n",
72 | "|m1-m1,m2-m1,m2,m3...|\n",
73 | "+--------------------+\n",
74 | "\n"
75 | ]
76 | }
77 | ],
78 | "source": [
79 | "# concating the dataframe into single column\n",
80 | "\n",
81 | "contdf = df.withColumn(\"col\", expr(\"concat(col1,'-',col2,'-',col3,'-',col4)\")).drop(\n",
82 | " \"col1\", \"col2\", \"col3\", \"col4\"\n",
83 | ")\n",
84 | "contdf.show()"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 15,
90 | "id": "594fa64d-e0fc-4335-a4f0-82ffca893248",
91 | "metadata": {
92 | "tags": []
93 | },
94 | "outputs": [
95 | {
96 | "name": "stdout",
97 | "output_type": "stream",
98 | "text": [
99 | "+-----------+\n",
100 | "| col|\n",
101 | "+-----------+\n",
102 | "| m1|\n",
103 | "| m1,m2|\n",
104 | "| m1,m2,m3|\n",
105 | "|m1,m2,m3,m4|\n",
106 | "+-----------+\n",
107 | "\n"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "finaldf = contdf.selectExpr(\"explode(split(col,'-')) as col\")\n",
113 | "finaldf.show()"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": null,
119 | "id": "ec4e34b8-ca4d-4356-a3b5-3c8b94e9b470",
120 | "metadata": {},
121 | "outputs": [],
122 | "source": []
123 | }
124 | ],
125 | "metadata": {
126 | "kernelspec": {
127 | "display_name": "Python 3 (ipykernel)",
128 | "language": "python",
129 | "name": "python3"
130 | },
131 | "language_info": {
132 | "codemirror_mode": {
133 | "name": "ipython",
134 | "version": 3
135 | },
136 | "file_extension": ".py",
137 | "mimetype": "text/x-python",
138 | "name": "python",
139 | "nbconvert_exporter": "python",
140 | "pygments_lexer": "ipython3",
141 | "version": "3.7.0"
142 | }
143 | },
144 | "nbformat": 4,
145 | "nbformat_minor": 5
146 | }
147 |
--------------------------------------------------------------------------------
/Scenerio32.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "46f4d97c-7f60-4a82-a7c0-4b8923dc0f46",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [
19 | {
20 | "output_type": "stream",
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "+-------+-------------------+\n|food_id| food_item|\n+-------+-------------------+\n| 1| Veg Biryani|\n| 2| Veg Fried Rice|\n| 3| Kaju Fried Rice|\n| 4| Chicken Biryani|\n| 5|Chicken Dum Biryani|\n| 6| Prawns Biryani|\n| 7| Fish Birayani|\n+-------+-------------------+\n\n+-------+------+\n|food_id|rating|\n+-------+------+\n| 1| 5|\n| 2| 3|\n| 3| 4|\n| 4| 4|\n| 5| 5|\n| 6| 4|\n| 7| 4|\n+-------+------+\n\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "data = [(1,\"Veg Biryani\"),(2,\"Veg Fried Rice\"),(3,\"Kaju Fried Rice\"),(4,\"Chicken Biryani\"),(5,\"Chicken Dum Biryani\"),(6,\"Prawns Biryani\"),(7,\"Fish Birayani\")]\n",
30 | "\n",
31 | "df1 = spark.createDataFrame(data,[\"food_id\",\"food_item\"])\n",
32 | "df1.show()\n",
33 | "\n",
34 | "ratings = [(1,5),(2,3),(3,4),(4,4),(5,5),(6,4),(7,4)]\n",
35 | "\n",
36 | "df2 = spark.createDataFrame(ratings,[\"food_id\",\"rating\"])\n",
37 | "df2.show()"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 0,
43 | "metadata": {
44 | "application/vnd.databricks.v1+cell": {
45 | "cellMetadata": {
46 | "byteLimit": 2048000,
47 | "rowLimit": 10000
48 | },
49 | "inputWidgets": {},
50 | "nuid": "6876f425-2609-4923-9de4-090a4f0ecb09",
51 | "showTitle": false,
52 | "title": ""
53 | }
54 | },
55 | "outputs": [
56 | {
57 | "output_type": "stream",
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "+-------+-------------------+------+\n|food_id| food_item|rating|\n+-------+-------------------+------+\n| 1| Veg Biryani| 5|\n| 2| Veg Fried Rice| 3|\n| 3| Kaju Fried Rice| 4|\n| 4| Chicken Biryani| 4|\n| 5|Chicken Dum Biryani| 5|\n| 6| Prawns Biryani| 4|\n| 7| Fish Birayani| 4|\n+-------+-------------------+------+\n\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "joindf = df1.join(df2,df1[\"food_id\"]==df2[\"food_id\"],\"inner\").select(df1[\"food_id\"],\"food_item\",\"rating\")\n",
67 | "joindf.show()"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 0,
73 | "metadata": {
74 | "application/vnd.databricks.v1+cell": {
75 | "cellMetadata": {
76 | "byteLimit": 2048000,
77 | "rowLimit": 10000
78 | },
79 | "inputWidgets": {},
80 | "nuid": "df16c628-d638-43a0-9032-ac606c8983d7",
81 | "showTitle": false,
82 | "title": ""
83 | }
84 | },
85 | "outputs": [
86 | {
87 | "output_type": "stream",
88 | "name": "stdout",
89 | "output_type": "stream",
90 | "text": [
91 | "+-------+-------------------+------+---------------+\n|food_id| food_item|rating|stats(out of 5)|\n+-------+-------------------+------+---------------+\n| 1| Veg Biryani| 5| *****|\n| 2| Veg Fried Rice| 3| ***|\n| 3| Kaju Fried Rice| 4| ****|\n| 4| Chicken Biryani| 4| ****|\n| 5|Chicken Dum Biryani| 5| *****|\n| 6| Prawns Biryani| 4| ****|\n| 7| Fish Birayani| 4| ****|\n+-------+-------------------+------+---------------+\n\n"
92 | ]
93 | }
94 | ],
95 | "source": [
96 | "from pyspark.sql.functions import *\n",
97 | "\n",
98 | "finaldf = joindf.withColumn(\"stats(out of 5)\",expr(\"repeat('*',rating)\"))\n",
99 | "finaldf.show()"
100 | ]
101 | }
102 | ],
103 | "metadata": {
104 | "application/vnd.databricks.v1+notebook": {
105 | "dashboards": [],
106 | "language": "python",
107 | "notebookMetadata": {
108 | "pythonIndentUnit": 4
109 | },
110 | "notebookName": "Scenerio32",
111 | "widgets": {}
112 | }
113 | },
114 | "nbformat": 4,
115 | "nbformat_minor": 0
116 | }
117 |
--------------------------------------------------------------------------------
/Scenerio33.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "7d62da1e-f835-4c4a-9737-9d372a69a19b",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [
19 | {
20 | "output_type": "stream",
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "+--------------------+--------------+-----------+\n| id| name|family_size|\n+--------------------+--------------+-----------+\n|c00dac11bde74750b...| Alex Thomas| 9|\n|eb6f2d3426694667a...| Chris Gray| 2|\n|3f7b5b8e835d4e1c8...| Emily Johnson| 4|\n|9a345b079d9f4d3ca...| Michael Brown| 6|\n|e0a5f57516024de2a...|Jessica Wilson| 3|\n+--------------------+--------------+-----------+\n\n+--------------------+------------+--------+--------+\n| id| name|min_size|max_size|\n+--------------------+------------+--------+--------+\n|023fd23615bd4ff4b...| Bolivia| 2| 4|\n|be247f73de0f4b2d8...|Cook Islands| 4| 8|\n|3e85ab80a6f84ef3b...| Brazil| 4| 7|\n|e571e164152c4f7c8...| Australia| 5| 9|\n|f35a7bb7d44342f7a...| Canada| 3| 5|\n|a1b5a4b5fc5f46f89...| Japan| 10| 12|\n+--------------------+------------+--------+--------+\n\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "data = [('c00dac11bde74750b4d207b9c182a85f', 'Alex Thomas', 9),('eb6f2d3426694667ae3e79d6274114a4', 'Chris Gray', 2),('3f7b5b8e835d4e1c8b3e12e964a741f3', 'Emily Johnson', 4),('9a345b079d9f4d3cafb2d4c11d20f8ce', 'Michael Brown', 6),('e0a5f57516024de2a231d09de2cbe9d1', 'Jessica Wilson', 3)]\n",
30 | "\n",
31 | "familydf = spark.createDataFrame(data,[\"id\",\"name\",\"family_size\"])\n",
32 | "familydf.show()\n",
33 | "\n",
34 | "countrydata = [('023fd23615bd4ff4b2ae0a13ed7efec9', 'Bolivia', 2 , 4),('be247f73de0f4b2d810367cb26941fb9', 'Cook Islands', 4,8),('3e85ab80a6f84ef3b9068b21dbcc54b3', 'Brazil', 4,7),('e571e164152c4f7c8413e2734f67b146', 'Australia', 5,9),('f35a7bb7d44342f7a8a42a53115294a8', 'Canada', 3,5),('a1b5a4b5fc5f46f891d9040566a78f27', 'Japan', 10,12)]\n",
35 | "\n",
36 | "countrydf = spark.createDataFrame(countrydata,[\"id\",\"name\",\"min_size\",\"max_size\"])\n",
37 | "countrydf.show()\n",
38 | "\n"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 0,
44 | "metadata": {
45 | "application/vnd.databricks.v1+cell": {
46 | "cellMetadata": {
47 | "byteLimit": 2048000,
48 | "rowLimit": 10000
49 | },
50 | "inputWidgets": {},
51 | "nuid": "b3301004-40eb-4c42-b786-eef92e7fff40",
52 | "showTitle": false,
53 | "title": ""
54 | }
55 | },
56 | "outputs": [
57 | {
58 | "output_type": "stream",
59 | "name": "stdout",
60 | "output_type": "stream",
61 | "text": [
62 | "+--------------+-----------+------------+--------+--------+\n| name|family_size| name|min_size|max_size|\n+--------------+-----------+------------+--------+--------+\n| Alex Thomas| 9| Australia| 5| 9|\n| Chris Gray| 2| Bolivia| 2| 4|\n| Emily Johnson| 4| Bolivia| 2| 4|\n| Emily Johnson| 4|Cook Islands| 4| 8|\n| Emily Johnson| 4| Brazil| 4| 7|\n| Emily Johnson| 4| Canada| 3| 5|\n| Michael Brown| 6|Cook Islands| 4| 8|\n| Michael Brown| 6| Brazil| 4| 7|\n| Michael Brown| 6| Australia| 5| 9|\n|Jessica Wilson| 3| Bolivia| 2| 4|\n|Jessica Wilson| 3| Canada| 3| 5|\n+--------------+-----------+------------+--------+--------+\n\n"
63 | ]
64 | }
65 | ],
66 | "source": [
67 | "joindf = familydf.join(countrydf, (familydf[\"family_size\"]>=countrydf[\"min_size\"]) & (familydf[\"family_size\"]<=countrydf[\"max_size\"]),\"inner\").select(familydf[\"name\"],familydf[\"family_size\"],countrydf[\"name\"],\"min_size\",\"max_size\")\n",
68 | "joindf.show()"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": 0,
74 | "metadata": {
75 | "application/vnd.databricks.v1+cell": {
76 | "cellMetadata": {
77 | "byteLimit": 2048000,
78 | "rowLimit": 10000
79 | },
80 | "inputWidgets": {},
81 | "nuid": "e7769658-2204-44fc-9f37-0ea2f9b40b01",
82 | "showTitle": false,
83 | "title": ""
84 | }
85 | },
86 | "outputs": [
87 | {
88 | "output_type": "stream",
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "+--------------+-------------------+\n| name|number_of_countries|\n+--------------+-------------------+\n| Alex Thomas| 1|\n| Chris Gray| 1|\n| Emily Johnson| 4|\n| Michael Brown| 3|\n|Jessica Wilson| 2|\n+--------------+-------------------+\n\n"
93 | ]
94 | }
95 | ],
96 | "source": [
97 | "from pyspark.sql.functions import *\n",
98 | "\n",
99 | "groupdf = joindf.groupBy(familydf[\"name\"]).agg(count(\"*\").alias(\"number_of_countries\"))\n",
100 | "groupdf.show()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 0,
106 | "metadata": {
107 | "application/vnd.databricks.v1+cell": {
108 | "cellMetadata": {
109 | "byteLimit": 2048000,
110 | "rowLimit": 10000
111 | },
112 | "inputWidgets": {},
113 | "nuid": "c435acec-02ea-4fe7-8c29-5c624840243c",
114 | "showTitle": false,
115 | "title": ""
116 | }
117 | },
118 | "outputs": [
119 | {
120 | "output_type": "stream",
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "+-------------------+\n|number_of_countries|\n+-------------------+\n| 4|\n+-------------------+\n\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 |     "finaldf = groupdf.agg(expr(\"max(number_of_countries)\").alias(\"number_of_countries\"))\n",
130 | "finaldf.show()\n"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 0,
136 | "metadata": {
137 | "application/vnd.databricks.v1+cell": {
138 | "cellMetadata": {
139 | "byteLimit": 2048000,
140 | "rowLimit": 10000
141 | },
142 | "inputWidgets": {},
143 | "nuid": "7d1af649-a565-4bc1-816f-3d60e846d85d",
144 | "showTitle": false,
145 | "title": ""
146 | }
147 | },
148 | "outputs": [
149 | {
150 | "output_type": "stream",
151 | "name": "stdout",
152 | "output_type": "stream",
153 | "text": [
154 | "+--------------+-------------------+----+\n| name|number_of_countries|rank|\n+--------------+-------------------+----+\n| Emily Johnson| 4| 1|\n| Michael Brown| 3| 2|\n|Jessica Wilson| 2| 3|\n| Alex Thomas| 1| 4|\n| Chris Gray| 1| 5|\n+--------------+-------------------+----+\n\n+-------------+-------------------+\n| name|number_of_countries|\n+-------------+-------------------+\n|Emily Johnson| 4|\n+-------------+-------------------+\n\n"
155 | ]
156 | }
157 | ],
158 | "source": [
159 | "from pyspark.sql.functions import *\n",
160 | "from pyspark.sql import *\n",
161 | "from pyspark.sql.types import *\n",
162 | "\n",
163 | "#another way \n",
164 | "wn = Window.orderBy(desc(\"number_of_countries\"))\n",
165 | "\n",
166 | "rankdf = groupdf.withColumn(\"rank\",row_number().over(wn))\n",
167 | "rankdf.show()\n",
168 | "\n",
169 | "finaldf2 = rankdf.filter(col(\"rank\")==1).drop(\"rank\")\n",
170 | "finaldf2.show()"
171 | ]
172 | }
173 | ],
174 | "metadata": {
175 | "application/vnd.databricks.v1+notebook": {
176 | "dashboards": [],
177 | "language": "python",
178 | "notebookMetadata": {
179 | "mostRecentlyExecutedCommandWithImplicitDF": {
180 | "commandId": 1190225536909284,
181 | "dataframes": [
182 | "_sqldf"
183 | ]
184 | },
185 | "pythonIndentUnit": 4
186 | },
187 | "notebookName": "Scenerio33",
188 | "widgets": {}
189 | }
190 | },
191 | "nbformat": 4,
192 | "nbformat_minor": 0
193 | }
194 |
--------------------------------------------------------------------------------
/Scenerio34.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "c5c448dc-6b9b-4fd7-84c0-cc0ff8db79be",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [
19 | {
20 | "output_type": "stream",
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "+-----------+------+---+------+\n|customer_id| name|age|gender|\n+-----------+------+---+------+\n| 1| Alice| 25| F|\n| 2| Bob| 40| M|\n| 3| Raj| 46| M|\n| 4| Sekar| 66| M|\n| 5| Jhon| 47| M|\n| 6|Timoty| 28| M|\n| 7| Brad| 90| M|\n| 8| Rita| 34| F|\n+-----------+------+---+------+\n\n+-----------+------+---+------+---------+\n|customer_id| name|age|gender|age_group|\n+-----------+------+---+------+---------+\n| 1| Alice| 25| F| 19-35|\n| 2| Bob| 40| M| 36-50|\n| 3| Raj| 46| M| 36-50|\n| 4| Sekar| 66| M| 51+|\n| 5| Jhon| 47| M| 36-50|\n| 6|Timoty| 28| M| 19-35|\n| 7| Brad| 90| M| 51+|\n| 8| Rita| 34| F| 19-35|\n+-----------+------+---+------+---------+\n\n+---------+-----+\n|age_group|count|\n+---------+-----+\n| 19-35| 3|\n| 36-50| 3|\n| 51+| 2|\n+---------+-----+\n\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "from pyspark.sql.types import *\n",
30 | "from pyspark.sql import *\n",
31 | "from pyspark.sql.functions import *\n",
32 | "\n",
33 | "data = [(1,'Alice',25,'F'),(2,'Bob',40,'M'),(3,'Raj',46,'M'),(4,'Sekar',66,'M'),(5,'Jhon',47,'M'),(6,'Timoty',28,'M'),(7,'Brad',90,'M'),(8,'Rita',34,'F')]\n",
34 | "\n",
35 | "df = spark.createDataFrame(data,['customer_id','name','age','gender'])\n",
36 | "df.show()\n",
37 | "\n",
38 | "#groupdf = df.withColumn(\"age_group\",expr(\"case when age between 19 and 35 then '19-35' case when age between 36 and 50 then '36-50' case when age > 51 then '51+' else age end\"))\n",
39 | "groupdf = df.withColumn(\n",
40 | " \"age_group\",\n",
41 | " expr(\n",
42 | " \"case when age between 19 and 35 then '19-35' \" +\n",
43 | " \"when age between 36 and 50 then '36-50' \" +\n",
44 | " \"when age > 51 then '51+' \" +\n",
45 | " \"else 'Other' end\"\n",
46 | " )\n",
47 | ")\n",
48 | "groupdf.show()\n",
49 | "\n",
50 | "finaldf = groupdf.groupBy('age_group').agg(count('*').alias('count'))\n",
51 | "finaldf.show()"
52 | ]
53 | }
54 | ],
55 | "metadata": {
56 | "application/vnd.databricks.v1+notebook": {
57 | "dashboards": [],
58 | "environmentMetadata": null,
59 | "language": "python",
60 | "notebookMetadata": {
61 | "pythonIndentUnit": 4
62 | },
63 | "notebookName": "Scenerio34",
64 | "widgets": {}
65 | }
66 | },
67 | "nbformat": 4,
68 | "nbformat_minor": 0
69 | }
70 |
--------------------------------------------------------------------------------
/Scenerio35.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "3f914fd1-1329-49c0-a8ce-60e2aa6ed910",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [
19 | {
20 | "output_type": "stream",
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "+---+------+----+\n| id| name| age|\n+---+------+----+\n| 1| Jhon| 17|\n| 2| Maria| 20|\n| 3| Raj|NULL|\n| 4|Rachel| 18|\n+---+------+----+\n\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "from pyspark.sql import *\n",
30 | "from pyspark.sql.types import *\n",
31 | "from pyspark.sql.functions import *\n",
32 | "\n",
33 | "#creating the dataframe df1\n",
34 | "data1 = [(1,'Jhon',17),(2,'Maria',20),(3,'Raj',None),(4,'Rachel',18)]\n",
35 | "columns = [\"id\",\"name\",\"age\"]\n",
36 | "df1 = spark.createDataFrame(data1,columns)\n",
37 | "df1.show()"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 0,
43 | "metadata": {
44 | "application/vnd.databricks.v1+cell": {
45 | "cellMetadata": {
46 | "byteLimit": 2048000,
47 | "rowLimit": 10000
48 | },
49 | "inputWidgets": {},
50 | "nuid": "3cbc97a7-fda7-42fc-994e-75bef590271e",
51 | "showTitle": false,
52 | "title": ""
53 | }
54 | },
55 | "outputs": [
56 | {
57 | "output_type": "stream",
58 | "name": "stdout",
59 | "output_type": "stream",
60 | "text": [
61 | "+---+----+---+\n| id|name|age|\n+---+----+---+\n| 0| 0| 1|\n+---+----+---+\n\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "# Count null entries in each column\n",
67 | "null_counts = df1.select([sum(col(c).isNull().cast(\"int\")).alias(c) for c in df1.columns])\n",
68 | "\n",
69 | "null_counts.show()"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 0,
75 | "metadata": {
76 | "application/vnd.databricks.v1+cell": {
77 | "cellMetadata": {
78 | "byteLimit": 2048000,
79 | "rowLimit": 10000
80 | },
81 | "inputWidgets": {},
82 | "nuid": "9a1bfc1d-07f0-4f0a-9cb8-98943b762e3c",
83 | "showTitle": false,
84 | "title": ""
85 | }
86 | },
87 | "outputs": [
88 | {
89 | "output_type": "stream",
90 | "name": "stdout",
91 | "output_type": "stream",
92 | "text": [
93 | "+---+----+----+\n| id|name| age|\n+---+----+----+\n| 3| Raj|NULL|\n+---+----+----+\n\n"
94 | ]
95 | }
96 | ],
97 | "source": [
98 |     "# Store the rows with null entries in a new dataframe named df2\n",
99 | "df2 = df1.filter(col(\"age\").isNull())\n",
100 | "df2.show()"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 0,
106 | "metadata": {
107 | "application/vnd.databricks.v1+cell": {
108 | "cellMetadata": {
109 | "byteLimit": 2048000,
110 | "rowLimit": 10000
111 | },
112 | "inputWidgets": {},
113 | "nuid": "c138332c-c270-42ca-81b4-27cfee8f314e",
114 | "showTitle": false,
115 | "title": ""
116 | }
117 | },
118 | "outputs": [
119 | {
120 | "output_type": "stream",
121 | "name": "stdout",
122 | "output_type": "stream",
123 | "text": [
124 | "+---+--------+----+\n| id| city|code|\n+---+--------+----+\n| 1| seatle| 82|\n| 2| london| 75|\n| 3|banglore| 60|\n| 4| boston| 90|\n+---+--------+----+\n\n+---+------+----+--------+----+\n| id| name| age| city|code|\n+---+------+----+--------+----+\n| 1| Jhon| 17| seatle| 82|\n| 2| Maria| 20| london| 75|\n| 3| Raj|NULL|banglore| 60|\n| 4|Rachel| 18| boston| 90|\n+---+------+----+--------+----+\n\n"
125 | ]
126 | }
127 | ],
128 | "source": [
129 | "#create a new dataframe df3\n",
130 | "data2 = [(1,'seatle',82),(2,'london',75),(3,'banglore',60),(4,'boston',90)]\n",
131 | "columns2 = [\"id\",\"city\",\"code\"]\n",
132 | "\n",
133 | "df3 = spark.createDataFrame(data2,columns2)\n",
134 | "df3.show()\n",
135 | "\n",
136 | "mergedf = df1.join(df3, df1[\"id\"]==df3[\"id\"],\"inner\").select(df1[\"id\"],\"name\",\"age\",\"city\",\"code\")\n",
137 | "mergedf.show()"
138 | ]
139 | },
140 | {
141 | "cell_type": "code",
142 | "execution_count": 0,
143 | "metadata": {
144 | "application/vnd.databricks.v1+cell": {
145 | "cellMetadata": {
146 | "byteLimit": 2048000,
147 | "rowLimit": 10000
148 | },
149 | "inputWidgets": {},
150 | "nuid": "47c7ba70-7fef-4e00-b451-ac7809ca909f",
151 | "showTitle": false,
152 | "title": ""
153 | }
154 | },
155 | "outputs": [
156 | {
157 | "output_type": "stream",
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "18.0\n+---+------+---+--------+----+\n| id| name|age| city|code|\n+---+------+---+--------+----+\n| 1| Jhon| 17| seatle| 82|\n| 2| Maria| 20| london| 75|\n| 3| Raj| 18|banglore| 60|\n| 4|Rachel| 18| boston| 90|\n+---+------+---+--------+----+\n\n"
162 | ]
163 | }
164 | ],
165 | "source": [
166 | "#fill the null value with the mean age of students\n",
167 | "#calculate the mean age\n",
168 | "meanage = mergedf.select(round(mean(\"age\"))).collect()[0][0]\n",
169 | "print(meanage)\n",
170 | "\n",
171 | "filldf = mergedf.na.fill({\"age\":meanage})\n",
172 | "filldf.show()"
173 | ]
174 | },
175 | {
176 | "cell_type": "code",
177 | "execution_count": 0,
178 | "metadata": {
179 | "application/vnd.databricks.v1+cell": {
180 | "cellMetadata": {
181 | "byteLimit": 2048000,
182 | "rowLimit": 10000
183 | },
184 | "inputWidgets": {},
185 | "nuid": "3add2a14-6501-4b80-8520-8f5310f0c45b",
186 | "showTitle": false,
187 | "title": ""
188 | }
189 | },
190 | "outputs": [
191 | {
192 | "output_type": "stream",
193 | "name": "stdout",
194 | "output_type": "stream",
195 | "text": [
196 | "+---+------+---+--------+----+\n| id| name|age| city|code|\n+---+------+---+--------+----+\n| 2| Maria| 20| london| 75|\n| 3| Raj| 18|banglore| 60|\n| 4|Rachel| 18| boston| 90|\n+---+------+---+--------+----+\n\n"
197 | ]
198 | }
199 | ],
200 | "source": [
201 | "#Get the students who are 18 years or older\n",
202 | "filterdf = filldf.filter(col(\"age\")>= 18)\n",
203 | "filterdf.show()"
204 | ]
205 | }
206 | ],
207 | "metadata": {
208 | "application/vnd.databricks.v1+notebook": {
209 | "dashboards": [],
210 | "environmentMetadata": null,
211 | "language": "python",
212 | "notebookMetadata": {
213 | "pythonIndentUnit": 4
214 | },
215 | "notebookName": "Scenerio35",
216 | "widgets": {}
217 | }
218 | },
219 | "nbformat": 4,
220 | "nbformat_minor": 0
221 | }
222 |
--------------------------------------------------------------------------------
/Scenerio36.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "ca93cda6-3519-4de0-9539-49871d155641",
14 | "showTitle": false,
15 | "tableResultSettingsMap": {},
16 | "title": ""
17 | }
18 | },
19 | "outputs": [
20 | {
21 | "output_type": "stream",
22 | "name": "stdout",
23 | "output_type": "stream",
24 | "text": [
25 | "+----------+----------+\n| sell_date| product|\n+----------+----------+\n|2020-05-30| Headphone|\n|2020-06-01| Pencil|\n|2020-06-02| Mask|\n|2020-05-30|Basketball|\n|2020-06-01| Book|\n|2020-06-02| Mask|\n|2020-05-30| T-Shirt|\n+----------+----------+\n\n"
26 | ]
27 | }
28 | ],
29 | "source": [
30 | "from pyspark.sql import *\n",
31 | "from pyspark.sql.types import *\n",
32 | "from pyspark.sql.functions import *\n",
33 | "\n",
34 | "data = [('2020-05-30','Headphone'),('2020-06-01','Pencil'),('2020-06-02','Mask'),('2020-05-30','Basketball'),('2020-06-01','Book'),('2020-06-02','Mask'),('2020-05-30','T-Shirt')]\n",
35 | "columns = [\"sell_date\",'product']\n",
36 | "\n",
37 | "df = spark.createDataFrame(data,schema=columns)\n",
38 | "df.show()"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 0,
44 | "metadata": {
45 | "application/vnd.databricks.v1+cell": {
46 | "cellMetadata": {
47 | "byteLimit": 2048000,
48 | "rowLimit": 10000
49 | },
50 | "inputWidgets": {},
51 | "nuid": "f53d6dfa-42b0-40bb-b525-340738e326dc",
52 | "showTitle": false,
53 | "tableResultSettingsMap": {},
54 | "title": ""
55 | }
56 | },
57 | "outputs": [
58 | {
59 | "output_type": "stream",
60 | "name": "stdout",
61 | "output_type": "stream",
62 | "text": [
63 | "+----------+--------------------+---------+\n| sell_date| products|null_sell|\n+----------+--------------------+---------+\n|2020-05-30|[T-Shirt, Basketb...| 3|\n|2020-06-01| [Pencil, Book]| 2|\n|2020-06-02| [Mask]| 1|\n+----------+--------------------+---------+\n\n"
64 | ]
65 | }
66 | ],
67 | "source": [
68 | "transfdf = df.groupBy(\"sell_date\").agg(collect_set(\"product\").alias(\"products\"),size(collect_set(\"product\")).alias(\"null_sell\"))\n",
69 | "transfdf.show()"
70 | ]
71 | }
72 | ],
73 | "metadata": {
74 | "application/vnd.databricks.v1+notebook": {
75 | "computePreferences": null,
76 | "dashboards": [],
77 | "environmentMetadata": {
78 | "base_environment": "",
79 | "client": "1"
80 | },
81 | "language": "python",
82 | "notebookMetadata": {
83 | "mostRecentlyExecutedCommandWithImplicitDF": {
84 | "commandId": 1835178097274309,
85 | "dataframes": [
86 | "_sqldf"
87 | ]
88 | },
89 | "pythonIndentUnit": 4
90 | },
91 | "notebookName": "Untitled Notebook 2025-01-09 10:25:48",
92 | "widgets": {}
93 | }
94 | },
95 | "nbformat": 4,
96 | "nbformat_minor": 0
97 | }
98 |
--------------------------------------------------------------------------------
/Scenerio4.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-4")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [(1, "Mark Ray", "AB"),
12 | (2, "Peter Smith", "CD"),
13 | (1, "Mark Ray", "EF"),
14 | (2, "Peter Smith", "GH"),
15 | (2, "Peter Smith", "CD"),
16 | (3, "Kate", "IJ")]
17 | myschema = ["custid", "custname", "address"]
18 | df = spark.createDataFrame(data, schema=myschema)
19 | df.show()
20 |
21 | # Through SQL
22 | df.createOrReplaceTempView("custtab")
23 |
24 | spark.sql(
25 | "select custid,custname,collect_set(address) as address from custtab group by custid,custname order by custid").show()
26 |
27 | # Through DSL
28 | finaldf = df.groupBy("custid", "custname").agg(collect_set("address").alias("address")).orderBy("custid").show()
29 |
--------------------------------------------------------------------------------
/Scenerio5.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-5")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data1 = [
12 | (1, "abc", 31, "abc@gmail.com"),
13 | (2, "def", 23, "defyahoo.com"),
14 | (3, "xyz", 26, "xyz@gmail.com"),
15 | (4, "qwe", 34, "qwegmail.com"),
16 | (5, "iop", 24, "iop@gmail.com")
17 | ]
18 | myschema1 = ["id", "name", "age", "email"]
19 | df1 = spark.createDataFrame(data1, schema=myschema1)
20 | df1.show()
21 |
22 | data2 = [
23 | (11, "jkl", 22, "abc@gmail.com", 1000),
24 | (12, "vbn", 33, "vbn@yahoo.com", 3000),
25 | (13, "wer", 27, "wer", 2000),
26 | (14, "zxc", 30, "zxc.com", 2000),
27 | (15, "lkj", 29, "lkj@outlook.com", 2000)
28 | ]
29 | myschema2 = ["id", "name", "age", "email", "salary"]
30 | df2 = spark.createDataFrame(data2, schema=myschema2)
31 | df2.show()
32 |
33 | # number of partitions in df1
34 | partcount = df1.rdd.getNumPartitions()
35 | print("Number of partitions: " + str(partcount))
36 |
37 | df3 = df1.withColumn("salary", lit(1000))
38 | df3.show()
39 |
40 | # append df2 and df3, and form df4
41 | df4 = df2.union(df3).orderBy(col("id"))
42 | df4.show()
43 |
44 | # Remove records which have invalid email from df4, emails with @ are considered to be valid.
45 | rmdf = df4.filter(col("email").rlike("@"))
46 | rmdf.show()
47 |
48 | #Write the cleaned dataframe (rmdf) to a target location, partitioned by salary.
49 | rmdf.write.format("parquet").partitionBy("salary").save("D:/BigData/Processed Datasets/interdata")
50 |
51 |
--------------------------------------------------------------------------------
/Scenerio6.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-6")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [
12 | ("1", "a", "10000"),
13 | ("2", "b", "5000"),
14 | ("3", "c", "15000"),
15 | ("4", "d", "25000"),
16 | ("5", "e", "50000"),
17 | ("6", "f", "7000")
18 | ]
19 | myschema = ["empid","name","salary"]
20 | df = spark.createDataFrame(data,schema=myschema)
21 | df.show()
22 |
23 | #Through SQL
24 | df.createOrReplaceTempView("emptab")
25 | spark.sql("select *, case when salary > 10000 then 'Manager' else 'Employee' end as Designation from emptab").show()
26 |
27 | #Through DSL
28 | finaldf = df.withColumn("Designation", expr("case when salary > 10000 then 'Manager' else 'Employee' end"))
29 | finaldf.show()
--------------------------------------------------------------------------------
/Scenerio7.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-7")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [
12 | (1, 100, 2010, 25, 5000),
13 | (2, 100, 2011, 16, 5000),
14 | (3, 100, 2012, 8, 5000),
15 | (4, 200, 2010, 10, 9000),
16 | (5, 200, 2011, 15, 9000),
17 | (6, 200, 2012, 20, 7000),
18 | (7, 300, 2010, 20, 7000),
19 | (8, 300, 2011, 18, 7000),
20 | (9, 300, 2012, 20, 7000)
21 | ]
22 | myschema = ["sale_id", "product_id", "year", "quantity", "price"]
23 | df = spark.createDataFrame(data, schema=myschema)
24 | df.show()
25 |
26 | #Through SQL
27 | df.createOrReplaceTempView("salestab")
28 | spark.sql("SELECT * FROM (SELECT *, DENSE_RANK() OVER (PARTITION BY year ORDER BY quantity DESC) AS rank FROM salestab) AS rankdf WHERE rank = 1 ORDER BY sale_id").show()
29 |
30 | #Through DSL
31 | win = Window.partitionBy("year").orderBy(col("quantity").desc())
32 |
33 | rankdf = df.withColumn("rank", dense_rank().over(win))
34 | rankdf.show()
35 |
36 | finaldf = rankdf.filter(col("rank") == 1).drop("rank").orderBy("sale_id").show()
37 |
--------------------------------------------------------------------------------
/Scenerio8.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-8")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [
12 | ("India",),
13 | ("Pakistan",),
14 | ("SriLanka",)
15 | ]
16 | myschema = ["teams"]
17 | df = spark.createDataFrame(data, schema=myschema)
18 | df.show()
19 |
20 | # Through SQL
21 | df.createOrReplaceTempView("crickettab")
22 |
23 | # self join query for reference - select a.teams,b.teams from crickettab a inner join crickettab b on a.teams < b.teams
24 |
25 | spark.sql(
26 | "select concat(a.teams, ' Vs ', b.teams) as matches from crickettab a inner join crickettab b on a.teams < b.teams").show()
27 |
28 | # Through DSL
29 |
30 | joindf = df.alias("a").join(df.alias("b"), col("a.teams") < col("b.teams"), "inner")
31 | joindf.show()
32 |
33 | finaldf = joindf.withColumn("matches", expr("concat(a.teams,' Vs ',b.teams)")).drop("teams", "teams").show()
34 |
--------------------------------------------------------------------------------
/Scenerio9.py:
--------------------------------------------------------------------------------
1 | from pyspark import SparkConf, SparkContext
2 | from pyspark.sql import SparkSession
3 | from pyspark.sql.types import *
4 | from pyspark.sql.functions import *
5 | from pyspark.sql.window import *
6 |
7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-9")
8 | sc = SparkContext(conf=conf)
9 | sc.setLogLevel("ERROR")
10 | spark = SparkSession.builder.getOrCreate()
11 | data = [
12 | ("a", [1, 1, 1, 3]),
13 | ("b", [1, 2, 3, 4]),
14 | ("c", [1, 1, 1, 1, 4]),
15 | ("d", [3])
16 | ]
17 | df = spark.createDataFrame(data, ["name", "rank"])
18 | df.show()
19 |
20 | explodedf = df.withColumn("rank", explode(col("rank")))
21 | explodedf.show()
22 |
23 | filtdf = explodedf.filter(col("rank") == 1)
24 | filtdf.show()
25 |
26 | countdf = filtdf.groupBy("name").agg(count("*").alias("count"))
27 | countdf.show()
28 |
29 | finaldf = countdf.select(col("name")).first()[0]
30 | print(finaldf)
31 |
--------------------------------------------------------------------------------
/pom.xml:
--------------------------------------------------------------------------------
1 | <project xmlns="http://maven.apache.org/POM/4.0.0" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
2 |   <modelVersion>4.0.0</modelVersion>
3 |   <groupId>InterviewScenerios</groupId>
4 |   <artifactId>InterviewScenerios</artifactId>
5 |   <version>0.0.1-SNAPSHOT</version>
6 |   <build>
7 |     <sourceDirectory>src</sourceDirectory>
8 |     <plugins>
9 |       <plugin>
10 |         <artifactId>maven-compiler-plugin</artifactId>
11 |         <version>3.6.1</version>
12 |         <configuration>
13 |           <source>1.8</source>
14 |           <target>1.8</target>
15 |         </configuration>
16 |       </plugin>
17 |     </plugins>
18 |   </build>
19 |
20 |   <dependencies>
21 |     <dependency>
22 |       <groupId>org.apache.spark</groupId>
23 |       <artifactId>spark-sql_2.11</artifactId>
24 |       <version>2.4.7</version>
25 |       <scope>provided</scope>
26 |     </dependency>
27 |
28 |     <dependency>
29 |       <groupId>org.apache.spark</groupId>
30 |       <artifactId>spark-core_2.11</artifactId>
31 |       <version>2.4.7</version>
32 |     </dependency>
33 |   </dependencies>
34 | </project>
35 |
--------------------------------------------------------------------------------
/src/pack/Scenerio1.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | object Scenerio1 {
8 | def main(args: Array[String]): Unit = {
9 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio1")
10 | val sc = new SparkContext(conf)
11 | sc.setLogLevel("ERROR")
12 | val spark = SparkSession.builder().getOrCreate()
13 | import spark.implicits._
14 | val df = Seq(
15 | ("001", "Monika", "Arora", 100000, "2014-02-20 09:00:00", "HR"),
16 | ("002", "Niharika", "Verma", 300000, "2014-06-11 09:00:00", "Admin"),
17 | ("003", "Vishal", "Singhal", 300000, "2014-02-20 09:00:00", "HR"),
18 | ("004", "Amitabh", "Singh", 500000, "2014-02-20 09:00:00", "Admin"),
19 | ("005", "Vivek", "Bhati", 500000, "2014-06-11 09:00:00", "Admin"))
20 | .toDF("workerid", "firstname", "lastname", "salary", "joiningdate", "depart")
21 |
22 | df.show()
23 | //Through SQL Query
24 | df.createOrReplaceTempView("worktab")
25 |
26 | spark.sql("select a.workerid,a.firstname,a.lastname,a.salary,a.joiningdate,a.depart from worktab a, worktab b where a.salary=b.salary and a.workerid !=b.workerid").show()
27 | //Through Spark DSL
28 | val finaldf = df.as("a").join(df.as("b"), $"a.salary" === $"b.salary" && $"a.workerid" =!= $"b.workerid").select($"a.workerid", $"a.firstname", $"a.lastname", $"a.salary", $"a.joiningdate", $"a.depart").show()
29 |
30 | //Another way
31 |     val finaldf2 = df.as("a").join(df.as("b")).where(col("a.salary")===col("b.salary") && col("a.workerid") =!= col("b.workerid")).select($"a.workerid",$"a.firstname",$"a.lastname",$"a.salary",$"a.joiningdate",$"a.depart").show()
32 | }
33 | }
34 |
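
The duplicate-salary lookup can also be written without a self-join. A minimal sketch, assuming the same df, spark session and imports as above (only the Window import is new):

    import org.apache.spark.sql.expressions.Window

    // count how many workers share each salary and keep only the shared ones
    val bySalary = Window.partitionBy("salary")
    df.withColumn("cnt", count(lit(1)).over(bySalary))
      .filter(col("cnt") > 1)
      .drop("cnt")
      .show()
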
--------------------------------------------------------------------------------
/src/pack/Scenerio10.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio10 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio10")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | (1, 300, "31-Jan-2021"),
18 | (1, 400, "28-Feb-2021"),
19 | (1, 200, "31-Mar-2021"),
20 | (2, 1000, "31-Oct-2021"),
21 | (2, 900, "31-Dec-2021"))
22 | .toDF("empid", "commissionamt", "monthlastdate")
23 |
24 | df.show()
25 |
26 | val maxdatedf = df.groupBy(col("empid").as("empid1")).agg(max("monthlastdate").as("maxdate"))
27 | maxdatedf.show()
28 |
29 | val joindf = df.join(maxdatedf, df("empid") === maxdatedf("empid1") && df("monthlastdate") === maxdatedf("maxdate"), "inner").drop("empid1", "maxdate").show()
30 |
31 | }
32 | }
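
A window max avoids the separate aggregate-and-join step; a small sketch assuming the same df and the expressions._ import above. Note that monthlastdate is a string here, so max compares it lexicographically, exactly as the groupBy version does:

    val byEmp = Window.partitionBy("empid")
    df.withColumn("maxdate", max("monthlastdate").over(byEmp))
      .filter(col("monthlastdate") === col("maxdate"))
      .drop("maxdate")
      .show()
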
--------------------------------------------------------------------------------
/src/pack/Scenerio11.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio11 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio11")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 | val df = Seq(
16 | (1, "Jhon", 4000),
17 | (2, "Tim David", 12000),
18 | (3, "Json Bhrendroff", 7000),
19 | (4, "Jordon", 8000),
20 | (5, "Green", 14000),
21 | (6, "Brewis", 6000)).toDF("emp_id", "emp_name", "salary")
22 | df.show()
23 |
24 | //Through SQL
25 | df.createOrReplaceTempView("emptab")
26 | spark.sql("select *,case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end as grade from emptab ").show()
27 |
28 | //Through DSL
29 | val finaldf = df.withColumn("grade", expr("case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end")).show()
30 | }
31 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio12.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio12 {
9 |
10 |   //UDF for masking emails: email(0) keeps the first character (index 0) and email.substring(8) keeps everything from index 8 to the end of the string
11 | def maskEmail(email: String): String = {
12 | email(0) + "**********" + email.substring(8)
13 | }
14 |
15 |   //UDF for masking mobile numbers: mobile.substring(0, 2) keeps the first two characters and mobile.substring(mobile.length - 3) keeps the last three, since length - 3 is the index where the last three characters start
16 |
17 | def maskMobile(mobile: String): String = {
18 | mobile.substring(0, 2) + "*****" + mobile.substring(mobile.length - 3)
19 | }
20 |
21 | def main(args: Array[String]): Unit = {
22 |     val conf = new SparkConf().setMaster("local").setAppName("Scenerio12")
23 | val sc = new SparkContext(conf)
24 | sc.setLogLevel("ERROR")
25 | val spark = SparkSession.builder().getOrCreate()
26 | import spark.implicits._
27 |
28 | val maskEmailUDF = udf[String, String](maskEmail)
29 | val maskMobileUDF = udf[String, String](maskMobile)
30 |
31 | val df = Seq(("Renuka1992@gmail.com", "9856765434"), ("anbu.arasu@gmail.com", "9844567788")).toDF("email", "mobile")
32 | df.show()
33 |
34 | val maskedDF = df.withColumn("email", maskEmailUDF(col("email")))
35 | .withColumn("mobile", maskMobileUDF(col("mobile")))
36 | maskedDF.show()
37 | }
38 | }
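
The same masking can also be done with built-in column functions instead of UDFs. A sketch using the same fixed offsets as the UDFs above, so it likewise assumes the email has at least 8 characters and the mobile number at least 5 digits:

    // UDF-free variant of the masking logic
    val maskedDF2 = df
      .withColumn("email", concat(substring(col("email"), 1, 1), lit("**********"), expr("substring(email, 9)")))
      .withColumn("mobile", concat(substring(col("mobile"), 1, 2), lit("*****"), expr("substring(mobile, length(mobile) - 2)")))
    maskedDF2.show()
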
--------------------------------------------------------------------------------
/src/pack/Scenerio13.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio13 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio13")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | (1, "Jhon", "Development"),
18 | (2, "Tim", "Development"),
19 | (3, "David", "Testing"),
20 | (4, "Sam", "Testing"),
21 | (5, "Green", "Testing"),
22 | (6, "Miller", "Production"),
23 | (7, "Brevis", "Production"),
24 | (8, "Warner", "Production"),
25 | (9, "Salt", "Production")).toDF("emp_id", "emp_name", "dept")
26 | df.show()
27 |
28 | //Through SQL
29 | df.createOrReplaceTempView("emptab")
30 | spark.sql("SELECT dept, COUNT(*) AS total FROM emptab GROUP BY dept").show()
31 |
32 | //Through DSL
33 | val finaldf = df.groupBy(col("dept")).agg(count("*").as("total")).show()
34 | }
35 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio14.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio14 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio14")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq((203040, "rajesh", 10, 20, 30, 40, 50)).toDF("rollno", "name", "telugu", "english", "maths", "science", "social")
17 | df.show()
18 |
19 | //Through SQL
20 | df.createOrReplaceTempView("marks")
21 | spark.sql("select *, (telugu+english+maths+science+social) as total from marks").show()
22 |
23 | //Through DSL
24 | val finaldf = df.withColumn("total", expr("telugu+english+maths+science+social")).show()
25 | }
26 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio15.scala:
--------------------------------------------------------------------------------
1 | package pack
2 |
3 | object Scenerio15 {
4 | def main(args: Array[String]): Unit = {
5 | val l1 = List(2, 3, 4, 5)
6 | val l2 = List(6, 7, 8, 9)
7 | //append
8 | val appendlst = l1 ::: l2
9 | println(appendlst)
10 |
11 | //extending list
12 | val extendlst = l1 ++ l2
13 | println(extendlst)
14 | }
15 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio16.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio16 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio16")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 | val df = Seq(
16 | (1, "Jhon", "Testing", 5000),
17 | (2, "Tim", "Development", 6000),
18 | (3, "Jhon", "Development", 5000),
19 |       (4, "Sky", "Production", 8000)).toDF("id", "name", "dept", "salary")
20 | df.show()
21 |
22 | val finaldf = df.dropDuplicates("name").orderBy("id")
23 | finaldf.show()
24 | }
25 | }
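
dropDuplicates("name") keeps an arbitrary row per duplicated name. If the row with the lowest id should win, a deterministic sketch over the same df (Window comes from the expressions._ import above):

    val byName = Window.partitionBy("name").orderBy("id")
    df.withColumn("rn", row_number().over(byName))
      .filter(col("rn") === 1)
      .drop("rn")
      .orderBy("id")
      .show()
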
--------------------------------------------------------------------------------
/src/pack/Scenerio17.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio17 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio17")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df1 = Seq(
17 | (1, "Tim", 24, "Kerala", "India"),
18 | (2, "Asman", 26, "Kerala", "India")).toDF("emp_id", "name", "age", "state", "country")
19 | df1.show()
20 |
21 | val df2 = Seq(
22 | (1, "Tim", 24, "Comcity"),
23 | (2, "Asman", 26, "bimcity")).toDF("emp_id", "name", "age", "address")
24 | df2.show()
25 |
26 | val findf = df1.join(df2, Seq("emp_id", "name", "age"), "outer")
27 | findf.show()
28 | }
29 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio18.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio18 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio18")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val inputdf = Seq("The Social Dilemma").toDF("word")
17 | inputdf.show()
18 | val reverseudf = udf((sentence: String) => sentence.split(" ").map(_.reverse).mkString(" "))
19 | val outputdf = inputdf.withColumn("reverse word", reverseudf($"word")).drop("word")
20 | outputdf.show()
21 | }
22 | }
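
On Spark 2.4+ the word-by-word reversal can also be done without a UDF via higher-order SQL functions; a sketch over the same inputdf:

    // split into words, reverse each word, join back with spaces
    inputdf.withColumn("reverse word",
        expr("array_join(transform(split(word, ' '), w -> reverse(w)), ' ')"))
      .drop("word")
      .show()
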
--------------------------------------------------------------------------------
/src/pack/Scenerio19.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio19 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio19")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = spark.read.format("json").option("multiline", "true").load("dbfs:/FileStore/scen.json")
17 | df.printSchema()
18 | val finaldf = df.withColumn("multiMedia", explode(col("multiMedia"))).withColumn("dislikes", expr("likeDislike.dislikes")).withColumn("likes", expr("likeDislike.likes")).withColumn("userAction", expr("likeDislike.userAction")).withColumn("createAt", expr("multiMedia.createAt")).withColumn("description", expr("multiMedia.description")).withColumn("id", expr("multiMedia.id")).withColumn("likeCount", expr("multiMedia.likeCount")).withColumn("mediatype", expr("multiMedia.mediatype")).withColumn("name", expr("multiMedia.name")).withColumn("place", expr("multiMedia.place")).withColumn("url", expr("multiMedia.url")).drop("likeDislike", "multiMedia")
19 | println("flat Schema")
20 | finaldf.printSchema()
21 | finaldf.show()
22 | }
23 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio2.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio2 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-2")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | (1, "1-Jan", "Ordered"),
18 | (1, "2-Jan", "dispatched"),
19 | (1, "3-Jan", "dispatched"),
20 | (1, "4-Jan", "Shipped"),
21 | (1, "5-Jan", "Shipped"),
22 | (1, "6-Jan", "Delivered"),
23 | (2, "1-Jan", "Ordered"),
24 | (2, "2-Jan", "dispatched"),
25 | (2, "3-Jan", "shipped")).toDF("orderid", "statusdate", "status")
26 |
27 | df.show()
28 |
29 | //Through SQL
30 | df.createOrReplaceTempView("ordertab")
31 | spark.sql("select * from ordertab where status = 'dispatched' and orderid in(select orderid from ordertab where status = 'Ordered')").show()
32 |
33 | //Through DSL
34 | val result = df.filter(
35 | col("status") === "dispatched" &&
36 | col("orderid").isin(
37 | df.filter(col("status") === "Ordered").select("orderid").map(_.getInt(0)).collect(): _*))
38 | result.show()
39 |
40 | }
41 | }
42 |
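
Collecting the ordered ids to the driver works for this small sample; a left-semi join keeps the same logic inside Spark. A sketch over the same df:

    // orders that have an 'Ordered' row
    val orderedIds = df.filter(col("status") === "Ordered").select("orderid")

    // dispatched rows whose orderid also appears among the ordered ones
    df.filter(col("status") === "dispatched")
      .join(orderedIds, Seq("orderid"), "left_semi")
      .show()
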
--------------------------------------------------------------------------------
/src/pack/Scenerio20.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio20 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio20")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = spark.read.format("json").option("multiline", "true").load("dbfs:/FileStore/flatjson/part-00000-tid-3675309499584050336-b8650962-dec3-4fe4-a204-c914090f019e-21-1-c000.json")
17 | df.printSchema()
18 | val compdf = df.select(
19 | col("code"),
20 | col("commentCount"),
21 | col("createdAt"),
22 | col("description"),
23 | col("feedsComment"),
24 | col("id"),
25 | col("imagePaths"),
26 | col("images"),
27 | col("isdeleted"),
28 | col("lat"),
29 | struct(col("dislikes"), col("likes"), col("userAction")).as("likeDislike"),
30 | col("lng"),
31 | col("location"),
32 | col("mediatype"),
33 | col("msg"),
34 | array(
35 | struct(
36 | col("createAt"),
37 | col("description"),
38 | col("id"), col("likeCount"),
39 | col("mediatype"),
40 | col("name"),
41 | col("place"),
42 | col("url")).as("element")).as("multiMedia"),
43 | col("name"),
44 | col("profilePicture"),
45 | col("title"),
46 | col("userId"),
47 | col("videoUrl"),
48 | col("totalFeed"))
49 | compdf.printSchema()
50 | }
51 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio21.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio21 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio21")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | ("SEA", "SF", 300),
18 | ("CHI", "SEA", 2000),
19 | ("SF", "SEA", 300),
20 | ("SEA", "CHI", 2000),
21 | ("SEA", "LND", 500),
22 | ("LND", "SEA", 500),
23 | ("LND", "CHI", 1000),
24 | ("CHI", "NDL", 180)).toDF("from", "to", "dist")
25 | df.show()
26 | //Through SQL
27 | df.createOrReplaceTempView("trip")
28 | spark.sql("""SELECT r1.from, r1.to, (r1.dist + r2.dist) AS roundtrip_dist
29 | FROM trip r1
30 | JOIN trip r2 ON r1.from = r2.to AND r1.to = r2.from
31 | WHERE r1.from < r1.to
32 | """).show()
33 |
34 | //Through DSL
35 | val finaldf = df.as("r1").join(
36 | df.as("r2"),
37 | (col("r1.from") === col("r2.to")) && (col("r1.to") === col("r2.from"))).where(
38 | col("r1.from") < col("r1.to")).select(col("r1.from"), col("r1.to"),
39 | (col("r1.dist") + col("r2.dist")).alias("roundtrip_dist"))
40 | finaldf.show()
41 | }
42 | }
43 |
--------------------------------------------------------------------------------
/src/pack/Scenerio22.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio22 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio22")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 | val df = Seq(
16 | (1, "26-May", 100),
17 | (1, "27-May", 200),
18 | (1, "28-May", 300),
19 | (2, "29-May", 400),
20 | (3, "30-May", 500),
21 | (3, "31-May", 600)).toDF("pid", "date", "price")
22 | df.show()
23 |
24 | //Through SQL
25 | df.createOrReplaceTempView("ordertab")
26 | spark.sql("select pid,date,price, sum(price) over(partition by(pid) order by(price)) as new_price from ordertab").show()
27 |
28 | //Through DSL
29 | val wn = Window.partitionBy("pid").orderBy("price")
30 | val finaldf = df.withColumn("new_price", sum("price") over (wn)).show()
31 | }
32 | }
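
Because the window above has an orderBy but no explicit frame, it uses a range frame, so rows with the same price inside a pid are summed together. For a strict row-by-row running total, a sketch with an explicit rows frame (same df and imports):

    val wnRows = Window.partitionBy("pid").orderBy("price")
      .rowsBetween(Window.unboundedPreceding, Window.currentRow)
    df.withColumn("new_price", sum("price").over(wnRows)).show()
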
--------------------------------------------------------------------------------
/src/pack/Scenerio23.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio23 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio23")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq((1, 5), (2, 6), (3, 5), (3, 6), (1, 6)).toDF("customer_id", "product_key")
17 | df.show()
18 | val df2 = Seq((5), (6)).toDF("product_key")
19 | df2.show()
20 | val finaldf = df.join(df2, Seq("product_key"), "inner").drop("product_key").distinct().filter(col("customer_id") =!= 2)
21 | finaldf.show()
22 | }
23 | }
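
The filter on customer_id =!= 2 is hard-coded to this sample. A more general sketch of the usual intent, customers who bought every product listed in df2, without referencing specific ids:

    val totalProducts = df2.distinct().count()
    df.join(df2, Seq("product_key"), "inner")
      .groupBy("customer_id")
      .agg(countDistinct("product_key").as("bought"))
      .filter(col("bought") === totalProducts)
      .select("customer_id")
      .show()
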
--------------------------------------------------------------------------------
/src/pack/Scenerio24.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio24 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio24")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | (1, "home"),
18 | (1, "products"),
19 | (1, "checkout"),
20 | (1, "confirmation"),
21 | (2, "home"),
22 | (2, "products"),
23 | (2, "cart"),
24 | (2, "checkout"),
25 | (2, "confirmation"),
26 | (2, "home"),
27 | (2, "products")).toDF("userid", "page")
28 | df.show()
29 |
30 | //Through SQL
31 | df.createOrReplaceTempView("pagetab")
32 | spark.sql("select userid, collect_list(page) as pages from pagetab group by userid").show()
33 |
34 | //Through DSL
35 | val finaldf = df.groupBy("userid").agg(collect_list("page").as("pages")).show(false)
36 | }
37 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio25.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio25 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio25")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = spark.read.format("csv").option("header", "true").option("mode","DROPMALFORMED").load("D:/BigData/Datasets/Scenerio25.csv")
17 | df.show()
18 | }
19 | }
--------------------------------------------------------------------------------
/src/pack/Scenerio26.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio26 {
9 |
10 | def main(args: Array[String]): Unit = {
11 |     val conf = new SparkConf().setMaster("local").setAppName("Scenerio26")
12 | val sc = new SparkContext(conf)
13 | sc.setLogLevel("ERROR")
14 | val spark = SparkSession.builder().getOrCreate()
15 | import spark.implicits._
16 |
17 | val sourcedf = Seq(
18 | (1, "A"),
19 | (2, "B"),
20 | (3, "C"),
21 | (4, "D")).toDF("id", "name")
22 | sourcedf.show()
23 |
24 | val targetdf = Seq(
25 | (1, "A"),
26 | (2, "B"),
27 | (4, "X"),
28 | (5, "F")).toDF("id1", "name1")
29 | targetdf.show()
30 |
31 | sourcedf.createOrReplaceTempView("sourcetab")
32 | targetdf.createOrReplaceTempView("targettab")
33 |
34 | spark.sql("""SELECT COALESCE(s.id, t.id1) AS id,
35 | CASE
36 | WHEN s.name IS NULL THEN 'new in target'
37 | WHEN t.name1 IS NULL THEN 'new in source'
38 | WHEN s.name != t.name1 THEN 'mismatch'
39 | END AS comment
40 | FROM sourcetab s
41 | FULL OUTER JOIN targettab t ON s.id = t.id1
42 | WHERE s.name != t.name1 OR s.name IS NULL OR t.name1 IS NULL
43 | """).show()
44 |
45 | //Joining two dataframes
46 |
47 | val joindf = sourcedf.join(targetdf, col("id") === col("id1"), "outer")
48 | joindf.show()
49 |
50 | //filtering the columns which are not equal and null
51 |
52 | val filterdf = joindf.filter(col("name") =!= col("name1") || col("name").isNull || col("name1").isNull)
53 | filterdf.show()
54 |
55 |     //coalesce returns the first non-null value among its arguments, so a null id from one side of the join is filled with the id from the other side
56 |
57 | val nullfildf = filterdf.withColumn("id", coalesce(col("id"), col("id1"))).drop("id1")
58 | nullfildf.show()
59 |
60 | val finaldf = nullfildf.withColumn("comment", expr("case when name is null then 'new in target' when name1 is null then 'new in source' when name!=name1 then 'mismatch' end")).drop("name", "name1")
61 | finaldf.show()
62 |
63 | }
64 | }
65 |
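
Since the full outer join never leaves name and name1 null at the same time, the three-part filter can also be written with null-safe equality; a sketch over the same joindf:

    // keep rows where the two names are not null-safe-equal
    val filterdf2 = joindf.filter(!(col("name") <=> col("name1")))
    filterdf2.show()
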
--------------------------------------------------------------------------------
/src/pack/Scenerio27.scala:
--------------------------------------------------------------------------------
1 | package pack
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.sql.expressions._
9 |
10 | object Scenerio27 {
11 | def main(args:Array[String]):Unit = {
12 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio27")
13 | val sc = new SparkContext(conf)
14 | sc.setLogLevel("ERROR")
15 | val spark = SparkSession.builder().getOrCreate()
16 | import spark.implicits._
17 |
18 | val df = Seq((1,60000,2018),(1,70000,2019),(1,80000,2020),(2,60000,2018),(2,65000,2019),(2,65000,2020),(3,60000,2018),(3,65000,2019)).toDF("empid","salary","year")
19 | df.show()
20 |
21 | val wn = Window.partitionBy("empid").orderBy(col("year"))
22 |
23 | val lagdf = df.withColumn("diff",lag("salary",1) over(wn))
24 | lagdf.show()
25 |
26 | val finaldf = lagdf.withColumn("incresalary",expr("salary - diff")).drop("diff").na.fill(0).orderBy("empid","year")
27 | finaldf.show()
28 |
29 |
30 | }
31 | }
32 |
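
The na.fill(0) only covers the first year per empid, where lag returns null; a sketch folding that into one expression with coalesce (same df and wn as above):

    df.withColumn("incresalary",
        coalesce(col("salary") - lag("salary", 1).over(wn), lit(0)))
      .orderBy("empid", "year")
      .show()
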
--------------------------------------------------------------------------------
/src/pack/Scenerio28.scala:
--------------------------------------------------------------------------------
1 | package pack
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.sql.expressions._
9 |
10 | object Scenerio28 {
11 |
12 | def main(args: Array[String]): Unit = {
13 |     val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio28")
14 | val sc = new SparkContext(conf)
15 | sc.setLogLevel("ERROR")
16 | val spark = SparkSession.builder().getOrCreate()
17 | import spark.implicits._
18 |
19 | val df = Seq(("A", "AA"), ("B", "BB"), ("C", "CC"), ("AA", "AAA"), ("BB", "BBB"), ("CC", "CCC")).toDF("child", "parent")
20 | df.show()
21 |
22 | val joindf = df.as("a").join(df.as("b"), col("a.child") === col("b.parent")).select(
23 | col("a.child").alias("child_a"),
24 | col("a.parent").alias("parent_a"),
25 | col("b.child").alias("child_b"),
26 | col("b.parent").alias("parent_b")
27 | )
28 | joindf.show()
29 |
30 | val findf = joindf.withColumnRenamed("child_a", "parent").withColumnRenamed("parent_a", "grandparent").withColumnRenamed("child_b", "child").drop("parent_b").select("child", "parent", "grandparent")
31 |
32 | findf.show()
33 |
34 | //another way
35 |
36 | val df2 = df.withColumnRenamed("child", "child1").withColumnRenamed("parent", "parent1")
37 | df2.show()
38 |
39 | val secondjoindf = df.join(df2, df("parent") === df2("child1"), "inner")
40 | secondjoindf.show()
41 |
42 | val finaldf = secondjoindf.withColumnRenamed("parent1", "grandparent").drop("child1")
43 | finaldf.show()
44 | }
45 | }
46 |
--------------------------------------------------------------------------------
/src/pack/Scenerio29.scala:
--------------------------------------------------------------------------------
1 | package pack
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.sql.expressions._
9 |
10 | object Scenerio29 {
11 |
12 | def main(args: Array[String]): Unit = {
13 |     val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio29")
14 | val sc = new SparkContext(conf)
15 | sc.setLogLevel("ERROR")
16 | val spark = SparkSession.builder().getOrCreate()
17 | import spark.implicits._
18 |
19 | val df1 = Seq((1), (2), (3)).toDF("col")
20 | df1.show()
21 |
22 | val df2 = Seq((1), (2), (3), (4), (5)).toDF("col1")
23 | df2.show()
24 |
25 | val maxdf = df1.agg(max("col").as("max"))
26 | maxdf.show()
27 |
28 | val maxsalary = maxdf.select(col("max")).first().getInt(0)
29 |
30 | val joindf = df1.join(df2, df1("col") === df2("col1"), "outer").drop("col")
31 | joindf.show()
32 |
33 | val finaldf = joindf.filter(col("col1") =!= maxsalary).withColumnRenamed("col1", "col")
34 | finaldf.show()
35 |
36 | }
37 | }
38 |
--------------------------------------------------------------------------------
/src/pack/Scenerio3.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio3 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-3")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val data = Seq(
17 | (1111, "2021-01-15", 10),
18 | (1111, "2021-01-16", 15),
19 | (1111, "2021-01-17", 30),
20 | (1112, "2021-01-15", 10),
21 | (1112, "2021-01-15", 20),
22 | (1112, "2021-01-15", 30)).toDF("sensorid", "timestamp", "values")
23 | data.show()
24 |
25 | //Through DSL
26 |
27 | val d1 = Window.partitionBy("sensorid").orderBy("values")
28 |
29 | val finaldf = data.withColumn("nextvalues", lead("values", 1) over (d1))
30 | .filter(col("nextvalues").isNotNull)
31 | .withColumn("values", expr("nextvalues-values"))
32 | .drop("nextvalues")
33 | .show()
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/pack/Scenerio30.scala:
--------------------------------------------------------------------------------
1 | package pack
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.sql.expressions._
9 |
10 | object Scenerio30 {
11 |
12 | def main(args: Array[String]): Unit = {
13 |     val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio30")
14 | val sc = new SparkContext(conf)
15 | sc.setLogLevel("ERROR")
16 | val spark = SparkSession.builder().getOrCreate()
17 | import spark.implicits._
18 |
19 | val df1 = Seq((1, "A", "A", 1000000), (2, "B", "A", 2500000), (3, "C", "G", 500000), (4, "D", "G", 800000), (5, "E", "W", 9000000), (6, "F", "W", 2000000)).toDF("emp_id", "name", "dept_id", "salary")
20 | df1.show()
21 |
22 | val df2 = Seq(("A", "AZURE"), ("G", "GCP"), ("W", "AWS")).toDF("dept_id1", "dept_name")
23 | df2.show()
24 |
25 | val joindf = df1.join(df2, df1("dept_id") === df2("dept_id1"), "inner").drop("dept_id1")
26 | joindf.show()
27 |
28 | val wn = Window.partitionBy("dept_id").orderBy(col("salary").desc)
29 |
30 | val rankdf = joindf.withColumn("rank", dense_rank() over (wn))
31 | rankdf.show()
32 |
33 | val finaldf = rankdf.filter(col("rank") === 2).drop("rank").select("emp_id", "name", "dept_name", "salary")
34 | finaldf.show()
35 | }
36 | }
37 |
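
Following the SQL-plus-DSL pattern used elsewhere in this repo, a sketch of the SQL counterpart for the second-highest salary per department (the view name below is only illustrative):

    joindf.createOrReplaceTempView("empdept")
    spark.sql("""
      SELECT emp_id, name, dept_name, salary
      FROM (SELECT *, dense_rank() OVER (PARTITION BY dept_id ORDER BY salary DESC) AS rnk FROM empdept) t
      WHERE rnk = 2
    """).show()
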
--------------------------------------------------------------------------------
/src/pack/Scenerio31.scala:
--------------------------------------------------------------------------------
1 | package pack
2 |
3 | import org.apache.spark.{SparkConf, SparkContext}
4 | import org.apache.spark.sql.SparkSession
5 | import org.apache.spark.sql._
6 | import org.apache.spark.sql.types._
7 | import org.apache.spark.sql.functions._
8 | import org.apache.spark.sql.expressions._
9 |
10 | object Scenerio31 {
11 |
12 | def main(args: Array[String]): Unit = {
13 |     val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio31")
14 | val sc = new SparkContext(conf)
15 | sc.setLogLevel("ERROR")
16 | val spark = SparkSession.builder().getOrCreate()
17 | import spark.implicits._
18 |
19 | val df = Seq(("m1", "m1,m2", "m1,m2,m3", "m1,m2,m3,m4")).toDF("col1", "col2", "col3", "col4")
20 | df.show()
21 |
22 |     val contdf = df.withColumn("col", expr("concat(col1,'-',col2,'-',col3,'-',col4)")).drop("col1", "col2", "col3", "col4")
23 | contdf.show(false)
24 |
25 | val finaldf = contdf.selectExpr("explode(split(col,'-')) as col")
26 | finaldf.show()
27 |
28 | }
29 | }
30 |
--------------------------------------------------------------------------------
/src/pack/Scenerio32 Scala.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | val df1 = Seq((1,"Veg Biryani"),(2,"Veg Fried Rice"),(3,"Kaju Fried Rice"),(4,"Chicken Biryani"),(5,"Chicken Dum Biryani"),(6,"Prawns Biryani"),(7,"Fish Biryani")).toDF("food_id","food_item")
3 | df1.show()
4 |
5 | val df2 = Seq((1,5),(2,3),(3,4),(4,4),(5,5),(6,4),(7,4)).toDF("food_id","rating")
6 | df2.show()
7 |
8 |
9 | // COMMAND ----------
10 |
11 | import org.apache.spark.sql.functions._
12 |
13 | val joindf = df1.join(df2, df1("food_id") === df2("food_id"), "inner").select(df1("food_id"), df1("food_item"), df2("rating"))
14 | joindf.show()
15 |
16 |
17 | // COMMAND ----------
18 |
19 | val finaldf = joindf.withColumn("stars(out of 5)",expr("repeat('*',rating)"))
20 | finaldf.show()
21 |
--------------------------------------------------------------------------------
/src/pack/Scenerio33.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | val familydf = Seq(("c00dac11bde74750b4d207b9c182a85f", "Alex Thomas", 9),("eb6f2d3426694667ae3e79d6274114a4", "Chris Gray", 2),("3f7b5b8e835d4e1c8b3e12e964a741f3", "Emily Johnson", 4),("9a345b079d9f4d3cafb2d4c11d20f8ce", "Michael Brown", 6),("e0a5f57516024de2a231d09de2cbe9d1", "Jessica Wilson", 3)).toDF("id","name","family_size")
3 | familydf.show()
4 |
5 | val countrydf = Seq(("023fd23615bd4ff4b2ae0a13ed7efec9", "Bolivia", 2 , 4),("be247f73de0f4b2d810367cb26941fb9", "Cook Islands", 4,8),("3e85ab80a6f84ef3b9068b21dbcc54b3", "Brazil", 4,7),("e571e164152c4f7c8413e2734f67b146", "Australia", 5,9),("f35a7bb7d44342f7a8a42a53115294a8", "Canada", 3,5),("a1b5a4b5fc5f46f891d9040566a78f27", "Japan", 10,12)).toDF("id","name","min_size","max_size")
6 | countrydf.show()
7 |
8 | // COMMAND ----------
9 |
10 | import org.apache.spark.sql.functions._
11 |
12 | val joindf = familydf.join(countrydf, familydf("family_size") >= countrydf("min_size") && familydf("family_size") <= countrydf("max_size"),"inner").select(familydf("name"), familydf("family_size"), countrydf("name").as("country_name"), countrydf("min_size"), countrydf("max_size"))
13 | joindf.show()
14 |
15 |
16 | // COMMAND ----------
17 |
18 | val groupdf = joindf.groupBy(familydf("name")).agg(count("*").alias("number_of_countries"))
19 | groupdf.show()
20 |
21 | // COMMAND ----------
22 |
23 | val finaldf = groupdf.agg(expr("max(number_of_countries)").alias("number_of_countries"))
24 | finaldf.show()
25 |
26 | // COMMAND ----------
27 |
28 | import org.apache.spark.sql.expressions._
29 |
30 | //another way
31 | val wn = Window.orderBy(desc("number_of_countries"))
32 |
33 | val rankdf = groupdf.withColumn("rank",row_number() over(wn))
34 | rankdf.show()
35 |
36 | val finaldf2 = rankdf.filter(col("rank")===1).drop("rank")
37 | finaldf2.show()
38 |
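
If the family name should stay next to the maximum count, a simpler variant than agg(max(...)), which drops the name; ties are broken arbitrarily, as with row_number:

    groupdf.orderBy(desc("number_of_countries")).limit(1).show()
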
--------------------------------------------------------------------------------
/src/pack/Scenerio35.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | import org.apache.spark._
3 | import org.apache.spark.sql._
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types._
6 | import spark.implicits._
7 |
8 | //creating the dataframe df1
9 | val df1 = Seq((1,"Jhon",Some(17)),(2,"Maria",Some(20)),(3,"Raj",None),(4,"Rachel",Some(18))).toDF("id","name","age")
10 | df1.show()
11 |
12 | // COMMAND ----------
13 |
14 | //Count null entries in each column
15 | val nullCounts = df1.select(df1.columns.map(c => sum(col(c).isNull.cast("int")).alias(c)): _*)
16 | nullCounts.show()
17 |
18 | // COMMAND ----------
19 |
20 | //Remove the row with null entries and store it in a new dataframe named df2
21 | val df2 = df1.filter(col("age").isNull)
22 | df2.show()
23 |
24 | // COMMAND ----------
25 |
26 | //create a new dataframe df3
27 | val df3 = Seq((1,"seatle",82),(2,"london",75),(3,"banglore",60),(4,"boston",90)).toDF("id","city","code")
28 | df3.show()
29 |
30 | //join the df1 and df3
31 | val mergedf = df1.join(df3, df1("id") === df3("id"), "inner").select(df1("id"), df1("name"), df1("age"), df3("city"), df3("code"))
32 | mergedf.show()
33 |
34 | // COMMAND ----------
35 |
36 | //fill the null value with the mean age of students
37 |
38 | //calculate the mean age
39 | val meanage = mergedf.select(mean("age")).first().getDouble(0)
40 |
41 | // Fill null values in the 'age' column with the mean age
42 | val filldf = mergedf.na.fill(Map("age" -> meanage))
43 |
44 | // Show the resulting DataFrame
45 | filldf.show()
46 |
47 | // COMMAND ----------
48 |
49 | //Get the students who are 18 years or older
50 | val filterdf = filldf.filter(col("age")>= 18)
51 | filterdf.show()
52 |
--------------------------------------------------------------------------------
/src/pack/Scenerio36.scala:
--------------------------------------------------------------------------------
1 | // Databricks notebook source
2 | import org.apache.spark._
3 | import org.apache.spark.sql._
4 | import org.apache.spark.sql.functions._
5 | import org.apache.spark.sql.types._
6 | import spark.implicits._
7 |
8 | val data = Seq(("2020-05-30","Headphone"),("2020-06-01","Pencil"),("2020-06-02","Mask"),("2020-05-30","Basketball"),("2020-06-01","Book"),("2020-06-02","Mask"),("2020-05-30","T-Shirt")).toDF("sell_date","product")
9 | data.show()
10 |
11 | // COMMAND ----------
12 |
13 | val transdf = data.groupBy("sell_date").agg(collect_set("product").alias("products"),size(collect_set("product")).alias("num_sell"))
14 | transdf.show()
15 |
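
collect_set("product") is evaluated twice in the aggregation above; a sketch computing it once and taking its size afterwards (same data):

    data.groupBy("sell_date")
      .agg(collect_set("product").alias("products"))
      .withColumn("num_sell", size(col("products")))
      .show()
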
--------------------------------------------------------------------------------
/src/pack/Scenerio4.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio4 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-4")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | (1, "Mark Ray", "AB"),
18 | (2, "Peter Smith", "CD"),
19 | (1, "Mark Ray", "EF"),
20 | (2, "Peter Smith", "GH"),
21 | (2, "Peter Smith", "CD"),
22 | (3, "Kate", "IJ")).toDF("custid", "custname", "address")
23 | df.show()
24 |
25 | //Through SQL
26 | df.createOrReplaceTempView("custtab")
27 |
28 | spark.sql("select custid,custname,collect_set(address) as address from custtab group by custid,custname order by custid").show()
29 |
30 | //Through DSL
31 |
32 | val finaldf = df.groupBy("custid", "custname").agg(collect_set("address").as("address")).orderBy("custid").show()
33 |
34 | }
35 | }
36 |
--------------------------------------------------------------------------------
/src/pack/Scenerio5.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio5 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-5")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 | val df1 = Seq(
16 | (1, "abc", 31, "abc@gmail.com"),
17 | (2, "def", 23, "defyahoo.com"),
18 | (3, "xyz", 26, "xyz@gmail.com"),
19 | (4, "qwe", 34, "qwegmail.com"),
20 | (5, "iop", 24, "iop@gmail.com"))
21 | .toDF("id", "name", "age", "email")
22 | df1.show()
23 |
24 | val df2 = Seq(
25 | (11, "jkl", 22, "abc@gmail.com", 1000),
26 | (12, "vbn", 33, "vbn@yahoo.com", 3000),
27 | (13, "wer", 27, "wer", 2000),
28 | (14, "zxc", 30, "zxc.com", 2000),
29 | (15, "lkj", 29, "lkj@outlook.com", 2000))
30 | .toDF("id", "name", "age", "email", "salary")
31 | df2.show()
32 |
33 |     //number of partitions in df1
34 |     val partcount = df1.rdd.getNumPartitions
35 |     println("Number of partitions: " + partcount)
36 |
37 | val df3 = df1.withColumn("salary", lit(1000))
38 | df3.show()
39 |
40 | //append df2 and df3, and form df4
41 | val df4 = df2.union(df3).orderBy(col("id").asc)
42 | df4.show()
43 |
44 | // Remove records with invalid emails from df4; emails containing '@' are considered valid.
45 | val rmdf = df4.filter(col("email").rlike("@"))
46 | rmdf.show()
47 |
48 | // Write the cleaned DataFrame to a target location, partitioned by salary.
49 | rmdf.write.format("parquet").partitionBy("salary").save("D:/BigData/Processed Datasets/interdata")
50 |
51 |
52 | }
53 | }
54 |
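
A minimal read-back sketch for the partitioned output above; filtering on the partition column salary lets Spark prune directly to the matching salary=... directories instead of scanning the whole dataset:

// read the partitioned parquet output and filter on the partition column
val readdf = spark.read.parquet("D:/BigData/Processed Datasets/interdata")
readdf.filter(col("salary") === 2000).show()
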
--------------------------------------------------------------------------------
/src/pack/Scenerio6.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio6 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio6")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = spark.createDataFrame(Seq(
17 | ("1", "a", "10000"),
18 | ("2", "b", "5000"),
19 | ("3", "c", "15000"),
20 | ("4", "d", "25000"),
21 | ("5", "e", "50000"),
22 | ("6", "f", "7000")))
23 | .toDF("empid", "name", "salary")
24 | df.show()
25 |
26 | //Through SQL
27 | df.createOrReplaceTempView("emptab")
28 | spark.sql("select *, case when salary > 10000 then 'Manager' else 'Employee' end as Designation from emptab").show()
29 |
30 | //Through DSL
31 | val finaldf = df.withColumn("Desgination", expr("case when salary > 10000 then 'Manager' else 'Employee' end"))
32 | finaldf.show()
33 | }
34 | }
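
The same case expression can also be built with the when/otherwise column functions; a minimal sketch using the same df, with an explicit cast since salary is stored as a string in this example:

// when/otherwise is the DSL equivalent of CASE WHEN ... THEN ... ELSE ... END
val whendf = df.withColumn("Designation",
  when(col("salary").cast("int") > 10000, "Manager").otherwise("Employee"))
whendf.show()
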
--------------------------------------------------------------------------------
/src/pack/Scenerio7.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio7 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio6")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = spark.createDataFrame(Seq(
17 | (1, 100, 2010, 25, 5000),
18 | (2, 100, 2011, 16, 5000),
19 | (3, 100, 2012, 8, 5000),
20 | (4, 200, 2010, 10, 9000),
21 | (5, 200, 2011, 15, 9000),
22 | (6, 200, 2012, 20, 7000),
23 | (7, 300, 2010, 20, 7000),
24 | (8, 300, 2011, 18, 7000),
25 | (9, 300, 2012, 20, 7000)))
26 | .toDF("sale_id", "product_id", "year", "quantity", "price")
27 | df.show()
28 |
29 | //Through SQL
30 | df.createOrReplaceTempView("salestab")
31 | spark.sql("SELECT *FROM (SELECT *, DENSE_RANK() OVER (PARTITION BY year ORDER BY quantity DESC) AS rank FROM salestab) AS rankdf WHERE rank = 1 ORDER BY sale_id").show()
32 |
33 | //Through DSL
34 | val win = Window.partitionBy("year").orderBy(col("quantity").desc)
35 |
36 | val rankdf = df.withColumn("rank", dense_rank() over (win))
37 | rankdf.show()
38 |
39 | val finaldf = rankdf.filter(col("rank") === 1).drop("rank").orderBy("sale_id")
40 | finaldf.show()
41 | }
42 | }
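
Year 2012 has two rows tied at quantity 20 (sale_id 6 and 9); dense_rank keeps both, whereas row_number would arbitrarily keep only one of them. A minimal sketch of the row_number variant, reusing the same window win for comparison:

// row_number() breaks ties arbitrarily, so only one of the tied 2012 rows survives
val rowdf = df.withColumn("rn", row_number().over(win))
  .filter(col("rn") === 1)
  .drop("rn")
  .orderBy("sale_id")
rowdf.show()
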
--------------------------------------------------------------------------------
/src/pack/Scenerio8.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio8 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio8")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | ("India"),
18 | ("Pakistan"),
19 | ("SriLanka")).toDF("teams")
20 |
21 | df.show()
22 |
23 | //Through SQL
24 | df.createOrReplaceTempView("crickettab")
25 |
26 | //self join query for reference - select a.teams,b.teams from crickettab a inner join crickettab b on a.teams < b.teams
27 |
28 | spark.sql("select concat(a.teams, ' Vs ', b.teams) as matches from crickettab a inner join crickettab b on a.teams < b.teams").show()
29 |
30 | //Through DSL
31 |
32 | val joindf = df.as("a").join(df.as("b"), $"a.teams" < $"b.teams", "inner")
33 | joindf.show()
34 |
35 | val finaldf = joindf.withColumn("matches", expr("concat(a.teams,' Vs ',b.teams)")).drop("teams", "teams").show()
36 | }
37 | }
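
The join and concatenation can also be collapsed into a single select, which avoids the separate withColumn/drop step; a minimal sketch using the same df:

// build each pairing once (a.teams < b.teams) and project the match label directly
df.as("a").join(df.as("b"), $"a.teams" < $"b.teams")
  .select(concat($"a.teams", lit(" Vs "), $"b.teams").as("matches"))
  .show(false)
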
--------------------------------------------------------------------------------
/src/pack/Scenerio9.scala:
--------------------------------------------------------------------------------
1 | package pack
2 | import org.apache.spark.SparkConf
3 | import org.apache.spark.SparkContext
4 | import org.apache.spark.sql._
5 | import org.apache.spark.sql.types._
6 | import org.apache.spark.sql.functions._
7 | import org.apache.spark.sql.expressions._
8 | object Scenerio9 {
9 | def main(args: Array[String]): Unit = {
10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio9")
11 | val sc = new SparkContext(conf)
12 | sc.setLogLevel("ERROR")
13 | val spark = SparkSession.builder().getOrCreate()
14 | import spark.implicits._
15 |
16 | val df = Seq(
17 | ("a", Seq(1, 1, 1, 3)),
18 | ("b", Seq(1, 2, 3, 4)),
19 | ("c", Seq(1, 1, 1, 1, 4)),
20 | ("d", Seq(3))).toDF("name", "rank")
21 |
22 | df.show()
23 |
24 | val explodedf = df.withColumn("rank", explode(col("rank")))
25 | explodedf.show()
26 |
27 | val filtdf = explodedf.filter(col("rank") === 1)
28 | filtdf.show()
29 |
30 | val countdf = filtdf.groupBy("name").agg(count("*").as("count")).orderBy(col("count") desc)
31 | countdf.show()
32 |
33 | val finaldf = countdf.select(col("name")).first().getString(0)
34 | println(finaldf)
35 |
36 | }
37 | }
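
The explode/filter/groupBy steps can also be replaced with the filter higher-order function (available from Spark 2.4 onward), which counts the 1s inside each array without flattening it; a minimal sketch using the same df:

// size(filter(rank, x -> x = 1)) counts the elements equal to 1 in each array
val onesdf = df.withColumn("ones", expr("size(filter(rank, x -> x = 1))"))
  .orderBy(col("ones").desc)
onesdf.show()
println(onesdf.select("name").first().getString(0))
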
--------------------------------------------------------------------------------
/target/classes/pack/Scenerio1$$typecreator5$1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/target/classes/pack/Scenerio1$$typecreator5$1.class
--------------------------------------------------------------------------------
/target/classes/pack/Scenerio1$.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/target/classes/pack/Scenerio1$.class
--------------------------------------------------------------------------------
/target/classes/pack/Scenerio1.class:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/target/classes/pack/Scenerio1.class
--------------------------------------------------------------------------------