├── .cache-main ├── .classpath ├── .project ├── .settings ├── org.eclipse.jdt.core.prefs └── org.eclipse.m2e.core.prefs ├── Datasets ├── Scenerio25.csv ├── scen.json └── scen20.json ├── README.md ├── Scenerio-1.py ├── Scenerio10.py ├── Scenerio11.py ├── Scenerio12.py ├── Scenerio13.py ├── Scenerio14.py ├── Scenerio15.py ├── Scenerio16.py ├── Scenerio17.py ├── Scenerio18.py ├── Scenerio19.py ├── Scenerio2.py ├── Scenerio20.py ├── Scenerio21.py ├── Scenerio22.py ├── Scenerio23.py ├── Scenerio24.py ├── Scenerio25.py ├── Scenerio26.py ├── Scenerio27.py ├── Scenerio28.py ├── Scenerio29.py ├── Scenerio3.py ├── Scenerio30.ipynb ├── Scenerio31.ipynb ├── Scenerio32.ipynb ├── Scenerio33.ipynb ├── Scenerio34.ipynb ├── Scenerio35.ipynb ├── Scenerio36.ipynb ├── Scenerio4.py ├── Scenerio5.py ├── Scenerio6.py ├── Scenerio7.py ├── Scenerio8.py ├── Scenerio9.py ├── pom.xml ├── src └── pack │ ├── Scenerio1.scala │ ├── Scenerio10.scala │ ├── Scenerio11.scala │ ├── Scenerio12.scala │ ├── Scenerio13.scala │ ├── Scenerio14.scala │ ├── Scenerio15.scala │ ├── Scenerio16.scala │ ├── Scenerio17.scala │ ├── Scenerio18.scala │ ├── Scenerio19.scala │ ├── Scenerio2.scala │ ├── Scenerio20.scala │ ├── Scenerio21.scala │ ├── Scenerio22.scala │ ├── Scenerio23.scala │ ├── Scenerio24.scala │ ├── Scenerio25.scala │ ├── Scenerio26.scala │ ├── Scenerio27.scala │ ├── Scenerio28.scala │ ├── Scenerio29.scala │ ├── Scenerio3.scala │ ├── Scenerio30.scala │ ├── Scenerio31.scala │ ├── Scenerio32 Scala.scala │ ├── Scenerio33.scala │ ├── Scenerio35.scala │ ├── Scenerio36.scala │ ├── Scenerio4.scala │ ├── Scenerio5.scala │ ├── Scenerio6.scala │ ├── Scenerio7.scala │ ├── Scenerio8.scala │ └── Scenerio9.scala └── target └── classes └── pack ├── Scenerio1$$typecreator5$1.class ├── Scenerio1$.class └── Scenerio1.class /.cache-main: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/.cache-main -------------------------------------------------------------------------------- /.classpath: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | -------------------------------------------------------------------------------- /.project: -------------------------------------------------------------------------------- 1 | 2 | 3 | InterviewScenerios 4 | 5 | 6 | 7 | 8 | 9 | org.scala-ide.sdt.core.scalabuilder 10 | 11 | 12 | 13 | 14 | org.eclipse.m2e.core.maven2Builder 15 | 16 | 17 | 18 | 19 | 20 | org.eclipse.m2e.core.maven2Nature 21 | org.scala-ide.sdt.core.scalanature 22 | org.eclipse.jdt.core.javanature 23 | 24 | 25 | -------------------------------------------------------------------------------- /.settings/org.eclipse.jdt.core.prefs: -------------------------------------------------------------------------------- 1 | eclipse.preferences.version=1 2 | org.eclipse.jdt.core.compiler.codegen.inlineJsrBytecode=enabled 3 | org.eclipse.jdt.core.compiler.codegen.targetPlatform=1.8 4 | org.eclipse.jdt.core.compiler.codegen.unusedLocal=preserve 5 | org.eclipse.jdt.core.compiler.compliance=1.8 6 | org.eclipse.jdt.core.compiler.debug.lineNumber=generate 7 | org.eclipse.jdt.core.compiler.debug.localVariable=generate 8 | org.eclipse.jdt.core.compiler.debug.sourceFile=generate 9 | org.eclipse.jdt.core.compiler.problem.assertIdentifier=error 10 | 
org.eclipse.jdt.core.compiler.problem.enumIdentifier=error 11 | org.eclipse.jdt.core.compiler.problem.forbiddenReference=warning 12 | org.eclipse.jdt.core.compiler.source=1.8 13 | -------------------------------------------------------------------------------- /.settings/org.eclipse.m2e.core.prefs: -------------------------------------------------------------------------------- 1 | activeProfiles= 2 | eclipse.preferences.version=1 3 | resolveWorkspaceProjects=true 4 | version=1 5 | -------------------------------------------------------------------------------- /Datasets/Scenerio25.csv: -------------------------------------------------------------------------------- 1 | emp_no,emp_name,dep 2 | 101,Murugan,HealthCare 3 | Invalid Entry,Description: Bad Record Entry 4 | 102,Kannan,Finance 5 | 103,Mani,IT 6 | Connection lost,Description: Poor Connection 7 | 104,Pavan,HR 8 | Bad Record,Description:Corrupt Record -------------------------------------------------------------------------------- /Datasets/scen.json: -------------------------------------------------------------------------------- 1 | { 2 | "code": 1234, 3 | "commentCount": 5, 4 | "createdAt": "2023-05-30T10:30:00", 5 | "description": "Example description", 6 | "feedsComment": "Example comment", 7 | "id": 1, 8 | "imagePaths": "/path/to/images", 9 | "images": "image1.jpg,image2.jpg,image3.jpg", 10 | "isdeleted": false, 11 | "lat": 123456789, 12 | "likeDislike": { 13 | "dislikes": 10, 14 | "likes": 20, 15 | "userAction": 1 16 | }, 17 | "lng": 987654321, 18 | "location": "Example location", 19 | "mediatype": 1, 20 | "msg": "Example message", 21 | "multiMedia": [ 22 | { 23 | "createAt": "2023-05-30T12:00:00", 24 | "description": "Media description", 25 | "id": 1001, 26 | "likeCount": 50, 27 | "mediatype": 1, 28 | "name": "Media name", 29 | "place": "Media place", 30 | "url": "https://example.com/media1" 31 | }, 32 | { 33 | "createAt": "2023-05-30T13:30:00", 34 | "description": "Another media description", 35 | "id": 1002, 36 | "likeCount": 30, 37 | "mediatype": 2, 38 | "name": "Another media name", 39 | "place": "Another media place", 40 | "url": "https://example.com/media2" 41 | } 42 | ], 43 | "name": "John Doe", 44 | "profilePicture": "/path/to/profile_picture.jpg", 45 | "title": "Example title", 46 | "userId": 123, 47 | "videoUrl": "https://example.com/video", 48 | "totalFeed": 100 49 | } 50 | -------------------------------------------------------------------------------- /Datasets/scen20.json: -------------------------------------------------------------------------------- 1 | [ 2 | { 3 | "code": 123, 4 | "commentCount": 5, 5 | "createAt": "2023-05-30T10:30:00", 6 | "createdAt": "2023-05-30T10:30:00", 7 | "description": "This is a sample description.", 8 | "dislikes": 2, 9 | "feedsComment": "Sample feeds comment", 10 | "id": 1, 11 | "imagePaths": "path/to/images", 12 | "images": "image1.jpg,image2.jpg", 13 | "isdeleted": false, 14 | "lat": 12, 15 | "likeCount": 10, 16 | "likes": 8, 17 | "lng": 34, 18 | "location": "Sample location", 19 | "mediatype": 1, 20 | "msg": "Sample message", 21 | "name": "John Doe", 22 | "place": "Sample place", 23 | "profilePicture": "path/to/profile_picture.jpg", 24 | "title": "Sample title", 25 | "totalFeed": 100, 26 | "url": "http://sampleurl.com", 27 | "userAction": 1, 28 | "userId": 12345, 29 | "videoUrl": "http://samplevideourl.com" 30 | }, 31 | { 32 | "code": 456, 33 | "commentCount": 3, 34 | "createAt": "2023-05-29T15:45:00", 35 | "createdAt": "2023-05-29T15:45:00", 36 | "description": "Another 
sample description.", 37 | "dislikes": 1, 38 | "feedsComment": "Another sample feeds comment", 39 | "id": 2, 40 | "imagePaths": "path/to/images2", 41 | "images": "image3.jpg,image4.jpg", 42 | "isdeleted": true, 43 | "lat": 56, 44 | "likeCount": 20, 45 | "likes": 18, 46 | "lng": 78, 47 | "location": "Another sample location", 48 | "mediatype": 2, 49 | "msg": "Another sample message", 50 | "name": "Jane Smith", 51 | "place": "Another sample place", 52 | "profilePicture": "path/to/profile_picture2.jpg", 53 | "title": "Another sample title", 54 | "totalFeed": 200, 55 | "url": "http://anotherurl.com", 56 | "userAction": 2, 57 | "userId": 67890, 58 | "videoUrl": "http://samplevideourl2.com" 59 | } 60 | ] 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | ## Spark and SQL Interview Scenerio Questions 2 | 3 | ### Table of Contents 4 | 5 | |No| Scenerios | 6 | |--|--------------------------------------------------------------------------| 7 | |1 |[Scenerio-1](#scenerio-1) | 8 | |2 |[Scenerio-2](#scenerio-2) | 9 | |3 |[Scenerio-3](#scenerio-3) | 10 | |4 |[Scenerio-4](#scenerio-4) | 11 | |5 |[Scenerio-5](#scenerio-5) | 12 | |6 |[Scenerio-6](#scenerio-6) | 13 | |7 |[Scenerio-7](#scenerio-7) | 14 | |8 |[Scenerio-8](#scenerio-8) | 15 | |9 |[Scenerio-9](#scenerio-9) | 16 | |10|[Scenerio-10](#scenerio-10) | 17 | |11|[Scenerio-11](#scenerio-11) | 18 | |12|[Scenerio-12](#scenerio-12) | 19 | |13|[Scenerio-13](#scenerio-13) | 20 | |14|[Scenerio-14](#scenerio-14) | 21 | |15|[Scenerio-15](#scenerio-15) | 22 | |16|[Scenerio-16](#scenerio-16) | 23 | |17|[Scenerio-17](#scenerio-17) | 24 | |18|[Scenerio-18](#scenerio-18) | 25 | |19|[Scenerio-19](#scenerio-19) | 26 | |20|[Scenerio-20](#scenerio-20) | 27 | |21|[Scenerio-21](#scenerio-21) | 28 | |22|[Scenerio-22](#scenerio-22) | 29 | |23|[Scenerio-23](#scenerio-23) | 30 | |24|[Scenerio-24](#scenerio-24) | 31 | |25|[Scenerio-25](#scenerio-25) | 32 | |26|[Scenerio-26](#scenerio-26) | 33 | |27|[Scenerio-27](#scenerio-27) | 34 | |28|[Scenerio-28](#scenerio-28) | 35 | |29|[Scenerio-29](#scenerio-29) | 36 | |30|[Scenerio-30](#scenerio-30) | 37 | |31|[Scenerio-31](#scenerio-31) | 38 | |32|[Scenerio-32](#scenerio-32) | 39 | |33|[Scenerio-33](#scenerio-33) | 40 | |34|[Scenerio-34](#scenerio-34) | 41 | |35|[Scenerio-35](#scenerio-35) | 42 | |36|[Scenerio-36](#scenerio-36) | 43 | 44 | ### Scenerio-1 45 | #### Query to get who are getting equal salary 46 | #### Input :- 47 | ``` 48 | +--------+---------+--------+------+-------------------+------+ 49 | |workerid|firstname|lastname|salary| joiningdate|depart| 50 | +--------+---------+--------+------+-------------------+------+ 51 | | 001| Monika| Arora|100000|2014-02-20 09:00:00| HR| 52 | | 002| Niharika| Verma|300000|2014-06-11 09:00:00| Admin| 53 | | 003| Vishal| Singhal|300000|2014-02-20 09:00:00| HR| 54 | | 004| Amitabh| Singh|500000|2014-02-20 09:00:00| Admin| 55 | | 005| Vivek| Bhati|500000|2014-06-11 09:00:00| Admin| 56 | +--------+---------+--------+------+-------------------+------+ 57 | ``` 58 | #### Expected Output :- 59 | ``` 60 | +--------+---------+--------+------+-------------------+------+ 61 | |workerid|firstname|lastname|salary| joiningdate|depart| 62 | +--------+---------+--------+------+-------------------+------+ 63 | | 002| Niharika| Verma|300000|2014-06-11 09:00:00| Admin| 64 | | 003| Vishal| Singhal|300000|2014-02-20 09:00:00| HR| 65 | | 004| Amitabh| Singh|500000|2014-02-20 
09:00:00| Admin| 66 | | 005| Vivek| Bhati|500000|2014-06-11 09:00:00| Admin| 67 | +--------+---------+--------+------+-------------------+------+ 68 | ``` 69 | #### Solution :- 70 | Scala-Spark -
71 | PySpark - 72 | 73 | **[⬆ Back to Top](#table-of-contents)** 74 | 75 | ### Scenerio-2 76 | #### (Need the dates when the status gets changed like ordered to dispatched) 77 | #### Input :- 78 | ``` 79 | +-------+----------+----------+ 80 | |orderid|statusdate| status| 81 | +-------+----------+----------+ 82 | | 1| 1-Jan| Ordered| 83 | | 1| 2-Jan|dispatched| 84 | | 1| 3-Jan|dispatched| 85 | | 1| 4-Jan| Shipped| 86 | | 1| 5-Jan| Shipped| 87 | | 1| 6-Jan| Delivered| 88 | | 2| 1-Jan| Ordered| 89 | | 2| 2-Jan|dispatched| 90 | | 2| 3-Jan| shipped| 91 | +-------+----------+----------+ 92 | ``` 93 | #### Expected Output :- 94 | ``` 95 | +-------+----------+----------+ 96 | |orderid|statusdate| status| 97 | +-------+----------+----------+ 98 | | 1| 2-Jan|dispatched| 99 | | 1| 3-Jan|dispatched| 100 | | 2| 2-Jan|dispatched| 101 | +-------+----------+----------+ 102 | ``` 103 | #### Solution :- 104 | Scala-Spark -
105 | PySpark - 106 | 107 | **[⬆ Back to Top](#table-of-contents)** 108 | 109 | ### Scenerio-3 110 | #### Input :- 111 | ``` 112 | +--------+----------+------+ 113 | |sensorid| timestamp|values| 114 | +--------+----------+------+ 115 | | 1111|2021-01-15| 10| 116 | | 1111|2021-01-16| 15| 117 | | 1111|2021-01-17| 30| 118 | | 1112|2021-01-15| 10| 119 | | 1112|2021-01-15| 20| 120 | | 1112|2021-01-15| 30| 121 | +--------+----------+------+ 122 | ``` 123 | #### Expected Output :- 124 | ``` 125 | +--------+----------+------+ 126 | |sensorid| timestamp|values| 127 | +--------+----------+------+ 128 | | 1111|2021-01-15| 5| 129 | | 1111|2021-01-16| 15| 130 | | 1112|2021-01-15| 10| 131 | | 1112|2021-01-15| 10| 132 | +--------+----------+------+ 133 | ``` 134 | #### Solution :- 135 | Scala-Spark - [Click Here]()
136 | PySpark - [Click Here]()
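PySpark sketch (illustrative only, not the repository's linked solution; it rebuilds the sample input above in a local session and mirrors the lead-based SQL below):
```
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").appName("Scenerio3").getOrCreate()

data = [(1111, "2021-01-15", 10), (1111, "2021-01-16", 15), (1111, "2021-01-17", 30),
        (1112, "2021-01-15", 10), (1112, "2021-01-15", 20), (1112, "2021-01-15", 30)]
df = spark.createDataFrame(data, ["sensorid", "timestamp", "values"])

# Look one row ahead per sensor; the last row of each sensor has no successor
# (lead defaults to 0 here) and is filtered out.
w = Window.partitionBy("sensorid").orderBy("values")
result = (df.withColumn("newvalues", F.lead("values", 1, 0).over(w))
            .where(F.col("newvalues") != 0)
            .select("sensorid", "timestamp",
                    (F.col("newvalues") - F.col("values")).alias("values")))
result.show()
```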
137 | SQL - 138 | ``` 139 | SELECT sensorid, 140 | timestamp, 141 | ( newvalues - values ) AS values 142 | FROM (SELECT *, 143 | Lead(values, 1, 0) 144 | OVER( 145 | partition BY sensorid 146 | ORDER BY values) AS newvalues 147 | FROM timetab) 148 | WHERE newvalues != 0 149 | ``` 150 | Pandas - 151 | ``` 152 | import pandas as pd 153 | 154 | data = [ 155 | (1111, "2021-01-15", 10), 156 | (1111, "2021-01-16", 15), 157 | (1111, "2021-01-17", 30), 158 | (1112, "2021-01-15", 10), 159 | (1112, "2021-01-15", 20), 160 | (1112, "2021-01-15", 30), 161 | ] 162 | 163 | df = pd.DataFrame(data, columns=["sensorid", "timestamp", "values"]) 164 | print(df) 165 | 166 | df["newvalues"] = df.groupby("sensorid")["values"].shift(-1) 167 | print(df) 168 | 169 | df = df.dropna(subset=["newvalues"]) 170 | print(df) 171 | 172 | df["values"] = df["newvalues"] - df["values"] 173 | print(df) 174 | 175 | df = df.drop(columns=["newvalues"]) 176 | print(df) 177 | ``` 178 | 179 | **[⬆ Back to Top](#table-of-contents)** 180 | 181 | ### Scenerio-4 182 | #### (Write a query to list the unique customer names in the custtab table, along with the number of addresses associated with each customer.) 183 | #### Input :- 184 | ``` 185 | +------+-----------+-------+ 186 | |custid| custname|address| 187 | +------+-----------+-------+ 188 | | 1| Mark Ray| AB| 189 | | 2|Peter Smith| CD| 190 | | 1| Mark Ray| EF| 191 | | 2|Peter Smith| GH| 192 | | 2|Peter Smith| CD| 193 | | 3| Kate| IJ| 194 | +------+-----------+-------+ 195 | ``` 196 | #### Expected Output :- 197 | ``` 198 | +------+-----------+--------+ 199 | |custid| custname| address| 200 | +------+-----------+--------+ 201 | | 1| Mark Ray|[EF, AB]| 202 | | 2|Peter Smith|[CD, GH]| 203 | | 3| Kate| [IJ]| 204 | +------+-----------+--------+ 205 | ``` 206 | #### Solution :- 207 | Scala-Spark - [Click Here]()
208 | PySpark - [Click Here]()
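PySpark sketch (illustrative only, not the repository's linked solution; the same grouping as the SQL below, with the sample input recreated inline):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio4").getOrCreate()

data = [(1, "Mark Ray", "AB"), (2, "Peter Smith", "CD"), (1, "Mark Ray", "EF"),
        (2, "Peter Smith", "GH"), (2, "Peter Smith", "CD"), (3, "Kate", "IJ")]
df = spark.createDataFrame(data, ["custid", "custname", "address"])

# collect_set drops the duplicate address (the repeated CD for Peter Smith)
result = (df.groupBy("custid", "custname")
            .agg(F.collect_set("address").alias("address"))
            .orderBy("custid"))
result.show()
```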
209 | SQL - 210 | ``` 211 | SELECT custid, 212 | custname, 213 | Collect_set(address) AS address 214 | FROM custtab 215 | GROUP BY custid, 216 | custname 217 | ORDER BY custid 218 | ``` 219 | Pandas - 220 | ``` 221 | data = [ 222 | (1, "Mark Ray", "AB"), 223 | (2, "Peter Smith", "CD"), 224 | (1, "Mark Ray", "EF"), 225 | (2, "Peter Smith", "GH"), 226 | (2, "Peter Smith", "CD"), 227 | (3, "Kate", "IJ"), 228 | ] 229 | 230 | df = pd.DataFrame(data, columns=["custid", "custname", "address"]) 231 | print(df) 232 | 233 | finaldf = ( 234 | df.groupby(["custid", "custname"])["address"] 235 | .apply(lambda x: list(set(x))) 236 | .reset_index() 237 | ) 238 | print(finaldf) 239 | ``` 240 | 241 | **[⬆ Back to Top](#table-of-contents)** 242 | 243 | ### Scenerio-5 244 | * Read data from above file into dataframes(df1 and df2). 245 | * Display number of partitions in df1. 246 | * Create a new dataframe df3 from df1, along with a new column salary, and keep it constant 1000 247 | * append df2 and df3, and form df4 248 | * Remove records which have invalid email from df4, emails with @ are considered to be valid. 249 | * Write df4 to a target location, by partitioning on salary. 250 | #### Input :- 251 | ``` 252 | +---+----+---+-------------+ 253 | | id|name|age| email| 254 | +---+----+---+-------------+ 255 | | 1| abc| 31|abc@gmail.com| 256 | | 2| def| 23| defyahoo.com| 257 | | 3| xyz| 26|xyz@gmail.com| 258 | | 4| qwe| 34| qwegmail.com| 259 | | 5| iop| 24|iop@gmail.com| 260 | +---+----+---+-------------+ 261 | ``` 262 | ``` 263 | +---+----+---+---------------+------+ 264 | | id|name|age| email|salary| 265 | +---+----+---+---------------+------+ 266 | | 11| jkl| 22| abc@gmail.com| 1000| 267 | | 12| vbn| 33| vbn@yahoo.com| 3000| 268 | | 13| wer| 27| wer| 2000| 269 | | 14| zxc| 30| zxc.com| 2000| 270 | | 15| lkj| 29|lkj@outlook.com| 2000| 271 | +---+----+---+---------------+------+ 272 | ``` 273 | #### Expected Output :- 274 | ``` 275 | +---+----+---+---------------+------+ 276 | | id|name|age| email|salary| 277 | +---+----+---+---------------+------+ 278 | | 1| abc| 31| abc@gmail.com| 1000| 279 | | 3| xyz| 26| xyz@gmail.com| 1000| 280 | | 5| iop| 24| iop@gmail.com| 1000| 281 | | 11| jkl| 22| abc@gmail.com| 1000| 282 | | 12| vbn| 33| vbn@yahoo.com| 3000| 283 | | 15| lkj| 29|lkj@outlook.com| 2000| 284 | +---+----+---+---------------+------+ 285 | ``` 286 | #### Solution :- 287 | Scala-Spark - [Click Here]()
288 | PySpark - [Click Here]()
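PySpark sketch of the listed steps (illustrative only, not the repository's linked solution; the input dataframes are recreated inline and the output path is hypothetical):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio5").getOrCreate()

df1 = spark.createDataFrame(
    [(1, "abc", 31, "abc@gmail.com"), (2, "def", 23, "defyahoo.com"),
     (3, "xyz", 26, "xyz@gmail.com"), (4, "qwe", 34, "qwegmail.com"),
     (5, "iop", 24, "iop@gmail.com")], ["id", "name", "age", "email"])
df2 = spark.createDataFrame(
    [(11, "jkl", 22, "abc@gmail.com", 1000), (12, "vbn", 33, "vbn@yahoo.com", 3000),
     (13, "wer", 27, "wer", 2000), (14, "zxc", 30, "zxc.com", 2000),
     (15, "lkj", 29, "lkj@outlook.com", 2000)], ["id", "name", "age", "email", "salary"])

print(df1.rdd.getNumPartitions())               # number of partitions in df1
df3 = df1.withColumn("salary", F.lit(1000))     # constant salary column
df4 = df3.unionByName(df2)                      # append df2 and df3
df4 = df4.filter(F.col("email").contains("@"))  # keep only valid emails
df4.orderBy("id").show()
df4.write.mode("overwrite").partitionBy("salary").parquet("/tmp/scenerio5_out")  # hypothetical path
```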
289 | Pandas - 290 | ``` 291 | import pandas as pd 292 | 293 | # Read data convert into dataframes(df1 and df2). 294 | data1 = [ 295 | (1, "abc", 31, "abc@gmail.com"), 296 | (2, "def", 23, "defyahoo.com"), 297 | (3, "xyz", 26, "xyz@gmail.com"), 298 | (4, "qwe", 34, "qwegmail.com"), 299 | (5, "iop", 24, "iop@gmail.com"), 300 | ] 301 | 302 | df1 = pd.DataFrame(data1, columns=["id", "name", "age", "email"]) 303 | print(df1) 304 | 305 | data2 = [ 306 | (11, "jkl", 22, "abc@gmail.com", 1000), 307 | (12, "vbn", 33, "vbn@yahoo.com", 3000), 308 | (13, "wer", 27, "wer", 2000), 309 | (14, "zxc", 30, "zxc.com", 2000), 310 | (15, "lkj", 29, "lkj@outlook.com", 2000), 311 | ] 312 | 313 | df2 = pd.DataFrame(data2, columns=["id", "name", "age", "email", "salary"]) 314 | print(df2) 315 | 316 | # Create a new dataframe df3 from df1, along with a new column salary, and keep it constant 1000 317 | df3 = df1.copy() 318 | df3["salary"] = 1000 319 | print(df3) 320 | 321 | # append df2 and df3, and form df4 322 | df4 = pd.concat([df2, df3]) 323 | 324 | df4 = df4.sort_values("id") 325 | print(df4) 326 | 327 | # Remove records which have invalid email from df4, emails with @ are considered to be valid. 328 | finaldf = df4[df4["email"].str.contains("@", na=False)] 329 | print(finaldf) 330 | ``` 331 | 332 | **[⬆ Back to Top](#table-of-contents)** 333 | 334 | ### Scenerio-6 335 | #### (For Employee salary greater than 10000 give designation as manager else employee) 336 | #### Input :- 337 | ``` 338 | +-----+----+------+ 339 | |empid|name|salary| 340 | +-----+----+------+ 341 | | 1| a| 10000| 342 | | 2| b| 5000| 343 | | 3| c| 15000| 344 | | 4| d| 25000| 345 | | 5| e| 50000| 346 | | 6| f| 7000| 347 | +-----+----+------+ 348 | ``` 349 | #### Expected Output :- 350 | ``` 351 | +-----+----+------+-----------+ 352 | |empid|name|salary|Designation| 353 | +-----+----+------+-----------+ 354 | | 1| a| 10000| Employee| 355 | | 2| b| 5000| Employee| 356 | | 3| c| 15000| Manager| 357 | | 4| d| 25000| Manager| 358 | | 5| e| 50000| Manager| 359 | | 6| f| 7000| Employee| 360 | +-----+----+------+-----------+ 361 | ``` 362 | #### Solution :- 363 | Scala-Spark - [Click Here]()
364 | PySpark - [Click Here]()
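PySpark sketch (illustrative only, not the repository's linked solution; the same CASE logic as the SQL below, expressed with when/otherwise):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio6").getOrCreate()

df = spark.createDataFrame(
    [(1, "a", 10000), (2, "b", 5000), (3, "c", 15000),
     (4, "d", 25000), (5, "e", 50000), (6, "f", 7000)],
    ["empid", "name", "salary"])

# salary > 10000 => Manager, otherwise Employee
result = df.withColumn(
    "Designation",
    F.when(F.col("salary") > 10000, "Manager").otherwise("Employee"))
result.show()
```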
365 | SQL - 366 | ``` 367 | SELECT *, 368 | CASE 369 | WHEN salary > 10000 THEN 370 | 'Manager' 371 | ELSE 'Employee' 372 | END AS Designation 373 | FROM emptab 374 | ``` 375 | Pandas - 376 | ``` 377 | import pandas as pd 378 | 379 | data = [ 380 | ("1", "a", 10000), 381 | ("2", "b", 5000), 382 | ("3", "c", 15000), 383 | ("4", "d", 25000), 384 | ("5", "e", 50000), 385 | ("6", "f", 7000), 386 | ] 387 | 388 | df = pd.DataFrame(data, columns=["empid", "name", "salary"]) 389 | print(df) 390 | 391 | 392 | def emp_desgnination(salary): 393 | return "Manager" if salary > 10000 else "Employee" 394 | 395 | 396 | df["Desgniation"] = df["salary"].apply(emp_desgnination) 397 | print(df) 398 | ``` 399 | 400 | **[⬆ Back to Top](#table-of-contents)** 401 | 402 | ### Scenerio-7 403 | #### Input :- 404 | ``` 405 | +-------+----------+----+--------+-----+ 406 | |sale_id|product_id|year|quantity|price| 407 | +-------+----------+----+--------+-----+ 408 | | 1| 100|2010| 25| 5000| 409 | | 2| 100|2011| 16| 5000| 410 | | 3| 100|2012| 8| 5000| 411 | | 4| 200|2010| 10| 9000| 412 | | 5| 200|2011| 15| 9000| 413 | | 6| 200|2012| 20| 7000| 414 | | 7| 300|2010| 20| 7000| 415 | | 8| 300|2011| 18| 7000| 416 | | 9| 300|2012| 20| 7000| 417 | +-------+----------+----+--------+-----+ 418 | ``` 419 | #### Expected Output :- 420 | ``` 421 | +-------+----------+----+--------+-----+ 422 | |sale_id|product_id|year|quantity|price| 423 | +-------+----------+----+--------+-----+ 424 | | 6| 200|2012| 20| 7000| 425 | | 9| 300|2012| 20| 7000| 426 | | 1| 100|2010| 25| 5000| 427 | | 8| 300|2011| 18| 7000| 428 | +-------+----------+----+--------+-----+ 429 | ``` 430 | #### Solution :- 431 | Scala-Spark - [Click Here]()
432 | PySpark - [Click Here]()
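PySpark sketch (illustrative only, not the repository's linked solution; the same dense_rank window as the SQL below):
```
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").appName("Scenerio7").getOrCreate()

data = [(1, 100, 2010, 25, 5000), (2, 100, 2011, 16, 5000), (3, 100, 2012, 8, 5000),
        (4, 200, 2010, 10, 9000), (5, 200, 2011, 15, 9000), (6, 200, 2012, 20, 7000),
        (7, 300, 2010, 20, 7000), (8, 300, 2011, 18, 7000), (9, 300, 2012, 20, 7000)]
df = spark.createDataFrame(data, ["sale_id", "product_id", "year", "quantity", "price"])

# dense_rank keeps ties, so both 2012 top sellers (quantity 20) are returned
w = Window.partitionBy("year").orderBy(F.col("quantity").desc())
result = (df.withColumn("rank", F.dense_rank().over(w))
            .where(F.col("rank") == 1)
            .drop("rank")
            .orderBy("sale_id"))
result.show()
```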
433 | SQL - 434 | ``` 435 | SELECT 436 | * 437 | FROM 438 | ( 439 | SELECT 440 | *, 441 | DENSE_RANK() OVER ( 442 | PARTITION BY year 443 | ORDER BY 444 | quantity DESC 445 | ) AS rank 446 | FROM 447 | salestab 448 | ) AS rankdf 449 | WHERE 450 | rank = 1 451 | ORDER BY 452 | sale_id 453 | ``` 454 | Pandas - 455 | ``` 456 | import pandas as pd 457 | 458 | data = [ 459 | (1, 100, 2010, 25, 5000), 460 | (2, 100, 2011, 16, 5000), 461 | (3, 100, 2012, 8, 5000), 462 | (4, 200, 2010, 10, 9000), 463 | (5, 200, 2011, 15, 9000), 464 | (6, 200, 2012, 20, 7000), 465 | (7, 300, 2010, 20, 7000), 466 | (8, 300, 2011, 18, 7000), 467 | (9, 300, 2012, 20, 7000), 468 | ] 469 | 470 | df = pd.DataFrame(data, columns=["sale_id", "product_id", "year", "quantity", "price"]) 471 | print(df) 472 | 473 | df["rank"] = df.groupby("year")["quantity"].rank(method="dense", ascending=False) 474 | print(df) 475 | 476 | df = df[df["rank"] == 1] 477 | print(df) 478 | 479 | df = df.drop("rank", axis=1).sort_values("sale_id") 480 | print(df) 481 | ``` 482 | 483 | **[⬆ Back to Top](#table-of-contents)** 484 | 485 | ### Scenerio-8 486 | #### Input :- 487 | ``` 488 | +--------+ 489 | | teams| 490 | +--------+ 491 | | India| 492 | |Pakistan| 493 | |SriLanka| 494 | +--------+ 495 | ``` 496 | #### Expected Output :- 497 | ``` 498 | +--------------------+ 499 | | matches| 500 | +--------------------+ 501 | | India Vs Pakistan| 502 | | India Vs SriLanka| 503 | |Pakistan Vs SriLanka| 504 | +--------------------+ 505 | ``` 506 | #### Solution :- 507 | Scala-Spark -
508 | PySpark - 509 | 510 | **[⬆ Back to Top](#table-of-contents)** 511 | 512 | ### Scenerio-9 513 | #### (write spark code, list of name of participants who has rank=1 most number of times) 514 | #### Input :- 515 | ``` 516 | +----+---------------+ 517 | |name| rank| 518 | +----+---------------+ 519 | | a| [1, 1, 1, 3]| 520 | | b| [1, 2, 3, 4]| 521 | | c|[1, 1, 1, 1, 4]| 522 | | d| [3]| 523 | +----+---------------+ 524 | ``` 525 | #### Expected Output :- 526 | ``` 527 | c 528 | ``` 529 | #### Solution :- 530 | Scala-Spark -
531 | PySpark - 532 | 533 | **[⬆ Back to Top](#table-of-contents)** 534 | 535 | ### Scenerio-10 536 | #### Input :- 537 | ``` 538 | +-----+-------------+-------------+ 539 | |empid|commissionamt|monthlastdate| 540 | +-----+-------------+-------------+ 541 | | 1| 300| 31-Jan-2021| 542 | | 1| 400| 28-Feb-2021| 543 | | 1| 200| 31-Mar-2021| 544 | | 2| 1000| 31-Oct-2021| 545 | | 2| 900| 31-Dec-2021| 546 | +-----+-------------+-------------+ 547 | ``` 548 | #### Expected Output :- 549 | ``` 550 | +-----+-------------+-------------+ 551 | |empid|commissionamt|monthlastdate| 552 | +-----+-------------+-------------+ 553 | | 1| 200| 31-Mar-2021| 554 | | 2| 1000| 31-Oct-2021| 555 | +-----+-------------+-------------+ 556 | ``` 557 | #### Solution :- 558 | Scala-Spark -
559 | PySpark - 560 | 561 | **[⬆ Back to Top](#table-of-contents)** 562 | 563 | ### Scenerio-11 564 | #### (I have a table called Emp_table, it has 3 columns, Emp name, emp ID , salary 565 | in this I want to get salaries that are >10000 as Grade A, 5000-10000 as grade B and < 5000 as 566 | Grade C, write an SQL query) 567 | #### Input :- 568 | ``` 569 | +------+---------------+------+ 570 | |emp_id| emp_name|salary| 571 | +------+---------------+------+ 572 | | 1| Jhon| 4000| 573 | | 2| Tim David| 12000| 574 | | 3|Json Bhrendroff| 7000| 575 | | 4| Jordon| 8000| 576 | | 5| Green| 14000| 577 | | 6| Brewis| 6000| 578 | +------+---------------+------+ 579 | ``` 580 | #### Expected Output :- 581 | ``` 582 | +------+---------------+------+-----+ 583 | |emp_id| emp_name|salary|grade| 584 | +------+---------------+------+-----+ 585 | | 1| Jhon| 4000| C| 586 | | 2| Tim David| 12000| A| 587 | | 3|Json Bhrendroff| 7000| B| 588 | | 4| Jordon| 8000| B| 589 | | 5| Green| 14000| A| 590 | | 6| Brewis| 6000| B| 591 | +------+---------------+------+-----+ 592 | ``` 593 | #### Solution :- 594 | Scala-Spark -
595 | PySpark - 596 | 597 | **[⬆ Back to Top](#table-of-contents)** 598 | 599 | ### Scenerio-12 600 | #### Input :- 601 | ``` 602 | +--------------------+----------+ 603 | | email| mobile| 604 | +--------------------+----------+ 605 | |Renuka1992@gmail.com|9856765434| 606 | |anbu.arasu@gmail.com|9844567788| 607 | +--------------------+----------+ 608 | ``` 609 | #### Expected Output :- 610 | ``` 611 | +--------------------+----------+ 612 | | email| mobile| 613 | +--------------------+----------+ 614 | |R**********92@gma...|98*****434| 615 | |a**********su@gma...|98*****788| 616 | +--------------------+----------+ 617 | ``` 618 | #### Solution :- 619 | Scala-Spark -
620 | PySpark - 621 | 622 | **[⬆ Back to Top](#table-of-contents)** 623 | 624 | ## Scenerio-13 625 | #### (We have employee id,employee name, department. Need count of every department employees.) 626 | #### Input :- 627 | ``` 628 | +------+--------+-----------+ 629 | |emp_id|emp_name| dept| 630 | +------+--------+-----------+ 631 | | 1| Jhon|Development| 632 | | 2| Tim|Development| 633 | | 3| David| Testing| 634 | | 4| Sam| Testing| 635 | | 5| Green| Testing| 636 | | 6| Miller| Production| 637 | | 7| Brevis| Production| 638 | | 8| Warner| Production| 639 | | 9| Salt| Production| 640 | +------+--------+-----------+ 641 | ``` 642 | #### Expected Output :- 643 | ``` 644 | +-----------+-----+ 645 | | dept|total| 646 | +-----------+-----+ 647 | |Development| 2| 648 | | Testing| 3| 649 | | Production| 4| 650 | +-----------+-----+ 651 | ``` 652 | #### Solution :- 653 | Scala-Spark -
654 | PySpark - 655 | 656 | **[⬆ Back to Top](#table-of-contents)** 657 | 658 | ## Scenerio-14 659 | #### (We need total marks) 660 | #### Input :- 661 | ``` 662 | +------+------+------+-------+-----+-------+------+ 663 | |rollno| name|telugu|english|maths|science|social| 664 | +------+------+------+-------+-----+-------+------+ 665 | |203040|rajesh| 10| 20| 30| 40| 50| 666 | +------+------+------+-------+-----+-------+------+ 667 | ``` 668 | #### Expected Output :- 669 | ``` 670 | +------+------+------+-------+-----+-------+------+-----+ 671 | |rollno| name|telugu|english|maths|science|social|total| 672 | +------+------+------+-------+-----+-------+------+-----+ 673 | |203040|rajesh| 10| 20| 30| 40| 50| 150| 674 | +------+------+------+-------+-----+-------+------+-----+ 675 | ``` 676 | #### Solution :- 677 | Scala-Spark - [Click Here]()
678 | PySpark - [Click Here]()
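PySpark sketch (illustrative only, not the repository's linked solution; the row-wise total from the SQL below, built by summing the subject columns):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio14").getOrCreate()

df = spark.createDataFrame(
    [(203040, "rajesh", 10, 20, 30, 40, 50)],
    ["rollno", "name", "telugu", "english", "maths", "science", "social"])

# Add the five subject columns row-wise into a new total column
subjects = ["telugu", "english", "maths", "science", "social"]
result = df.withColumn("total", sum(F.col(c) for c in subjects))
result.show()
```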
679 | SQL - 680 | ``` 681 | select 682 | *, 683 | ( 684 | telugu + english + maths + science + social 685 | ) as total 686 | from 687 | markstab 688 | 689 | ``` 690 | 691 | **[⬆ Back to Top](#table-of-contents)** 692 | 693 | ## Scenerio-15 694 | #### (Extend and Append list in python and scala) 695 | #### Solution :- 696 | Scala-Spark -
697 | PySpark - 698 | 699 | **[⬆ Back to Top](#table-of-contents)** 700 | 701 | ## Scenerio-16 702 | #### (Remove duplicates) 703 | #### Input :- 704 | ``` 705 | +---+----+-----------+------+ 706 | | id|name| dept|salary| 707 | +---+----+-----------+------+ 708 | | 1|Jhon| Testing| 5000| 709 | | 2| Tim|Development| 6000| 710 | | 3|Jhon|Development| 5000| 711 | | 4| Sky| Prodcution| 8000| 712 | +---+----+-----------+------+ 713 | ``` 714 | #### Expected Output :- 715 | ``` 716 | +---+----+-----------+------+ 717 | | id|name| dept|salary| 718 | +---+----+-----------+------+ 719 | | 1|Jhon| Testing| 5000| 720 | | 2| Tim|Development| 6000| 721 | | 4| Sky| Prodcution| 8000| 722 | +---+----+-----------+------+ 723 | ``` 724 | #### Solution :- 725 | Scala-Spark -
726 | PySpark - 727 | 728 | **[⬆ Back to Top](#table-of-contents)** 729 | 730 | ## Scenerio-17 731 | #### (df1 contains Employeeid,Name,Age,State,Country columns df2 contains Employeeid,Name,Age,Address columns. how do you merge df1 and df2 to get the following output Employeeid,Name,Age,State,Country,Address) 732 | #### Input :- 733 | ``` 734 | +------+-----+---+------+-------+ 735 | |emp_id| name|age| state|country| 736 | +------+-----+---+------+-------+ 737 | | 1| Tim| 24|Kerala| India| 738 | | 2|Asman| 26|Kerala| India| 739 | +------+-----+---+------+-------+ 740 | ``` 741 | ``` 742 | +------+-----+---+-------+ 743 | |emp_id| name|age|address| 744 | +------+-----+---+-------+ 745 | | 1| Tim| 24|Comcity| 746 | | 2|Asman| 26|bimcity| 747 | +------+-----+---+-------+ 748 | ``` 749 | #### Expected Output :- 750 | ``` 751 | +------+-----+---+------+-------+-------+ 752 | |emp_id| name|age| state|country|address| 753 | +------+-----+---+------+-------+-------+ 754 | | 1| Tim| 24|Kerala| India|Comcity| 755 | | 2|Asman| 26|Kerala| India|bimcity| 756 | +------+-----+---+------+-------+-------+ 757 | ``` 758 | #### Solution :- 759 | Scala-Spark -
760 | PySpark - 761 | 762 | **[⬆ Back to Top](#table-of-contents)** 763 | 764 | ## Scenerio-18 765 | #### Input :- 766 | ``` 767 | +------------------+ 768 | | word| 769 | +------------------+ 770 | |The Social Dilemma| 771 | +------------------+ 772 | ``` 773 | 774 | #### Expected Output :- 775 | ``` 776 | +------------------+ 777 | | reverse word| 778 | +------------------+ 779 | |ehT laicoS ammeliD| 780 | +------------------+ 781 | ``` 782 | #### Solution :- 783 | Scala-Spark -
784 | PySpark - 785 | 786 | **[⬆ Back to Top](#table-of-contents)** 787 | 788 | ## Scenerio-19 789 | #### (Flatten the below complex dataframe) 790 | #### Input :- 791 | ``` 792 | root 793 | |-- code: long (nullable = true) 794 | |-- commentCount: long (nullable = true) 795 | |-- createdAt: string (nullable = true) 796 | |-- description: string (nullable = true) 797 | |-- feedsComment: string (nullable = true) 798 | |-- id: long (nullable = true) 799 | |-- imagePaths: string (nullable = true) 800 | |-- images: string (nullable = true) 801 | |-- isdeleted: boolean (nullable = true) 802 | |-- lat: long (nullable = true) 803 | |-- likeDislike: struct (nullable = true) 804 | | |-- dislikes: long (nullable = true) 805 | | |-- likes: long (nullable = true) 806 | | |-- userAction: long (nullable = true) 807 | |-- lng: long (nullable = true) 808 | |-- location: string (nullable = true) 809 | |-- mediatype: long (nullable = true) 810 | |-- msg: string (nullable = true) 811 | |-- multiMedia: array (nullable = true) 812 | | |-- element: struct (containsNull = true) 813 | | | |-- createAt: string (nullable = true) 814 | | | |-- description: string (nullable = true) 815 | | | |-- id: long (nullable = true) 816 | | | |-- likeCount: long (nullable = true) 817 | | | |-- mediatype: long (nullable = true) 818 | | | |-- name: string (nullable = true) 819 | | | |-- place: string (nullable = true) 820 | | | |-- url: string (nullable = true) 821 | |-- name: string (nullable = true) 822 | |-- profilePicture: string (nullable = true) 823 | |-- title: string (nullable = true) 824 | |-- totalFeed: long (nullable = true) 825 | |-- userId: long (nullable = true) 826 | |-- videoUrl: string (nullable = true) 827 | ``` 828 | 829 | #### Expected Output :- 830 | ``` 831 | root 832 | |-- code: long (nullable = true) 833 | |-- commentCount: long (nullable = true) 834 | |-- createdAt: string (nullable = true) 835 | |-- description: string (nullable = true) 836 | |-- feedsComment: string (nullable = true) 837 | |-- id: long (nullable = true) 838 | |-- imagePaths: string (nullable = true) 839 | |-- images: string (nullable = true) 840 | |-- isdeleted: boolean (nullable = true) 841 | |-- lat: long (nullable = true) 842 | |-- lng: long (nullable = true) 843 | |-- location: string (nullable = true) 844 | |-- mediatype: long (nullable = true) 845 | |-- msg: string (nullable = true) 846 | |-- name: string (nullable = true) 847 | |-- profilePicture: string (nullable = true) 848 | |-- title: string (nullable = true) 849 | |-- totalFeed: long (nullable = true) 850 | |-- userId: long (nullable = true) 851 | |-- videoUrl: string (nullable = true) 852 | |-- dislikes: long (nullable = true) 853 | |-- likes: long (nullable = true) 854 | |-- userAction: long (nullable = true) 855 | |-- createAt: string (nullable = true) 856 | |-- likeCount: long (nullable = true) 857 | |-- place: string (nullable = true) 858 | |-- url: string (nullable = true) 859 | ``` 860 | #### Solution :- 861 | Dataset -
862 | Scala-Spark -
863 | PySpark - 864 | 865 | **[⬆ Back to Top](#table-of-contents)** 866 | 867 | ## Scenerio-20 868 | #### (Generate the complex dataframe) 869 | #### Input :- 870 | ``` 871 | root 872 | |-- code: long (nullable = true) 873 | |-- commentCount: long (nullable = true) 874 | |-- createAt: string (nullable = true) 875 | |-- createdAt: string (nullable = true) 876 | |-- description: string (nullable = true) 877 | |-- dislikes: long (nullable = true) 878 | |-- feedsComment: string (nullable = true) 879 | |-- id: long (nullable = true) 880 | |-- imagePaths: string (nullable = true) 881 | |-- images: string (nullable = true) 882 | |-- isdeleted: boolean (nullable = true) 883 | |-- lat: long (nullable = true) 884 | |-- likeCount: long (nullable = true) 885 | |-- likes: long (nullable = true) 886 | |-- lng: long (nullable = true) 887 | |-- location: string (nullable = true) 888 | |-- mediatype: long (nullable = true) 889 | |-- msg: string (nullable = true) 890 | |-- name: string (nullable = true) 891 | |-- place: string (nullable = true) 892 | |-- profilePicture: string (nullable = true) 893 | |-- title: string (nullable = true) 894 | |-- totalFeed: long (nullable = true) 895 | |-- url: string (nullable = true) 896 | |-- userAction: long (nullable = true) 897 | |-- userId: long (nullable = true) 898 | |-- videoUrl: string (nullable = true) 899 | ``` 900 | 901 | #### Expected Output :- 902 | ``` 903 | root 904 | |-- code: long (nullable = true) 905 | |-- commentCount: long (nullable = true) 906 | |-- createdAt: string (nullable = true) 907 | |-- description: string (nullable = true) 908 | |-- feedsComment: string (nullable = true) 909 | |-- id: long (nullable = true) 910 | |-- imagePaths: string (nullable = true) 911 | |-- images: string (nullable = true) 912 | |-- isdeleted: boolean (nullable = true) 913 | |-- lat: long (nullable = true) 914 | |-- likeDislike: struct (nullable = false) 915 | | |-- dislikes: long (nullable = true) 916 | | |-- likes: long (nullable = true) 917 | | |-- userAction: long (nullable = true) 918 | |-- lng: long (nullable = true) 919 | |-- location: string (nullable = true) 920 | |-- mediatype: long (nullable = true) 921 | |-- msg: string (nullable = true) 922 | |-- multiMedia: array (nullable = false) 923 | | |-- element: struct (containsNull = false) 924 | | | |-- createAt: string (nullable = true) 925 | | | |-- description: string (nullable = true) 926 | | | |-- id: long (nullable = true) 927 | | | |-- likeCount: long (nullable = true) 928 | | | |-- mediatype: long (nullable = true) 929 | | | |-- name: string (nullable = true) 930 | | | |-- place: string (nullable = true) 931 | | | |-- url: string (nullable = true) 932 | |-- name: string (nullable = true) 933 | |-- profilePicture: string (nullable = true) 934 | |-- title: string (nullable = true) 935 | |-- userId: long (nullable = true) 936 | |-- videoUrl: string (nullable = true) 937 | |-- totalFeed: long (nullable = true) 938 | ``` 939 | #### Solution :- 940 | Dataset -
941 | Scala-Spark -
942 | PySpark - 943 | 944 | **[⬆ Back to Top](#table-of-contents)** 945 | 946 | ## Scenerio-21 947 | #### (The roundtrip distance should be calculated using spark or SQL.) 948 | #### Input :- 949 | ``` 950 | +----+---+----+ 951 | |from| to|dist| 952 | +----+---+----+ 953 | | SEA| SF| 300| 954 | | CHI|SEA|2000| 955 | | SF|SEA| 300| 956 | | SEA|CHI|2000| 957 | | SEA|LND| 500| 958 | | LND|SEA| 500| 959 | | LND|CHI|1000| 960 | | CHI|NDL| 180| 961 | +----+---+----+ 962 | ``` 963 | 964 | #### Expected Output :- 965 | ``` 966 | +----+---+--------------+ 967 | |from| to|roundtrip_dist| 968 | +----+---+--------------+ 969 | | SEA| SF| 600| 970 | | CHI|SEA| 4000| 971 | | LND|SEA| 1000| 972 | +----+---+--------------+ 973 | 974 | ``` 975 | #### Solution :- 976 | Scala-Spark - [Click Here]()
977 | PySpark - [Click Here]()
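PySpark sketch (illustrative only, not the repository's linked solution; the same self-join as the SQL below, keeping one direction of each pair):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio21").getOrCreate()

data = [("SEA", "SF", 300), ("CHI", "SEA", 2000), ("SF", "SEA", 300), ("SEA", "CHI", 2000),
        ("SEA", "LND", 500), ("LND", "SEA", 500), ("LND", "CHI", 1000), ("CHI", "NDL", 180)]
df = spark.createDataFrame(data, ["from", "to", "dist"])

# Match each leg with its reverse leg, then keep only one direction (from < to)
a, b = df.alias("a"), df.alias("b")
result = (a.join(b, (F.col("a.from") == F.col("b.to")) & (F.col("a.to") == F.col("b.from")))
           .where(F.col("a.from") < F.col("a.to"))
           .select(F.col("a.from").alias("from"), F.col("a.to").alias("to"),
                   (F.col("a.dist") + F.col("b.dist")).alias("roundtrip_dist")))
result.show()
```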
978 | SQL - 979 | ``` 980 | select 981 | r1.from, 982 | r1.to, 983 | (r1.dist + r2.dist) as round_distance 984 | from 985 | trip r1 986 | join trip r2 on r1.from = r2.to 987 | and r1.to = r2.from 988 | where 989 | r1.from < r1.to 990 | ``` 991 | 992 | **[⬆ Back to Top](#table-of-contents)** 993 | 994 | ## Scenerio-22 995 | #### (Cumilative sum) 996 | #### Input :- 997 | ``` 998 | +---+------+-----+ 999 | |pid| date|price| 1000 | +---+------+-----+ 1001 | | 1|26-May| 100| 1002 | | 1|27-May| 200| 1003 | | 1|28-May| 300| 1004 | | 2|29-May| 400| 1005 | | 3|30-May| 500| 1006 | | 3|31-May| 600| 1007 | +---+------+-----+ 1008 | ``` 1009 | 1010 | #### Expected Output :- 1011 | ``` 1012 | +---+------+-----+---------+ 1013 | |pid| date|price|new_price| 1014 | +---+------+-----+---------+ 1015 | | 1|26-May| 100| 100| 1016 | | 1|27-May| 200| 300| 1017 | | 1|28-May| 300| 600| 1018 | | 2|29-May| 400| 400| 1019 | | 3|30-May| 500| 500| 1020 | | 3|31-May| 600| 1100| 1021 | +---+------+-----+---------+ 1022 | 1023 | ``` 1024 | #### Solution :- 1025 | Scala-Spark - [Click Here]()
1026 | PySpark - [Click Here]()
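PySpark sketch (illustrative only, not the repository's linked solution; the running total from the SQL below as a window sum):
```
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").appName("Scenerio22").getOrCreate()

data = [(1, "26-May", 100), (1, "27-May", 200), (1, "28-May", 300),
        (2, "29-May", 400), (3, "30-May", 500), (3, "31-May", 600)]
df = spark.createDataFrame(data, ["pid", "date", "price"])

# Cumulative sum of price within each pid
w = Window.partitionBy("pid").orderBy("price")
result = df.withColumn("new_price", F.sum("price").over(w))
result.show()
```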
1027 | SQL - 1028 | ``` 1029 | select 1030 | pid, 1031 | date, 1032 | price, 1033 | sum(price) over ( 1034 | partition by pid 1035 | order by 1036 | price 1037 | ) as newprice 1038 | from 1039 | ordertab 1040 | 1041 | ``` 1042 | 1043 | **[⬆ Back to Top](#table-of-contents)** 1044 | 1045 | ## Scenerio-23 1046 | #### Input :- 1047 | ``` 1048 | +-----------+-----------+ 1049 | |customer_id|product_key| 1050 | +-----------+-----------+ 1051 | | 1| 5| 1052 | | 2| 6| 1053 | | 3| 5| 1054 | | 3| 6| 1055 | | 1| 6| 1056 | +-----------+-----------+ 1057 | ``` 1058 | ``` 1059 | +-----------+ 1060 | |product_key| 1061 | +-----------+ 1062 | | 5| 1063 | | 6| 1064 | +-----------+ 1065 | 1066 | ``` 1067 | 1068 | #### Expected Output :- 1069 | ``` 1070 | +-----------+ 1071 | |customer_id| 1072 | +-----------+ 1073 | | 1| 1074 | | 3| 1075 | +-----------+ 1076 | 1077 | ``` 1078 | #### Solution :- 1079 | Scala-Spark - [Click Here]()
1080 | PySpark - [Click Here]() 1081 | 1082 | **[⬆ Back to Top](#table-of-contents)** 1083 | 1084 | ## Scenerio-24 1085 | #### Input :- 1086 | ``` 1087 | +------+------------+ 1088 | |userid| page| 1089 | +------+------------+ 1090 | | 1| home| 1091 | | 1| products| 1092 | | 1| checkout| 1093 | | 1|confirmation| 1094 | | 2| home| 1095 | | 2| products| 1096 | | 2| cart| 1097 | | 2| checkout| 1098 | | 2|confirmation| 1099 | | 2| home| 1100 | | 2| products| 1101 | +------+------------+ 1102 | 1103 | ``` 1104 | 1105 | #### Expected Output :- 1106 | ``` 1107 | +------+--------------------------------------------------------------+ 1108 | |userid|pages | 1109 | +------+--------------------------------------------------------------+ 1110 | |1 |[home, products, checkout, confirmation] | 1111 | |2 |[home, products, cart, checkout, confirmation, home, products]| 1112 | +------+--------------------------------------------------------------+ 1113 | 1114 | ``` 1115 | #### Solution :- 1116 | Scala-Spark - [Click Here]()
1117 | PySpark - [Click Here]()
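PySpark sketch (illustrative only, not the repository's linked solution; collect_list as in the SQL below — note the visit order is preserved here only because the sample is tiny, in general an explicit ordering column is needed):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio24").getOrCreate()

data = [(1, "home"), (1, "products"), (1, "checkout"), (1, "confirmation"),
        (2, "home"), (2, "products"), (2, "cart"), (2, "checkout"),
        (2, "confirmation"), (2, "home"), (2, "products")]
df = spark.createDataFrame(data, ["userid", "page"])

# collect_list keeps duplicates, unlike collect_set
result = df.groupBy("userid").agg(F.collect_list("page").alias("pages"))
result.show(truncate=False)
```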
1118 | SQL :- 1119 | ``` 1120 | select 1121 | userid, 1122 | collect_list(page) as pages 1123 | from 1124 | testcol 1125 | group by 1126 | userid; 1127 | 1128 | ``` 1129 | **[⬆ Back to Top](#table-of-contents)** 1130 | 1131 | ## Scenerio-25 1132 | ### consider a file with some bad/corrupt data as shown below.How will you handle those and load into spark dataframe 1133 | Note - avoid using filter after reading as DF and try to remove bad data while reading the file itself 1134 | #### Input :- 1135 | ``` 1136 | emp_no,emp_name,dep 1137 | 101,Murugan,HealthCare 1138 | Invalid Entry,Description: Bad Record Entry 1139 | 102,Kannan,Finance 1140 | 103,Mani,IT 1141 | Connection lost,Description: Poor Connection 1142 | 104,Pavan,HR 1143 | Bad Record,Description:Corrupt Record 1144 | ``` 1145 | 1146 | #### Expected Output :- 1147 | ``` 1148 | +------+--------+----------+ 1149 | |emp_no|emp_name| dep| 1150 | +------+--------+----------+ 1151 | | 101| Murugan|HealthCare| 1152 | | 102| Kannan| Finance| 1153 | | 103| Mani| IT| 1154 | | 104| Pavan| HR| 1155 | +------+--------+----------+ 1156 | 1157 | ``` 1158 | #### Solution :- 1159 | Scala-Spark - [Click Here]()
1160 | PySpark - [Click Here]() 1161 | 1162 | There are three modes available when reading a file in Spark: 1163 | 1164 | * `PERMISSIVE` : This is the default mode. It attempts to parse all the rows in the file, and if it encounters any malformed data or parsing errors, it sets the problematic fields to null and adds a new column called _corrupt_record to store the entire problematic row as a string. 1165 | 1166 | * `DROPMALFORMED` : This mode drops the rows that contain malformed data or cannot be parsed according to the specified schema. It only includes the rows that can be successfully parsed. 1167 | 1168 | * `FAILFAST` : This mode throws an exception and fails immediately if it encounters any malformed data or parsing errors in the file. It does not process any further rows after the first encountered error. 1169 | 1170 | You can specify the desired mode using the mode option when reading a file, such as option("mode", "PERMISSIVE") or option("mode", "FAILFAST"). If the mode option is not explicitly set, it defaults to PERMISSIVE. 1171 | 1172 | **[⬆ Back to Top](#table-of-contents)** 1173 | 1174 | ## Scenerio-26 1175 | * Input :- 1176 | ```sh 1177 | +---+----+ 1178 | | id|name| 1179 | +---+----+ 1180 | | 1| A| 1181 | | 2| B| 1182 | | 3| C| 1183 | | 4| D| 1184 | +---+----+ 1185 | 1186 | +---+-----+ 1187 | |id1|name1| 1188 | +---+-----+ 1189 | | 1| A| 1190 | | 2| B| 1191 | | 4| X| 1192 | | 5| F| 1193 | +---+-----+ 1194 | ``` 1195 | * Output :- 1196 | ```sh 1197 | +---+-------------+ 1198 | | id| comment| 1199 | +---+-------------+ 1200 | | 3|new in source| 1201 | | 4| mismatch| 1202 | | 5|new in target| 1203 | +---+-------------+ 1204 | ``` 1205 | #### Solution :- 1206 | Scala-Spark :- [Click Here]()
1207 | PySpark :- [Click Here]()
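PySpark sketch (illustrative only, not the repository's linked solution; a full outer join and the same classification as the SQL below):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio26").getOrCreate()

src = spark.createDataFrame([(1, "A"), (2, "B"), (3, "C"), (4, "D")], ["id", "name"])
tgt = spark.createDataFrame([(1, "A"), (2, "B"), (4, "X"), (5, "F")], ["id1", "name1"])

# Full outer join, then label each id: missing on one side, or a name mismatch
result = (src.join(tgt, src.id == tgt.id1, "full_outer")
             .withColumn("id", F.coalesce("id", "id1"))
             .withColumn("comment",
                         F.when(F.col("name1").isNull(), "new in source")
                          .when(F.col("name").isNull(), "new in target")
                          .when(F.col("name") != F.col("name1"), "mismatch"))
             .where(F.col("comment").isNotNull())
             .select("id", "comment")
             .orderBy("id"))
result.show()
```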
1208 | SQL :- 1209 | ``` 1210 | select 1211 | id, 1212 | case when name != name1 then 'Mismatch' when name1 is null then 'New in Source' when name is null then 'New in Target' end as comment 1213 | from 1214 | ( 1215 | select 1216 | coalesce(id, id1) as id, 1217 | s.name, 1218 | t.name1 1219 | from 1220 | sourcetab s full 1221 | outer join targettab t on s.id = t.id1 1222 | WHERE 1223 | s.name != t.name1 1224 | OR s.name IS NULL 1225 | OR t.name1 IS NULL 1226 | ); 1227 | 1228 | ``` 1229 | 1230 | **[⬆ Back to Top](#table-of-contents)** 1231 | 1232 | ## Scenerio-27 1233 | * Input :- 1234 | ```sh 1235 | +-----+------+----+ 1236 | |empid|salary|year| 1237 | +-----+------+----+ 1238 | | 1| 60000|2018| 1239 | | 1| 70000|2019| 1240 | | 1| 80000|2020| 1241 | | 2| 60000|2018| 1242 | | 2| 65000|2019| 1243 | | 2| 65000|2020| 1244 | | 3| 60000|2018| 1245 | | 3| 65000|2019| 1246 | +-----+------+----+ 1247 | ``` 1248 | * Output :- 1249 | ```sh 1250 | +-----+------+----+-----------+ 1251 | |empid|salary|year|incresalary| 1252 | +-----+------+----+-----------+ 1253 | | 1| 60000|2018| 0| 1254 | | 1| 70000|2019| 10000| 1255 | | 1| 80000|2020| 10000| 1256 | | 2| 60000|2018| 0| 1257 | | 2| 65000|2019| 5000| 1258 | | 2| 65000|2020| 0| 1259 | | 3| 60000|2018| 0| 1260 | | 3| 65000|2019| 5000| 1261 | +-----+------+----+-----------+ 1262 | 1263 | ``` 1264 | #### Solution :- 1265 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio27.scala)
1266 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio27.py)
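PySpark sketch (illustrative only, not the repository's linked solution; lag over empid/year as in the SQL below):
```
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").appName("Scenerio27").getOrCreate()

data = [(1, 60000, 2018), (1, 70000, 2019), (1, 80000, 2020),
        (2, 60000, 2018), (2, 65000, 2019), (2, 65000, 2020),
        (3, 60000, 2018), (3, 65000, 2019)]
df = spark.createDataFrame(data, ["empid", "salary", "year"])

# Previous year's salary via lag; the first year has no previous row, so default to 0
w = Window.partitionBy("empid").orderBy("year")
result = (df.withColumn("prev", F.lag("salary", 1).over(w))
            .withColumn("incresalary", F.coalesce(F.col("salary") - F.col("prev"), F.lit(0)))
            .drop("prev"))
result.show()
```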
1267 | SQL :- 1268 | ``` 1269 | select 1270 | empid, 1271 | salary, 1272 | year, 1273 | coalesce( 1274 | (salary - diff), 1275 | 0 1276 | ) as increment 1277 | from 1278 | ( 1279 | select 1280 | *, 1281 | lag(salary, 1) over ( 1282 | partition by empid 1283 | order by 1284 | year 1285 | ) as diff 1286 | from 1287 | salarytab 1288 | ); 1289 | 1290 | ``` 1291 | 1292 | **[⬆ Back to Top](#table-of-contents)** 1293 | 1294 | 1295 | ## Scenerio-28 1296 | * Input :- 1297 | ```sh 1298 | +-----+------+ 1299 | |child|parent| 1300 | +-----+------+ 1301 | | A| AA| 1302 | | B| BB| 1303 | | C| CC| 1304 | | AA| AAA| 1305 | | BB| BBB| 1306 | | CC| CCC| 1307 | +-----+------+ 1308 | ``` 1309 | * Output :- 1310 | ```sh 1311 | +-----+------+-----------+ 1312 | |child|parent|grandparent| 1313 | +-----+------+-----------+ 1314 | | A| AA| AAA| 1315 | | C| CC| CCC| 1316 | | B| BB| BBB| 1317 | +-----+------+-----------+ 1318 | ``` 1319 | #### Solution :- 1320 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio28.scala)
1321 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio28.py) 1322 | 1323 | **[⬆ Back to Top](#table-of-contents)** 1324 | 1325 | 1326 | ## Scenerio-29 1327 | * Input :- 1328 | ```sh 1329 | +---+ 1330 | |col| 1331 | +---+ 1332 | | 1| 1333 | | 2| 1334 | | 3| 1335 | +---+ 1336 | 1337 | +----+ 1338 | |col1| 1339 | +----+ 1340 | | 1| 1341 | | 2| 1342 | | 3| 1343 | | 4| 1344 | | 5| 1345 | +----+ 1346 | ``` 1347 | * Output :- 1348 | ```sh 1349 | +---+ 1350 | |col| 1351 | +---+ 1352 | | 1| 1353 | | 2| 1354 | | 4| 1355 | | 5| 1356 | +---+ 1357 | ``` 1358 | #### Solution :- 1359 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio29.scala)
1360 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio29.py) 1361 | 1362 | **[⬆ Back to Top](#table-of-contents)** 1363 | 1364 | ## Scenerio-30 1365 | * Write a SQL Query to extract second most salary for each department 1366 | * Input :- 1367 | ```sh 1368 | +------+----+-------+-------+ 1369 | |emp_id|name|dept_id| salary| 1370 | +------+----+-------+-------+ 1371 | | 1| A| A|1000000| 1372 | | 2| B| A|2500000| 1373 | | 3| C| G| 500000| 1374 | | 4| D| G| 800000| 1375 | | 5| E| W|9000000| 1376 | | 6| F| W|2000000| 1377 | +------+----+-------+-------+ 1378 | 1379 | +--------+---------+ 1380 | |dept_id1|dept_name| 1381 | +--------+---------+ 1382 | | A| AZURE| 1383 | | G| GCP| 1384 | | W| AWS| 1385 | +--------+---------+ 1386 | ``` 1387 | * Output :- 1388 | ```sh 1389 | +------+----+---------+-------+ 1390 | |emp_id|name|dept_name| salary| 1391 | +------+----+---------+-------+ 1392 | | 1| A| AZURE|1000000| 1393 | | 6| F| AWS|2000000| 1394 | | 3| C| GCP| 500000| 1395 | +------+----+---------+-------+ 1396 | ``` 1397 | #### Solution :- 1398 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio30.scala)
1399 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio30.ipynb)
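PySpark sketch (illustrative only, not the repository's linked solution; the same join plus dense_rank as the SQL below, keeping rank 2 per department):
```
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.master("local[*]").appName("Scenerio30").getOrCreate()

emp = spark.createDataFrame(
    [(1, "A", "A", 1000000), (2, "B", "A", 2500000), (3, "C", "G", 500000),
     (4, "D", "G", 800000), (5, "E", "W", 9000000), (6, "F", "W", 2000000)],
    ["emp_id", "name", "dept_id", "salary"])
dept = spark.createDataFrame([("A", "AZURE"), ("G", "GCP"), ("W", "AWS")],
                             ["dept_id1", "dept_name"])

# Rank salaries within each department (highest first) and keep the second one
w = Window.partitionBy("dept_id").orderBy(F.col("salary").desc())
result = (emp.join(dept, emp.dept_id == dept.dept_id1)
             .withColumn("rnk", F.dense_rank().over(w))
             .where(F.col("rnk") == 2)
             .select("emp_id", "name", "dept_name", "salary"))
result.show()
```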
1400 | SQL :- 1401 | ```sh 1402 | WITH jointab AS ( 1403 | SELECT df1.emp_id, df1.name, df1.dept_id, df1.salary, df2.dept_name, 1404 | DENSE_RANK() OVER (PARTITION BY df1.dept_id ORDER BY df1.salary DESC) AS row_rank 1405 | FROM df1 1406 | INNER JOIN df2 ON df1.dept_id = df2.dept_id1 1407 | ) 1408 | SELECT emp_id,name,dept_name,salary from jointab WHERE row_rank =2; 1409 | ``` 1410 | **[⬆ Back to Top](#table-of-contents)** 1411 | 1412 | ## Scenerio-31 1413 | * Input :- 1414 | ```sh 1415 | +----+-----+--------+-----------+ 1416 | |col1| col2| col3| col4| 1417 | +----+-----+--------+-----------+ 1418 | | m1|m1,m2|m1,m2,m3|m1,m2,m3,m4| 1419 | +----+-----+--------+-----------+ 1420 | ``` 1421 | * Output :- 1422 | ```sh 1423 | +-----------+ 1424 | | col| 1425 | +-----------+ 1426 | | m1| 1427 | | m1,m2| 1428 | | m1,m2,m3| 1429 | |m1,m2,m3,m4| 1430 | | | 1431 | +-----------+ 1432 | ``` 1433 | #### Solution :- 1434 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio31.scala)
1435 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio31.ipynb)
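PySpark sketch (illustrative only, not the repository's linked solution; the same concat/split/explode idea as the SQL below, using a delimiter that does not occur in the data):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio31").getOrCreate()

df = spark.createDataFrame([("m1", "m1,m2", "m1,m2,m3", "m1,m2,m3,m4")],
                           ["col1", "col2", "col3", "col4"])

# Glue the four columns with '-', split them back apart, and explode one value per row
combined = df.select(F.concat_ws("-", "col1", "col2", "col3", "col4").alias("col"))
result = combined.select(F.explode(F.split("col", "-")).alias("col"))
result.show()
```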
1436 | SQL :- 1437 | ```sh 1438 | select 1439 | explode( 1440 | split(col, '-') 1441 | ) 1442 | from 1443 | ( 1444 | select 1445 | concat( 1446 | col1, '-', col2, '-', col3, '-', col4 1447 | ) as col 1448 | from 1449 | mtab 1450 | ); 1451 | 1452 | ``` 1453 | **[⬆ Back to Top](#table-of-contents)** 1454 | 1455 | ## Scenerio-32 1456 | * Input :- 1457 | ```sh 1458 | +-------+-------------------+ 1459 | |food_id| food_item| 1460 | +-------+-------------------+ 1461 | | 1| Veg Biryani| 1462 | | 2| Veg Fried Rice| 1463 | | 3| Kaju Fried Rice| 1464 | | 4| Chicken Biryani| 1465 | | 5|Chicken Dum Biryani| 1466 | | 6| Prawns Biryani| 1467 | | 7| Fish Birayani| 1468 | +-------+-------------------+ 1469 | 1470 | +-------+------+ 1471 | |food_id|rating| 1472 | +-------+------+ 1473 | | 1| 5| 1474 | | 2| 3| 1475 | | 3| 4| 1476 | | 4| 4| 1477 | | 5| 5| 1478 | | 6| 4| 1479 | | 7| 4| 1480 | +-------+------+ 1481 | ``` 1482 | * Output :- 1483 | ```sh 1484 | +-------+-------------------+------+---------------+ 1485 | |food_id| food_item|rating|stats(out of 5)| 1486 | +-------+-------------------+------+---------------+ 1487 | | 1| Veg Biryani| 5| *****| 1488 | | 2| Veg Fried Rice| 3| ***| 1489 | | 3| Kaju Fried Rice| 4| ****| 1490 | | 4| Chicken Biryani| 4| ****| 1491 | | 5|Chicken Dum Biryani| 5| *****| 1492 | | 6| Prawns Biryani| 4| ****| 1493 | | 7| Fish Birayani| 4| ****| 1494 | +-------+-------------------+------+---------------+ 1495 | ``` 1496 | #### Solution :- 1497 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio32%20Scala.scala)
1498 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio32.ipynb)
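PySpark sketch (illustrative only, not the repository's linked solution; a join plus repeat('*', rating) as in the SQL below):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio32").getOrCreate()

food = spark.createDataFrame(
    [(1, "Veg Biryani"), (2, "Veg Fried Rice"), (3, "Kaju Fried Rice"),
     (4, "Chicken Biryani"), (5, "Chicken Dum Biryani"), (6, "Prawns Biryani"),
     (7, "Fish Birayani")], ["food_id", "food_item"])
rating = spark.createDataFrame([(1, 5), (2, 3), (3, 4), (4, 4), (5, 5), (6, 4), (7, 4)],
                               ["food_id", "rating"])

# Turn the numeric rating into a star string of the same length
result = (food.join(rating, "food_id")
              .withColumn("stats(out of 5)", F.expr("repeat('*', rating)"))
              .orderBy("food_id"))
result.show()
```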
1499 | SQL :- 1500 | ```sh 1501 | select 1502 | foodtab.food_id, 1503 | foodtab.food_item, 1504 | ratingtab.rating, 1505 | repeat('*', ratingtab.rating) as stars 1506 | from 1507 | foodtab 1508 | inner join ratingtab on foodtab.food_id = ratingtab.food_id 1509 | order by 1510 | foodtab.food_id; 1511 | ``` 1512 | **[⬆ Back to Top](#table-of-contents)** 1513 | 1514 | ## Scenerio-33 1515 | * Write a query to print the maximum number of discount tours any 1 family can choose. 1516 | * Input :- 1517 | ```sh 1518 | +--------------------+--------------+-----------+ 1519 | | id| name|family_size| 1520 | +--------------------+--------------+-----------+ 1521 | |c00dac11bde74750b...| Alex Thomas| 9| 1522 | |eb6f2d3426694667a...| Chris Gray| 2| 1523 | |3f7b5b8e835d4e1c8...| Emily Johnson| 4| 1524 | |9a345b079d9f4d3ca...| Michael Brown| 6| 1525 | |e0a5f57516024de2a...|Jessica Wilson| 3| 1526 | +--------------------+--------------+-----------+ 1527 | 1528 | +--------------------+------------+--------+--------+ 1529 | | id| name|min_size|max_size| 1530 | +--------------------+------------+--------+--------+ 1531 | |023fd23615bd4ff4b...| Bolivia| 2| 4| 1532 | |be247f73de0f4b2d8...|Cook Islands| 4| 8| 1533 | |3e85ab80a6f84ef3b...| Brazil| 4| 7| 1534 | |e571e164152c4f7c8...| Australia| 5| 9| 1535 | |f35a7bb7d44342f7a...| Canada| 3| 5| 1536 | |a1b5a4b5fc5f46f89...| Japan| 10| 12| 1537 | +--------------------+------------+--------+--------+ 1538 | ``` 1539 | * Output :- 1540 | ```sh 1541 | +-------------+-------------------+ 1542 | | name|number_of_countries| 1543 | +-------------+-------------------+ 1544 | |Emily Johnson| 4| 1545 | +-------------+-------------------+ 1546 | ``` 1547 | #### Solution :- 1548 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio33.scala)
1549 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio33.ipynb)
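PySpark sketch (illustrative only, not the repository's linked solution; the ids below are shortened placeholders, and the join condition is family_size between min_size and max_size as in the SQL below):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio33").getOrCreate()

family = spark.createDataFrame(
    [("f1", "Alex Thomas", 9), ("f2", "Chris Gray", 2), ("f3", "Emily Johnson", 4),
     ("f4", "Michael Brown", 6), ("f5", "Jessica Wilson", 3)],
    ["id", "name", "family_size"])
country = spark.createDataFrame(
    [("c1", "Bolivia", 2, 4), ("c2", "Cook Islands", 4, 8), ("c3", "Brazil", 4, 7),
     ("c4", "Australia", 5, 9), ("c5", "Canada", 3, 5), ("c6", "Japan", 10, 12)],
    ["id", "name", "min_size", "max_size"])

# A family qualifies for a tour when its size falls inside the country's range;
# count the qualifying countries per family and keep the family with the most
cond = family["family_size"].between(country["min_size"], country["max_size"])
result = (family.join(country, cond)
                .groupBy(family["name"])
                .agg(F.count("*").alias("number_of_countries"))
                .orderBy(F.col("number_of_countries").desc())
                .limit(1))
result.show()
```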
1550 | SQL :- 1551 | ```sh 1552 | select max(number_of_countries) from (select f.name,count(*) as number_of_countries from family f inner join country c on f.family_size between c.min_size and c.max_size group by f.name); 1553 | ``` 1554 | **[⬆ Back to Top](#table-of-contents)** 1555 | 1556 | ## Scenerio-34 1557 | * Input :- 1558 | ```sh 1559 | +-----------+------+---+------+ 1560 | |customer_id| name|age|gender| 1561 | +-----------+------+---+------+ 1562 | | 1| Alice| 25| F| 1563 | | 2| Bob| 40| M| 1564 | | 3| Raj| 46| M| 1565 | | 4| Sekar| 66| M| 1566 | | 5| Jhon| 47| M| 1567 | | 6|Timoty| 28| M| 1568 | | 7| Brad| 90| M| 1569 | | 8| Rita| 34| F| 1570 | +-----------+------+---+------+ 1571 | ``` 1572 | * Output :- 1573 | ```sh 1574 | +---------+-----+ 1575 | |age_group|count| 1576 | +---------+-----+ 1577 | | 19-35| 3| 1578 | | 36-50| 3| 1579 | | 51+| 2| 1580 | +---------+-----+ 1581 | ``` 1582 | #### Solution :- 1583 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio34.scala)
1584 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio34.ipynb)
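PySpark sketch (illustrative only, not the repository's linked solution; age bands built with when/otherwise and counted per band):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio34").getOrCreate()

data = [(1, "Alice", 25, "F"), (2, "Bob", 40, "M"), (3, "Raj", 46, "M"),
        (4, "Sekar", 66, "M"), (5, "Jhon", 47, "M"), (6, "Timoty", 28, "M"),
        (7, "Brad", 90, "M"), (8, "Rita", 34, "F")]
df = spark.createDataFrame(data, ["customer_id", "name", "age", "gender"])

# Bucket each customer into an age band, then count customers per band
result = (df.withColumn("age_group",
                        F.when(F.col("age").between(19, 35), "19-35")
                         .when(F.col("age").between(36, 50), "36-50")
                         .otherwise("51+"))
            .groupBy("age_group").count()
            .orderBy("age_group"))
result.show()
```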
1585 | 1586 | **[⬆ Back to Top](#table-of-contents)** 1587 | 1588 | ## Scenerio-35 1589 | Question (IBM Question) 1590 | * Create a new dataframe df1 with the given values 1591 | * Count the null entries in a dataframe 1592 | * Remove the null entries and store them in a new dataframe df2 1593 | * Create a new dataframe df3 with the given values and join the two dataframes df1 & df2 1594 | * Fill the null values with the mean age of all students 1595 | * Filter the students who are 18 years old or older 1596 | #### Solution :- 1597 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio35.scala)
1598 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio35.ipynb)
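A loose PySpark sketch of the listed steps (illustrative only; the actual values live in the linked notebook, so the student rows below are hypothetical):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio35").getOrCreate()

# Hypothetical student data with some missing ages
df1 = spark.createDataFrame(
    [(1, "Amy", 20), (2, "Ben", None), (3, "Cara", 17), (4, "Dan", None)],
    ["id", "name", "age"])

# Count null entries per column
df1.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df1.columns]).show()

# Keep the null rows in df2 and the clean rows separately, then recombine
df2 = df1.filter(F.col("age").isNull())
df3 = df1.filter(F.col("age").isNotNull()).unionByName(df2)

# Fill missing ages with the mean age, then keep students 18 or older
mean_age = df3.select(F.avg("age")).first()[0]
result = (df3.withColumn("age", F.coalesce("age", F.lit(mean_age)))
             .filter(F.col("age") >= 18))
result.show()
```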
1599 | 1600 | **[⬆ Back to Top](#table-of-contents)** 1601 | 1602 | 1603 | ## Scenerio-36 1604 | * Input :- 1605 | ```sh 1606 | +----------+----------+ 1607 | | sell_date| product| 1608 | +----------+----------+ 1609 | |2020-05-30| Headphone| 1610 | |2020-06-01| Pencil| 1611 | |2020-06-02| Mask| 1612 | |2020-05-30|Basketball| 1613 | |2020-06-01| Book| 1614 | |2020-06-02| Mask| 1615 | |2020-05-30| T-Shirt| 1616 | +----------+----------+ 1617 | ``` 1618 | * Output :- 1619 | ```sh 1620 | +----------+--------------------+---------+ 1621 | | sell_date| products|null_sell| 1622 | +----------+--------------------+---------+ 1623 | |2020-05-30|[T-Shirt, Basketb...| 3| 1624 | |2020-06-01| [Pencil, Book]| 2| 1625 | |2020-06-02| [Mask]| 1| 1626 | +----------+--------------------+---------+ 1627 | ``` 1628 | #### Solution :- 1629 | Scala-Spark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/src/pack/Scenerio36.scala)
1630 | PySpark :- [Click Here](https://github.com/mohankrishna02/interview-scenerios-spark-sql/blob/master/Scenerio36.ipynb)
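PySpark sketch (illustrative only, not the repository's linked solution; collect_set plus size, mirroring the SQL below):
```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[*]").appName("Scenerio36").getOrCreate()

data = [("2020-05-30", "Headphone"), ("2020-06-01", "Pencil"), ("2020-06-02", "Mask"),
        ("2020-05-30", "Basketball"), ("2020-06-01", "Book"), ("2020-06-02", "Mask"),
        ("2020-05-30", "T-Shirt")]
df = spark.createDataFrame(data, ["sell_date", "product"])

# Distinct products per day, plus the size of that set
result = (df.groupBy("sell_date")
            .agg(F.collect_set("product").alias("products"))
            .withColumn("null_sell", F.size("products"))
            .orderBy("sell_date"))
result.show(truncate=False)
```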
1631 | 1632 | SQL :- 1633 | ```sh 1634 | select sell_date,(collect_set(product)) as products,size(collect_set(product)) as num_sell from products group by sell_date; 1635 | ``` 1636 | **[⬆ Back to Top](#table-of-contents)** 1637 | 1638 | 1639 | 1640 | 1641 | 1642 | 1643 | 1644 | 1645 | 1646 | -------------------------------------------------------------------------------- /Scenerio-1.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf,SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-1") 6 | sc = SparkContext(conf=conf) 7 | sc.setLogLevel("ERROR") 8 | spark = SparkSession.builder.getOrCreate() 9 | 10 | data = [("001", "Monika", "Arora", 100000, "2014-02-20 09:00:00", "HR"),("002", "Niharika", "Verma", 300000, "2014-06-11 09:00:00", "Admin"),("003", "Vishal", "Singhal", 300000, "2014-02-20 09:00:00", "HR"),("004", "Amitabh", "Singh", 500000, "2014-02-20 09:00:00", "Admin"),("005", "Vivek", "Bhati", 500000, "2014-06-11 09:00:00", "Admin")] 11 | myschema = ["workerid","firstname","lastname","salary","joiningdate","depart"] 12 | df = spark.createDataFrame(data,schema=myschema) 13 | df.show() 14 | #Through SQL 15 | df.createOrReplaceTempView("worktab") 16 | spark.sql("select a.workerid,a.firstname,a.lastname,a.salary,a.joiningdate,a.depart from worktab a, worktab b where a.salary=b.salary and a.workerid !=b.workerid").show() 17 | 18 | #Through Spark DSL 19 | finaldf = df.alias("a").join(df.alias("b"), (col("a.salary") == col("b.salary")) & (col("a.workerid") != col("b.workerid")), "inner").select(col("a.workerid"), col("a.firstname"), col("a.lastname"), col("a.salary"), col("a.joiningdate"), col("a.depart")).show() 20 | -------------------------------------------------------------------------------- /Scenerio10.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-10") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data = [ 12 | (1, 300, "31-Jan-2021"), 13 | (1, 400, "28-Feb-2021"), 14 | (1, 200, "31-Mar-2021"), 15 | (2, 1000, "31-Oct-2021"), 16 | (2, 900, "31-Dec-2021") 17 | ] 18 | df = spark.createDataFrame(data, ["empid", "commissionamt", "monthlastdate"]) 19 | df.show() 20 | 21 | maxdatedf = df.groupBy(col("empid").alias("empid1")).agg(max("monthlastdate").alias("maxdate")) 22 | maxdatedf.show() 23 | 24 | joindf = df.join(maxdatedf, (df["empid"] == maxdatedf["empid1"]) & (df["monthlastdate"] == maxdatedf["maxdate"]), 25 | "inner").drop("empid1", "maxdate") 26 | joindf.show() 27 | -------------------------------------------------------------------------------- /Scenerio11.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-10") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = 
SparkSession.builder.getOrCreate() 11 | data = [ 12 | (1, "Jhon", 4000), 13 | (2, "Tim David", 12000), 14 | (3, "Json Bhrendroff", 7000), 15 | (4, "Jordon", 8000), 16 | (5, "Green", 14000), 17 | (6, "Brewis", 6000) 18 | ] 19 | df = spark.createDataFrame(data, ["emp_id", "emp_name", "salary"]) 20 | df.show() 21 | 22 | # Through SQL 23 | df.createOrReplaceTempView("emptab") 24 | spark.sql( 25 | "select *,case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end as grade from emptab ").show() 26 | 27 | # Through DSL 28 | finaldf = df.withColumn("grade", expr( 29 | "case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end")).show() 30 | -------------------------------------------------------------------------------- /Scenerio12.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio12") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | 12 | #creating UDF functions for masked data, here email[0] is it will take first letter i.e 0th index and email[8:] is it will take the string from 8th index position to end of the string 13 | def mask_email(email): 14 | return (email[0] + "**********" + email[8:]) 15 | 16 | #creating UDF functions for masked data, here mobile[0:2] is it will take string from Index 0 to 2 letters and mobile[-3:] is it will take string last three index to end the end of the string 17 | def mask_mobile(mobile): 18 | return (mobile[0:2] + "*****" + mobile[-3:]) 19 | 20 | 21 | df = spark.createDataFrame([("Renuka1992@gmail.com", "9856765434"), ("anbu.arasu@gmail.com", "9844567788")], ["email", "mobile"]) 22 | df.show() 23 | 24 | maskeddf = df.withColumn("email",udf(mask_email)(df.email)).withColumn("mobile",udf(mask_mobile)(df.mobile)) 25 | maskeddf.show() 26 | -------------------------------------------------------------------------------- /Scenerio13.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import * 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import * 6 | from pyspark.sql.window import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio13") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | data = [(1, "Jhon", "Development"), 14 | (2, "Tim", "Development"), 15 | (3, "David", "Testing"), 16 | (4, "Sam", "Testing"), 17 | (5, "Green", "Testing"), 18 | (6, "Miller", "Production"), 19 | (7, "Brevis", "Production"), 20 | (8, "Warner", "Production"), 21 | (9, "Salt", "Production")] 22 | df = spark.createDataFrame(data, ["emp_id", "emp_name", "dept"]) 23 | df.show() 24 | 25 | # Through SQL 26 | df.createOrReplaceTempView("emptab") 27 | spark.sql("SELECT dept, COUNT(*) AS total FROM emptab GROUP BY dept").show() 28 | 29 | # Through DSL 30 | finaldf = df.groupBy(col("dept")).agg(count("*").alias("total")).show() 31 | -------------------------------------------------------------------------------- /Scenerio14.py: -------------------------------------------------------------------------------- 1 | from pyspark 
import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import * 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import * 6 | from pyspark.sql.window import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio14") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | data = [ 14 | (203040, "rajesh", 10, 20, 30, 40, 50) 15 | ] 16 | 17 | df = spark.createDataFrame(data, ["rollno", "name", "telugu", "english", "maths", "science", "social"]) 18 | df.show() 19 | 20 | # Through SQL 21 | df.createOrReplaceTempView("marks") 22 | spark.sql("select *, (telugu+english+maths+science+social) as total from marks").show() 23 | 24 | # Through DSL 25 | finaldf = df.withColumn("total", expr("telugu+english+maths+science+social")).show() 26 | -------------------------------------------------------------------------------- /Scenerio15.py: -------------------------------------------------------------------------------- 1 | l1 = [2, 3, 4, 5] 2 | l2 = [6, 7, 8, 9] 3 | # append 4 | appendlst = l1.append(l2) 5 | print(l1) 6 | 7 | # extend 8 | l1.extend(l2) 9 | print(l1) 10 | -------------------------------------------------------------------------------- /Scenerio16.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import * 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import * 6 | from pyspark.sql.window import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio16") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | data = [(1, "Jhon", "Testing", 5000), 13 | (2, "Tim", "Development", 6000), 14 | (3, "Jhon", "Development", 5000), 15 | (4, "Sky", "Prodcution", 8000)] 16 | df = spark.createDataFrame(data, ["id", "name", "dept", "salary"]) 17 | df.show() 18 | 19 | finaldf = df.dropDuplicates(["name"]).orderBy("id") 20 | finaldf.show() 21 | -------------------------------------------------------------------------------- /Scenerio17.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import * 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import * 6 | from pyspark.sql.window import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio17") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | data = [(1, "Tim", 24, "Kerala", "India"), 13 | (2, "Asman", 26, "Kerala", "India")] 14 | df1 = spark.createDataFrame(data, ["emp_id", "name", "age", "state", "country"]) 15 | df1.show() 16 | 17 | data2 = [(1, "Tim", 24, "Comcity"), 18 | (2, "Asman", 26, "bimcity")] 19 | df2 = spark.createDataFrame(data2, ["emp_id", "name", "age", "address"]) 20 | df2.show() 21 | 22 | findf = df1.join(df2, ["emp_id", "name", "age"], "outer") 23 | findf.show() 24 | -------------------------------------------------------------------------------- /Scenerio18.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql import * 4 | from pyspark.sql.types import * 5 | from pyspark.sql.functions import 
* 6 | from pyspark.sql.window import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio18") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | # Create input DataFrame 14 | inputdf = spark.createDataFrame([("The Social Dilemma",)], ["word"]) 15 | inputdf.show() 16 | 17 | # Define UDF for reversing words 18 | def reverse_sentence(sentence): 19 | return " ".join([word[::-1] for word in sentence.split(" ")]) 20 | 21 | # Register UDF 22 | reverse_udf = udf(reverse_sentence, StringType()) 23 | 24 | # Apply UDF to input DataFrame 25 | outputdf = inputdf.withColumn("reverse word", reverse_udf("word")).drop("word") 26 | outputdf.show() -------------------------------------------------------------------------------- /Scenerio19.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import * 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio19") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | 12 | df = spark.read.format("json").option("multiline", "true").load("dbfs:/FileStore/scen.json") 13 | df.printSchema() 14 | finaldf = df.withColumn("multiMedia", explode(col("multiMedia"))).withColumn("dislikes", 15 | expr("likeDislike.dislikes")).withColumn( 16 | "likes", expr("likeDislike.likes")).withColumn("userAction", expr("likeDislike.userAction")).withColumn("createAt", 17 | expr( 18 | "multiMedia.createAt")).withColumn( 19 | "description", expr("multiMedia.description")).withColumn("id", expr("multiMedia.id")).withColumn("likeCount", expr( 20 | "multiMedia.likeCount")).withColumn("mediatype", expr("multiMedia.mediatype")).withColumn("name", expr( 21 | "multiMedia.name")).withColumn("place", expr("multiMedia.place")).withColumn("url", expr("multiMedia.url")).drop( 22 | "likeDislike", "multiMedia") 23 | print("flat Schema") 24 | finaldf.printSchema() 25 | finaldf.show() 26 | -------------------------------------------------------------------------------- /Scenerio2.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf,SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-2") 6 | sc = SparkContext(conf=conf) 7 | sc.setLogLevel("ERROR") 8 | spark = SparkSession.builder.getOrCreate() 9 | data = [ 10 | (1, "1-Jan", "Ordered"), 11 | (1, "2-Jan", "dispatched"), 12 | (1, "3-Jan", "dispatched"), 13 | (1, "4-Jan", "Shipped"), 14 | (1, "5-Jan", "Shipped"), 15 | (1, "6-Jan", "Delivered"), 16 | (2, "1-Jan", "Ordered"), 17 | (2, "2-Jan", "dispatched"), 18 | (2, "3-Jan", "shipped")] 19 | myschema = ["orderid","statusdate","status"] 20 | df = spark.createDataFrame(data,schema=myschema) 21 | df.show() 22 | #Through SQL 23 | df.createOrReplaceTempView("ordertab") 24 | spark.sql("select * from ordertab where status = 'dispatched' and orderid in(select orderid from ordertab where status = 'Ordered')").show() 25 | 26 | #Through DSL 27 | result = df.filter( 28 | (col("status") == "dispatched") & 29 | (col("orderid").isin( 30 | *[row[0] for row in df.filter(col("status") == "Ordered").select("orderid").collect()] 31 | )) 32 | ) 33 | result.show() 34 
| -------------------------------------------------------------------------------- /Scenerio20.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import * 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio20") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | 12 | df = spark.read.format("json").option("multiline", "true").load( 13 | "dbfs:/FileStore/flatjson/part-00000-tid-3675309499584050336-b8650962-dec3-4fe4-a204-c914090f019e-21-1-c000.json") 14 | df.printSchema() 15 | compdf = df.select( 16 | col("code"), 17 | col("commentCount"), 18 | col("createdAt"), 19 | col("description"), 20 | col("feedsComment"), 21 | col("id"), 22 | col("imagePaths"), 23 | col("images"), 24 | col("isdeleted"), 25 | col("lat"), 26 | struct(col("dislikes"), col("likes"), col("userAction")).alias("likeDislike"), 27 | col("lng"), 28 | col("location"), 29 | col("mediatype"), 30 | col("msg"), 31 | array( 32 | struct( 33 | col("createAt"), 34 | col("description"), 35 | col("id"), 36 | col("likeCount"), 37 | col("mediatype"), 38 | col("name"), 39 | col("place"), 40 | col("url") 41 | ).alias("element") 42 | ).alias("multiMedia"), 43 | col("name"), 44 | col("profilePicture"), 45 | col("title"), 46 | col("userId"), 47 | col("videoUrl"), 48 | col("totalFeed") 49 | ) 50 | 51 | compdf.printSchema() 52 | -------------------------------------------------------------------------------- /Scenerio21.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import * 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | from pyspark import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio21") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | data = [ 14 | ("SEA", "SF", 300), 15 | ("CHI", "SEA", 2000), 16 | ("SF", "SEA", 300), 17 | ("SEA", "CHI", 2000), 18 | ("SEA", "LND", 500), 19 | ("LND", "SEA", 500), 20 | ("LND", "CHI", 1000), 21 | ("CHI", "NDL", 180)] 22 | df = spark.createDataFrame(data, ["from", "to", "dist"]) 23 | df.show() 24 | 25 | # Through SQL 26 | df.createOrReplaceTempView("trip") 27 | spark.sql("""SELECT r1.from, r1.to, (r1.dist + r2.dist) AS roundtrip_dist 28 | FROM trip r1 29 | JOIN trip r2 ON r1.from = r2.to AND r1.to = r2.from 30 | WHERE r1.from < r1.to 31 | """).show() 32 | 33 | # Through DSL 34 | finaldf = df.alias("r1").join(df.alias("r2"), 35 | (col("r1.from") == col("r2.to")) & (col("r1.to") == col("r2.from"))).where( 36 | col("r1.from") < col("r1.to")).select(col("r1.from"), col("r1.to"), 37 | (col("r1.dist") + col("r2.dist")).alias("roundtrip_dist")) 38 | 39 | finaldf.show() 40 | -------------------------------------------------------------------------------- /Scenerio22.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import * 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | from pyspark import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio22") 9 | sc = SparkContext(conf=conf) 10 | 
sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | data = [(1, "26-May", 100), 14 | (1, "27-May", 200), 15 | (1, "28-May", 300), 16 | (2, "29-May", 400), 17 | (3, "30-May", 500), 18 | (3, "31-May", 600)] 19 | df = spark.createDataFrame(data, ["pid", "date", "price"]) 20 | df.show() 21 | # Through SQL 22 | df.createOrReplaceTempView("ordertab") 23 | spark.sql("select pid,date,price, sum(price) over(partition by(pid) order by(price)) as new_price from ordertab").show() 24 | # Through DSL 25 | wn = Window.partitionBy("pid").orderBy("price") 26 | finaldf = df.withColumn("new_price", sum("price"). 27 | over(wn)).show() 28 | -------------------------------------------------------------------------------- /Scenerio23.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import * 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | from pyspark import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio23") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | data = [(1, 5), (2, 6), (3, 5), (3, 6), (1, 6)] 14 | df = spark.createDataFrame(data, ["customer_id", "product_key"]) 15 | df.show() 16 | data2 = [(5,), (6,)] 17 | df2 = spark.createDataFrame(data2, ["product_key"]) 18 | df2.show() 19 | finaldf = df.join(df2, ["product_key"], "inner").drop("product_key").distinct().filter(col("customer_id") != 2) 20 | finaldf.show() 21 | -------------------------------------------------------------------------------- /Scenerio24.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import * 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | from pyspark import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio24") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | data = [ 14 | (1, "home"), 15 | (1, "products"), 16 | (1, "checkout"), 17 | (1, "confirmation"), 18 | (2, "home"), 19 | (2, "products"), 20 | (2, "cart"), 21 | (2, "checkout"), 22 | (2, "confirmation"), 23 | (2, "home"), 24 | (2, "products")] 25 | df = spark.createDataFrame(data, ["userid", "page"]) 26 | df.show() 27 | # Through SQL 28 | df.createOrReplaceTempView("pagetab") 29 | spark.sql("select userid, collect_list(page) as pages from pagetab group by userid").show() 30 | 31 | # Through DSL 32 | finaldf = df.groupBy("userid").agg(collect_list("page").alias("pages")) 33 | finaldf.show(truncate=False) 34 | -------------------------------------------------------------------------------- /Scenerio25.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | from pyspark.sql import * 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | from pyspark import * 7 | 8 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio25") 9 | sc = SparkContext(conf=conf) 10 | sc.setLogLevel("ERROR") 11 | spark = SparkSession.builder.getOrCreate() 12 | 13 | df = spark.read.format("csv").option("header", "true") \ 14 | .option("mode", "DROPMALFORMED") \ 15 | .load("D:/BigData/Datasets/Scenerio25.csv") 16 | df.show() 
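# Note on the read mode used above: "DROPMALFORMED" tells the CSV reader to silently
# discard rows that do not fit the 3-column header (emp_no,emp_name,dep), so the
# "Invalid Entry", "Connection lost" and "Bad Record" lines in Datasets/Scenerio25.csv
# are dropped and only the clean employee rows are shown. Spark's CSV source also
# accepts "PERMISSIVE" (the default, which keeps such rows and fills the missing
# fields with null) and "FAILFAST" (which raises an error on the first malformed record).
# For reference, the same file could be read keeping the bad rows instead, e.g.:
#   spark.read.format("csv").option("header", "true").option("mode", "PERMISSIVE").load(...)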
17 | -------------------------------------------------------------------------------- /Scenerio26.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf 2 | from pyspark import SparkContext 3 | from pyspark import * 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql import * 6 | from pyspark.sql.types import * 7 | from pyspark.sql.functions import * 8 | from pyspark.sql.window import * 9 | 10 | conf = SparkConf().setMaster("local[*]").setAppName("test") 11 | sc = SparkContext(conf=conf) 12 | sc.setLogLevel("ERROR") 13 | 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | sourcedata = [ 17 | (1, "A"), 18 | (2, "B"), 19 | (3, "C"), 20 | (4, "D")] 21 | mysourceshcema = ["id","name"] 22 | sourcedf = spark.createDataFrame(sourcedata,schema=mysourceshcema) 23 | sourcedf.show() 24 | 25 | targetdata = [ 26 | (1, "A"), 27 | (2, "B"), 28 | (4, "X"), 29 | (5, "F")] 30 | mytargetschema = ["id1","name1"] 31 | targetdf = spark.createDataFrame(targetdata,schema=mytargetschema) 32 | targetdf.show() 33 | 34 | #--------------------------Through SQL 35 | 36 | sourcedf.createOrReplaceTempView("sourcetab") 37 | targetdf.createOrReplaceTempView("targettab") 38 | 39 | print("=================Through SQL==========================") 40 | spark.sql("""SELECT COALESCE(s.id, t.id1) AS id, 41 | CASE 42 | WHEN s.name IS NULL THEN 'new in target' 43 | WHEN t.name1 IS NULL THEN 'new in source' 44 | WHEN s.name != t.name1 THEN 'mismatch' 45 | END AS comment 46 | FROM sourcetab s 47 | FULL OUTER JOIN targettab t ON s.id = t.id1 48 | WHERE s.name != t.name1 OR s.name IS NULL OR t.name1 IS NULL 49 | """).show() 50 | 51 | print("==================Through DSL===============================") 52 | #--------------------------Through DSL 53 | #//Joining two dataframes 54 | 55 | joindf = sourcedf.join(targetdf, sourcedf["id"]==targetdf["id1"],"outer") 56 | joindf.show() 57 | 58 | #//filtering the columns which are not equal and null 59 | 60 | fildf = joindf.filter((col("name") != col("name1")) | col("name").isNull() | col("name1").isNull()) 61 | fildf.show() 62 | 63 | #//coalesce will replace the null value with next non null value 64 | 65 | filnulldf = fildf.withColumn("id",coalesce(col("id"),col("id1"))).drop("id1") 66 | filnulldf.show() 67 | 68 | finaldf = filnulldf.withColumn("comment",expr("case when name is null then 'new in target' when name1 is null then 'new in source' when name != name1 then 'mismatch' end")).drop("name","name1") 69 | finaldf.show() 70 | 71 | 72 | -------------------------------------------------------------------------------- /Scenerio27.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf 2 | from pyspark import SparkContext 3 | from pyspark import * 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql import * 6 | from pyspark.sql.types import * 7 | from pyspark.sql.functions import * 8 | from pyspark.sql.window import * 9 | 10 | conf = SparkConf().setMaster("local[*]").setAppName("test") 11 | sc = SparkContext(conf=conf) 12 | sc.setLogLevel("ERROR") 13 | 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | data = [(1,60000,2018),(1,70000,2019),(1,80000,2020),(2,60000,2018),(2,65000,2019),(2,65000,2020),(3,60000,2018),(3,65000,2019)] 17 | 18 | df = spark.createDataFrame(data,["empid","salary","year"]) 19 | 20 | df.show() 21 | 22 | wn = Window.partitionBy("empid").orderBy("year") 23 | 24 | lagdf = df.withColumn("diff",lag("salary",1).over(wn)) 
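# lag("salary", 1) over the window (partitioned by empid, ordered by year) pulls each
# employee's previous year's salary into "diff"; the first year of every empid has no
# previous row, so "diff" is null there — which is why the final step below subtracts
# it, drops the helper column, and uses na.fill(0) to zero out that first year.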
25 | lagdf.show() 26 | 27 | finaldf = lagdf.withColumn("incresalary",expr("salary - diff")).drop("diff").na.fill(0).orderBy("empid","year") 28 | 29 | finaldf.show() -------------------------------------------------------------------------------- /Scenerio28.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf 2 | from pyspark import SparkContext 3 | from pyspark import * 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql import * 6 | from pyspark.sql.types import * 7 | from pyspark.sql.functions import * 8 | from pyspark.sql.window import * 9 | 10 | conf = SparkConf().setMaster("local[*]").setAppName("test") 11 | sc = SparkContext(conf=conf) 12 | sc.setLogLevel("ERROR") 13 | 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | data = [("A", "AA"), ("B", "BB"), ("C", "CC"), ("AA", "AAA"), ("BB", "BBB"), ("CC", "CCC")] 17 | 18 | df = spark.createDataFrame(data, ["child", "parent"]) 19 | df.show() 20 | 21 | joindf = df.alias("a").join(df.alias("b"), col("a.child") == col("b.parent")).select( 22 | col("a.child").alias("child_a"), 23 | col("a.parent").alias("parent_a"), 24 | col("b.child").alias("child_b"), 25 | col("b.parent").alias("parent_b") 26 | ) 27 | joindf.show() 28 | 29 | findf = joindf.withColumnRenamed("child_a", "parent").withColumnRenamed("parent_a", "grandparent").withColumnRenamed( 30 | "child_b", "child").drop("parent_b").select("child", "parent", "grandparent") 31 | 32 | findf.show() 33 | 34 | # another way 35 | 36 | df2 = df.withColumnRenamed("child", "child1").withColumnRenamed("parent", "parent1") 37 | df2.show() 38 | 39 | secondjoindf = df.join(df2, col("parent") == col("child1"), "inner") 40 | secondjoindf.show() 41 | 42 | finaldf = secondjoindf.withColumnRenamed("parent1", "grandparent").drop("child1") 43 | finaldf.show() 44 | -------------------------------------------------------------------------------- /Scenerio29.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf 2 | from pyspark import SparkContext 3 | from pyspark import * 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql import * 6 | from pyspark.sql.types import * 7 | from pyspark.sql.functions import * 8 | from pyspark.sql.window import * 9 | 10 | conf = SparkConf().setMaster("local[*]").setAppName("test") 11 | sc = SparkContext(conf=conf) 12 | sc.setLogLevel("ERROR") 13 | 14 | spark = SparkSession.builder.getOrCreate() 15 | 16 | data1 = [(1,), (2,), (3,)] 17 | 18 | df1 = spark.createDataFrame(data1, ["col"]) 19 | df1.show() 20 | 21 | data2 = [(1,), (2,), (3,), (4,), (5,)] 22 | 23 | df2 = spark.createDataFrame(data2, ["col1"]) 24 | df2.show() 25 | 26 | maxdf = df1.agg(max("col").alias("max")) 27 | maxdf.show() 28 | 29 | maxsalary = maxdf.select(col("max")).first()[0] 30 | 31 | joindf = df1.join(df2, df1["col"] == df2["col1"], "outer").drop("col") 32 | joindf.show() 33 | 34 | finaldf = joindf.filter(col("col1") != maxsalary).withColumnRenamed("col1", "col").orderBy("col") 35 | finaldf.show() 36 | -------------------------------------------------------------------------------- /Scenerio3.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-3") 8 | sc = 
SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data = [(1111, "2021-01-15", 10), 12 | (1111, "2021-01-16", 15), 13 | (1111, "2021-01-17", 30), 14 | (1112, "2021-01-15", 10), 15 | (1112, "2021-01-15", 20), 16 | (1112, "2021-01-15", 30)] 17 | 18 | myschema = ["sensorid", "timestamp", "values"] 19 | 20 | df = spark.createDataFrame(data, schema=myschema) 21 | df.show() 22 | 23 | d1 = Window.partitionBy("sensorid").orderBy("values") 24 | 25 | finaldf = df.withColumn("nextvalues", lead("values", 1).over(d1)) \ 26 | .filter(col("nextvalues").isNotNull()) \ 27 | .withColumn("values", expr("nextvalues-values")) \ 28 | .drop("nextvalues") \ 29 | .orderBy(col("sensorid")).show() 30 | -------------------------------------------------------------------------------- /Scenerio30.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "8148ca11-3054-40c9-b01a-24dc8169bd4d", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark import *\n", 13 | "from pyspark import SparkConf, SparkContext\n", 14 | "from pyspark.sql import *\n", 15 | "from pyspark.sql import SparkSession\n", 16 | "from pyspark.sql.functions import *\n", 17 | "from pyspark.sql.types import *\n", 18 | "from pyspark.sql.window import *\n", 19 | "\n", 20 | "conf = SparkConf().setMaster(\"local[*]\").setAppName(\"test\")\n", 21 | "sc = SparkContext(conf=conf)\n", 22 | "sc.setLogLevel(\"ERROR\")\n", 23 | "\n", 24 | "spark = SparkSession.builder.getOrCreate()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "id": "2eae0587-e373-4a4f-a0fa-dd1653df168f", 31 | "metadata": { 32 | "tags": [] 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stderr", 37 | "output_type": "stream", 38 | "text": [ 39 | "d:\\bigdata\\pyspark\\python37\\lib\\importlib\\_bootstrap.py:219: RuntimeWarning: numpy.ufunc size changed, may indicate binary incompatibility. 
Expected 192 from C header, got 216 from PyObject\n", 40 | " return f(*args, **kwds)\n" 41 | ] 42 | }, 43 | { 44 | "name": "stdout", 45 | "output_type": "stream", 46 | "text": [ 47 | "+------+----+-------+-------+\n", 48 | "|emp_id|name|dept_id| salary|\n", 49 | "+------+----+-------+-------+\n", 50 | "| 1| A| A|1000000|\n", 51 | "| 2| B| A|2500000|\n", 52 | "| 3| C| G| 500000|\n", 53 | "| 4| D| G| 800000|\n", 54 | "| 5| E| W|9000000|\n", 55 | "| 6| F| W|2000000|\n", 56 | "+------+----+-------+-------+\n", 57 | "\n", 58 | "+--------+---------+\n", 59 | "|dept_id1|dept_name|\n", 60 | "+--------+---------+\n", 61 | "| A| AZURE|\n", 62 | "| G| GCP|\n", 63 | "| W| AWS|\n", 64 | "+--------+---------+\n", 65 | "\n" 66 | ] 67 | } 68 | ], 69 | "source": [ 70 | "data1 = [\n", 71 | " (1, \"A\", \"A\", 1000000),\n", 72 | " (2, \"B\", \"A\", 2500000),\n", 73 | " (3, \"C\", \"G\", 500000),\n", 74 | " (4, \"D\", \"G\", 800000),\n", 75 | " (5, \"E\", \"W\", 9000000),\n", 76 | " (6, \"F\", \"W\", 2000000),\n", 77 | "]\n", 78 | "df1 = spark.createDataFrame(data1, [\"emp_id\", \"name\", \"dept_id\", \"salary\"])\n", 79 | "df1.show()\n", 80 | "\n", 81 | "data2 = [(\"A\", \"AZURE\"), (\"G\", \"GCP\"), (\"W\", \"AWS\")]\n", 82 | "df2 = spark.createDataFrame(data2, [\"dept_id1\", \"dept_name\"])\n", 83 | "df2.show()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 3, 89 | "id": "d975d88d-3db6-40ab-9e3e-b43269f98188", 90 | "metadata": { 91 | "tags": [] 92 | }, 93 | "outputs": [ 94 | { 95 | "name": "stdout", 96 | "output_type": "stream", 97 | "text": [ 98 | "+------+----+-------+-------+---------+\n", 99 | "|emp_id|name|dept_id| salary|dept_name|\n", 100 | "+------+----+-------+-------+---------+\n", 101 | "| 1| A| A|1000000| AZURE|\n", 102 | "| 2| B| A|2500000| AZURE|\n", 103 | "| 5| E| W|9000000| AWS|\n", 104 | "| 6| F| W|2000000| AWS|\n", 105 | "| 3| C| G| 500000| GCP|\n", 106 | "| 4| D| G| 800000| GCP|\n", 107 | "+------+----+-------+-------+---------+\n", 108 | "\n" 109 | ] 110 | } 111 | ], 112 | "source": [ 113 | "joindf = df1.join(df2, df1[\"dept_id\"] == df2[\"dept_id1\"], \"inner\").drop(\"dept_id1\")\n", 114 | "joindf.show()" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 5, 120 | "id": "8533dbae-3872-40b2-8931-5c96c832853c", 121 | "metadata": { 122 | "tags": [] 123 | }, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "+------+----+-------+-------+---------+----+\n", 130 | "|emp_id|name|dept_id| salary|dept_name|rank|\n", 131 | "+------+----+-------+-------+---------+----+\n", 132 | "| 2| B| A|2500000| AZURE| 1|\n", 133 | "| 1| A| A|1000000| AZURE| 2|\n", 134 | "| 5| E| W|9000000| AWS| 1|\n", 135 | "| 6| F| W|2000000| AWS| 2|\n", 136 | "| 4| D| G| 800000| GCP| 1|\n", 137 | "| 3| C| G| 500000| GCP| 2|\n", 138 | "+------+----+-------+-------+---------+----+\n", 139 | "\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "wn = Window.partitionBy(\"dept_id\").orderBy(col(\"salary\").desc())\n", 145 | "\n", 146 | "rankdf = joindf.withColumn(\"rank\", dense_rank().over(wn))\n", 147 | "rankdf.show()" 148 | ] 149 | }, 150 | { 151 | "cell_type": "code", 152 | "execution_count": 6, 153 | "id": "a37624d0-1513-4f73-ab36-4b09c1613adc", 154 | "metadata": { 155 | "tags": [] 156 | }, 157 | "outputs": [ 158 | { 159 | "name": "stdout", 160 | "output_type": "stream", 161 | "text": [ 162 | "+------+----+---------+-------+\n", 163 | "|emp_id|name|dept_name| salary|\n", 164 | "+------+----+---------+-------+\n", 165 
| "| 1| A| AZURE|1000000|\n", 166 | "| 6| F| AWS|2000000|\n", 167 | "| 3| C| GCP| 500000|\n", 168 | "+------+----+---------+-------+\n", 169 | "\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "finaldf = (\n", 175 | " rankdf.filter(col(\"rank\") == 2)\n", 176 | " .drop(\"rank\")\n", 177 | " .select(\"emp_id\", \"name\", \"dept_name\", \"salary\")\n", 178 | ")\n", 179 | "finaldf.show()" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": null, 185 | "id": "0bc4c5b1-46b9-4837-b841-244f81f8816a", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [] 189 | } 190 | ], 191 | "metadata": { 192 | "kernelspec": { 193 | "display_name": "Python 3 (ipykernel)", 194 | "language": "python", 195 | "name": "python3" 196 | }, 197 | "language_info": { 198 | "codemirror_mode": { 199 | "name": "ipython", 200 | "version": 3 201 | }, 202 | "file_extension": ".py", 203 | "mimetype": "text/x-python", 204 | "name": "python", 205 | "nbconvert_exporter": "python", 206 | "pygments_lexer": "ipython3", 207 | "version": "3.7.0" 208 | } 209 | }, 210 | "nbformat": 4, 211 | "nbformat_minor": 5 212 | } 213 | -------------------------------------------------------------------------------- /Scenerio31.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 7, 6 | "id": "d20891a9-7e5c-440e-be32-92e3ea3b5632", 7 | "metadata": { 8 | "tags": [] 9 | }, 10 | "outputs": [], 11 | "source": [ 12 | "from pyspark import *\n", 13 | "from pyspark import SparkConf, SparkContext\n", 14 | "from pyspark.sql import *\n", 15 | "from pyspark.sql import SparkSession\n", 16 | "from pyspark.sql.functions import *\n", 17 | "from pyspark.sql.types import *\n", 18 | "from pyspark.sql.window import *\n", 19 | "\n", 20 | "conf = SparkConf().setMaster(\"local[*]\").setAppName(\"test\")\n", 21 | "sc = SparkContext(conf=conf)\n", 22 | "sc.setLogLevel(\"ERROR\")\n", 23 | "\n", 24 | "spark = SparkSession.builder.getOrCreate()" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 8, 30 | "id": "a4821839-667b-4aa0-9f40-e6f944e1d5fb", 31 | "metadata": { 32 | "tags": [] 33 | }, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "+----+-----+--------+-----------+\n", 40 | "|col1| col2| col3| col4|\n", 41 | "+----+-----+--------+-----------+\n", 42 | "| m1|m1,m2|m1,m2,m3|m1,m2,m3,m4|\n", 43 | "+----+-----+--------+-----------+\n", 44 | "\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "# creating the dataframe\n", 50 | "\n", 51 | "data = [(\"m1\", \"m1,m2\", \"m1,m2,m3\", \"m1,m2,m3,m4\")]\n", 52 | "\n", 53 | "df = spark.createDataFrame(data, [\"col1\", \"col2\", \"col3\", \"col4\"])\n", 54 | "df.show()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 14, 60 | "id": "0249aa2d-81a9-4247-a0fd-b8ee74fb6fcf", 61 | "metadata": { 62 | "tags": [] 63 | }, 64 | "outputs": [ 65 | { 66 | "name": "stdout", 67 | "output_type": "stream", 68 | "text": [ 69 | "+--------------------+\n", 70 | "| col|\n", 71 | "+--------------------+\n", 72 | "|m1-m1,m2-m1,m2,m3...|\n", 73 | "+--------------------+\n", 74 | "\n" 75 | ] 76 | } 77 | ], 78 | "source": [ 79 | "# concating the dataframe into single column\n", 80 | "\n", 81 | "contdf = df.withColumn(\"col\", expr(\"concat(col1,'-',col2,'-',col3,'-',col4)\")).drop(\n", 82 | " \"col1\", \"col2\", \"col3\", \"col4\"\n", 83 | ")\n", 84 | "contdf.show()" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | 
"execution_count": 15, 90 | "id": "594fa64d-e0fc-4335-a4f0-82ffca893248", 91 | "metadata": { 92 | "tags": [] 93 | }, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "+-----------+\n", 100 | "| col|\n", 101 | "+-----------+\n", 102 | "| m1|\n", 103 | "| m1,m2|\n", 104 | "| m1,m2,m3|\n", 105 | "|m1,m2,m3,m4|\n", 106 | "+-----------+\n", 107 | "\n" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "finaldf = contdf.selectExpr(\"explode(split(col,'-')) as col\")\n", 113 | "finaldf.show()" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": null, 119 | "id": "ec4e34b8-ca4d-4356-a3b5-3c8b94e9b470", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [] 123 | } 124 | ], 125 | "metadata": { 126 | "kernelspec": { 127 | "display_name": "Python 3 (ipykernel)", 128 | "language": "python", 129 | "name": "python3" 130 | }, 131 | "language_info": { 132 | "codemirror_mode": { 133 | "name": "ipython", 134 | "version": 3 135 | }, 136 | "file_extension": ".py", 137 | "mimetype": "text/x-python", 138 | "name": "python", 139 | "nbconvert_exporter": "python", 140 | "pygments_lexer": "ipython3", 141 | "version": "3.7.0" 142 | } 143 | }, 144 | "nbformat": 4, 145 | "nbformat_minor": 5 146 | } 147 | -------------------------------------------------------------------------------- /Scenerio32.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "46f4d97c-7f60-4a82-a7c0-4b8923dc0f46", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "+-------+-------------------+\n|food_id| food_item|\n+-------+-------------------+\n| 1| Veg Biryani|\n| 2| Veg Fried Rice|\n| 3| Kaju Fried Rice|\n| 4| Chicken Biryani|\n| 5|Chicken Dum Biryani|\n| 6| Prawns Biryani|\n| 7| Fish Birayani|\n+-------+-------------------+\n\n+-------+------+\n|food_id|rating|\n+-------+------+\n| 1| 5|\n| 2| 3|\n| 3| 4|\n| 4| 4|\n| 5| 5|\n| 6| 4|\n| 7| 4|\n+-------+------+\n\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "data = [(1,\"Veg Biryani\"),(2,\"Veg Fried Rice\"),(3,\"Kaju Fried Rice\"),(4,\"Chicken Biryani\"),(5,\"Chicken Dum Biryani\"),(6,\"Prawns Biryani\"),(7,\"Fish Birayani\")]\n", 30 | "\n", 31 | "df1 = spark.createDataFrame(data,[\"food_id\",\"food_item\"])\n", 32 | "df1.show()\n", 33 | "\n", 34 | "ratings = [(1,5),(2,3),(3,4),(4,4),(5,5),(6,4),(7,4)]\n", 35 | "\n", 36 | "df2 = spark.createDataFrame(ratings,[\"food_id\",\"rating\"])\n", 37 | "df2.show()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 0, 43 | "metadata": { 44 | "application/vnd.databricks.v1+cell": { 45 | "cellMetadata": { 46 | "byteLimit": 2048000, 47 | "rowLimit": 10000 48 | }, 49 | "inputWidgets": {}, 50 | "nuid": "6876f425-2609-4923-9de4-090a4f0ecb09", 51 | "showTitle": false, 52 | "title": "" 53 | } 54 | }, 55 | "outputs": [ 56 | { 57 | "output_type": "stream", 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "+-------+-------------------+------+\n|food_id| food_item|rating|\n+-------+-------------------+------+\n| 1| Veg Biryani| 5|\n| 2| Veg Fried Rice| 3|\n| 3| Kaju Fried Rice| 4|\n| 4| Chicken Biryani| 4|\n| 5|Chicken 
Dum Biryani| 5|\n| 6| Prawns Biryani| 4|\n| 7| Fish Birayani| 4|\n+-------+-------------------+------+\n\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "joindf = df1.join(df2,df1[\"food_id\"]==df2[\"food_id\"],\"inner\").select(df1[\"food_id\"],\"food_item\",\"rating\")\n", 67 | "joindf.show()" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 0, 73 | "metadata": { 74 | "application/vnd.databricks.v1+cell": { 75 | "cellMetadata": { 76 | "byteLimit": 2048000, 77 | "rowLimit": 10000 78 | }, 79 | "inputWidgets": {}, 80 | "nuid": "df16c628-d638-43a0-9032-ac606c8983d7", 81 | "showTitle": false, 82 | "title": "" 83 | } 84 | }, 85 | "outputs": [ 86 | { 87 | "output_type": "stream", 88 | "name": "stdout", 89 | "output_type": "stream", 90 | "text": [ 91 | "+-------+-------------------+------+---------------+\n|food_id| food_item|rating|stats(out of 5)|\n+-------+-------------------+------+---------------+\n| 1| Veg Biryani| 5| *****|\n| 2| Veg Fried Rice| 3| ***|\n| 3| Kaju Fried Rice| 4| ****|\n| 4| Chicken Biryani| 4| ****|\n| 5|Chicken Dum Biryani| 5| *****|\n| 6| Prawns Biryani| 4| ****|\n| 7| Fish Birayani| 4| ****|\n+-------+-------------------+------+---------------+\n\n" 92 | ] 93 | } 94 | ], 95 | "source": [ 96 | "from pyspark.sql.functions import *\n", 97 | "\n", 98 | "finaldf = joindf.withColumn(\"stats(out of 5)\",expr(\"repeat('*',rating)\"))\n", 99 | "finaldf.show()" 100 | ] 101 | } 102 | ], 103 | "metadata": { 104 | "application/vnd.databricks.v1+notebook": { 105 | "dashboards": [], 106 | "language": "python", 107 | "notebookMetadata": { 108 | "pythonIndentUnit": 4 109 | }, 110 | "notebookName": "Scenerio32", 111 | "widgets": {} 112 | } 113 | }, 114 | "nbformat": 4, 115 | "nbformat_minor": 0 116 | } 117 | -------------------------------------------------------------------------------- /Scenerio33.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "7d62da1e-f835-4c4a-9737-9d372a69a19b", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "+--------------------+--------------+-----------+\n| id| name|family_size|\n+--------------------+--------------+-----------+\n|c00dac11bde74750b...| Alex Thomas| 9|\n|eb6f2d3426694667a...| Chris Gray| 2|\n|3f7b5b8e835d4e1c8...| Emily Johnson| 4|\n|9a345b079d9f4d3ca...| Michael Brown| 6|\n|e0a5f57516024de2a...|Jessica Wilson| 3|\n+--------------------+--------------+-----------+\n\n+--------------------+------------+--------+--------+\n| id| name|min_size|max_size|\n+--------------------+------------+--------+--------+\n|023fd23615bd4ff4b...| Bolivia| 2| 4|\n|be247f73de0f4b2d8...|Cook Islands| 4| 8|\n|3e85ab80a6f84ef3b...| Brazil| 4| 7|\n|e571e164152c4f7c8...| Australia| 5| 9|\n|f35a7bb7d44342f7a...| Canada| 3| 5|\n|a1b5a4b5fc5f46f89...| Japan| 10| 12|\n+--------------------+------------+--------+--------+\n\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "data = [('c00dac11bde74750b4d207b9c182a85f', 'Alex Thomas', 9),('eb6f2d3426694667ae3e79d6274114a4', 'Chris Gray', 2),('3f7b5b8e835d4e1c8b3e12e964a741f3', 'Emily Johnson', 4),('9a345b079d9f4d3cafb2d4c11d20f8ce', 'Michael Brown', 
6),('e0a5f57516024de2a231d09de2cbe9d1', 'Jessica Wilson', 3)]\n", 30 | "\n", 31 | "familydf = spark.createDataFrame(data,[\"id\",\"name\",\"family_size\"])\n", 32 | "familydf.show()\n", 33 | "\n", 34 | "countrydata = [('023fd23615bd4ff4b2ae0a13ed7efec9', 'Bolivia', 2 , 4),('be247f73de0f4b2d810367cb26941fb9', 'Cook Islands', 4,8),('3e85ab80a6f84ef3b9068b21dbcc54b3', 'Brazil', 4,7),('e571e164152c4f7c8413e2734f67b146', 'Australia', 5,9),('f35a7bb7d44342f7a8a42a53115294a8', 'Canada', 3,5),('a1b5a4b5fc5f46f891d9040566a78f27', 'Japan', 10,12)]\n", 35 | "\n", 36 | "countrydf = spark.createDataFrame(countrydata,[\"id\",\"name\",\"min_size\",\"max_size\"])\n", 37 | "countrydf.show()\n", 38 | "\n" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 0, 44 | "metadata": { 45 | "application/vnd.databricks.v1+cell": { 46 | "cellMetadata": { 47 | "byteLimit": 2048000, 48 | "rowLimit": 10000 49 | }, 50 | "inputWidgets": {}, 51 | "nuid": "b3301004-40eb-4c42-b786-eef92e7fff40", 52 | "showTitle": false, 53 | "title": "" 54 | } 55 | }, 56 | "outputs": [ 57 | { 58 | "output_type": "stream", 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "+--------------+-----------+------------+--------+--------+\n| name|family_size| name|min_size|max_size|\n+--------------+-----------+------------+--------+--------+\n| Alex Thomas| 9| Australia| 5| 9|\n| Chris Gray| 2| Bolivia| 2| 4|\n| Emily Johnson| 4| Bolivia| 2| 4|\n| Emily Johnson| 4|Cook Islands| 4| 8|\n| Emily Johnson| 4| Brazil| 4| 7|\n| Emily Johnson| 4| Canada| 3| 5|\n| Michael Brown| 6|Cook Islands| 4| 8|\n| Michael Brown| 6| Brazil| 4| 7|\n| Michael Brown| 6| Australia| 5| 9|\n|Jessica Wilson| 3| Bolivia| 2| 4|\n|Jessica Wilson| 3| Canada| 3| 5|\n+--------------+-----------+------------+--------+--------+\n\n" 63 | ] 64 | } 65 | ], 66 | "source": [ 67 | "joindf = familydf.join(countrydf, (familydf[\"family_size\"]>=countrydf[\"min_size\"]) & (familydf[\"family_size\"]<=countrydf[\"max_size\"]),\"inner\").select(familydf[\"name\"],familydf[\"family_size\"],countrydf[\"name\"],\"min_size\",\"max_size\")\n", 68 | "joindf.show()" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": 0, 74 | "metadata": { 75 | "application/vnd.databricks.v1+cell": { 76 | "cellMetadata": { 77 | "byteLimit": 2048000, 78 | "rowLimit": 10000 79 | }, 80 | "inputWidgets": {}, 81 | "nuid": "e7769658-2204-44fc-9f37-0ea2f9b40b01", 82 | "showTitle": false, 83 | "title": "" 84 | } 85 | }, 86 | "outputs": [ 87 | { 88 | "output_type": "stream", 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "+--------------+-------------------+\n| name|number_of_countries|\n+--------------+-------------------+\n| Alex Thomas| 1|\n| Chris Gray| 1|\n| Emily Johnson| 4|\n| Michael Brown| 3|\n|Jessica Wilson| 2|\n+--------------+-------------------+\n\n" 93 | ] 94 | } 95 | ], 96 | "source": [ 97 | "from pyspark.sql.functions import *\n", 98 | "\n", 99 | "groupdf = joindf.groupBy(familydf[\"name\"]).agg(count(\"*\").alias(\"number_of_countries\"))\n", 100 | "groupdf.show()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 0, 106 | "metadata": { 107 | "application/vnd.databricks.v1+cell": { 108 | "cellMetadata": { 109 | "byteLimit": 2048000, 110 | "rowLimit": 10000 111 | }, 112 | "inputWidgets": {}, 113 | "nuid": "c435acec-02ea-4fe7-8c29-5c624840243c", 114 | "showTitle": false, 115 | "title": "" 116 | } 117 | }, 118 | "outputs": [ 119 | { 120 | "output_type": "stream", 121 | "name": "stdout", 122 | 
"output_type": "stream", 123 | "text": [ 124 | "+-------------------+\n|number_of_countries|\n+-------------------+\n| 4|\n+-------------------+\n\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "finaldf = groupdf.agg(expr(\"max(number)\").alias(\"number_of_countries\"))\n", 130 | "finaldf.show()\n" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": 0, 136 | "metadata": { 137 | "application/vnd.databricks.v1+cell": { 138 | "cellMetadata": { 139 | "byteLimit": 2048000, 140 | "rowLimit": 10000 141 | }, 142 | "inputWidgets": {}, 143 | "nuid": "7d1af649-a565-4bc1-816f-3d60e846d85d", 144 | "showTitle": false, 145 | "title": "" 146 | } 147 | }, 148 | "outputs": [ 149 | { 150 | "output_type": "stream", 151 | "name": "stdout", 152 | "output_type": "stream", 153 | "text": [ 154 | "+--------------+-------------------+----+\n| name|number_of_countries|rank|\n+--------------+-------------------+----+\n| Emily Johnson| 4| 1|\n| Michael Brown| 3| 2|\n|Jessica Wilson| 2| 3|\n| Alex Thomas| 1| 4|\n| Chris Gray| 1| 5|\n+--------------+-------------------+----+\n\n+-------------+-------------------+\n| name|number_of_countries|\n+-------------+-------------------+\n|Emily Johnson| 4|\n+-------------+-------------------+\n\n" 155 | ] 156 | } 157 | ], 158 | "source": [ 159 | "from pyspark.sql.functions import *\n", 160 | "from pyspark.sql import *\n", 161 | "from pyspark.sql.types import *\n", 162 | "\n", 163 | "#another way \n", 164 | "wn = Window.orderBy(desc(\"number_of_countries\"))\n", 165 | "\n", 166 | "rankdf = groupdf.withColumn(\"rank\",row_number().over(wn))\n", 167 | "rankdf.show()\n", 168 | "\n", 169 | "finaldf2 = rankdf.filter(col(\"rank\")==1).drop(\"rank\")\n", 170 | "finaldf2.show()" 171 | ] 172 | } 173 | ], 174 | "metadata": { 175 | "application/vnd.databricks.v1+notebook": { 176 | "dashboards": [], 177 | "language": "python", 178 | "notebookMetadata": { 179 | "mostRecentlyExecutedCommandWithImplicitDF": { 180 | "commandId": 1190225536909284, 181 | "dataframes": [ 182 | "_sqldf" 183 | ] 184 | }, 185 | "pythonIndentUnit": 4 186 | }, 187 | "notebookName": "Scenerio33", 188 | "widgets": {} 189 | } 190 | }, 191 | "nbformat": 4, 192 | "nbformat_minor": 0 193 | } 194 | -------------------------------------------------------------------------------- /Scenerio34.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "c5c448dc-6b9b-4fd7-84c0-cc0ff8db79be", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "+-----------+------+---+------+\n|customer_id| name|age|gender|\n+-----------+------+---+------+\n| 1| Alice| 25| F|\n| 2| Bob| 40| M|\n| 3| Raj| 46| M|\n| 4| Sekar| 66| M|\n| 5| Jhon| 47| M|\n| 6|Timoty| 28| M|\n| 7| Brad| 90| M|\n| 8| Rita| 34| F|\n+-----------+------+---+------+\n\n+-----------+------+---+------+---------+\n|customer_id| name|age|gender|age_group|\n+-----------+------+---+------+---------+\n| 1| Alice| 25| F| 19-35|\n| 2| Bob| 40| M| 36-50|\n| 3| Raj| 46| M| 36-50|\n| 4| Sekar| 66| M| 51+|\n| 5| Jhon| 47| M| 36-50|\n| 6|Timoty| 28| M| 19-35|\n| 7| Brad| 90| M| 51+|\n| 8| Rita| 34| F| 
19-35|\n+-----------+------+---+------+---------+\n\n+---------+-----+\n|age_group|count|\n+---------+-----+\n| 19-35| 3|\n| 36-50| 3|\n| 51+| 2|\n+---------+-----+\n\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "from pyspark.sql.types import *\n", 30 | "from pyspark.sql import *\n", 31 | "from pyspark.sql.functions import *\n", 32 | "\n", 33 | "data = [(1,'Alice',25,'F'),(2,'Bob',40,'M'),(3,'Raj',46,'M'),(4,'Sekar',66,'M'),(5,'Jhon',47,'M'),(6,'Timoty',28,'M'),(7,'Brad',90,'M'),(8,'Rita',34,'F')]\n", 34 | "\n", 35 | "df = spark.createDataFrame(data,['customer_id','name','age','gender'])\n", 36 | "df.show()\n", 37 | "\n", 38 | "#groupdf = df.withColumn(\"age_group\",expr(\"case when age between 19 and 35 then '19-35' case when age between 36 and 50 then '36-50' case when age > 51 then '51+' else age end\"))\n", 39 | "groupdf = df.withColumn(\n", 40 | " \"age_group\",\n", 41 | " expr(\n", 42 | " \"case when age between 19 and 35 then '19-35' \" +\n", 43 | " \"when age between 36 and 50 then '36-50' \" +\n", 44 | " \"when age > 51 then '51+' \" +\n", 45 | " \"else 'Other' end\"\n", 46 | " )\n", 47 | ")\n", 48 | "groupdf.show()\n", 49 | "\n", 50 | "finaldf = groupdf.groupBy('age_group').agg(count('*').alias('count'))\n", 51 | "finaldf.show()" 52 | ] 53 | } 54 | ], 55 | "metadata": { 56 | "application/vnd.databricks.v1+notebook": { 57 | "dashboards": [], 58 | "environmentMetadata": null, 59 | "language": "python", 60 | "notebookMetadata": { 61 | "pythonIndentUnit": 4 62 | }, 63 | "notebookName": "Scenerio34", 64 | "widgets": {} 65 | } 66 | }, 67 | "nbformat": 4, 68 | "nbformat_minor": 0 69 | } 70 | -------------------------------------------------------------------------------- /Scenerio35.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "3f914fd1-1329-49c0-a8ce-60e2aa6ed910", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "+---+------+----+\n| id| name| age|\n+---+------+----+\n| 1| Jhon| 17|\n| 2| Maria| 20|\n| 3| Raj|NULL|\n| 4|Rachel| 18|\n+---+------+----+\n\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "from pyspark.sql import *\n", 30 | "from pyspark.sql.types import *\n", 31 | "from pyspark.sql.functions import *\n", 32 | "\n", 33 | "#creating the dataframe df1\n", 34 | "data1 = [(1,'Jhon',17),(2,'Maria',20),(3,'Raj',None),(4,'Rachel',18)]\n", 35 | "columns = [\"id\",\"name\",\"age\"]\n", 36 | "df1 = spark.createDataFrame(data1,columns)\n", 37 | "df1.show()" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 0, 43 | "metadata": { 44 | "application/vnd.databricks.v1+cell": { 45 | "cellMetadata": { 46 | "byteLimit": 2048000, 47 | "rowLimit": 10000 48 | }, 49 | "inputWidgets": {}, 50 | "nuid": "3cbc97a7-fda7-42fc-994e-75bef590271e", 51 | "showTitle": false, 52 | "title": "" 53 | } 54 | }, 55 | "outputs": [ 56 | { 57 | "output_type": "stream", 58 | "name": "stdout", 59 | "output_type": "stream", 60 | "text": [ 61 | "+---+----+---+\n| id|name|age|\n+---+----+---+\n| 0| 0| 1|\n+---+----+---+\n\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "# Count null entries in each column\n", 67 | "null_counts = 
df1.select([sum(col(c).isNull().cast(\"int\")).alias(c) for c in df1.columns])\n", 68 | "\n", 69 | "null_counts.show()" 70 | ] 71 | }, 72 | { 73 | "cell_type": "code", 74 | "execution_count": 0, 75 | "metadata": { 76 | "application/vnd.databricks.v1+cell": { 77 | "cellMetadata": { 78 | "byteLimit": 2048000, 79 | "rowLimit": 10000 80 | }, 81 | "inputWidgets": {}, 82 | "nuid": "9a1bfc1d-07f0-4f0a-9cb8-98943b762e3c", 83 | "showTitle": false, 84 | "title": "" 85 | } 86 | }, 87 | "outputs": [ 88 | { 89 | "output_type": "stream", 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "+---+----+----+\n| id|name| age|\n+---+----+----+\n| 3| Raj|NULL|\n+---+----+----+\n\n" 94 | ] 95 | } 96 | ], 97 | "source": [ 98 | "#Remove the row with null entires and store them in a new dataframe named df2\n", 99 | "df2 = df1.filter(col(\"age\").isNull())\n", 100 | "df2.show()" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 0, 106 | "metadata": { 107 | "application/vnd.databricks.v1+cell": { 108 | "cellMetadata": { 109 | "byteLimit": 2048000, 110 | "rowLimit": 10000 111 | }, 112 | "inputWidgets": {}, 113 | "nuid": "c138332c-c270-42ca-81b4-27cfee8f314e", 114 | "showTitle": false, 115 | "title": "" 116 | } 117 | }, 118 | "outputs": [ 119 | { 120 | "output_type": "stream", 121 | "name": "stdout", 122 | "output_type": "stream", 123 | "text": [ 124 | "+---+--------+----+\n| id| city|code|\n+---+--------+----+\n| 1| seatle| 82|\n| 2| london| 75|\n| 3|banglore| 60|\n| 4| boston| 90|\n+---+--------+----+\n\n+---+------+----+--------+----+\n| id| name| age| city|code|\n+---+------+----+--------+----+\n| 1| Jhon| 17| seatle| 82|\n| 2| Maria| 20| london| 75|\n| 3| Raj|NULL|banglore| 60|\n| 4|Rachel| 18| boston| 90|\n+---+------+----+--------+----+\n\n" 125 | ] 126 | } 127 | ], 128 | "source": [ 129 | "#create a new dataframe df3\n", 130 | "data2 = [(1,'seatle',82),(2,'london',75),(3,'banglore',60),(4,'boston',90)]\n", 131 | "columns2 = [\"id\",\"city\",\"code\"]\n", 132 | "\n", 133 | "df3 = spark.createDataFrame(data2,columns2)\n", 134 | "df3.show()\n", 135 | "\n", 136 | "mergedf = df1.join(df3, df1[\"id\"]==df3[\"id\"],\"inner\").select(df1[\"id\"],\"name\",\"age\",\"city\",\"code\")\n", 137 | "mergedf.show()" 138 | ] 139 | }, 140 | { 141 | "cell_type": "code", 142 | "execution_count": 0, 143 | "metadata": { 144 | "application/vnd.databricks.v1+cell": { 145 | "cellMetadata": { 146 | "byteLimit": 2048000, 147 | "rowLimit": 10000 148 | }, 149 | "inputWidgets": {}, 150 | "nuid": "47c7ba70-7fef-4e00-b451-ac7809ca909f", 151 | "showTitle": false, 152 | "title": "" 153 | } 154 | }, 155 | "outputs": [ 156 | { 157 | "output_type": "stream", 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "18.0\n+---+------+---+--------+----+\n| id| name|age| city|code|\n+---+------+---+--------+----+\n| 1| Jhon| 17| seatle| 82|\n| 2| Maria| 20| london| 75|\n| 3| Raj| 18|banglore| 60|\n| 4|Rachel| 18| boston| 90|\n+---+------+---+--------+----+\n\n" 162 | ] 163 | } 164 | ], 165 | "source": [ 166 | "#fill the null value with the mean age of students\n", 167 | "#calculate the mean age\n", 168 | "meanage = mergedf.select(round(mean(\"age\"))).collect()[0][0]\n", 169 | "print(meanage)\n", 170 | "\n", 171 | "filldf = mergedf.na.fill({\"age\":meanage})\n", 172 | "filldf.show()" 173 | ] 174 | }, 175 | { 176 | "cell_type": "code", 177 | "execution_count": 0, 178 | "metadata": { 179 | "application/vnd.databricks.v1+cell": { 180 | "cellMetadata": { 181 | "byteLimit": 2048000, 182 
| "rowLimit": 10000 183 | }, 184 | "inputWidgets": {}, 185 | "nuid": "3add2a14-6501-4b80-8520-8f5310f0c45b", 186 | "showTitle": false, 187 | "title": "" 188 | } 189 | }, 190 | "outputs": [ 191 | { 192 | "output_type": "stream", 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "+---+------+---+--------+----+\n| id| name|age| city|code|\n+---+------+---+--------+----+\n| 2| Maria| 20| london| 75|\n| 3| Raj| 18|banglore| 60|\n| 4|Rachel| 18| boston| 90|\n+---+------+---+--------+----+\n\n" 197 | ] 198 | } 199 | ], 200 | "source": [ 201 | "#Get the students who are 18 years or older\n", 202 | "filterdf = filldf.filter(col(\"age\")>= 18)\n", 203 | "filterdf.show()" 204 | ] 205 | } 206 | ], 207 | "metadata": { 208 | "application/vnd.databricks.v1+notebook": { 209 | "dashboards": [], 210 | "environmentMetadata": null, 211 | "language": "python", 212 | "notebookMetadata": { 213 | "pythonIndentUnit": 4 214 | }, 215 | "notebookName": "Scenerio35", 216 | "widgets": {} 217 | } 218 | }, 219 | "nbformat": 4, 220 | "nbformat_minor": 0 221 | } 222 | -------------------------------------------------------------------------------- /Scenerio36.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "ca93cda6-3519-4de0-9539-49871d155641", 14 | "showTitle": false, 15 | "tableResultSettingsMap": {}, 16 | "title": "" 17 | } 18 | }, 19 | "outputs": [ 20 | { 21 | "output_type": "stream", 22 | "name": "stdout", 23 | "output_type": "stream", 24 | "text": [ 25 | "+----------+----------+\n| sell_date| product|\n+----------+----------+\n|2020-05-30| Headphone|\n|2020-06-01| Pencil|\n|2020-06-02| Mask|\n|2020-05-30|Basketball|\n|2020-06-01| Book|\n|2020-06-02| Mask|\n|2020-05-30| T-Shirt|\n+----------+----------+\n\n" 26 | ] 27 | } 28 | ], 29 | "source": [ 30 | "from pyspark.sql import *\n", 31 | "from pyspark.sql.types import *\n", 32 | "from pyspark.sql.functions import *\n", 33 | "\n", 34 | "data = [('2020-05-30','Headphone'),('2020-06-01','Pencil'),('2020-06-02','Mask'),('2020-05-30','Basketball'),('2020-06-01','Book'),('2020-06-02','Mask'),('2020-05-30','T-Shirt')]\n", 35 | "columns = [\"sell_date\",'product']\n", 36 | "\n", 37 | "df = spark.createDataFrame(data,schema=columns)\n", 38 | "df.show()" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 0, 44 | "metadata": { 45 | "application/vnd.databricks.v1+cell": { 46 | "cellMetadata": { 47 | "byteLimit": 2048000, 48 | "rowLimit": 10000 49 | }, 50 | "inputWidgets": {}, 51 | "nuid": "f53d6dfa-42b0-40bb-b525-340738e326dc", 52 | "showTitle": false, 53 | "tableResultSettingsMap": {}, 54 | "title": "" 55 | } 56 | }, 57 | "outputs": [ 58 | { 59 | "output_type": "stream", 60 | "name": "stdout", 61 | "output_type": "stream", 62 | "text": [ 63 | "+----------+--------------------+---------+\n| sell_date| products|null_sell|\n+----------+--------------------+---------+\n|2020-05-30|[T-Shirt, Basketb...| 3|\n|2020-06-01| [Pencil, Book]| 2|\n|2020-06-02| [Mask]| 1|\n+----------+--------------------+---------+\n\n" 64 | ] 65 | } 66 | ], 67 | "source": [ 68 | "transfdf = df.groupBy(\"sell_date\").agg(collect_set(\"product\").alias(\"products\"),size(collect_set(\"product\")).alias(\"null_sell\"))\n", 69 | "transfdf.show()" 70 | ] 71 | } 72 | ], 73 | 
"metadata": { 74 | "application/vnd.databricks.v1+notebook": { 75 | "computePreferences": null, 76 | "dashboards": [], 77 | "environmentMetadata": { 78 | "base_environment": "", 79 | "client": "1" 80 | }, 81 | "language": "python", 82 | "notebookMetadata": { 83 | "mostRecentlyExecutedCommandWithImplicitDF": { 84 | "commandId": 1835178097274309, 85 | "dataframes": [ 86 | "_sqldf" 87 | ] 88 | }, 89 | "pythonIndentUnit": 4 90 | }, 91 | "notebookName": "Untitled Notebook 2025-01-09 10:25:48", 92 | "widgets": {} 93 | } 94 | }, 95 | "nbformat": 4, 96 | "nbformat_minor": 0 97 | } 98 | -------------------------------------------------------------------------------- /Scenerio4.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-4") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data = [(1, "Mark Ray", "AB"), 12 | (2, "Peter Smith", "CD"), 13 | (1, "Mark Ray", "EF"), 14 | (2, "Peter Smith", "GH"), 15 | (2, "Peter Smith", "CD"), 16 | (3, "Kate", "IJ")] 17 | myschema = ["custid", "custname", "address"] 18 | df = spark.createDataFrame(data, schema=myschema) 19 | df.show() 20 | 21 | # Through SQL 22 | df.createOrReplaceTempView("custtab") 23 | 24 | spark.sql( 25 | "select custid,custname,collect_set(address) as address from custtab group by custid,custname order by custid").show() 26 | 27 | # Through DSL 28 | finaldf = df.groupBy("custid", "custname").agg(collect_set("address").alias("address")).orderBy("custid").show() 29 | -------------------------------------------------------------------------------- /Scenerio5.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-5") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data1 = [ 12 | (1, "abc", 31, "abc@gmail.com"), 13 | (2, "def", 23, "defyahoo.com"), 14 | (3, "xyz", 26, "xyz@gmail.com"), 15 | (4, "qwe", 34, "qwegmail.com"), 16 | (5, "iop", 24, "iop@gmail.com") 17 | ] 18 | myschema1 = ["id", "name", "age", "email"] 19 | df1 = spark.createDataFrame(data1, schema=myschema1) 20 | df1.show() 21 | 22 | data2 = [ 23 | (11, "jkl", 22, "abc@gmail.com", 1000), 24 | (12, "vbn", 33, "vbn@yahoo.com", 3000), 25 | (13, "wer", 27, "wer", 2000), 26 | (14, "zxc", 30, "zxc.com", 2000), 27 | (15, "lkj", 29, "lkj@outlook.com", 2000) 28 | ] 29 | myschema2 = ["id", "name", "age", "email", "salary"] 30 | df2 = spark.createDataFrame(data2, schema=myschema2) 31 | df2.show() 32 | 33 | # number of partiion in df 34 | partcount = df1.rdd.getNumPartitions() 35 | print("Number of partition:- " + str(partcount)) 36 | 37 | df3 = df1.withColumn("salary", lit(1000)) 38 | df3.show() 39 | 40 | # append df2 and df3, and form df4 41 | df4 = df2.union(df3).orderBy(col("id")) 42 | df4.show() 43 | 44 | # Remove records which have invalid email from df4, emails with @ are considered to be valid. 
45 | rmdf = df4.filter(col("email").rlike("@")) 46 | rmdf.show() 47 | 48 | #Write df4 to a target location, by partitioning on salary. 49 | rmdf.write.format("parquet").partitionBy("salary").save("D:/BigData/Processed Datasets/interdata") 50 | 51 | -------------------------------------------------------------------------------- /Scenerio6.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-3") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data = [ 12 | ("1", "a", "10000"), 13 | ("2", "b", "5000"), 14 | ("3", "c", "15000"), 15 | ("4", "d", "25000"), 16 | ("5", "e", "50000"), 17 | ("6", "f", "7000") 18 | ] 19 | myschema = ["empid","name","salary"] 20 | df = spark.createDataFrame(data,schema=myschema) 21 | df.show() 22 | 23 | #Through SQL 24 | df.createOrReplaceTempView("emptab") 25 | spark.sql("select *, case when salary > 10000 then 'Manager' else 'Employee' end as Designation from emptab").show() 26 | 27 | #Through DSL 28 | finaldf = df.withColumn("Desgination", expr("case when salary > 10000 then 'Manager' else 'Employee' end")) 29 | finaldf.show() -------------------------------------------------------------------------------- /Scenerio7.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-7") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data = [ 12 | (1, 100, 2010, 25, 5000), 13 | (2, 100, 2011, 16, 5000), 14 | (3, 100, 2012, 8, 5000), 15 | (4, 200, 2010, 10, 9000), 16 | (5, 200, 2011, 15, 9000), 17 | (6, 200, 2012, 20, 7000), 18 | (7, 300, 2010, 20, 7000), 19 | (8, 300, 2011, 18, 7000), 20 | (9, 300, 2012, 20, 7000) 21 | ] 22 | myschema = ["sale_id", "product_id", "year", "quantity", "price"] 23 | df = spark.createDataFrame(data, schema=myschema) 24 | df.show() 25 | 26 | #Through SQL 27 | df.createOrReplaceTempView("salestab") 28 | spark.sql("SELECT *FROM (SELECT *, DENSE_RANK() OVER (PARTITION BY year ORDER BY quantity DESC) AS rank FROM salestab) AS rankdf WHERE rank = 1 ORDER BY sale_id").show() 29 | 30 | #Through DSL 31 | win = Window.partitionBy("year").orderBy(col("quantity").desc()) 32 | 33 | rankdf = df.withColumn("rank", dense_rank().over(win)) 34 | rankdf.show() 35 | 36 | finaldf = rankdf.filter(col("rank") == 1).drop("rank").orderBy("sale_id").show() 37 | -------------------------------------------------------------------------------- /Scenerio8.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-8") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data = [ 12 | ("India",), 13 | 
("Pakistan",), 14 | ("SriLanka",) 15 | ] 16 | myschema = ["teams"] 17 | df = spark.createDataFrame(data, schema=myschema) 18 | df.show() 19 | 20 | # Through SQL 21 | df.createOrReplaceTempView("crickettab") 22 | 23 | # self join query for reference - select a.teams,b.teams from crickettab a inner join crickettab b on a.teams < b.teams 24 | 25 | spark.sql( 26 | "select concat(a.teams, ' Vs ', b.teams) as matches from crickettab a inner join crickettab b on a.teams < b.teams").show() 27 | 28 | # Through DSL 29 | 30 | joindf = df.alias("a").join(df.alias("b"), col("a.teams") < col("b.teams"), "inner") 31 | joindf.show() 32 | 33 | finaldf = joindf.withColumn("matches", expr("concat(a.teams,' Vs ',b.teams)")).drop("teams", "teams").show() 34 | -------------------------------------------------------------------------------- /Scenerio9.py: -------------------------------------------------------------------------------- 1 | from pyspark import SparkConf, SparkContext 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.sql.window import * 6 | 7 | conf = SparkConf().setMaster("local[*]").setAppName("Scenerio-9") 8 | sc = SparkContext(conf=conf) 9 | sc.setLogLevel("ERROR") 10 | spark = SparkSession.builder.getOrCreate() 11 | data = [ 12 | ("a", [1, 1, 1, 3]), 13 | ("b", [1, 2, 3, 4]), 14 | ("c", [1, 1, 1, 1, 4]), 15 | ("d", [3]) 16 | ] 17 | df = spark.createDataFrame(data, ["name", "rank"]) 18 | df.show() 19 | 20 | explodedf = df.withColumn("rank", explode(col("rank"))) 21 | explodedf.show() 22 | 23 | filtdf = explodedf.filter(col("rank") == 1) 24 | filtdf.show() 25 | 26 | countdf = filtdf.groupBy("name").agg(count("*").alias("count")) 27 | countdf.show() 28 | 29 | finaldf = countdf.select(col("name")).first()[0] 30 | print(finaldf) 31 | -------------------------------------------------------------------------------- /pom.xml: -------------------------------------------------------------------------------- 1 | 2 | 4.0.0 3 | InterviewScenerios 4 | InterviewScenerios 5 | 0.0.1-SNAPSHOT 6 | 7 | src 8 | 9 | 10 | maven-compiler-plugin 11 | 3.6.1 12 | 13 | 1.8 14 | 1.8 15 | 16 | 17 | 18 | 19 | 20 | 21 | 22 | org.apache.spark 23 | spark-sql_2.11 24 | 2.4.7 25 | provided 26 | 27 | 28 | 29 | org.apache.spark 30 | spark-core_2.11 31 | 2.4.7 32 | 33 | 34 | 35 | -------------------------------------------------------------------------------- /src/pack/Scenerio1.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | object Scenerio1 { 8 | def main(args: Array[String]): Unit = { 9 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio1") 10 | val sc = new SparkContext(conf) 11 | sc.setLogLevel("ERROR") 12 | val spark = SparkSession.builder().getOrCreate() 13 | import spark.implicits._ 14 | val df = Seq( 15 | ("001", "Monika", "Arora", 100000, "2014-02-20 09:00:00", "HR"), 16 | ("002", "Niharika", "Verma", 300000, "2014-06-11 09:00:00", "Admin"), 17 | ("003", "Vishal", "Singhal", 300000, "2014-02-20 09:00:00", "HR"), 18 | ("004", "Amitabh", "Singh", 500000, "2014-02-20 09:00:00", "Admin"), 19 | ("005", "Vivek", "Bhati", 500000, "2014-06-11 09:00:00", "Admin")) 20 | .toDF("workerid", "firstname", "lastname", "salary", "joiningdate", "depart") 21 | 22 | df.show() 
23 | //Through SQL Query 24 | df.createOrReplaceTempView("worktab") 25 | 26 | spark.sql("select a.workerid,a.firstname,a.lastname,a.salary,a.joiningdate,a.depart from worktab a, worktab b where a.salary=b.salary and a.workerid !=b.workerid").show() 27 | //Through Spark DSL 28 | val finaldf = df.as("a").join(df.as("b"), $"a.salary" === $"b.salary" && $"a.workerid" =!= $"b.workerid").select($"a.workerid", $"a.firstname", $"a.lastname", $"a.salary", $"a.joiningdate", $"a.depart").show() 29 | 30 | //Another way: the same self-join expressed with col() instead of the $ column syntax 31 | val finaldf2 = df.as("a").join(df.as("b")).where(col("a.salary")===col("b.salary") && col("a.workerid") =!= col("b.workerid")).select($"a.workerid",$"a.firstname",$"a.lastname",$"a.salary",$"a.joiningdate",$"a.depart").show() 32 | } 33 | } 34 | -------------------------------------------------------------------------------- /src/pack/Scenerio10.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio10 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio10") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | (1, 300, "31-Jan-2021"), 18 | (1, 400, "28-Feb-2021"), 19 | (1, 200, "31-Mar-2021"), 20 | (2, 1000, "31-Oct-2021"), 21 | (2, 900, "31-Dec-2021")) 22 | .toDF("empid", "commissionamt", "monthlastdate") 23 | 24 | df.show() 25 | 26 | val maxdatedf = df.groupBy(col("empid").as("empid1")).agg(max("monthlastdate").as("maxdate")) 27 | maxdatedf.show() 28 | 29 | val joindf = df.join(maxdatedf, df("empid") === maxdatedf("empid1") && df("monthlastdate") === maxdatedf("maxdate"), "inner").drop("empid1", "maxdate").show() 30 | 31 | } 32 | } -------------------------------------------------------------------------------- /src/pack/Scenerio11.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio11 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio11") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | val df = Seq( 16 | (1, "Jhon", 4000), 17 | (2, "Tim David", 12000), 18 | (3, "Json Bhrendroff", 7000), 19 | (4, "Jordon", 8000), 20 | (5, "Green", 14000), 21 | (6, "Brewis", 6000)).toDF("emp_id", "emp_name", "salary") 22 | df.show() 23 | 24 | //Through SQL 25 | df.createOrReplaceTempView("emptab") 26 | spark.sql("select *,case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end as grade from emptab ").show() 27 | 28 | //Through DSL 29 | val finaldf = df.withColumn("grade", expr("case when salary<5000 then 'C' when salary between 5000 and 10000 then 'B' else 'A' end")).show() 30 | } 31 | } -------------------------------------------------------------------------------- /src/pack/Scenerio12.scala:
-------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio12 { 9 | 10 | //creating UDF functions for masked data, here email(0) is it will take first letter i.e 0th index and email.substring(8) is it will take the string from 8th index position to end of the string 11 | def maskEmail(email: String): String = { 12 | email(0) + "**********" + email.substring(8) 13 | } 14 | 15 | //creating UDF functions for masked data, here mobile.substring(0, 2) is it will take string from Index 0 to 2 letters and mobile.substring(mobile.length - 3)calculates the starting index for the substring. It subtracts 3 from the length of the mobile string to determine the appropriate index to start the substring. 16 | 17 | def maskMobile(mobile: String): String = { 18 | mobile.substring(0, 2) + "*****" + mobile.substring(mobile.length - 3) 19 | } 20 | 21 | def main(args: Array[String]): Unit = { 22 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio9") 23 | val sc = new SparkContext(conf) 24 | sc.setLogLevel("ERROR") 25 | val spark = SparkSession.builder().getOrCreate() 26 | import spark.implicits._ 27 | 28 | val maskEmailUDF = udf[String, String](maskEmail) 29 | val maskMobileUDF = udf[String, String](maskMobile) 30 | 31 | val df = Seq(("Renuka1992@gmail.com", "9856765434"), ("anbu.arasu@gmail.com", "9844567788")).toDF("email", "mobile") 32 | df.show() 33 | 34 | val maskedDF = df.withColumn("email", maskEmailUDF(col("email"))) 35 | .withColumn("mobile", maskMobileUDF(col("mobile"))) 36 | maskedDF.show() 37 | } 38 | } -------------------------------------------------------------------------------- /src/pack/Scenerio13.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio13 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio13") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | (1, "Jhon", "Development"), 18 | (2, "Tim", "Development"), 19 | (3, "David", "Testing"), 20 | (4, "Sam", "Testing"), 21 | (5, "Green", "Testing"), 22 | (6, "Miller", "Production"), 23 | (7, "Brevis", "Production"), 24 | (8, "Warner", "Production"), 25 | (9, "Salt", "Production")).toDF("emp_id", "emp_name", "dept") 26 | df.show() 27 | 28 | //Through SQL 29 | df.createOrReplaceTempView("emptab") 30 | spark.sql("SELECT dept, COUNT(*) AS total FROM emptab GROUP BY dept").show() 31 | 32 | //Through DSL 33 | val finaldf = df.groupBy(col("dept")).agg(count("*").as("total")).show() 34 | } 35 | } -------------------------------------------------------------------------------- /src/pack/Scenerio14.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import 
org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio14 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio14") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq((203040, "rajesh", 10, 20, 30, 40, 50)).toDF("rollno", "name", "telugu", "english", "maths", "science", "social") 17 | df.show() 18 | 19 | //Through SQL 20 | df.createOrReplaceTempView("marks") 21 | spark.sql("select *, (telugu+english+maths+science+social) as total from marks").show() 22 | 23 | //Through DSL 24 | val finaldf = df.withColumn("total", expr("telugu+english+maths+science+social")).show() 25 | } 26 | } -------------------------------------------------------------------------------- /src/pack/Scenerio15.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | 3 | object Scenerio15 { 4 | def main(args: Array[String]): Unit = { 5 | val l1 = List(2, 3, 4, 5) 6 | val l2 = List(6, 7, 8, 9) 7 | //append 8 | val appendlst = l1 ::: l2 9 | println(appendlst) 10 | 11 | //extending list 12 | val extendlst = l1 ++ l2 13 | println(extendlst) 14 | } 15 | } -------------------------------------------------------------------------------- /src/pack/Scenerio16.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio16 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio16") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | val df = Seq( 16 | (1, "Jhon", "Testing", 5000), 17 | (2, "Tim", "Development", 6000), 18 | (3, "Jhon", "Development", 5000), 19 | (4, "Sky", "Prodcution", 8000)).toDF("id", "name", "dept", "salary") 20 | df.show() 21 | 22 | val finaldf = df.dropDuplicates("name").orderBy("id") 23 | finaldf.show() 24 | } 25 | } -------------------------------------------------------------------------------- /src/pack/Scenerio17.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio17 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio17") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df1 = Seq( 17 | (1, "Tim", 24, "Kerala", "India"), 18 | (2, "Asman", 26, "Kerala", "India")).toDF("emp_id", "name", "age", "state", "country") 19 | df1.show() 20 | 21 | val df2 = Seq( 22 | (1, "Tim", 24, "Comcity"), 23 | (2, "Asman", 26, "bimcity")).toDF("emp_id", "name", "age", "address") 24 | df2.show() 25 | 26 | val findf = df1.join(df2, Seq("emp_id", "name", "age"), 
"outer") 27 | findf.show() 28 | } 29 | } -------------------------------------------------------------------------------- /src/pack/Scenerio18.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio18 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio18") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val inputdf = Seq("The Social Dilemma").toDF("word") 17 | inputdf.show() 18 | val reverseudf = udf((sentence: String) => sentence.split(" ").map(_.reverse).mkString(" ")) 19 | val outputdf = inputdf.withColumn("reverse word", reverseudf($"word")).drop("word") 20 | outputdf.show() 21 | } 22 | } -------------------------------------------------------------------------------- /src/pack/Scenerio19.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio19 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio19") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = spark.read.format("json").option("multiline", "true").load("dbfs:/FileStore/scen.json") 17 | df.printSchema() 18 | val finaldf = df.withColumn("multiMedia", explode(col("multiMedia"))).withColumn("dislikes", expr("likeDislike.dislikes")).withColumn("likes", expr("likeDislike.likes")).withColumn("userAction", expr("likeDislike.userAction")).withColumn("createAt", expr("multiMedia.createAt")).withColumn("description", expr("multiMedia.description")).withColumn("id", expr("multiMedia.id")).withColumn("likeCount", expr("multiMedia.likeCount")).withColumn("mediatype", expr("multiMedia.mediatype")).withColumn("name", expr("multiMedia.name")).withColumn("place", expr("multiMedia.place")).withColumn("url", expr("multiMedia.url")).drop("likeDislike", "multiMedia") 19 | println("flat Schema") 20 | finaldf.printSchema() 21 | finaldf.show() 22 | } 23 | } -------------------------------------------------------------------------------- /src/pack/Scenerio2.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio2 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-2") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | (1, "1-Jan", "Ordered"), 18 | (1, "2-Jan", "dispatched"), 19 | (1, "3-Jan", 
"dispatched"), 20 | (1, "4-Jan", "Shipped"), 21 | (1, "5-Jan", "Shipped"), 22 | (1, "6-Jan", "Delivered"), 23 | (2, "1-Jan", "Ordered"), 24 | (2, "2-Jan", "dispatched"), 25 | (2, "3-Jan", "shipped")).toDF("orderid", "statusdate", "status") 26 | 27 | df.show() 28 | 29 | //Through SQL 30 | df.createOrReplaceTempView("ordertab") 31 | spark.sql("select * from ordertab where status = 'dispatched' and orderid in(select orderid from ordertab where status = 'Ordered')").show() 32 | 33 | //Through DSL 34 | val result = df.filter( 35 | col("status") === "dispatched" && 36 | col("orderid").isin( 37 | df.filter(col("status") === "Ordered").select("orderid").map(_.getInt(0)).collect(): _*)) 38 | result.show() 39 | 40 | } 41 | } 42 | -------------------------------------------------------------------------------- /src/pack/Scenerio20.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio20 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio20") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = spark.read.format("json").option("multiline", "true").load("dbfs:/FileStore/flatjson/part-00000-tid-3675309499584050336-b8650962-dec3-4fe4-a204-c914090f019e-21-1-c000.json") 17 | df.printSchema() 18 | val compdf = df.select( 19 | col("code"), 20 | col("commentCount"), 21 | col("createdAt"), 22 | col("description"), 23 | col("feedsComment"), 24 | col("id"), 25 | col("imagePaths"), 26 | col("images"), 27 | col("isdeleted"), 28 | col("lat"), 29 | struct(col("dislikes"), col("likes"), col("userAction")).as("likeDislike"), 30 | col("lng"), 31 | col("location"), 32 | col("mediatype"), 33 | col("msg"), 34 | array( 35 | struct( 36 | col("createAt"), 37 | col("description"), 38 | col("id"), col("likeCount"), 39 | col("mediatype"), 40 | col("name"), 41 | col("place"), 42 | col("url")).as("element")).as("multiMedia"), 43 | col("name"), 44 | col("profilePicture"), 45 | col("title"), 46 | col("userId"), 47 | col("videoUrl"), 48 | col("totalFeed")) 49 | compdf.printSchema() 50 | } 51 | } -------------------------------------------------------------------------------- /src/pack/Scenerio21.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio21 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio21") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | ("SEA", "SF", 300), 18 | ("CHI", "SEA", 2000), 19 | ("SF", "SEA", 300), 20 | ("SEA", "CHI", 2000), 21 | ("SEA", "LND", 500), 22 | ("LND", "SEA", 500), 23 | ("LND", "CHI", 1000), 24 | ("CHI", "NDL", 180)).toDF("from", "to", "dist") 25 | df.show() 26 | //Through SQL 27 | 
df.createOrReplaceTempView("trip") 28 | spark.sql("""SELECT r1.from, r1.to, (r1.dist + r2.dist) AS roundtrip_dist 29 | FROM trip r1 30 | JOIN trip r2 ON r1.from = r2.to AND r1.to = r2.from 31 | WHERE r1.from < r1.to 32 | """).show() 33 | 34 | //Through DSL 35 | val finaldf = df.as("r1").join( 36 | df.as("r2"), 37 | (col("r1.from") === col("r2.to")) && (col("r1.to") === col("r2.from"))).where( 38 | col("r1.from") < col("r1.to")).select(col("r1.from"), col("r1.to"), 39 | (col("r1.dist") + col("r2.dist")).alias("roundtrip_dist")) 40 | finaldf.show() 41 | } 42 | } 43 | -------------------------------------------------------------------------------- /src/pack/Scenerio22.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio22 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio22") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | val df = Seq( 16 | (1, "26-May", 100), 17 | (1, "27-May", 200), 18 | (1, "28-May", 300), 19 | (2, "29-May", 400), 20 | (3, "30-May", 500), 21 | (3, "31-May", 600)).toDF("pid", "date", "price") 22 | df.show() 23 | 24 | //Through SQL 25 | df.createOrReplaceTempView("ordertab") 26 | spark.sql("select pid,date,price, sum(price) over(partition by(pid) order by(price)) as new_price from ordertab").show() 27 | 28 | //Through DSL 29 | val wn = Window.partitionBy("pid").orderBy("price") 30 | val finaldf = df.withColumn("new_price", sum("price") over (wn)).show() 31 | } 32 | } -------------------------------------------------------------------------------- /src/pack/Scenerio23.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio23 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio23") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq((1, 5), (2, 6), (3, 5), (3, 6), (1, 6)).toDF("customer_id", "product_key") 17 | df.show() 18 | val df2 = Seq((5), (6)).toDF("product_key") 19 | df2.show() 20 | val finaldf = df.join(df2, Seq("product_key"), "inner").drop("product_key").distinct().filter(col("customer_id") =!= 2) 21 | finaldf.show() 22 | } 23 | } -------------------------------------------------------------------------------- /src/pack/Scenerio24.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio24 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new 
SparkConf().setMaster("local").setAppName("Scenerio24") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | (1, "home"), 18 | (1, "products"), 19 | (1, "checkout"), 20 | (1, "confirmation"), 21 | (2, "home"), 22 | (2, "products"), 23 | (2, "cart"), 24 | (2, "checkout"), 25 | (2, "confirmation"), 26 | (2, "home"), 27 | (2, "products")).toDF("userid", "page") 28 | df.show() 29 | 30 | //Through SQL 31 | df.createOrReplaceTempView("pagetab") 32 | spark.sql("select userid, collect_list(page) as pages from pagetab group by userid").show() 33 | 34 | //Through DSL 35 | val finaldf = df.groupBy("userid").agg(collect_list("page").as("pages")).show(false) 36 | } 37 | } -------------------------------------------------------------------------------- /src/pack/Scenerio25.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio25 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio25") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = spark.read.format("csv").option("header", "true").option("mode","DROPMALFORMED").load("D:/BigData/Datasets/Scenerio25.csv") 17 | df.show() 18 | } 19 | } -------------------------------------------------------------------------------- /src/pack/Scenerio26.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio26 { 9 | 10 | def main(args: Array[String]): Unit = { 11 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio25") 12 | val sc = new SparkContext(conf) 13 | sc.setLogLevel("ERROR") 14 | val spark = SparkSession.builder().getOrCreate() 15 | import spark.implicits._ 16 | 17 | val sourcedf = Seq( 18 | (1, "A"), 19 | (2, "B"), 20 | (3, "C"), 21 | (4, "D")).toDF("id", "name") 22 | sourcedf.show() 23 | 24 | val targetdf = Seq( 25 | (1, "A"), 26 | (2, "B"), 27 | (4, "X"), 28 | (5, "F")).toDF("id1", "name1") 29 | targetdf.show() 30 | 31 | sourcedf.createOrReplaceTempView("sourcetab") 32 | targetdf.createOrReplaceTempView("targettab") 33 | 34 | spark.sql("""SELECT COALESCE(s.id, t.id1) AS id, 35 | CASE 36 | WHEN s.name IS NULL THEN 'new in target' 37 | WHEN t.name1 IS NULL THEN 'new in source' 38 | WHEN s.name != t.name1 THEN 'mismatch' 39 | END AS comment 40 | FROM sourcetab s 41 | FULL OUTER JOIN targettab t ON s.id = t.id1 42 | WHERE s.name != t.name1 OR s.name IS NULL OR t.name1 IS NULL 43 | """).show() 44 | 45 | //Joining two dataframes 46 | 47 | val joindf = sourcedf.join(targetdf, col("id") === col("id1"), "outer") 48 | joindf.show() 49 | 50 | //filtering the columns which are not equal and null 51 | 52 | val filterdf = joindf.filter(col("name") =!= col("name1") || col("name").isNull || col("name1").isNull) 53 | filterdf.show() 54 | 55 | 
//coalesce will replace the null value with next non null value 56 | 57 | val nullfildf = filterdf.withColumn("id", coalesce(col("id"), col("id1"))).drop("id1") 58 | nullfildf.show() 59 | 60 | val finaldf = nullfildf.withColumn("comment", expr("case when name is null then 'new in target' when name1 is null then 'new in source' when name!=name1 then 'mismatch' end")).drop("name", "name1") 61 | finaldf.show() 62 | 63 | } 64 | } 65 | -------------------------------------------------------------------------------- /src/pack/Scenerio27.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.expressions._ 9 | 10 | object Scenerio27 { 11 | def main(args:Array[String]):Unit = { 12 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio27") 13 | val sc = new SparkContext(conf) 14 | sc.setLogLevel("ERROR") 15 | val spark = SparkSession.builder().getOrCreate() 16 | import spark.implicits._ 17 | 18 | val df = Seq((1,60000,2018),(1,70000,2019),(1,80000,2020),(2,60000,2018),(2,65000,2019),(2,65000,2020),(3,60000,2018),(3,65000,2019)).toDF("empid","salary","year") 19 | df.show() 20 | 21 | val wn = Window.partitionBy("empid").orderBy(col("year")) 22 | 23 | val lagdf = df.withColumn("diff",lag("salary",1) over(wn)) 24 | lagdf.show() 25 | 26 | val finaldf = lagdf.withColumn("incresalary",expr("salary - diff")).drop("diff").na.fill(0).orderBy("empid","year") 27 | finaldf.show() 28 | 29 | 30 | } 31 | } 32 | -------------------------------------------------------------------------------- /src/pack/Scenerio28.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.expressions._ 9 | 10 | object Scenerio28 { 11 | 12 | def main(args: Array[String]): Unit = { 13 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio27") 14 | val sc = new SparkContext(conf) 15 | sc.setLogLevel("ERROR") 16 | val spark = SparkSession.builder().getOrCreate() 17 | import spark.implicits._ 18 | 19 | val df = Seq(("A", "AA"), ("B", "BB"), ("C", "CC"), ("AA", "AAA"), ("BB", "BBB"), ("CC", "CCC")).toDF("child", "parent") 20 | df.show() 21 | 22 | val joindf = df.as("a").join(df.as("b"), col("a.child") === col("b.parent")).select( 23 | col("a.child").alias("child_a"), 24 | col("a.parent").alias("parent_a"), 25 | col("b.child").alias("child_b"), 26 | col("b.parent").alias("parent_b") 27 | ) 28 | joindf.show() 29 | 30 | val findf = joindf.withColumnRenamed("child_a", "parent").withColumnRenamed("parent_a", "grandparent").withColumnRenamed("child_b", "child").drop("parent_b").select("child", "parent", "grandparent") 31 | 32 | findf.show() 33 | 34 | //another way 35 | 36 | val df2 = df.withColumnRenamed("child", "child1").withColumnRenamed("parent", "parent1") 37 | df2.show() 38 | 39 | val secondjoindf = df.join(df2, df("parent") === df2("child1"), "inner") 40 | secondjoindf.show() 41 | 42 | val finaldf = secondjoindf.withColumnRenamed("parent1", "grandparent").drop("child1") 43 | finaldf.show() 44 | } 45 | } 46 | 
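// A minimal, self-contained sketch (not a file from this repository) restating the
// child -> parent -> grandparent self-join technique used in Scenerio28 above. The
// object and variable names (GrandparentSketch, relDf, grandDf) are illustrative assumptions.
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.col

object GrandparentSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("GrandparentSketch").getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")
    import spark.implicits._

    // Each row records one (child, parent) edge of the hierarchy.
    val relDf = Seq(("A", "AA"), ("AA", "AAA")).toDF("child", "parent")

    // Joining the relation to itself on parent == child walks one level up:
    // the left row supplies (child, parent) and the matching right row supplies the grandparent.
    val grandDf = relDf.as("c")
      .join(relDf.as("p"), col("c.parent") === col("p.child"), "inner")
      .select(col("c.child"), col("c.parent"), col("p.parent").alias("grandparent"))

    grandDf.show() // expected single row: A | AA | AAA
    spark.stop()
  }
}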
-------------------------------------------------------------------------------- /src/pack/Scenerio29.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.expressions._ 9 | 10 | object Scenerio29 { 11 | 12 | def main(args: Array[String]): Unit = { 13 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio27") 14 | val sc = new SparkContext(conf) 15 | sc.setLogLevel("ERROR") 16 | val spark = SparkSession.builder().getOrCreate() 17 | import spark.implicits._ 18 | 19 | val df1 = Seq((1), (2), (3)).toDF("col") 20 | df1.show() 21 | 22 | val df2 = Seq((1), (2), (3), (4), (5)).toDF("col1") 23 | df2.show() 24 | 25 | val maxdf = df1.agg(max("col").as("max")) 26 | maxdf.show() 27 | 28 | val maxsalary = maxdf.select(col("max")).first().getInt(0) 29 | 30 | val joindf = df1.join(df2, df1("col") === df2("col1"), "outer").drop("col") 31 | joindf.show() 32 | 33 | val finaldf = joindf.filter(col("col1") =!= maxsalary).withColumnRenamed("col1", "col") 34 | finaldf.show() 35 | 36 | } 37 | } 38 | -------------------------------------------------------------------------------- /src/pack/Scenerio3.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio3 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-3") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val data = Seq( 17 | (1111, "2021-01-15", 10), 18 | (1111, "2021-01-16", 15), 19 | (1111, "2021-01-17", 30), 20 | (1112, "2021-01-15", 10), 21 | (1112, "2021-01-15", 20), 22 | (1112, "2021-01-15", 30)).toDF("sensorid", "timestamp", "values") 23 | data.show() 24 | 25 | //Through DSL 26 | 27 | val d1 = Window.partitionBy("sensorid").orderBy("values") 28 | 29 | val finaldf = data.withColumn("nextvalues", lead("values", 1) over (d1)) 30 | .filter(col("nextvalues").isNotNull) 31 | .withColumn("values", expr("nextvalues-values")) 32 | .drop("nextvalues") 33 | .show() 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/pack/Scenerio30.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.expressions._ 9 | 10 | object Scenerio30 { 11 | 12 | def main(args: Array[String]): Unit = { 13 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio27") 14 | val sc = new SparkContext(conf) 15 | sc.setLogLevel("ERROR") 16 | val spark = SparkSession.builder().getOrCreate() 17 | import spark.implicits._ 18 | 19 | val df1 = Seq((1, "A", "A", 1000000), (2, "B", "A", 2500000), (3, "C", "G", 500000), (4, "D", "G", 800000), 
(5, "E", "W", 9000000), (6, "F", "W", 2000000)).toDF("emp_id", "name", "dept_id", "salary") 20 | df1.show() 21 | 22 | val df2 = Seq(("A", "AZURE"), ("G", "GCP"), ("W", "AWS")).toDF("dept_id1", "dept_name") 23 | df2.show() 24 | 25 | val joindf = df1.join(df2, df1("dept_id") === df2("dept_id1"), "inner").drop("dept_id1") 26 | joindf.show() 27 | 28 | val wn = Window.partitionBy("dept_id").orderBy(col("salary").desc) 29 | 30 | val rankdf = joindf.withColumn("rank", dense_rank() over (wn)) 31 | rankdf.show() 32 | 33 | val finaldf = rankdf.filter(col("rank") === 2).drop("rank").select("emp_id", "name", "dept_name", "salary") 34 | finaldf.show() 35 | } 36 | } 37 | -------------------------------------------------------------------------------- /src/pack/Scenerio31.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | 3 | import org.apache.spark.{SparkConf, SparkContext} 4 | import org.apache.spark.sql.SparkSession 5 | import org.apache.spark.sql._ 6 | import org.apache.spark.sql.types._ 7 | import org.apache.spark.sql.functions._ 8 | import org.apache.spark.sql.expressions._ 9 | 10 | object Scenerio31 { 11 | 12 | def main(args: Array[String]): Unit = { 13 | val conf = new SparkConf().setMaster("local[*]").setAppName("Scenerio27") 14 | val sc = new SparkContext(conf) 15 | sc.setLogLevel("ERROR") 16 | val spark = SparkSession.builder().getOrCreate() 17 | import spark.implicits._ 18 | 19 | val df = Seq(("m1", "m1,m2", "m1,m2,m3", "m1,m2,m3,m4")).toDF("col1", "col2", "col3", "col4") 20 | df.show() 21 | 22 | val contdf = df.withColumn("col", expr("concat(col1,'-',col2,'-',col3,'-',col4,'-')")).drop("col1", "col2", "col3", "col4") 23 | contdf.show(false) 24 | 25 | val finaldf = contdf.selectExpr("explode(split(col,'-')) as col") 26 | finaldf.show() 27 | 28 | } 29 | } 30 | -------------------------------------------------------------------------------- /src/pack/Scenerio32 Scala.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | val df1 = Seq((1,"Veg Biryani"),(2,"Veg Fried Rice"),(3,"Kaju Fried Rice"),(4,"Chicken Biryani"),(5,"Chicken Dum Biryani"),(6,"Prawns Biryani"),(7,"Fish Birayani")).toDF("food_id","food_item") 3 | df1.show() 4 | 5 | val df2 = Seq((1,5),(2,3),(3,4),(4,4),(5,5),(6,4),(7,4)).toDF("food_id","rating") 6 | df2.show() 7 | 8 | 9 | // COMMAND ---------- 10 | 11 | import org.apache.spark.sql.functions._ 12 | 13 | val joindf = df1.join(df2, df1("food_id") === df2("food_id"), "inner").select(df1("food_id"), df1("food_item"), df2("rating")) 14 | joindf.show() 15 | 16 | 17 | // COMMAND ---------- 18 | 19 | val finaldf = joindf.withColumn("stats(out of 5)",expr("repeat('*',rating)")) 20 | finaldf.show() 21 | -------------------------------------------------------------------------------- /src/pack/Scenerio33.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | val familydf = Seq(("c00dac11bde74750b4d207b9c182a85f", "Alex Thomas", 9),("eb6f2d3426694667ae3e79d6274114a4", "Chris Gray", 2),("3f7b5b8e835d4e1c8b3e12e964a741f3", "Emily Johnson", 4),("9a345b079d9f4d3cafb2d4c11d20f8ce", "Michael Brown", 6),("e0a5f57516024de2a231d09de2cbe9d1", "Jessica Wilson", 3)).toDF("id","name","family_size") 3 | familydf.show() 4 | 5 | val countrydf = Seq(("023fd23615bd4ff4b2ae0a13ed7efec9", "Bolivia", 2 , 4),("be247f73de0f4b2d810367cb26941fb9", "Cook Islands", 4,8),("3e85ab80a6f84ef3b9068b21dbcc54b3", "Brazil", 
4,7),("e571e164152c4f7c8413e2734f67b146", "Australia", 5,9),("f35a7bb7d44342f7a8a42a53115294a8", "Canada", 3,5),("a1b5a4b5fc5f46f891d9040566a78f27", "Japan", 10,12)).toDF("id","name","min_size","max_size") 6 | countrydf.show() 7 | 8 | // COMMAND ---------- 9 | 10 | import org.apache.spark.sql.functions._ 11 | 12 | val joindf = familydf.join(countrydf, familydf("family_size") >= countrydf("min_size") && familydf("family_size") <= countrydf("max_size"),"inner").select(familydf("name"), familydf("family_size"), countrydf("name").as("country_name"), countrydf("min_size"), countrydf("max_size")) 13 | joindf.show() 14 | 15 | 16 | // COMMAND ---------- 17 | 18 | val groupdf = joindf.groupBy(familydf("name")).agg(count("*").alias("number_of_countries")) 19 | groupdf.show() 20 | 21 | // COMMAND ---------- 22 | 23 | val finaldf = groupdf.agg(expr("max(number_of_countries)").alias("number_of_countries")) 24 | finaldf.show() 25 | 26 | // COMMAND ---------- 27 | 28 | import org.apache.spark.sql.expressions._ 29 | 30 | //another way 31 | val wn = Window.orderBy(desc("number_of_countries")) 32 | 33 | val rankdf = groupdf.withColumn("rank",row_number() over(wn)) 34 | rankdf.show() 35 | 36 | val finaldf2 = rankdf.filter(col("rank")===1).drop("rank") 37 | finaldf2.show() 38 | -------------------------------------------------------------------------------- /src/pack/Scenerio35.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark._ 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | import org.apache.spark.sql.types._ 6 | import spark.implicits._ 7 | 8 | //creating the dataframe df1 9 | val df1 = Seq((1,"Jhon",Some(17)),(2,"Maria",Some(20)),(3,"Raj",None),(4,"Rachel",Some(18))).toDF("id","name","age") 10 | df1.show() 11 | 12 | // COMMAND ---------- 13 | 14 | //Count null entries in each column 15 | val nullCounts = df1.select(df1.columns.map(c => sum(col(c).isNull.cast("int")).alias(c)): _*) 16 | nullCounts.show() 17 | 18 | // COMMAND ---------- 19 | 20 | //Remove the row with null entires and store them in a new dataframe named df2 21 | val df2 = df1.filter(col("age").isNull) 22 | df2.show() 23 | 24 | // COMMAND ---------- 25 | 26 | //create a new dataframe df3 27 | val df3 = Seq((1,"seatle",82),(2,"london",75),(3,"banglore",60),(4,"boston",90)).toDF("id","city","code") 28 | df3.show() 29 | 30 | //join the df1 and df3 31 | val mergedf = df1.join(df3, df1("id") === df3("id"), "inner").select(df1("id"), df1("name"), df1("age"), df3("city"), df3("code")) 32 | mergedf.show() 33 | 34 | // COMMAND ---------- 35 | 36 | //fill the null value with the mean age of students 37 | //calculate the mean age 38 | // Calculate the mean age 39 | val meanage = mergedf.select(mean("age")).first().getDouble(0) 40 | 41 | // Fill null values in the 'age' column with the mean age 42 | val filldf = mergedf.na.fill(Map("age" -> meanage)) 43 | 44 | // Show the resulting DataFrame 45 | filldf.show() 46 | 47 | // COMMAND ---------- 48 | 49 | //Get the students who are 18 years or older 50 | val filterdf = filldf.filter(col("age")>= 18) 51 | filterdf.show() 52 | -------------------------------------------------------------------------------- /src/pack/Scenerio36.scala: -------------------------------------------------------------------------------- 1 | // Databricks notebook source 2 | import org.apache.spark._ 3 | import org.apache.spark.sql._ 4 | import org.apache.spark.sql.functions._ 5 | import 
org.apache.spark.sql.types._ 6 | import spark.implicits._ 7 | 8 | val data = Seq(("2020-05-30","Headphone"),("2020-06-01","Pencil"),("2020-06-02","Mask"),("2020-05-30","Basketball"),("2020-06-01","Book"),("2020-06-02","Mask"),("2020-05-30","T-Shirt")).toDF("sell_date","product") 9 | data.show() 10 | 11 | // COMMAND ---------- 12 | 13 | val transdf = data.groupBy("sell_date").agg(collect_set("product").alias("products"),size(collect_set("product")).alias("num_sell")) 14 | transdf.show() 15 | -------------------------------------------------------------------------------- /src/pack/Scenerio4.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio4 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-4") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | (1, "Mark Ray", "AB"), 18 | (2, "Peter Smith", "CD"), 19 | (1, "Mark Ray", "EF"), 20 | (2, "Peter Smith", "GH"), 21 | (2, "Peter Smith", "CD"), 22 | (3, "Kate", "IJ")).toDF("custid", "custname", "address") 23 | df.show() 24 | 25 | //Through SQL 26 | df.createOrReplaceTempView("custtab") 27 | 28 | spark.sql("select custid,custname,collect_set(address) as address from custtab group by custid,custname order by custid").show() 29 | 30 | //Through DSL 31 | 32 | val finaldf = df.groupBy("custid", "custname").agg(collect_set("address").as("address")).orderBy("custid").show() 33 | 34 | } 35 | } 36 | -------------------------------------------------------------------------------- /src/pack/Scenerio5.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio5 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("scenerio-5") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | val df1 = Seq( 16 | (1, "abc", 31, "abc@gmail.com"), 17 | (2, "def", 23, "defyahoo.com"), 18 | (3, "xyz", 26, "xyz@gmail.com"), 19 | (4, "qwe", 34, "qwegmail.com"), 20 | (5, "iop", 24, "iop@gmail.com")) 21 | .toDF("id", "name", "age", "email") 22 | df1.show() 23 | 24 | val df2 = Seq( 25 | (11, "jkl", 22, "abc@gmail.com", 1000), 26 | (12, "vbn", 33, "vbn@yahoo.com", 3000), 27 | (13, "wer", 27, "wer", 2000), 28 | (14, "zxc", 30, "zxc.com", 2000), 29 | (15, "lkj", 29, "lkj@outlook.com", 2000)) 30 | .toDF("id", "name", "age", "email", "salary") 31 | df2.show() 32 | 33 | //number of partiion in df 34 | val partcount = df1.rdd.getNumPartitions 35 | println("Number of partition:- " + partcount) 36 | 37 | val df3 = df1.withColumn("salary", lit(1000)) 38 | df3.show() 39 | 40 | //append df2 and df3, and form df4 41 | val df4 = df2.union(df3).orderBy(col("id") asc) 42 | df4.show() 43 | 44 | //Remove records which have invalid email from 
df4, emails with @ are considered to be valid. 45 | val rmdf = df4.filter(col("email").rlike("@")) 46 | rmdf.show() 47 | 48 | //Write df4 to a target location, by partitioning on salary. 49 | rmdf.write.format("parquet").partitionBy("salary").save("D:/BigData/Processed Datasets/interdata") 50 | 51 | 52 | } 53 | } 54 | -------------------------------------------------------------------------------- /src/pack/Scenerio6.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio6 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio6") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = spark.createDataFrame(Seq( 17 | ("1", "a", "10000"), 18 | ("2", "b", "5000"), 19 | ("3", "c", "15000"), 20 | ("4", "d", "25000"), 21 | ("5", "e", "50000"), 22 | ("6", "f", "7000"))) 23 | .toDF("empid", "name", "salary") 24 | df.show() 25 | 26 | //Through SQL 27 | df.createOrReplaceTempView("emptab") 28 | spark.sql("select *, case when salary > 10000 then 'Manager' else 'Employee' end as Designation from emptab").show() 29 | 30 | //Through DSL 31 | val finaldf = df.withColumn("Desgination", expr("case when salary > 10000 then 'Manager' else 'Employee' end")) 32 | finaldf.show() 33 | } 34 | } -------------------------------------------------------------------------------- /src/pack/Scenerio7.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio7 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio6") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = spark.createDataFrame(Seq( 17 | (1, 100, 2010, 25, 5000), 18 | (2, 100, 2011, 16, 5000), 19 | (3, 100, 2012, 8, 5000), 20 | (4, 200, 2010, 10, 9000), 21 | (5, 200, 2011, 15, 9000), 22 | (6, 200, 2012, 20, 7000), 23 | (7, 300, 2010, 20, 7000), 24 | (8, 300, 2011, 18, 7000), 25 | (9, 300, 2012, 20, 7000))) 26 | .toDF("sale_id", "product_id", "year", "quantity", "price") 27 | df.show() 28 | 29 | //Through SQL 30 | df.createOrReplaceTempView("salestab") 31 | spark.sql("SELECT *FROM (SELECT *, DENSE_RANK() OVER (PARTITION BY year ORDER BY quantity DESC) AS rank FROM salestab) AS rankdf WHERE rank = 1 ORDER BY sale_id").show() 32 | 33 | //Through DSL 34 | val win = Window.partitionBy("year").orderBy(col("quantity").desc) 35 | 36 | val rankdf = df.withColumn("rank", dense_rank() over (win)) 37 | rankdf.show() 38 | 39 | val finaldf = rankdf.filter(col("rank") === "1").drop("rank").orderBy("sale_id").show() 40 | 41 | } 42 | } -------------------------------------------------------------------------------- /src/pack/Scenerio8.scala: 
-------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio8 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio8") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | ("India"), 18 | ("Pakistan"), 19 | ("SriLanka")).toDF("teams") 20 | 21 | df.show() 22 | 23 | //Through SQL 24 | df.createOrReplaceTempView("crickettab") 25 | 26 | //self join query for reference - select a.teams,b.teams from crickettab a inner join crickettab b on a.teams < b.teams 27 | 28 | spark.sql("select concat(a.teams, ' Vs ', b.teams) as matches from crickettab a inner join crickettab b on a.teams < b.teams").show() 29 | 30 | //Through DSL 31 | 32 | val joindf = df.as("a").join(df.as("b"), $"a.teams" < $"b.teams", "inner") 33 | joindf.show() 34 | 35 | val finaldf = joindf.withColumn("matches", expr("concat(a.teams,' Vs ',b.teams)")).drop("teams", "teams").show() 36 | } 37 | } -------------------------------------------------------------------------------- /src/pack/Scenerio9.scala: -------------------------------------------------------------------------------- 1 | package pack 2 | import org.apache.spark.SparkConf 3 | import org.apache.spark.SparkContext 4 | import org.apache.spark.sql._ 5 | import org.apache.spark.sql.types._ 6 | import org.apache.spark.sql.functions._ 7 | import org.apache.spark.sql.expressions._ 8 | object Scenerio9 { 9 | def main(args: Array[String]): Unit = { 10 | val conf = new SparkConf().setMaster("local").setAppName("Scenerio9") 11 | val sc = new SparkContext(conf) 12 | sc.setLogLevel("ERROR") 13 | val spark = SparkSession.builder().getOrCreate() 14 | import spark.implicits._ 15 | 16 | val df = Seq( 17 | ("a", Seq(1, 1, 1, 3)), 18 | ("b", Seq(1, 2, 3, 4)), 19 | ("c", Seq(1, 1, 1, 1, 4)), 20 | ("d", Seq(3))).toDF("name", "rank") 21 | 22 | df.show() 23 | 24 | val explodedf = df.withColumn("rank", explode(col("rank"))) 25 | explodedf.show() 26 | 27 | val filtdf = explodedf.filter(col("rank") === 1) 28 | filtdf.show() 29 | 30 | val countdf = filtdf.groupBy("name").agg(count("*").as("count")).orderBy(col("count") desc) 31 | countdf.show() 32 | 33 | val finaldf = countdf.select(col("name")).first().getString(0) 34 | println(finaldf) 35 | 36 | } 37 | } -------------------------------------------------------------------------------- /target/classes/pack/Scenerio1$$typecreator5$1.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/target/classes/pack/Scenerio1$$typecreator5$1.class -------------------------------------------------------------------------------- /target/classes/pack/Scenerio1$.class: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/target/classes/pack/Scenerio1$.class -------------------------------------------------------------------------------- /target/classes/pack/Scenerio1.class: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/mohankrishna02/interview-scenerios-spark-sql/30f056cb639fe0ee812eb0eb548e9136c3845e38/target/classes/pack/Scenerio1.class --------------------------------------------------------------------------------