├── Untitled.ipynb
├── day1.ipynb
├── day2.ipynb
├── day3.ipynb
├── day4.ipynb
├── day5.ipynb
├── day6.ipynb
├── day7.ipynb
├── test1.csv
├── test2.csv
├── test3.csv
├── test4.csv
├── test5.csv
└── tips.csv
/Untitled.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "id": "8db32105-fc54-445b-8b33-b9779412ff81",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": []
10 | }
11 | ],
12 | "metadata": {
13 | "kernelspec": {
14 | "display_name": "Python 3 (ipykernel)",
15 | "language": "python",
16 | "name": "python3"
17 | },
18 | "language_info": {
19 | "codemirror_mode": {
20 | "name": "ipython",
21 | "version": 3
22 | },
23 | "file_extension": ".py",
24 | "mimetype": "text/x-python",
25 | "name": "python",
26 | "nbconvert_exporter": "python",
27 | "pygments_lexer": "ipython3",
28 | "version": "3.11.7"
29 | }
30 | },
31 | "nbformat": 4,
32 | "nbformat_minor": 5
33 | }
34 |
--------------------------------------------------------------------------------
/day1.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "166d2b18-2e08-4cdc-a65f-fe2f84694055",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Collecting spark\n",
14 | " Downloading spark-0.2.1.tar.gz (41 kB)\n",
15 | "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m414.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m\n",
16 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n",
17 | "\u001b[?25hBuilding wheels for collected packages: spark\n",
18 | " Building wheel for spark (setup.py) ... \u001b[?25ldone\n",
19 | "\u001b[?25h Created wheel for spark: filename=spark-0.2.1-py3-none-any.whl size=58748 sha256=c546ef45511ed9eeb315c261edbfd4254da318be8cb9bca4bbd5b41b7b5273b6\n",
20 | " Stored in directory: /home/kyn/.cache/pip/wheels/67/c2/7c/a53325365fba358ffff35af84a2e14cf88c18052f88acfa5f0\n",
21 | "Successfully built spark\n",
22 | "Installing collected packages: spark\n",
23 | "Successfully installed spark-0.2.1\n"
24 | ]
25 | }
26 | ],
27 | "source": [
28 | "!pip install spark"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 2,
34 | "id": "40c102b2-0952-4ea5-b0cb-86fc18ce4798",
35 | "metadata": {},
36 | "outputs": [],
37 | "source": [
38 | "import pyspark"
39 | ]
40 | },
41 | {
42 | "cell_type": "code",
43 | "execution_count": 3,
44 | "id": "313ac534-951d-4d7b-81b5-32ea3d1aa5a8",
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "data": {
49 | "text/html": [
50 | "
\n",
51 | "\n",
64 | "
\n",
65 | " \n",
66 | " \n",
67 | " | \n",
68 | " name | \n",
69 | " age | \n",
70 | "
\n",
71 | " \n",
72 | " \n",
73 | " \n",
74 | " 0 | \n",
75 | " kani | \n",
76 | " 15 | \n",
77 | "
\n",
78 | " \n",
79 | " 1 | \n",
80 | " kani1 | \n",
81 | " 16 | \n",
82 | "
\n",
83 | " \n",
84 | " 2 | \n",
85 | " kani2 | \n",
86 | " 17 | \n",
87 | "
\n",
88 | " \n",
89 | "
\n",
90 | "
"
91 | ],
92 | "text/plain": [
93 | " name age\n",
94 | "0 kani 15\n",
95 | "1 kani1 16\n",
96 | "2 kani2 17"
97 | ]
98 | },
99 | "execution_count": 3,
100 | "metadata": {},
101 | "output_type": "execute_result"
102 | }
103 | ],
104 | "source": [
105 | "import pandas as pd\n",
106 | "df=pd.read_csv(\"test1.csv\")\n",
107 | "df"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 14,
113 | "id": "a71f8f2e-8677-4392-b042-f540b508eebc",
114 | "metadata": {},
115 | "outputs": [
116 | {
117 | "data": {
118 | "text/plain": [
119 | "pandas.core.frame.DataFrame"
120 | ]
121 | },
122 | "execution_count": 14,
123 | "metadata": {},
124 | "output_type": "execute_result"
125 | }
126 | ],
127 | "source": [
128 | "type(pd.read_csv(\"test1.csv\"))"
129 | ]
130 | },
131 | {
132 | "cell_type": "code",
133 | "execution_count": 5,
134 | "id": "fd9e7d5e-1630-4d40-91bb-5b372e9a192b",
135 | "metadata": {},
136 | "outputs": [
137 | {
138 | "name": "stderr",
139 | "output_type": "stream",
140 | "text": [
141 | "24/04/23 10:17:17 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.239 instead (on interface wlp0s20f3)\n",
142 | "24/04/23 10:17:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
143 | "Setting default log level to \"WARN\".\n",
144 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
145 | "24/04/23 10:17:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
146 | ]
147 | }
148 | ],
149 | "source": [
150 | "from pyspark.sql import SparkSession\n",
151 | "spark=SparkSession.builder.appName(\"kani\").getOrCreate()"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 6,
157 | "id": "e9f269a8-11c1-4eb4-9f3e-a27eaef37910",
158 | "metadata": {},
159 | "outputs": [
160 | {
161 | "data": {
162 | "text/html": [
163 | "\n",
164 | " \n",
165 | "
SparkSession - in-memory
\n",
166 | " \n",
167 | "
\n",
168 | "
SparkContext
\n",
169 | "\n",
170 | "
Spark UI
\n",
171 | "\n",
172 | "
\n",
173 | " - Version
\n",
174 | " v3.5.1
\n",
175 | " - Master
\n",
176 | " local[*]
\n",
177 | " - AppName
\n",
178 | " kani
\n",
179 | "
\n",
180 | "
\n",
181 | " \n",
182 | "
\n",
183 | " "
184 | ],
185 | "text/plain": [
186 | ""
187 | ]
188 | },
189 | "execution_count": 6,
190 | "metadata": {},
191 | "output_type": "execute_result"
192 | }
193 | ],
194 | "source": [
195 | "spark"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 7,
201 | "id": "1924f962-b9d5-45e0-b326-f069ec69a27f",
202 | "metadata": {},
203 | "outputs": [],
204 | "source": [
205 | "df_spark=spark.read.csv(\"test1.csv\")"
206 | ]
207 | },
208 | {
209 | "cell_type": "code",
210 | "execution_count": 8,
211 | "id": "fa08660e-d5f6-4bb0-8810-138c05878b25",
212 | "metadata": {},
213 | "outputs": [
214 | {
215 | "data": {
216 | "text/plain": [
217 | "DataFrame[_c0: string, _c1: string]"
218 | ]
219 | },
220 | "execution_count": 8,
221 | "metadata": {},
222 | "output_type": "execute_result"
223 | }
224 | ],
225 | "source": [
226 | "df_spark"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": 9,
232 | "id": "ec97f69a-96dd-4fa9-8e85-f501f1bcf023",
233 | "metadata": {},
234 | "outputs": [
235 | {
236 | "name": "stdout",
237 | "output_type": "stream",
238 | "text": [
239 | "+-----+---+\n",
240 | "| _c0|_c1|\n",
241 | "+-----+---+\n",
242 | "| name|age|\n",
243 | "| kani| 15|\n",
244 | "|kani1| 16|\n",
245 | "|kani2| 17|\n",
246 | "+-----+---+\n",
247 | "\n"
248 | ]
249 | }
250 | ],
251 | "source": [
252 | "df_spark.show()"
253 | ]
254 | },
255 | {
256 | "cell_type": "code",
257 | "execution_count": 10,
258 | "id": "f25c9b93-bf9c-4697-bc69-69ab67424aa7",
259 | "metadata": {},
260 | "outputs": [
261 | {
262 | "data": {
263 | "text/plain": [
264 | "DataFrame[name: string, age: string]"
265 | ]
266 | },
267 | "execution_count": 10,
268 | "metadata": {},
269 | "output_type": "execute_result"
270 | }
271 | ],
272 | "source": [
273 | "spark.read.option(\"header\",\"true\").csv(\"test1.csv\")"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 11,
279 | "id": "adb50d24-9cf0-445c-86a5-427f7bebc19b",
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "+-----+---+\n",
287 | "| name|age|\n",
288 | "+-----+---+\n",
289 | "| kani| 15|\n",
290 | "|kani1| 16|\n",
291 | "|kani2| 17|\n",
292 | "+-----+---+\n",
293 | "\n"
294 | ]
295 | }
296 | ],
297 | "source": [
298 | "spark.read.option(\"header\",\"true\").csv(\"test1.csv\").show()"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": 12,
304 | "id": "345760c6-7830-4c91-8882-052f7f8d33ea",
305 | "metadata": {},
306 | "outputs": [
307 | {
308 | "data": {
309 | "text/plain": [
310 | "pyspark.sql.dataframe.DataFrame"
311 | ]
312 | },
313 | "execution_count": 12,
314 | "metadata": {},
315 | "output_type": "execute_result"
316 | }
317 | ],
318 | "source": [
319 | "type(df_spark)"
320 | ]
321 | },
322 | {
323 | "cell_type": "code",
324 | "execution_count": 18,
325 | "id": "9b9cd715-eb93-4119-9645-1a39b6e594c8",
326 | "metadata": {},
327 | "outputs": [
328 | {
329 | "data": {
330 | "text/plain": [
331 | "[Row(_c0='name', _c1='age'),\n",
332 | " Row(_c0='kani', _c1='15'),\n",
333 | " Row(_c0='kani1', _c1='16')]"
334 | ]
335 | },
336 | "execution_count": 18,
337 | "metadata": {},
338 | "output_type": "execute_result"
339 | }
340 | ],
341 | "source": [
342 | "df_spark.head(3)"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 23,
348 | "id": "f66faaf3-bb69-4925-8a86-29d3432adfa8",
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "name": "stdout",
353 | "output_type": "stream",
354 | "text": [
355 | "root\n",
356 | " |-- _c0: string (nullable = true)\n",
357 | " |-- _c1: string (nullable = true)\n",
358 | "\n"
359 | ]
360 | }
361 | ],
362 | "source": [
363 | "df_spark.printSchema()"
364 | ]
365 | },
366 | {
367 | "cell_type": "code",
368 | "execution_count": null,
369 | "id": "e2f71893-b4ea-4cea-8eaf-15e7cf845e4e",
370 | "metadata": {},
371 | "outputs": [],
372 | "source": []
373 | }
374 | ],
375 | "metadata": {
376 | "kernelspec": {
377 | "display_name": "Python 3 (ipykernel)",
378 | "language": "python",
379 | "name": "python3"
380 | },
381 | "language_info": {
382 | "codemirror_mode": {
383 | "name": "ipython",
384 | "version": 3
385 | },
386 | "file_extension": ".py",
387 | "mimetype": "text/x-python",
388 | "name": "python",
389 | "nbconvert_exporter": "python",
390 | "pygments_lexer": "ipython3",
391 | "version": "3.11.7"
392 | }
393 | },
394 | "nbformat": 4,
395 | "nbformat_minor": 5
396 | }
397 |
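A minimal sketch consolidating the day1 reading pattern (not from the notebook itself; assumes the same local test1.csv with a header row):

    from pyspark.sql import SparkSession

    spark = SparkSession.builder.appName("kani").getOrCreate()

    # with no options, columns arrive as _c0, _c1, ... and every value is a string
    raw = spark.read.csv("test1.csv")
    raw.printSchema()

    # header=True takes column names from the first row;
    # inferSchema=True samples the data to choose int/double/string types
    df = spark.read.csv("test1.csv", header=True, inferSchema=True)
    df.printSchema()
    df.show()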
--------------------------------------------------------------------------------
/day2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "0204e5b9-754d-43c9-9427-416059b90d6e",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stderr",
11 | "output_type": "stream",
12 | "text": [
13 | "24/04/24 09:35:09 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.240 instead (on interface wlp0s20f3)\n",
14 | "24/04/24 09:35:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
15 | "Setting default log level to \"WARN\".\n",
16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
17 | "24/04/24 09:35:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
18 | ]
19 | },
20 | {
21 | "data": {
22 | "text/html": [
23 | "\n",
24 | " \n",
25 | "
SparkSession - in-memory
\n",
26 | " \n",
27 | "
\n",
28 | "
SparkContext
\n",
29 | "\n",
30 | "
Spark UI
\n",
31 | "\n",
32 | "
\n",
33 | " - Version
\n",
34 | " v3.5.1
\n",
35 | " - Master
\n",
36 | " local[*]
\n",
37 | " - AppName
\n",
38 | " dataframe
\n",
39 | "
\n",
40 | "
\n",
41 | " \n",
42 | "
\n",
43 | " "
44 | ],
45 | "text/plain": [
46 | ""
47 | ]
48 | },
49 | "execution_count": 1,
50 | "metadata": {},
51 | "output_type": "execute_result"
52 | }
53 | ],
54 | "source": [
55 | "from pyspark.sql import SparkSession\n",
56 | "spark=SparkSession.builder.appName(\"dataframe\").getOrCreate()\n",
57 | "spark"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 2,
63 | "id": "05510abc-d3a9-42df-8146-15e7fa28403b",
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "data": {
68 | "text/plain": [
69 | "DataFrame[_c0: string, _c1: string]"
70 | ]
71 | },
72 | "execution_count": 2,
73 | "metadata": {},
74 | "output_type": "execute_result"
75 | }
76 | ],
77 | "source": [
78 | "df_spark=spark.read.csv(\"test1.csv\")\n",
79 | "df_spark"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 26,
85 | "id": "0a192b69-d0fc-4e08-bde7-1d69cf0641ae",
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "name": "stdout",
90 | "output_type": "stream",
91 | "text": [
92 | "+-----+---+----------+\n",
93 | "| name|age|experience|\n",
94 | "+-----+---+----------+\n",
95 | "| kani| 15| 10|\n",
96 | "|kani1| 16| 8|\n",
97 | "|kani2| 17| 4|\n",
98 | "+-----+---+----------+\n",
99 | "\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "#read dataset\n",
105 | "df_spark=spark.read.option(\"header\",\"true\").csv(\"test2.csv\")\n",
106 | "df_spark.show()"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 4,
112 | "id": "bc69ebc5-b466-441b-b41f-bfaed5dca671",
113 | "metadata": {},
114 | "outputs": [
115 | {
116 | "name": "stdout",
117 | "output_type": "stream",
118 | "text": [
119 | "root\n",
120 | " |-- name: string (nullable = true)\n",
121 | " |-- age: string (nullable = true)\n",
122 | " |-- experience: string (nullable = true)\n",
123 | "\n"
124 | ]
125 | }
126 | ],
127 | "source": [
128 | "#check schema\n",
129 | "df_spark.printSchema()\n",
130 | "#here all are show an string so that \n",
131 | "#use an inferschema=true"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 5,
137 | "id": "a2de4aae-47aa-4fa2-a64d-3ee819312b08",
138 | "metadata": {},
139 | "outputs": [
140 | {
141 | "name": "stdout",
142 | "output_type": "stream",
143 | "text": [
144 | "+-----+---+----------+\n",
145 | "| name|age|experience|\n",
146 | "+-----+---+----------+\n",
147 | "| kani| 15| 10|\n",
148 | "|kani1| 16| 8|\n",
149 | "|kani2| 17| 4|\n",
150 | "+-----+---+----------+\n",
151 | "\n"
152 | ]
153 | }
154 | ],
155 | "source": [
156 | "df_spark=spark.read.option(\"header\",\"true\").csv(\"test2.csv\",inferSchema=True)\n",
157 | "df_spark.show()\n"
158 | ]
159 | },
160 | {
161 | "cell_type": "code",
162 | "execution_count": 6,
163 | "id": "b8473ce1-9299-4667-a779-ea46c8538d8f",
164 | "metadata": {},
165 | "outputs": [
166 | {
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "root\n",
171 | " |-- name: string (nullable = true)\n",
172 | " |-- age: integer (nullable = true)\n",
173 | " |-- experience: integer (nullable = true)\n",
174 | "\n"
175 | ]
176 | }
177 | ],
178 | "source": [
179 | "df_spark.printSchema()\n",
180 | "#now it show an crt datatype "
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 7,
186 | "id": "ea7de970-25b4-45c0-8840-95942e664579",
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/plain": [
192 | "pyspark.sql.dataframe.DataFrame"
193 | ]
194 | },
195 | "execution_count": 7,
196 | "metadata": {},
197 | "output_type": "execute_result"
198 | }
199 | ],
200 | "source": [
201 | "type(df_spark)\n",
202 | "#dataframe is an data structures"
203 | ]
204 | },
205 | {
206 | "cell_type": "code",
207 | "execution_count": 8,
208 | "id": "79d67920-1648-4ba7-a4d7-d1467390cfc0",
209 | "metadata": {},
210 | "outputs": [
211 | {
212 | "data": {
213 | "text/plain": [
214 | "['name', 'age', 'experience']"
215 | ]
216 | },
217 | "execution_count": 8,
218 | "metadata": {},
219 | "output_type": "execute_result"
220 | }
221 | ],
222 | "source": [
223 | "df_spark.columns"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 9,
229 | "id": "8838c7a4-f2fe-4ecc-babf-0f768b697b12",
230 | "metadata": {},
231 | "outputs": [
232 | {
233 | "data": {
234 | "text/plain": [
235 | "[Row(name='kani', age=15, experience=10),\n",
236 | " Row(name='kani1', age=16, experience=8),\n",
237 | " Row(name='kani2', age=17, experience=4)]"
238 | ]
239 | },
240 | "execution_count": 9,
241 | "metadata": {},
242 | "output_type": "execute_result"
243 | }
244 | ],
245 | "source": [
246 | "df_spark.head(3)"
247 | ]
248 | },
249 | {
250 | "cell_type": "code",
251 | "execution_count": 10,
252 | "id": "e4ffe89b-3525-4a76-9dd4-8f73258a1fc9",
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "name": "stdout",
257 | "output_type": "stream",
258 | "text": [
259 | "+-----+---+----------+\n",
260 | "| name|age|experience|\n",
261 | "+-----+---+----------+\n",
262 | "| kani| 15| 10|\n",
263 | "|kani1| 16| 8|\n",
264 | "|kani2| 17| 4|\n",
265 | "+-----+---+----------+\n",
266 | "\n"
267 | ]
268 | }
269 | ],
270 | "source": [
271 | "df_spark.show()\n"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 11,
277 | "id": "a1df0972-1e5b-41f5-a39f-824c17beb234",
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "name": "stdout",
282 | "output_type": "stream",
283 | "text": [
284 | "+-----+---+\n",
285 | "| name|age|\n",
286 | "+-----+---+\n",
287 | "| kani| 15|\n",
288 | "|kani1| 16|\n",
289 | "|kani2| 17|\n",
290 | "+-----+---+\n",
291 | "\n"
292 | ]
293 | }
294 | ],
295 | "source": [
296 | "df_spark.select(\"name\",\"age\").show()"
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "execution_count": 12,
302 | "id": "722a60f6-c3ea-4e83-9ed0-517148f0f9eb",
303 | "metadata": {},
304 | "outputs": [
305 | {
306 | "data": {
307 | "text/plain": [
308 | "DataFrame[name: string, age: int]"
309 | ]
310 | },
311 | "execution_count": 12,
312 | "metadata": {},
313 | "output_type": "execute_result"
314 | }
315 | ],
316 | "source": [
317 | "df_spark.select(\"name\",\"age\")"
318 | ]
319 | },
320 | {
321 | "cell_type": "code",
322 | "execution_count": 13,
323 | "id": "39faa7d2-0218-466b-8bae-9706fe2e0739",
324 | "metadata": {},
325 | "outputs": [
326 | {
327 | "data": {
328 | "text/plain": [
329 | "pyspark.sql.dataframe.DataFrame"
330 | ]
331 | },
332 | "execution_count": 13,
333 | "metadata": {},
334 | "output_type": "execute_result"
335 | }
336 | ],
337 | "source": [
338 | "type(df_spark.select(\"name\",\"age\"))"
339 | ]
340 | },
341 | {
342 | "cell_type": "code",
343 | "execution_count": 14,
344 | "id": "f64dddc0-147d-40c9-bd33-7cd594e37ede",
345 | "metadata": {},
346 | "outputs": [
347 | {
348 | "data": {
349 | "text/plain": [
350 | "Column<'name'>"
351 | ]
352 | },
353 | "execution_count": 14,
354 | "metadata": {},
355 | "output_type": "execute_result"
356 | }
357 | ],
358 | "source": [
359 | "df_spark['name']"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 15,
365 | "id": "9c642a38-fdaf-44c9-8386-69a158b219e3",
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "data": {
370 | "text/plain": [
371 | "[('name', 'string'), ('age', 'int'), ('experience', 'int')]"
372 | ]
373 | },
374 | "execution_count": 15,
375 | "metadata": {},
376 | "output_type": "execute_result"
377 | }
378 | ],
379 | "source": [
380 | "df_spark.dtypes"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 16,
386 | "id": "dc5083b0-7911-4bf6-99f1-f48a24e9fc7a",
387 | "metadata": {},
388 | "outputs": [
389 | {
390 | "name": "stdout",
391 | "output_type": "stream",
392 | "text": [
393 | "+-------+-----+----+-----------------+\n",
394 | "|summary| name| age| experience|\n",
395 | "+-------+-----+----+-----------------+\n",
396 | "| count| 3| 3| 3|\n",
397 | "| mean| NULL|16.0|7.333333333333333|\n",
398 | "| stddev| NULL| 1.0|3.055050463303893|\n",
399 | "| min| kani| 15| 4|\n",
400 | "| max|kani2| 17| 10|\n",
401 | "+-------+-----+----+-----------------+\n",
402 | "\n"
403 | ]
404 | }
405 | ],
406 | "source": [
407 | "df_spark.describe().show()"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 17,
413 | "id": "a8b1544d-dbeb-4e88-97be-1a786711ad83",
414 | "metadata": {},
415 | "outputs": [
416 | {
417 | "name": "stdout",
418 | "output_type": "stream",
419 | "text": [
420 | "+-----+---+----------+\n",
421 | "| name|age|experience|\n",
422 | "+-----+---+----------+\n",
423 | "| kani| 15| 10|\n",
424 | "|kani1| 16| 8|\n",
425 | "|kani2| 17| 4|\n",
426 | "+-----+---+----------+\n",
427 | "\n"
428 | ]
429 | }
430 | ],
431 | "source": [
432 | "df_spark.show()"
433 | ]
434 | },
435 | {
436 | "cell_type": "code",
437 | "execution_count": 18,
438 | "id": "cc9d2965-a122-4d01-9748-85a82b2ddaf3",
439 | "metadata": {},
440 | "outputs": [
441 | {
442 | "ename": "AnalysisException",
443 | "evalue": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `experinced` cannot be resolved. Did you mean one of the following? [`name`, `age`, `experience`].",
444 | "output_type": "error",
445 | "traceback": [
446 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
447 | "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)",
448 | "Cell \u001b[0;32mIn[18], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#old verison add columns \u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_spark\u001b[38;5;241m=\u001b[39mdf_spark\u001b[38;5;241m.\u001b[39mwithColumn(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExperineced after 2 years\u001b[39m\u001b[38;5;124m\"\u001b[39m,df_spark[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mexperinced\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m)\n",
449 | "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pyspark/sql/dataframe.py:3078\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 3006\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Returns the column as a :class:`Column`.\u001b[39;00m\n\u001b[1;32m 3007\u001b[0m \n\u001b[1;32m 3008\u001b[0m \u001b[38;5;124;03m.. versionadded:: 1.3.0\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3075\u001b[0m \u001b[38;5;124;03m+---+----+\u001b[39;00m\n\u001b[1;32m 3076\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 3077\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m-> 3078\u001b[0m jc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jdf\u001b[38;5;241m.\u001b[39mapply(item)\n\u001b[1;32m 3079\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Column(jc)\n\u001b[1;32m 3080\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, Column):\n",
450 | "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/py4j/java_gateway.py:1322\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1316\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1317\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1318\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1319\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1321\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1322\u001b[0m return_value \u001b[38;5;241m=\u001b[39m get_return_value(\n\u001b[1;32m 1323\u001b[0m answer, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget_id, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 1325\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(temp_arg, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_detach\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n",
451 | "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pyspark/errors/exceptions/captured.py:185\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 181\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 183\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 185\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n",
452 | "\u001b[0;31mAnalysisException\u001b[0m: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `experinced` cannot be resolved. Did you mean one of the following? [`name`, `age`, `experience`]."
453 | ]
454 | }
455 | ],
456 | "source": [
457 | "#old verison add columns \n",
458 | "df_spark=df_spark.withColumn(\"Experineced after 2 years\",df_spark['experinced']+2)"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": 19,
464 | "id": "c8837df0-61ac-4868-a857-355eb3ba87b1",
465 | "metadata": {},
466 | "outputs": [
467 | {
468 | "data": {
469 | "text/plain": [
470 | "DataFrame[name: string, age: int, experience: int, Experience After 2 years: int]"
471 | ]
472 | },
473 | "execution_count": 19,
474 | "metadata": {},
475 | "output_type": "execute_result"
476 | }
477 | ],
478 | "source": [
479 | "#new version \n",
480 | "from pyspark.sql.functions import col\n",
481 | "df_spark = df_spark.withColumn(\"Experience After 2 years\", col(\"Experience\") + 2)\n",
482 | "df_spark\n"
483 | ]
484 | },
485 | {
486 | "cell_type": "code",
487 | "execution_count": 20,
488 | "id": "18aaba8c-df0e-40c0-ab95-f3eee5317dc6",
489 | "metadata": {},
490 | "outputs": [
491 | {
492 | "name": "stdout",
493 | "output_type": "stream",
494 | "text": [
495 | "+-----+---+----------+------------------------+\n",
496 | "| name|age|experience|Experience After 2 years|\n",
497 | "+-----+---+----------+------------------------+\n",
498 | "| kani| 15| 10| 12|\n",
499 | "|kani1| 16| 8| 10|\n",
500 | "|kani2| 17| 4| 6|\n",
501 | "+-----+---+----------+------------------------+\n",
502 | "\n"
503 | ]
504 | }
505 | ],
506 | "source": [
507 | "df_spark.show()"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 24,
513 | "id": "e50ad537-2396-4ccf-b93a-9b3b07f1fa35",
514 | "metadata": {},
515 | "outputs": [
516 | {
517 | "name": "stdout",
518 | "output_type": "stream",
519 | "text": [
520 | "+-----+---+----------+\n",
521 | "| name|age|experience|\n",
522 | "+-----+---+----------+\n",
523 | "| kani| 15| 10|\n",
524 | "|kani1| 16| 8|\n",
525 | "|kani2| 17| 4|\n",
526 | "+-----+---+----------+\n",
527 | "\n"
528 | ]
529 | }
530 | ],
531 | "source": [
532 | "df_spark=df_spark.drop(\"Experience After 2 years\")\n",
533 | "df_spark.show()"
534 | ]
535 | },
536 | {
537 | "cell_type": "code",
538 | "execution_count": 25,
539 | "id": "9526a218-f1a0-48e0-bb2b-12c21f3d59c9",
540 | "metadata": {},
541 | "outputs": [
542 | {
543 | "name": "stdout",
544 | "output_type": "stream",
545 | "text": [
546 | "+-----+---+----------+\n",
547 | "| Name|age|experience|\n",
548 | "+-----+---+----------+\n",
549 | "| kani| 15| 10|\n",
550 | "|kani1| 16| 8|\n",
551 | "|kani2| 17| 4|\n",
552 | "+-----+---+----------+\n",
553 | "\n"
554 | ]
555 | }
556 | ],
557 | "source": [
558 | "df_spark.withColumnRenamed(\"name\",\"Name\").show()"
559 | ]
560 | },
561 | {
562 | "cell_type": "markdown",
563 | "id": "24e3b7df-668d-409c-a93e-b4e85704937b",
564 | "metadata": {},
565 | "source": [
566 | "# withColumn\n",
567 | "# withColumnRename\n",
568 | "# drop\n",
569 | "# printSchema\n",
570 | "# read.option().csv()"
571 | ]
572 | },
573 | {
574 | "cell_type": "markdown",
575 | "id": "5e3a192b-a315-4a6f-b723-f44dc1439fa2",
576 | "metadata": {},
577 | "source": [
578 | "---------------------------------------------------------------"
579 | ]
580 | },
581 | {
582 | "cell_type": "markdown",
583 | "id": "423543b9-1221-4296-a81e-90faa742d571",
584 | "metadata": {},
585 | "source": [
586 | "# PySpark Dataframe\n",
587 | "# Reading The Dataset\n",
588 | "# Checking the Datatypes of the Column(Schema)\n",
589 | "# Selecting Columns And Indexing\n",
590 | "# Check Describe option similar to Pandas\n",
591 | "# Adding Columns\n",
592 | "# Dropping columns\n",
593 | "# Renaming Columns"
594 | ]
595 | },
596 | {
597 | "cell_type": "code",
598 | "execution_count": null,
599 | "id": "04c02cfb-db67-4ede-92e1-ea03c1dee13b",
600 | "metadata": {},
601 | "outputs": [],
602 | "source": []
603 | },
604 | {
605 | "cell_type": "code",
606 | "execution_count": null,
607 | "id": "6fc812f3-c60a-42ea-a387-b1e7700a2e3d",
608 | "metadata": {},
609 | "outputs": [],
610 | "source": []
611 | }
612 | ],
613 | "metadata": {
614 | "kernelspec": {
615 | "display_name": "Python 3 (ipykernel)",
616 | "language": "python",
617 | "name": "python3"
618 | },
619 | "language_info": {
620 | "codemirror_mode": {
621 | "name": "ipython",
622 | "version": 3
623 | },
624 | "file_extension": ".py",
625 | "mimetype": "text/x-python",
626 | "name": "python",
627 | "nbconvert_exporter": "python",
628 | "pygments_lexer": "ipython3",
629 | "version": "3.11.7"
630 | }
631 | },
632 | "nbformat": 4,
633 | "nbformat_minor": 5
634 | }
635 |
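The day2 column operations chained into one expression (a sketch assuming the test2.csv layout above: name, age, experience):

    from pyspark.sql import SparkSession
    from pyspark.sql.functions import col

    spark = SparkSession.builder.appName("dataframe").getOrCreate()
    df = spark.read.csv("test2.csv", header=True, inferSchema=True)

    # add a derived column, rename a column, then drop the derived column again;
    # each call returns a new DataFrame, so the steps chain
    df = (df.withColumn("experience_after_2_years", col("experience") + 2)
            .withColumnRenamed("name", "Name")
            .drop("experience_after_2_years"))
    df.show()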
--------------------------------------------------------------------------------
/day3.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 12,
6 | "id": "483ee2b4-8900-4113-998e-15f63d2fbce6",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "root\n",
14 | " |-- Name: string (nullable = true)\n",
15 | " |-- age: integer (nullable = true)\n",
16 | " |-- Experience: integer (nullable = true)\n",
17 | " |-- Salary: integer (nullable = true)\n",
18 | "\n"
19 | ]
20 | }
21 | ],
22 | "source": [
23 | "from pyspark.sql import SparkSession\n",
24 | "spark=SparkSession.builder.appName('Practise').getOrCreate()\n",
25 | "df_pyspark=spark.read.csv('test3.csv',header=True,inferSchema=True)\n",
26 | "df_pyspark.printSchema()"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 13,
32 | "id": "ed321960-d843-40f6-9a21-f89685ec640b",
33 | "metadata": {},
34 | "outputs": [
35 | {
36 | "name": "stdout",
37 | "output_type": "stream",
38 | "text": [
39 | "+---------+----+----------+------+\n",
40 | "| Name| age|Experience|Salary|\n",
41 | "+---------+----+----------+------+\n",
42 | "| Krish| 31| 10| 30000|\n",
43 | "|Sudhanshu| 30| 8| 25000|\n",
44 | "| Sunny| 29| 4| 20000|\n",
45 | "| Paul| 24| 3| 20000|\n",
46 | "| Harsha| 21| 1| 15000|\n",
47 | "| Shubham| 23| 2| 18000|\n",
48 | "| Mahesh|NULL| NULL| 40000|\n",
49 | "| NULL| 34| 10| 38000|\n",
50 | "| NULL| 36| NULL| NULL|\n",
51 | "+---------+----+----------+------+\n",
52 | "\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "df_pyspark.show()\n"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 14,
63 | "id": "2ed60ea7-0bc8-4999-8739-c4f753ec604f",
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "+----+----------+------+\n",
71 | "| age|Experience|Salary|\n",
72 | "+----+----------+------+\n",
73 | "| 31| 10| 30000|\n",
74 | "| 30| 8| 25000|\n",
75 | "| 29| 4| 20000|\n",
76 | "| 24| 3| 20000|\n",
77 | "| 21| 1| 15000|\n",
78 | "| 23| 2| 18000|\n",
79 | "|NULL| NULL| 40000|\n",
80 | "| 34| 10| 38000|\n",
81 | "| 36| NULL| NULL|\n",
82 | "+----+----------+------+\n",
83 | "\n"
84 | ]
85 | }
86 | ],
87 | "source": [
88 | "##drop the columns\n",
89 | "df_pyspark.drop('Name').show()"
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 15,
95 | "id": "d04de81f-db78-4c40-862f-80b3953a1507",
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "name": "stdout",
100 | "output_type": "stream",
101 | "text": [
102 | "+---------+----+----------+------+\n",
103 | "| Name| age|Experience|Salary|\n",
104 | "+---------+----+----------+------+\n",
105 | "| Krish| 31| 10| 30000|\n",
106 | "|Sudhanshu| 30| 8| 25000|\n",
107 | "| Sunny| 29| 4| 20000|\n",
108 | "| Paul| 24| 3| 20000|\n",
109 | "| Harsha| 21| 1| 15000|\n",
110 | "| Shubham| 23| 2| 18000|\n",
111 | "| Mahesh|NULL| NULL| 40000|\n",
112 | "| NULL| 34| 10| 38000|\n",
113 | "| NULL| 36| NULL| NULL|\n",
114 | "+---------+----+----------+------+\n",
115 | "\n"
116 | ]
117 | }
118 | ],
119 | "source": [
120 | "df_pyspark.show()\n"
121 | ]
122 | },
123 | {
124 | "cell_type": "code",
125 | "execution_count": 16,
126 | "id": "cae05add-e02d-4366-b752-b7b7b034679a",
127 | "metadata": {},
128 | "outputs": [
129 | {
130 | "name": "stdout",
131 | "output_type": "stream",
132 | "text": [
133 | "+---------+---+----------+------+\n",
134 | "| Name|age|Experience|Salary|\n",
135 | "+---------+---+----------+------+\n",
136 | "| Krish| 31| 10| 30000|\n",
137 | "|Sudhanshu| 30| 8| 25000|\n",
138 | "| Sunny| 29| 4| 20000|\n",
139 | "| Paul| 24| 3| 20000|\n",
140 | "| Harsha| 21| 1| 15000|\n",
141 | "| Shubham| 23| 2| 18000|\n",
142 | "+---------+---+----------+------+\n",
143 | "\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "df_pyspark.na.drop().show()\n"
149 | ]
150 | },
151 | {
152 | "cell_type": "code",
153 | "execution_count": 17,
154 | "id": "978c455f-fe3b-431f-8585-4e4e1e43fda8",
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "+---------+---+----------+------+\n",
162 | "| Name|age|Experience|Salary|\n",
163 | "+---------+---+----------+------+\n",
164 | "| Krish| 31| 10| 30000|\n",
165 | "|Sudhanshu| 30| 8| 25000|\n",
166 | "| Sunny| 29| 4| 20000|\n",
167 | "| Paul| 24| 3| 20000|\n",
168 | "| Harsha| 21| 1| 15000|\n",
169 | "| Shubham| 23| 2| 18000|\n",
170 | "+---------+---+----------+------+\n",
171 | "\n"
172 | ]
173 | }
174 | ],
175 | "source": [
176 | "### any==how\n",
177 | "df_pyspark.na.drop(how=\"any\").show()"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 18,
183 | "id": "a3d34792-db4f-41b4-885e-96183069253a",
184 | "metadata": {},
185 | "outputs": [
186 | {
187 | "name": "stdout",
188 | "output_type": "stream",
189 | "text": [
190 | "+---------+---+----------+------+\n",
191 | "| Name|age|Experience|Salary|\n",
192 | "+---------+---+----------+------+\n",
193 | "| Krish| 31| 10| 30000|\n",
194 | "|Sudhanshu| 30| 8| 25000|\n",
195 | "| Sunny| 29| 4| 20000|\n",
196 | "| Paul| 24| 3| 20000|\n",
197 | "| Harsha| 21| 1| 15000|\n",
198 | "| Shubham| 23| 2| 18000|\n",
199 | "| NULL| 34| 10| 38000|\n",
200 | "+---------+---+----------+------+\n",
201 | "\n"
202 | ]
203 | }
204 | ],
205 | "source": [
206 | "##threshold\n",
207 | "df_pyspark.na.drop(how=\"any\",thresh=3).show()"
208 | ]
209 | },
210 | {
211 | "cell_type": "code",
212 | "execution_count": 19,
213 | "id": "0416e143-663c-4ca3-b451-db14cbd15b0c",
214 | "metadata": {},
215 | "outputs": [
216 | {
217 | "name": "stdout",
218 | "output_type": "stream",
219 | "text": [
220 | "+---------+---+----------+------+\n",
221 | "| Name|age|Experience|Salary|\n",
222 | "+---------+---+----------+------+\n",
223 | "| Krish| 31| 10| 30000|\n",
224 | "|Sudhanshu| 30| 8| 25000|\n",
225 | "| Sunny| 29| 4| 20000|\n",
226 | "| Paul| 24| 3| 20000|\n",
227 | "| Harsha| 21| 1| 15000|\n",
228 | "| Shubham| 23| 2| 18000|\n",
229 | "| NULL| 34| 10| 38000|\n",
230 | "| NULL| 36| NULL| NULL|\n",
231 | "+---------+---+----------+------+\n",
232 | "\n"
233 | ]
234 | }
235 | ],
236 | "source": [
237 | "##Subset\n",
238 | "df_pyspark.na.drop(how=\"any\",subset=['Age']).show()"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 28,
244 | "id": "0685ef56-d34b-405a-b171-aecf2ff7a621",
245 | "metadata": {},
246 | "outputs": [
247 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "+--------------+----+----------+------+\n",
252 | "| Name| age|Experience|Salary|\n",
253 | "+--------------+----+----------+------+\n",
254 | "| Krish| 31| 10| 30000|\n",
255 | "| Sudhanshu| 30| 8| 25000|\n",
256 | "| Sunny| 29| 4| 20000|\n",
257 | "| Paul| 24| 3| 20000|\n",
258 | "| Harsha| 21| 1| 15000|\n",
259 | "| Shubham| 23| 2| 18000|\n",
260 | "| Mahesh|NULL| NULL| 40000|\n",
261 | "|Missing Values| 34| 10| 38000|\n",
262 | "|Missing Values| 36| NULL| NULL|\n",
263 | "+--------------+----+----------+------+\n",
264 | "\n"
265 | ]
266 | }
267 | ],
268 | "source": [
269 | "### Filling the Missing Value\n",
270 | "from pyspark.sql import SparkSession\n",
271 | "from pyspark.sql.functions import col\n",
272 | "df_pyspark.na.fill('Missing Values').show()\n",
273 | "#not working"
274 | ]
275 | },
276 | {
277 | "cell_type": "code",
278 | "execution_count": 21,
279 | "id": "1e5d7ee7-bcb1-4c4f-90fd-49637fd12136",
280 | "metadata": {},
281 | "outputs": [
282 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "+---------+----+----------+------+\n",
287 | "| Name| age|Experience|Salary|\n",
288 | "+---------+----+----------+------+\n",
289 | "| Krish| 31| 10| 30000|\n",
290 | "|Sudhanshu| 30| 8| 25000|\n",
291 | "| Sunny| 29| 4| 20000|\n",
292 | "| Paul| 24| 3| 20000|\n",
293 | "| Harsha| 21| 1| 15000|\n",
294 | "| Shubham| 23| 2| 18000|\n",
295 | "| Mahesh|NULL| NULL| 40000|\n",
296 | "| NULL| 34| 10| 38000|\n",
297 | "| NULL| 36| NULL| NULL|\n",
298 | "+---------+----+----------+------+\n",
299 | "\n"
300 | ]
301 | }
302 | ],
303 | "source": [
304 | "df_pyspark.show()"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 22,
310 | "id": "3c2aee0c-f361-41a1-b92b-477aa1946774",
311 | "metadata": {},
312 | "outputs": [
313 | {
314 | "name": "stdout",
315 | "output_type": "stream",
316 | "text": [
317 | "root\n",
318 | " |-- Name: string (nullable = true)\n",
319 | " |-- age: integer (nullable = true)\n",
320 | " |-- Experience: integer (nullable = true)\n",
321 | " |-- Salary: integer (nullable = true)\n",
322 | "\n"
323 | ]
324 | }
325 | ],
326 | "source": [
327 | "df_pyspark.printSchema()"
328 | ]
329 | },
330 | {
331 | "cell_type": "code",
332 | "execution_count": 29,
333 | "id": "10b778bd-3044-4896-b5a7-be84a05a41a5",
334 | "metadata": {},
335 | "outputs": [],
336 | "source": [
337 | "from pyspark.ml.feature import Imputer\n",
338 | "\n",
339 | "imputer = Imputer(\n",
340 | " inputCols=['age', 'Experience', 'Salary'], \n",
341 | " outputCols=[\"{}_imputed\".format(c) for c in ['age', 'Experience', 'Salary']]\n",
342 | " ).setStrategy(\"median\")"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 31,
348 | "id": "ef331a71-d7d1-417e-8386-45c1547ce033",
349 | "metadata": {},
350 | "outputs": [
351 | {
352 | "name": "stdout",
353 | "output_type": "stream",
354 | "text": [
355 | "+---------+----+----------+------+-----------+------------------+--------------+\n",
356 | "| Name| age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|\n",
357 | "+---------+----+----------+------+-----------+------------------+--------------+\n",
358 | "| Krish| 31| 10| 30000| 31| 10| 30000|\n",
359 | "|Sudhanshu| 30| 8| 25000| 30| 8| 25000|\n",
360 | "| Sunny| 29| 4| 20000| 29| 4| 20000|\n",
361 | "| Paul| 24| 3| 20000| 24| 3| 20000|\n",
362 | "| Harsha| 21| 1| 15000| 21| 1| 15000|\n",
363 | "| Shubham| 23| 2| 18000| 23| 2| 18000|\n",
364 | "| Mahesh|NULL| NULL| 40000| 29| 4| 40000|\n",
365 | "| NULL| 34| 10| 38000| 34| 10| 38000|\n",
366 | "| NULL| 36| NULL| NULL| 36| 4| 20000|\n",
367 | "+---------+----+----------+------+-----------+------------------+--------------+\n",
368 | "\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "a=imputer.fit(df_pyspark).transform(df_pyspark).show()\n",
374 | "a"
375 | ]
376 | },
377 | {
378 | "cell_type": "code",
379 | "execution_count": 40,
380 | "id": "70d0bf18-92a2-47a8-99f2-48dbf5ac4e97",
381 | "metadata": {},
382 | "outputs": [
383 | {
384 | "name": "stdout",
385 | "output_type": "stream",
386 | "text": [
387 | "+---------+----+----------+------+----------+-----------------+-------------+\n",
388 | "| Name| age|Experience|Salary|age_imuter|Experience_imuter|Salary_imuter|\n",
389 | "+---------+----+----------+------+----------+-----------------+-------------+\n",
390 | "| Krish| 31| 10| 30000| 31| 10| 30000|\n",
391 | "|Sudhanshu| 30| 8| 25000| 30| 8| 25000|\n",
392 | "| Sunny| 29| 4| 20000| 29| 4| 20000|\n",
393 | "| Paul| 24| 3| 20000| 24| 3| 20000|\n",
394 | "| Harsha| 21| 1| 15000| 21| 1| 15000|\n",
395 | "| Shubham| 23| 2| 18000| 23| 2| 18000|\n",
396 | "| Mahesh|NULL| NULL| 40000| 28| 5| 40000|\n",
397 | "| NULL| 34| 10| 38000| 34| 10| 38000|\n",
398 | "| NULL| 36| NULL| NULL| 36| 5| 25750|\n",
399 | "+---------+----+----------+------+----------+-----------------+-------------+\n",
400 | "\n"
401 | ]
402 | }
403 | ],
404 | "source": [
405 | "from pyspark.ml.feature import Imputer\n",
406 | "imputer=Imputer(\n",
407 | " inputCols=[\"age\",\"Experience\",\"Salary\"],\n",
408 | " outputCols=[\"{}_imuter\".format(c) for c in [\"age\",\"Experience\",\"Salary\"]]).setStrategy(\"mean\")\n",
409 | "a=imputer.fit(df_pyspark).transform(df_pyspark).show()\n",
410 | "a"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 46,
416 | "id": "c97c37eb-1f4a-48db-82aa-d27fbed20d6b",
417 | "metadata": {},
418 | "outputs": [
419 | {
420 | "name": "stdout",
421 | "output_type": "stream",
422 | "text": [
423 | "+---------+----+----------+------+-----------+------------------+--------------+\n",
424 | "| Name| age|Experience|Salary|age_imputer|Experience_imputer|Salary_imputer|\n",
425 | "+---------+----+----------+------+-----------+------------------+--------------+\n",
426 | "| Krish| 31| 10| 30000| 31| 10| 30000|\n",
427 | "|Sudhanshu| 30| 8| 25000| 30| 8| 25000|\n",
428 | "| Sunny| 29| 4| 20000| 29| 4| 20000|\n",
429 | "| Paul| 24| 3| 20000| 24| 3| 20000|\n",
430 | "| Harsha| 21| 1| 15000| 21| 1| 15000|\n",
431 | "| Shubham| 23| 2| 18000| 23| 2| 18000|\n",
432 | "| Mahesh|NULL| NULL| 40000| 28| 5| 40000|\n",
433 | "| NULL| 34| 10| 38000| 34| 10| 38000|\n",
434 | "| NULL| 36| NULL| NULL| 36| 5| 25750|\n",
435 | "+---------+----+----------+------+-----------+------------------+--------------+\n",
436 | "\n"
437 | ]
438 | }
439 | ],
440 | "source": [
441 | "from pyspark.ml.feature import Imputer\n",
442 | "a=Imputer(\n",
443 | " inputCols=[\"age\",\"Experience\",\"Salary\"],\n",
444 | " outputCols=[\"{}_imputer\".format(c) for c in [\"age\",\"Experience\",\"Salary\"]]).setStrategy(\"mean\")\n",
445 | "b=a.fit(df_pyspark).transform(df_pyspark).show()"
446 | ]
447 | },
448 | {
449 | "cell_type": "code",
450 | "execution_count": null,
451 | "id": "c42c0587-3642-435b-b410-7c34e20eada6",
452 | "metadata": {},
453 | "outputs": [],
454 | "source": []
455 | }
456 | ],
457 | "metadata": {
458 | "kernelspec": {
459 | "display_name": "Python 3 (ipykernel)",
460 | "language": "python",
461 | "name": "python3"
462 | },
463 | "language_info": {
464 | "codemirror_mode": {
465 | "name": "ipython",
466 | "version": 3
467 | },
468 | "file_extension": ".py",
469 | "mimetype": "text/x-python",
470 | "name": "python",
471 | "nbconvert_exporter": "python",
472 | "pygments_lexer": "ipython3",
473 | "version": "3.11.7"
474 | }
475 | },
476 | "nbformat": 4,
477 | "nbformat_minor": 5
478 | }
479 |
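Because fill() only touches columns whose type matches the fill value, a type-aware variant of the day3 fill step looks like this (a sketch; the fill values are illustrative):

    # string columns and numeric columns need separate fill values
    df_filled = (df_pyspark
                 .na.fill("Missing Values", subset=["Name"])
                 .na.fill(0, subset=["age", "Experience", "Salary"]))
    df_filled.show()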
--------------------------------------------------------------------------------
/day4.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "b7236a68-3f52-480d-a16d-e11abafd7bee",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stderr",
11 | "output_type": "stream",
12 | "text": [
13 | "24/04/24 11:33:24 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.240 instead (on interface wlp0s20f3)\n",
14 | "24/04/24 11:33:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
15 | "Setting default log level to \"WARN\".\n",
16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
17 | "24/04/24 11:33:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
18 | "24/04/24 11:33:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
19 | "24/04/24 11:33:25 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n"
20 | ]
21 | },
22 | {
23 | "data": {
24 | "text/html": [
25 | "\n",
26 | " \n",
27 | "
SparkSession - in-memory
\n",
28 | " \n",
29 | "
\n",
30 | "
SparkContext
\n",
31 | "\n",
32 | "
Spark UI
\n",
33 | "\n",
34 | "
\n",
35 | " - Version
\n",
36 | " v3.5.1
\n",
37 | " - Master
\n",
38 | " local[*]
\n",
39 | " - AppName
\n",
40 | " filter
\n",
41 | "
\n",
42 | "
\n",
43 | " \n",
44 | "
\n",
45 | " "
46 | ],
47 | "text/plain": [
48 | ""
49 | ]
50 | },
51 | "execution_count": 1,
52 | "metadata": {},
53 | "output_type": "execute_result"
54 | }
55 | ],
56 | "source": [
57 | "from pyspark.sql import SparkSession\n",
58 | "spark=SparkSession.builder.appName(\"filter\").getOrCreate()\n",
59 | "spark"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 2,
65 | "id": "3bf95d12-f487-40fe-b951-3c7a124e1f7f",
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "+---------+---+----------+------+\n",
73 | "| Name|age|Experience|Salary|\n",
74 | "+---------+---+----------+------+\n",
75 | "| Krish| 31| 10| 30000|\n",
76 | "|Sudhanshu| 30| 8| 25000|\n",
77 | "| Sunny| 29| 4| 20000|\n",
78 | "| Paul| 24| 3| 20000|\n",
79 | "| Harsha| 21| 1| 15000|\n",
80 | "| Shubham| 23| 2| 18000|\n",
81 | "+---------+---+----------+------+\n",
82 | "\n"
83 | ]
84 | }
85 | ],
86 | "source": [
87 | "df_spark=spark.read.csv(\"test4.csv\",header=True,inferSchema=True)\n",
88 | "df_spark.show()"
89 | ]
90 | },
91 | {
92 | "cell_type": "code",
93 | "execution_count": 4,
94 | "id": "388abc67-31a2-41c0-849c-535a1f60894d",
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "+-------+---+----------+------+\n",
102 | "| Name|age|Experience|Salary|\n",
103 | "+-------+---+----------+------+\n",
104 | "| Sunny| 29| 4| 20000|\n",
105 | "| Paul| 24| 3| 20000|\n",
106 | "| Harsha| 21| 1| 15000|\n",
107 | "|Shubham| 23| 2| 18000|\n",
108 | "+-------+---+----------+------+\n",
109 | "\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "##salry of people <= 2000\n",
115 | "df_spark.filter(\"Salary<=20000\").show()"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 6,
121 | "id": "8ee68b63-8680-4232-85e7-44da79d629d4",
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "name": "stdout",
126 | "output_type": "stream",
127 | "text": [
128 | "+---------+------+\n",
129 | "| Name|Salary|\n",
130 | "+---------+------+\n",
131 | "| Krish| 30000|\n",
132 | "|Sudhanshu| 25000|\n",
133 | "+---------+------+\n",
134 | "\n"
135 | ]
136 | }
137 | ],
138 | "source": [
139 | "df_spark.filter(\"Experience >= 5\").select([\"Name\",\"Salary\"]).show()"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 7,
145 | "id": "28a07f16-b0b1-4c28-b190-389279656fe5",
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "name": "stdout",
150 | "output_type": "stream",
151 | "text": [
152 | "+-------+---+----------+------+\n",
153 | "| Name|age|Experience|Salary|\n",
154 | "+-------+---+----------+------+\n",
155 | "| Sunny| 29| 4| 20000|\n",
156 | "| Paul| 24| 3| 20000|\n",
157 | "| Harsha| 21| 1| 15000|\n",
158 | "|Shubham| 23| 2| 18000|\n",
159 | "+-------+---+----------+------+\n",
160 | "\n"
161 | ]
162 | }
163 | ],
164 | "source": [
165 | "df_spark.filter(df_spark[\"Salary\"]<=20000).show()"
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "execution_count": 14,
171 | "id": "4b86de2a-82df-4b4a-89d6-3c269ab2ade7",
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "name": "stdout",
176 | "output_type": "stream",
177 | "text": [
178 | "+-------+---+----------+------+\n",
179 | "| Name|age|Experience|Salary|\n",
180 | "+-------+---+----------+------+\n",
181 | "| Sunny| 29| 4| 20000|\n",
182 | "| Paul| 24| 3| 20000|\n",
183 | "| Harsha| 21| 1| 15000|\n",
184 | "|Shubham| 23| 2| 18000|\n",
185 | "+-------+---+----------+------+\n",
186 | "\n"
187 | ]
188 | }
189 | ],
190 | "source": [
191 | "df_spark.filter((df_spark[\"Salary\"]<=20000) & \n",
192 | " (df_spark[\"Salary\"]>=15000)).show()"
193 | ]
194 | },
195 | {
196 | "cell_type": "code",
197 | "execution_count": 16,
198 | "id": "b7efea59-ce39-4ecb-9ec8-0531ff2477c1",
199 | "metadata": {},
200 | "outputs": [
201 | {
202 | "name": "stdout",
203 | "output_type": "stream",
204 | "text": [
205 | "+---------+---+----------+------+\n",
206 | "| Name|age|Experience|Salary|\n",
207 | "+---------+---+----------+------+\n",
208 | "| Krish| 31| 10| 30000|\n",
209 | "|Sudhanshu| 30| 8| 25000|\n",
210 | "+---------+---+----------+------+\n",
211 | "\n"
212 | ]
213 | }
214 | ],
215 | "source": [
216 | "df_spark.filter(~(df_spark[\"Salary\"]<=20000)).show()"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": null,
222 | "id": "c7843daa-3229-434c-9f44-2ab860b3a82d",
223 | "metadata": {},
224 | "outputs": [],
225 | "source": []
226 | }
227 | ],
228 | "metadata": {
229 | "kernelspec": {
230 | "display_name": "Python 3 (ipykernel)",
231 | "language": "python",
232 | "name": "python3"
233 | },
234 | "language_info": {
235 | "codemirror_mode": {
236 | "name": "ipython",
237 | "version": 3
238 | },
239 | "file_extension": ".py",
240 | "mimetype": "text/x-python",
241 | "name": "python",
242 | "nbconvert_exporter": "python",
243 | "pygments_lexer": "ipython3",
244 | "version": "3.11.7"
245 | }
246 | },
247 | "nbformat": 4,
248 | "nbformat_minor": 5
249 | }
250 |
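The three equivalent filter styles from day4, collected in one place (a sketch over the df_spark loaded above):

    df_spark.filter("Salary <= 20000").show()               # SQL expression string
    df_spark.filter(df_spark["Salary"] <= 20000).show()     # Column expression
    df_spark.filter((df_spark["Salary"] <= 20000) &
                    (df_spark["Salary"] >= 15000)).show()   # & / | / ~ combine conditions;
                                                            # each condition needs parentheses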
--------------------------------------------------------------------------------
/day5.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "248ea11a-8dce-4633-a7c5-5a48b6daf85e",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stderr",
11 | "output_type": "stream",
12 | "text": [
13 | "24/04/24 11:52:13 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.240 instead (on interface wlp0s20f3)\n",
14 | "24/04/24 11:52:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
15 | "Setting default log level to \"WARN\".\n",
16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
17 | "24/04/24 11:52:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n",
18 | "24/04/24 11:52:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n",
19 | "24/04/24 11:52:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n",
20 | "24/04/24 11:52:14 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n"
21 | ]
22 | },
23 | {
24 | "data": {
25 | "text/html": [
26 | "\n",
27 | " \n",
28 | "
SparkSession - in-memory
\n",
29 | " \n",
30 | "
\n",
31 | "
SparkContext
\n",
32 | "\n",
33 | "
Spark UI
\n",
34 | "\n",
35 | "
\n",
36 | " - Version
\n",
37 | " v3.5.1
\n",
38 | " - Master
\n",
39 | " local[*]
\n",
40 | " - AppName
\n",
41 | " kani1
\n",
42 | "
\n",
43 | "
\n",
44 | " \n",
45 | "
\n",
46 | " "
47 | ],
48 | "text/plain": [
49 | ""
50 | ]
51 | },
52 | "execution_count": 1,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "from pyspark.sql import SparkSession\n",
59 | "spark=SparkSession.builder.appName(\"kani1\").getOrCreate()\n",
60 | "spark"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 2,
66 | "id": "91ed6d86-65c4-4d75-b531-7fde1090ff1c",
67 | "metadata": {},
68 | "outputs": [
69 | {
70 | "name": "stdout",
71 | "output_type": "stream",
72 | "text": [
73 | "+---------+------------+------+\n",
74 | "| Name| Departments|salary|\n",
75 | "+---------+------------+------+\n",
76 | "| Krish|Data Science| 10000|\n",
77 | "| Krish| IOT| 5000|\n",
78 | "| Mahesh| Big Data| 4000|\n",
79 | "| Krish| Big Data| 4000|\n",
80 | "| Mahesh|Data Science| 3000|\n",
81 | "|Sudhanshu|Data Science| 20000|\n",
82 | "|Sudhanshu| IOT| 10000|\n",
83 | "|Sudhanshu| Big Data| 5000|\n",
84 | "| Sunny|Data Science| 10000|\n",
85 | "| Sunny| Big Data| 2000|\n",
86 | "+---------+------------+------+\n",
87 | "\n"
88 | ]
89 | }
90 | ],
91 | "source": [
92 | "df_spark=spark.read.csv(\"test5.csv\",header=True,inferSchema=True)\n",
93 | "df_spark.show()"
94 | ]
95 | },
96 | {
97 | "cell_type": "code",
98 | "execution_count": 3,
99 | "id": "aa4ed17a-2f27-42f1-9495-861cea0ba131",
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "root\n",
107 | " |-- Name: string (nullable = true)\n",
108 | " |-- Departments: string (nullable = true)\n",
109 | " |-- salary: integer (nullable = true)\n",
110 | "\n"
111 | ]
112 | }
113 | ],
114 | "source": [
115 | "df_spark.printSchema()"
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 10,
121 | "id": "e9298517-0c79-4edb-8e8b-34960a04cb36",
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "data": {
126 | "text/plain": [
127 | "DataFrame[Name: string, sum(salary): bigint]"
128 | ]
129 | },
130 | "execution_count": 10,
131 | "metadata": {},
132 | "output_type": "execute_result"
133 | }
134 | ],
135 | "source": [
136 | "df_spark.groupBy(\"Name\").sum()"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 11,
142 | "id": "964116b0-c611-46c5-a86d-f5158aea0f37",
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "name": "stdout",
147 | "output_type": "stream",
148 | "text": [
149 | "+---------+-----------+\n",
150 | "| Name|sum(salary)|\n",
151 | "+---------+-----------+\n",
152 | "|Sudhanshu| 35000|\n",
153 | "| Sunny| 12000|\n",
154 | "| Krish| 19000|\n",
155 | "| Mahesh| 7000|\n",
156 | "+---------+-----------+\n",
157 | "\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "#groupby\n",
163 | "df_spark.groupBy(\"Name\").sum().show()"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 13,
169 | "id": "8fbc862a-fffd-414e-b8fd-29b1823893f3",
170 | "metadata": {},
171 | "outputs": [
172 | {
173 | "name": "stdout",
174 | "output_type": "stream",
175 | "text": [
176 | "+------------+-----------+\n",
177 | "| Departments|sum(salary)|\n",
178 | "+------------+-----------+\n",
179 | "| IOT| 15000|\n",
180 | "| Big Data| 15000|\n",
181 | "|Data Science| 43000|\n",
182 | "+------------+-----------+\n",
183 | "\n"
184 | ]
185 | }
186 | ],
187 | "source": [
188 | "df_spark.groupBy(\"Departments\").sum().show()"
189 | ]
190 | },
191 | {
192 | "cell_type": "code",
193 | "execution_count": 16,
194 | "id": "775124c8-8a1b-4434-9092-f2f6ed8f9ff2",
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "+------------+-----+\n",
202 | "| Departments|count|\n",
203 | "+------------+-----+\n",
204 | "| IOT| 2|\n",
205 | "| Big Data| 4|\n",
206 | "|Data Science| 4|\n",
207 | "+------------+-----+\n",
208 | "\n"
209 | ]
210 | }
211 | ],
212 | "source": [
213 | "df_spark.groupBy(\"Departments\").count().show()"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 18,
219 | "id": "a8ff04cb-a9b5-47f9-a300-ddc2fde7d67a",
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "name": "stdout",
224 | "output_type": "stream",
225 | "text": [
226 | "+-----------+\n",
227 | "|sum(Salary)|\n",
228 | "+-----------+\n",
229 | "| 73000|\n",
230 | "+-----------+\n",
231 | "\n"
232 | ]
233 | }
234 | ],
235 | "source": [
236 | "df_spark.agg({\"Salary\":\"sum\"}).show()"
237 | ]
238 | },
239 | {
240 | "cell_type": "code",
241 | "execution_count": 19,
242 | "id": "2130f698-7aae-4d76-8acf-4b2b0ce9d784",
243 | "metadata": {},
244 | "outputs": [
245 | {
246 | "name": "stdout",
247 | "output_type": "stream",
248 | "text": [
249 | "+---------+-----------+\n",
250 | "| Name|max(salary)|\n",
251 | "+---------+-----------+\n",
252 | "|Sudhanshu| 20000|\n",
253 | "| Sunny| 10000|\n",
254 | "| Krish| 10000|\n",
255 | "| Mahesh| 4000|\n",
256 | "+---------+-----------+\n",
257 | "\n"
258 | ]
259 | }
260 | ],
261 | "source": [
262 | "df_spark.groupBy(\"Name\").max().show()"
263 | ]
264 | },
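265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "id": "3f1c9a72-5b2e-4d1a-9c3e-7a8b6d5e4f21",
269 | "metadata": {},
270 | "outputs": [],
271 | "source": [
272 | "#a minimal sketch (not run above): several aggregates in one pass via agg() and pyspark.sql.functions\n",
273 | "from pyspark.sql import functions as F\n",
274 | "df_spark.groupBy(\"Departments\").agg(F.sum(\"salary\"),F.avg(\"salary\"),F.max(\"salary\")).show()"
275 | ]
276 | },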
265 | {
266 | "cell_type": "code",
267 | "execution_count": null,
268 | "id": "23590cf1-9e11-42a6-ad20-06f912417643",
269 | "metadata": {},
270 | "outputs": [],
271 | "source": []
272 | }
273 | ],
274 | "metadata": {
275 | "kernelspec": {
276 | "display_name": "Python 3 (ipykernel)",
277 | "language": "python",
278 | "name": "python3"
279 | },
280 | "language_info": {
281 | "codemirror_mode": {
282 | "name": "ipython",
283 | "version": 3
284 | },
285 | "file_extension": ".py",
286 | "mimetype": "text/x-python",
287 | "name": "python",
288 | "nbconvert_exporter": "python",
289 | "pygments_lexer": "ipython3",
290 | "version": "3.11.7"
291 | }
292 | },
293 | "nbformat": 4,
294 | "nbformat_minor": 5
295 | }
296 |
--------------------------------------------------------------------------------
/day6.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 12,
6 | "id": "f3564fea-39ad-48fc-a2fb-dcc2842d46cb",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stderr",
11 | "output_type": "stream",
12 | "text": [
13 | "24/04/26 09:50:33 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.224 instead (on interface wlp0s20f3)\n",
14 | "24/04/26 09:50:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n",
15 | "Setting default log level to \"WARN\".\n",
16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n",
17 | "24/04/26 09:50:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n"
18 | ]
19 | }
20 | ],
21 | "source": [
22 | "from pyspark.sql import SparkSession\n",
23 | "spark=SparkSession.builder.appName(\"k\").getOrCreate()"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 13,
29 | "id": "f969b760-3d18-4ee3-a2d0-a8a0b25ceb72",
30 | "metadata": {},
31 | "outputs": [
32 | {
33 | "data": {
34 | "text/html": [
35 | "<div>\n",
36 | "    <div>\n",
37 | "        <p><b>SparkSession - in-memory</b></p>\n",
38 | "\n",
39 | "        <div>\n",
40 | "            <p><b>SparkContext</b></p>\n",
41 | "\n",
42 | "            <p><a href=\"...\">Spark UI</a></p>\n",
43 | "\n",
44 | "            <dl>\n",
45 | "              <dt>Version</dt>\n",
46 | "                <dd><code>v3.5.1</code></dd>\n",
47 | "              <dt>Master</dt>\n",
48 | "                <dd><code>local[*]</code></dd>\n",
49 | "              <dt>AppName</dt>\n",
50 | "                <dd><code>k</code></dd>\n",
51 | "            </dl>\n",
52 | "        </div>\n",
53 | "\n",
54 | "    </div>\n",
55 | "</div>"
56 | ],
57 | "text/plain": [
58 | "<pyspark.sql.session.SparkSession object at 0x...>"
59 | ]
60 | },
61 | "execution_count": 13,
62 | "metadata": {},
63 | "output_type": "execute_result"
64 | }
65 | ],
66 | "source": [
67 | "spark"
68 | ]
69 | },
70 | {
71 | "cell_type": "code",
72 | "execution_count": 14,
73 | "id": "a7b85e69-8425-420a-a8ee-a151e1857e44",
74 | "metadata": {},
75 | "outputs": [
76 | {
77 | "name": "stdout",
78 | "output_type": "stream",
79 | "text": [
80 | "+---------+---+----------+------+\n",
81 | "| Name|age|Experience|Salary|\n",
82 | "+---------+---+----------+------+\n",
83 | "| Krish| 31| 10| 30000|\n",
84 | "|Sudhanshu| 30| 8| 25000|\n",
85 | "| Sunny| 29| 4| 20000|\n",
86 | "| Paul| 24| 3| 20000|\n",
87 | "| Harsha| 21| 1| 15000|\n",
88 | "| Shubham| 23| 2| 18000|\n",
89 | "+---------+---+----------+------+\n",
90 | "\n"
91 | ]
92 | }
93 | ],
94 | "source": [
95 | "df_spark=spark.read.csv(\"test4.csv\",header=True,inferSchema=True)\n",
96 | "df_spark.show()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": 15,
102 | "id": "2001e4cf-c3ec-44a1-a475-57e57e75d66f",
103 | "metadata": {},
104 | "outputs": [
105 | {
106 | "name": "stdout",
107 | "output_type": "stream",
108 | "text": [
109 | "root\n",
110 | " |-- Name: string (nullable = true)\n",
111 | " |-- age: integer (nullable = true)\n",
112 | " |-- Experience: integer (nullable = true)\n",
113 | " |-- Salary: integer (nullable = true)\n",
114 | "\n"
115 | ]
116 | }
117 | ],
118 | "source": [
119 | "df_spark.printSchema()"
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 16,
125 | "id": "c4b63544-914e-4f6b-bff1-bdf91cdb4e4b",
126 | "metadata": {},
127 | "outputs": [
128 | {
129 | "data": {
130 | "text/plain": [
131 | "['Name', 'age', 'Experience', 'Salary']"
132 | ]
133 | },
134 | "execution_count": 16,
135 | "metadata": {},
136 | "output_type": "execute_result"
137 | }
138 | ],
139 | "source": [
140 | "df_spark.columns"
141 | ]
142 | },
143 | {
144 | "cell_type": "code",
145 | "execution_count": 17,
146 | "id": "4fd632bf-322a-4f16-a39b-4220fbcbb4e3",
147 | "metadata": {},
148 | "outputs": [
149 | {
150 | "name": "stdout",
151 | "output_type": "stream",
152 | "text": [
153 | "+---------+---+----------+------+-------------------+\n",
154 | "| Name|age|Experience|Salary|Independent feature|\n",
155 | "+---------+---+----------+------+-------------------+\n",
156 | "| Krish| 31| 10| 30000| [31.0,10.0]|\n",
157 | "|Sudhanshu| 30| 8| 25000| [30.0,8.0]|\n",
158 | "| Sunny| 29| 4| 20000| [29.0,4.0]|\n",
159 | "| Paul| 24| 3| 20000| [24.0,3.0]|\n",
160 | "| Harsha| 21| 1| 15000| [21.0,1.0]|\n",
161 | "| Shubham| 23| 2| 18000| [23.0,2.0]|\n",
162 | "+---------+---+----------+------+-------------------+\n",
163 | "\n"
164 | ]
165 | }
166 | ],
167 | "source": [
168 | "#VectorAssembler: combine or group feature columns\n",
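169 | "#the assembled output is a single Vector column, the input format MLlib estimators expect\n",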
169 | "from pyspark.ml.feature import VectorAssembler\n",
170 | "feature=VectorAssembler(inputCols=[\"age\",\"Experience\"],\n",
171 | " outputCol=\"Independent feature\")\n",
172 | "output=feature.transform(df_spark)\n",
173 | "output.show()"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 18,
179 | "id": "8d60a9b6-70a3-4193-a9b3-df31a5bb23be",
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "data": {
184 | "text/plain": [
185 | "['Name', 'age', 'Experience', 'Salary', 'Independent feature']"
186 | ]
187 | },
188 | "execution_count": 18,
189 | "metadata": {},
190 | "output_type": "execute_result"
191 | }
192 | ],
193 | "source": [
194 | "output.columns"
195 | ]
196 | },
197 | {
198 | "cell_type": "code",
199 | "execution_count": 19,
200 | "id": "13672511-bd72-467d-9447-035a446387fe",
201 | "metadata": {},
202 | "outputs": [
203 | {
204 | "name": "stdout",
205 | "output_type": "stream",
206 | "text": [
207 | "+-------------------+------+\n",
208 | "|Independent feature|Salary|\n",
209 | "+-------------------+------+\n",
210 | "| [31.0,10.0]| 30000|\n",
211 | "| [30.0,8.0]| 25000|\n",
212 | "| [29.0,4.0]| 20000|\n",
213 | "| [24.0,3.0]| 20000|\n",
214 | "| [21.0,1.0]| 15000|\n",
215 | "| [23.0,2.0]| 18000|\n",
216 | "+-------------------+------+\n",
217 | "\n"
218 | ]
219 | }
220 | ],
221 | "source": [
222 | "finalize=output.select(\"Independent feature\",\"Salary\")\n",
223 | "finalize.show()"
224 | ]
225 | },
226 | {
227 | "cell_type": "code",
228 | "execution_count": 20,
229 | "id": "b947c6bf-b2d9-40cd-a56a-5967ded3f2b2",
230 | "metadata": {},
231 | "outputs": [
232 | {
233 | "name": "stderr",
234 | "output_type": "stream",
235 | "text": [
236 | "24/04/26 09:50:40 WARN Instrumentation: [b67eb33b] regParam is zero, which might cause numerical instability and overfitting.\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "#linear regression\n",
242 | "from pyspark.ml.regression import LinearRegression\n",
243 | "#split\n",
244 | "train_data,test_data=finalize.randomSplit([0.75,0.25])\n",
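245 | "#randomSplit is random each run; a seed, e.g. randomSplit([0.75,0.25],seed=42), makes the split reproducible\n",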
245 | "#implement model\n",
246 | "regressor=LinearRegression(\n",
247 | " featuresCol=\"Independent feature\",\n",
248 | " labelCol=\"Salary\"\n",
249 | ")\n",
250 | "#fit model\n",
251 | "regressor=regressor.fit(train_data)"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 21,
257 | "id": "f06a1ffc-25a6-4c78-8c22-1c866aa8f7be",
258 | "metadata": {},
259 | "outputs": [
260 | {
261 | "data": {
262 | "text/plain": [
263 | "DenseVector([28.4757, 1271.3568])"
264 | ]
265 | },
266 | "execution_count": 21,
267 | "metadata": {},
268 | "output_type": "execute_result"
269 | }
270 | ],
271 | "source": [
272 | "regressor.coefficients"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 22,
278 | "id": "946ee173-bc85-4f56-8481-63ddc13601d1",
279 | "metadata": {},
280 | "outputs": [
281 | {
282 | "data": {
283 | "text/plain": [
284 | "14299.832495812996"
285 | ]
286 | },
287 | "execution_count": 22,
288 | "metadata": {},
289 | "output_type": "execute_result"
290 | }
291 | ],
292 | "source": [
293 | "regressor.intercept"
294 | ]
295 | },
296 | {
297 | "cell_type": "code",
298 | "execution_count": 23,
299 | "id": "917cadff-be7f-4b0c-b80d-20021a233963",
300 | "metadata": {},
301 | "outputs": [
302 | {
303 | "name": "stdout",
304 | "output_type": "stream",
305 | "text": [
306 | "+-------------------+------+------------------+\n",
307 | "|Independent feature|Salary| prediction|\n",
308 | "+-------------------+------+------------------+\n",
309 | "| [31.0,10.0]| 30000|27896.147403685147|\n",
310 | "+-------------------+------+------------------+\n",
311 | "\n"
312 | ]
313 | }
314 | ],
315 | "source": [
316 | "pred_results=regressor.evaluate(test_data)\n",
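317 | "#the returned summary also exposes metrics such as r2, meanAbsoluteError and meanSquaredError\n",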
317 | "#predict model\n",
318 | "pred_results.predictions.show()"
319 | ]
320 | },
321 | {
322 | "cell_type": "code",
323 | "execution_count": 31,
324 | "id": "993faeb3-b50e-407f-9dee-c95a61a57e75",
325 | "metadata": {},
326 | "outputs": [
327 | {
328 | "name": "stdout",
329 | "output_type": "stream",
330 | "text": [
331 | "+---------+---+----------+------+-------------------+\n",
332 | "| Name|age|Experience|Salary|Independent feature|\n",
333 | "+---------+---+----------+------+-------------------+\n",
334 | "| Krish| 31| 10| 30000| [31.0,10.0]|\n",
335 | "|Sudhanshu| 30| 8| 25000| [30.0,8.0]|\n",
336 | "| Sunny| 29| 4| 20000| [29.0,4.0]|\n",
337 | "| Paul| 24| 3| 20000| [24.0,3.0]|\n",
338 | "| Harsha| 21| 1| 15000| [21.0,1.0]|\n",
339 | "| Shubham| 23| 2| 18000| [23.0,2.0]|\n",
340 | "+---------+---+----------+------+-------------------+\n",
341 | "\n"
342 | ]
343 | }
344 | ],
345 | "source": [
346 | "from pyspark.ml.feature import VectorAssembler\n",
347 | "a=VectorAssembler(\n",
348 | " inputCols=[\"age\",\"Experience\"],\n",
349 | " outputCol=\"Independent feature\")\n",
350 | "b=a.transform(df_spark)\n",
351 | "b.show()\n"
352 | ]
353 | },
354 | {
355 | "cell_type": "code",
356 | "execution_count": 36,
357 | "id": "1a946c2d-1fcd-4522-ab02-c3798713baca",
358 | "metadata": {},
359 | "outputs": [
360 | {
361 | "name": "stdout",
362 | "output_type": "stream",
363 | "text": [
364 | "+------+-------------------+\n",
365 | "|Salary|Independent feature|\n",
366 | "+------+-------------------+\n",
367 | "| 30000| [31.0,10.0]|\n",
368 | "| 25000| [30.0,8.0]|\n",
369 | "| 20000| [29.0,4.0]|\n",
370 | "| 20000| [24.0,3.0]|\n",
371 | "| 15000| [21.0,1.0]|\n",
372 | "| 18000| [23.0,2.0]|\n",
373 | "+------+-------------------+\n",
374 | "\n"
375 | ]
376 | }
377 | ],
378 | "source": [
379 | "finalize=b.select(\"Salary\",\"Independent feature\")\n",
380 | "finalize.show()"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": null,
386 | "id": "ba9a91bf-557e-4100-9244-7a86c7f89197",
387 | "metadata": {},
388 | "outputs": [],
389 | "source": []
390 | },
391 | {
392 | "cell_type": "code",
393 | "execution_count": 38,
394 | "id": "f2289e39-cdd8-4961-8779-7c2ecfa2441d",
395 | "metadata": {},
396 | "outputs": [
397 | {
398 | "name": "stderr",
399 | "output_type": "stream",
400 | "text": [
401 | "24/04/26 09:57:06 WARN Instrumentation: [c3a3d75c] regParam is zero, which might cause numerical instability and overfitting.\n"
402 | ]
403 | }
404 | ],
405 | "source": [
406 | "from pyspark.ml.regression import LinearRegression\n",
407 | "train_data,test_data=finalize.randomSplit([0.75,0.25])\n",
408 | "reg=LinearRegression(\n",
409 | " featuresCol=\"Independent feature\",\n",
410 | " labelCol=\"Salary\"\n",
411 | ")\n",
412 | "reg=reg.fit(train_data)\n"
413 | ]
414 | },
415 | {
416 | "cell_type": "code",
417 | "execution_count": 41,
418 | "id": "e2a19a28-8d7b-449d-810e-8d1b2b1ff94a",
419 | "metadata": {},
420 | "outputs": [
421 | {
422 | "data": {
423 | "text/plain": [
424 | "DenseVector([172.4138, 1206.8966])"
425 | ]
426 | },
427 | "execution_count": 41,
428 | "metadata": {},
429 | "output_type": "execute_result"
430 | }
431 | ],
432 | "source": [
433 | "reg.coefficients"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": 42,
439 | "id": "a946ea6d-5a2c-48bf-aee5-504b9d9731e4",
440 | "metadata": {},
441 | "outputs": [
442 | {
443 | "data": {
444 | "text/plain": [
445 | "10172.41379310354"
446 | ]
447 | },
448 | "execution_count": 42,
449 | "metadata": {},
450 | "output_type": "execute_result"
451 | }
452 | ],
453 | "source": [
454 | "reg.intercept"
455 | ]
456 | },
457 | {
458 | "cell_type": "code",
459 | "execution_count": 46,
460 | "id": "0f9f0a58-296d-483b-992f-cab02e51ef2c",
461 | "metadata": {},
462 | "outputs": [
463 | {
464 | "name": "stdout",
465 | "output_type": "stream",
466 | "text": [
467 | "+------+-------------------+------------------+\n",
468 | "|Salary|Independent feature| prediction|\n",
469 | "+------+-------------------+------------------+\n",
470 | "| 18000| [23.0,2.0]|16551.724137931044|\n",
471 | "| 20000| [24.0,3.0]|17931.034482758627|\n",
472 | "| 30000| [31.0,10.0]|27586.206896551732|\n",
473 | "+------+-------------------+------------------+\n",
474 | "\n"
475 | ]
476 | }
477 | ],
478 | "source": [
479 | "x=reg.evaluate(test_data)\n",
480 | "x.predictions.show()"
481 | ]
482 | },
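483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "id": "6d2e8b41-9c7f-4a35-b1d0-2e5f8c9a7b43",
487 | "metadata": {},
488 | "outputs": [],
489 | "source": [
490 | "#a minimal sketch (not run above): the evaluation summary carries error metrics too\n",
491 | "x.r2,x.meanAbsoluteError,x.meanSquaredError"
492 | ]
493 | },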
483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "id": "9b4508e3-b794-4b56-94b3-59dba6a687d0",
487 | "metadata": {},
488 | "outputs": [],
489 | "source": []
490 | }
491 | ],
492 | "metadata": {
493 | "kernelspec": {
494 | "display_name": "Python 3 (ipykernel)",
495 | "language": "python",
496 | "name": "python3"
497 | },
498 | "language_info": {
499 | "codemirror_mode": {
500 | "name": "ipython",
501 | "version": 3
502 | },
503 | "file_extension": ".py",
504 | "mimetype": "text/x-python",
505 | "name": "python",
506 | "nbconvert_exporter": "python",
507 | "pygments_lexer": "ipython3",
508 | "version": "3.11.7"
509 | }
510 | },
511 | "nbformat": 4,
512 | "nbformat_minor": 5
513 | }
514 |
--------------------------------------------------------------------------------
/day7.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "application/vnd.databricks.v1+cell": {
7 | "cellMetadata": {},
8 | "inputWidgets": {},
9 | "nuid": "96816ed7-b08a-4ca3-abb9-f99880c3535d",
10 | "showTitle": false,
11 | "title": ""
12 | }
13 | },
14 | "source": [
15 | "\n",
16 | "## Overview\n",
17 | "\n",
18 | "This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.\n",
19 | "\n",
20 | "This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported."
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 0,
26 | "metadata": {
27 | "application/vnd.databricks.v1+cell": {
28 | "cellMetadata": {
29 | "byteLimit": 2048000,
30 | "rowLimit": 10000
31 | },
32 | "inputWidgets": {},
33 | "nuid": "6482be4c-f067-47c9-b0ac-35c938b94601",
34 | "showTitle": false,
35 | "title": ""
36 | }
37 | },
38 | "outputs": [
39 | {
41 | "name": "stdout",
42 | "output_type": "stream",
43 | "text": [
44 | "+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "# File location and type\n",
50 | "file_location = \"/FileStore/tables/tips.csv\"\n",
51 | "file_type = \"csv\"\n",
52 | "# header and inferSchema are options of the CSV reader; other file formats would ignore them\n",
53 | "df = spark.read.csv(file_location,header=True,inferSchema=True)\n",
54 | "df.show()"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": 0,
60 | "metadata": {
61 | "application/vnd.databricks.v1+cell": {
62 | "cellMetadata": {
63 | "byteLimit": 2048000,
64 | "rowLimit": 10000
65 | },
66 | "inputWidgets": {},
67 | "nuid": "bd82bb99-1479-4d5c-be10-8c36df0f1d44",
68 | "showTitle": false,
69 | "title": ""
70 | }
71 | },
72 | "outputs": [
73 | {
75 | "name": "stdout",
76 | "output_type": "stream",
77 | "text": [
78 | "root\n |-- total_bill: double (nullable = true)\n |-- tip: double (nullable = true)\n |-- sex: string (nullable = true)\n |-- smoker: string (nullable = true)\n |-- day: string (nullable = true)\n |-- time: string (nullable = true)\n |-- size: integer (nullable = true)\n\n"
79 | ]
80 | }
81 | ],
82 | "source": [
83 | "df.printSchema()"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 0,
89 | "metadata": {
90 | "application/vnd.databricks.v1+cell": {
91 | "cellMetadata": {
92 | "byteLimit": 2048000,
93 | "rowLimit": 10000
94 | },
95 | "inputWidgets": {},
96 | "nuid": "b5f66379-6f7f-42ec-8e82-d0e0926a1721",
97 | "showTitle": false,
98 | "title": ""
99 | }
100 | },
101 | "outputs": [
102 | {
104 | "name": "stdout",
105 | "output_type": "stream",
106 | "text": [
107 | "Out[11]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']"
108 | ]
109 | }
110 | ],
111 | "source": [
112 | "df.columns"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": 0,
118 | "metadata": {
119 | "application/vnd.databricks.v1+cell": {
120 | "cellMetadata": {
121 | "byteLimit": 2048000,
122 | "rowLimit": 10000
123 | },
124 | "inputWidgets": {},
125 | "nuid": "a518f51a-d8d4-49b8-ab77-1abd1d04551b",
126 | "showTitle": false,
127 | "title": ""
128 | }
129 | },
130 | "outputs": [],
131 | "source": [
132 | "#handle categorical features\n",
133 | "from pyspark.ml.feature import StringIndexer\n",
134 | "#StringIndexer maps string categories to numeric indices, e.g. No/Yes -> 0.0/1.0 (the most frequent label gets 0.0)\n",
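135 | "#single-column form for comparison, a sketch using this dataset's column names:\n",
136 | "#StringIndexer(inputCol=\"sex\",outputCol=\"sex_indexed\").fit(df).transform(df)"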
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 0,
140 | "metadata": {
141 | "application/vnd.databricks.v1+cell": {
142 | "cellMetadata": {
143 | "byteLimit": 2048000,
144 | "rowLimit": 10000
145 | },
146 | "inputWidgets": {},
147 | "nuid": "58839a05-de70-40cb-ac2b-56de113fca09",
148 | "showTitle": false,
149 | "title": ""
150 | }
151 | },
152 | "outputs": [
153 | {
155 | "name": "stdout",
156 | "output_type": "stream",
157 | "text": [
158 | "+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n"
159 | ]
160 | }
161 | ],
162 | "source": [
163 | "df.show()"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 0,
169 | "metadata": {
170 | "application/vnd.databricks.v1+cell": {
171 | "cellMetadata": {
172 | "byteLimit": 2048000,
173 | "rowLimit": 10000
174 | },
175 | "inputWidgets": {},
176 | "nuid": "be1c1738-3f6e-4de0-bfcc-1af15ad3e8d8",
177 | "showTitle": false,
178 | "title": ""
179 | }
180 | },
181 | "outputs": [
182 | {
184 | "name": "stdout",
185 | "output_type": "stream",
186 | "text": [
187 | "+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+\nonly showing top 20 rows\n\n"
188 | ]
189 | }
190 | ],
191 | "source": [
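192 | "#inputCols/outputCols are parallel lists: one indexed output column per input column\n",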
192 | "index=StringIndexer(\n",
193 | " inputCols=[\"sex\",\"smoker\",\"day\",\"time\"],\n",
194 | " outputCols=[\"sex_indexed\",\"smoker_indexed\",\"day_indexed\",\"time_indexed\"]\n",
195 | " )\n",
196 | "df_r=index.fit(df).transform(df)\n",
197 | "df_r.show()"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 0,
203 | "metadata": {
204 | "application/vnd.databricks.v1+cell": {
205 | "cellMetadata": {
206 | "byteLimit": 2048000,
207 | "rowLimit": 10000
208 | },
209 | "inputWidgets": {},
210 | "nuid": "06e561a4-05a6-43d0-ab19-27cfdaf4afcb",
211 | "showTitle": false,
212 | "title": ""
213 | }
214 | },
215 | "outputs": [
216 | {
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | "+----------+----+----+-----------+--------------+-----------+------------+\n|total_bill| tip|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|\n+----------+----+----+-----------+--------------+-----------+------------+\n| 16.99|1.01| 2| 1.0| 0.0| 1.0| 0.0|\n| 10.34|1.66| 3| 0.0| 0.0| 1.0| 0.0|\n| 21.01| 3.5| 3| 0.0| 0.0| 1.0| 0.0|\n| 23.68|3.31| 2| 0.0| 0.0| 1.0| 0.0|\n| 24.59|3.61| 4| 1.0| 0.0| 1.0| 0.0|\n| 25.29|4.71| 4| 0.0| 0.0| 1.0| 0.0|\n| 8.77| 2.0| 2| 0.0| 0.0| 1.0| 0.0|\n| 26.88|3.12| 4| 0.0| 0.0| 1.0| 0.0|\n| 15.04|1.96| 2| 0.0| 0.0| 1.0| 0.0|\n| 14.78|3.23| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.27|1.71| 2| 0.0| 0.0| 1.0| 0.0|\n| 35.26| 5.0| 4| 1.0| 0.0| 1.0| 0.0|\n| 15.42|1.57| 2| 0.0| 0.0| 1.0| 0.0|\n| 18.43| 3.0| 4| 0.0| 0.0| 1.0| 0.0|\n| 14.83|3.02| 2| 1.0| 0.0| 1.0| 0.0|\n| 21.58|3.92| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.33|1.67| 3| 1.0| 0.0| 1.0| 0.0|\n| 16.29|3.71| 3| 0.0| 0.0| 1.0| 0.0|\n| 16.97| 3.5| 3| 1.0| 0.0| 1.0| 0.0|\n| 20.65|3.35| 3| 0.0| 0.0| 0.0| 0.0|\n+----------+----+----+-----------+--------------+-----------+------------+\nonly showing top 20 rows\n\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "df1=df_r.drop(\"sex\",\"smoker\",\"day\",\"time\")\n",
227 | "df1.show()"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 0,
233 | "metadata": {
234 | "application/vnd.databricks.v1+cell": {
235 | "cellMetadata": {
236 | "byteLimit": 2048000,
237 | "rowLimit": 10000
238 | },
239 | "inputWidgets": {},
240 | "nuid": "2fea7c39-e5d3-4084-a7d4-2fb4de268c8f",
241 | "showTitle": false,
242 | "title": ""
243 | }
244 | },
245 | "outputs": [
246 | {
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "+----------+----+----+-----------+--------------+-----------+------------+--------------------+\n|total_bill| tip|size|sex_indexed|smoker_indexed|day_indexed|time_indexed| Independent Feature|\n+----------+----+----+-----------+--------------+-----------+------------+--------------------+\n| 16.99|1.01| 2| 1.0| 0.0| 1.0| 0.0|[1.01,2.0,1.0,0.0...|\n| 10.34|1.66| 3| 0.0| 0.0| 1.0| 0.0|[1.66,3.0,0.0,0.0...|\n| 21.01| 3.5| 3| 0.0| 0.0| 1.0| 0.0|[3.5,3.0,0.0,0.0,...|\n| 23.68|3.31| 2| 0.0| 0.0| 1.0| 0.0|[3.31,2.0,0.0,0.0...|\n| 24.59|3.61| 4| 1.0| 0.0| 1.0| 0.0|[3.61,4.0,1.0,0.0...|\n| 25.29|4.71| 4| 0.0| 0.0| 1.0| 0.0|[4.71,4.0,0.0,0.0...|\n| 8.77| 2.0| 2| 0.0| 0.0| 1.0| 0.0|[2.0,2.0,0.0,0.0,...|\n| 26.88|3.12| 4| 0.0| 0.0| 1.0| 0.0|[3.12,4.0,0.0,0.0...|\n| 15.04|1.96| 2| 0.0| 0.0| 1.0| 0.0|[1.96,2.0,0.0,0.0...|\n| 14.78|3.23| 2| 0.0| 0.0| 1.0| 0.0|[3.23,2.0,0.0,0.0...|\n| 10.27|1.71| 2| 0.0| 0.0| 1.0| 0.0|[1.71,2.0,0.0,0.0...|\n| 35.26| 5.0| 4| 1.0| 0.0| 1.0| 0.0|[5.0,4.0,1.0,0.0,...|\n| 15.42|1.57| 2| 0.0| 0.0| 1.0| 0.0|[1.57,2.0,0.0,0.0...|\n| 18.43| 3.0| 4| 0.0| 0.0| 1.0| 0.0|[3.0,4.0,0.0,0.0,...|\n| 14.83|3.02| 2| 1.0| 0.0| 1.0| 0.0|[3.02,2.0,1.0,0.0...|\n| 21.58|3.92| 2| 0.0| 0.0| 1.0| 0.0|[3.92,2.0,0.0,0.0...|\n| 10.33|1.67| 3| 1.0| 0.0| 1.0| 0.0|[1.67,3.0,1.0,0.0...|\n| 16.29|3.71| 3| 0.0| 0.0| 1.0| 0.0|[3.71,3.0,0.0,0.0...|\n| 16.97| 3.5| 3| 1.0| 0.0| 1.0| 0.0|[3.5,3.0,1.0,0.0,...|\n| 20.65|3.35| 3| 0.0| 0.0| 0.0| 0.0|(6,[0,1],[3.35,3.0])|\n+----------+----+----+-----------+--------------+-----------+------------+--------------------+\nonly showing top 20 rows\n\n"
252 | ]
253 | }
254 | ],
255 | "source": [
256 | "from pyspark.ml.feature import VectorAssembler\n",
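257 | "#total_bill will be the label, so every other (indexed) column goes into the feature vector\n",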
257 | "df_spark=VectorAssembler(\n",
258 | " inputCols=[\"tip\",\"size\",\"sex_indexed\",\"smoker_indexed\",\"day_indexed\",\"time_indexed\"],\n",
259 | " outputCol=\"Independent Feature\"\n",
260 | ")\n",
261 | "output=df_spark.transform(df1)\n",
262 | "output.show()"
263 | ]
264 | },
265 | {
266 | "cell_type": "code",
267 | "execution_count": 0,
268 | "metadata": {
269 | "application/vnd.databricks.v1+cell": {
270 | "cellMetadata": {
271 | "byteLimit": 2048000,
272 | "rowLimit": 10000
273 | },
274 | "inputWidgets": {},
275 | "nuid": "cdf39b42-a310-4d91-8550-52cb26ac85d9",
276 | "showTitle": false,
277 | "title": ""
278 | }
279 | },
280 | "outputs": [
281 | {
283 | "name": "stdout",
284 | "output_type": "stream",
285 | "text": [
286 | "+--------------------+\n| Independent Feature|\n+--------------------+\n|[1.01,2.0,1.0,0.0...|\n|[1.66,3.0,0.0,0.0...|\n|[3.5,3.0,0.0,0.0,...|\n|[3.31,2.0,0.0,0.0...|\n|[3.61,4.0,1.0,0.0...|\n|[4.71,4.0,0.0,0.0...|\n|[2.0,2.0,0.0,0.0,...|\n|[3.12,4.0,0.0,0.0...|\n|[1.96,2.0,0.0,0.0...|\n|[3.23,2.0,0.0,0.0...|\n|[1.71,2.0,0.0,0.0...|\n|[5.0,4.0,1.0,0.0,...|\n|[1.57,2.0,0.0,0.0...|\n|[3.0,4.0,0.0,0.0,...|\n|[3.02,2.0,1.0,0.0...|\n|[3.92,2.0,0.0,0.0...|\n|[1.67,3.0,1.0,0.0...|\n|[3.71,3.0,0.0,0.0...|\n|[3.5,3.0,1.0,0.0,...|\n|(6,[0,1],[3.35,3.0])|\n+--------------------+\nonly showing top 20 rows\n\n"
287 | ]
288 | }
289 | ],
290 | "source": [
291 | "output.select(\"Independent Feature\").show()"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 0,
297 | "metadata": {
298 | "application/vnd.databricks.v1+cell": {
299 | "cellMetadata": {
300 | "byteLimit": 2048000,
301 | "rowLimit": 10000
302 | },
303 | "inputWidgets": {},
304 | "nuid": "3d6f6f73-7d3b-4455-9495-f1e649e231bc",
305 | "showTitle": false,
306 | "title": ""
307 | }
308 | },
309 | "outputs": [
310 | {
312 | "name": "stdout",
313 | "output_type": "stream",
314 | "text": [
315 | "+--------------------+----------+\n| Independent Feature|total_bill|\n+--------------------+----------+\n|[1.01,2.0,1.0,0.0...| 16.99|\n|[1.66,3.0,0.0,0.0...| 10.34|\n|[3.5,3.0,0.0,0.0,...| 21.01|\n|[3.31,2.0,0.0,0.0...| 23.68|\n|[3.61,4.0,1.0,0.0...| 24.59|\n|[4.71,4.0,0.0,0.0...| 25.29|\n|[2.0,2.0,0.0,0.0,...| 8.77|\n|[3.12,4.0,0.0,0.0...| 26.88|\n|[1.96,2.0,0.0,0.0...| 15.04|\n|[3.23,2.0,0.0,0.0...| 14.78|\n|[1.71,2.0,0.0,0.0...| 10.27|\n|[5.0,4.0,1.0,0.0,...| 35.26|\n|[1.57,2.0,0.0,0.0...| 15.42|\n|[3.0,4.0,0.0,0.0,...| 18.43|\n|[3.02,2.0,1.0,0.0...| 14.83|\n|[3.92,2.0,0.0,0.0...| 21.58|\n|[1.67,3.0,1.0,0.0...| 10.33|\n|[3.71,3.0,0.0,0.0...| 16.29|\n|[3.5,3.0,1.0,0.0,...| 16.97|\n|(6,[0,1],[3.35,3.0])| 20.65|\n+--------------------+----------+\nonly showing top 20 rows\n\n"
316 | ]
317 | }
318 | ],
319 | "source": [
320 | "finalize=output.select(\"Independent Feature\",\"total_bill\")\n",
321 | "finalize.show()"
322 | ]
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": 0,
327 | "metadata": {
328 | "application/vnd.databricks.v1+cell": {
329 | "cellMetadata": {
330 | "byteLimit": 2048000,
331 | "rowLimit": 10000
332 | },
333 | "inputWidgets": {},
334 | "nuid": "c6b8b082-06a8-4e06-91a9-70e5893fc436",
335 | "showTitle": false,
336 | "title": ""
337 | }
338 | },
339 | "outputs": [],
340 | "source": [
341 | "from pyspark.ml.regression import LinearRegression\n",
342 | "#split\n",
343 | "train_data,test_data=finalize.randomSplit([0.75,0.25])\n",
344 | "#model\n",
345 | "m=LinearRegression(\n",
346 | " featuresCol=\"Independent Feature\",\n",
347 | " labelCol=\"total_bill\"\n",
348 | ")\n",
349 | "#fit\n",
350 | "fitting_model=m.fit(train_data)\n"
351 | ]
352 | },
353 | {
354 | "cell_type": "code",
355 | "execution_count": 0,
356 | "metadata": {
357 | "application/vnd.databricks.v1+cell": {
358 | "cellMetadata": {
359 | "byteLimit": 2048000,
360 | "rowLimit": 10000
361 | },
362 | "inputWidgets": {},
363 | "nuid": "04a960de-c2c6-49ad-bbb6-790477ffc7d6",
364 | "showTitle": false,
365 | "title": ""
366 | }
367 | },
368 | "outputs": [
369 | {
371 | "name": "stdout",
372 | "output_type": "stream",
373 | "text": [
374 | "Out[55]: DenseVector([3.193, 2.4931, -2.5861, 4.2707, -0.4779, -1.1637])"
375 | ]
376 | }
377 | ],
378 | "source": [
379 | "#coefficients\n",
380 | "fitting_model.coefficients"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 0,
386 | "metadata": {
387 | "application/vnd.databricks.v1+cell": {
388 | "cellMetadata": {
389 | "byteLimit": 2048000,
390 | "rowLimit": 10000
391 | },
392 | "inputWidgets": {},
393 | "nuid": "1948a0b5-a4f7-4905-b0d9-004631b6dfe9",
394 | "showTitle": false,
395 | "title": ""
396 | }
397 | },
398 | "outputs": [
399 | {
401 | "name": "stdout",
402 | "output_type": "stream",
403 | "text": [
404 | "Out[54]: 4.00317861118615"
405 | ]
406 | }
407 | ],
408 | "source": [
409 | "#intercept\n",
410 | "fitting_model.intercept"
411 | ]
412 | },
413 | {
414 | "cell_type": "code",
415 | "execution_count": 0,
416 | "metadata": {
417 | "application/vnd.databricks.v1+cell": {
418 | "cellMetadata": {
419 | "byteLimit": 2048000,
420 | "rowLimit": 10000
421 | },
422 | "inputWidgets": {},
423 | "nuid": "9fa50b9a-73a5-471a-8e84-02c4df7635cc",
424 | "showTitle": false,
425 | "title": ""
426 | }
427 | },
428 | "outputs": [],
429 | "source": [
430 | "#evaluate on the held-out test data\n",
431 | "result=fitting_model.evaluate(test_data)\n"
432 | ]
433 | },
434 | {
435 | "cell_type": "code",
436 | "execution_count": 0,
437 | "metadata": {
438 | "application/vnd.databricks.v1+cell": {
439 | "cellMetadata": {
440 | "byteLimit": 2048000,
441 | "rowLimit": 10000
442 | },
443 | "inputWidgets": {},
444 | "nuid": "cd6030c1-28ab-4afa-82b7-ad994627ab02",
445 | "showTitle": false,
446 | "title": ""
447 | }
448 | },
449 | "outputs": [
450 | {
452 | "name": "stdout",
453 | "output_type": "stream",
454 | "text": [
455 | "+--------------------+----------+------------------+\n| Independent Feature|total_bill| prediction|\n+--------------------+----------+------------------+\n|(6,[0,1],[1.25,2.0])| 10.07| 12.98065290956777|\n|(6,[0,1],[1.25,2.0])| 10.51| 12.98065290956777|\n|(6,[0,1],[1.47,2.0])| 10.77|13.683112213152697|\n| (6,[0,1],[2.0,2.0])| 12.69| 15.37540053542548|\n| (6,[0,1],[2.0,2.0])| 13.37| 15.37540053542548|\n| (6,[0,1],[2.0,3.0])| 16.31|17.868514663068197|\n|(6,[0,1],[2.31,3.0])| 18.69|18.858343681756054|\n|(6,[0,1],[2.34,4.0])| 17.81|21.447247714433075|\n| (6,[0,1],[2.5,4.0])| 18.35| 21.95812720794939|\n|(6,[0,1],[2.64,3.0])| 17.59|19.912032637133446|\n|(6,[0,1],[2.72,2.0])| 13.28|17.674358256248887|\n| (6,[0,1],[3.0,2.0])| 14.0| 18.56839736990243|\n| (6,[0,1],[3.0,4.0])| 20.45|23.554625625187864|\n|(6,[0,1],[3.15,3.0])| 20.08| 21.54046102271669|\n|(6,[0,1],[3.18,2.0])| 19.82|19.143136800108284|\n|(6,[0,1],[3.35,3.0])| 20.65|22.179060389612083|\n|(6,[0,1],[3.39,2.0])| 11.61|19.813666135348445|\n| (6,[0,1],[3.6,3.0])| 24.06| 22.97730959823132|\n|(6,[0,1],[3.76,2.0])| 18.24|20.995074964104916|\n| (6,[0,1],[4.3,2.0])| 21.7|22.719293254722466|\n+--------------------+----------+------------------+\nonly showing top 20 rows\n\n"
456 | ]
457 | }
458 | ],
459 | "source": [
460 | "#predict\n",
461 | "result.predictions.show()"
462 | ]
463 | },
464 | {
465 | "cell_type": "code",
466 | "execution_count": 0,
467 | "metadata": {
468 | "application/vnd.databricks.v1+cell": {
469 | "cellMetadata": {
470 | "byteLimit": 2048000,
471 | "rowLimit": 10000
472 | },
473 | "inputWidgets": {},
474 | "nuid": "0c0c1c32-74c6-43ad-bd67-b42b2c1ba148",
475 | "showTitle": false,
476 | "title": ""
477 | }
478 | },
479 | "outputs": [
480 | {
482 | "name": "stdout",
483 | "output_type": "stream",
484 | "text": [
485 | "Out[57]: (0.554439412505905, 4.342191773023231, 34.652872891876086)"
486 | ]
487 | }
488 | ],
489 | "source": [
490 | "#performance metrics\n",
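491 | "#r2 is unitless; meanAbsoluteError is in dollars, meanSquaredError in squared dollars\n",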
491 | "result.r2,result.meanAbsoluteError,result.meanSquaredError"
492 | ]
493 | }
494 | ],
495 | "metadata": {
496 | "application/vnd.databricks.v1+notebook": {
497 | "dashboards": [],
498 | "language": "python",
499 | "notebookMetadata": {
500 | "pythonIndentUnit": 4
501 | },
502 | "notebookName": "2024-04-26 - DBFS Example",
503 | "widgets": {}
504 | }
505 | },
506 | "nbformat": 4,
507 | "nbformat_minor": 0
508 | }
509 |
--------------------------------------------------------------------------------
/test1.csv:
--------------------------------------------------------------------------------
1 | name,age
2 | kani,15
3 | kani1,16
4 | kani2,17
--------------------------------------------------------------------------------
/test2.csv:
--------------------------------------------------------------------------------
1 | name,age,experience
2 | kani,15,10
3 | kani1,16,8
4 | kani2,17,4
--------------------------------------------------------------------------------
/test3.csv:
--------------------------------------------------------------------------------
1 | Name,age,Experience,Salary
2 | Krish,31,10,30000
3 | Sudhanshu,30,8,25000
4 | Sunny,29,4,20000
5 | Paul,24,3,20000
6 | Harsha,21,1,15000
7 | Shubham,23,2,18000
8 | Mahesh,,,40000
9 | ,34,10,38000
10 | ,36,,
11 |
--------------------------------------------------------------------------------
/test4.csv:
--------------------------------------------------------------------------------
1 | Name,age,Experience,Salary
2 | Krish,31,10,30000
3 | Sudhanshu,30,8,25000
4 | Sunny,29,4,20000
5 | Paul,24,3,20000
6 | Harsha,21,1,15000
7 | Shubham,23,2,18000
8 |
--------------------------------------------------------------------------------
/test5.csv:
--------------------------------------------------------------------------------
1 | Name,Departments,salary
2 | Krish,Data Science,10000
3 | Krish,IOT,5000
4 | Mahesh,Big Data,4000
5 | Krish,Big Data,4000
6 | Mahesh,Data Science,3000
7 | Sudhanshu,Data Science,20000
8 | Sudhanshu,IOT,10000
9 | Sudhanshu,Big Data,5000
10 | Sunny,Data Science,10000
11 | Sunny,Big Data,2000
12 |
--------------------------------------------------------------------------------
/tips.csv:
--------------------------------------------------------------------------------
1 | total_bill,tip,sex,smoker,day,time,size
2 | 16.99,1.01,Female,No,Sun,Dinner,2
3 | 10.34,1.66,Male,No,Sun,Dinner,3
4 | 21.01,3.5,Male,No,Sun,Dinner,3
5 | 23.68,3.31,Male,No,Sun,Dinner,2
6 | 24.59,3.61,Female,No,Sun,Dinner,4
7 | 25.29,4.71,Male,No,Sun,Dinner,4
8 | 8.77,2.0,Male,No,Sun,Dinner,2
9 | 26.88,3.12,Male,No,Sun,Dinner,4
10 | 15.04,1.96,Male,No,Sun,Dinner,2
11 | 14.78,3.23,Male,No,Sun,Dinner,2
12 | 10.27,1.71,Male,No,Sun,Dinner,2
13 | 35.26,5.0,Female,No,Sun,Dinner,4
14 | 15.42,1.57,Male,No,Sun,Dinner,2
15 | 18.43,3.0,Male,No,Sun,Dinner,4
16 | 14.83,3.02,Female,No,Sun,Dinner,2
17 | 21.58,3.92,Male,No,Sun,Dinner,2
18 | 10.33,1.67,Female,No,Sun,Dinner,3
19 | 16.29,3.71,Male,No,Sun,Dinner,3
20 | 16.97,3.5,Female,No,Sun,Dinner,3
21 | 20.65,3.35,Male,No,Sat,Dinner,3
22 | 17.92,4.08,Male,No,Sat,Dinner,2
23 | 20.29,2.75,Female,No,Sat,Dinner,2
24 | 15.77,2.23,Female,No,Sat,Dinner,2
25 | 39.42,7.58,Male,No,Sat,Dinner,4
26 | 19.82,3.18,Male,No,Sat,Dinner,2
27 | 17.81,2.34,Male,No,Sat,Dinner,4
28 | 13.37,2.0,Male,No,Sat,Dinner,2
29 | 12.69,2.0,Male,No,Sat,Dinner,2
30 | 21.7,4.3,Male,No,Sat,Dinner,2
31 | 19.65,3.0,Female,No,Sat,Dinner,2
32 | 9.55,1.45,Male,No,Sat,Dinner,2
33 | 18.35,2.5,Male,No,Sat,Dinner,4
34 | 15.06,3.0,Female,No,Sat,Dinner,2
35 | 20.69,2.45,Female,No,Sat,Dinner,4
36 | 17.78,3.27,Male,No,Sat,Dinner,2
37 | 24.06,3.6,Male,No,Sat,Dinner,3
38 | 16.31,2.0,Male,No,Sat,Dinner,3
39 | 16.93,3.07,Female,No,Sat,Dinner,3
40 | 18.69,2.31,Male,No,Sat,Dinner,3
41 | 31.27,5.0,Male,No,Sat,Dinner,3
42 | 16.04,2.24,Male,No,Sat,Dinner,3
43 | 17.46,2.54,Male,No,Sun,Dinner,2
44 | 13.94,3.06,Male,No,Sun,Dinner,2
45 | 9.68,1.32,Male,No,Sun,Dinner,2
46 | 30.4,5.6,Male,No,Sun,Dinner,4
47 | 18.29,3.0,Male,No,Sun,Dinner,2
48 | 22.23,5.0,Male,No,Sun,Dinner,2
49 | 32.4,6.0,Male,No,Sun,Dinner,4
50 | 28.55,2.05,Male,No,Sun,Dinner,3
51 | 18.04,3.0,Male,No,Sun,Dinner,2
52 | 12.54,2.5,Male,No,Sun,Dinner,2
53 | 10.29,2.6,Female,No,Sun,Dinner,2
54 | 34.81,5.2,Female,No,Sun,Dinner,4
55 | 9.94,1.56,Male,No,Sun,Dinner,2
56 | 25.56,4.34,Male,No,Sun,Dinner,4
57 | 19.49,3.51,Male,No,Sun,Dinner,2
58 | 38.01,3.0,Male,Yes,Sat,Dinner,4
59 | 26.41,1.5,Female,No,Sat,Dinner,2
60 | 11.24,1.76,Male,Yes,Sat,Dinner,2
61 | 48.27,6.73,Male,No,Sat,Dinner,4
62 | 20.29,3.21,Male,Yes,Sat,Dinner,2
63 | 13.81,2.0,Male,Yes,Sat,Dinner,2
64 | 11.02,1.98,Male,Yes,Sat,Dinner,2
65 | 18.29,3.76,Male,Yes,Sat,Dinner,4
66 | 17.59,2.64,Male,No,Sat,Dinner,3
67 | 20.08,3.15,Male,No,Sat,Dinner,3
68 | 16.45,2.47,Female,No,Sat,Dinner,2
69 | 3.07,1.0,Female,Yes,Sat,Dinner,1
70 | 20.23,2.01,Male,No,Sat,Dinner,2
71 | 15.01,2.09,Male,Yes,Sat,Dinner,2
72 | 12.02,1.97,Male,No,Sat,Dinner,2
73 | 17.07,3.0,Female,No,Sat,Dinner,3
74 | 26.86,3.14,Female,Yes,Sat,Dinner,2
75 | 25.28,5.0,Female,Yes,Sat,Dinner,2
76 | 14.73,2.2,Female,No,Sat,Dinner,2
77 | 10.51,1.25,Male,No,Sat,Dinner,2
78 | 17.92,3.08,Male,Yes,Sat,Dinner,2
79 | 27.2,4.0,Male,No,Thur,Lunch,4
80 | 22.76,3.0,Male,No,Thur,Lunch,2
81 | 17.29,2.71,Male,No,Thur,Lunch,2
82 | 19.44,3.0,Male,Yes,Thur,Lunch,2
83 | 16.66,3.4,Male,No,Thur,Lunch,2
84 | 10.07,1.83,Female,No,Thur,Lunch,1
85 | 32.68,5.0,Male,Yes,Thur,Lunch,2
86 | 15.98,2.03,Male,No,Thur,Lunch,2
87 | 34.83,5.17,Female,No,Thur,Lunch,4
88 | 13.03,2.0,Male,No,Thur,Lunch,2
89 | 18.28,4.0,Male,No,Thur,Lunch,2
90 | 24.71,5.85,Male,No,Thur,Lunch,2
91 | 21.16,3.0,Male,No,Thur,Lunch,2
92 | 28.97,3.0,Male,Yes,Fri,Dinner,2
93 | 22.49,3.5,Male,No,Fri,Dinner,2
94 | 5.75,1.0,Female,Yes,Fri,Dinner,2
95 | 16.32,4.3,Female,Yes,Fri,Dinner,2
96 | 22.75,3.25,Female,No,Fri,Dinner,2
97 | 40.17,4.73,Male,Yes,Fri,Dinner,4
98 | 27.28,4.0,Male,Yes,Fri,Dinner,2
99 | 12.03,1.5,Male,Yes,Fri,Dinner,2
100 | 21.01,3.0,Male,Yes,Fri,Dinner,2
101 | 12.46,1.5,Male,No,Fri,Dinner,2
102 | 11.35,2.5,Female,Yes,Fri,Dinner,2
103 | 15.38,3.0,Female,Yes,Fri,Dinner,2
104 | 44.3,2.5,Female,Yes,Sat,Dinner,3
105 | 22.42,3.48,Female,Yes,Sat,Dinner,2
106 | 20.92,4.08,Female,No,Sat,Dinner,2
107 | 15.36,1.64,Male,Yes,Sat,Dinner,2
108 | 20.49,4.06,Male,Yes,Sat,Dinner,2
109 | 25.21,4.29,Male,Yes,Sat,Dinner,2
110 | 18.24,3.76,Male,No,Sat,Dinner,2
111 | 14.31,4.0,Female,Yes,Sat,Dinner,2
112 | 14.0,3.0,Male,No,Sat,Dinner,2
113 | 7.25,1.0,Female,No,Sat,Dinner,1
114 | 38.07,4.0,Male,No,Sun,Dinner,3
115 | 23.95,2.55,Male,No,Sun,Dinner,2
116 | 25.71,4.0,Female,No,Sun,Dinner,3
117 | 17.31,3.5,Female,No,Sun,Dinner,2
118 | 29.93,5.07,Male,No,Sun,Dinner,4
119 | 10.65,1.5,Female,No,Thur,Lunch,2
120 | 12.43,1.8,Female,No,Thur,Lunch,2
121 | 24.08,2.92,Female,No,Thur,Lunch,4
122 | 11.69,2.31,Male,No,Thur,Lunch,2
123 | 13.42,1.68,Female,No,Thur,Lunch,2
124 | 14.26,2.5,Male,No,Thur,Lunch,2
125 | 15.95,2.0,Male,No,Thur,Lunch,2
126 | 12.48,2.52,Female,No,Thur,Lunch,2
127 | 29.8,4.2,Female,No,Thur,Lunch,6
128 | 8.52,1.48,Male,No,Thur,Lunch,2
129 | 14.52,2.0,Female,No,Thur,Lunch,2
130 | 11.38,2.0,Female,No,Thur,Lunch,2
131 | 22.82,2.18,Male,No,Thur,Lunch,3
132 | 19.08,1.5,Male,No,Thur,Lunch,2
133 | 20.27,2.83,Female,No,Thur,Lunch,2
134 | 11.17,1.5,Female,No,Thur,Lunch,2
135 | 12.26,2.0,Female,No,Thur,Lunch,2
136 | 18.26,3.25,Female,No,Thur,Lunch,2
137 | 8.51,1.25,Female,No,Thur,Lunch,2
138 | 10.33,2.0,Female,No,Thur,Lunch,2
139 | 14.15,2.0,Female,No,Thur,Lunch,2
140 | 16.0,2.0,Male,Yes,Thur,Lunch,2
141 | 13.16,2.75,Female,No,Thur,Lunch,2
142 | 17.47,3.5,Female,No,Thur,Lunch,2
143 | 34.3,6.7,Male,No,Thur,Lunch,6
144 | 41.19,5.0,Male,No,Thur,Lunch,5
145 | 27.05,5.0,Female,No,Thur,Lunch,6
146 | 16.43,2.3,Female,No,Thur,Lunch,2
147 | 8.35,1.5,Female,No,Thur,Lunch,2
148 | 18.64,1.36,Female,No,Thur,Lunch,3
149 | 11.87,1.63,Female,No,Thur,Lunch,2
150 | 9.78,1.73,Male,No,Thur,Lunch,2
151 | 7.51,2.0,Male,No,Thur,Lunch,2
152 | 14.07,2.5,Male,No,Sun,Dinner,2
153 | 13.13,2.0,Male,No,Sun,Dinner,2
154 | 17.26,2.74,Male,No,Sun,Dinner,3
155 | 24.55,2.0,Male,No,Sun,Dinner,4
156 | 19.77,2.0,Male,No,Sun,Dinner,4
157 | 29.85,5.14,Female,No,Sun,Dinner,5
158 | 48.17,5.0,Male,No,Sun,Dinner,6
159 | 25.0,3.75,Female,No,Sun,Dinner,4
160 | 13.39,2.61,Female,No,Sun,Dinner,2
161 | 16.49,2.0,Male,No,Sun,Dinner,4
162 | 21.5,3.5,Male,No,Sun,Dinner,4
163 | 12.66,2.5,Male,No,Sun,Dinner,2
164 | 16.21,2.0,Female,No,Sun,Dinner,3
165 | 13.81,2.0,Male,No,Sun,Dinner,2
166 | 17.51,3.0,Female,Yes,Sun,Dinner,2
167 | 24.52,3.48,Male,No,Sun,Dinner,3
168 | 20.76,2.24,Male,No,Sun,Dinner,2
169 | 31.71,4.5,Male,No,Sun,Dinner,4
170 | 10.59,1.61,Female,Yes,Sat,Dinner,2
171 | 10.63,2.0,Female,Yes,Sat,Dinner,2
172 | 50.81,10.0,Male,Yes,Sat,Dinner,3
173 | 15.81,3.16,Male,Yes,Sat,Dinner,2
174 | 7.25,5.15,Male,Yes,Sun,Dinner,2
175 | 31.85,3.18,Male,Yes,Sun,Dinner,2
176 | 16.82,4.0,Male,Yes,Sun,Dinner,2
177 | 32.9,3.11,Male,Yes,Sun,Dinner,2
178 | 17.89,2.0,Male,Yes,Sun,Dinner,2
179 | 14.48,2.0,Male,Yes,Sun,Dinner,2
180 | 9.6,4.0,Female,Yes,Sun,Dinner,2
181 | 34.63,3.55,Male,Yes,Sun,Dinner,2
182 | 34.65,3.68,Male,Yes,Sun,Dinner,4
183 | 23.33,5.65,Male,Yes,Sun,Dinner,2
184 | 45.35,3.5,Male,Yes,Sun,Dinner,3
185 | 23.17,6.5,Male,Yes,Sun,Dinner,4
186 | 40.55,3.0,Male,Yes,Sun,Dinner,2
187 | 20.69,5.0,Male,No,Sun,Dinner,5
188 | 20.9,3.5,Female,Yes,Sun,Dinner,3
189 | 30.46,2.0,Male,Yes,Sun,Dinner,5
190 | 18.15,3.5,Female,Yes,Sun,Dinner,3
191 | 23.1,4.0,Male,Yes,Sun,Dinner,3
192 | 15.69,1.5,Male,Yes,Sun,Dinner,2
193 | 19.81,4.19,Female,Yes,Thur,Lunch,2
194 | 28.44,2.56,Male,Yes,Thur,Lunch,2
195 | 15.48,2.02,Male,Yes,Thur,Lunch,2
196 | 16.58,4.0,Male,Yes,Thur,Lunch,2
197 | 7.56,1.44,Male,No,Thur,Lunch,2
198 | 10.34,2.0,Male,Yes,Thur,Lunch,2
199 | 43.11,5.0,Female,Yes,Thur,Lunch,4
200 | 13.0,2.0,Female,Yes,Thur,Lunch,2
201 | 13.51,2.0,Male,Yes,Thur,Lunch,2
202 | 18.71,4.0,Male,Yes,Thur,Lunch,3
203 | 12.74,2.01,Female,Yes,Thur,Lunch,2
204 | 13.0,2.0,Female,Yes,Thur,Lunch,2
205 | 16.4,2.5,Female,Yes,Thur,Lunch,2
206 | 20.53,4.0,Male,Yes,Thur,Lunch,4
207 | 16.47,3.23,Female,Yes,Thur,Lunch,3
208 | 26.59,3.41,Male,Yes,Sat,Dinner,3
209 | 38.73,3.0,Male,Yes,Sat,Dinner,4
210 | 24.27,2.03,Male,Yes,Sat,Dinner,2
211 | 12.76,2.23,Female,Yes,Sat,Dinner,2
212 | 30.06,2.0,Male,Yes,Sat,Dinner,3
213 | 25.89,5.16,Male,Yes,Sat,Dinner,4
214 | 48.33,9.0,Male,No,Sat,Dinner,4
215 | 13.27,2.5,Female,Yes,Sat,Dinner,2
216 | 28.17,6.5,Female,Yes,Sat,Dinner,3
217 | 12.9,1.1,Female,Yes,Sat,Dinner,2
218 | 28.15,3.0,Male,Yes,Sat,Dinner,5
219 | 11.59,1.5,Male,Yes,Sat,Dinner,2
220 | 7.74,1.44,Male,Yes,Sat,Dinner,2
221 | 30.14,3.09,Female,Yes,Sat,Dinner,4
222 | 12.16,2.2,Male,Yes,Fri,Lunch,2
223 | 13.42,3.48,Female,Yes,Fri,Lunch,2
224 | 8.58,1.92,Male,Yes,Fri,Lunch,1
225 | 15.98,3.0,Female,No,Fri,Lunch,3
226 | 13.42,1.58,Male,Yes,Fri,Lunch,2
227 | 16.27,2.5,Female,Yes,Fri,Lunch,2
228 | 10.09,2.0,Female,Yes,Fri,Lunch,2
229 | 20.45,3.0,Male,No,Sat,Dinner,4
230 | 13.28,2.72,Male,No,Sat,Dinner,2
231 | 22.12,2.88,Female,Yes,Sat,Dinner,2
232 | 24.01,2.0,Male,Yes,Sat,Dinner,4
233 | 15.69,3.0,Male,Yes,Sat,Dinner,3
234 | 11.61,3.39,Male,No,Sat,Dinner,2
235 | 10.77,1.47,Male,No,Sat,Dinner,2
236 | 15.53,3.0,Male,Yes,Sat,Dinner,2
237 | 10.07,1.25,Male,No,Sat,Dinner,2
238 | 12.6,1.0,Male,Yes,Sat,Dinner,2
239 | 32.83,1.17,Male,Yes,Sat,Dinner,2
240 | 35.83,4.67,Female,No,Sat,Dinner,3
241 | 29.03,5.92,Male,No,Sat,Dinner,3
242 | 27.18,2.0,Female,Yes,Sat,Dinner,2
243 | 22.67,2.0,Male,Yes,Sat,Dinner,2
244 | 17.82,1.75,Male,No,Sat,Dinner,2
245 | 18.78,3.0,Female,No,Thur,Dinner,2
246 |
--------------------------------------------------------------------------------