"
64 | ]
65 | },
66 | "execution_count": 3,
67 | "metadata": {},
68 | "output_type": "execute_result"
69 | }
70 | ],
71 | "source": [
72 | "spark"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 4,
78 | "id": "4d3bd081",
79 | "metadata": {},
80 | "outputs": [],
81 | "source": [
82 | "df_pyspark=spark.read.csv('test3.csv',header=True,inferSchema=True)"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 5,
88 | "id": "7ed791ed",
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "name": "stdout",
93 | "output_type": "stream",
94 | "text": [
95 | "+---------+------------+------+\n",
96 | "| Name| Departments|salary|\n",
97 | "+---------+------------+------+\n",
98 | "| Krish|Data Science| 10000|\n",
99 | "| Krish| IOT| 5000|\n",
100 | "| Mahesh| Big Data| 4000|\n",
101 | "| Krish| Big Data| 4000|\n",
102 | "| Mahesh|Data Science| 3000|\n",
103 | "|Sudhanshu|Data Science| 20000|\n",
104 | "|Sudhanshu| IOT| 10000|\n",
105 | "|Sudhanshu| Big Data| 5000|\n",
106 | "| Sunny|Data Science| 10000|\n",
107 | "| Sunny| Big Data| 2000|\n",
108 | "+---------+------------+------+\n",
109 | "\n"
110 | ]
111 | }
112 | ],
113 | "source": [
114 | "df_pyspark.show()"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 6,
120 | "id": "d57d24ca",
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "root\n",
128 | " |-- Name: string (nullable = true)\n",
129 | " |-- Departments: string (nullable = true)\n",
130 | " |-- salary: integer (nullable = true)\n",
131 | "\n"
132 | ]
133 | }
134 | ],
135 | "source": [
136 | "df_pyspark.printSchema()"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 9,
142 | "id": "f15f8197",
143 | "metadata": {},
144 | "outputs": [
145 | {
146 | "name": "stdout",
147 | "output_type": "stream",
148 | "text": [
149 | "+---------+-----------+\n",
150 | "| Name|sum(salary)|\n",
151 | "+---------+-----------+\n",
152 | "|Sudhanshu| 35000|\n",
153 | "| Sunny| 12000|\n",
154 | "| Krish| 19000|\n",
155 | "| Mahesh| 7000|\n",
156 | "+---------+-----------+\n",
157 | "\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "## Groupby\n",
163 | "### Group by Name to find the total salary of each person\n",
164 | "df_pyspark.groupBy('Name').sum().show()"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 19,
170 | "id": "fc122ace",
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "name": "stdout",
175 | "output_type": "stream",
176 | "text": [
177 | "+---------+------------------+\n",
178 | "| Name| avg(salary)|\n",
179 | "+---------+------------------+\n",
180 | "|Sudhanshu|11666.666666666666|\n",
181 | "| Sunny| 6000.0|\n",
182 | "| Krish| 6333.333333333333|\n",
183 | "| Mahesh| 3500.0|\n",
184 | "+---------+------------------+\n",
185 | "\n"
186 | ]
187 | }
188 | ],
189 | "source": [
190 | "df_pyspark.groupBy('Name').avg().show()"
191 | ]
192 | },
193 | {
194 | "cell_type": "code",
195 | "execution_count": 11,
196 | "id": "151d2264",
197 | "metadata": {},
198 | "outputs": [
199 | {
200 | "name": "stdout",
201 | "output_type": "stream",
202 | "text": [
203 | "+------------+-----------+\n",
204 | "| Departments|sum(salary)|\n",
205 | "+------------+-----------+\n",
206 | "| IOT| 15000|\n",
207 | "| Big Data| 15000|\n",
208 | "|Data Science| 43000|\n",
209 | "+------------+-----------+\n",
210 | "\n"
211 | ]
212 | }
213 | ],
214 | "source": [
215 | "### Group by Departments to find the total salary paid in each department\n",
216 | "df_pyspark.groupBy('Departments').sum().show()"
217 | ]
218 | },
219 | {
220 | "cell_type": "code",
221 | "execution_count": 12,
222 | "id": "66fe5552",
223 | "metadata": {},
224 | "outputs": [
225 | {
226 | "name": "stdout",
227 | "output_type": "stream",
228 | "text": [
229 | "+------------+-----------+\n",
230 | "| Departments|avg(salary)|\n",
231 | "+------------+-----------+\n",
232 | "| IOT| 7500.0|\n",
233 | "| Big Data| 3750.0|\n",
234 | "|Data Science| 10750.0|\n",
235 | "+------------+-----------+\n",
236 | "\n"
237 | ]
238 | }
239 | ],
240 | "source": [
241 | "df_pyspark.groupBy('Departments').mean().show()"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 14,
247 | "id": "bc7bf192",
248 | "metadata": {},
249 | "outputs": [
250 | {
251 | "name": "stdout",
252 | "output_type": "stream",
253 | "text": [
254 | "+------------+-----+\n",
255 | "| Departments|count|\n",
256 | "+------------+-----+\n",
257 | "| IOT| 2|\n",
258 | "| Big Data| 4|\n",
259 | "|Data Science| 4|\n",
260 | "+------------+-----+\n",
261 | "\n"
262 | ]
263 | }
264 | ],
265 | "source": [
266 | "df_pyspark.groupBy('Departments').count().show()"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 15,
272 | "id": "37b26cbe",
273 | "metadata": {},
274 | "outputs": [
275 | {
276 | "name": "stdout",
277 | "output_type": "stream",
278 | "text": [
279 | "+-----------+\n",
280 | "|sum(Salary)|\n",
281 | "+-----------+\n",
282 | "| 73000|\n",
283 | "+-----------+\n",
284 | "\n"
285 | ]
286 | }
287 | ],
288 | "source": [
289 | "df_pyspark.agg({'Salary':'sum'}).show()"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "id": "bb21f03f",
296 | "metadata": {},
297 | "outputs": [],
298 | "source": []
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": null,
303 | "id": "1c7d8f83",
304 | "metadata": {},
305 | "outputs": [],
306 | "source": []
307 | },
308 | {
309 | "cell_type": "code",
310 | "execution_count": null,
311 | "id": "9dc7aa65",
312 | "metadata": {},
313 | "outputs": [],
314 | "source": []
315 | },
316 | {
317 | "cell_type": "code",
318 | "execution_count": null,
319 | "id": "fdd3fbac",
320 | "metadata": {},
321 | "outputs": [],
322 | "source": []
323 | },
324 | {
325 | "cell_type": "code",
326 | "execution_count": null,
327 | "id": "375a6fda",
328 | "metadata": {},
329 | "outputs": [],
330 | "source": []
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": null,
335 | "id": "0aa434e6",
336 | "metadata": {},
337 | "outputs": [],
338 | "source": []
339 | },
340 | {
341 | "cell_type": "code",
342 | "execution_count": null,
343 | "id": "c82781ad",
344 | "metadata": {},
345 | "outputs": [],
346 | "source": []
347 | }
348 | ],
349 | "metadata": {
350 | "kernelspec": {
351 | "display_name": "Python 3",
352 | "language": "python",
353 | "name": "python3"
354 | },
355 | "language_info": {
356 | "codemirror_mode": {
357 | "name": "ipython",
358 | "version": 3
359 | },
360 | "file_extension": ".py",
361 | "mimetype": "text/x-python",
362 | "name": "python",
363 | "nbconvert_exporter": "python",
364 | "pygments_lexer": "ipython3",
365 | "version": "3.7.10"
366 | }
367 | },
368 | "nbformat": 4,
369 | "nbformat_minor": 5
370 | }
371 |
--------------------------------------------------------------------------------
/Tutorial 6-Example Of Pyspark ML.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "16da6c54",
6 | "metadata": {},
7 | "source": [
8 | "### Examples Of Pyspark ML"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 71,
14 | "id": "0b9da3ad",
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "from pyspark.sql import SparkSession\n",
19 | "spark=SparkSession.builder.appName('Missing').getOrCreate()"
20 | ]
21 | },
22 | {
23 | "cell_type": "code",
24 | "execution_count": 72,
25 | "id": "735525da",
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "## Read The dataset\n",
30 | "training = spark.read.csv('test1.csv',header=True,inferSchema=True)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 73,
36 | "id": "d6e038c9",
37 | "metadata": {},
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "+---------+---+----------+------+\n",
44 | "| Name|age|Experience|Salary|\n",
45 | "+---------+---+----------+------+\n",
46 | "| Krish| 31| 10| 30000|\n",
47 | "|Sudhanshu| 30| 8| 25000|\n",
48 | "| Sunny| 29| 4| 20000|\n",
49 | "| Paul| 24| 3| 20000|\n",
50 | "| Harsha| 21| 1| 15000|\n",
51 | "| Shubham| 23| 2| 18000|\n",
52 | "+---------+---+----------+------+\n",
53 | "\n"
54 | ]
55 | }
56 | ],
57 | "source": [
58 | "training.show()\n",
59 | "\n"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 74,
65 | "id": "6b3dd5ff",
66 | "metadata": {},
67 | "outputs": [
68 | {
69 | "name": "stdout",
70 | "output_type": "stream",
71 | "text": [
72 | "root\n",
73 | " |-- Name: string (nullable = true)\n",
74 | " |-- age: integer (nullable = true)\n",
75 | " |-- Experience: integer (nullable = true)\n",
76 | " |-- Salary: integer (nullable = true)\n",
77 | "\n"
78 | ]
79 | }
80 | ],
81 | "source": [
82 | "training.printSchema()"
83 | ]
84 | },
85 | {
86 | "cell_type": "code",
87 | "execution_count": 75,
88 | "id": "5d3227e6",
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "data": {
93 | "text/plain": [
94 | "['Name', 'age', 'Experience', 'Salary']"
95 | ]
96 | },
97 | "execution_count": 75,
98 | "metadata": {},
99 | "output_type": "execute_result"
100 | }
101 | ],
102 | "source": [
103 | "training.columns"
104 | ]
105 | },
106 | {
107 | "cell_type": "code",
108 | "execution_count": null,
109 | "id": "cffef5b9",
110 | "metadata": {},
111 | "outputs": [],
112 | "source": [
113 | "# [age, Experience] ----> new feature ----> independent feature"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 76,
119 | "id": "e6273555",
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "from pyspark.ml.feature import VectorAssembler\n",
124 | "featureassembler=VectorAssembler(inputCols=[\"age\",\"Experience\"],outputCol=\"Independent Features\")"
125 | ]
126 | },
127 | {
128 | "cell_type": "code",
129 | "execution_count": 77,
130 | "id": "0b69744c",
131 | "metadata": {},
132 | "outputs": [],
133 | "source": [
134 | "output=featureassembler.transform(training)"
135 | ]
136 | },
137 | {
138 | "cell_type": "code",
139 | "execution_count": 78,
140 | "id": "60961194",
141 | "metadata": {},
142 | "outputs": [
143 | {
144 | "name": "stdout",
145 | "output_type": "stream",
146 | "text": [
147 | "+---------+---+----------+------+--------------------+\n",
148 | "| Name|age|Experience|Salary|Independent Features|\n",
149 | "+---------+---+----------+------+--------------------+\n",
150 | "| Krish| 31| 10| 30000| [31.0,10.0]|\n",
151 | "|Sudhanshu| 30| 8| 25000| [30.0,8.0]|\n",
152 | "| Sunny| 29| 4| 20000| [29.0,4.0]|\n",
153 | "| Paul| 24| 3| 20000| [24.0,3.0]|\n",
154 | "| Harsha| 21| 1| 15000| [21.0,1.0]|\n",
155 | "| Shubham| 23| 2| 18000| [23.0,2.0]|\n",
156 | "+---------+---+----------+------+--------------------+\n",
157 | "\n"
158 | ]
159 | }
160 | ],
161 | "source": [
162 | "output.show()"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": 79,
168 | "id": "2c27434a",
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/plain": [
174 | "['Name', 'age', 'Experience', 'Salary', 'Independent Features']"
175 | ]
176 | },
177 | "execution_count": 79,
178 | "metadata": {},
179 | "output_type": "execute_result"
180 | }
181 | ],
182 | "source": [
183 | "output.columns"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 80,
189 | "id": "54a0ccab",
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "finalized_data=output.select(\"Independent Features\",\"Salary\")"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 81,
199 | "id": "f7a73845",
200 | "metadata": {},
201 | "outputs": [
202 | {
203 | "name": "stdout",
204 | "output_type": "stream",
205 | "text": [
206 | "+--------------------+------+\n",
207 | "|Independent Features|Salary|\n",
208 | "+--------------------+------+\n",
209 | "| [31.0,10.0]| 30000|\n",
210 | "| [30.0,8.0]| 25000|\n",
211 | "| [29.0,4.0]| 20000|\n",
212 | "| [24.0,3.0]| 20000|\n",
213 | "| [21.0,1.0]| 15000|\n",
214 | "| [23.0,2.0]| 18000|\n",
215 | "+--------------------+------+\n",
216 | "\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "finalized_data.show()"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 82,
227 | "id": "0b11192b",
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "from pyspark.ml.regression import LinearRegression\n",
232 | "##train test split\n",
233 | "train_data,test_data=finalized_data.randomSplit([0.75,0.25])\n",
234 | "regressor=LinearRegression(featuresCol='Independent Features', labelCol='Salary')\n",
235 | "regressor=regressor.fit(train_data)"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 83,
241 | "id": "fa4ec997",
242 | "metadata": {},
243 | "outputs": [
244 | {
245 | "data": {
246 | "text/plain": [
247 | "DenseVector([-5000.0, 7000.0])"
248 | ]
249 | },
250 | "execution_count": 83,
251 | "metadata": {},
252 | "output_type": "execute_result"
253 | }
254 | ],
255 | "source": [
256 | "### Coefficients\n",
257 | "regressor.coefficients"
258 | ]
259 | },
260 | {
261 | "cell_type": "code",
262 | "execution_count": 84,
263 | "id": "eba911b6",
264 | "metadata": {},
265 | "outputs": [
266 | {
267 | "data": {
268 | "text/plain": [
269 | "118999.99999893687"
270 | ]
271 | },
272 | "execution_count": 84,
273 | "metadata": {},
274 | "output_type": "execute_result"
275 | }
276 | ],
277 | "source": [
278 | "### Intercepts\n",
279 | "regressor.intercept"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 85,
285 | "id": "2ba2bc70",
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "### Prediction\n",
290 | "pred_results=regressor.evaluate(test_data)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 86,
296 | "id": "489d6392",
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "name": "stdout",
301 | "output_type": "stream",
302 | "text": [
303 | "+--------------------+------+-----------------+\n",
304 | "|Independent Features|Salary| prediction|\n",
305 | "+--------------------+------+-----------------+\n",
306 | "| [21.0,1.0]| 15000|20999.99999996154|\n",
307 | "| [29.0,4.0]| 20000|2000.000000192551|\n",
308 | "| [31.0,10.0]| 30000|33999.99999993094|\n",
309 | "+--------------------+------+-----------------+\n",
310 | "\n"
311 | ]
312 | }
313 | ],
314 | "source": [
315 | "pred_results.predictions.show()"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 87,
321 | "id": "0534e854",
322 | "metadata": {},
323 | "outputs": [
324 | {
325 | "data": {
326 | "text/plain": [
327 | "(9333.333333233308, 125333333.3306847)"
328 | ]
329 | },
330 | "execution_count": 87,
331 | "metadata": {},
332 | "output_type": "execute_result"
333 | }
334 | ],
335 | "source": [
336 | "pred_results.meanAbsoluteError,pred_results.meanSquaredError"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": null,
342 | "id": "70de559b",
343 | "metadata": {},
344 | "outputs": [],
345 | "source": []
346 | }
347 | ],
348 | "metadata": {
349 | "kernelspec": {
350 | "display_name": "Python 3",
351 | "language": "python",
352 | "name": "python3"
353 | },
354 | "language_info": {
355 | "codemirror_mode": {
356 | "name": "ipython",
357 | "version": 3
358 | },
359 | "file_extension": ".py",
360 | "mimetype": "text/x-python",
361 | "name": "python",
362 | "nbconvert_exporter": "python",
363 | "pygments_lexer": "ipython3",
364 | "version": "3.7.10"
365 | }
366 | },
367 | "nbformat": 4,
368 | "nbformat_minor": 5
369 | }
370 |
--------------------------------------------------------------------------------
/Tutorial 8-Linear Regression With Pyspark.ipynb:
--------------------------------------------------------------------------------
1 | {"cells":[{"cell_type":"markdown","source":["## Overview\n\nThis notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.\n\nThis notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported."],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"96816ed7-b08a-4ca3-abb9-f99880c3535d"}}},{"cell_type":"code","source":["# File location and type\nfile_location = \"/FileStore/tables/tips.csv\"\nfile_type = \"csv\"\n\n# The applied options are for CSV files. For other file types, these will be ignored.\ndf =spark.read.csv(file_location,header=True,inferSchema=True)\ndf.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6482be4c-f067-47c9-b0ac-35c938b94601"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 
3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+\ntotal_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n 16.99|1.01|Female| No|Sun|Dinner| 2|\n 10.34|1.66| Male| No|Sun|Dinner| 3|\n 21.01| 3.5| Male| No|Sun|Dinner| 3|\n 23.68|3.31| Male| No|Sun|Dinner| 2|\n 24.59|3.61|Female| No|Sun|Dinner| 4|\n 25.29|4.71| Male| No|Sun|Dinner| 4|\n 8.77| 2.0| Male| No|Sun|Dinner| 2|\n 26.88|3.12| Male| No|Sun|Dinner| 4|\n 15.04|1.96| Male| No|Sun|Dinner| 2|\n 14.78|3.23| Male| No|Sun|Dinner| 2|\n 10.27|1.71| Male| No|Sun|Dinner| 2|\n 35.26| 5.0|Female| No|Sun|Dinner| 4|\n 15.42|1.57| Male| No|Sun|Dinner| 2|\n 18.43| 3.0| Male| No|Sun|Dinner| 4|\n 14.83|3.02|Female| No|Sun|Dinner| 2|\n 21.58|3.92| Male| No|Sun|Dinner| 2|\n 10.33|1.67|Female| No|Sun|Dinner| 3|\n 16.29|3.71| Male| No|Sun|Dinner| 3|\n 16.97| 3.5|Female| No|Sun|Dinner| 3|\n 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df.printSchema()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"5e5b80f2-3426-44e1-b86e-171314f4827e"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"root\n |-- total_bill: double (nullable = true)\n |-- tip: double (nullable = true)\n |-- sex: string (nullable = true)\n |-- smoker: string (nullable = true)\n |-- day: string (nullable = true)\n |-- time: string (nullable = true)\n |-- size: integer (nullable = true)\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nroot\n-- total_bill: double (nullable = true)\n-- tip: double (nullable = true)\n-- sex: string (nullable = true)\n-- smoker: string (nullable = true)\n-- day: string (nullable = true)\n-- time: string (nullable = true)\n-- size: integer (nullable = true)\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0432b71c-b266-417d-b0d5-1c17afa0f090"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[3]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[3]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']
"]}}],"execution_count":0},{"cell_type":"code","source":["### Handling Categorical Features\nfrom pyspark.ml.feature import StringIndexer"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0ae62ac1-81a6-4b1d-92b9-f85ec9cc93ff"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["df.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"faa6f9b0-6f8b-4dbd-a5a2-dc074181f2e3"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+\ntotal_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n 16.99|1.01|Female| No|Sun|Dinner| 2|\n 10.34|1.66| Male| No|Sun|Dinner| 3|\n 21.01| 3.5| Male| No|Sun|Dinner| 3|\n 23.68|3.31| Male| No|Sun|Dinner| 2|\n 24.59|3.61|Female| No|Sun|Dinner| 4|\n 25.29|4.71| Male| No|Sun|Dinner| 4|\n 8.77| 2.0| Male| No|Sun|Dinner| 2|\n 26.88|3.12| Male| No|Sun|Dinner| 4|\n 15.04|1.96| Male| No|Sun|Dinner| 2|\n 14.78|3.23| Male| No|Sun|Dinner| 2|\n 10.27|1.71| Male| No|Sun|Dinner| 2|\n 35.26| 5.0|Female| No|Sun|Dinner| 4|\n 15.42|1.57| Male| No|Sun|Dinner| 2|\n 18.43| 3.0| Male| No|Sun|Dinner| 4|\n 14.83|3.02|Female| No|Sun|Dinner| 2|\n 21.58|3.92| Male| No|Sun|Dinner| 2|\n 10.33|1.67|Female| No|Sun|Dinner| 3|\n 16.29|3.71| Male| No|Sun|Dinner| 3|\n 16.97| 3.5|Female| No|Sun|Dinner| 3|\n 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["indexer=StringIndexer(inputCol=\"sex\",outputCol=\"sex_indexed\")\ndf_r=indexer.fit(df).transform(df)\ndf_r.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2ee7ab64-9804-4afb-852c-ee02eb5d3a20"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+-----------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|\n+----------+----+------+------+---+------+----+-----------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0|\n| 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0|\n| 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0|\n+----------+----+------+------+---+------+----+-----------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+-----------+\ntotal_bill| tip| sex|smoker|day| time|size|sex_indexed|\n+----------+----+------+------+---+------+----+-----------+\n 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0|\n 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0|\n 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0|\n 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0|\n 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0|\n 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0|\n 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0|\n 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0|\n 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0|\n 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0|\n 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0|\n 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0|\n 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0|\n 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0|\n 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0|\n 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0|\n 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0|\n 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0|\n 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0|\n 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0|\n+----------+----+------+------+---+------+----+-----------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["indexer=StringIndexer(inputCols=[\"smoker\",\"day\",\"time\"],outputCols=[\"smoker_indexed\",\"day_indexed\",\n \"time_index\"])\ndf_r=indexer.fit(df_r).transform(df_r)\ndf_r.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"6b95d734-4c80-4762-bd9b-92b6a107dced"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 
0.0|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\ntotal_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\n 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["df_r.columns"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"a9909b0b-caee-4838-b477-47c3701dbfd4"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[9]: ['total_bill',\n 'tip',\n 'sex',\n 'smoker',\n 'day',\n 'time',\n 'size',\n 'sex_indexed',\n 'smoker_indexed',\n 'day_indexed',\n 'time_index']
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[9]: ['total_bill',\n 'tip',\n 'sex',\n 'smoker',\n 'day',\n 'time',\n 'size',\n 'sex_indexed',\n 'smoker_indexed',\n 'day_indexed',\n 'time_index']
"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.ml.feature import VectorAssembler\nfeatureassembler=VectorAssembler(inputCols=['tip','size','sex_indexed','smoker_indexed','day_indexed',\n 'time_index'],outputCol=\"Independent Features\")\noutput=featureassembler.transform(df_r)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"61d875e5-71fa-4dc4-ae90-54924b00a632"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["output.select('Independent Features').show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d33d1178-95a2-468f-a94a-e0eebc67be86"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------------+\n|Independent Features|\n+--------------------+\n|[1.01,2.0,1.0,0.0...|\n|[1.66,3.0,0.0,0.0...|\n|[3.5,3.0,0.0,0.0,...|\n|[3.31,2.0,0.0,0.0...|\n|[3.61,4.0,1.0,0.0...|\n|[4.71,4.0,0.0,0.0...|\n|[2.0,2.0,0.0,0.0,...|\n|[3.12,4.0,0.0,0.0...|\n|[1.96,2.0,0.0,0.0...|\n|[3.23,2.0,0.0,0.0...|\n|[1.71,2.0,0.0,0.0...|\n|[5.0,4.0,1.0,0.0,...|\n|[1.57,2.0,0.0,0.0...|\n|[3.0,4.0,0.0,0.0,...|\n|[3.02,2.0,1.0,0.0...|\n|[3.92,2.0,0.0,0.0...|\n|[1.67,3.0,1.0,0.0...|\n|[3.71,3.0,0.0,0.0...|\n|[3.5,3.0,1.0,0.0,...|\n|(6,[0,1],[3.35,3.0])|\n+--------------------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+--------------------+\nIndependent Features|\n+--------------------+\n[1.01,2.0,1.0,0.0...|\n[1.66,3.0,0.0,0.0...|\n[3.5,3.0,0.0,0.0,...|\n[3.31,2.0,0.0,0.0...|\n[3.61,4.0,1.0,0.0...|\n[4.71,4.0,0.0,0.0...|\n[2.0,2.0,0.0,0.0,...|\n[3.12,4.0,0.0,0.0...|\n[1.96,2.0,0.0,0.0...|\n[3.23,2.0,0.0,0.0...|\n[1.71,2.0,0.0,0.0...|\n[5.0,4.0,1.0,0.0,...|\n[1.57,2.0,0.0,0.0...|\n[3.0,4.0,0.0,0.0,...|\n[3.02,2.0,1.0,0.0...|\n[3.92,2.0,0.0,0.0...|\n[1.67,3.0,1.0,0.0...|\n[3.71,3.0,0.0,0.0...|\n[3.5,3.0,1.0,0.0,...|\n(6,[0,1],[3.35,3.0])|\n+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["output.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"f2646b66-7710-4297-a6e1-156a37e6582d"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|Independent Features|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[1.01,2.0,1.0,0.0...|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[1.66,3.0,0.0,0.0...|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.5,3.0,0.0,0.0,...|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.31,2.0,0.0,0.0...|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[3.61,4.0,1.0,0.0...|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[4.71,4.0,0.0,0.0...|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[2.0,2.0,0.0,0.0,...|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.12,4.0,0.0,0.0...|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.96,2.0,0.0,0.0...|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.23,2.0,0.0,0.0...|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.71,2.0,0.0,0.0...|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[5.0,4.0,1.0,0.0,...|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.57,2.0,0.0,0.0...|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.0,4.0,0.0,0.0,...|\n| 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[3.02,2.0,1.0,0.0...|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.92,2.0,0.0,0.0...|\n| 10.33|1.67|Female| 
No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|[1.67,3.0,1.0,0.0...|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.71,3.0,0.0,0.0...|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|[3.5,3.0,1.0,0.0,...|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|(6,[0,1],[3.35,3.0])|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\ntotal_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_index|Independent Features|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\n 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[1.01,2.0,1.0,0.0...|\n 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[1.66,3.0,0.0,0.0...|\n 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.5,3.0,0.0,0.0,...|\n 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.31,2.0,0.0,0.0...|\n 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[3.61,4.0,1.0,0.0...|\n 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[4.71,4.0,0.0,0.0...|\n 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[2.0,2.0,0.0,0.0,...|\n 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.12,4.0,0.0,0.0...|\n 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.96,2.0,0.0,0.0...|\n 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.23,2.0,0.0,0.0...|\n 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.71,2.0,0.0,0.0...|\n 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|[5.0,4.0,1.0,0.0,...|\n 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[1.57,2.0,0.0,0.0...|\n 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|[3.0,4.0,0.0,0.0,...|\n 14.83|3.02|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|[3.02,2.0,1.0,0.0...|\n 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|[3.92,2.0,0.0,0.0...|\n 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|[1.67,3.0,1.0,0.0...|\n 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|[3.71,3.0,0.0,0.0...|\n 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 
0.0|[3.5,3.0,1.0,0.0,...|\n 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|(6,[0,1],[3.35,3.0])|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+----------+--------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["finalized_data=output.select(\"Independent Features\",\"total_bill\")"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"d1c1fa4c-c78a-441a-bed9-3bcfcc5af966"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["finalized_data.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"3d14fe7b-bc59-4376-8139-142283af09b0"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------------+----------+\n|Independent Features|total_bill|\n+--------------------+----------+\n|[1.01,2.0,1.0,0.0...| 16.99|\n|[1.66,3.0,0.0,0.0...| 10.34|\n|[3.5,3.0,0.0,0.0,...| 21.01|\n|[3.31,2.0,0.0,0.0...| 23.68|\n|[3.61,4.0,1.0,0.0...| 24.59|\n|[4.71,4.0,0.0,0.0...| 25.29|\n|[2.0,2.0,0.0,0.0,...| 8.77|\n|[3.12,4.0,0.0,0.0...| 26.88|\n|[1.96,2.0,0.0,0.0...| 15.04|\n|[3.23,2.0,0.0,0.0...| 14.78|\n|[1.71,2.0,0.0,0.0...| 10.27|\n|[5.0,4.0,1.0,0.0,...| 35.26|\n|[1.57,2.0,0.0,0.0...| 15.42|\n|[3.0,4.0,0.0,0.0,...| 18.43|\n|[3.02,2.0,1.0,0.0...| 14.83|\n|[3.92,2.0,0.0,0.0...| 21.58|\n|[1.67,3.0,1.0,0.0...| 10.33|\n|[3.71,3.0,0.0,0.0...| 16.29|\n|[3.5,3.0,1.0,0.0,...| 16.97|\n|(6,[0,1],[3.35,3.0])| 20.65|\n+--------------------+----------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+--------------------+----------+\nIndependent Features|total_bill|\n+--------------------+----------+\n[1.01,2.0,1.0,0.0...| 16.99|\n[1.66,3.0,0.0,0.0...| 10.34|\n[3.5,3.0,0.0,0.0,...| 21.01|\n[3.31,2.0,0.0,0.0...| 23.68|\n[3.61,4.0,1.0,0.0...| 24.59|\n[4.71,4.0,0.0,0.0...| 25.29|\n[2.0,2.0,0.0,0.0,...| 8.77|\n[3.12,4.0,0.0,0.0...| 26.88|\n[1.96,2.0,0.0,0.0...| 15.04|\n[3.23,2.0,0.0,0.0...| 14.78|\n[1.71,2.0,0.0,0.0...| 10.27|\n[5.0,4.0,1.0,0.0,...| 35.26|\n[1.57,2.0,0.0,0.0...| 15.42|\n[3.0,4.0,0.0,0.0,...| 18.43|\n[3.02,2.0,1.0,0.0...| 14.83|\n[3.92,2.0,0.0,0.0...| 21.58|\n[1.67,3.0,1.0,0.0...| 10.33|\n[3.71,3.0,0.0,0.0...| 16.29|\n[3.5,3.0,1.0,0.0,...| 16.97|\n(6,[0,1],[3.35,3.0])| 20.65|\n+--------------------+----------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["from pyspark.ml.regression import LinearRegression\n##train test split\ntrain_data,test_data=finalized_data.randomSplit([0.75,0.25])\nregressor=LinearRegression(featuresCol='Independent Features', labelCol='total_bill')\nregressor=regressor.fit(train_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"dbe03a38-e728-40f9-8a53-0b7968b8dc87"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["regressor.coefficients"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"0fdc835a-96fb-4ab3-89be-6cfbc57c7ac6"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[17]: DenseVector([3.3598, 3.3861, -0.6641, 2.5847, -0.1423, -1.3377])
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[17]: DenseVector([3.3598, 3.3861, -0.6641, 2.5847, -0.1423, -1.3377])
"]}}],"execution_count":0},{"cell_type":"code","source":["regressor.intercept"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"fd1642d4-bb73-4fc0-a410-ada61d0f3410"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[18]: 0.9231025978363154
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[18]: 0.9231025978363154
"]}}],"execution_count":0},{"cell_type":"code","source":["### Predictions\npred_results=regressor.evaluate(test_data)"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"2e45a3d8-af1c-408b-b64f-fe466c3401bd"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n"]}}],"execution_count":0},{"cell_type":"code","source":["## Final comparison\npred_results.predictions.show()"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"01d128d2-1a71-44d0-a14c-b0377693547b"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"+--------------------+----------+------------------+\n|Independent Features|total_bill| prediction|\n+--------------------+----------+------------------+\n| (6,[0,1],[2.0,2.0])| 12.69|14.414877568922382|\n|(6,[0,1],[3.35,3.0])| 20.65|22.336705086951124|\n|[1.0,1.0,1.0,0.0,...| 7.25| 7.004851678101628|\n|[1.17,2.0,0.0,1.0...| 32.83|14.210940490994291|\n|[1.36,3.0,1.0,0.0...| 18.64|13.364280305420156|\n|[1.5,2.0,0.0,1.0,...| 11.59|15.319683950195104|\n|[1.58,2.0,0.0,1.0...| 13.42| 13.82395853728497|\n|[1.66,3.0,0.0,0.0...| 10.34|16.516310272733463|\n|[1.73,2.0,0.0,0.0...| 9.78| 11.88549649517034|\n|[2.0,2.0,0.0,0.0,...| 13.81| 14.27259319727858|\n|[2.0,2.0,0.0,0.0,...| 13.03| 12.79265023451646|\n|[2.0,2.0,1.0,0.0,...| 14.15|12.128511829238738|\n|[2.0,2.0,1.0,0.0,...| 14.52|12.128511829238738|\n|[2.0,2.0,1.0,1.0,...| 10.63|16.335459877039824|\n|[2.0,2.0,1.0,1.0,...| 27.18|16.335459877039824|\n|[2.0,3.0,1.0,0.0,...| 16.21|16.994513613299002|\n|[2.23,2.0,1.0,1.0...| 12.76| 17.10822046981615|\n|[2.24,2.0,0.0,0.0...| 20.76|15.078952076697353|\n|[2.31,2.0,0.0,0.0...| 
11.69|13.834197120432375|\n|[2.5,2.0,0.0,0.0,...| 14.07|15.952507529401023|\n+--------------------+----------+------------------+\nonly showing top 20 rows\n\n
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\n+--------------------+----------+------------------+\nIndependent Features|total_bill| prediction|\n+--------------------+----------+------------------+\n (6,[0,1],[2.0,2.0])| 12.69|14.414877568922382|\n(6,[0,1],[3.35,3.0])| 20.65|22.336705086951124|\n[1.0,1.0,1.0,0.0,...| 7.25| 7.004851678101628|\n[1.17,2.0,0.0,1.0...| 32.83|14.210940490994291|\n[1.36,3.0,1.0,0.0...| 18.64|13.364280305420156|\n[1.5,2.0,0.0,1.0,...| 11.59|15.319683950195104|\n[1.58,2.0,0.0,1.0...| 13.42| 13.82395853728497|\n[1.66,3.0,0.0,0.0...| 10.34|16.516310272733463|\n[1.73,2.0,0.0,0.0...| 9.78| 11.88549649517034|\n[2.0,2.0,0.0,0.0,...| 13.81| 14.27259319727858|\n[2.0,2.0,0.0,0.0,...| 13.03| 12.79265023451646|\n[2.0,2.0,1.0,0.0,...| 14.15|12.128511829238738|\n[2.0,2.0,1.0,0.0,...| 14.52|12.128511829238738|\n[2.0,2.0,1.0,1.0,...| 10.63|16.335459877039824|\n[2.0,2.0,1.0,1.0,...| 27.18|16.335459877039824|\n[2.0,3.0,1.0,0.0,...| 16.21|16.994513613299002|\n[2.23,2.0,1.0,1.0...| 12.76| 17.10822046981615|\n[2.24,2.0,0.0,0.0...| 20.76|15.078952076697353|\n[2.31,2.0,0.0,0.0...| 11.69|13.834197120432375|\n[2.5,2.0,0.0,0.0,...| 14.07|15.952507529401023|\n+--------------------+----------+------------------+\nonly showing top 20 rows\n\n
"]}}],"execution_count":0},{"cell_type":"code","source":["### Performance Metrics\npred_results.r2,pred_results.meanAbsoluteError,pred_results.meanSquaredError"],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"75e3e5b1-0bb4-4dbe-a1ca-08e5a31ee173"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"datasetInfos":[],"data":"Out[25]: (0.40050077944613716, 4.809771114444798, 40.934088106916576)
","removedWidgets":[],"addedWidgets":{},"metadata":{},"type":"html","arguments":{}}},"output_type":"display_data","data":{"text/html":["\nOut[25]: (0.40050077944613716, 4.809771114444798, 40.934088106916576)
"]}}],"execution_count":0},{"cell_type":"code","source":[""],"metadata":{"application/vnd.databricks.v1+cell":{"title":"","showTitle":false,"inputWidgets":{},"nuid":"ce0398a7-7ebd-4f2c-b12e-2ef701925124"}},"outputs":[{"output_type":"display_data","metadata":{"application/vnd.databricks.v1+output":{"data":"","errorSummary":"","metadata":{},"type":"ipynbError","arguments":{}}},"output_type":"display_data","data":{"text/html":[""]}}],"execution_count":0}],"metadata":{"application/vnd.databricks.v1+notebook":{"notebookName":"Tutorial 8-Linear Regression With Pyspark","dashboards":[],"notebookMetadata":{"pythonIndentUnit":2},"language":"python","widgets":{},"notebookOrigID":523045182520803}},"nbformat":4,"nbformat_minor":0}
2 |
--------------------------------------------------------------------------------
/pyspark basic introduction.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "id": "8ff41f5d",
6 | "metadata": {},
7 | "source": [
8 | "#### PySpark Basic Introduction"
9 | ]
10 | },
11 | {
12 | "cell_type": "code",
13 | "execution_count": 1,
14 | "id": "4abcaad4",
15 | "metadata": {},
16 | "outputs": [
17 | {
18 | "name": "stdout",
19 | "output_type": "stream",
20 | "text": [
21 | "Requirement already satisfied: pyspark in c:\\users\\win10\\anaconda3\\envs\\myenv\\lib\\site-packages (3.1.1)\n",
22 | "Requirement already satisfied: py4j==0.10.9 in c:\\users\\win10\\anaconda3\\envs\\myenv\\lib\\site-packages (from pyspark) (0.10.9)\n"
23 | ]
24 | }
25 | ],
26 | "source": [
27 | "!pip install pyspark"
28 | ]
29 | },
30 | {
31 | "cell_type": "code",
32 | "execution_count": 2,
33 | "id": "08b29f6a",
34 | "metadata": {},
35 | "outputs": [],
36 | "source": [
37 | "import pyspark"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 15,
43 | "id": "e1de79e4",
44 | "metadata": {},
45 | "outputs": [
46 | {
47 | "data": {
48 | "text/plain": [
49 | "pandas.core.frame.DataFrame"
50 | ]
51 | },
52 | "execution_count": 15,
53 | "metadata": {},
54 | "output_type": "execute_result"
55 | }
56 | ],
57 | "source": [
58 | "import pandas as pd\n",
59 | "type(pd.read_csv('test1.csv'))"
60 | ]
61 | },
62 | {
63 | "cell_type": "code",
64 | "execution_count": 4,
65 | "id": "37a82e23",
66 | "metadata": {},
67 | "outputs": [],
68 | "source": [
69 | "from pyspark.sql import SparkSession"
70 | ]
71 | },
72 | {
73 | "cell_type": "code",
74 | "execution_count": 5,
75 | "id": "c334b45e",
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "spark=SparkSession.builder.appName('Practise').getOrCreate()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 6,
85 | "id": "558caca5",
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/html": [
91 | "\n",
92 | " \n",
93 | "
SparkSession - in-memory
\n",
94 | " \n",
95 | "
\n",
96 | "
SparkContext
\n",
97 | "\n",
98 | "
Spark UI
\n",
99 | "\n",
100 | "
\n",
101 | " - Version
\n",
102 | " v3.1.1
\n",
103 | " - Master
\n",
104 | " local[*]
\n",
105 | " - AppName
\n",
106 | " Practise
\n",
107 | "
\n",
108 | "
\n",
109 | " \n",
110 | "
\n",
111 | " "
112 | ],
113 | "text/plain": [
114 | ""
115 | ]
116 | },
117 | "execution_count": 6,
118 | "metadata": {},
119 | "output_type": "execute_result"
120 | }
121 | ],
122 | "source": [
123 | "spark"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 8,
129 | "id": "f7ac726b",
130 | "metadata": {},
131 | "outputs": [],
132 | "source": [
133 | "df_pyspark=spark.read.csv('test1.csv')"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 13,
139 | "id": "6f077d49",
140 | "metadata": {},
141 | "outputs": [],
142 | "source": [
143 | "df_pyspark=spark.read.option('header','true').csv('test1.csv')"
144 | ]
145 | },
146 | {
147 | "cell_type": "code",
148 | "execution_count": 14,
149 | "id": "2e0eee51",
150 | "metadata": {},
151 | "outputs": [
152 | {
153 | "data": {
154 | "text/plain": [
155 | "pyspark.sql.dataframe.DataFrame"
156 | ]
157 | },
158 | "execution_count": 14,
159 | "metadata": {},
160 | "output_type": "execute_result"
161 | }
162 | ],
163 | "source": [
164 | "type(df_pyspark)"
165 | ]
166 | },
167 | {
168 | "cell_type": "code",
169 | "execution_count": 18,
170 | "id": "bb26bf24",
171 | "metadata": {},
172 | "outputs": [
173 | {
174 | "name": "stdout",
175 | "output_type": "stream",
176 | "text": [
177 | "root\n",
178 | " |-- Name: string (nullable = true)\n",
179 | " |-- age: string (nullable = true)\n",
180 | "\n"
181 | ]
182 | }
183 | ],
184 | "source": [
185 | "df_pyspark.printSchema()"
186 | ]
187 | },
188 | {
189 | "cell_type": "code",
190 | "execution_count": null,
191 | "id": "17a81cc1",
192 | "metadata": {},
193 | "outputs": [],
194 | "source": []
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": null,
199 | "id": "45996ebc",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": []
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": null,
207 | "id": "2cb5eda4",
208 | "metadata": {},
209 | "outputs": [],
210 | "source": []
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": null,
215 | "id": "60d8d7b6",
216 | "metadata": {},
217 | "outputs": [],
218 | "source": []
219 | },
220 | {
221 | "cell_type": "code",
222 | "execution_count": null,
223 | "id": "e9c3ce8e",
224 | "metadata": {},
225 | "outputs": [],
226 | "source": []
227 | }
228 | ],
229 | "metadata": {
230 | "kernelspec": {
231 | "display_name": "Python 3",
232 | "language": "python",
233 | "name": "python3"
234 | },
235 | "language_info": {
236 | "codemirror_mode": {
237 | "name": "ipython",
238 | "version": 3
239 | },
240 | "file_extension": ".py",
241 | "mimetype": "text/x-python",
242 | "name": "python",
243 | "nbconvert_exporter": "python",
244 | "pygments_lexer": "ipython3",
245 | "version": "3.7.10"
246 | }
247 | },
248 | "nbformat": 4,
249 | "nbformat_minor": 5
250 | }
251 |
--------------------------------------------------------------------------------
/test1.csv:
--------------------------------------------------------------------------------
1 | Name,age,Experience,Salary
2 | Krish,31,10,30000
3 | Sudhanshu,30,8,25000
4 | Sunny,29,4,20000
5 | Paul,24,3,20000
6 | Harsha,21,1,15000
7 | Shubham,23,2,18000
8 |
--------------------------------------------------------------------------------
/test2.csv:
--------------------------------------------------------------------------------
1 | Name,age,Experience,Salary
2 | Krish,31,10,30000
3 | Sudhanshu,30,8,25000
4 | Sunny,29,4,20000
5 | Paul,24,3,20000
6 | Harsha,21,1,15000
7 | Shubham,23,2,18000
8 | Mahesh,,,40000
9 | ,34,10,38000
10 | ,36,,
11 |
--------------------------------------------------------------------------------
/test3.csv:
--------------------------------------------------------------------------------
1 | Name,Departments,salary
2 | Krish,Data Science,10000
3 | Krish,IOT,5000
4 | Mahesh,Big Data,4000
5 | Krish,Big Data,4000
6 | Mahesh,Data Science,3000
7 | Sudhanshu,Data Science,20000
8 | Sudhanshu,IOT,10000
9 | Sudhanshu,Big Data,5000
10 | Sunny,Data Science,10000
11 | Sunny,Big Data,2000
12 |
--------------------------------------------------------------------------------
/tips.csv:
--------------------------------------------------------------------------------
1 | total_bill,tip,sex,smoker,day,time,size
2 | 16.99,1.01,Female,No,Sun,Dinner,2
3 | 10.34,1.66,Male,No,Sun,Dinner,3
4 | 21.01,3.5,Male,No,Sun,Dinner,3
5 | 23.68,3.31,Male,No,Sun,Dinner,2
6 | 24.59,3.61,Female,No,Sun,Dinner,4
7 | 25.29,4.71,Male,No,Sun,Dinner,4
8 | 8.77,2.0,Male,No,Sun,Dinner,2
9 | 26.88,3.12,Male,No,Sun,Dinner,4
10 | 15.04,1.96,Male,No,Sun,Dinner,2
11 | 14.78,3.23,Male,No,Sun,Dinner,2
12 | 10.27,1.71,Male,No,Sun,Dinner,2
13 | 35.26,5.0,Female,No,Sun,Dinner,4
14 | 15.42,1.57,Male,No,Sun,Dinner,2
15 | 18.43,3.0,Male,No,Sun,Dinner,4
16 | 14.83,3.02,Female,No,Sun,Dinner,2
17 | 21.58,3.92,Male,No,Sun,Dinner,2
18 | 10.33,1.67,Female,No,Sun,Dinner,3
19 | 16.29,3.71,Male,No,Sun,Dinner,3
20 | 16.97,3.5,Female,No,Sun,Dinner,3
21 | 20.65,3.35,Male,No,Sat,Dinner,3
22 | 17.92,4.08,Male,No,Sat,Dinner,2
23 | 20.29,2.75,Female,No,Sat,Dinner,2
24 | 15.77,2.23,Female,No,Sat,Dinner,2
25 | 39.42,7.58,Male,No,Sat,Dinner,4
26 | 19.82,3.18,Male,No,Sat,Dinner,2
27 | 17.81,2.34,Male,No,Sat,Dinner,4
28 | 13.37,2.0,Male,No,Sat,Dinner,2
29 | 12.69,2.0,Male,No,Sat,Dinner,2
30 | 21.7,4.3,Male,No,Sat,Dinner,2
31 | 19.65,3.0,Female,No,Sat,Dinner,2
32 | 9.55,1.45,Male,No,Sat,Dinner,2
33 | 18.35,2.5,Male,No,Sat,Dinner,4
34 | 15.06,3.0,Female,No,Sat,Dinner,2
35 | 20.69,2.45,Female,No,Sat,Dinner,4
36 | 17.78,3.27,Male,No,Sat,Dinner,2
37 | 24.06,3.6,Male,No,Sat,Dinner,3
38 | 16.31,2.0,Male,No,Sat,Dinner,3
39 | 16.93,3.07,Female,No,Sat,Dinner,3
40 | 18.69,2.31,Male,No,Sat,Dinner,3
41 | 31.27,5.0,Male,No,Sat,Dinner,3
42 | 16.04,2.24,Male,No,Sat,Dinner,3
43 | 17.46,2.54,Male,No,Sun,Dinner,2
44 | 13.94,3.06,Male,No,Sun,Dinner,2
45 | 9.68,1.32,Male,No,Sun,Dinner,2
46 | 30.4,5.6,Male,No,Sun,Dinner,4
47 | 18.29,3.0,Male,No,Sun,Dinner,2
48 | 22.23,5.0,Male,No,Sun,Dinner,2
49 | 32.4,6.0,Male,No,Sun,Dinner,4
50 | 28.55,2.05,Male,No,Sun,Dinner,3
51 | 18.04,3.0,Male,No,Sun,Dinner,2
52 | 12.54,2.5,Male,No,Sun,Dinner,2
53 | 10.29,2.6,Female,No,Sun,Dinner,2
54 | 34.81,5.2,Female,No,Sun,Dinner,4
55 | 9.94,1.56,Male,No,Sun,Dinner,2
56 | 25.56,4.34,Male,No,Sun,Dinner,4
57 | 19.49,3.51,Male,No,Sun,Dinner,2
58 | 38.01,3.0,Male,Yes,Sat,Dinner,4
59 | 26.41,1.5,Female,No,Sat,Dinner,2
60 | 11.24,1.76,Male,Yes,Sat,Dinner,2
61 | 48.27,6.73,Male,No,Sat,Dinner,4
62 | 20.29,3.21,Male,Yes,Sat,Dinner,2
63 | 13.81,2.0,Male,Yes,Sat,Dinner,2
64 | 11.02,1.98,Male,Yes,Sat,Dinner,2
65 | 18.29,3.76,Male,Yes,Sat,Dinner,4
66 | 17.59,2.64,Male,No,Sat,Dinner,3
67 | 20.08,3.15,Male,No,Sat,Dinner,3
68 | 16.45,2.47,Female,No,Sat,Dinner,2
69 | 3.07,1.0,Female,Yes,Sat,Dinner,1
70 | 20.23,2.01,Male,No,Sat,Dinner,2
71 | 15.01,2.09,Male,Yes,Sat,Dinner,2
72 | 12.02,1.97,Male,No,Sat,Dinner,2
73 | 17.07,3.0,Female,No,Sat,Dinner,3
74 | 26.86,3.14,Female,Yes,Sat,Dinner,2
75 | 25.28,5.0,Female,Yes,Sat,Dinner,2
76 | 14.73,2.2,Female,No,Sat,Dinner,2
77 | 10.51,1.25,Male,No,Sat,Dinner,2
78 | 17.92,3.08,Male,Yes,Sat,Dinner,2
79 | 27.2,4.0,Male,No,Thur,Lunch,4
80 | 22.76,3.0,Male,No,Thur,Lunch,2
81 | 17.29,2.71,Male,No,Thur,Lunch,2
82 | 19.44,3.0,Male,Yes,Thur,Lunch,2
83 | 16.66,3.4,Male,No,Thur,Lunch,2
84 | 10.07,1.83,Female,No,Thur,Lunch,1
85 | 32.68,5.0,Male,Yes,Thur,Lunch,2
86 | 15.98,2.03,Male,No,Thur,Lunch,2
87 | 34.83,5.17,Female,No,Thur,Lunch,4
88 | 13.03,2.0,Male,No,Thur,Lunch,2
89 | 18.28,4.0,Male,No,Thur,Lunch,2
90 | 24.71,5.85,Male,No,Thur,Lunch,2
91 | 21.16,3.0,Male,No,Thur,Lunch,2
92 | 28.97,3.0,Male,Yes,Fri,Dinner,2
93 | 22.49,3.5,Male,No,Fri,Dinner,2
94 | 5.75,1.0,Female,Yes,Fri,Dinner,2
95 | 16.32,4.3,Female,Yes,Fri,Dinner,2
96 | 22.75,3.25,Female,No,Fri,Dinner,2
97 | 40.17,4.73,Male,Yes,Fri,Dinner,4
98 | 27.28,4.0,Male,Yes,Fri,Dinner,2
99 | 12.03,1.5,Male,Yes,Fri,Dinner,2
100 | 21.01,3.0,Male,Yes,Fri,Dinner,2
101 | 12.46,1.5,Male,No,Fri,Dinner,2
102 | 11.35,2.5,Female,Yes,Fri,Dinner,2
103 | 15.38,3.0,Female,Yes,Fri,Dinner,2
104 | 44.3,2.5,Female,Yes,Sat,Dinner,3
105 | 22.42,3.48,Female,Yes,Sat,Dinner,2
106 | 20.92,4.08,Female,No,Sat,Dinner,2
107 | 15.36,1.64,Male,Yes,Sat,Dinner,2
108 | 20.49,4.06,Male,Yes,Sat,Dinner,2
109 | 25.21,4.29,Male,Yes,Sat,Dinner,2
110 | 18.24,3.76,Male,No,Sat,Dinner,2
111 | 14.31,4.0,Female,Yes,Sat,Dinner,2
112 | 14.0,3.0,Male,No,Sat,Dinner,2
113 | 7.25,1.0,Female,No,Sat,Dinner,1
114 | 38.07,4.0,Male,No,Sun,Dinner,3
115 | 23.95,2.55,Male,No,Sun,Dinner,2
116 | 25.71,4.0,Female,No,Sun,Dinner,3
117 | 17.31,3.5,Female,No,Sun,Dinner,2
118 | 29.93,5.07,Male,No,Sun,Dinner,4
119 | 10.65,1.5,Female,No,Thur,Lunch,2
120 | 12.43,1.8,Female,No,Thur,Lunch,2
121 | 24.08,2.92,Female,No,Thur,Lunch,4
122 | 11.69,2.31,Male,No,Thur,Lunch,2
123 | 13.42,1.68,Female,No,Thur,Lunch,2
124 | 14.26,2.5,Male,No,Thur,Lunch,2
125 | 15.95,2.0,Male,No,Thur,Lunch,2
126 | 12.48,2.52,Female,No,Thur,Lunch,2
127 | 29.8,4.2,Female,No,Thur,Lunch,6
128 | 8.52,1.48,Male,No,Thur,Lunch,2
129 | 14.52,2.0,Female,No,Thur,Lunch,2
130 | 11.38,2.0,Female,No,Thur,Lunch,2
131 | 22.82,2.18,Male,No,Thur,Lunch,3
132 | 19.08,1.5,Male,No,Thur,Lunch,2
133 | 20.27,2.83,Female,No,Thur,Lunch,2
134 | 11.17,1.5,Female,No,Thur,Lunch,2
135 | 12.26,2.0,Female,No,Thur,Lunch,2
136 | 18.26,3.25,Female,No,Thur,Lunch,2
137 | 8.51,1.25,Female,No,Thur,Lunch,2
138 | 10.33,2.0,Female,No,Thur,Lunch,2
139 | 14.15,2.0,Female,No,Thur,Lunch,2
140 | 16.0,2.0,Male,Yes,Thur,Lunch,2
141 | 13.16,2.75,Female,No,Thur,Lunch,2
142 | 17.47,3.5,Female,No,Thur,Lunch,2
143 | 34.3,6.7,Male,No,Thur,Lunch,6
144 | 41.19,5.0,Male,No,Thur,Lunch,5
145 | 27.05,5.0,Female,No,Thur,Lunch,6
146 | 16.43,2.3,Female,No,Thur,Lunch,2
147 | 8.35,1.5,Female,No,Thur,Lunch,2
148 | 18.64,1.36,Female,No,Thur,Lunch,3
149 | 11.87,1.63,Female,No,Thur,Lunch,2
150 | 9.78,1.73,Male,No,Thur,Lunch,2
151 | 7.51,2.0,Male,No,Thur,Lunch,2
152 | 14.07,2.5,Male,No,Sun,Dinner,2
153 | 13.13,2.0,Male,No,Sun,Dinner,2
154 | 17.26,2.74,Male,No,Sun,Dinner,3
155 | 24.55,2.0,Male,No,Sun,Dinner,4
156 | 19.77,2.0,Male,No,Sun,Dinner,4
157 | 29.85,5.14,Female,No,Sun,Dinner,5
158 | 48.17,5.0,Male,No,Sun,Dinner,6
159 | 25.0,3.75,Female,No,Sun,Dinner,4
160 | 13.39,2.61,Female,No,Sun,Dinner,2
161 | 16.49,2.0,Male,No,Sun,Dinner,4
162 | 21.5,3.5,Male,No,Sun,Dinner,4
163 | 12.66,2.5,Male,No,Sun,Dinner,2
164 | 16.21,2.0,Female,No,Sun,Dinner,3
165 | 13.81,2.0,Male,No,Sun,Dinner,2
166 | 17.51,3.0,Female,Yes,Sun,Dinner,2
167 | 24.52,3.48,Male,No,Sun,Dinner,3
168 | 20.76,2.24,Male,No,Sun,Dinner,2
169 | 31.71,4.5,Male,No,Sun,Dinner,4
170 | 10.59,1.61,Female,Yes,Sat,Dinner,2
171 | 10.63,2.0,Female,Yes,Sat,Dinner,2
172 | 50.81,10.0,Male,Yes,Sat,Dinner,3
173 | 15.81,3.16,Male,Yes,Sat,Dinner,2
174 | 7.25,5.15,Male,Yes,Sun,Dinner,2
175 | 31.85,3.18,Male,Yes,Sun,Dinner,2
176 | 16.82,4.0,Male,Yes,Sun,Dinner,2
177 | 32.9,3.11,Male,Yes,Sun,Dinner,2
178 | 17.89,2.0,Male,Yes,Sun,Dinner,2
179 | 14.48,2.0,Male,Yes,Sun,Dinner,2
180 | 9.6,4.0,Female,Yes,Sun,Dinner,2
181 | 34.63,3.55,Male,Yes,Sun,Dinner,2
182 | 34.65,3.68,Male,Yes,Sun,Dinner,4
183 | 23.33,5.65,Male,Yes,Sun,Dinner,2
184 | 45.35,3.5,Male,Yes,Sun,Dinner,3
185 | 23.17,6.5,Male,Yes,Sun,Dinner,4
186 | 40.55,3.0,Male,Yes,Sun,Dinner,2
187 | 20.69,5.0,Male,No,Sun,Dinner,5
188 | 20.9,3.5,Female,Yes,Sun,Dinner,3
189 | 30.46,2.0,Male,Yes,Sun,Dinner,5
190 | 18.15,3.5,Female,Yes,Sun,Dinner,3
191 | 23.1,4.0,Male,Yes,Sun,Dinner,3
192 | 15.69,1.5,Male,Yes,Sun,Dinner,2
193 | 19.81,4.19,Female,Yes,Thur,Lunch,2
194 | 28.44,2.56,Male,Yes,Thur,Lunch,2
195 | 15.48,2.02,Male,Yes,Thur,Lunch,2
196 | 16.58,4.0,Male,Yes,Thur,Lunch,2
197 | 7.56,1.44,Male,No,Thur,Lunch,2
198 | 10.34,2.0,Male,Yes,Thur,Lunch,2
199 | 43.11,5.0,Female,Yes,Thur,Lunch,4
200 | 13.0,2.0,Female,Yes,Thur,Lunch,2
201 | 13.51,2.0,Male,Yes,Thur,Lunch,2
202 | 18.71,4.0,Male,Yes,Thur,Lunch,3
203 | 12.74,2.01,Female,Yes,Thur,Lunch,2
204 | 13.0,2.0,Female,Yes,Thur,Lunch,2
205 | 16.4,2.5,Female,Yes,Thur,Lunch,2
206 | 20.53,4.0,Male,Yes,Thur,Lunch,4
207 | 16.47,3.23,Female,Yes,Thur,Lunch,3
208 | 26.59,3.41,Male,Yes,Sat,Dinner,3
209 | 38.73,3.0,Male,Yes,Sat,Dinner,4
210 | 24.27,2.03,Male,Yes,Sat,Dinner,2
211 | 12.76,2.23,Female,Yes,Sat,Dinner,2
212 | 30.06,2.0,Male,Yes,Sat,Dinner,3
213 | 25.89,5.16,Male,Yes,Sat,Dinner,4
214 | 48.33,9.0,Male,No,Sat,Dinner,4
215 | 13.27,2.5,Female,Yes,Sat,Dinner,2
216 | 28.17,6.5,Female,Yes,Sat,Dinner,3
217 | 12.9,1.1,Female,Yes,Sat,Dinner,2
218 | 28.15,3.0,Male,Yes,Sat,Dinner,5
219 | 11.59,1.5,Male,Yes,Sat,Dinner,2
220 | 7.74,1.44,Male,Yes,Sat,Dinner,2
221 | 30.14,3.09,Female,Yes,Sat,Dinner,4
222 | 12.16,2.2,Male,Yes,Fri,Lunch,2
223 | 13.42,3.48,Female,Yes,Fri,Lunch,2
224 | 8.58,1.92,Male,Yes,Fri,Lunch,1
225 | 15.98,3.0,Female,No,Fri,Lunch,3
226 | 13.42,1.58,Male,Yes,Fri,Lunch,2
227 | 16.27,2.5,Female,Yes,Fri,Lunch,2
228 | 10.09,2.0,Female,Yes,Fri,Lunch,2
229 | 20.45,3.0,Male,No,Sat,Dinner,4
230 | 13.28,2.72,Male,No,Sat,Dinner,2
231 | 22.12,2.88,Female,Yes,Sat,Dinner,2
232 | 24.01,2.0,Male,Yes,Sat,Dinner,4
233 | 15.69,3.0,Male,Yes,Sat,Dinner,3
234 | 11.61,3.39,Male,No,Sat,Dinner,2
235 | 10.77,1.47,Male,No,Sat,Dinner,2
236 | 15.53,3.0,Male,Yes,Sat,Dinner,2
237 | 10.07,1.25,Male,No,Sat,Dinner,2
238 | 12.6,1.0,Male,Yes,Sat,Dinner,2
239 | 32.83,1.17,Male,Yes,Sat,Dinner,2
240 | 35.83,4.67,Female,No,Sat,Dinner,3
241 | 29.03,5.92,Male,No,Sat,Dinner,3
242 | 27.18,2.0,Female,Yes,Sat,Dinner,2
243 | 22.67,2.0,Male,Yes,Sat,Dinner,2
244 | 17.82,1.75,Male,No,Sat,Dinner,2
245 | 18.78,3.0,Female,No,Thur,Dinner,2
246 |
--------------------------------------------------------------------------------