├── Untitled.ipynb ├── day1.ipynb ├── day2.ipynb ├── day3.ipynb ├── day4.ipynb ├── day5.ipynb ├── day6.ipynb ├── day7.ipynb ├── test1.csv ├── test2.csv ├── test3.csv ├── test4.csv ├── test5.csv └── tips.csv /Untitled.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "id": "8db32105-fc54-445b-8b33-b9779412ff81", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [] 10 | } 11 | ], 12 | "metadata": { 13 | "kernelspec": { 14 | "display_name": "Python 3 (ipykernel)", 15 | "language": "python", 16 | "name": "python3" 17 | }, 18 | "language_info": { 19 | "codemirror_mode": { 20 | "name": "ipython", 21 | "version": 3 22 | }, 23 | "file_extension": ".py", 24 | "mimetype": "text/x-python", 25 | "name": "python", 26 | "nbconvert_exporter": "python", 27 | "pygments_lexer": "ipython3", 28 | "version": "3.11.7" 29 | } 30 | }, 31 | "nbformat": 4, 32 | "nbformat_minor": 5 33 | } 34 | -------------------------------------------------------------------------------- /day1.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "166d2b18-2e08-4cdc-a65f-fe2f84694055", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Collecting spark\n", 14 | " Downloading spark-0.2.1.tar.gz (41 kB)\n", 15 | "\u001b[2K \u001b[38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m41.0/41.0 kB\u001b[0m \u001b[31m414.0 kB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0mm eta \u001b[36m0:00:01\u001b[0m0:01\u001b[0m\n", 16 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25ldone\n", 17 | "\u001b[?25hBuilding wheels for collected packages: spark\n", 18 | " Building wheel for spark (setup.py) ... \u001b[?25ldone\n", 19 | "\u001b[?25h Created wheel for spark: filename=spark-0.2.1-py3-none-any.whl size=58748 sha256=c546ef45511ed9eeb315c261edbfd4254da318be8cb9bca4bbd5b41b7b5273b6\n", 20 | " Stored in directory: /home/kyn/.cache/pip/wheels/67/c2/7c/a53325365fba358ffff35af84a2e14cf88c18052f88acfa5f0\n", 21 | "Successfully built spark\n", 22 | "Installing collected packages: spark\n", 23 | "Successfully installed spark-0.2.1\n" 24 | ] 25 | } 26 | ], 27 | "source": [ 28 | "!pip install spark" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 2, 34 | "id": "40c102b2-0952-4ea5-b0cb-86fc18ce4798", 35 | "metadata": {}, 36 | "outputs": [], 37 | "source": [ 38 | "import pyspark" 39 | ] 40 | }, 41 | { 42 | "cell_type": "code", 43 | "execution_count": 3, 44 | "id": "313ac534-951d-4d7b-81b5-32ea3d1aa5a8", 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "data": { 49 | "text/html": [ 50 | "
\n", 51 | "\n", 64 | "\n", 65 | " \n", 66 | " \n", 67 | " \n", 68 | " \n", 69 | " \n", 70 | " \n", 71 | " \n", 72 | " \n", 73 | " \n", 74 | " \n", 75 | " \n", 76 | " \n", 77 | " \n", 78 | " \n", 79 | " \n", 80 | " \n", 81 | " \n", 82 | " \n", 83 | " \n", 84 | " \n", 85 | " \n", 86 | " \n", 87 | " \n", 88 | " \n", 89 | "
nameage
0kani15
1kani116
2kani217
\n", 90 | "
" 91 | ], 92 | "text/plain": [ 93 | " name age\n", 94 | "0 kani 15\n", 95 | "1 kani1 16\n", 96 | "2 kani2 17" 97 | ] 98 | }, 99 | "execution_count": 3, 100 | "metadata": {}, 101 | "output_type": "execute_result" 102 | } 103 | ], 104 | "source": [ 105 | "import pandas as pd\n", 106 | "df=pd.read_csv(\"test1.csv\")\n", 107 | "df" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 14, 113 | "id": "a71f8f2e-8677-4392-b042-f540b508eebc", 114 | "metadata": {}, 115 | "outputs": [ 116 | { 117 | "data": { 118 | "text/plain": [ 119 | "pandas.core.frame.DataFrame" 120 | ] 121 | }, 122 | "execution_count": 14, 123 | "metadata": {}, 124 | "output_type": "execute_result" 125 | } 126 | ], 127 | "source": [ 128 | "type(pd.read_csv(\"test1.csv\"))" 129 | ] 130 | }, 131 | { 132 | "cell_type": "code", 133 | "execution_count": 5, 134 | "id": "fd9e7d5e-1630-4d40-91bb-5b372e9a192b", 135 | "metadata": {}, 136 | "outputs": [ 137 | { 138 | "name": "stderr", 139 | "output_type": "stream", 140 | "text": [ 141 | "24/04/23 10:17:17 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.239 instead (on interface wlp0s20f3)\n", 142 | "24/04/23 10:17:17 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 143 | "Setting default log level to \"WARN\".\n", 144 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 145 | "24/04/23 10:17:17 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "from pyspark.sql import SparkSession\n", 151 | "spark=SparkSession.builder.appName(\"kani\").getOrCreate()" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 6, 157 | "id": "e9f269a8-11c1-4eb4-9f3e-a27eaef37910", 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "data": { 162 | "text/html": [ 163 | "\n", 164 | "
\n", 165 | "

SparkSession - in-memory

\n", 166 | " \n", 167 | "
\n", 168 | "

SparkContext

\n", 169 | "\n", 170 | "

Spark UI

\n", 171 | "\n", 172 | "
\n", 173 | "
Version
\n", 174 | "
v3.5.1
\n", 175 | "
Master
\n", 176 | "
local[*]
\n", 177 | "
AppName
\n", 178 | "
kani
\n", 179 | "
\n", 180 | "
\n", 181 | " \n", 182 | "
\n", 183 | " " 184 | ], 185 | "text/plain": [ 186 | "" 187 | ] 188 | }, 189 | "execution_count": 6, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "spark" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 7, 201 | "id": "1924f962-b9d5-45e0-b326-f069ec69a27f", 202 | "metadata": {}, 203 | "outputs": [], 204 | "source": [ 205 | "df_spark=spark.read.csv(\"test1.csv\")" 206 | ] 207 | }, 208 | { 209 | "cell_type": "code", 210 | "execution_count": 8, 211 | "id": "fa08660e-d5f6-4bb0-8810-138c05878b25", 212 | "metadata": {}, 213 | "outputs": [ 214 | { 215 | "data": { 216 | "text/plain": [ 217 | "DataFrame[_c0: string, _c1: string]" 218 | ] 219 | }, 220 | "execution_count": 8, 221 | "metadata": {}, 222 | "output_type": "execute_result" 223 | } 224 | ], 225 | "source": [ 226 | "df_spark" 227 | ] 228 | }, 229 | { 230 | "cell_type": "code", 231 | "execution_count": 9, 232 | "id": "ec97f69a-96dd-4fa9-8e85-f501f1bcf023", 233 | "metadata": {}, 234 | "outputs": [ 235 | { 236 | "name": "stdout", 237 | "output_type": "stream", 238 | "text": [ 239 | "+-----+---+\n", 240 | "| _c0|_c1|\n", 241 | "+-----+---+\n", 242 | "| name|age|\n", 243 | "| kani| 15|\n", 244 | "|kani1| 16|\n", 245 | "|kani2| 17|\n", 246 | "+-----+---+\n", 247 | "\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "df_spark.show()" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 10, 258 | "id": "f25c9b93-bf9c-4697-bc69-69ab67424aa7", 259 | "metadata": {}, 260 | "outputs": [ 261 | { 262 | "data": { 263 | "text/plain": [ 264 | "DataFrame[name: string, age: string]" 265 | ] 266 | }, 267 | "execution_count": 10, 268 | "metadata": {}, 269 | "output_type": "execute_result" 270 | } 271 | ], 272 | "source": [ 273 | "spark.read.option(\"header\",\"true\").csv(\"test1.csv\")" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 11, 279 | "id": "adb50d24-9cf0-445c-86a5-427f7bebc19b", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "+-----+---+\n", 287 | "| name|age|\n", 288 | "+-----+---+\n", 289 | "| kani| 15|\n", 290 | "|kani1| 16|\n", 291 | "|kani2| 17|\n", 292 | "+-----+---+\n", 293 | "\n" 294 | ] 295 | } 296 | ], 297 | "source": [ 298 | "spark.read.option(\"header\",\"true\").csv(\"test1.csv\").show()" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": 12, 304 | "id": "345760c6-7830-4c91-8882-052f7f8d33ea", 305 | "metadata": {}, 306 | "outputs": [ 307 | { 308 | "data": { 309 | "text/plain": [ 310 | "pyspark.sql.dataframe.DataFrame" 311 | ] 312 | }, 313 | "execution_count": 12, 314 | "metadata": {}, 315 | "output_type": "execute_result" 316 | } 317 | ], 318 | "source": [ 319 | "type(df_spark)" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 18, 325 | "id": "9b9cd715-eb93-4119-9645-1a39b6e594c8", 326 | "metadata": {}, 327 | "outputs": [ 328 | { 329 | "data": { 330 | "text/plain": [ 331 | "[Row(_c0='name', _c1='age'),\n", 332 | " Row(_c0='kani', _c1='15'),\n", 333 | " Row(_c0='kani1', _c1='16')]" 334 | ] 335 | }, 336 | "execution_count": 18, 337 | "metadata": {}, 338 | "output_type": "execute_result" 339 | } 340 | ], 341 | "source": [ 342 | "df_spark.head(3)" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 23, 348 | "id": "f66faaf3-bb69-4925-8a86-29d3432adfa8", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": 
"stream", 354 | "text": [ 355 | "root\n", 356 | " |-- _c0: string (nullable = true)\n", 357 | " |-- _c1: string (nullable = true)\n", 358 | "\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "df_spark.printSchema()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": null, 369 | "id": "e2f71893-b4ea-4cea-8eaf-15e7cf845e4e", 370 | "metadata": {}, 371 | "outputs": [], 372 | "source": [] 373 | } 374 | ], 375 | "metadata": { 376 | "kernelspec": { 377 | "display_name": "Python 3 (ipykernel)", 378 | "language": "python", 379 | "name": "python3" 380 | }, 381 | "language_info": { 382 | "codemirror_mode": { 383 | "name": "ipython", 384 | "version": 3 385 | }, 386 | "file_extension": ".py", 387 | "mimetype": "text/x-python", 388 | "name": "python", 389 | "nbconvert_exporter": "python", 390 | "pygments_lexer": "ipython3", 391 | "version": "3.11.7" 392 | } 393 | }, 394 | "nbformat": 4, 395 | "nbformat_minor": 5 396 | } 397 | -------------------------------------------------------------------------------- /day2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "0204e5b9-754d-43c9-9427-416059b90d6e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "24/04/24 09:35:09 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.240 instead (on interface wlp0s20f3)\n", 14 | "24/04/24 09:35:09 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 15 | "Setting default log level to \"WARN\".\n", 16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 17 | "24/04/24 09:35:09 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n" 18 | ] 19 | }, 20 | { 21 | "data": { 22 | "text/html": [ 23 | "\n", 24 | "
\n", 25 | "

SparkSession - in-memory

\n", 26 | " \n", 27 | "
\n", 28 | "

SparkContext

\n", 29 | "\n", 30 | "

Spark UI

\n", 31 | "\n", 32 | "
\n", 33 | "
Version
\n", 34 | "
v3.5.1
\n", 35 | "
Master
\n", 36 | "
local[*]
\n", 37 | "
AppName
\n", 38 | "
dataframe
\n", 39 | "
\n", 40 | "
\n", 41 | " \n", 42 | "
\n", 43 | " " 44 | ], 45 | "text/plain": [ 46 | "" 47 | ] 48 | }, 49 | "execution_count": 1, 50 | "metadata": {}, 51 | "output_type": "execute_result" 52 | } 53 | ], 54 | "source": [ 55 | "from pyspark.sql import SparkSession\n", 56 | "spark=SparkSession.builder.appName(\"dataframe\").getOrCreate()\n", 57 | "spark" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 2, 63 | "id": "05510abc-d3a9-42df-8146-15e7fa28403b", 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "DataFrame[_c0: string, _c1: string]" 70 | ] 71 | }, 72 | "execution_count": 2, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "df_spark=spark.read.csv(\"test1.csv\")\n", 79 | "df_spark" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 26, 85 | "id": "0a192b69-d0fc-4e08-bde7-1d69cf0641ae", 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "name": "stdout", 90 | "output_type": "stream", 91 | "text": [ 92 | "+-----+---+----------+\n", 93 | "| name|age|experience|\n", 94 | "+-----+---+----------+\n", 95 | "| kani| 15| 10|\n", 96 | "|kani1| 16| 8|\n", 97 | "|kani2| 17| 4|\n", 98 | "+-----+---+----------+\n", 99 | "\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "#read dataset\n", 105 | "df_spark=spark.read.option(\"header\",\"true\").csv(\"test2.csv\")\n", 106 | "df_spark.show()" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 4, 112 | "id": "bc69ebc5-b466-441b-b41f-bfaed5dca671", 113 | "metadata": {}, 114 | "outputs": [ 115 | { 116 | "name": "stdout", 117 | "output_type": "stream", 118 | "text": [ 119 | "root\n", 120 | " |-- name: string (nullable = true)\n", 121 | " |-- age: string (nullable = true)\n", 122 | " |-- experience: string (nullable = true)\n", 123 | "\n" 124 | ] 125 | } 126 | ], 127 | "source": [ 128 | "#check schema\n", 129 | "df_spark.printSchema()\n", 130 | "#here all are show an string so that \n", 131 | "#use an inferschema=true" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 5, 137 | "id": "a2de4aae-47aa-4fa2-a64d-3ee819312b08", 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "+-----+---+----------+\n", 145 | "| name|age|experience|\n", 146 | "+-----+---+----------+\n", 147 | "| kani| 15| 10|\n", 148 | "|kani1| 16| 8|\n", 149 | "|kani2| 17| 4|\n", 150 | "+-----+---+----------+\n", 151 | "\n" 152 | ] 153 | } 154 | ], 155 | "source": [ 156 | "df_spark=spark.read.option(\"header\",\"true\").csv(\"test2.csv\",inferSchema=True)\n", 157 | "df_spark.show()\n" 158 | ] 159 | }, 160 | { 161 | "cell_type": "code", 162 | "execution_count": 6, 163 | "id": "b8473ce1-9299-4667-a779-ea46c8538d8f", 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "root\n", 171 | " |-- name: string (nullable = true)\n", 172 | " |-- age: integer (nullable = true)\n", 173 | " |-- experience: integer (nullable = true)\n", 174 | "\n" 175 | ] 176 | } 177 | ], 178 | "source": [ 179 | "df_spark.printSchema()\n", 180 | "#now it show an crt datatype " 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 7, 186 | "id": "ea7de970-25b4-45c0-8840-95942e664579", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "pyspark.sql.dataframe.DataFrame" 193 | ] 194 | }, 195 | "execution_count": 7, 196 | "metadata": {}, 197 | "output_type": "execute_result" 198 
| } 199 | ], 200 | "source": [ 201 | "type(df_spark)\n", 202 | "#dataframe is an data structures" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 8, 208 | "id": "79d67920-1648-4ba7-a4d7-d1467390cfc0", 209 | "metadata": {}, 210 | "outputs": [ 211 | { 212 | "data": { 213 | "text/plain": [ 214 | "['name', 'age', 'experience']" 215 | ] 216 | }, 217 | "execution_count": 8, 218 | "metadata": {}, 219 | "output_type": "execute_result" 220 | } 221 | ], 222 | "source": [ 223 | "df_spark.columns" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 9, 229 | "id": "8838c7a4-f2fe-4ecc-babf-0f768b697b12", 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "data": { 234 | "text/plain": [ 235 | "[Row(name='kani', age=15, experience=10),\n", 236 | " Row(name='kani1', age=16, experience=8),\n", 237 | " Row(name='kani2', age=17, experience=4)]" 238 | ] 239 | }, 240 | "execution_count": 9, 241 | "metadata": {}, 242 | "output_type": "execute_result" 243 | } 244 | ], 245 | "source": [ 246 | "df_spark.head(3)" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 10, 252 | "id": "e4ffe89b-3525-4a76-9dd4-8f73258a1fc9", 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "+-----+---+----------+\n", 260 | "| name|age|experience|\n", 261 | "+-----+---+----------+\n", 262 | "| kani| 15| 10|\n", 263 | "|kani1| 16| 8|\n", 264 | "|kani2| 17| 4|\n", 265 | "+-----+---+----------+\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "df_spark.show()\n" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 11, 277 | "id": "a1df0972-1e5b-41f5-a39f-824c17beb234", 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "+-----+---+\n", 285 | "| name|age|\n", 286 | "+-----+---+\n", 287 | "| kani| 15|\n", 288 | "|kani1| 16|\n", 289 | "|kani2| 17|\n", 290 | "+-----+---+\n", 291 | "\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "df_spark.select(\"name\",\"age\").show()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 12, 302 | "id": "722a60f6-c3ea-4e83-9ed0-517148f0f9eb", 303 | "metadata": {}, 304 | "outputs": [ 305 | { 306 | "data": { 307 | "text/plain": [ 308 | "DataFrame[name: string, age: int]" 309 | ] 310 | }, 311 | "execution_count": 12, 312 | "metadata": {}, 313 | "output_type": "execute_result" 314 | } 315 | ], 316 | "source": [ 317 | "df_spark.select(\"name\",\"age\")" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 13, 323 | "id": "39faa7d2-0218-466b-8bae-9706fe2e0739", 324 | "metadata": {}, 325 | "outputs": [ 326 | { 327 | "data": { 328 | "text/plain": [ 329 | "pyspark.sql.dataframe.DataFrame" 330 | ] 331 | }, 332 | "execution_count": 13, 333 | "metadata": {}, 334 | "output_type": "execute_result" 335 | } 336 | ], 337 | "source": [ 338 | "type(df_spark.select(\"name\",\"age\"))" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 14, 344 | "id": "f64dddc0-147d-40c9-bd33-7cd594e37ede", 345 | "metadata": {}, 346 | "outputs": [ 347 | { 348 | "data": { 349 | "text/plain": [ 350 | "Column<'name'>" 351 | ] 352 | }, 353 | "execution_count": 14, 354 | "metadata": {}, 355 | "output_type": "execute_result" 356 | } 357 | ], 358 | "source": [ 359 | "df_spark['name']" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 15, 365 | "id": 
"9c642a38-fdaf-44c9-8386-69a158b219e3", 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "data": { 370 | "text/plain": [ 371 | "[('name', 'string'), ('age', 'int'), ('experience', 'int')]" 372 | ] 373 | }, 374 | "execution_count": 15, 375 | "metadata": {}, 376 | "output_type": "execute_result" 377 | } 378 | ], 379 | "source": [ 380 | "df_spark.dtypes" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 16, 386 | "id": "dc5083b0-7911-4bf6-99f1-f48a24e9fc7a", 387 | "metadata": {}, 388 | "outputs": [ 389 | { 390 | "name": "stdout", 391 | "output_type": "stream", 392 | "text": [ 393 | "+-------+-----+----+-----------------+\n", 394 | "|summary| name| age| experience|\n", 395 | "+-------+-----+----+-----------------+\n", 396 | "| count| 3| 3| 3|\n", 397 | "| mean| NULL|16.0|7.333333333333333|\n", 398 | "| stddev| NULL| 1.0|3.055050463303893|\n", 399 | "| min| kani| 15| 4|\n", 400 | "| max|kani2| 17| 10|\n", 401 | "+-------+-----+----+-----------------+\n", 402 | "\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "df_spark.describe().show()" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 17, 413 | "id": "a8b1544d-dbeb-4e88-97be-1a786711ad83", 414 | "metadata": {}, 415 | "outputs": [ 416 | { 417 | "name": "stdout", 418 | "output_type": "stream", 419 | "text": [ 420 | "+-----+---+----------+\n", 421 | "| name|age|experience|\n", 422 | "+-----+---+----------+\n", 423 | "| kani| 15| 10|\n", 424 | "|kani1| 16| 8|\n", 425 | "|kani2| 17| 4|\n", 426 | "+-----+---+----------+\n", 427 | "\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "df_spark.show()" 433 | ] 434 | }, 435 | { 436 | "cell_type": "code", 437 | "execution_count": 18, 438 | "id": "cc9d2965-a122-4d01-9748-85a82b2ddaf3", 439 | "metadata": {}, 440 | "outputs": [ 441 | { 442 | "ename": "AnalysisException", 443 | "evalue": "[UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `experinced` cannot be resolved. Did you mean one of the following? [`name`, `age`, `experience`].", 444 | "output_type": "error", 445 | "traceback": [ 446 | "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", 447 | "\u001b[0;31mAnalysisException\u001b[0m Traceback (most recent call last)", 448 | "Cell \u001b[0;32mIn[18], line 2\u001b[0m\n\u001b[1;32m 1\u001b[0m \u001b[38;5;66;03m#old verison add columns \u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m df_spark\u001b[38;5;241m=\u001b[39mdf_spark\u001b[38;5;241m.\u001b[39mwithColumn(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mExperineced after 2 years\u001b[39m\u001b[38;5;124m\"\u001b[39m,df_spark[\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mexperinced\u001b[39m\u001b[38;5;124m'\u001b[39m]\u001b[38;5;241m+\u001b[39m\u001b[38;5;241m2\u001b[39m)\n", 449 | "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pyspark/sql/dataframe.py:3078\u001b[0m, in \u001b[0;36mDataFrame.__getitem__\u001b[0;34m(self, item)\u001b[0m\n\u001b[1;32m 3006\u001b[0m \u001b[38;5;250m\u001b[39m\u001b[38;5;124;03m\"\"\"Returns the column as a :class:`Column`.\u001b[39;00m\n\u001b[1;32m 3007\u001b[0m \n\u001b[1;32m 3008\u001b[0m \u001b[38;5;124;03m.. 
versionadded:: 1.3.0\u001b[39;00m\n\u001b[0;32m (...)\u001b[0m\n\u001b[1;32m 3075\u001b[0m \u001b[38;5;124;03m+---+----+\u001b[39;00m\n\u001b[1;32m 3076\u001b[0m \u001b[38;5;124;03m\"\"\"\u001b[39;00m\n\u001b[1;32m 3077\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, \u001b[38;5;28mstr\u001b[39m):\n\u001b[0;32m-> 3078\u001b[0m jc \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_jdf\u001b[38;5;241m.\u001b[39mapply(item)\n\u001b[1;32m 3079\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m Column(jc)\n\u001b[1;32m 3080\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(item, Column):\n", 450 | "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/py4j/java_gateway.py:1322\u001b[0m, in \u001b[0;36mJavaMember.__call__\u001b[0;34m(self, *args)\u001b[0m\n\u001b[1;32m 1316\u001b[0m command \u001b[38;5;241m=\u001b[39m proto\u001b[38;5;241m.\u001b[39mCALL_COMMAND_NAME \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1317\u001b[0m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mcommand_header \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1318\u001b[0m args_command \u001b[38;5;241m+\u001b[39m\\\n\u001b[1;32m 1319\u001b[0m proto\u001b[38;5;241m.\u001b[39mEND_COMMAND_PART\n\u001b[1;32m 1321\u001b[0m answer \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client\u001b[38;5;241m.\u001b[39msend_command(command)\n\u001b[0;32m-> 1322\u001b[0m return_value \u001b[38;5;241m=\u001b[39m get_return_value(\n\u001b[1;32m 1323\u001b[0m answer, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mgateway_client, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mtarget_id, \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mname)\n\u001b[1;32m 1325\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m temp_arg \u001b[38;5;129;01min\u001b[39;00m temp_args:\n\u001b[1;32m 1326\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;28mhasattr\u001b[39m(temp_arg, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m_detach\u001b[39m\u001b[38;5;124m\"\u001b[39m):\n", 451 | "File \u001b[0;32m~/anaconda3/lib/python3.11/site-packages/pyspark/errors/exceptions/captured.py:185\u001b[0m, in \u001b[0;36mcapture_sql_exception..deco\u001b[0;34m(*a, **kw)\u001b[0m\n\u001b[1;32m 181\u001b[0m converted \u001b[38;5;241m=\u001b[39m convert_exception(e\u001b[38;5;241m.\u001b[39mjava_exception)\n\u001b[1;32m 182\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28misinstance\u001b[39m(converted, UnknownException):\n\u001b[1;32m 183\u001b[0m \u001b[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001b[39;00m\n\u001b[1;32m 184\u001b[0m \u001b[38;5;66;03m# JVM exception message.\u001b[39;00m\n\u001b[0;32m--> 185\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m converted \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m 186\u001b[0m \u001b[38;5;28;01melse\u001b[39;00m:\n\u001b[1;32m 187\u001b[0m \u001b[38;5;28;01mraise\u001b[39;00m\n", 452 | "\u001b[0;31mAnalysisException\u001b[0m: [UNRESOLVED_COLUMN.WITH_SUGGESTION] A column or function parameter with name `experinced` cannot be resolved. Did you mean one of the following? [`name`, `age`, `experience`]." 
453 | ] 454 | } 455 | ], 456 | "source": [ 457 | "#old verison add columns \n", 458 | "df_spark=df_spark.withColumn(\"Experineced after 2 years\",df_spark['experinced']+2)" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": 19, 464 | "id": "c8837df0-61ac-4868-a857-355eb3ba87b1", 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "data": { 469 | "text/plain": [ 470 | "DataFrame[name: string, age: int, experience: int, Experience After 2 years: int]" 471 | ] 472 | }, 473 | "execution_count": 19, 474 | "metadata": {}, 475 | "output_type": "execute_result" 476 | } 477 | ], 478 | "source": [ 479 | "#new version \n", 480 | "from pyspark.sql.functions import col\n", 481 | "df_spark = df_spark.withColumn(\"Experience After 2 years\", col(\"Experience\") + 2)\n", 482 | "df_spark\n" 483 | ] 484 | }, 485 | { 486 | "cell_type": "code", 487 | "execution_count": 20, 488 | "id": "18aaba8c-df0e-40c0-ab95-f3eee5317dc6", 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "+-----+---+----------+------------------------+\n", 496 | "| name|age|experience|Experience After 2 years|\n", 497 | "+-----+---+----------+------------------------+\n", 498 | "| kani| 15| 10| 12|\n", 499 | "|kani1| 16| 8| 10|\n", 500 | "|kani2| 17| 4| 6|\n", 501 | "+-----+---+----------+------------------------+\n", 502 | "\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "df_spark.show()" 508 | ] 509 | }, 510 | { 511 | "cell_type": "code", 512 | "execution_count": 24, 513 | "id": "e50ad537-2396-4ccf-b93a-9b3b07f1fa35", 514 | "metadata": {}, 515 | "outputs": [ 516 | { 517 | "name": "stdout", 518 | "output_type": "stream", 519 | "text": [ 520 | "+-----+---+----------+\n", 521 | "| name|age|experience|\n", 522 | "+-----+---+----------+\n", 523 | "| kani| 15| 10|\n", 524 | "|kani1| 16| 8|\n", 525 | "|kani2| 17| 4|\n", 526 | "+-----+---+----------+\n", 527 | "\n" 528 | ] 529 | } 530 | ], 531 | "source": [ 532 | "df_spark=df_spark.drop(\"Experience After 2 years\")\n", 533 | "df_spark.show()" 534 | ] 535 | }, 536 | { 537 | "cell_type": "code", 538 | "execution_count": 25, 539 | "id": "9526a218-f1a0-48e0-bb2b-12c21f3d59c9", 540 | "metadata": {}, 541 | "outputs": [ 542 | { 543 | "name": "stdout", 544 | "output_type": "stream", 545 | "text": [ 546 | "+-----+---+----------+\n", 547 | "| Name|age|experience|\n", 548 | "+-----+---+----------+\n", 549 | "| kani| 15| 10|\n", 550 | "|kani1| 16| 8|\n", 551 | "|kani2| 17| 4|\n", 552 | "+-----+---+----------+\n", 553 | "\n" 554 | ] 555 | } 556 | ], 557 | "source": [ 558 | "df_spark.withColumnRenamed(\"name\",\"Name\").show()" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "id": "24e3b7df-668d-409c-a93e-b4e85704937b", 564 | "metadata": {}, 565 | "source": [ 566 | "# withColumn\n", 567 | "# withColumnRename\n", 568 | "# drop\n", 569 | "# printSchema\n", 570 | "# read.option().csv()" 571 | ] 572 | }, 573 | { 574 | "cell_type": "markdown", 575 | "id": "5e3a192b-a315-4a6f-b723-f44dc1439fa2", 576 | "metadata": {}, 577 | "source": [ 578 | "---------------------------------------------------------------" 579 | ] 580 | }, 581 | { 582 | "cell_type": "markdown", 583 | "id": "423543b9-1221-4296-a81e-90faa742d571", 584 | "metadata": {}, 585 | "source": [ 586 | "# PySpark Dataframe\n", 587 | "# Reading The Dataset\n", 588 | "# Checking the Datatypes of the Column(Schema)\n", 589 | "# Selecting Columns And Indexing\n", 590 | "# Check Describe option similar to Pandas\n", 591 | "# Adding Columns\n", 
592 | "# Dropping columns\n", 593 | "# Renaming Columns" 594 | ] 595 | }, 596 | { 597 | "cell_type": "code", 598 | "execution_count": null, 599 | "id": "04c02cfb-db67-4ede-92e1-ea03c1dee13b", 600 | "metadata": {}, 601 | "outputs": [], 602 | "source": [] 603 | }, 604 | { 605 | "cell_type": "code", 606 | "execution_count": null, 607 | "id": "6fc812f3-c60a-42ea-a387-b1e7700a2e3d", 608 | "metadata": {}, 609 | "outputs": [], 610 | "source": [] 611 | } 612 | ], 613 | "metadata": { 614 | "kernelspec": { 615 | "display_name": "Python 3 (ipykernel)", 616 | "language": "python", 617 | "name": "python3" 618 | }, 619 | "language_info": { 620 | "codemirror_mode": { 621 | "name": "ipython", 622 | "version": 3 623 | }, 624 | "file_extension": ".py", 625 | "mimetype": "text/x-python", 626 | "name": "python", 627 | "nbconvert_exporter": "python", 628 | "pygments_lexer": "ipython3", 629 | "version": "3.11.7" 630 | } 631 | }, 632 | "nbformat": 4, 633 | "nbformat_minor": 5 634 | } 635 | -------------------------------------------------------------------------------- /day3.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "id": "483ee2b4-8900-4113-998e-15f63d2fbce6", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "root\n", 14 | " |-- Name: string (nullable = true)\n", 15 | " |-- age: integer (nullable = true)\n", 16 | " |-- Experience: integer (nullable = true)\n", 17 | " |-- Salary: integer (nullable = true)\n", 18 | "\n" 19 | ] 20 | } 21 | ], 22 | "source": [ 23 | "from pyspark.sql import SparkSession\n", 24 | "spark=SparkSession.builder.appName('Practise').getOrCreate()\n", 25 | "df_pyspark=spark.read.csv('test3.csv',header=True,inferSchema=True)\n", 26 | "df_pyspark.printSchema()" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 13, 32 | "id": "ed321960-d843-40f6-9a21-f89685ec640b", 33 | "metadata": {}, 34 | "outputs": [ 35 | { 36 | "name": "stdout", 37 | "output_type": "stream", 38 | "text": [ 39 | "+---------+----+----------+------+\n", 40 | "| Name| age|Experience|Salary|\n", 41 | "+---------+----+----------+------+\n", 42 | "| Krish| 31| 10| 30000|\n", 43 | "|Sudhanshu| 30| 8| 25000|\n", 44 | "| Sunny| 29| 4| 20000|\n", 45 | "| Paul| 24| 3| 20000|\n", 46 | "| Harsha| 21| 1| 15000|\n", 47 | "| Shubham| 23| 2| 18000|\n", 48 | "| Mahesh|NULL| NULL| 40000|\n", 49 | "| NULL| 34| 10| 38000|\n", 50 | "| NULL| 36| NULL| NULL|\n", 51 | "+---------+----+----------+------+\n", 52 | "\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "df_pyspark.show()\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "code", 62 | "execution_count": 14, 63 | "id": "2ed60ea7-0bc8-4999-8739-c4f753ec604f", 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "+----+----------+------+\n", 71 | "| age|Experience|Salary|\n", 72 | "+----+----------+------+\n", 73 | "| 31| 10| 30000|\n", 74 | "| 30| 8| 25000|\n", 75 | "| 29| 4| 20000|\n", 76 | "| 24| 3| 20000|\n", 77 | "| 21| 1| 15000|\n", 78 | "| 23| 2| 18000|\n", 79 | "|NULL| NULL| 40000|\n", 80 | "| 34| 10| 38000|\n", 81 | "| 36| NULL| NULL|\n", 82 | "+----+----------+------+\n", 83 | "\n" 84 | ] 85 | } 86 | ], 87 | "source": [ 88 | "##drop the columns\n", 89 | "df_pyspark.drop('Name').show()" 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 15, 95 | "id": "d04de81f-db78-4c40-862f-80b3953a1507", 96 
| "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | "output_type": "stream", 101 | "text": [ 102 | "+---------+----+----------+------+\n", 103 | "| Name| age|Experience|Salary|\n", 104 | "+---------+----+----------+------+\n", 105 | "| Krish| 31| 10| 30000|\n", 106 | "|Sudhanshu| 30| 8| 25000|\n", 107 | "| Sunny| 29| 4| 20000|\n", 108 | "| Paul| 24| 3| 20000|\n", 109 | "| Harsha| 21| 1| 15000|\n", 110 | "| Shubham| 23| 2| 18000|\n", 111 | "| Mahesh|NULL| NULL| 40000|\n", 112 | "| NULL| 34| 10| 38000|\n", 113 | "| NULL| 36| NULL| NULL|\n", 114 | "+---------+----+----------+------+\n", 115 | "\n" 116 | ] 117 | } 118 | ], 119 | "source": [ 120 | "df_pyspark.show()\n" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 16, 126 | "id": "cae05add-e02d-4366-b752-b7b7b034679a", 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "+---------+---+----------+------+\n", 134 | "| Name|age|Experience|Salary|\n", 135 | "+---------+---+----------+------+\n", 136 | "| Krish| 31| 10| 30000|\n", 137 | "|Sudhanshu| 30| 8| 25000|\n", 138 | "| Sunny| 29| 4| 20000|\n", 139 | "| Paul| 24| 3| 20000|\n", 140 | "| Harsha| 21| 1| 15000|\n", 141 | "| Shubham| 23| 2| 18000|\n", 142 | "+---------+---+----------+------+\n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "df_pyspark.na.drop().show()\n" 149 | ] 150 | }, 151 | { 152 | "cell_type": "code", 153 | "execution_count": 17, 154 | "id": "978c455f-fe3b-431f-8585-4e4e1e43fda8", 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "+---------+---+----------+------+\n", 162 | "| Name|age|Experience|Salary|\n", 163 | "+---------+---+----------+------+\n", 164 | "| Krish| 31| 10| 30000|\n", 165 | "|Sudhanshu| 30| 8| 25000|\n", 166 | "| Sunny| 29| 4| 20000|\n", 167 | "| Paul| 24| 3| 20000|\n", 168 | "| Harsha| 21| 1| 15000|\n", 169 | "| Shubham| 23| 2| 18000|\n", 170 | "+---------+---+----------+------+\n", 171 | "\n" 172 | ] 173 | } 174 | ], 175 | "source": [ 176 | "### any==how\n", 177 | "df_pyspark.na.drop(how=\"any\").show()" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 18, 183 | "id": "a3d34792-db4f-41b4-885e-96183069253a", 184 | "metadata": {}, 185 | "outputs": [ 186 | { 187 | "name": "stdout", 188 | "output_type": "stream", 189 | "text": [ 190 | "+---------+---+----------+------+\n", 191 | "| Name|age|Experience|Salary|\n", 192 | "+---------+---+----------+------+\n", 193 | "| Krish| 31| 10| 30000|\n", 194 | "|Sudhanshu| 30| 8| 25000|\n", 195 | "| Sunny| 29| 4| 20000|\n", 196 | "| Paul| 24| 3| 20000|\n", 197 | "| Harsha| 21| 1| 15000|\n", 198 | "| Shubham| 23| 2| 18000|\n", 199 | "| NULL| 34| 10| 38000|\n", 200 | "+---------+---+----------+------+\n", 201 | "\n" 202 | ] 203 | } 204 | ], 205 | "source": [ 206 | "##threshold\n", 207 | "df_pyspark.na.drop(how=\"any\",thresh=3).show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 19, 213 | "id": "0416e143-663c-4ca3-b451-db14cbd15b0c", 214 | "metadata": {}, 215 | "outputs": [ 216 | { 217 | "name": "stdout", 218 | "output_type": "stream", 219 | "text": [ 220 | "+---------+---+----------+------+\n", 221 | "| Name|age|Experience|Salary|\n", 222 | "+---------+---+----------+------+\n", 223 | "| Krish| 31| 10| 30000|\n", 224 | "|Sudhanshu| 30| 8| 25000|\n", 225 | "| Sunny| 29| 4| 20000|\n", 226 | "| Paul| 24| 3| 20000|\n", 227 | "| Harsha| 21| 1| 15000|\n", 
228 | "| Shubham| 23| 2| 18000|\n", 229 | "| NULL| 34| 10| 38000|\n", 230 | "| NULL| 36| NULL| NULL|\n", 231 | "+---------+---+----------+------+\n", 232 | "\n" 233 | ] 234 | } 235 | ], 236 | "source": [ 237 | "##Subset\n", 238 | "df_pyspark.na.drop(how=\"any\",subset=['Age']).show()" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 28, 244 | "id": "0685ef56-d34b-405a-b171-aecf2ff7a621", 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "+--------------+----+----------+------+\n", 252 | "| Name| age|Experience|Salary|\n", 253 | "+--------------+----+----------+------+\n", 254 | "| Krish| 31| 10| 30000|\n", 255 | "| Sudhanshu| 30| 8| 25000|\n", 256 | "| Sunny| 29| 4| 20000|\n", 257 | "| Paul| 24| 3| 20000|\n", 258 | "| Harsha| 21| 1| 15000|\n", 259 | "| Shubham| 23| 2| 18000|\n", 260 | "| Mahesh|NULL| NULL| 40000|\n", 261 | "|Missing Values| 34| 10| 38000|\n", 262 | "|Missing Values| 36| NULL| NULL|\n", 263 | "+--------------+----+----------+------+\n", 264 | "\n" 265 | ] 266 | } 267 | ], 268 | "source": [ 269 | "### Filling the Missing Value\n", 270 | "from pyspark.sql import SparkSession\n", 271 | "from pyspark.sql.functions import col\n", 272 | "df_pyspark.na.fill('Missing Values').show()\n", 273 | "#not working" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 21, 279 | "id": "1e5d7ee7-bcb1-4c4f-90fd-49637fd12136", 280 | "metadata": {}, 281 | "outputs": [ 282 | { 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "+---------+----+----------+------+\n", 287 | "| Name| age|Experience|Salary|\n", 288 | "+---------+----+----------+------+\n", 289 | "| Krish| 31| 10| 30000|\n", 290 | "|Sudhanshu| 30| 8| 25000|\n", 291 | "| Sunny| 29| 4| 20000|\n", 292 | "| Paul| 24| 3| 20000|\n", 293 | "| Harsha| 21| 1| 15000|\n", 294 | "| Shubham| 23| 2| 18000|\n", 295 | "| Mahesh|NULL| NULL| 40000|\n", 296 | "| NULL| 34| 10| 38000|\n", 297 | "| NULL| 36| NULL| NULL|\n", 298 | "+---------+----+----------+------+\n", 299 | "\n" 300 | ] 301 | } 302 | ], 303 | "source": [ 304 | "df_pyspark.show()" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 22, 310 | "id": "3c2aee0c-f361-41a1-b92b-477aa1946774", 311 | "metadata": {}, 312 | "outputs": [ 313 | { 314 | "name": "stdout", 315 | "output_type": "stream", 316 | "text": [ 317 | "root\n", 318 | " |-- Name: string (nullable = true)\n", 319 | " |-- age: integer (nullable = true)\n", 320 | " |-- Experience: integer (nullable = true)\n", 321 | " |-- Salary: integer (nullable = true)\n", 322 | "\n" 323 | ] 324 | } 325 | ], 326 | "source": [ 327 | "df_pyspark.printSchema()" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 29, 333 | "id": "10b778bd-3044-4896-b5a7-be84a05a41a5", 334 | "metadata": {}, 335 | "outputs": [], 336 | "source": [ 337 | "from pyspark.ml.feature import Imputer\n", 338 | "\n", 339 | "imputer = Imputer(\n", 340 | " inputCols=['age', 'Experience', 'Salary'], \n", 341 | " outputCols=[\"{}_imputed\".format(c) for c in ['age', 'Experience', 'Salary']]\n", 342 | " ).setStrategy(\"median\")" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 31, 348 | "id": "ef331a71-d7d1-417e-8386-45c1547ce033", 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "+---------+----+----------+------+-----------+------------------+--------------+\n", 356 | "| Name| 
age|Experience|Salary|age_imputed|Experience_imputed|Salary_imputed|\n", 357 | "+---------+----+----------+------+-----------+------------------+--------------+\n", 358 | "| Krish| 31| 10| 30000| 31| 10| 30000|\n", 359 | "|Sudhanshu| 30| 8| 25000| 30| 8| 25000|\n", 360 | "| Sunny| 29| 4| 20000| 29| 4| 20000|\n", 361 | "| Paul| 24| 3| 20000| 24| 3| 20000|\n", 362 | "| Harsha| 21| 1| 15000| 21| 1| 15000|\n", 363 | "| Shubham| 23| 2| 18000| 23| 2| 18000|\n", 364 | "| Mahesh|NULL| NULL| 40000| 29| 4| 40000|\n", 365 | "| NULL| 34| 10| 38000| 34| 10| 38000|\n", 366 | "| NULL| 36| NULL| NULL| 36| 4| 20000|\n", 367 | "+---------+----+----------+------+-----------+------------------+--------------+\n", 368 | "\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "a=imputer.fit(df_pyspark).transform(df_pyspark).show()\n", 374 | "a" 375 | ] 376 | }, 377 | { 378 | "cell_type": "code", 379 | "execution_count": 40, 380 | "id": "70d0bf18-92a2-47a8-99f2-48dbf5ac4e97", 381 | "metadata": {}, 382 | "outputs": [ 383 | { 384 | "name": "stdout", 385 | "output_type": "stream", 386 | "text": [ 387 | "+---------+----+----------+------+----------+-----------------+-------------+\n", 388 | "| Name| age|Experience|Salary|age_imuter|Experience_imuter|Salary_imuter|\n", 389 | "+---------+----+----------+------+----------+-----------------+-------------+\n", 390 | "| Krish| 31| 10| 30000| 31| 10| 30000|\n", 391 | "|Sudhanshu| 30| 8| 25000| 30| 8| 25000|\n", 392 | "| Sunny| 29| 4| 20000| 29| 4| 20000|\n", 393 | "| Paul| 24| 3| 20000| 24| 3| 20000|\n", 394 | "| Harsha| 21| 1| 15000| 21| 1| 15000|\n", 395 | "| Shubham| 23| 2| 18000| 23| 2| 18000|\n", 396 | "| Mahesh|NULL| NULL| 40000| 28| 5| 40000|\n", 397 | "| NULL| 34| 10| 38000| 34| 10| 38000|\n", 398 | "| NULL| 36| NULL| NULL| 36| 5| 25750|\n", 399 | "+---------+----+----------+------+----------+-----------------+-------------+\n", 400 | "\n" 401 | ] 402 | } 403 | ], 404 | "source": [ 405 | "from pyspark.ml.feature import Imputer\n", 406 | "imputer=Imputer(\n", 407 | " inputCols=[\"age\",\"Experience\",\"Salary\"],\n", 408 | " outputCols=[\"{}_imuter\".format(c) for c in [\"age\",\"Experience\",\"Salary\"]]).setStrategy(\"mean\")\n", 409 | "a=imputer.fit(df_pyspark).transform(df_pyspark).show()\n", 410 | "a" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 46, 416 | "id": "c97c37eb-1f4a-48db-82aa-d27fbed20d6b", 417 | "metadata": {}, 418 | "outputs": [ 419 | { 420 | "name": "stdout", 421 | "output_type": "stream", 422 | "text": [ 423 | "+---------+----+----------+------+-----------+------------------+--------------+\n", 424 | "| Name| age|Experience|Salary|age_imputer|Experience_imputer|Salary_imputer|\n", 425 | "+---------+----+----------+------+-----------+------------------+--------------+\n", 426 | "| Krish| 31| 10| 30000| 31| 10| 30000|\n", 427 | "|Sudhanshu| 30| 8| 25000| 30| 8| 25000|\n", 428 | "| Sunny| 29| 4| 20000| 29| 4| 20000|\n", 429 | "| Paul| 24| 3| 20000| 24| 3| 20000|\n", 430 | "| Harsha| 21| 1| 15000| 21| 1| 15000|\n", 431 | "| Shubham| 23| 2| 18000| 23| 2| 18000|\n", 432 | "| Mahesh|NULL| NULL| 40000| 28| 5| 40000|\n", 433 | "| NULL| 34| 10| 38000| 34| 10| 38000|\n", 434 | "| NULL| 36| NULL| NULL| 36| 5| 25750|\n", 435 | "+---------+----+----------+------+-----------+------------------+--------------+\n", 436 | "\n" 437 | ] 438 | } 439 | ], 440 | "source": [ 441 | "from pyspark.ml.feature import Imputer\n", 442 | "a=Imputer(\n", 443 | " inputCols=[\"age\",\"Experience\",\"Salary\"],\n", 444 | " 
outputCols=[\"{}_imputer\".format(c) for c in [\"age\",\"Experience\",\"Salary\"]]).setStrategy(\"mean\")\n", 445 | "b=a.fit(df_pyspark).transform(df_pyspark).show()" 446 | ] 447 | }, 448 | { 449 | "cell_type": "code", 450 | "execution_count": null, 451 | "id": "c42c0587-3642-435b-b410-7c34e20eada6", 452 | "metadata": {}, 453 | "outputs": [], 454 | "source": [] 455 | } 456 | ], 457 | "metadata": { 458 | "kernelspec": { 459 | "display_name": "Python 3 (ipykernel)", 460 | "language": "python", 461 | "name": "python3" 462 | }, 463 | "language_info": { 464 | "codemirror_mode": { 465 | "name": "ipython", 466 | "version": 3 467 | }, 468 | "file_extension": ".py", 469 | "mimetype": "text/x-python", 470 | "name": "python", 471 | "nbconvert_exporter": "python", 472 | "pygments_lexer": "ipython3", 473 | "version": "3.11.7" 474 | } 475 | }, 476 | "nbformat": 4, 477 | "nbformat_minor": 5 478 | } 479 | -------------------------------------------------------------------------------- /day4.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b7236a68-3f52-480d-a16d-e11abafd7bee", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "24/04/24 11:33:24 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.240 instead (on interface wlp0s20f3)\n", 14 | "24/04/24 11:33:24 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 15 | "Setting default log level to \"WARN\".\n", 16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 17 | "24/04/24 11:33:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 18 | "24/04/24 11:33:25 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", 19 | "24/04/24 11:33:25 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n" 20 | ] 21 | }, 22 | { 23 | "data": { 24 | "text/html": [ 25 | "\n", 26 | "
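Day3's missing-value handling comes in two flavors: `na.drop`/`na.fill` on the DataFrame, and the `Imputer` estimator for numeric columns. One point the "#not working" comment glosses over: `na.fill("Missing Values")` does work, but a string fill only applies to string columns (here `Name`) and leaves the numeric NULLs untouched, which is exactly the gap the `Imputer` fills. A sketch over `test3.csv`:

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import Imputer

spark = SparkSession.builder.appName("Practise").getOrCreate()
df = spark.read.csv("test3.csv", header=True, inferSchema=True)

df.na.drop(how="any").show()                   # drop rows containing any NULL
df.na.drop(how="any", thresh=3).show()         # keep rows with >= 3 non-NULL values
df.na.drop(how="any", subset=["age"]).show()   # only NULLs in 'age' count
df.na.fill("Missing Values").show()            # fills string columns only

imputer = Imputer(
    inputCols=["age", "Experience", "Salary"],
    outputCols=[f"{c}_imputed" for c in ["age", "Experience", "Salary"]],
).setStrategy("median")                        # or "mean", as in the later cells
imputer.fit(df).transform(df).show()
```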
\n", 27 | "

SparkSession - in-memory

\n", 28 | " \n", 29 | "
\n", 30 | "

SparkContext

\n", 31 | "\n", 32 | "

Spark UI

\n", 33 | "\n", 34 | "
\n", 35 | "
Version
\n", 36 | "
v3.5.1
\n", 37 | "
Master
\n", 38 | "
local[*]
\n", 39 | "
AppName
\n", 40 | "
filter
\n", 41 | "
\n", 42 | "
\n", 43 | " \n", 44 | "
\n", 45 | " " 46 | ], 47 | "text/plain": [ 48 | "" 49 | ] 50 | }, 51 | "execution_count": 1, 52 | "metadata": {}, 53 | "output_type": "execute_result" 54 | } 55 | ], 56 | "source": [ 57 | "from pyspark.sql import SparkSession\n", 58 | "spark=SparkSession.builder.appName(\"filter\").getOrCreate()\n", 59 | "spark" 60 | ] 61 | }, 62 | { 63 | "cell_type": "code", 64 | "execution_count": 2, 65 | "id": "3bf95d12-f487-40fe-b951-3c7a124e1f7f", 66 | "metadata": {}, 67 | "outputs": [ 68 | { 69 | "name": "stdout", 70 | "output_type": "stream", 71 | "text": [ 72 | "+---------+---+----------+------+\n", 73 | "| Name|age|Experience|Salary|\n", 74 | "+---------+---+----------+------+\n", 75 | "| Krish| 31| 10| 30000|\n", 76 | "|Sudhanshu| 30| 8| 25000|\n", 77 | "| Sunny| 29| 4| 20000|\n", 78 | "| Paul| 24| 3| 20000|\n", 79 | "| Harsha| 21| 1| 15000|\n", 80 | "| Shubham| 23| 2| 18000|\n", 81 | "+---------+---+----------+------+\n", 82 | "\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "df_spark=spark.read.csv(\"test4.csv\",header=True,inferSchema=True)\n", 88 | "df_spark.show()" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": 4, 94 | "id": "388abc67-31a2-41c0-849c-535a1f60894d", 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+-------+---+----------+------+\n", 102 | "| Name|age|Experience|Salary|\n", 103 | "+-------+---+----------+------+\n", 104 | "| Sunny| 29| 4| 20000|\n", 105 | "| Paul| 24| 3| 20000|\n", 106 | "| Harsha| 21| 1| 15000|\n", 107 | "|Shubham| 23| 2| 18000|\n", 108 | "+-------+---+----------+------+\n", 109 | "\n" 110 | ] 111 | } 112 | ], 113 | "source": [ 114 | "##salry of people <= 2000\n", 115 | "df_spark.filter(\"Salary<=20000\").show()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "id": "8ee68b63-8680-4232-85e7-44da79d629d4", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "+---------+------+\n", 129 | "| Name|Salary|\n", 130 | "+---------+------+\n", 131 | "| Krish| 30000|\n", 132 | "|Sudhanshu| 25000|\n", 133 | "+---------+------+\n", 134 | "\n" 135 | ] 136 | } 137 | ], 138 | "source": [ 139 | "df_spark.filter(\"Experience >= 5\").select([\"Name\",\"Salary\"]).show()" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 7, 145 | "id": "28a07f16-b0b1-4c28-b190-389279656fe5", 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "name": "stdout", 150 | "output_type": "stream", 151 | "text": [ 152 | "+-------+---+----------+------+\n", 153 | "| Name|age|Experience|Salary|\n", 154 | "+-------+---+----------+------+\n", 155 | "| Sunny| 29| 4| 20000|\n", 156 | "| Paul| 24| 3| 20000|\n", 157 | "| Harsha| 21| 1| 15000|\n", 158 | "|Shubham| 23| 2| 18000|\n", 159 | "+-------+---+----------+------+\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "df_spark.filter(df_spark[\"Salary\"]<=20000).show()" 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "execution_count": 14, 171 | "id": "4b86de2a-82df-4b4a-89d6-3c269ab2ade7", 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "+-------+---+----------+------+\n", 179 | "| Name|age|Experience|Salary|\n", 180 | "+-------+---+----------+------+\n", 181 | "| Sunny| 29| 4| 20000|\n", 182 | "| Paul| 24| 3| 20000|\n", 183 | "| Harsha| 21| 1| 15000|\n", 184 | "|Shubham| 23| 2| 18000|\n", 185 | 
"+-------+---+----------+------+\n", 186 | "\n" 187 | ] 188 | } 189 | ], 190 | "source": [ 191 | "df_spark.filter((df_spark[\"Salary\"]<=20000) & \n", 192 | " (df_spark[\"Salary\"]>=15000)).show()" 193 | ] 194 | }, 195 | { 196 | "cell_type": "code", 197 | "execution_count": 16, 198 | "id": "b7efea59-ce39-4ecb-9ec8-0531ff2477c1", 199 | "metadata": {}, 200 | "outputs": [ 201 | { 202 | "name": "stdout", 203 | "output_type": "stream", 204 | "text": [ 205 | "+---------+---+----------+------+\n", 206 | "| Name|age|Experience|Salary|\n", 207 | "+---------+---+----------+------+\n", 208 | "| Krish| 31| 10| 30000|\n", 209 | "|Sudhanshu| 30| 8| 25000|\n", 210 | "+---------+---+----------+------+\n", 211 | "\n" 212 | ] 213 | } 214 | ], 215 | "source": [ 216 | "df_spark.filter(~(df_spark[\"Salary\"]<=20000)).show()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": null, 222 | "id": "c7843daa-3229-434c-9f44-2ab860b3a82d", 223 | "metadata": {}, 224 | "outputs": [], 225 | "source": [] 226 | } 227 | ], 228 | "metadata": { 229 | "kernelspec": { 230 | "display_name": "Python 3 (ipykernel)", 231 | "language": "python", 232 | "name": "python3" 233 | }, 234 | "language_info": { 235 | "codemirror_mode": { 236 | "name": "ipython", 237 | "version": 3 238 | }, 239 | "file_extension": ".py", 240 | "mimetype": "text/x-python", 241 | "name": "python", 242 | "nbconvert_exporter": "python", 243 | "pygments_lexer": "ipython3", 244 | "version": "3.11.7" 245 | } 246 | }, 247 | "nbformat": 4, 248 | "nbformat_minor": 5 249 | } 250 | -------------------------------------------------------------------------------- /day5.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "248ea11a-8dce-4633-a7c5-5a48b6daf85e", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "24/04/24 11:52:13 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.240 instead (on interface wlp0s20f3)\n", 14 | "24/04/24 11:52:13 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 15 | "Setting default log level to \"WARN\".\n", 16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 17 | "24/04/24 11:52:14 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable\n", 18 | "24/04/24 11:52:14 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.\n", 19 | "24/04/24 11:52:14 WARN Utils: Service 'SparkUI' could not bind on port 4041. Attempting port 4042.\n", 20 | "24/04/24 11:52:14 WARN Utils: Service 'SparkUI' could not bind on port 4042. Attempting port 4043.\n" 21 | ] 22 | }, 23 | { 24 | "data": { 25 | "text/html": [ 26 | "\n", 27 | "
\n", 28 | "

SparkSession - in-memory

\n", 29 | " \n", 30 | "
\n", 31 | "

SparkContext

\n", 32 | "\n", 33 | "

Spark UI

\n", 34 | "\n", 35 | "
\n", 36 | "
Version
\n", 37 | "
v3.5.1
\n", 38 | "
Master
\n", 39 | "
local[*]
\n", 40 | "
AppName
\n", 41 | "
kani1
\n", 42 | "
\n", 43 | "
\n", 44 | " \n", 45 | "
\n", 46 | " " 47 | ], 48 | "text/plain": [ 49 | "" 50 | ] 51 | }, 52 | "execution_count": 1, 53 | "metadata": {}, 54 | "output_type": "execute_result" 55 | } 56 | ], 57 | "source": [ 58 | "from pyspark.sql import SparkSession\n", 59 | "spark=SparkSession.builder.appName(\"kani1\").getOrCreate()\n", 60 | "spark" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 2, 66 | "id": "91ed6d86-65c4-4d75-b531-7fde1090ff1c", 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "name": "stdout", 71 | "output_type": "stream", 72 | "text": [ 73 | "+---------+------------+------+\n", 74 | "| Name| Departments|salary|\n", 75 | "+---------+------------+------+\n", 76 | "| Krish|Data Science| 10000|\n", 77 | "| Krish| IOT| 5000|\n", 78 | "| Mahesh| Big Data| 4000|\n", 79 | "| Krish| Big Data| 4000|\n", 80 | "| Mahesh|Data Science| 3000|\n", 81 | "|Sudhanshu|Data Science| 20000|\n", 82 | "|Sudhanshu| IOT| 10000|\n", 83 | "|Sudhanshu| Big Data| 5000|\n", 84 | "| Sunny|Data Science| 10000|\n", 85 | "| Sunny| Big Data| 2000|\n", 86 | "+---------+------------+------+\n", 87 | "\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "df_spark=spark.read.csv(\"test5.csv\",header=True,inferSchema=True)\n", 93 | "df_spark.show()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "code", 98 | "execution_count": 3, 99 | "id": "aa4ed17a-2f27-42f1-9495-861cea0ba131", 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "root\n", 107 | " |-- Name: string (nullable = true)\n", 108 | " |-- Departments: string (nullable = true)\n", 109 | " |-- salary: integer (nullable = true)\n", 110 | "\n" 111 | ] 112 | } 113 | ], 114 | "source": [ 115 | "df_spark.printSchema()" 116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 10, 121 | "id": "e9298517-0c79-4edb-8e8b-34960a04cb36", 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "data": { 126 | "text/plain": [ 127 | "DataFrame[Name: string, sum(salary): bigint]" 128 | ] 129 | }, 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "df_spark.groupBy(\"Name\").sum()" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 11, 142 | "id": "964116b0-c611-46c5-a86d-f5158aea0f37", 143 | "metadata": {}, 144 | "outputs": [ 145 | { 146 | "name": "stdout", 147 | "output_type": "stream", 148 | "text": [ 149 | "+---------+-----------+\n", 150 | "| Name|sum(salary)|\n", 151 | "+---------+-----------+\n", 152 | "|Sudhanshu| 35000|\n", 153 | "| Sunny| 12000|\n", 154 | "| Krish| 19000|\n", 155 | "| Mahesh| 7000|\n", 156 | "+---------+-----------+\n", 157 | "\n" 158 | ] 159 | } 160 | ], 161 | "source": [ 162 | "#groupby\n", 163 | "df_spark.groupBy(\"Name\").sum().show()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 13, 169 | "id": "8fbc862a-fffd-414e-b8fd-29b1823893f3", 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "+------------+-----------+\n", 177 | "| Departments|sum(salary)|\n", 178 | "+------------+-----------+\n", 179 | "| IOT| 15000|\n", 180 | "| Big Data| 15000|\n", 181 | "|Data Science| 43000|\n", 182 | "+------------+-----------+\n", 183 | "\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "df_spark.groupBy(\"Departments\").sum().show()" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 16, 194 | "id": "775124c8-8a1b-4434-9092-f2f6ed8f9ff2", 195 | 
"metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "+------------+-----+\n", 202 | "| Departments|count|\n", 203 | "+------------+-----+\n", 204 | "| IOT| 2|\n", 205 | "| Big Data| 4|\n", 206 | "|Data Science| 4|\n", 207 | "+------------+-----+\n", 208 | "\n" 209 | ] 210 | } 211 | ], 212 | "source": [ 213 | "df_spark.groupBy(\"Departments\").count().show()" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 18, 219 | "id": "a8ff04cb-a9b5-47f9-a300-ddc2fde7d67a", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "+-----------+\n", 227 | "|sum(Salary)|\n", 228 | "+-----------+\n", 229 | "| 73000|\n", 230 | "+-----------+\n", 231 | "\n" 232 | ] 233 | } 234 | ], 235 | "source": [ 236 | "df_spark.agg({\"Salary\":\"sum\"}).show()" 237 | ] 238 | }, 239 | { 240 | "cell_type": "code", 241 | "execution_count": 19, 242 | "id": "2130f698-7aae-4d76-8acf-4b2b0ce9d784", 243 | "metadata": {}, 244 | "outputs": [ 245 | { 246 | "name": "stdout", 247 | "output_type": "stream", 248 | "text": [ 249 | "+---------+-----------+\n", 250 | "| Name|max(salary)|\n", 251 | "+---------+-----------+\n", 252 | "|Sudhanshu| 20000|\n", 253 | "| Sunny| 10000|\n", 254 | "| Krish| 10000|\n", 255 | "| Mahesh| 4000|\n", 256 | "+---------+-----------+\n", 257 | "\n" 258 | ] 259 | } 260 | ], 261 | "source": [ 262 | "df_spark.groupBy(\"Name\").max().show()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": null, 268 | "id": "23590cf1-9e11-42a6-ad20-06f912417643", 269 | "metadata": {}, 270 | "outputs": [], 271 | "source": [] 272 | } 273 | ], 274 | "metadata": { 275 | "kernelspec": { 276 | "display_name": "Python 3 (ipykernel)", 277 | "language": "python", 278 | "name": "python3" 279 | }, 280 | "language_info": { 281 | "codemirror_mode": { 282 | "name": "ipython", 283 | "version": 3 284 | }, 285 | "file_extension": ".py", 286 | "mimetype": "text/x-python", 287 | "name": "python", 288 | "nbconvert_exporter": "python", 289 | "pygments_lexer": "ipython3", 290 | "version": "3.11.7" 291 | } 292 | }, 293 | "nbformat": 4, 294 | "nbformat_minor": 5 295 | } 296 | -------------------------------------------------------------------------------- /day6.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 12, 6 | "id": "f3564fea-39ad-48fc-a2fb-dcc2842d46cb", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stderr", 11 | "output_type": "stream", 12 | "text": [ 13 | "24/04/26 09:50:33 WARN Utils: Your hostname, kyn resolves to a loopback address: 127.0.1.1; using 10.0.250.224 instead (on interface wlp0s20f3)\n", 14 | "24/04/26 09:50:33 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address\n", 15 | "Setting default log level to \"WARN\".\n", 16 | "To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).\n", 17 | "24/04/26 09:50:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... 
using builtin-java classes where applicable\n" 18 | ] 19 | } 20 | ], 21 | "source": [ 22 | "from pyspark.sql import SparkSession\n", 23 | "spark=SparkSession.builder.appName(\"k\").getOrCreate()" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 13, 29 | "id": "f969b760-3d18-4ee3-a2d0-a8a0b25ceb72", 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/html": [ 35 | "\n", 36 | "
 <div>\n", 37 | " <p><b>SparkSession - in-memory</b></p>\n", 38 | " \n", 39 | " <div>\n", 40 | " <p><b>SparkContext</b></p>\n", 41 | "\n", 42 | " <p><a href=\"http://10.0.250.224:4040\">Spark UI</a></p>\n", 43 | "\n", 44 | " <dl>\n", 45 | " <dt>Version</dt>\n", 46 | " <dd><code>v3.5.1</code></dd>\n", 47 | " <dt>Master</dt>\n", 48 | " <dd><code>local[*]</code></dd>\n", 49 | " <dt>AppName</dt>\n", 50 | " <dd><code>k</code></dd>\n", 51 | " </dl>\n", 52 | " </div>\n", 53 | " \n", 54 | " </div>
\n", 55 | " " 56 | ], 57 | "text/plain": [ 58 | "" 59 | ] 60 | }, 61 | "execution_count": 13, 62 | "metadata": {}, 63 | "output_type": "execute_result" 64 | } 65 | ], 66 | "source": [ 67 | "spark" 68 | ] 69 | }, 70 | { 71 | "cell_type": "code", 72 | "execution_count": 14, 73 | "id": "a7b85e69-8425-420a-a8ee-a151e1857e44", 74 | "metadata": {}, 75 | "outputs": [ 76 | { 77 | "name": "stdout", 78 | "output_type": "stream", 79 | "text": [ 80 | "+---------+---+----------+------+\n", 81 | "| Name|age|Experience|Salary|\n", 82 | "+---------+---+----------+------+\n", 83 | "| Krish| 31| 10| 30000|\n", 84 | "|Sudhanshu| 30| 8| 25000|\n", 85 | "| Sunny| 29| 4| 20000|\n", 86 | "| Paul| 24| 3| 20000|\n", 87 | "| Harsha| 21| 1| 15000|\n", 88 | "| Shubham| 23| 2| 18000|\n", 89 | "+---------+---+----------+------+\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "df_spark=spark.read.csv(\"test4.csv\",header=True,inferSchema=True)\n", 96 | "df_spark.show()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": 15, 102 | "id": "2001e4cf-c3ec-44a1-a475-57e57e75d66f", 103 | "metadata": {}, 104 | "outputs": [ 105 | { 106 | "name": "stdout", 107 | "output_type": "stream", 108 | "text": [ 109 | "root\n", 110 | " |-- Name: string (nullable = true)\n", 111 | " |-- age: integer (nullable = true)\n", 112 | " |-- Experience: integer (nullable = true)\n", 113 | " |-- Salary: integer (nullable = true)\n", 114 | "\n" 115 | ] 116 | } 117 | ], 118 | "source": [ 119 | "df_spark.printSchema()" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 16, 125 | "id": "c4b63544-914e-4f6b-bff1-bdf91cdb4e4b", 126 | "metadata": {}, 127 | "outputs": [ 128 | { 129 | "data": { 130 | "text/plain": [ 131 | "['Name', 'age', 'Experience', 'Salary']" 132 | ] 133 | }, 134 | "execution_count": 16, 135 | "metadata": {}, 136 | "output_type": "execute_result" 137 | } 138 | ], 139 | "source": [ 140 | "df_spark.columns" 141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 17, 146 | "id": "4fd632bf-322a-4f16-a39b-4220fbcbb4e3", 147 | "metadata": {}, 148 | "outputs": [ 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "+---------+---+----------+------+-------------------+\n", 154 | "| Name|age|Experience|Salary|Independent feature|\n", 155 | "+---------+---+----------+------+-------------------+\n", 156 | "| Krish| 31| 10| 30000| [31.0,10.0]|\n", 157 | "|Sudhanshu| 30| 8| 25000| [30.0,8.0]|\n", 158 | "| Sunny| 29| 4| 20000| [29.0,4.0]|\n", 159 | "| Paul| 24| 3| 20000| [24.0,3.0]|\n", 160 | "| Harsha| 21| 1| 15000| [21.0,1.0]|\n", 161 | "| Shubham| 23| 2| 18000| [23.0,2.0]|\n", 162 | "+---------+---+----------+------+-------------------+\n", 163 | "\n" 164 | ] 165 | } 166 | ], 167 | "source": [ 168 | "#vectorassembler for grouping or combining\n", 169 | "from pyspark.ml.feature import VectorAssembler\n", 170 | "feature=VectorAssembler(inputCols=[\"age\",\"Experience\"],\n", 171 | " outputCol=\"Independent feature\")\n", 172 | "output=feature.transform(df_spark)\n", 173 | "output.show()" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 18, 179 | "id": "8d60a9b6-70a3-4193-a9b3-df31a5bb23be", 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "data": { 184 | "text/plain": [ 185 | "['Name', 'age', 'Experience', 'Salary', 'Independent feature']" 186 | ] 187 | }, 188 | "execution_count": 18, 189 | "metadata": {}, 190 | "output_type": "execute_result" 191 | } 192 | ], 193 | "source": [ 194 | "output.columns" 195 | ]
196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 19, 200 | "id": "13672511-bd72-467d-9447-035a446387fe", 201 | "metadata": {}, 202 | "outputs": [ 203 | { 204 | "name": "stdout", 205 | "output_type": "stream", 206 | "text": [ 207 | "+-------------------+------+\n", 208 | "|Independent feature|Salary|\n", 209 | "+-------------------+------+\n", 210 | "| [31.0,10.0]| 30000|\n", 211 | "| [30.0,8.0]| 25000|\n", 212 | "| [29.0,4.0]| 20000|\n", 213 | "| [24.0,3.0]| 20000|\n", 214 | "| [21.0,1.0]| 15000|\n", 215 | "| [23.0,2.0]| 18000|\n", 216 | "+-------------------+------+\n", 217 | "\n" 218 | ] 219 | } 220 | ], 221 | "source": [ 222 | "finalize=output.select(\"Independent feature\",\"Salary\")\n", 223 | "finalize.show()" 224 | ] 225 | }, 226 | { 227 | "cell_type": "code", 228 | "execution_count": 20, 229 | "id": "b947c6bf-b2d9-40cd-a56a-5967ded3f2b2", 230 | "metadata": {}, 231 | "outputs": [ 232 | { 233 | "name": "stderr", 234 | "output_type": "stream", 235 | "text": [ 236 | "24/04/26 09:50:40 WARN Instrumentation: [b67eb33b] regParam is zero, which might cause numerical instability and overfitting.\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "#linear regression\n", 242 | "from pyspark.ml.regression import LinearRegression\n", 243 | "#split\n", 244 | "train_data,test_data=finalize.randomSplit([0.75,0.25])\n", 245 | "#implement model\n", 246 | "regressor=LinearRegression(\n", 247 | " featuresCol=\"Independent feature\",\n", 248 | " labelCol=\"Salary\"\n", 249 | ")\n", 250 | "#fit model\n", 251 | "regressor=regressor.fit(train_data)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 21, 257 | "id": "f06a1ffc-25a6-4c78-8c22-1c866aa8f7be", 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "data": { 262 | "text/plain": [ 263 | "DenseVector([28.4757, 1271.3568])" 264 | ] 265 | }, 266 | "execution_count": 21, 267 | "metadata": {}, 268 | "output_type": "execute_result" 269 | } 270 | ], 271 | "source": [ 272 | "regressor.coefficients" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 22, 278 | "id": "946ee173-bc85-4f56-8481-63ddc13601d1", 279 | "metadata": {}, 280 | "outputs": [ 281 | { 282 | "data": { 283 | "text/plain": [ 284 | "14299.832495812996" 285 | ] 286 | }, 287 | "execution_count": 22, 288 | "metadata": {}, 289 | "output_type": "execute_result" 290 | } 291 | ], 292 | "source": [ 293 | "regressor.intercept" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": 23, 299 | "id": "917cadff-be7f-4b0c-b80d-20021a233963", 300 | "metadata": {}, 301 | "outputs": [ 302 | { 303 | "name": "stdout", 304 | "output_type": "stream", 305 | "text": [ 306 | "+-------------------+------+------------------+\n", 307 | "|Independent feature|Salary| prediction|\n", 308 | "+-------------------+------+------------------+\n", 309 | "| [31.0,10.0]| 30000|27896.147403685147|\n", 310 | "+-------------------+------+------------------+\n", 311 | "\n" 312 | ] 313 | } 314 | ], 315 | "source": [ 316 | "pred_results=regressor.evaluate(test_data)\n", 317 | "#predict model\n", 318 | "pred_results.predictions.show()" 319 | ] 320 | }, 321 | { 322 | "cell_type": "code", 323 | "execution_count": 31, 324 | "id": "993faeb3-b50e-407f-9dee-c95a61a57e75", 325 | "metadata": {}, 326 | "outputs": [ 327 | { 328 | "name": "stdout", 329 | "output_type": "stream", 330 | "text": [ 331 | "+---------+---+----------+------+-------------------+\n", 332 | "| Name|age|Experience|Salary|Independent feature|\n", 333 |
"+---------+---+----------+------+-------------------+\n", 334 | "| Krish| 31| 10| 30000| [31.0,10.0]|\n", 335 | "|Sudhanshu| 30| 8| 25000| [30.0,8.0]|\n", 336 | "| Sunny| 29| 4| 20000| [29.0,4.0]|\n", 337 | "| Paul| 24| 3| 20000| [24.0,3.0]|\n", 338 | "| Harsha| 21| 1| 15000| [21.0,1.0]|\n", 339 | "| Shubham| 23| 2| 18000| [23.0,2.0]|\n", 340 | "+---------+---+----------+------+-------------------+\n", 341 | "\n" 342 | ] 343 | } 344 | ], 345 | "source": [ 346 | "from pyspark.ml.feature import VectorAssembler\n", 347 | "a=VectorAssembler(\n", 348 | " inputCols=[\"age\",\"Experience\"],\n", 349 | " outputCol=\"Independent feature\")\n", 350 | "b=a.transform(df_spark)\n", 351 | "b.show()\n" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 36, 357 | "id": "1a946c2d-1fcd-4522-ab02-c3798713baca", 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "+------+-------------------+\n", 365 | "|Salary|Independent feature|\n", 366 | "+------+-------------------+\n", 367 | "| 30000| [31.0,10.0]|\n", 368 | "| 25000| [30.0,8.0]|\n", 369 | "| 20000| [29.0,4.0]|\n", 370 | "| 20000| [24.0,3.0]|\n", 371 | "| 15000| [21.0,1.0]|\n", 372 | "| 18000| [23.0,2.0]|\n", 373 | "+------+-------------------+\n", 374 | "\n" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "finalize=b.select(\"Salary\",\"Independent feature\")\n", 380 | "finalize.show()" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": null, 386 | "id": "ba9a91bf-557e-4100-9244-7a86c7f89197", 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 38, 394 | "id": "f2289e39-cdd8-4961-8779-7c2ecfa2441d", 395 | "metadata": {}, 396 | "outputs": [ 397 | { 398 | "name": "stderr", 399 | "output_type": "stream", 400 | "text": [ 401 | "24/04/26 09:57:06 WARN Instrumentation: [c3a3d75c] regParam is zero, which might cause numerical instability and overfitting.\n" 402 | ] 403 | } 404 | ], 405 | "source": [ 406 | "from pyspark.ml.regression import LinearRegression\n", 407 | "train_data,test_data=finalize.randomSplit([0.75,0.25])\n", 408 | "reg=LinearRegression(\n", 409 | " featuresCol=\"Independent feature\",\n", 410 | " labelCol=\"Salary\"\n", 411 | ")\n", 412 | "reg=reg.fit(train_data)\n" 413 | ] 414 | }, 415 | { 416 | "cell_type": "code", 417 | "execution_count": 41, 418 | "id": "e2a19a28-8d7b-449d-810e-8d1b2b1ff94a", 419 | "metadata": {}, 420 | "outputs": [ 421 | { 422 | "data": { 423 | "text/plain": [ 424 | "DenseVector([172.4138, 1206.8966])" 425 | ] 426 | }, 427 | "execution_count": 41, 428 | "metadata": {}, 429 | "output_type": "execute_result" 430 | } 431 | ], 432 | "source": [ 433 | "reg.coefficients" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": 42, 439 | "id": "a946ea6d-5a2c-48bf-aee5-504b9d9731e4", 440 | "metadata": {}, 441 | "outputs": [ 442 | { 443 | "data": { 444 | "text/plain": [ 445 | "10172.41379310354" 446 | ] 447 | }, 448 | "execution_count": 42, 449 | "metadata": {}, 450 | "output_type": "execute_result" 451 | } 452 | ], 453 | "source": [ 454 | "reg.intercept" 455 | ] 456 | }, 457 | { 458 | "cell_type": "code", 459 | "execution_count": 46, 460 | "id": "0f9f0a58-296d-483b-992f-cab02e51ef2c", 461 | "metadata": {}, 462 | "outputs": [ 463 | { 464 | "name": "stdout", 465 | "output_type": "stream", 466 | "text": [ 467 | "+------+-------------------+------------------+\n", 468 | "|Salary|Independent feature| 
prediction|\n", 469 | "+------+-------------------+------------------+\n", 470 | "| 18000| [23.0,2.0]|16551.724137931044|\n", 471 | "| 20000| [24.0,3.0]|17931.034482758627|\n", 472 | "| 30000| [31.0,10.0]|27586.206896551732|\n", 473 | "+------+-------------------+------------------+\n", 474 | "\n" 475 | ] 476 | } 477 | ], 478 | "source": [ 479 | "x=reg.evaluate(test_data)\n", 480 | "x.predictions.show()" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "id": "9b4508e3-b794-4b56-94b3-59dba6a687d0", 487 | "metadata": {}, 488 | "outputs": [], 489 | "source": [] 490 | } 491 | ], 492 | "metadata": { 493 | "kernelspec": { 494 | "display_name": "Python 3 (ipykernel)", 495 | "language": "python", 496 | "name": "python3" 497 | }, 498 | "language_info": { 499 | "codemirror_mode": { 500 | "name": "ipython", 501 | "version": 3 502 | }, 503 | "file_extension": ".py", 504 | "mimetype": "text/x-python", 505 | "name": "python", 506 | "nbconvert_exporter": "python", 507 | "pygments_lexer": "ipython3", 508 | "version": "3.11.7" 509 | } 510 | }, 511 | "nbformat": 4, 512 | "nbformat_minor": 5 513 | } 514 | -------------------------------------------------------------------------------- /day7.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "application/vnd.databricks.v1+cell": { 7 | "cellMetadata": {}, 8 | "inputWidgets": {}, 9 | "nuid": "96816ed7-b08a-4ca3-abb9-f99880c3535d", 10 | "showTitle": false, 11 | "title": "" 12 | } 13 | }, 14 | "source": [ 15 | "\n", 16 | "## Overview\n", 17 | "\n", 18 | "This notebook will show you how to create and query a table or DataFrame that you uploaded to DBFS. [DBFS](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) is a Databricks File System that allows you to store data for querying inside of Databricks. This notebook assumes that you have a file already inside of DBFS that you would like to read from.\n", 19 | "\n", 20 | "This notebook is written in **Python** so the default cell type is Python. However, you can use different languages by using the `%LANGUAGE` syntax. Python, Scala, SQL, and R are all supported." 
21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 0, 26 | "metadata": { 27 | "application/vnd.databricks.v1+cell": { 28 | "cellMetadata": { 29 | "byteLimit": 2048000, 30 | "rowLimit": 10000 31 | }, 32 | "inputWidgets": {}, 33 | "nuid": "6482be4c-f067-47c9-b0ac-35c938b94601", 34 | "showTitle": false, 35 | "title": "" 36 | } 37 | }, 38 | "outputs": [ 39 | { 40 | "output_type": "stream", 41 | "name": "stdout", 42 | "output_type": "stream", 43 | "text": [ 44 | "+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "# File location and type\n", 50 | "file_location = \"/FileStore/tables/tips.csv\"\n", 51 | "file_type = \"csv\"\n", 52 | "# The applied options are for CSV files. For other file types, these will be ignored.\n", 53 | "df = spark.read.csv(file_location,header=True,inferSchema=True)\n", 54 | "df.show()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 0, 60 | "metadata": { 61 | "application/vnd.databricks.v1+cell": { 62 | "cellMetadata": { 63 | "byteLimit": 2048000, 64 | "rowLimit": 10000 65 | }, 66 | "inputWidgets": {}, 67 | "nuid": "bd82bb99-1479-4d5c-be10-8c36df0f1d44", 68 | "showTitle": false, 69 | "title": "" 70 | } 71 | }, 72 | "outputs": [ 73 | { 74 | "output_type": "stream", 75 | "name": "stdout", 76 | "output_type": "stream", 77 | "text": [ 78 | "root\n |-- total_bill: double (nullable = true)\n |-- tip: double (nullable = true)\n |-- sex: string (nullable = true)\n |-- smoker: string (nullable = true)\n |-- day: string (nullable = true)\n |-- time: string (nullable = true)\n |-- size: integer (nullable = true)\n\n" 79 | ] 80 | } 81 | ], 82 | "source": [ 83 | "df.printSchema()" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 0, 89 | "metadata": { 90 | "application/vnd.databricks.v1+cell": { 91 | "cellMetadata": { 92 | "byteLimit": 2048000, 93 | "rowLimit": 10000 94 | }, 95 | "inputWidgets": {}, 96 | "nuid": "b5f66379-6f7f-42ec-8e82-d0e0926a1721", 97 | "showTitle": false, 98 | "title": "" 99 | } 100 | }, 101 | "outputs": [ 102 | { 103 | "output_type": "stream", 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "Out[11]: ['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']" 108 | ] 109 | } 110 | ], 111 | "source": [ 112 | "df.columns" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": 0, 118 | "metadata": { 119 | "application/vnd.databricks.v1+cell": { 120 | "cellMetadata": { 121 | "byteLimit": 2048000, 122 | "rowLimit": 10000 123 | }, 124 | "inputWidgets": {}, 125 | 
"nuid": "a518f51a-d8d4-49b8-ab77-1abd1d04551b", 126 | "showTitle": false, 127 | "title": "" 128 | } 129 | }, 130 | "outputs": [], 131 | "source": [ 132 | "#handle categorical features\n", 133 | "from pyspark.ml.feature import StringIndexer\n", 134 | "#stringindexer means strings to numeric like 0,1 yes/no type values" 135 | ] 136 | }, 137 | { 138 | "cell_type": "code", 139 | "execution_count": 0, 140 | "metadata": { 141 | "application/vnd.databricks.v1+cell": { 142 | "cellMetadata": { 143 | "byteLimit": 2048000, 144 | "rowLimit": 10000 145 | }, 146 | "inputWidgets": {}, 147 | "nuid": "58839a05-de70-40cb-ac2b-56de113fca09", 148 | "showTitle": false, 149 | "title": "" 150 | } 151 | }, 152 | "outputs": [ 153 | { 154 | "output_type": "stream", 155 | "name": "stdout", 156 | "output_type": "stream", 157 | "text": [ 158 | "+----------+----+------+------+---+------+----+\n|total_bill| tip| sex|smoker|day| time|size|\n+----------+----+------+------+---+------+----+\n| 16.99|1.01|Female| No|Sun|Dinner| 2|\n| 10.34|1.66| Male| No|Sun|Dinner| 3|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3|\n| 23.68|3.31| Male| No|Sun|Dinner| 2|\n| 24.59|3.61|Female| No|Sun|Dinner| 4|\n| 25.29|4.71| Male| No|Sun|Dinner| 4|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2|\n| 26.88|3.12| Male| No|Sun|Dinner| 4|\n| 15.04|1.96| Male| No|Sun|Dinner| 2|\n| 14.78|3.23| Male| No|Sun|Dinner| 2|\n| 10.27|1.71| Male| No|Sun|Dinner| 2|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4|\n| 15.42|1.57| Male| No|Sun|Dinner| 2|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4|\n| 14.83|3.02|Female| No|Sun|Dinner| 2|\n| 21.58|3.92| Male| No|Sun|Dinner| 2|\n| 10.33|1.67|Female| No|Sun|Dinner| 3|\n| 16.29|3.71| Male| No|Sun|Dinner| 3|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3|\n| 20.65|3.35| Male| No|Sat|Dinner| 3|\n+----------+----+------+------+---+------+----+\nonly showing top 20 rows\n\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "df.show()" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 0, 169 | "metadata": { 170 | "application/vnd.databricks.v1+cell": { 171 | "cellMetadata": { 172 | "byteLimit": 2048000, 173 | "rowLimit": 10000 174 | }, 175 | "inputWidgets": {}, 176 | "nuid": "be1c1738-3f6e-4de0-bfcc-1af15ad3e8d8", 177 | "showTitle": false, 178 | "title": "" 179 | } 180 | }, 181 | "outputs": [ 182 | { 183 | "output_type": "stream", 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+\n|total_bill| tip| sex|smoker|day| time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+\n| 16.99|1.01|Female| No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 10.34|1.66| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 21.01| 3.5| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 23.68|3.31| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 24.59|3.61|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 25.29|4.71| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 8.77| 2.0| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 26.88|3.12| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 15.04|1.96| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 14.78|3.23| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.27|1.71| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 35.26| 5.0|Female| No|Sun|Dinner| 4| 1.0| 0.0| 1.0| 0.0|\n| 15.42|1.57| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 18.43| 3.0| Male| No|Sun|Dinner| 4| 0.0| 0.0| 1.0| 0.0|\n| 14.83|3.02|Female| 
No|Sun|Dinner| 2| 1.0| 0.0| 1.0| 0.0|\n| 21.58|3.92| Male| No|Sun|Dinner| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.33|1.67|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 16.29|3.71| Male| No|Sun|Dinner| 3| 0.0| 0.0| 1.0| 0.0|\n| 16.97| 3.5|Female| No|Sun|Dinner| 3| 1.0| 0.0| 1.0| 0.0|\n| 20.65|3.35| Male| No|Sat|Dinner| 3| 0.0| 0.0| 0.0| 0.0|\n+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+\nonly showing top 20 rows\n\n" 188 | ] 189 | } 190 | ], 191 | "source": [ 192 | "index=StringIndexer(\n", 193 | " inputCols=[\"sex\",\"smoker\",\"day\",\"time\"],\n", 194 | " outputCols=[\"sex_indexed\",\"smoker_indexed\",\"day_indexed\",\"time_indexed\"]\n", 195 | " )\n", 196 | "df_r=index.fit(df).transform(df)\n", 197 | "df_r.show()" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 0, 203 | "metadata": { 204 | "application/vnd.databricks.v1+cell": { 205 | "cellMetadata": { 206 | "byteLimit": 2048000, 207 | "rowLimit": 10000 208 | }, 209 | "inputWidgets": {}, 210 | "nuid": "06e561a4-05a6-43d0-ab19-27cfdaf4afcb", 211 | "showTitle": false, 212 | "title": "" 213 | } 214 | }, 215 | "outputs": [ 216 | { 217 | "output_type": "stream", 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | "+----------+----+----+-----------+--------------+-----------+------------+\n|total_bill| tip|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|\n+----------+----+----+-----------+--------------+-----------+------------+\n| 16.99|1.01| 2| 1.0| 0.0| 1.0| 0.0|\n| 10.34|1.66| 3| 0.0| 0.0| 1.0| 0.0|\n| 21.01| 3.5| 3| 0.0| 0.0| 1.0| 0.0|\n| 23.68|3.31| 2| 0.0| 0.0| 1.0| 0.0|\n| 24.59|3.61| 4| 1.0| 0.0| 1.0| 0.0|\n| 25.29|4.71| 4| 0.0| 0.0| 1.0| 0.0|\n| 8.77| 2.0| 2| 0.0| 0.0| 1.0| 0.0|\n| 26.88|3.12| 4| 0.0| 0.0| 1.0| 0.0|\n| 15.04|1.96| 2| 0.0| 0.0| 1.0| 0.0|\n| 14.78|3.23| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.27|1.71| 2| 0.0| 0.0| 1.0| 0.0|\n| 35.26| 5.0| 4| 1.0| 0.0| 1.0| 0.0|\n| 15.42|1.57| 2| 0.0| 0.0| 1.0| 0.0|\n| 18.43| 3.0| 4| 0.0| 0.0| 1.0| 0.0|\n| 14.83|3.02| 2| 1.0| 0.0| 1.0| 0.0|\n| 21.58|3.92| 2| 0.0| 0.0| 1.0| 0.0|\n| 10.33|1.67| 3| 1.0| 0.0| 1.0| 0.0|\n| 16.29|3.71| 3| 0.0| 0.0| 1.0| 0.0|\n| 16.97| 3.5| 3| 1.0| 0.0| 1.0| 0.0|\n| 20.65|3.35| 3| 0.0| 0.0| 0.0| 0.0|\n+----------+----+----+-----------+--------------+-----------+------------+\nonly showing top 20 rows\n\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "df1=df_r.drop(\"sex\",\"smoker\",\"day\",\"time\")\n", 227 | "df1.show()" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 0, 233 | "metadata": { 234 | "application/vnd.databricks.v1+cell": { 235 | "cellMetadata": { 236 | "byteLimit": 2048000, 237 | "rowLimit": 10000 238 | }, 239 | "inputWidgets": {}, 240 | "nuid": "2fea7c39-e5d3-4084-a7d4-2fb4de268c8f", 241 | "showTitle": false, 242 | "title": "" 243 | } 244 | }, 245 | "outputs": [ 246 | { 247 | "output_type": "stream", 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "+----------+----+----+-----------+--------------+-----------+------------+--------------------+\n|total_bill| tip|size|sex_indexed|smoker_indexed|day_indexed|time_indexed| Independent Feature|\n+----------+----+----+-----------+--------------+-----------+------------+--------------------+\n| 16.99|1.01| 2| 1.0| 0.0| 1.0| 0.0|[1.01,2.0,1.0,0.0...|\n| 10.34|1.66| 3| 0.0| 0.0| 1.0| 0.0|[1.66,3.0,0.0,0.0...|\n| 21.01| 3.5| 3| 0.0| 0.0| 1.0| 0.0|[3.5,3.0,0.0,0.0,...|\n| 23.68|3.31| 2| 0.0| 0.0| 1.0| 0.0|[3.31,2.0,0.0,0.0...|\n| 24.59|3.61| 4| 
1.0| 0.0| 1.0| 0.0|[3.61,4.0,1.0,0.0...|\n| 25.29|4.71| 4| 0.0| 0.0| 1.0| 0.0|[4.71,4.0,0.0,0.0...|\n| 8.77| 2.0| 2| 0.0| 0.0| 1.0| 0.0|[2.0,2.0,0.0,0.0,...|\n| 26.88|3.12| 4| 0.0| 0.0| 1.0| 0.0|[3.12,4.0,0.0,0.0...|\n| 15.04|1.96| 2| 0.0| 0.0| 1.0| 0.0|[1.96,2.0,0.0,0.0...|\n| 14.78|3.23| 2| 0.0| 0.0| 1.0| 0.0|[3.23,2.0,0.0,0.0...|\n| 10.27|1.71| 2| 0.0| 0.0| 1.0| 0.0|[1.71,2.0,0.0,0.0...|\n| 35.26| 5.0| 4| 1.0| 0.0| 1.0| 0.0|[5.0,4.0,1.0,0.0,...|\n| 15.42|1.57| 2| 0.0| 0.0| 1.0| 0.0|[1.57,2.0,0.0,0.0...|\n| 18.43| 3.0| 4| 0.0| 0.0| 1.0| 0.0|[3.0,4.0,0.0,0.0,...|\n| 14.83|3.02| 2| 1.0| 0.0| 1.0| 0.0|[3.02,2.0,1.0,0.0...|\n| 21.58|3.92| 2| 0.0| 0.0| 1.0| 0.0|[3.92,2.0,0.0,0.0...|\n| 10.33|1.67| 3| 1.0| 0.0| 1.0| 0.0|[1.67,3.0,1.0,0.0...|\n| 16.29|3.71| 3| 0.0| 0.0| 1.0| 0.0|[3.71,3.0,0.0,0.0...|\n| 16.97| 3.5| 3| 1.0| 0.0| 1.0| 0.0|[3.5,3.0,1.0,0.0,...|\n| 20.65|3.35| 3| 0.0| 0.0| 0.0| 0.0|(6,[0,1],[3.35,3.0])|\n+----------+----+----+-----------+--------------+-----------+------------+--------------------+\nonly showing top 20 rows\n\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "from pyspark.ml.feature import VectorAssembler\n", 257 | "df_spark=VectorAssembler(\n", 258 | " inputCols=[\"tip\",\"size\",\"sex_indexed\",\"smoker_indexed\",\"day_indexed\",\"time_indexed\"],\n", 259 | " outputCol=\"Independent Feature\"\n", 260 | ")\n", 261 | "output=df_spark.transform(df1)\n", 262 | "output.show()" 263 | ] 264 | }, 265 | { 266 | "cell_type": "code", 267 | "execution_count": 0, 268 | "metadata": { 269 | "application/vnd.databricks.v1+cell": { 270 | "cellMetadata": { 271 | "byteLimit": 2048000, 272 | "rowLimit": 10000 273 | }, 274 | "inputWidgets": {}, 275 | "nuid": "cdf39b42-a310-4d91-8550-52cb26ac85d9", 276 | "showTitle": false, 277 | "title": "" 278 | } 279 | }, 280 | "outputs": [ 281 | { 282 | "output_type": "stream", 283 | "name": "stdout", 284 | "output_type": "stream", 285 | "text": [ 286 | "+--------------------+\n| Independent Feature|\n+--------------------+\n|[1.01,2.0,1.0,0.0...|\n|[1.66,3.0,0.0,0.0...|\n|[3.5,3.0,0.0,0.0,...|\n|[3.31,2.0,0.0,0.0...|\n|[3.61,4.0,1.0,0.0...|\n|[4.71,4.0,0.0,0.0...|\n|[2.0,2.0,0.0,0.0,...|\n|[3.12,4.0,0.0,0.0...|\n|[1.96,2.0,0.0,0.0...|\n|[3.23,2.0,0.0,0.0...|\n|[1.71,2.0,0.0,0.0...|\n|[5.0,4.0,1.0,0.0,...|\n|[1.57,2.0,0.0,0.0...|\n|[3.0,4.0,0.0,0.0,...|\n|[3.02,2.0,1.0,0.0...|\n|[3.92,2.0,0.0,0.0...|\n|[1.67,3.0,1.0,0.0...|\n|[3.71,3.0,0.0,0.0...|\n|[3.5,3.0,1.0,0.0,...|\n|(6,[0,1],[3.35,3.0])|\n+--------------------+\nonly showing top 20 rows\n\n" 287 | ] 288 | } 289 | ], 290 | "source": [ 291 | "output.select(\"Independent Feature\").show()" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 0, 297 | "metadata": { 298 | "application/vnd.databricks.v1+cell": { 299 | "cellMetadata": { 300 | "byteLimit": 2048000, 301 | "rowLimit": 10000 302 | }, 303 | "inputWidgets": {}, 304 | "nuid": "3d6f6f73-7d3b-4455-9495-f1e649e231bc", 305 | "showTitle": false, 306 | "title": "" 307 | } 308 | }, 309 | "outputs": [ 310 | { 311 | "output_type": "stream", 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "+--------------------+----------+\n| Independent Feature|total_bill|\n+--------------------+----------+\n|[1.01,2.0,1.0,0.0...| 16.99|\n|[1.66,3.0,0.0,0.0...| 10.34|\n|[3.5,3.0,0.0,0.0,...| 21.01|\n|[3.31,2.0,0.0,0.0...| 23.68|\n|[3.61,4.0,1.0,0.0...| 24.59|\n|[4.71,4.0,0.0,0.0...| 25.29|\n|[2.0,2.0,0.0,0.0,...| 8.77|\n|[3.12,4.0,0.0,0.0...| 26.88|\n|[1.96,2.0,0.0,0.0...| 15.04|\n|[3.23,2.0,0.0,0.0...| 
14.78|\n|[1.71,2.0,0.0,0.0...| 10.27|\n|[5.0,4.0,1.0,0.0,...| 35.26|\n|[1.57,2.0,0.0,0.0...| 15.42|\n|[3.0,4.0,0.0,0.0,...| 18.43|\n|[3.02,2.0,1.0,0.0...| 14.83|\n|[3.92,2.0,0.0,0.0...| 21.58|\n|[1.67,3.0,1.0,0.0...| 10.33|\n|[3.71,3.0,0.0,0.0...| 16.29|\n|[3.5,3.0,1.0,0.0,...| 16.97|\n|(6,[0,1],[3.35,3.0])| 20.65|\n+--------------------+----------+\nonly showing top 20 rows\n\n" 316 | ] 317 | } 318 | ], 319 | "source": [ 320 | "finalize=output.select(\"Independent Feature\",\"total_bill\")\n", 321 | "finalize.show()" 322 | ] 323 | }, 324 | { 325 | "cell_type": "code", 326 | "execution_count": 0, 327 | "metadata": { 328 | "application/vnd.databricks.v1+cell": { 329 | "cellMetadata": { 330 | "byteLimit": 2048000, 331 | "rowLimit": 10000 332 | }, 333 | "inputWidgets": {}, 334 | "nuid": "c6b8b082-06a8-4e06-91a9-70e5893fc436", 335 | "showTitle": false, 336 | "title": "" 337 | } 338 | }, 339 | "outputs": [], 340 | "source": [ 341 | "from pyspark.ml.regression import LinearRegression\n", 342 | "#split\n", 343 | "train_data,test_data=finalize.randomSplit([0.75,0.25])\n", 344 | "#model\n", 345 | "m=LinearRegression(\n", 346 | " featuresCol=\"Independent Feature\",\n", 347 | " labelCol=\"total_bill\"\n", 348 | ")\n", 349 | "#fit\n", 350 | "fitting_model=m.fit(train_data)\n" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 0, 356 | "metadata": { 357 | "application/vnd.databricks.v1+cell": { 358 | "cellMetadata": { 359 | "byteLimit": 2048000, 360 | "rowLimit": 10000 361 | }, 362 | "inputWidgets": {}, 363 | "nuid": "04a960de-c2c6-49ad-bbb6-790477ffc7d6", 364 | "showTitle": false, 365 | "title": "" 366 | } 367 | }, 368 | "outputs": [ 369 | { 370 | "output_type": "stream", 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "Out[55]: DenseVector([3.193, 2.4931, -2.5861, 4.2707, -0.4779, -1.1637])" 375 | ] 376 | } 377 | ], 378 | "source": [ 379 | "#coefficient\n", 380 | "fitting_model.coefficients" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 0, 386 | "metadata": { 387 | "application/vnd.databricks.v1+cell": { 388 | "cellMetadata": { 389 | "byteLimit": 2048000, 390 | "rowLimit": 10000 391 | }, 392 | "inputWidgets": {}, 393 | "nuid": "1948a0b5-a4f7-4905-b0d9-004631b6dfe9", 394 | "showTitle": false, 395 | "title": "" 396 | } 397 | }, 398 | "outputs": [ 399 | { 400 | "output_type": "stream", 401 | "name": "stdout", 402 | "output_type": "stream", 403 | "text": [ 404 | "Out[54]: 4.00317861118615" 405 | ] 406 | } 407 | ], 408 | "source": [ 409 | "#intercept\n", 410 | "fitting_model.intercept" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 0, 416 | "metadata": { 417 | "application/vnd.databricks.v1+cell": { 418 | "cellMetadata": { 419 | "byteLimit": 2048000, 420 | "rowLimit": 10000 421 | }, 422 | "inputWidgets": {}, 423 | "nuid": "9fa50b9a-73a5-471a-8e84-02c4df7635cc", 424 | "showTitle": false, 425 | "title": "" 426 | } 427 | }, 428 | "outputs": [], 429 | "source": [ 430 | "#evaluate\n", 431 | "result=fitting_model.evaluate(test_data)\n" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 0, 437 | "metadata": { 438 | "application/vnd.databricks.v1+cell": { 439 | "cellMetadata": { 440 | "byteLimit": 2048000, 441 | "rowLimit": 10000 442 | }, 443 | "inputWidgets": {}, 444 | "nuid": "cd6030c1-28ab-4afa-82b7-ad994627ab02", 445 | "showTitle": false, 446 | "title": "" 447 | } 448 | }, 449 | "outputs": [ 450 | { 451 | "output_type": "stream", 452 | "name": "stdout", 453 |
"output_type": "stream", 454 | "text": [ 455 | "+--------------------+----------+------------------+\n| Independent Feature|total_bill| prediction|\n+--------------------+----------+------------------+\n|(6,[0,1],[1.25,2.0])| 10.07| 12.98065290956777|\n|(6,[0,1],[1.25,2.0])| 10.51| 12.98065290956777|\n|(6,[0,1],[1.47,2.0])| 10.77|13.683112213152697|\n| (6,[0,1],[2.0,2.0])| 12.69| 15.37540053542548|\n| (6,[0,1],[2.0,2.0])| 13.37| 15.37540053542548|\n| (6,[0,1],[2.0,3.0])| 16.31|17.868514663068197|\n|(6,[0,1],[2.31,3.0])| 18.69|18.858343681756054|\n|(6,[0,1],[2.34,4.0])| 17.81|21.447247714433075|\n| (6,[0,1],[2.5,4.0])| 18.35| 21.95812720794939|\n|(6,[0,1],[2.64,3.0])| 17.59|19.912032637133446|\n|(6,[0,1],[2.72,2.0])| 13.28|17.674358256248887|\n| (6,[0,1],[3.0,2.0])| 14.0| 18.56839736990243|\n| (6,[0,1],[3.0,4.0])| 20.45|23.554625625187864|\n|(6,[0,1],[3.15,3.0])| 20.08| 21.54046102271669|\n|(6,[0,1],[3.18,2.0])| 19.82|19.143136800108284|\n|(6,[0,1],[3.35,3.0])| 20.65|22.179060389612083|\n|(6,[0,1],[3.39,2.0])| 11.61|19.813666135348445|\n| (6,[0,1],[3.6,3.0])| 24.06| 22.97730959823132|\n|(6,[0,1],[3.76,2.0])| 18.24|20.995074964104916|\n| (6,[0,1],[4.3,2.0])| 21.7|22.719293254722466|\n+--------------------+----------+------------------+\nonly showing top 20 rows\n\n" 456 | ] 457 | } 458 | ], 459 | "source": [ 460 | "#predict\n", 461 | "result.predictions.show()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 0, 467 | "metadata": { 468 | "application/vnd.databricks.v1+cell": { 469 | "cellMetadata": { 470 | "byteLimit": 2048000, 471 | "rowLimit": 10000 472 | }, 473 | "inputWidgets": {}, 474 | "nuid": "0c0c1c32-74c6-43ad-bd67-b42b2c1ba148", 475 | "showTitle": false, 476 | "title": "" 477 | } 478 | }, 479 | "outputs": [ 480 | { 481 | "output_type": "stream", 482 | "name": "stdout", 483 | "output_type": "stream", 484 | "text": [ 485 | "Out[57]: (0.554439412505905, 4.342191773023231, 34.652872891876086)" 486 | ] 487 | } 488 | ], 489 | "source": [ 490 | "#performence metrics\n", 491 | "result.r2,result.meanAbsoluteError,result.meanSquaredError" 492 | ] 493 | } 494 | ], 495 | "metadata": { 496 | "application/vnd.databricks.v1+notebook": { 497 | "dashboards": [], 498 | "language": "python", 499 | "notebookMetadata": { 500 | "pythonIndentUnit": 4 501 | }, 502 | "notebookName": "2024-04-26 - DBFS Example", 503 | "widgets": {} 504 | } 505 | }, 506 | "nbformat": 4, 507 | "nbformat_minor": 0 508 | } 509 | -------------------------------------------------------------------------------- /test1.csv: -------------------------------------------------------------------------------- 1 | name,age 2 | kani,15 3 | kani1,16 4 | kani2,17 -------------------------------------------------------------------------------- /test2.csv: -------------------------------------------------------------------------------- 1 | name,age,experience 2 | kani,15,10 3 | kani1,16,8 4 | kani2,17,4 -------------------------------------------------------------------------------- /test3.csv: -------------------------------------------------------------------------------- 1 | Name,age,Experience,Salary 2 | Krish,31,10,30000 3 | Sudhanshu,30,8,25000 4 | Sunny,29,4,20000 5 | Paul,24,3,20000 6 | Harsha,21,1,15000 7 | Shubham,23,2,18000 8 | Mahesh,,,40000 9 | ,34,10,38000 10 | ,36,, 11 | -------------------------------------------------------------------------------- /test4.csv: -------------------------------------------------------------------------------- 1 | Name,age,Experience,Salary 2 | Krish,31,10,30000 3 | 
Sudhanshu,30,8,25000 4 | Sunny,29,4,20000 5 | Paul,24,3,20000 6 | Harsha,21,1,15000 7 | Shubham,23,2,18000 8 | -------------------------------------------------------------------------------- /test5.csv: -------------------------------------------------------------------------------- 1 | Name,Departments,salary 2 | Krish,Data Science,10000 3 | Krish,IOT,5000 4 | Mahesh,Big Data,4000 5 | Krish,Big Data,4000 6 | Mahesh,Data Science,3000 7 | Sudhanshu,Data Science,20000 8 | Sudhanshu,IOT,10000 9 | Sudhanshu,Big Data,5000 10 | Sunny,Data Science,10000 11 | Sunny,Big Data,2000 12 | -------------------------------------------------------------------------------- /tips.csv: -------------------------------------------------------------------------------- 1 | total_bill,tip,sex,smoker,day,time,size 2 | 16.99,1.01,Female,No,Sun,Dinner,2 3 | 10.34,1.66,Male,No,Sun,Dinner,3 4 | 21.01,3.5,Male,No,Sun,Dinner,3 5 | 23.68,3.31,Male,No,Sun,Dinner,2 6 | 24.59,3.61,Female,No,Sun,Dinner,4 7 | 25.29,4.71,Male,No,Sun,Dinner,4 8 | 8.77,2.0,Male,No,Sun,Dinner,2 9 | 26.88,3.12,Male,No,Sun,Dinner,4 10 | 15.04,1.96,Male,No,Sun,Dinner,2 11 | 14.78,3.23,Male,No,Sun,Dinner,2 12 | 10.27,1.71,Male,No,Sun,Dinner,2 13 | 35.26,5.0,Female,No,Sun,Dinner,4 14 | 15.42,1.57,Male,No,Sun,Dinner,2 15 | 18.43,3.0,Male,No,Sun,Dinner,4 16 | 14.83,3.02,Female,No,Sun,Dinner,2 17 | 21.58,3.92,Male,No,Sun,Dinner,2 18 | 10.33,1.67,Female,No,Sun,Dinner,3 19 | 16.29,3.71,Male,No,Sun,Dinner,3 20 | 16.97,3.5,Female,No,Sun,Dinner,3 21 | 20.65,3.35,Male,No,Sat,Dinner,3 22 | 17.92,4.08,Male,No,Sat,Dinner,2 23 | 20.29,2.75,Female,No,Sat,Dinner,2 24 | 15.77,2.23,Female,No,Sat,Dinner,2 25 | 39.42,7.58,Male,No,Sat,Dinner,4 26 | 19.82,3.18,Male,No,Sat,Dinner,2 27 | 17.81,2.34,Male,No,Sat,Dinner,4 28 | 13.37,2.0,Male,No,Sat,Dinner,2 29 | 12.69,2.0,Male,No,Sat,Dinner,2 30 | 21.7,4.3,Male,No,Sat,Dinner,2 31 | 19.65,3.0,Female,No,Sat,Dinner,2 32 | 9.55,1.45,Male,No,Sat,Dinner,2 33 | 18.35,2.5,Male,No,Sat,Dinner,4 34 | 15.06,3.0,Female,No,Sat,Dinner,2 35 | 20.69,2.45,Female,No,Sat,Dinner,4 36 | 17.78,3.27,Male,No,Sat,Dinner,2 37 | 24.06,3.6,Male,No,Sat,Dinner,3 38 | 16.31,2.0,Male,No,Sat,Dinner,3 39 | 16.93,3.07,Female,No,Sat,Dinner,3 40 | 18.69,2.31,Male,No,Sat,Dinner,3 41 | 31.27,5.0,Male,No,Sat,Dinner,3 42 | 16.04,2.24,Male,No,Sat,Dinner,3 43 | 17.46,2.54,Male,No,Sun,Dinner,2 44 | 13.94,3.06,Male,No,Sun,Dinner,2 45 | 9.68,1.32,Male,No,Sun,Dinner,2 46 | 30.4,5.6,Male,No,Sun,Dinner,4 47 | 18.29,3.0,Male,No,Sun,Dinner,2 48 | 22.23,5.0,Male,No,Sun,Dinner,2 49 | 32.4,6.0,Male,No,Sun,Dinner,4 50 | 28.55,2.05,Male,No,Sun,Dinner,3 51 | 18.04,3.0,Male,No,Sun,Dinner,2 52 | 12.54,2.5,Male,No,Sun,Dinner,2 53 | 10.29,2.6,Female,No,Sun,Dinner,2 54 | 34.81,5.2,Female,No,Sun,Dinner,4 55 | 9.94,1.56,Male,No,Sun,Dinner,2 56 | 25.56,4.34,Male,No,Sun,Dinner,4 57 | 19.49,3.51,Male,No,Sun,Dinner,2 58 | 38.01,3.0,Male,Yes,Sat,Dinner,4 59 | 26.41,1.5,Female,No,Sat,Dinner,2 60 | 11.24,1.76,Male,Yes,Sat,Dinner,2 61 | 48.27,6.73,Male,No,Sat,Dinner,4 62 | 20.29,3.21,Male,Yes,Sat,Dinner,2 63 | 13.81,2.0,Male,Yes,Sat,Dinner,2 64 | 11.02,1.98,Male,Yes,Sat,Dinner,2 65 | 18.29,3.76,Male,Yes,Sat,Dinner,4 66 | 17.59,2.64,Male,No,Sat,Dinner,3 67 | 20.08,3.15,Male,No,Sat,Dinner,3 68 | 16.45,2.47,Female,No,Sat,Dinner,2 69 | 3.07,1.0,Female,Yes,Sat,Dinner,1 70 | 20.23,2.01,Male,No,Sat,Dinner,2 71 | 15.01,2.09,Male,Yes,Sat,Dinner,2 72 | 12.02,1.97,Male,No,Sat,Dinner,2 73 | 17.07,3.0,Female,No,Sat,Dinner,3 74 | 26.86,3.14,Female,Yes,Sat,Dinner,2 75 | 25.28,5.0,Female,Yes,Sat,Dinner,2 76 | 
14.73,2.2,Female,No,Sat,Dinner,2 77 | 10.51,1.25,Male,No,Sat,Dinner,2 78 | 17.92,3.08,Male,Yes,Sat,Dinner,2 79 | 27.2,4.0,Male,No,Thur,Lunch,4 80 | 22.76,3.0,Male,No,Thur,Lunch,2 81 | 17.29,2.71,Male,No,Thur,Lunch,2 82 | 19.44,3.0,Male,Yes,Thur,Lunch,2 83 | 16.66,3.4,Male,No,Thur,Lunch,2 84 | 10.07,1.83,Female,No,Thur,Lunch,1 85 | 32.68,5.0,Male,Yes,Thur,Lunch,2 86 | 15.98,2.03,Male,No,Thur,Lunch,2 87 | 34.83,5.17,Female,No,Thur,Lunch,4 88 | 13.03,2.0,Male,No,Thur,Lunch,2 89 | 18.28,4.0,Male,No,Thur,Lunch,2 90 | 24.71,5.85,Male,No,Thur,Lunch,2 91 | 21.16,3.0,Male,No,Thur,Lunch,2 92 | 28.97,3.0,Male,Yes,Fri,Dinner,2 93 | 22.49,3.5,Male,No,Fri,Dinner,2 94 | 5.75,1.0,Female,Yes,Fri,Dinner,2 95 | 16.32,4.3,Female,Yes,Fri,Dinner,2 96 | 22.75,3.25,Female,No,Fri,Dinner,2 97 | 40.17,4.73,Male,Yes,Fri,Dinner,4 98 | 27.28,4.0,Male,Yes,Fri,Dinner,2 99 | 12.03,1.5,Male,Yes,Fri,Dinner,2 100 | 21.01,3.0,Male,Yes,Fri,Dinner,2 101 | 12.46,1.5,Male,No,Fri,Dinner,2 102 | 11.35,2.5,Female,Yes,Fri,Dinner,2 103 | 15.38,3.0,Female,Yes,Fri,Dinner,2 104 | 44.3,2.5,Female,Yes,Sat,Dinner,3 105 | 22.42,3.48,Female,Yes,Sat,Dinner,2 106 | 20.92,4.08,Female,No,Sat,Dinner,2 107 | 15.36,1.64,Male,Yes,Sat,Dinner,2 108 | 20.49,4.06,Male,Yes,Sat,Dinner,2 109 | 25.21,4.29,Male,Yes,Sat,Dinner,2 110 | 18.24,3.76,Male,No,Sat,Dinner,2 111 | 14.31,4.0,Female,Yes,Sat,Dinner,2 112 | 14.0,3.0,Male,No,Sat,Dinner,2 113 | 7.25,1.0,Female,No,Sat,Dinner,1 114 | 38.07,4.0,Male,No,Sun,Dinner,3 115 | 23.95,2.55,Male,No,Sun,Dinner,2 116 | 25.71,4.0,Female,No,Sun,Dinner,3 117 | 17.31,3.5,Female,No,Sun,Dinner,2 118 | 29.93,5.07,Male,No,Sun,Dinner,4 119 | 10.65,1.5,Female,No,Thur,Lunch,2 120 | 12.43,1.8,Female,No,Thur,Lunch,2 121 | 24.08,2.92,Female,No,Thur,Lunch,4 122 | 11.69,2.31,Male,No,Thur,Lunch,2 123 | 13.42,1.68,Female,No,Thur,Lunch,2 124 | 14.26,2.5,Male,No,Thur,Lunch,2 125 | 15.95,2.0,Male,No,Thur,Lunch,2 126 | 12.48,2.52,Female,No,Thur,Lunch,2 127 | 29.8,4.2,Female,No,Thur,Lunch,6 128 | 8.52,1.48,Male,No,Thur,Lunch,2 129 | 14.52,2.0,Female,No,Thur,Lunch,2 130 | 11.38,2.0,Female,No,Thur,Lunch,2 131 | 22.82,2.18,Male,No,Thur,Lunch,3 132 | 19.08,1.5,Male,No,Thur,Lunch,2 133 | 20.27,2.83,Female,No,Thur,Lunch,2 134 | 11.17,1.5,Female,No,Thur,Lunch,2 135 | 12.26,2.0,Female,No,Thur,Lunch,2 136 | 18.26,3.25,Female,No,Thur,Lunch,2 137 | 8.51,1.25,Female,No,Thur,Lunch,2 138 | 10.33,2.0,Female,No,Thur,Lunch,2 139 | 14.15,2.0,Female,No,Thur,Lunch,2 140 | 16.0,2.0,Male,Yes,Thur,Lunch,2 141 | 13.16,2.75,Female,No,Thur,Lunch,2 142 | 17.47,3.5,Female,No,Thur,Lunch,2 143 | 34.3,6.7,Male,No,Thur,Lunch,6 144 | 41.19,5.0,Male,No,Thur,Lunch,5 145 | 27.05,5.0,Female,No,Thur,Lunch,6 146 | 16.43,2.3,Female,No,Thur,Lunch,2 147 | 8.35,1.5,Female,No,Thur,Lunch,2 148 | 18.64,1.36,Female,No,Thur,Lunch,3 149 | 11.87,1.63,Female,No,Thur,Lunch,2 150 | 9.78,1.73,Male,No,Thur,Lunch,2 151 | 7.51,2.0,Male,No,Thur,Lunch,2 152 | 14.07,2.5,Male,No,Sun,Dinner,2 153 | 13.13,2.0,Male,No,Sun,Dinner,2 154 | 17.26,2.74,Male,No,Sun,Dinner,3 155 | 24.55,2.0,Male,No,Sun,Dinner,4 156 | 19.77,2.0,Male,No,Sun,Dinner,4 157 | 29.85,5.14,Female,No,Sun,Dinner,5 158 | 48.17,5.0,Male,No,Sun,Dinner,6 159 | 25.0,3.75,Female,No,Sun,Dinner,4 160 | 13.39,2.61,Female,No,Sun,Dinner,2 161 | 16.49,2.0,Male,No,Sun,Dinner,4 162 | 21.5,3.5,Male,No,Sun,Dinner,4 163 | 12.66,2.5,Male,No,Sun,Dinner,2 164 | 16.21,2.0,Female,No,Sun,Dinner,3 165 | 13.81,2.0,Male,No,Sun,Dinner,2 166 | 17.51,3.0,Female,Yes,Sun,Dinner,2 167 | 24.52,3.48,Male,No,Sun,Dinner,3 168 | 20.76,2.24,Male,No,Sun,Dinner,2 169 | 
31.71,4.5,Male,No,Sun,Dinner,4 170 | 10.59,1.61,Female,Yes,Sat,Dinner,2 171 | 10.63,2.0,Female,Yes,Sat,Dinner,2 172 | 50.81,10.0,Male,Yes,Sat,Dinner,3 173 | 15.81,3.16,Male,Yes,Sat,Dinner,2 174 | 7.25,5.15,Male,Yes,Sun,Dinner,2 175 | 31.85,3.18,Male,Yes,Sun,Dinner,2 176 | 16.82,4.0,Male,Yes,Sun,Dinner,2 177 | 32.9,3.11,Male,Yes,Sun,Dinner,2 178 | 17.89,2.0,Male,Yes,Sun,Dinner,2 179 | 14.48,2.0,Male,Yes,Sun,Dinner,2 180 | 9.6,4.0,Female,Yes,Sun,Dinner,2 181 | 34.63,3.55,Male,Yes,Sun,Dinner,2 182 | 34.65,3.68,Male,Yes,Sun,Dinner,4 183 | 23.33,5.65,Male,Yes,Sun,Dinner,2 184 | 45.35,3.5,Male,Yes,Sun,Dinner,3 185 | 23.17,6.5,Male,Yes,Sun,Dinner,4 186 | 40.55,3.0,Male,Yes,Sun,Dinner,2 187 | 20.69,5.0,Male,No,Sun,Dinner,5 188 | 20.9,3.5,Female,Yes,Sun,Dinner,3 189 | 30.46,2.0,Male,Yes,Sun,Dinner,5 190 | 18.15,3.5,Female,Yes,Sun,Dinner,3 191 | 23.1,4.0,Male,Yes,Sun,Dinner,3 192 | 15.69,1.5,Male,Yes,Sun,Dinner,2 193 | 19.81,4.19,Female,Yes,Thur,Lunch,2 194 | 28.44,2.56,Male,Yes,Thur,Lunch,2 195 | 15.48,2.02,Male,Yes,Thur,Lunch,2 196 | 16.58,4.0,Male,Yes,Thur,Lunch,2 197 | 7.56,1.44,Male,No,Thur,Lunch,2 198 | 10.34,2.0,Male,Yes,Thur,Lunch,2 199 | 43.11,5.0,Female,Yes,Thur,Lunch,4 200 | 13.0,2.0,Female,Yes,Thur,Lunch,2 201 | 13.51,2.0,Male,Yes,Thur,Lunch,2 202 | 18.71,4.0,Male,Yes,Thur,Lunch,3 203 | 12.74,2.01,Female,Yes,Thur,Lunch,2 204 | 13.0,2.0,Female,Yes,Thur,Lunch,2 205 | 16.4,2.5,Female,Yes,Thur,Lunch,2 206 | 20.53,4.0,Male,Yes,Thur,Lunch,4 207 | 16.47,3.23,Female,Yes,Thur,Lunch,3 208 | 26.59,3.41,Male,Yes,Sat,Dinner,3 209 | 38.73,3.0,Male,Yes,Sat,Dinner,4 210 | 24.27,2.03,Male,Yes,Sat,Dinner,2 211 | 12.76,2.23,Female,Yes,Sat,Dinner,2 212 | 30.06,2.0,Male,Yes,Sat,Dinner,3 213 | 25.89,5.16,Male,Yes,Sat,Dinner,4 214 | 48.33,9.0,Male,No,Sat,Dinner,4 215 | 13.27,2.5,Female,Yes,Sat,Dinner,2 216 | 28.17,6.5,Female,Yes,Sat,Dinner,3 217 | 12.9,1.1,Female,Yes,Sat,Dinner,2 218 | 28.15,3.0,Male,Yes,Sat,Dinner,5 219 | 11.59,1.5,Male,Yes,Sat,Dinner,2 220 | 7.74,1.44,Male,Yes,Sat,Dinner,2 221 | 30.14,3.09,Female,Yes,Sat,Dinner,4 222 | 12.16,2.2,Male,Yes,Fri,Lunch,2 223 | 13.42,3.48,Female,Yes,Fri,Lunch,2 224 | 8.58,1.92,Male,Yes,Fri,Lunch,1 225 | 15.98,3.0,Female,No,Fri,Lunch,3 226 | 13.42,1.58,Male,Yes,Fri,Lunch,2 227 | 16.27,2.5,Female,Yes,Fri,Lunch,2 228 | 10.09,2.0,Female,Yes,Fri,Lunch,2 229 | 20.45,3.0,Male,No,Sat,Dinner,4 230 | 13.28,2.72,Male,No,Sat,Dinner,2 231 | 22.12,2.88,Female,Yes,Sat,Dinner,2 232 | 24.01,2.0,Male,Yes,Sat,Dinner,4 233 | 15.69,3.0,Male,Yes,Sat,Dinner,3 234 | 11.61,3.39,Male,No,Sat,Dinner,2 235 | 10.77,1.47,Male,No,Sat,Dinner,2 236 | 15.53,3.0,Male,Yes,Sat,Dinner,2 237 | 10.07,1.25,Male,No,Sat,Dinner,2 238 | 12.6,1.0,Male,Yes,Sat,Dinner,2 239 | 32.83,1.17,Male,Yes,Sat,Dinner,2 240 | 35.83,4.67,Female,No,Sat,Dinner,3 241 | 29.03,5.92,Male,No,Sat,Dinner,3 242 | 27.18,2.0,Female,Yes,Sat,Dinner,2 243 | 22.67,2.0,Male,Yes,Sat,Dinner,2 244 | 17.82,1.75,Male,No,Sat,Dinner,2 245 | 18.78,3.0,Female,No,Thur,Dinner,2 246 | --------------------------------------------------------------------------------
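/regression_pipeline_sketch.py:
--------------------------------------------------------------------------------
 1 | # Editor's sketch, not a file from the original repo: a self-contained
 2 | # version of the pipeline day7.ipynb walks through -- StringIndexer to
 3 | # encode the categorical tips.csv columns, VectorAssembler to pack the
 4 | # predictors into one vector column, then LinearRegression on total_bill.
 5 | # The local "tips.csv" path, the appName and the randomSplit seed are
 6 | # illustrative assumptions, not values taken from the notebooks.
 7 | from pyspark.sql import SparkSession
 8 | from pyspark.ml.feature import StringIndexer, VectorAssembler
 9 | from pyspark.ml.regression import LinearRegression
10 | 
11 | spark = SparkSession.builder.appName("tips_regression").getOrCreate()
12 | 
13 | # header=True keeps the first row as column names; inferSchema=True turns
14 | # numeric columns into doubles/integers instead of strings.
15 | df = spark.read.csv("tips.csv", header=True, inferSchema=True)
16 | 
17 | # Map each string category to a numeric index (most frequent value -> 0.0).
18 | indexer = StringIndexer(
19 |     inputCols=["sex", "smoker", "day", "time"],
20 |     outputCols=["sex_idx", "smoker_idx", "day_idx", "time_idx"],
21 | )
22 | df_indexed = indexer.fit(df).transform(df)
23 | 
24 | # Spark ML estimators expect a single vector column of features.
25 | assembler = VectorAssembler(
26 |     inputCols=["tip", "size", "sex_idx", "smoker_idx", "day_idx", "time_idx"],
27 |     outputCol="features",
28 | )
29 | data = assembler.transform(df_indexed).select("features", "total_bill")
30 | 
31 | # 75% train / 25% test; a fixed seed makes the split reproducible.
32 | train_data, test_data = data.randomSplit([0.75, 0.25], seed=42)
33 | 
34 | model = LinearRegression(featuresCol="features",
35 |                          labelCol="total_bill").fit(train_data)
36 | print(model.coefficients, model.intercept)
37 | 
38 | # evaluate() returns a summary with per-row predictions and error metrics.
39 | summary = model.evaluate(test_data)
40 | summary.predictions.show()
41 | print(summary.r2, summary.meanAbsoluteError, summary.meanSquaredError)
42 | 
43 | spark.stop()
--------------------------------------------------------------------------------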