├── 01062023 - DEB.ipynb ├── 02062023 - DEB.ipynb ├── 04052023 - DEB.ipynb ├── 04072023 - DEB.ipynb ├── 05052023 - DEB.ipynb ├── 06062023 - DEB.ipynb ├── 06072023 - DEB.ipynb ├── 07072023 - DEB.ipynb ├── 08062023 - DEB.ipynb ├── 09062023 - DEB.ipynb ├── 11072023 DEB v2.ipynb ├── 13072023 - DEB.ipynb ├── 14072023 - DEB.ipynb ├── 16052023 - DEB.ipynb ├── 18052023 - DEB.ipynb ├── 19042023 - DEB.ipynb ├── 23052023 - DEB (1).ipynb ├── 25042023 - DEB.ipynb ├── 25052023 - DEB.ipynb ├── 25062023 - DEB.ipynb ├── 26052023 - DEB.ipynb ├── 27042023 - DEB.ipynb ├── 28042023 - DEB.ipynb ├── 30052023 - DEB.ipynb ├── Advertising.csv ├── DecisionTreeClassification.ipynb ├── IBBLojistikWordCount.ipynb ├── KMeansClustering.ipynb ├── LineerRegression_Albaraka.ipynb ├── WineData.csv ├── loan_sanction_test.csv ├── loan_sanction_train.csv ├── sample.txt └── sample0.txt /04072023 - DEB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "b1d043ea-8c32-4e77-9859-30369a3637c8", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "root\n |-- firstname: string (nullable = true)\n |-- middlename: string (nullable = true)\n |-- lastname: string (nullable = true)\n |-- id: string (nullable = true)\n |-- gender: string (nullable = true)\n |-- salary: integer (nullable = true)\n\n+---------+----------+--------+-----+------+------+\n|firstname|middlename|lastname|id |gender|salary|\n+---------+----------+--------+-----+------+------+\n|James | |William |36636|M |3000 |\n|Michael |Smith | |40288|M |4000 |\n|Robert | |Dawson |42114|M |4000 |\n|Maria | |Jones |39192|F |4000 |\n+---------+----------+--------+-----+------+------+\n\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "import pyspark \n", 30 | "from pyspark.sql import SparkSession\n", 31 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", 32 | "\n", 33 | "spark = SparkSession.builder \\\n", 34 | " .master(\"local[1]\") \\\n", 35 | " .appName('ProjectFirst') \\\n", 36 | " .getOrCreate()\n", 37 | "\n", 38 | "data = [(\"James\", \"\", \"William\", \"36636\", \"M\", 3000), (\"Michael\", \"Smith\", \"\", \"40288\", \"M\", 4000), (\"Robert\", \"\", \"Dawson\", \"42114\", \"M\", 4000), \n", 39 | " (\"Maria\", \"\", \"Jones\", \"39192\", \"F\", 4000)]\n", 40 | "\n", 41 | "schema = StructType([\n", 42 | " StructField(\"firstname\", StringType(), True),\\\n", 43 | " StructField(\"middlename\", StringType(), True),\\\n", 44 | " StructField(\"lastname\", StringType(), True),\\\n", 45 | " StructField(\"id\", StringType(), True),\\\n", 46 | " StructField(\"gender\", StringType(), True),\\\n", 47 | " StructField(\"salary\", IntegerType(), True)\\\n", 48 | " ])\n", 49 | "\n", 50 | "df = spark.createDataFrame(data = data, schema = schema)\n", 51 | "df.printSchema()\n", 52 | "df.show(truncate = False)" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 0, 58 | "metadata": { 59 | "application/vnd.databricks.v1+cell": { 60 | "cellMetadata": { 61 | "byteLimit": 2048000, 62 | "rowLimit": 10000 63 | }, 64 | "inputWidgets": {}, 65 | "nuid": "394b5bc2-e1d4-4771-91d4-d8f24ee39c57", 66 | "showTitle": false, 67 | "title": "" 68 | } 69 | }, 70 | "outputs": [ 71 | { 72 | "output_type": "stream", 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "root\n |-- employee_name: string (nullable = true)\n |-- department: string (nullable = true)\n |-- salary: long (nullable = true)\n\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|James |Sales |3000 |\n|Michael |Sales |4600 |\n|Robert |Sales |4100 |\n|Maria |Finance |3000 |\n|James |Sales |3000 |\n|Scott |Finance |3300 |\n|Jen |Finance |3900 |\n|Jeff |Marketing |3000 |\n|Kumar |Marketing |2000 |\n|Dogu |Sales |4100 |\n+-------------+----------+------+\n\n" 77 | ] 78 | } 79 | ], 80 | "source": [ 81 | "import pyspark \n", 82 | "from pyspark.sql import SparkSession\n", 83 | "from pyspark.sql.functions import expr\n", 84 | "\n", 85 | "spark = SparkSession.builder \\\n", 86 | " .master(\"local[1]\") \\\n", 87 | " .appName('ProjectSecond') \\\n", 88 | " .getOrCreate()\n", 89 | "\n", 90 | "data = [(\"James\", \"Sales\", 3000),\\\n", 91 | " (\"Michael\", \"Sales\", 4600),\\\n", 92 | " (\"Robert\", \"Sales\", 4100),\\\n", 93 | " (\"Maria\", \"Finance\", 3000),\\\n", 94 | " (\"James\", \"Sales\", 3000),\\\n", 95 | " (\"Scott\", \"Finance\", 3300),\\\n", 96 | " (\"Jen\", \"Finance\", 3900),\\\n", 97 | " (\"Jeff\", \"Marketing\", 3000),\\\n", 98 | " (\"Kumar\", \"Marketing\", 2000),\\\n", 99 | " (\"Dogu\", \"Sales\", 4100)]\n", 100 | "\n", 101 | "column = [\"employee_name\", \"department\", \"salary\"]\n", 102 | "df = spark.createDataFrame(data = data, schema = column)\n", 103 | "df.printSchema()\n", 104 | "df.show(truncate = False)" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 0, 110 | "metadata": { 111 | "application/vnd.databricks.v1+cell": { 112 | "cellMetadata": { 113 | "byteLimit": 2048000, 114 | "rowLimit": 10000 115 | }, 116 | "inputWidgets": {}, 117 | "nuid": "033c7a6b-db54-4457-8762-51fd44b788e5", 118 | "showTitle": false, 119 | "title": "" 120 | } 121 | }, 122 | "outputs": [ 123 | { 124 | "output_type": "stream", 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "Distinct Count: 9\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|Michael |Sales |4600 |\n|James |Sales |3000 |\n|Robert |Sales |4100 |\n|Maria |Finance |3000 |\n|Jen |Finance |3900 |\n|Scott |Finance |3300 |\n|Kumar |Marketing |2000 |\n|Jeff |Marketing |3000 |\n|Dogu |Sales |4100 |\n+-------------+----------+------+\n\nDistinct Count: 9\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|Michael |Sales |4600 |\n|James |Sales |3000 |\n|Robert |Sales |4100 |\n|Maria |Finance |3000 |\n|Jen |Finance |3900 |\n|Scott |Finance |3300 |\n|Kumar |Marketing |2000 |\n|Jeff |Marketing |3000 |\n|Dogu |Sales |4100 |\n+-------------+----------+------+\n\nDistinct Count: 8\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|Maria |Finance |3000 |\n|Scott |Finance |3300 |\n|Jen |Finance |3900 |\n|Kumar |Marketing |2000 |\n|Jeff |Marketing |3000 |\n|James |Sales |3000 |\n|Robert |Sales |4100 |\n|Michael |Sales |4600 |\n+-------------+----------+------+\n\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "#Distinct\n", 134 | "distinctDF = df.distinct()\n", 135 | "print(\"Distinct Count: \" + str(distinctDF.count()))\n", 136 | "distinctDF.show(truncate = False)\n", 137 | "\n", 138 | "#Drop Duplicates\n", 139 | "df2 = df.dropDuplicates()\n", 140 | "print(\"Distinct Count: \" + str(df2.count()))\n", 141 | "df2.show(truncate = False)\n", 142 | "\n", 143 | "dropDisDF = df.dropDuplicates([\"department\", \"salary\"])\n", 144 | "print(\"Distinct Count: \" + str(dropDisDF.count()))\n", 145 | "dropDisDF.show(truncate = False)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 0, 151 | "metadata": { 152 | "application/vnd.databricks.v1+cell": { 153 | "cellMetadata": { 154 | "byteLimit": 2048000, 155 | "rowLimit": 10000 156 | }, 157 | "inputWidgets": {}, 158 | "nuid": "c0f829d6-bfd6-44da-b5a9-b29bafe573d4", 159 | "showTitle": false, 160 | "title": "" 161 | } 162 | }, 163 | "outputs": [ 164 | { 165 | "output_type": "stream", 166 | "name": "stdout", 167 | "output_type": "stream", 168 | "text": [ 169 | "root\n |-- firstname: string (nullable = true)\n |-- middlename: string (nullable = true)\n |-- lastname: string (nullable = true)\n |-- id: string (nullable = true)\n |-- gender: string (nullable = true)\n |-- salary: integer (nullable = true)\n\n+---------+----------+--------+-----+------+------+\n|firstname|middlename|lastname|id |gender|salary|\n+---------+----------+--------+-----+------+------+\n|James | |William |36636|M |3000 |\n|Michael |Smith | |40288|M |4000 |\n|Robert | |Dawson |42114|M |4000 |\n|Maria | |Jones |39192|F |4000 |\n+---------+----------+--------+-----+------+------+\n\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "import pyspark \n", 175 | "from pyspark.sql import SparkSession\n", 176 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n", 177 | "\n", 178 | "spark = SparkSession.builder \\\n", 179 | " .master(\"local[1]\") \\\n", 180 | " .appName('ProjectFirst') \\\n", 181 | " .getOrCreate()\n", 182 | "\n", 183 | "data = [(\"James\", \"\", \"William\", \"36636\", \"M\", 3000), (\"Michael\", \"Smith\", \"\", \"40288\", \"M\", 4000), (\"Robert\", \"\", \"Dawson\", \"42114\", \"M\", 4000), \n", 184 | " (\"Maria\", \"\", \"Jones\", \"39192\", \"F\", 4000)]\n", 185 | "\n", 186 | "schema = StructType([\n", 187 | " StructField(\"firstname\", StringType(), True),\\\n", 188 | " StructField(\"middlename\", StringType(), True),\\\n", 189 | " StructField(\"lastname\", StringType(), True),\\\n", 190 | " StructField(\"id\", StringType(), True),\\\n", 191 | " StructField(\"gender\", StringType(), True),\\\n", 192 | " StructField(\"salary\", IntegerType(), True)\\\n", 193 | " ])\n", 194 | "\n", 195 | "df = spark.createDataFrame(data = data, schema = schema)\n", 196 | "df.printSchema()\n", 197 | "df.show(truncate = False)" 198 | ] 199 | }, 200 | { 201 | "cell_type": "code", 202 | "execution_count": 0, 203 | "metadata": { 204 | "application/vnd.databricks.v1+cell": { 205 | "cellMetadata": { 206 | "byteLimit": 2048000, 207 | "rowLimit": 10000 208 | }, 209 | "inputWidgets": {}, 210 | "nuid": "e22f9b94-b38d-4f4f-9020-4d4284479b3b", 211 | "showTitle": false, 212 | "title": "" 213 | } 214 | }, 215 | "outputs": [ 216 | { 217 | "output_type": "stream", 218 | "name": "stdout", 219 | "output_type": "stream", 220 | "text": [ 221 | " firstname middlename lastname id gender salary\n0 James William 36636 M 3000\n1 Michael Smith 40288 M 4000\n2 Robert Dawson 42114 M 4000\n3 Maria Jones 39192 F 4000\n" 222 | ] 223 | } 224 | ], 225 | "source": [ 226 | "PandasDF = df.toPandas()\n", 227 | "print(PandasDF)" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": 0, 233 | "metadata": { 234 | "application/vnd.databricks.v1+cell": { 235 | "cellMetadata": { 236 | "byteLimit": 2048000, 237 | "rowLimit": 10000 238 | }, 239 | "inputWidgets": {}, 240 | "nuid": "ca514ad4-b095-4345-b589-31a0d3eda120", 241 | "showTitle": false, 242 | "title": "" 243 | } 244 | }, 245 | "outputs": [ 246 | { 247 | "output_type": "stream", 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "root\n |-- Product: string (nullable = true)\n |-- Amount: long (nullable = true)\n |-- Country: string (nullable = true)\n\n+-------+------+-------+\n|Product|Amount|Country|\n+-------+------+-------+\n|Banana |1000 |USA |\n|Carrots|1500 |USA |\n|Beans |1600 |USA |\n|Orange |2000 |USA |\n|Orange |2000 |USA |\n|Banana |4000 |China |\n|Carrots|1200 |China |\n|Beans |1500 |China |\n|Orange |4000 |China |\n|Banana |2000 |Canada |\n|Carrots|2000 |Canada |\n|Beans |2000 |Mexico |\n+-------+------+-------+\n\n" 252 | ] 253 | } 254 | ], 255 | "source": [ 256 | "import pyspark \n", 257 | "from pyspark.sql import SparkSession\n", 258 | "from pyspark.sql.functions import expr\n", 259 | "\n", 260 | "data = [(\"Banana\", 1000, \"USA\"), (\"Carrots\", 1500, \"USA\"), (\"Beans\", 1600, \"USA\"),\\\n", 261 | " (\"Orange\", 2000, \"USA\"), (\"Orange\", 2000, \"USA\"), (\"Banana\", 4000, \"China\"),\\\n", 262 | " (\"Carrots\", 1200, \"China\"), (\"Beans\", 1500, \"China\"), (\"Orange\", 4000, \"China\"),\\\n", 263 | " (\"Banana\", 2000, \"Canada\"), (\"Carrots\", 2000, \"Canada\"), (\"Beans\", 2000, \"Mexico\")\\\n", 264 | " ]\n", 265 | "\n", 266 | "columns = ['Product', 'Amount', 'Country']\n", 267 | "\n", 268 | "df = spark.createDataFrame(data = data, schema = columns)\n", 269 | "df.printSchema()\n", 270 | "df.show(truncate = False)" 271 | ] 272 | }, 273 | { 274 | "cell_type": "code", 275 | "execution_count": 0, 276 | "metadata": { 277 | "application/vnd.databricks.v1+cell": { 278 | "cellMetadata": { 279 | "byteLimit": 2048000, 280 | "rowLimit": 10000 281 | }, 282 | "inputWidgets": {}, 283 | "nuid": "d4afd5bd-c8ca-4af8-bb36-55aac49cd161", 284 | "showTitle": false, 285 | "title": "" 286 | } 287 | }, 288 | "outputs": [ 289 | { 290 | "output_type": "stream", 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "root\n |-- Product: string (nullable = true)\n |-- Canada: long (nullable = true)\n |-- China: long (nullable = true)\n |-- Mexico: long (nullable = true)\n |-- USA: long (nullable = true)\n\n+-------+------+-----+------+----+\n|Product|Canada|China|Mexico|USA |\n+-------+------+-----+------+----+\n|Orange |null |4000 |null |4000|\n|Beans |null |1500 |2000 |1600|\n|Banana |2000 |4000 |null |1000|\n|Carrots|2000 |1200 |null |1500|\n+-------+------+-----+------+----+\n\n" 295 | ] 296 | } 297 | ], 298 | "source": [ 299 | "pivotDF = df.groupBy(\"Product\").pivot(\"Country\").sum(\"Amount\")\n", 300 | "pivotDF.printSchema()\n", 301 | "pivotDF.show(truncate = False)" 302 | ] 303 | }, 304 | { 305 | "cell_type": "code", 306 | "execution_count": 0, 307 | "metadata": { 308 | "application/vnd.databricks.v1+cell": { 309 | "cellMetadata": { 310 | "byteLimit": 2048000, 311 | "rowLimit": 10000 312 | }, 313 | "inputWidgets": {}, 314 | "nuid": "5c264134-47b5-4a93-ab93-370312c9d58e", 315 | "showTitle": false, 316 | "title": "" 317 | } 318 | }, 319 | "outputs": [ 320 | { 321 | "output_type": "stream", 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "+-----+-----+---------+-----+\n| TV|Radio|Newspaper|Sales|\n+-----+-----+---------+-----+\n|230.1| 37.8| 69.2| 22.1|\n| 44.5| 39.3| 45.1| 10.4|\n| 17.2| 45.9| 69.3| 9.3|\n|151.5| 41.3| 58.5| 18.5|\n|180.8| 10.8| 58.4| 12.9|\n+-----+-----+---------+-----+\nonly showing top 5 rows\n\nroot\n |-- TV: double (nullable = true)\n |-- Radio: double (nullable = true)\n |-- Newspaper: double (nullable = true)\n |-- Sales: double (nullable = true)\n\n" 326 | ] 327 | } 328 | ], 329 | "source": [ 330 | "from pyspark.sql import SparkSession\n", 331 | "\n", 332 | "spark = SparkSession.builder \\\n", 333 | " .master(\"local[1]\") \\\n", 334 | " .appName('ProjectThird') \\\n", 335 | " .getOrCreate()\n", 336 | "\n", 337 | "df = spark.read.format('delta') \\\n", 338 | " .options(header = 'True', inferschema = 'True')\\\n", 339 | " .load(\"/user/hive/warehouse/advertising\", header = True)\n", 340 | "\n", 341 | "df.show(5)\n", 342 | "df.printSchema()" 343 | ] 344 | }, 345 | { 346 | "cell_type": "code", 347 | "execution_count": 0, 348 | "metadata": { 349 | "application/vnd.databricks.v1+cell": { 350 | "cellMetadata": { 351 | "byteLimit": 2048000, 352 | "rowLimit": 10000 353 | }, 354 | "inputWidgets": {}, 355 | "nuid": "53ba210d-4f66-44d6-a595-547c62e58bee", 356 | "showTitle": false, 357 | "title": "" 358 | } 359 | }, 360 | "outputs": [ 361 | { 362 | "output_type": "stream", 363 | "name": "stdout", 364 | "output_type": "stream", 365 | "text": [ 366 | "+----+----+----+-----+\n|col1|col2|col3| col4|\n+----+----+----+-----+\n| 1| 2| 3|a b c|\n| 4| 5| 6|d e f|\n| 7| 8| 9|g h i|\n+----+----+----+-----+\n\n" 367 | ] 368 | } 369 | ], 370 | "source": [ 371 | "#RDD creation\n", 372 | "\n", 373 | "from pyspark.sql import SparkSession\n", 374 | "\n", 375 | "spark = SparkSession.builder \\\n", 376 | " .master(\"local[1]\") \\\n", 377 | " .appName('ProjectRDDCreation') \\\n", 378 | " .getOrCreate()\n", 379 | "\n", 380 | "df = spark.sparkContext.parallelize([(1,2,3, 'a b c'), (4,5,6, 'd e f'), (7,8,9, 'g h i')]).toDF(['col1', 'col2', 'col3', 'col4'])\n", 381 | "df.show()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 0, 387 | "metadata": { 388 | "application/vnd.databricks.v1+cell": { 389 | "cellMetadata": {}, 390 | "inputWidgets": {}, 391 | "nuid": "fea6c861-1141-46bb-8284-f26f9b0aff59", 392 | "showTitle": false, 393 | "title": "" 394 | } 395 | }, 396 | "outputs": [], 397 | "source": [ 398 | "#Transformations & Actions" 399 | ] 400 | } 401 | ], 402 | "metadata": { 403 | "application/vnd.databricks.v1+notebook": { 404 | "dashboards": [], 405 | "language": "python", 406 | "notebookMetadata": { 407 | "pythonIndentUnit": 4 408 | }, 409 | "notebookName": "04072023 - DEB", 410 | "widgets": {} 411 | } 412 | }, 413 | "nbformat": 4, 414 | "nbformat_minor": 0 415 | } 416 | -------------------------------------------------------------------------------- /06072023 - DEB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "2368cd69-5209-43e5-b5f5-d2946a9ed25b", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [ 19 | { 20 | "output_type": "stream", 21 | "name": "stdout", 22 | "output_type": "stream", 23 | "text": [ 24 | "root\n |-- TV: double (nullable = true)\n |-- Radio: double (nullable = true)\n |-- Newspaper: double (nullable = true)\n |-- Sales: double (nullable = true)\n\n+-----+-----+---------+-----+\n| TV|Radio|Newspaper|Sales|\n+-----+-----+---------+-----+\n|230.1| 37.8| 69.2| 22.1|\n| 44.5| 39.3| 45.1| 10.4|\n| 17.2| 45.9| 69.3| 9.3|\n|151.5| 41.3| 58.5| 18.5|\n|180.8| 10.8| 58.4| 12.9|\n+-----+-----+---------+-----+\nonly showing top 5 rows\n\n" 25 | ] 26 | } 27 | ], 28 | "source": [ 29 | "#y = b0 + b1*x\n", 30 | "\n", 31 | "from pyspark.sql import SparkSession\n", 32 | "\n", 33 | "spark = SparkSession \\\n", 34 | " .builder \\\n", 35 | " .appName('Linear Regression with PySpark') \\\n", 36 | " .getOrCreate()\n", 37 | "\n", 38 | "df = spark.read.format('delta').\\\n", 39 | " options(header = 'true', inferschema = 'true').\\\n", 40 | " load(\"/user/hive/warehouse/advertising\", header = True)\n", 41 | "\n", 42 | "df.printSchema()\n", 43 | "df.show(5)" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 0, 49 | "metadata": { 50 | "application/vnd.databricks.v1+cell": { 51 | "cellMetadata": { 52 | "byteLimit": 2048000, 53 | "rowLimit": 10000 54 | }, 55 | "inputWidgets": {}, 56 | "nuid": "2df871da-5142-42e4-8234-49ddae8ebb4c", 57 | "showTitle": false, 58 | "title": "" 59 | } 60 | }, 61 | "outputs": [ 62 | { 63 | "output_type": "stream", 64 | "name": "stdout", 65 | "output_type": "stream", 66 | "text": [ 67 | "+-------+-----------------+------------------+------------------+------------------+\n|summary| TV| Radio| Newspaper| Sales|\n+-------+-----------------+------------------+------------------+------------------+\n| count| 200| 200| 200| 200|\n| mean| 147.0425|23.264000000000024|30.553999999999995|14.022500000000003|\n| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|\n| min| 0.7| 0.0| 0.3| 1.6|\n| max| 296.4| 49.6| 114.0| 27.0|\n+-------+-----------------+------------------+------------------+------------------+\n\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "df.describe().show()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 0, 78 | "metadata": { 79 | "application/vnd.databricks.v1+cell": { 80 | "cellMetadata": { 81 | "byteLimit": 2048000, 82 | "rowLimit": 10000 83 | }, 84 | "inputWidgets": {}, 85 | "nuid": "3510b437-2a07-4830-8133-c8069353684f", 86 | "showTitle": false, 87 | "title": "" 88 | } 89 | }, 90 | "outputs": [ 91 | { 92 | "output_type": "stream", 93 | "name": "stdout", 94 | "output_type": "stream", 95 | "text": [ 96 | "+-----------------+-----+\n| features|label|\n+-----------------+-----+\n|[230.1,37.8,69.2]| 22.1|\n| [44.5,39.3,45.1]| 10.4|\n| [17.2,45.9,69.3]| 9.3|\n|[151.5,41.3,58.5]| 18.5|\n|[180.8,10.8,58.4]| 12.9|\n+-----------------+-----+\nonly showing top 5 rows\n\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "from pyspark.sql import Row\n", 102 | "from pyspark.ml.linalg import Vectors\n", 103 | "\n", 104 | "def transData(data):\n", 105 | " return data.rdd.map(lambda r : [Vectors.dense(r[:-1]), r[-1]]).toDF(['features', 'label'])\n", 106 | "\n", 107 | "transformed = transData(df)\n", 108 | "transformed.show(5)" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 0, 114 | "metadata": { 115 | "application/vnd.databricks.v1+cell": { 116 | "cellMetadata": { 117 | "byteLimit": 2048000, 118 | "rowLimit": 10000 119 | }, 120 | "inputWidgets": {}, 121 | "nuid": "7d404853-88c2-415d-bce7-9fd33e0b2360", 122 | "showTitle": false, 123 | "title": "" 124 | } 125 | }, 126 | "outputs": [ 127 | { 128 | "output_type": "stream", 129 | "name": "stdout", 130 | "output_type": "stream", 131 | "text": [ 132 | "+-----------------+-----+-----------------+\n| features|label| indexedFeatures|\n+-----------------+-----+-----------------+\n|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|\n| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|\n| [17.2,45.9,69.3]| 9.3| [17.2,45.9,69.3]|\n|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|\n|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|\n+-----------------+-----+-----------------+\nonly showing top 5 rows\n\n" 133 | ] 134 | } 135 | ], 136 | "source": [ 137 | "from pyspark.ml import Pipeline\n", 138 | "from pyspark.ml.regression import LinearRegression\n", 139 | "from pyspark.ml.feature import VectorIndexer\n", 140 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 141 | "\n", 142 | "featureIndexer = VectorIndexer(inputCol = \"features\", outputCol = \"indexedFeatures\", maxCategories = 4).fit(transformed)\n", 143 | "\n", 144 | "data = featureIndexer.transform(transformed)\n", 145 | "\n", 146 | "data.show(5)" 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "execution_count": 0, 152 | "metadata": { 153 | "application/vnd.databricks.v1+cell": { 154 | "cellMetadata": { 155 | "byteLimit": 2048000, 156 | "rowLimit": 10000 157 | }, 158 | "inputWidgets": {}, 159 | "nuid": "b2bc613d-7b82-40fe-87d9-194d2718f380", 160 | "showTitle": false, 161 | "title": "" 162 | } 163 | }, 164 | "outputs": [ 165 | { 166 | "output_type": "stream", 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "+---------------+-----+---------------+\n| features|label|indexedFeatures|\n+---------------+-----+---------------+\n| [0.7,39.6,8.7]| 1.6| [0.7,39.6,8.7]|\n| [4.1,11.6,5.7]| 3.2| [4.1,11.6,5.7]|\n| [5.4,29.9,9.4]| 5.3| [5.4,29.9,9.4]|\n|[7.3,28.1,41.4]| 5.5|[7.3,28.1,41.4]|\n|[7.8,38.9,50.6]| 6.6|[7.8,38.9,50.6]|\n+---------------+-----+---------------+\nonly showing top 5 rows\n\n+----------------+-----+----------------+\n| features|label| indexedFeatures|\n+----------------+-----+----------------+\n| [8.4,27.2,2.1]| 5.7| [8.4,27.2,2.1]|\n| [8.6,2.1,1.0]| 4.8| [8.6,2.1,1.0]|\n| [8.7,48.9,75.0]| 7.2| [8.7,48.9,75.0]|\n|[13.2,15.9,49.6]| 5.6|[13.2,15.9,49.6]|\n|[18.7,12.1,23.4]| 6.7|[18.7,12.1,23.4]|\n+----------------+-----+----------------+\nonly showing top 5 rows\n\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "(trainingData, testData) = data.randomSplit([0.6, 0.4])\n", 176 | "trainingData.show(5)\n", 177 | "testData.show(5)" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 0, 183 | "metadata": { 184 | "application/vnd.databricks.v1+cell": { 185 | "cellMetadata": { 186 | "byteLimit": 2048000, 187 | "rowLimit": 10000 188 | }, 189 | "inputWidgets": {}, 190 | "nuid": "bedc939e-99b7-4f47-9a40-b6964d9c8947", 191 | "showTitle": false, 192 | "title": "" 193 | } 194 | }, 195 | "outputs": [], 196 | "source": [ 197 | "lr = LinearRegression()\n", 198 | "\n", 199 | "pipeline = Pipeline(stages = [featureIndexer, lr])\n", 200 | "\n", 201 | "model = pipeline.fit(trainingData)" 202 | ] 203 | }, 204 | { 205 | "cell_type": "code", 206 | "execution_count": 0, 207 | "metadata": { 208 | "application/vnd.databricks.v1+cell": { 209 | "cellMetadata": { 210 | "byteLimit": 2048000, 211 | "rowLimit": 10000 212 | }, 213 | "inputWidgets": {}, 214 | "nuid": "e5f5102d-c0f5-4d72-9e27-8983a4c76876", 215 | "showTitle": false, 216 | "title": "" 217 | } 218 | }, 219 | "outputs": [ 220 | { 221 | "output_type": "stream", 222 | "name": "stdout", 223 | "output_type": "stream", 224 | "text": [ 225 | "+----------------+-----+------------------+\n| features|label| prediction|\n+----------------+-----+------------------+\n| [8.4,27.2,2.1]| 5.7| 8.229211181760627|\n| [8.6,2.1,1.0]| 4.8| 3.644851163299394|\n| [8.7,48.9,75.0]| 7.2|12.020689858644344|\n|[13.2,15.9,49.6]| 5.6| 6.25831749737122|\n|[18.7,12.1,23.4]| 6.7| 5.893004667299868|\n+----------------+-----+------------------+\nonly showing top 5 rows\n\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "predictions = model.transform(testData)\n", 231 | "\n", 232 | "predictions.select(\"features\", \"label\", \"prediction\").show(5)" 233 | ] 234 | }, 235 | { 236 | "cell_type": "code", 237 | "execution_count": 0, 238 | "metadata": { 239 | "application/vnd.databricks.v1+cell": { 240 | "cellMetadata": { 241 | "byteLimit": 2048000, 242 | "rowLimit": 10000 243 | }, 244 | "inputWidgets": {}, 245 | "nuid": "8c828be1-7ec4-4fc8-adf3-67e20b583020", 246 | "showTitle": false, 247 | "title": "" 248 | } 249 | }, 250 | "outputs": [ 251 | { 252 | "output_type": "stream", 253 | "name": "stdout", 254 | "output_type": "stream", 255 | "text": [ 256 | "Root Mean Square Error (RMSE) on test data = 1.55713\n" 257 | ] 258 | } 259 | ], 260 | "source": [ 261 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 262 | "\n", 263 | "evaluator = RegressionEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'rmse')\n", 264 | "\n", 265 | "rmse = evaluator.evaluate(predictions)\n", 266 | "print('Root Mean Square Error (RMSE) on test data = %g' % rmse)" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 0, 272 | "metadata": { 273 | "application/vnd.databricks.v1+cell": { 274 | "cellMetadata": { 275 | "byteLimit": 2048000, 276 | "rowLimit": 10000 277 | }, 278 | "inputWidgets": {}, 279 | "nuid": "c87da307-16aa-41b2-b429-7cc53d7ee3e0", 280 | "showTitle": false, 281 | "title": "" 282 | } 283 | }, 284 | "outputs": [ 285 | { 286 | "output_type": "stream", 287 | "name": "stdout", 288 | "output_type": "stream", 289 | "text": [ 290 | "r2_score: 0.8899337308584387\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "#r2-score hesabı\n", 296 | "\n", 297 | "y_true = predictions.select('label').toPandas()\n", 298 | "y_pred = predictions.select('prediction').toPandas()\n", 299 | "\n", 300 | "import sklearn.metrics\n", 301 | "\n", 302 | "r2_score = sklearn.metrics.r2_score(y_true, y_pred)\n", 303 | "print('r2_score: {0}'.format(r2_score))" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 0, 309 | "metadata": { 310 | "application/vnd.databricks.v1+cell": { 311 | "cellMetadata": { 312 | "byteLimit": 2048000, 313 | "rowLimit": 10000 314 | }, 315 | "inputWidgets": {}, 316 | "nuid": "84eea05d-8018-4f69-93b5-ba9eaea5b6dc", 317 | "showTitle": false, 318 | "title": "" 319 | } 320 | }, 321 | "outputs": [], 322 | "source": [ 323 | "#Decision Tree Classification" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 0, 329 | "metadata": { 330 | "application/vnd.databricks.v1+cell": { 331 | "cellMetadata": { 332 | "byteLimit": 2048000, 333 | "rowLimit": 10000 334 | }, 335 | "inputWidgets": {}, 336 | "nuid": "06824127-5c6a-48c3-9b9d-f728a07682c4", 337 | "showTitle": false, 338 | "title": "" 339 | } 340 | }, 341 | "outputs": [], 342 | "source": [ 343 | "from pyspark.sql import SparkSession\n", 344 | "\n", 345 | "spark = SparkSession \\\n", 346 | " .builder \\\n", 347 | " .appName('DT Classification with Pyspark') \\\n", 348 | " .getOrCreate()" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 0, 354 | "metadata": { 355 | "application/vnd.databricks.v1+cell": { 356 | "cellMetadata": { 357 | "byteLimit": 2048000, 358 | "rowLimit": 10000 359 | }, 360 | "inputWidgets": {}, 361 | "nuid": "b2a61147-fe4b-462b-9b04-1d979140ef88", 362 | "showTitle": false, 363 | "title": "" 364 | } 365 | }, 366 | "outputs": [ 367 | { 368 | "output_type": "stream", 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| 5|\n| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| 5|\n| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| 6|\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\nonly showing top 5 rows\n\n" 373 | ] 374 | } 375 | ], 376 | "source": [ 377 | "df = spark.read.format('delta').\\\n", 378 | " options(header = 'true', inferschema = 'true')\\\n", 379 | " .load(\"/user/hive/warehouse/wine_data\", header = 'True')\n", 380 | "\n", 381 | "df.show(5, True)" 382 | ] 383 | }, 384 | { 385 | "cell_type": "code", 386 | "execution_count": 0, 387 | "metadata": { 388 | "application/vnd.databricks.v1+cell": { 389 | "cellMetadata": { 390 | "byteLimit": 2048000, 391 | "rowLimit": 10000 392 | }, 393 | "inputWidgets": {}, 394 | "nuid": "acea6e1c-fae9-41c0-b071-58a429e8e7d0", 395 | "showTitle": false, 396 | "title": "" 397 | } 398 | }, 399 | "outputs": [], 400 | "source": [ 401 | "def condition(r):\n", 402 | "\n", 403 | " if (0 <= r <= 4):\n", 404 | " label = 'low'\n", 405 | " \n", 406 | " elif (4 < r <= 6):\n", 407 | " label = 'medium'\n", 408 | "\n", 409 | " else:\n", 410 | " label = 'high'\n", 411 | " \n", 412 | " return label\n", 413 | "\n", 414 | "def string_to_float(x):\n", 415 | " return float(x)" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 0, 421 | "metadata": { 422 | "application/vnd.databricks.v1+cell": { 423 | "cellMetadata": { 424 | "byteLimit": 2048000, 425 | "rowLimit": 10000 426 | }, 427 | "inputWidgets": {}, 428 | "nuid": "77ea88fe-f733-4139-9eb5-12780aafc42b", 429 | "showTitle": false, 430 | "title": "" 431 | } 432 | }, 433 | "outputs": [ 434 | { 435 | "output_type": "stream", 436 | "name": "stdout", 437 | "output_type": "stream", 438 | "text": [ 439 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| medium|\n| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| medium|\n| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| medium|\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\nonly showing top 5 rows\n\nroot\n |-- fixed acidity: double (nullable = true)\n |-- volatile acidity: double (nullable = true)\n |-- citric acid: double (nullable = true)\n |-- residual sugar: double (nullable = true)\n |-- chlorides: double (nullable = true)\n |-- free sulfur dioxide: double (nullable = true)\n |-- total sulfur dioxide: double (nullable = true)\n |-- density: double (nullable = true)\n |-- pH: double (nullable = true)\n |-- sulphates: double (nullable = true)\n |-- alcohol: double (nullable = true)\n |-- quality: string (nullable = true)\n\n" 440 | ] 441 | } 442 | ], 443 | "source": [ 444 | "from pyspark.sql.functions import udf\n", 445 | "from pyspark.sql.types import StringType, DoubleType\n", 446 | "string_to_float_udf = udf(string_to_float, DoubleType())\n", 447 | "quality_udf = udf(lambda x : condition(x), StringType())\n", 448 | "\n", 449 | "df = df.withColumn(\"quality\", quality_udf(\"quality\"))\n", 450 | "df.show(5)\n", 451 | "df.printSchema()" 452 | ] 453 | }, 454 | { 455 | "cell_type": "code", 456 | "execution_count": 0, 457 | "metadata": { 458 | "application/vnd.databricks.v1+cell": { 459 | "cellMetadata": { 460 | "byteLimit": 2048000, 461 | "rowLimit": 10000 462 | }, 463 | "inputWidgets": {}, 464 | "nuid": "0148bf86-6375-4a52-bc48-2d12298a7ca0", 465 | "showTitle": false, 466 | "title": "" 467 | } 468 | }, 469 | "outputs": [], 470 | "source": [ 471 | "from pyspark.ml.linalg import Vectors\n", 472 | "from pyspark.ml import Pipeline\n", 473 | "from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString\n", 474 | "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", 475 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator" 476 | ] 477 | }, 478 | { 479 | "cell_type": "code", 480 | "execution_count": 0, 481 | "metadata": { 482 | "application/vnd.databricks.v1+cell": { 483 | "cellMetadata": { 484 | "byteLimit": 2048000, 485 | "rowLimit": 10000 486 | }, 487 | "inputWidgets": {}, 488 | "nuid": "b9746597-215f-4a67-bdde-9ae9f7572597", 489 | "showTitle": false, 490 | "title": "" 491 | } 492 | }, 493 | "outputs": [ 494 | { 495 | "output_type": "stream", 496 | "name": "stdout", 497 | "output_type": "stream", 498 | "text": [ 499 | "+--------------------+------+\n| features| label|\n+--------------------+------+\n|[7.4,0.7,0.0,1.9,...|medium|\n|[7.8,0.88,0.0,2.6...|medium|\n|[7.8,0.76,0.04,2....|medium|\n|[11.2,0.28,0.56,1...|medium|\n|[7.4,0.7,0.0,1.9,...|medium|\n+--------------------+------+\nonly showing top 5 rows\n\n" 500 | ] 501 | } 502 | ], 503 | "source": [ 504 | "def transData(data):\n", 505 | " return data.rdd.map(lambda r : [Vectors.dense(r[:-1]), r[-1]]).toDF(['features', 'label'])\n", 506 | "\n", 507 | "transformed = transData(df)\n", 508 | "transformed.show(5)" 509 | ] 510 | }, 511 | { 512 | "cell_type": "code", 513 | "execution_count": 0, 514 | "metadata": { 515 | "application/vnd.databricks.v1+cell": { 516 | "cellMetadata": {}, 517 | "inputWidgets": {}, 518 | "nuid": "6b1add08-df27-4be3-a9ab-b08ff21c1f9c", 519 | "showTitle": false, 520 | "title": "" 521 | } 522 | }, 523 | "outputs": [], 524 | "source": [] 525 | } 526 | ], 527 | "metadata": { 528 | "application/vnd.databricks.v1+notebook": { 529 | "dashboards": [], 530 | "language": "python", 531 | "notebookMetadata": { 532 | "pythonIndentUnit": 4 533 | }, 534 | "notebookName": "06072023 - DEB", 535 | "widgets": {} 536 | } 537 | }, 538 | "nbformat": 4, 539 | "nbformat_minor": 0 540 | } 541 | -------------------------------------------------------------------------------- /11072023 DEB v2.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "40b070d9-8cee-4d5b-ae3d-ad44614f153c", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from pyspark.sql import SparkSession \n", 21 | "\n", 22 | "spark = SparkSession \\\n", 23 | " .builder \\\n", 24 | " .appName(\"RFM Customer Segmentation with PySpark\") \\\n", 25 | " .getOrCreate()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 0, 31 | "metadata": { 32 | "application/vnd.databricks.v1+cell": { 33 | "cellMetadata": { 34 | "byteLimit": 2048000, 35 | "rowLimit": 10000 36 | }, 37 | "inputWidgets": {}, 38 | "nuid": "09fbf487-e2cd-439e-a07a-19ccfdb2c159", 39 | "showTitle": false, 40 | "title": "" 41 | } 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "df_raw = spark.read.format('delta').\\\n", 46 | " options(header = 'true', inferschema = 'true').\\\n", 47 | " load(\"/user/hive/warehouse/online_retail2\", header = True)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 0, 53 | "metadata": { 54 | "application/vnd.databricks.v1+cell": { 55 | "cellMetadata": { 56 | "byteLimit": 2048000, 57 | "rowLimit": 10000 58 | }, 59 | "inputWidgets": {}, 60 | "nuid": "9dde06e7-64c1-431c-88a7-39401b30a751", 61 | "showTitle": false, 62 | "title": "" 63 | } 64 | }, 65 | "outputs": [ 66 | { 67 | "output_type": "stream", 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\nonly showing top 5 rows\n\nroot\n |-- InvoiceNo: string (nullable = true)\n |-- StockCode: string (nullable = true)\n |-- Description: string (nullable = true)\n |-- Quantity: long (nullable = true)\n |-- InvoiceDate: string (nullable = true)\n |-- UnitPrice: string (nullable = true)\n |-- CustomerID: long (nullable = true)\n |-- Country: string (nullable = true)\n\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df_raw.show(5)\n", 77 | "df_raw.printSchema()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 0, 83 | "metadata": { 84 | "application/vnd.databricks.v1+cell": { 85 | "cellMetadata": { 86 | "byteLimit": 2048000, 87 | "rowLimit": 10000 88 | }, 89 | "inputWidgets": {}, 90 | "nuid": "d987ba2c-dda9-4bb5-96e5-8081409408ce", 91 | "showTitle": false, 92 | "title": "" 93 | } 94 | }, 95 | "outputs": [ 96 | { 97 | "output_type": "stream", 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "from pyspark.sql.functions import count\n", 107 | "\n", 108 | "def my_count(df_in):\n", 109 | " df_in.agg(*[count(c).alias(c) for c in df_in.columns]).show()\n", 110 | "\n", 111 | "my_count(df_raw)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 0, 117 | "metadata": { 118 | "application/vnd.databricks.v1+cell": { 119 | "cellMetadata": { 120 | "byteLimit": 2048000, 121 | "rowLimit": 10000 122 | }, 123 | "inputWidgets": {}, 124 | "nuid": "f21365b1-3e8a-44be-a21e-75a90fb16efa", 125 | "showTitle": false, 126 | "title": "" 127 | } 128 | }, 129 | "outputs": [ 130 | { 131 | "output_type": "stream", 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "df = df_raw.dropna(how = \"any\")\n", 141 | "my_count(df)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 0, 147 | "metadata": { 148 | "application/vnd.databricks.v1+cell": { 149 | "cellMetadata": { 150 | "byteLimit": 2048000, 151 | "rowLimit": 10000 152 | }, 153 | "inputWidgets": {}, 154 | "nuid": "76ca2c15-48f7-4541-b1df-0e0443b5eb3e", 155 | "showTitle": false, 156 | "title": "" 157 | } 158 | }, 159 | "outputs": [ 160 | { 161 | "output_type": "stream", 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country|NewInvoiceDate|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom| null|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom| null|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom| null|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom| null|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom| null|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+\nonly showing top 5 rows\n\n" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col, when\n", 171 | "\n", 172 | "timeFmt = \"MM/dd/yy HH:mm\"\n", 173 | "\n", 174 | "df = df.withColumn('NewInvoiceDate', when(col('InvoiceDate').isNotNull(), to_utc_timestamp(unix_timestamp(col('InvoiceDate'), timeFmt).cast('timestamp'), 'UTC')).otherwise(col('InvoiceDate')))\n", 175 | "\n", 176 | "df.show(5)" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 0, 182 | "metadata": { 183 | "application/vnd.databricks.v1+cell": { 184 | "cellMetadata": { 185 | "byteLimit": 2048000, 186 | "rowLimit": 10000 187 | }, 188 | "inputWidgets": {}, 189 | "nuid": "2d5f96fc-394f-44bd-87bd-4364c67d92c9", 190 | "showTitle": false, 191 | "title": "" 192 | } 193 | }, 194 | "outputs": [ 195 | { 196 | "output_type": "stream", 197 | "name": "stdout", 198 | "output_type": "stream", 199 | "text": [ 200 | "+----------+-------+---------+--------+\n|CustomerID|Recency|Frequency|Monetary|\n+----------+-------+---------+--------+\n| 15194| null| 22| null|\n| 17703| null| 3| null|\n| 13452| null| 2| 590.0|\n| 13098| null| 41| null|\n| 17048| null| 6| null|\n| 13638| null| 1| null|\n| 15322| null| 2| null|\n| 13723| null| 1| null|\n| 16597| null| 1| null|\n| 15237| null| 4| null|\n| 13248| null| 2| null|\n| 16742| null| 2| null|\n| 14719| null| 6| null|\n| 17043| null| 4| null|\n| 14117| null| 1| null|\n| 15057| null| 2| null|\n| 17979| null| 5| null|\n| 13460| null| 2| null|\n| 13518| null| 1| null|\n| 15432| null| 1| null|\n+----------+-------+---------+--------+\nonly showing top 20 rows\n\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "from pyspark.sql.functions import round\n", 206 | "\n", 207 | "df = df.withColumn('TotalPrice', round(df.Quantity * df.UnitPrice, 2) )\n", 208 | "\n", 209 | "from pyspark.sql.functions import mean, min, max, sum, datediff, to_date\n", 210 | "\n", 211 | "date_max = df.select(max('NewInvoiceDate')).toPandas()\n", 212 | "\n", 213 | "current = to_utc_timestamp(unix_timestamp(lit(str(date_max.iloc[0][0])), 'yy-MM-dd HH:mm').cast('timestamp'), 'UTC')\n", 214 | "\n", 215 | "df = df.withColumn('Duration', datediff(lit(current), 'NewInvoiceDate'))\n", 216 | "\n", 217 | "#Recency, Frequency, Monetary\n", 218 | "\n", 219 | "recency = df.groupBy('CustomerID').agg(min('Duration').alias('Recency'))\n", 220 | "\n", 221 | "frequency = df.groupBy('CustomerID', 'InvoiceNo').count()\\\n", 222 | " .groupBy('CustomerID')\\\n", 223 | " .agg(count('*').alias(\"Frequency\"))\n", 224 | "\n", 225 | "monetary = df.groupBy('CustomerID').agg(round(sum('TotalPrice'), 2).alias('Monetary'))\n", 226 | "\n", 227 | "rfm = recency.join(frequency, 'CustomerID', how = 'inner')\\\n", 228 | " .join(monetary, 'CustomerID', how = 'inner')\n", 229 | "\n", 230 | "rfm.show()" 231 | ] 232 | } 233 | ], 234 | "metadata": { 235 | "application/vnd.databricks.v1+notebook": { 236 | "dashboards": [], 237 | "language": "python", 238 | "notebookMetadata": { 239 | "pythonIndentUnit": 4 240 | }, 241 | "notebookName": "11072023 DEB v2", 242 | "widgets": {} 243 | } 244 | }, 245 | "nbformat": 4, 246 | "nbformat_minor": 0 247 | } 248 | -------------------------------------------------------------------------------- /13072023 - DEB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 0, 6 | "metadata": { 7 | "application/vnd.databricks.v1+cell": { 8 | "cellMetadata": { 9 | "byteLimit": 2048000, 10 | "rowLimit": 10000 11 | }, 12 | "inputWidgets": {}, 13 | "nuid": "40b070d9-8cee-4d5b-ae3d-ad44614f153c", 14 | "showTitle": false, 15 | "title": "" 16 | } 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from pyspark.sql import SparkSession \n", 21 | "\n", 22 | "spark = SparkSession \\\n", 23 | " .builder \\\n", 24 | " .appName(\"RFM Customer Segmentation with PySpark\") \\\n", 25 | " .getOrCreate()" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": 0, 31 | "metadata": { 32 | "application/vnd.databricks.v1+cell": { 33 | "cellMetadata": { 34 | "byteLimit": 2048000, 35 | "rowLimit": 10000 36 | }, 37 | "inputWidgets": {}, 38 | "nuid": "09fbf487-e2cd-439e-a07a-19ccfdb2c159", 39 | "showTitle": false, 40 | "title": "" 41 | } 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "df_raw = spark.read.format('delta').\\\n", 46 | " options(header = 'true', inferschema = 'true').\\\n", 47 | " load(\"/user/hive/warehouse/online_retail2\", header = True)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "code", 52 | "execution_count": 0, 53 | "metadata": { 54 | "application/vnd.databricks.v1+cell": { 55 | "cellMetadata": { 56 | "byteLimit": 2048000, 57 | "rowLimit": 10000 58 | }, 59 | "inputWidgets": {}, 60 | "nuid": "9dde06e7-64c1-431c-88a7-39401b30a751", 61 | "showTitle": false, 62 | "title": "" 63 | } 64 | }, 65 | "outputs": [ 66 | { 67 | "output_type": "stream", 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\nonly showing top 5 rows\n\nroot\n |-- InvoiceNo: string (nullable = true)\n |-- StockCode: string (nullable = true)\n |-- Description: string (nullable = true)\n |-- Quantity: long (nullable = true)\n |-- InvoiceDate: string (nullable = true)\n |-- UnitPrice: string (nullable = true)\n |-- CustomerID: long (nullable = true)\n |-- Country: string (nullable = true)\n\n" 72 | ] 73 | } 74 | ], 75 | "source": [ 76 | "df_raw.show(5)\n", 77 | "df_raw.printSchema()" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 0, 83 | "metadata": { 84 | "application/vnd.databricks.v1+cell": { 85 | "cellMetadata": { 86 | "byteLimit": 2048000, 87 | "rowLimit": 10000 88 | }, 89 | "inputWidgets": {}, 90 | "nuid": "d987ba2c-dda9-4bb5-96e5-8081409408ce", 91 | "showTitle": false, 92 | "title": "" 93 | } 94 | }, 95 | "outputs": [ 96 | { 97 | "output_type": "stream", 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "from pyspark.sql.functions import count\n", 107 | "\n", 108 | "def my_count(df_in):\n", 109 | " df_in.agg(*[count(c).alias(c) for c in df_in.columns]).show()\n", 110 | "\n", 111 | "my_count(df_raw)" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 0, 117 | "metadata": { 118 | "application/vnd.databricks.v1+cell": { 119 | "cellMetadata": { 120 | "byteLimit": 2048000, 121 | "rowLimit": 10000 122 | }, 123 | "inputWidgets": {}, 124 | "nuid": "f21365b1-3e8a-44be-a21e-75a90fb16efa", 125 | "showTitle": false, 126 | "title": "" 127 | } 128 | }, 129 | "outputs": [ 130 | { 131 | "output_type": "stream", 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n" 136 | ] 137 | } 138 | ], 139 | "source": [ 140 | "df = df_raw.dropna(how = \"any\")\n", 141 | "my_count(df)" 142 | ] 143 | }, 144 | { 145 | "cell_type": "code", 146 | "execution_count": 0, 147 | "metadata": { 148 | "application/vnd.databricks.v1+cell": { 149 | "cellMetadata": { 150 | "byteLimit": 2048000, 151 | "rowLimit": 10000 152 | }, 153 | "inputWidgets": {}, 154 | "nuid": "76ca2c15-48f7-4541-b1df-0e0443b5eb3e", 155 | "showTitle": false, 156 | "title": "" 157 | } 158 | }, 159 | "outputs": [ 160 | { 161 | "output_type": "stream", 162 | "name": "stdout", 163 | "output_type": "stream", 164 | "text": [ 165 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| NewInvoiceDate|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|2010-12-01 08:26:00|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+\nonly showing top 5 rows\n\nOut[6]: '\\n\\nfrom pyspark.sql.functions import *\\ntimeFmt = \"MM/dd/yy HH:mm\"\\ndf = df.withColumn(\"NewInvoiceData2\", from_unixtime(unix_timestamp(to_timestamp(\"InvoiceDate\",\"d.M.yyyy HH:mm\").cast(\"timestamp\"),timeFmt),timeFmt))\\ndf.show()\\n\\n'" 166 | ] 167 | } 168 | ], 169 | "source": [ 170 | "from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col, to_timestamp\n", 171 | "\n", 172 | "df = df.withColumn('NewInvoiceDate', to_timestamp(\"InvoiceDate\",\"d.M.yyyy HH:mm\"))\n", 173 | "\n", 174 | "df.show(5)\n" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 0, 180 | "metadata": { 181 | "application/vnd.databricks.v1+cell": { 182 | "cellMetadata": { 183 | "byteLimit": 2048000, 184 | "rowLimit": 10000 185 | }, 186 | "inputWidgets": {}, 187 | "nuid": "3cb5fa14-a12f-43f5-b02f-4a5852e626eb", 188 | "showTitle": false, 189 | "title": "" 190 | } 191 | }, 192 | "outputs": [ 193 | { 194 | "output_type": "stream", 195 | "name": "stdout", 196 | "output_type": "stream", 197 | "text": [ 198 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| NewInvoiceDate|TotalPrice|Duration|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2.55| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2.75| 17850|United Kingdom|2010-12-01 08:26:00| 22.0| 373|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 22752|SET 7 BABUSHKA NE...| 2|1.12.2010 08:26| 7.65| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 21730|GLASS STAR FROSTE...| 6|1.12.2010 08:26| 4.25| 17850|United Kingdom|2010-12-01 08:26:00| 25.5| 373|\n| 536366| 22633|HAND WARMER UNION...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536366| 22632|HAND WARMER RED P...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536367| 84879|ASSORTED COLOUR B...| 32|1.12.2010 08:34| 1.69| 13047|United Kingdom|2010-12-01 08:34:00| 54.08| 373|\n| 536367| 22745|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22748|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22749|FELTCRAFT PRINCES...| 8|1.12.2010 08:34| 3.75| 13047|United Kingdom|2010-12-01 08:34:00| 30.0| 373|\n| 536367| 22310|IVORY KNITTED MUG...| 6|1.12.2010 08:34| 1.65| 13047|United Kingdom|2010-12-01 08:34:00| 9.9| 373|\n| 536367| 84969|BOX OF 6 ASSORTED...| 6|1.12.2010 08:34| 4.25| 13047|United Kingdom|2010-12-01 08:34:00| 25.5| 373|\n| 536367| 22623|BOX OF VINTAGE JI...| 3|1.12.2010 08:34| 4.95| 13047|United Kingdom|2010-12-01 08:34:00| 14.85| 373|\n| 536367| 22622|BOX OF VINTAGE AL...| 2|1.12.2010 08:34| 9.95| 13047|United Kingdom|2010-12-01 08:34:00| 19.9| 373|\n| 536367| 21754|HOME BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21755|LOVE BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21777|RECIPE BOX WITH M...| 4|1.12.2010 08:34| 7.95| 13047|United Kingdom|2010-12-01 08:34:00| 31.8| 373|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\nonly showing top 20 rows\n\n" 199 | ] 200 | } 201 | ], 202 | "source": [ 203 | "from pyspark.sql.functions import round\n", 204 | "\n", 205 | "from pyspark.sql import functions as F\n", 206 | "\n", 207 | "df = df.withColumn(\"UnitPrice\", F.regexp_replace(\"UnitPrice\", \",\", \".\").cast(\"double\"))\n", 208 | "\n", 209 | "df = df.withColumn('TotalPrice', round(df.Quantity * df.UnitPrice, 2) )\n", 210 | "\n", 211 | "df.show()" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 0, 217 | "metadata": { 218 | "application/vnd.databricks.v1+cell": { 219 | "cellMetadata": { 220 | "byteLimit": 2048000, 221 | "rowLimit": 10000 222 | }, 223 | "inputWidgets": {}, 224 | "nuid": "2d5f96fc-394f-44bd-87bd-4364c67d92c9", 225 | "showTitle": false, 226 | "title": "" 227 | } 228 | }, 229 | "outputs": [ 230 | { 231 | "output_type": "stream", 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "+----------+-------+---------+--------+\n|CustomerID|Recency|Frequency|Monetary|\n+----------+-------+---------+--------+\n| 15194| 3| 22| 7521.17|\n| 17703| 35| 3| 798.74|\n| 13452| 259| 2| 590.0|\n| 13098| 1| 41|28658.88|\n| 17048| 115| 6| 864.32|\n| 13638| 15| 1| 122.64|\n| 15322| 64| 2| 602.97|\n| 13723| 217| 1| 199.85|\n| 16597| 4| 1| 90.04|\n| 15237| 1| 4| 1412.32|\n| 13248| 124| 2| 465.68|\n| 16742| 46| 2| 0.0|\n| 14719| 1| 6| 1592.18|\n| 17043| 32| 4| 1735.18|\n| 14117| 143| 1| 90.0|\n| 15057| 275| 2| 1489.5|\n| 17979| 35| 5| 737.81|\n| 13460| 29| 2| 183.44|\n| 13518| 85| 1| 659.44|\n| 15432| 23| 1| 171.19|\n| 18196| 95| 2| 689.13|\n| 15437| 262| 1| 200.16|\n| 18147| 45| 2| 179.34|\n| 17499| 289| 1| 622.88|\n| 15663| 106| 1| 138.14|\n| 13658| 9| 7| 2421.47|\n| 12936| 17| 5| 1012.9|\n| 14029| 63| 2| 467.66|\n| 15221| 366| 2| 114.1|\n| 17370| 72| 4| 446.18|\n| 12967| 3| 4| 1194.75|\n| 13240| 96| 4| 663.65|\n| 15758| 24| 1| 205.25|\n| 14805| 15| 4| 554.65|\n| 16781| 365| 2| 294.65|\n| 17595| 12| 2| 388.79|\n| 16441| 67| 2| 381.2|\n| 13299| 268| 1| 142.5|\n| 14259| 141| 1| 120.0|\n| 15921| 172| 1| 336.03|\n| 18117| 25| 1| 320.72|\n| 14543| 3| 21| 2916.17|\n| 17757| 1| 31| 5585.49|\n| 14178| 8| 7| 1620.93|\n| 14562| 3| 22| 4709.22|\n| 14215| 11| 7| 1777.92|\n| 15197| 8| 5| 656.44|\n| 13509| 8| 7| 979.72|\n| 15133| 127| 3| 982.42|\n| 14067| 63| 2| 374.7|\n| 17061| 73| 12| 5116.13|\n| 17135| 16| 7| 1139.73|\n| 17647| 65| 1| 133.06|\n| 15894| 253| 2| 263.55|\n| 14064| 29| 7| 1188.32|\n| 13659| 197| 3| 1550.85|\n| 16499| 360| 5| 319.1|\n| 13068| 10| 2| 344.0|\n| 14242| 234| 2| 280.55|\n| 17111| 47| 1| 248.61|\n| 17201| 53| 1| 342.63|\n| 15689| 119| 2| 254.1|\n| 13832| 17| 2| 40.95|\n| 16145| 8| 18| 3741.98|\n| 14779| 280| 3| 386.15|\n| 13527| 33| 8| 2263.76|\n| 13035| 57| 3| 886.63|\n| 17205| 53| 1| 384.08|\n| 17454| 192| 4| 517.53|\n| 14329| 8| 14| 4928.74|\n| 14267| 150| 7| 1279.09|\n| 12422| 95| 3| 803.56|\n| 15569| 103| 5| 1375.71|\n| 14626| 8| 7| 2757.07|\n| 12402| 323| 1| 225.6|\n| 12472| 30| 13| 6229.48|\n| 13764| 70| 3| 1521.76|\n| 15198| 92| 2| 193.64|\n| 15709| 283| 1| 133.25|\n| 16754| 372| 1| 2002.4|\n| 14496| 311| 2| 538.81|\n| 17334| 301| 3| 306.6|\n| 14639| 52| 8| 2952.34|\n| 13656| 164| 2| 379.65|\n| 13447| 23| 4| 1104.23|\n| 15312| 75| 5| 921.1|\n| 16007| 47| 4| 1701.94|\n| 15374| 128| 1| 168.0|\n| 18233| 325| 1| 440.0|\n| 14352| 157| 4| 1078.96|\n| 15799| 75| 3| 884.14|\n| 14438| 306| 1| 131.9|\n| 15274| 4| 2| 716.57|\n| 15992| 3| 1| 41.99|\n| 17022| 31| 1| 71.0|\n| 12873| 282| 1| 374.0|\n| 16828| 93| 2| 128.5|\n| 15652| 91| 1| 337.74|\n| 12390| 79| 1| 549.84|\n| 16276| 176| 1| 810.6|\n+----------+-------+---------+--------+\nonly showing top 100 rows\n\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| NewInvoiceDate|TotalPrice|Duration|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2.55| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2.75| 17850|United Kingdom|2010-12-01 08:26:00| 22.0| 373|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 22752|SET 7 BABUSHKA NE...| 2|1.12.2010 08:26| 7.65| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 21730|GLASS STAR FROSTE...| 6|1.12.2010 08:26| 4.25| 17850|United Kingdom|2010-12-01 08:26:00| 25.5| 373|\n| 536366| 22633|HAND WARMER UNION...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536366| 22632|HAND WARMER RED P...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536367| 84879|ASSORTED COLOUR B...| 32|1.12.2010 08:34| 1.69| 13047|United Kingdom|2010-12-01 08:34:00| 54.08| 373|\n| 536367| 22745|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22748|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22749|FELTCRAFT PRINCES...| 8|1.12.2010 08:34| 3.75| 13047|United Kingdom|2010-12-01 08:34:00| 30.0| 373|\n| 536367| 22310|IVORY KNITTED MUG...| 6|1.12.2010 08:34| 1.65| 13047|United Kingdom|2010-12-01 08:34:00| 9.9| 373|\n| 536367| 84969|BOX OF 6 ASSORTED...| 6|1.12.2010 08:34| 4.25| 13047|United Kingdom|2010-12-01 08:34:00| 25.5| 373|\n| 536367| 22623|BOX OF VINTAGE JI...| 3|1.12.2010 08:34| 4.95| 13047|United Kingdom|2010-12-01 08:34:00| 14.85| 373|\n| 536367| 22622|BOX OF VINTAGE AL...| 2|1.12.2010 08:34| 9.95| 13047|United Kingdom|2010-12-01 08:34:00| 19.9| 373|\n| 536367| 21754|HOME BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21755|LOVE BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21777|RECIPE BOX WITH M...| 4|1.12.2010 08:34| 7.95| 13047|United Kingdom|2010-12-01 08:34:00| 31.8| 373|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\nonly showing top 20 rows\n\n" 236 | ] 237 | } 238 | ], 239 | "source": [ 240 | "spark.sql(\"set spark.sql.legacy.timeParserPolicy=LEGACY\")\n", 241 | "\n", 242 | "from pyspark.sql.functions import mean, min, max, sum, datediff, to_date\n", 243 | "\n", 244 | "date_max = df.select(max('NewInvoiceDate')).toPandas()\n", 245 | "\n", 246 | "current = to_utc_timestamp(unix_timestamp(lit(str(date_max.iloc[0][0])), 'yy-MM-dd HH:mm').cast('timestamp'), 'UTC')\n", 247 | "\n", 248 | "df = df.withColumn('Duration', datediff(lit(current), 'NewInvoiceDate'))\n", 249 | "\n", 250 | "#Recency, Frequency, Monetary\n", 251 | "\n", 252 | "recency = df.groupBy('CustomerID').agg(min('Duration').alias('Recency'))\n", 253 | "\n", 254 | "frequency = df.groupBy('CustomerID', 'InvoiceNo').count()\\\n", 255 | " .groupBy('CustomerID')\\\n", 256 | " .agg(count('*').alias(\"Frequency\"))\n", 257 | "\n", 258 | "monetary = df.groupBy('CustomerID').agg(round(sum('TotalPrice'), 2).alias('Monetary'))\n", 259 | "\n", 260 | "rfm = recency.join(frequency, 'CustomerID', how = 'inner')\\\n", 261 | " .join(monetary, 'CustomerID', how = 'inner')\n", 262 | "\n", 263 | "rfm.show(100)\n", 264 | "df.show()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 0, 270 | "metadata": { 271 | "application/vnd.databricks.v1+cell": { 272 | "cellMetadata": { 273 | "byteLimit": 2048000, 274 | "rowLimit": 10000 275 | }, 276 | "inputWidgets": {}, 277 | "nuid": "23464001-02f0-4989-a185-5d7dccae6ba7", 278 | "showTitle": false, 279 | "title": "" 280 | } 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "import numpy as np\n", 285 | "import pandas as pd\n", 286 | "\n", 287 | "def describe_pd(df_input, columns, deciles = False):\n", 288 | " if deciles:\n", 289 | " percentiles = [25, 50, 75]\n", 290 | "\n", 291 | " pcs = np.transpose([np.percentile(df_input.select(x).collect(),percentiles) for x in columns])\n", 292 | " pcs = pd.DataFrame(pcs, columns = columns)\n", 293 | " pcs['summary'] = [str(p) + \"%\" for p in percentiles]\n", 294 | " mydescribe = df_input.describe().toPandas()\n", 295 | " new_df = pd.concat([mydescribe, pcs], ignore_index = True)\n", 296 | " new_df = new_df.round(2)\n", 297 | " return new_df[['summary'] + columns]" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 0, 303 | "metadata": { 304 | "application/vnd.databricks.v1+cell": { 305 | "cellMetadata": { 306 | "byteLimit": 2048000, 307 | "rowLimit": 10000 308 | }, 309 | "inputWidgets": {}, 310 | "nuid": "0444e164-dab4-4f60-98be-a8f1344dc4fe", 311 | "showTitle": false, 312 | "title": "" 313 | } 314 | }, 315 | "outputs": [ 316 | { 317 | "output_type": "display_data", 318 | "data": { 319 | "text/html": [ 320 | "
\n", 321 | "\n", 334 | "\n", 335 | " \n", 336 | " \n", 337 | " \n", 338 | " \n", 339 | " \n", 340 | " \n", 341 | " \n", 342 | " \n", 343 | " \n", 344 | " \n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | " \n", 392 | " \n", 393 | " \n", 394 | " \n", 395 | " \n", 396 | " \n", 397 | " \n", 398 | " \n", 399 | " \n", 400 | " \n", 401 | " \n", 402 | "
summaryRecencyFrequencyMonetary
0count437343734373
1mean91.560256117082095.92270752343928652229.0756757374743
2stddev100.770130756258356.7988132485727623356.82678007453
3min01-4287.63
4max37337101447682.12
525%16.01.0293.45
650%50.03.0648.41
775%143.05.01612.13
\n", 403 | "
" 404 | ] 405 | }, 406 | "metadata": { 407 | "application/vnd.databricks.v1+output": { 408 | "addedWidgets": {}, 409 | "arguments": {}, 410 | "data": "
\n\n\n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n \n
summaryRecencyFrequencyMonetary
0count437343734373
1mean91.560256117082095.92270752343928652229.0756757374743
2stddev100.770130756258356.7988132485727623356.82678007453
3min01-4287.63
4max37337101447682.12
525%16.01.0293.45
650%50.03.0648.41
775%143.05.01612.13
\n
", 411 | "datasetInfos": [], 412 | "metadata": {}, 413 | "removedWidgets": [], 414 | "textData": null, 415 | "type": "htmlSandbox" 416 | } 417 | }, 418 | "output_type": "display_data" 419 | } 420 | ], 421 | "source": [ 422 | "cols = ['Recency', 'Frequency', 'Monetary']\n", 423 | "describe_pd(rfm, cols, 1)" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 0, 429 | "metadata": { 430 | "application/vnd.databricks.v1+cell": { 431 | "cellMetadata": { 432 | "byteLimit": 2048000, 433 | "rowLimit": 10000 434 | }, 435 | "inputWidgets": {}, 436 | "nuid": "0e8498c0-a7a3-47d5-8d87-2fba3a40458c", 437 | "showTitle": false, 438 | "title": "" 439 | } 440 | }, 441 | "outputs": [], 442 | "source": [ 443 | "def RScore(x):\n", 444 | " if x <= 16:\n", 445 | " return 1\n", 446 | " elif x<= 50:\n", 447 | " return 2\n", 448 | " elif x <= 143:\n", 449 | " return 3\n", 450 | " else:\n", 451 | " return 4\n", 452 | "\n", 453 | "def FScore(x):\n", 454 | " if x <= 1:\n", 455 | " return 4\n", 456 | " elif x <= 3:\n", 457 | " return 3\n", 458 | " elif x <= 5:\n", 459 | " return 2\n", 460 | " else:\n", 461 | " return 1\n", 462 | " \n", 463 | "def MScore(x):\n", 464 | " if x <= 293:\n", 465 | " return 4\n", 466 | " elif x <= 648:\n", 467 | " return 3\n", 468 | " elif x <= 1612:\n", 469 | " return 2\n", 470 | " else:\n", 471 | " return 1\n", 472 | " \n", 473 | "from pyspark.sql.functions import udf\n", 474 | "from pyspark.sql.types import StringType, DoubleType\n", 475 | "\n", 476 | "R_udf = udf(lambda x : RScore(x), StringType())\n", 477 | "F_udf = udf(lambda x : FScore(x), StringType())\n", 478 | "M_udf = udf(lambda x : MScore(x), StringType())" 479 | ] 480 | }, 481 | { 482 | "cell_type": "code", 483 | "execution_count": 0, 484 | "metadata": { 485 | "application/vnd.databricks.v1+cell": { 486 | "cellMetadata": { 487 | "byteLimit": 2048000, 488 | "rowLimit": 10000 489 | }, 490 | "inputWidgets": {}, 491 | "nuid": "d0df6142-7497-402b-a259-9b1cb5a389a4", 492 | "showTitle": false, 493 | "title": "" 494 | } 495 | }, 496 | "outputs": [ 497 | { 498 | "output_type": "stream", 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "+----------+-------+---------+----------+-----+-----+-----+--------+\n|CustomerID|Recency|Frequency| Monetary|r_seg|f_seg|m_seg|RFMScore|\n+----------+-------+---------+----------+-----+-----+-----+--------+\n| 0| 0| 3710|1447682.12| 1| 1| 1| 111|\n| 13004| 11| 22| 5613.43| 1| 1| 1| 111|\n| 17602| 2| 8| 5050.77| 1| 1| 1| 111|\n| 13098| 1| 41| 28658.88| 1| 1| 1| 111|\n| 13924| 1| 11| 1682.08| 1| 1| 1| 111|\n| 13658| 9| 7| 2421.47| 1| 1| 1| 111|\n| 15061| 3| 55| 54228.74| 1| 1| 1| 111|\n| 15838| 11| 21| 33350.76| 1| 1| 1| 111|\n| 15194| 3| 22| 7521.17| 1| 1| 1| 111|\n| 14415| 1| 18| 5811.56| 1| 1| 1| 111|\n| 13798| 1| 63| 36351.42| 1| 1| 1| 111|\n| 15993| 8| 10| 2756.82| 1| 1| 1| 111|\n| 14178| 8| 7| 1620.93| 1| 1| 1| 111|\n| 17949| 1| 52| 52750.84| 1| 1| 1| 111|\n| 14329| 8| 14| 4928.74| 1| 1| 1| 111|\n| 14825| 3| 12| 2226.91| 1| 1| 1| 111|\n| 14215| 11| 7| 1777.92| 1| 1| 1| 111|\n| 12683| 4| 20| 8221.09| 1| 1| 1| 111|\n| 14543| 3| 21| 2916.17| 1| 1| 1| 111|\n| 13230| 4| 15| 2763.41| 1| 1| 1| 111|\n+----------+-------+---------+----------+-----+-----+-----+--------+\nonly showing top 20 rows\n\n" 503 | ] 504 | } 505 | ], 506 | "source": [ 507 | "rfm_seg = rfm.withColumn(\"r_seg\", R_udf(\"Recency\"))\n", 508 | "rfm_seg = rfm_seg.withColumn(\"f_seg\", F_udf(\"Frequency\"))\n", 509 | "rfm_seg = rfm_seg.withColumn(\"m_seg\", M_udf(\"Monetary\"))\n", 510 | "rfm_seg = rfm_seg.withColumn(\"RFMScore\", F.concat(F.col('r_seg'), F.col('f_seg'), F.col('m_seg')))\n", 511 | "\n", 512 | "rfm_seg.sort(F.col('RFMScore')).show(20)" 513 | ] 514 | }, 515 | { 516 | "cell_type": "code", 517 | "execution_count": 0, 518 | "metadata": { 519 | "application/vnd.databricks.v1+cell": { 520 | "cellMetadata": { 521 | "byteLimit": 2048000, 522 | "rowLimit": 10000 523 | }, 524 | "inputWidgets": {}, 525 | "nuid": "69d49f7d-a504-4b3b-b0fd-b44de363d046", 526 | "showTitle": false, 527 | "title": "" 528 | } 529 | }, 530 | "outputs": [ 531 | { 532 | "output_type": "stream", 533 | "name": "stdout", 534 | "output_type": "stream", 535 | "text": [ 536 | "+--------+-----------------+------------------+------------------+\n|RFMScore| avg(Recency)| avg(Monetary)| avg(Frequency)|\n+--------+-----------------+------------------+------------------+\n| 111|6.022680412371134|11795.596288659783|26.492783505154637|\n| 112|7.237113402061856|1223.3604123711343| 7.752577319587629|\n| 113| 8.0|505.97749999999996| 7.5|\n| 114| 11.0| 191.17| 8.0|\n| 121|6.472727272727273|2569.0619999999994| 4.636363636363637|\n+--------+-----------------+------------------+------------------+\nonly showing top 5 rows\n\n" 537 | ] 538 | } 539 | ], 540 | "source": [ 541 | "rfm_seg.groupBy('RFMScore').agg({'Recency': 'mean', 'Frequency' : 'mean', 'Monetary' : 'mean'}).sort(F.col('RFMScore')).show(5)" 542 | ] 543 | }, 544 | { 545 | "cell_type": "code", 546 | "execution_count": 0, 547 | "metadata": { 548 | "application/vnd.databricks.v1+cell": { 549 | "cellMetadata": { 550 | "byteLimit": 2048000, 551 | "rowLimit": 10000 552 | }, 553 | "inputWidgets": {}, 554 | "nuid": "34da44d3-d725-48d5-87dc-2989e0302c94", 555 | "showTitle": false, 556 | "title": "" 557 | } 558 | }, 559 | "outputs": [ 560 | { 561 | "output_type": "display_data", 562 | "data": { 563 | "text/plain": [ 564 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m\n", 565 | "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)\n", 566 | "File \u001B[0;32m:5\u001B[0m\n", 567 | "\u001B[1;32m 3\u001B[0m grp \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRFMScore\u001B[39m\u001B[38;5;124m'\u001B[39m\n", 568 | "\u001B[1;32m 4\u001B[0m num_cols \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRecency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mFrequency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mMonetary\u001B[39m\u001B[38;5;124m'\u001B[39m]\n", 569 | "\u001B[0;32m----> 5\u001B[0m rfm_seg\u001B[38;5;241m.\u001B[39mtoPandas()\u001B[38;5;241m.\u001B[39mto_csv(output_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mquantile_grouped.csv\u001B[39m\u001B[38;5;124m'\u001B[39m)\n", 570 | "\n", 571 | "\u001B[0;31mNameError\u001B[0m: name 'output_dir' is not defined" 572 | ] 573 | }, 574 | "metadata": { 575 | "application/vnd.databricks.v1+output": { 576 | "arguments": {}, 577 | "data": "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m\n\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)\nFile \u001B[0;32m:5\u001B[0m\n\u001B[1;32m 3\u001B[0m grp \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRFMScore\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[1;32m 4\u001B[0m num_cols \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRecency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mFrequency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mMonetary\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[0;32m----> 5\u001B[0m rfm_seg\u001B[38;5;241m.\u001B[39mtoPandas()\u001B[38;5;241m.\u001B[39mto_csv(output_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mquantile_grouped.csv\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\n\u001B[0;31mNameError\u001B[0m: name 'output_dir' is not defined", 578 | "errorSummary": "NameError: name 'output_dir' is not defined", 579 | "errorTraceType": "ansi", 580 | "metadata": {}, 581 | "type": "ipynbError" 582 | } 583 | }, 584 | "output_type": "display_data" 585 | } 586 | ], 587 | "source": [ 588 | "#Detailed summary\n", 589 | "\n", 590 | "grp = 'RFMScore'\n", 591 | "num_cols = ['Recency', 'Frequency', 'Monetary']\n", 592 | "df_myinput = rfm_seg\n", 593 | "\n", 594 | "quantile_grouped = quantile_agg(df_myinput, grp, num_cols)\n", 595 | "quantile_grouped.toPandas().to_csv(output_dir + 'quantile_grouped.csv')" 596 | ] 597 | }, 598 | { 599 | "cell_type": "code", 600 | "execution_count": 0, 601 | "metadata": { 602 | "application/vnd.databricks.v1+cell": { 603 | "cellMetadata": {}, 604 | "inputWidgets": {}, 605 | "nuid": "1151a35d-85ff-4fc4-80dc-e9bd051df73d", 606 | "showTitle": false, 607 | "title": "" 608 | } 609 | }, 610 | "outputs": [], 611 | "source": [] 612 | } 613 | ], 614 | "metadata": { 615 | "application/vnd.databricks.v1+notebook": { 616 | "dashboards": [], 617 | "language": "python", 618 | "notebookMetadata": { 619 | "pythonIndentUnit": 4 620 | }, 621 | "notebookName": "11072023 DEB v2", 622 | "widgets": {} 623 | } 624 | }, 625 | "nbformat": 4, 626 | "nbformat_minor": 0 627 | } 628 | -------------------------------------------------------------------------------- /25042023 - DEB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 2, 6 | "id": "311f3e23", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "import numpy as np" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 3, 16 | "id": "a4aeeb7b", 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "p7 = np.arange(10, 200, 11)" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 4, 26 | "id": "373f96e4", 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "np.savetxt('mydata/1904DEB.csv', p7, delimiter = ',')" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 5, 36 | "id": "2a3ae98b", 37 | "metadata": {}, 38 | "outputs": [], 39 | "source": [ 40 | "p8 = np.arange(0, 121).reshape(11, 11)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 6, 46 | "id": "e0c4a649", 47 | "metadata": {}, 48 | "outputs": [ 49 | { 50 | "data": { 51 | "text/plain": [ 52 | "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", 53 | " [ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],\n", 54 | " [ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],\n", 55 | " [ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],\n", 56 | " [ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],\n", 57 | " [ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65],\n", 58 | " [ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76],\n", 59 | " [ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],\n", 60 | " [ 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],\n", 61 | " [ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109],\n", 62 | " [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]])" 63 | ] 64 | }, 65 | "execution_count": 6, 66 | "metadata": {}, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "p8" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 7, 77 | "id": "82abc61a", 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "np.save('test2504.npy', p8)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 8, 87 | "id": "50fbf18a", 88 | "metadata": {}, 89 | "outputs": [ 90 | { 91 | "data": { 92 | "text/plain": [ 93 | "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", 94 | " [ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],\n", 95 | " [ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],\n", 96 | " [ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],\n", 97 | " [ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],\n", 98 | " [ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65],\n", 99 | " [ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76],\n", 100 | " [ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],\n", 101 | " [ 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],\n", 102 | " [ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109],\n", 103 | " [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]])" 104 | ] 105 | }, 106 | "execution_count": 8, 107 | "metadata": {}, 108 | "output_type": "execute_result" 109 | } 110 | ], 111 | "source": [ 112 | "p9 = np.load('test2504.npy')\n", 113 | "p9" 114 | ] 115 | }, 116 | { 117 | "cell_type": "code", 118 | "execution_count": 9, 119 | "id": "c529193c", 120 | "metadata": {}, 121 | "outputs": [], 122 | "source": [ 123 | "np.save('numpyfile2504', p8)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 10, 129 | "id": "2edeafee", 130 | "metadata": {}, 131 | "outputs": [ 132 | { 133 | "data": { 134 | "text/plain": [ 135 | "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", 136 | " [ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],\n", 137 | " [ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],\n", 138 | " [ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],\n", 139 | " [ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],\n", 140 | " [ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65],\n", 141 | " [ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76],\n", 142 | " [ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],\n", 143 | " [ 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],\n", 144 | " [ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109],\n", 145 | " [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]])" 146 | ] 147 | }, 148 | "execution_count": 10, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "p10 = np.load('numpyfile2504.npy')\n", 155 | "p10" 156 | ] 157 | }, 158 | { 159 | "cell_type": "code", 160 | "execution_count": 11, 161 | "id": "c12ffdee", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "data": { 166 | "text/plain": [ 167 | "array([[ 0, 1, 2, ..., 997, 998, 999],\n", 168 | " [ 1000, 1001, 1002, ..., 1997, 1998, 1999],\n", 169 | " [ 2000, 2001, 2002, ..., 2997, 2998, 2999],\n", 170 | " ...,\n", 171 | " [997000, 997001, 997002, ..., 997997, 997998, 997999],\n", 172 | " [998000, 998001, 998002, ..., 998997, 998998, 998999],\n", 173 | " [999000, 999001, 999002, ..., 999997, 999998, 999999]])" 174 | ] 175 | }, 176 | "execution_count": 11, 177 | "metadata": {}, 178 | "output_type": "execute_result" 179 | } 180 | ], 181 | "source": [ 182 | "p11 = np.arange(0, 1000000).reshape(1000, 1000)\n", 183 | "p11" 184 | ] 185 | }, 186 | { 187 | "cell_type": "code", 188 | "execution_count": 12, 189 | "id": "ec42190b", 190 | "metadata": {}, 191 | "outputs": [], 192 | "source": [ 193 | "np.savez_compressed('mytest2504.npz', p11)" 194 | ] 195 | }, 196 | { 197 | "cell_type": "code", 198 | "execution_count": 13, 199 | "id": "50a1bbb8", 200 | "metadata": {}, 201 | "outputs": [], 202 | "source": [ 203 | "np.save('mytest2504-1.npy', p11)" 204 | ] 205 | }, 206 | { 207 | "cell_type": "code", 208 | "execution_count": 14, 209 | "id": "da4b8726", 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "from IPython.display import Image" 214 | ] 215 | }, 216 | { 217 | "cell_type": "code", 218 | "execution_count": 15, 219 | "id": "7fbb404b", 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "image/png": "iVBORw0KGgoAAAANSUhEUgAABJgAAABLCAIAAACz06p1AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAABIjSURBVHhe7d1Nax1XmsBxfZ18G6GQrTaCTm/shdNtBFp45FYQyAxaNbORYkbCITLZWExrko4Dg9MoCU4nluKEQXScGUVx1A4MZsAwNALPea3zPOfUqXuvpWuprv4/Lm7dejlVp64W9aeu0lM//PDDPwAAAAAAfWAK7uXLl4QcAAAAAPQGIQcAAAAAPUPIAQAAAEDPEHIAAAAA0DP9DrnnP+7/9eFfq6//fPri/8KWAAAAADAxeh1y+7fnZueuXLv2u/J1ZW7WmvvDv/03LQcAAABgsvQ85GZnbz8KbzS36i/7t6/N0XIAAAAAJsxEh5xZ9ZyWAwAAADBpxhhyR0dHa2tr8/Pz1+tWV1cfPWpPsSEMEXJGaLmPn9JyAAAAACbCGENuZWVlc3Pz4ODADF6zs7Njcu7Jkydhn9EMCLk//uX58/9xr6OPb83O3d4P6wAAAACg10xMjSXk9vb2FhYWTk5OzOjd1tbW7t27F3YbTUfIPf34D/4/d5Is/vvTsHIMHq5vbx2Fn1/F0e5v1r8NP5uz/9P2G1fvhpdYbqhV9nX/YVhj/Ly1FJcv7bbP1hzo6t3lr8O7LnZLOTgAAMAk+nFjempq8bPwTnuwWFvVtdeFUp8CTu8zc3WnN34M7zR75c21fxDeSocbb05Nvblx6N+536XM9GZY2WFcIffll18uLy/7VOu2tbX1wQcfhN1G0xFymacf/9M4Q+7r+29cPU3IuQCLweZSrRnt22VTZaLlHq7naRfJQdSAQii9IULOHZeQAwAAE+/Vkqw3IXe2XIHcMG1iK2WY2JhoPtW6Qm76zekUbNHhpgs3HXLqd8ktGXh5CbmzcJYhZ3/+zZ9+dj87avBibSM7B/s8LT+l5mnegJCzQ/kne4QcAACYdIQcXsmDGzbHBoTcjcXpfAOzfHHR7NsRcn5wG8xdCDmXN0u7D9NXFm3/pG8wxu8ouiXF9xjXv01bXk2JZZ+bhYW6plIjNSP7Z1/+1RZOKsnMxnmeeX4WYoZF8vlxvh701Up3hmbHYr6a3ez+QzGdZkx/JuJ6NoPYmdYbFQAA4Dz42+hN+68nbqntvbh/a5+i3Niwz6MMc4etb779PX186x5bNfTteHga49QeucRCsPQ2tZH9edp/g3xVSy24M2m++Cf2LZ4gSZXztyc2vfkgnZ4epDprd1bBoG65WOyXKhcfDPpqpb8maspmlxsP7EdMyJ1NyJneCA/EYlbJt/7n7K/L5FsdJK7iYr241GnbLH8KF3/OqaAS4eReKbSKr1xmY8a32Szqhgi5VLlyyuF6tq2yP4vazN4CAACcg9AS8Xbc3po3d9WxgmKKpDtycfPtuqu5m2++fOi4zZq93JaqndQtvlO/xe8Y2WdY9xT07uqt2z2u0iepqJmqE3CjNQcdctblCJ0NeZHEWQwOuUP7myPmZa6GuUr1T9mxS9o/AomQ8+FRyTAVGyqNir6KIxSl1IxQT6OOkBMlGRupGVyeqv1ZjaDHtGfotjzLkBMP08Sw2fVUD+LUXsUDOgAAgNdPV4chbrKbCsqeX6Wbb7dc3Mrb5fLOPu8ldb/eIsstYdDIYgrtB1VTiOdv5FPLR4vELp7YMT/t1muYUTFjFONfWHbi/syHCDn9wZmF9oqVIVeoDZsQcnmx2LeVp0ZiyzKTQp+09I+OKFliUS3k3PKOmhIt1BlydrNw0LMMObE2Czn9nE2cWy3qAAAAzom6z3bs3bnvE11BRXXY/5RFrT3k3bkvnK6b/sTVkSHLSitHzmOpUo9uxzSdSoMZNjP0Eqs8/zSgHaQ9hquzjp2T5INcUHbWcUbDhJyYqb3s7sKWIad/i+wu6vetDSGXF0vWITLkUhFlOZSHnK01/WrGtyPEhU3GtIbcwIoz0o4dIadWnXPIpbX6wgIAAJwTeV/uDR1y5v582lSQii7XRZ69ZReZlIYdxG4ZpYPWRxbn6VRXxUKTG4hhpbBWKM9/2JBrnbVrlcKFDzk9Uzu7gSGnLrv/LAaFXPfIASGXF0vWIVlv+CbJIycPue768nzR+S3LkBum4oy0Y3bafpV79iXTUbzsXmpVVncDJjJCyOnZ2S3NtbKH5nuVAADg/BUhJ55WDQg5t8oFSZM9+f23SKYhbs1zdpfa7mJkXWtdq+wgiw9UOciNO5Xnny6dHaQecq2zTp3TJ+7SldomIiaYLntoWkLuHELOB8yyeMRkiZBTPztF2EQhacxPWchVK654imVzKARYdtw0uCaKq9upQk4dWpyk5Qpz/X776QEAALxmxW20uMlOFVQPOR9+4Z67/cGdz6TW+/VBmgG7Rh4+5PzbG/afJt1UVFh5lQXF+buJ+3HqIVeddRmQ2an2QVdu2emEa+Iugv0ibpzvcCHX+iQzGVfI7e3tLSwsnJyc+FrrsLa2du/evbDbaM4j5OJTLNVCOqLsLs2AeeGkA4mRVcip3TNutPggy+2Vzk2+VQMqrynk0okVFzDfAAAA4Dz5Impux+0NdHNXndKitaPiZq5J/Fq1u1tuxB3t7Xu6Oy9LxsgWihjoGjlLIDlIXkfuHFoKqlki8izn9o0XysdJiDd7xPaQC3u1zVqNoHbpjSFDLn5ezQehJtsScnLfqnGFnLGysrK5uXlwcGAGr9nZ2bl+/fqTJ0/CPqOxITd35dq13w1+/XZudvHPv4b9tKxY7FvRGGWHtIWWCycTJyrGwktWU2iY8BKD2C4yS7a39mwRiW3CKw3ikiksz1sonkbLqugUIaeuzICQu7/VzLQ8E7cx36sEAAAXgr2Nnt74zN5MO/K+PFVQZ8iFyvJ33i6EArNEZ4xaq2uqESPNE9vUR85qrSvkdBA27GZRe8V58hyyI8rwyKqsOmt3GYPeVZwxbMj5K5AubBlyGXkxa0xMjSvkjo6O1tbW5ufnTarVrK6uPno01DO1Vk93t27/6+2hXh98+rfnYa/TqT/sglMWYE4/wAQAALhczI37+UVLVhTorzGG3GSyT5OIkC4DQ84+rqSEAQDAZWYfwpxLTeWPztBfhNzw2v46DoWOkHOr+Os4AABwiTXfo2v/XuUYua9insNxMSaEHAAAAAD0DCEHAAAAAD1DyAEAAABAzxByAAAAANAzhBwAAAAA9AwhBwAAAAA9Q8gBAAAAQM8QcgAAAADQM4QcAAAAAPQMIQcAAAAAPUPIAQAAAEDPpJADAAAAAPRFCDnzPwAAAACAi4+QAwAAAICeIeQAAAAAoGcIOQAAAADoGUIOAAAAAHqm3yH34qfvHn39qPr627N/nIQtAQAAAGBi9DrkvtuYm5278s47vy9fV+ZmrbmlnZ9pOQAAAACTpechNzu78W14o7lVn3+3cW2OlgMAAAAwYSY65MyqF7QcAAAAgEkzxpA7Pj5eX1+fn5+/Xre6urq/vx92GNkQIWeElvvkGS0HAAAAYCKMMeRu3bq1ubl5cHBgBq/Z2dkxOXd4eBj2Gc2AkPuXz1+8+F/3Ov7kn2fnNr4P6wAAAACg10xMmX/PPuQeP368sLBwcjL4Kdja2tr29nZ4M5qOkHv2yZL/z50kN//8LKwEAAAAgD4bV8h99dVXy8vL4U2nLSe8GU1HyGWefbI43pD75r3tD4/Dz6/i+Iu330tPDH/9aPuNq3fDSyw31Cr7+vSbsMb4+4fvxuXvfvFrWKiZA129u7IX3gEAAODs7C5NTc28/0t4p/xy563aqk4/3ZmZmlr6PLyr+eV9u9VueCfZU/K7796cmnrrzuhnMKE+XxpwNcwGUf7BuQ8luNl21S175Ssfiv1lSEeXo0XD/KoQcmdh79M3rp4m5FyAxWBzqdaM9v2KqTLRct+8l6ddJAdRAwqh9Ag5AACAMegIuVd12pBLziLkXIHYdBnDTF8n304dV0NedvezmKycu6u19pZzm701Ux7FfVh5yKmPOD9iO0LuLJxlyNmf3/7o7+5nRw1erG1k52CfvOWn1DzNI+QAAADGYOJDbhKEjpIpVciuld0lvs0vtX1wV30WOnNzaWZq5s5PYZFjli8tyfHbPmJ7AtVnfQEh5/Lm3S++SV9ZtP2TvsEYv6PolhTfY3zv+7Tl1ZRY9rlZWKhryuZWXBVGds/cwkuOH6kkMxvneeb5WYivUxbJ58fZ6/hqpd/lezsvdYaGPcmVvXSqaWQ5o/giFAEAwKXkQ+6O+dcTUSe+Wum+1HfH3KkbrS3h7uy9pff1Xb5YZTTLdV3Y04hv7c9+MxUnfpzsbdTRjamCWma3a5/XeXpelb30cQd1y5nwZ2ImOFLWyo1HCzl3TdSUzfY3d8vPIrvmhFxjUMiZ/AgPxGKryLf+5+yvy+Rb/TTMVVxMMtc5bZuFDix+zqmAzKsphV/xlctszPg2m4XitmnWui1jsIXL0rZKciPU/jwPAABgwvmCijfl7gY93sTrkDNqt+lqr5YBmzt+VxThaY+oC7dLSpS2kPP51JxAfdiMHaFZVcwuO8+mXtxeKjLDqnKEocvq9FRKddPXR03BX+32jzJsZq+nOJA5rhlKHT0f3y9pjUOFkPOlVMkw/zaUiUqjoq/iCEUpNSOoXZQsuiRRkrE5m8HlqQ4IOXuGbsuBIScGEXNXp2G0zqU+QQAAgMsgv60XfZWFXHspGXlguOrzd/lZEsgAiAcqc8iekt8mjtx2kpVhlWJ5Njs5pphFOoFMPtPacccjP3o7d62MfMsQrkYsupLd166182o+brPQXjF1dDfxQvU3pEHI5flh34pnSiJm5JZlJoWQa4kZHVGyxKJayLnlXWlk+8o/GesMObuZfJjWEXLyOVsWcmovO06qX8s9LayMDAAAcBkU0ZJu4rOQqz1vEZt5bXnjCirwq9ySmRlbF9nI6ZRcPMzYPSsBUw6rlP2Zzi0/7RQq1WqNnZMUcx+n4UIuEMmq0tqfc2WcZoJppnYcl7tlyOkLbncZeHqE3Aghl3omy6E85Gyt6Vczvnu0FV5NCLWG3MCKM9KOHSGnVo0r5OwGbV+2BAAAuDxSNQUjh1yZN2JMd8fv2SUiAEKD+U5T3/RLu9t4iNuoQ9SHVcrTTlt2h1zrZF2rFPTcx2ikkPNn687NlZu8wrXLJT5Keyy7i903fRZdIdcRwAkhN1LIhSjKdhnwRK6dLzq/ZRlyw1SckXbMTltUmUxH8VKH80YJOTHlcBrq6AAAAJeQiC4v3Y4PGXJ5Ecm7/Lw9xKr0yMgOLs9Bh5zb3W2cIqFjWKVMC7tlMTsnjVkNktQ55yKf9QAq5PRp1yYilvtP3F6u8Lmro7de8Op1S8YVco8fP15YWDg5OQnv69bW1ra3t8Ob0ZxDyPnvSa64nAtLDFk1qnCsIrGi9FAr9Zjj3rZVXH4ysq+y45ZfffRO8UROTllOKruAAAAAl1V+Ty++kjdkyBWBYTf2d/l5QvincD4AsgOJEewpZSHnF8bHSl3DKkVvtM/OSceqZaE/z8rDw9egM+SKc0uzGP60xS+D293+f8rFHYcLueoviTeukDNu3bq1ubl5cHBgBq/Z2dm5fv364eFh2Gc0NuTmrrzzzu8Hv347N3vz0+dhP220kIsPuOqPp9wuzYAinLIDiZFVyKndM2602FrZczD5Vg2onCbkmh3lIHbutQEBAAAuFRdIzU25v30PeSNSp/seXe2lBrR3/82ObjPDbymaqmWEtHsTD6kPu4bNuC3jY6La7Bx5LDW+DCE1gj698RtwOHF9wjm3Xbp8lSJCzm+W9tJHd9ehWeXIfatMTJl/xxJyx8fH6+vr8/PzJtVqVldX9/f3ww6je/blhxubG0O97v7Hf70Ie2VGDLnW0HLhJJ5ZuW3CS0aOO1azSgzicsjW4He2kcQ24ZUGcREVlusTS6fRsirKQ05+Z3JAyL390Rc+58xL7hKOKF5yEAAAgEvDV1OoL0PcnYvUGfiwJdaUof9/5EIPOCao3IFcFKmQ850QisufUlyYksMP5XepDlvyz+u89tk5WSbJvdTIYqaVHBqXMuTsEnluLtiC7GoMddr2MjbXpOUDanaUo0UDK84YY8hNKJcurQ+7JpkPOfIMAAAAuBAIuRHZJ1ptf3s24Qg5AAAA4AIh5Ian/07sciHkAAAAgAuEkAMAAACAniHkAAAAAKBnCDkAAAAA6BlCDgAAAAB6hpADAAAAgJ4h5AAAAACgZwg5AAAAAOgZQg4AAAAAeoaQAwAAAICeIeQAAAAAoGdswb18+f8sLLlPuNPebAAAAABJRU5ErkJggg==\n", 225 | "text/plain": [ 226 | "" 227 | ] 228 | }, 229 | "execution_count": 15, 230 | "metadata": {}, 231 | "output_type": "execute_result" 232 | } 233 | ], 234 | "source": [ 235 | "Image(filename = 'karsilastirma.PNG')" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 16, 241 | "id": "ef8ffab5", 242 | "metadata": {}, 243 | "outputs": [], 244 | "source": [ 245 | "#printing options" 246 | ] 247 | }, 248 | { 249 | "cell_type": "code", 250 | "execution_count": 17, 251 | "id": "d97193dd", 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "array([12.6544, 90.7864])" 258 | ] 259 | }, 260 | "execution_count": 17, 261 | "metadata": {}, 262 | "output_type": "execute_result" 263 | } 264 | ], 265 | "source": [ 266 | "np.set_printoptions(precision = 4)\n", 267 | "a = np.array([12.654398765, 90.7864098354674])\n", 268 | "a" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 18, 274 | "id": "299df070", 275 | "metadata": {}, 276 | "outputs": [ 277 | { 278 | "data": { 279 | "text/plain": [ 280 | "array([12.65, 90.79])" 281 | ] 282 | }, 283 | "execution_count": 18, 284 | "metadata": {}, 285 | "output_type": "execute_result" 286 | } 287 | ], 288 | "source": [ 289 | "np.set_printoptions(precision = 2)\n", 290 | "a = np.array([12.654398765, 90.7864098354674])\n", 291 | "a" 292 | ] 293 | }, 294 | { 295 | "cell_type": "code", 296 | "execution_count": 19, 297 | "id": "ccec6685", 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "data": { 302 | "text/plain": [ 303 | "array([13., 91.])" 304 | ] 305 | }, 306 | "execution_count": 19, 307 | "metadata": {}, 308 | "output_type": "execute_result" 309 | } 310 | ], 311 | "source": [ 312 | "np.set_printoptions(precision = 0)\n", 313 | "a = np.array([12.654398765, 90.7864098354674])\n", 314 | "a" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 22, 320 | "id": "92078b13", 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "data": { 325 | "text/plain": [ 326 | "array([ 0, 1, 2, ..., 197, 198, 199])" 327 | ] 328 | }, 329 | "execution_count": 22, 330 | "metadata": {}, 331 | "output_type": "execute_result" 332 | } 333 | ], 334 | "source": [ 335 | "np.set_printoptions(threshold = 10)\n", 336 | "np.arange(200)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 23, 342 | "id": "709ecd91", 343 | "metadata": {}, 344 | "outputs": [ 345 | { 346 | "data": { 347 | "text/plain": [ 348 | "array([12.65439876, 90.78640984])" 349 | ] 350 | }, 351 | "execution_count": 23, 352 | "metadata": {}, 353 | "output_type": "execute_result" 354 | } 355 | ], 356 | "source": [ 357 | "np.set_printoptions(precision = 8, suppress = False, threshold = 1000, formatter = None)\n", 358 | "a = np.array([12.654398765, 90.7864098354674])\n", 359 | "a" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 24, 365 | "id": "641b0b8b", 366 | "metadata": {}, 367 | "outputs": [ 368 | { 369 | "name": "stdout", 370 | "output_type": "stream", 371 | "text": [ 372 | "[20 42 72]\n" 373 | ] 374 | } 375 | ], 376 | "source": [ 377 | "a1 = [5, 6, 8]\n", 378 | "a2 = [4, 7, 9]\n", 379 | "print(np.multiply(a1, a2))" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 27, 385 | "id": "7a5cd367", 386 | "metadata": {}, 387 | "outputs": [ 388 | { 389 | "name": "stdout", 390 | "output_type": "stream", 391 | "text": [ 392 | "Dot product - 134\n", 393 | "Dot product using np.dot - 134\n", 394 | "Dot product using np.inner - 134\n", 395 | "Dot product using np.multiply & sum - 134\n", 396 | "Dot product using np.matmul - 134\n", 397 | "Dot product using for loop - 134\n" 398 | ] 399 | } 400 | ], 401 | "source": [ 402 | "a1 = np.array([5, 6, 8])\n", 403 | "a2 = np.array([4, 7, 9])\n", 404 | "\n", 405 | "#Dot product\n", 406 | "dotp = a1@a2\n", 407 | "print(\"Dot product - \", dotp)\n", 408 | "\n", 409 | "dotp = np.dot(a1, a2)\n", 410 | "print(\"Dot product using np.dot - \", dotp)\n", 411 | "\n", 412 | "dotp = np.inner(a1, a2)\n", 413 | "print(\"Dot product using np.inner - \", dotp)\n", 414 | "\n", 415 | "dotp = sum(np.multiply(a1, a2))\n", 416 | "print(\"Dot product using np.multiply & sum - \", dotp)\n", 417 | "\n", 418 | "dotp = np.matmul(a1, a2)\n", 419 | "print(\"Dot product using np.matmul - \", dotp)\n", 420 | "\n", 421 | "dotp = 0\n", 422 | "for i in range(len(a1)):\n", 423 | " dotp = dotp + a1[i]*a2[i]\n", 424 | "print(\"Dot product using for loop - \", dotp)" 425 | ] 426 | }, 427 | { 428 | "cell_type": "code", 429 | "execution_count": 28, 430 | "id": "5b0752b7", 431 | "metadata": {}, 432 | "outputs": [ 433 | { 434 | "data": { 435 | "text/plain": [ 436 | "9.539392014169456" 437 | ] 438 | }, 439 | "execution_count": 28, 440 | "metadata": {}, 441 | "output_type": "execute_result" 442 | } 443 | ], 444 | "source": [ 445 | "v3 = np.array([1,2,3,4,5,6])\n", 446 | "length = np.sqrt(np.dot(v3, v3))\n", 447 | "length" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 29, 453 | "id": "e484f99f", 454 | "metadata": {}, 455 | "outputs": [ 456 | { 457 | "data": { 458 | "text/plain": [ 459 | "array([0.5547002 , 0.83205029])" 460 | ] 461 | }, 462 | "execution_count": 29, 463 | "metadata": {}, 464 | "output_type": "execute_result" 465 | } 466 | ], 467 | "source": [ 468 | "v1 = [2,3]\n", 469 | "norm_v1 = v1 / np.linalg.norm(v1)\n", 470 | "norm_v1" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 30, 476 | "id": "97b57e42", 477 | "metadata": {}, 478 | "outputs": [ 479 | { 480 | "data": { 481 | "text/plain": [ 482 | "(3.605551275463989, array([0.5547002 , 0.83205029]))" 483 | ] 484 | }, 485 | "execution_count": 30, 486 | "metadata": {}, 487 | "output_type": "execute_result" 488 | } 489 | ], 490 | "source": [ 491 | "v1 = [2,3]\n", 492 | "length_v1 = np.sqrt(np.dot(v1,v1))\n", 493 | "norm_v1 = v1 / length_v1\n", 494 | "length_v1, norm_v1" 495 | ] 496 | }, 497 | { 498 | "cell_type": "code", 499 | "execution_count": 31, 500 | "id": "440809b3", 501 | "metadata": {}, 502 | "outputs": [], 503 | "source": [ 504 | "#Matrisler" 505 | ] 506 | }, 507 | { 508 | "cell_type": "code", 509 | "execution_count": 32, 510 | "id": "70a2991d", 511 | "metadata": {}, 512 | "outputs": [ 513 | { 514 | "data": { 515 | "text/plain": [ 516 | "array([[805, 768, 403, 305, 725, 540, 179, 230, 984, 325],\n", 517 | " [433, 723, 319, 21, 873, 774, 732, 618, 806, 908],\n", 518 | " [442, 457, 533, 491, 44, 171, 64, 838, 35, 491],\n", 519 | " [832, 455, 586, 443, 159, 810, 423, 110, 796, 89],\n", 520 | " [957, 966, 438, 463, 665, 8, 394, 141, 960, 945],\n", 521 | " [ 38, 463, 719, 142, 421, 35, 561, 737, 728, 245],\n", 522 | " [861, 68, 486, 298, 180, 358, 709, 896, 932, 793],\n", 523 | " [494, 869, 472, 267, 37, 950, 168, 303, 41, 577],\n", 524 | " [983, 88, 321, 659, 569, 423, 785, 651, 591, 982],\n", 525 | " [276, 77, 193, 898, 157, 944, 924, 811, 314, 340]])" 526 | ] 527 | }, 528 | "execution_count": 32, 529 | "metadata": {}, 530 | "output_type": "execute_result" 531 | } 532 | ], 533 | "source": [ 534 | "mat1 = np.random.randint(0, 1000, 100).reshape(10, 10)\n", 535 | "mat1" 536 | ] 537 | }, 538 | { 539 | "cell_type": "code", 540 | "execution_count": 33, 541 | "id": "272bdafc", 542 | "metadata": {}, 543 | "outputs": [ 544 | { 545 | "data": { 546 | "text/plain": [ 547 | "array([805, 768, 725, 540, 984, 723, 873, 774, 732, 618, 806, 908, 533,\n", 548 | " 838, 832, 586, 810, 796, 957, 966, 665, 960, 945, 719, 561, 737,\n", 549 | " 728, 861, 709, 896, 932, 793, 869, 950, 577, 983, 659, 569, 785,\n", 550 | " 651, 591, 982, 898, 944, 924, 811])" 551 | ] 552 | }, 553 | "execution_count": 33, 554 | "metadata": {}, 555 | "output_type": "execute_result" 556 | } 557 | ], 558 | "source": [ 559 | "mat1[mat1 > 500]" 560 | ] 561 | }, 562 | { 563 | "cell_type": "code", 564 | "execution_count": 34, 565 | "id": "abb62282", 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],\n", 572 | " [0., 1., 0., 0., 0., 0., 0., 0., 0.],\n", 573 | " [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n", 574 | " [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n", 575 | " [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n", 576 | " [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n", 577 | " [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n", 578 | " [0., 0., 0., 0., 0., 0., 0., 1., 0.],\n", 579 | " [0., 0., 0., 0., 0., 0., 0., 0., 1.]])" 580 | ] 581 | }, 582 | "execution_count": 34, 583 | "metadata": {}, 584 | "output_type": "execute_result" 585 | } 586 | ], 587 | "source": [ 588 | "I = np.eye(9)\n", 589 | "I" 590 | ] 591 | }, 592 | { 593 | "cell_type": "code", 594 | "execution_count": 35, 595 | "id": "deec1e09", 596 | "metadata": {}, 597 | "outputs": [ 598 | { 599 | "data": { 600 | "text/plain": [ 601 | "array([[1, 0, 0, 0, 0, 0, 0, 0],\n", 602 | " [0, 2, 0, 0, 0, 0, 0, 0],\n", 603 | " [0, 0, 3, 0, 0, 0, 0, 0],\n", 604 | " [0, 0, 0, 4, 0, 0, 0, 0],\n", 605 | " [0, 0, 0, 0, 5, 0, 0, 0],\n", 606 | " [0, 0, 0, 0, 0, 6, 0, 0],\n", 607 | " [0, 0, 0, 0, 0, 0, 7, 0],\n", 608 | " [0, 0, 0, 0, 0, 0, 0, 8]])" 609 | ] 610 | }, 611 | "execution_count": 35, 612 | "metadata": {}, 613 | "output_type": "execute_result" 614 | } 615 | ], 616 | "source": [ 617 | "D = np.diag([1,2,3,4,5,6,7,8])\n", 618 | "D" 619 | ] 620 | }, 621 | { 622 | "cell_type": "code", 623 | "execution_count": 36, 624 | "id": "83769873", 625 | "metadata": {}, 626 | "outputs": [ 627 | { 628 | "name": "stdout", 629 | "output_type": "stream", 630 | "text": [ 631 | "[[ 1.68035435 0.80688763 0.40287747 1.6854456 -1.32882951]\n", 632 | " [ 0.84596645 -1.65468951 0.00817708 3.19750108 -1.22710021]\n", 633 | " [-0.77196501 0.15724556 0.99325008 -0.23573474 -0.33010289]\n", 634 | " [ 1.40566255 -1.8546334 -0.19371157 -1.05308007 -1.92281742]\n", 635 | " [-2.09141583 0.2703231 -0.47696008 -1.30183295 -1.01514764]]\n" 636 | ] 637 | } 638 | ], 639 | "source": [ 640 | "M = np.random.randn(5, 5)\n", 641 | "print(M) #mean = 0, variance = 1" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 38, 647 | "id": "f803591e", 648 | "metadata": {}, 649 | "outputs": [ 650 | { 651 | "name": "stdout", 652 | "output_type": "stream", 653 | "text": [ 654 | "[[ 1.68035435 0.80688763 0.40287747 1.6854456 -1.32882951]\n", 655 | " [ 0. -1.65468951 0.00817708 3.19750108 -1.22710021]\n", 656 | " [ 0. 0. 0.99325008 -0.23573474 -0.33010289]\n", 657 | " [ 0. 0. 0. -1.05308007 -1.92281742]\n", 658 | " [ 0. 0. 0. 0. -1.01514764]]\n", 659 | "\n", 660 | "\n", 661 | "[[ 1.68035435 0. 0. 0. 0. ]\n", 662 | " [ 0.84596645 -1.65468951 0. 0. 0. ]\n", 663 | " [-0.77196501 0.15724556 0.99325008 0. 0. ]\n", 664 | " [ 1.40566255 -1.8546334 -0.19371157 -1.05308007 0. ]\n", 665 | " [-2.09141583 0.2703231 -0.47696008 -1.30183295 -1.01514764]]\n" 666 | ] 667 | } 668 | ], 669 | "source": [ 670 | "U = np.triu(M)\n", 671 | "L = np.tril(M)\n", 672 | "print(U)\n", 673 | "print('\\n')\n", 674 | "print(L)" 675 | ] 676 | }, 677 | { 678 | "cell_type": "code", 679 | "execution_count": 39, 680 | "id": "9d9c0b6b", 681 | "metadata": {}, 682 | "outputs": [ 683 | { 684 | "data": { 685 | "text/plain": [ 686 | "array([[8, 8, 8, 8, 8],\n", 687 | " [8, 8, 8, 8, 8],\n", 688 | " [8, 8, 8, 8, 8],\n", 689 | " [8, 8, 8, 8, 8],\n", 690 | " [8, 8, 8, 8, 8]])" 691 | ] 692 | }, 693 | "execution_count": 39, 694 | "metadata": {}, 695 | "output_type": "execute_result" 696 | } 697 | ], 698 | "source": [ 699 | "np.full((5,5), 8)" 700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 40, 705 | "id": "42bf2701", 706 | "metadata": {}, 707 | "outputs": [ 708 | { 709 | "data": { 710 | "text/plain": [ 711 | "array([[17.32618265, 15.23235106, 13.93764801, 14.99742616, 17.67296604],\n", 712 | " [18.42662307, 14.39780883, 12.88474242, 16.76242604, 17.03254757],\n", 713 | " [12.73783671, 11.1772208 , 11.40142099, 15.55447744, 16.50116265],\n", 714 | " [19.4232284 , 17.93541716, 11.25968528, 15.97049384, 17.20164701],\n", 715 | " [19.91417029, 12.71935513, 17.96887011, 18.39930945, 13.90331467]])" 716 | ] 717 | }, 718 | "execution_count": 40, 719 | "metadata": {}, 720 | "output_type": "execute_result" 721 | } 722 | ], 723 | "source": [ 724 | "np.random.uniform(10, 20, size = (5,5))" 725 | ] 726 | }, 727 | { 728 | "cell_type": "code", 729 | "execution_count": 67, 730 | "id": "d62f900b", 731 | "metadata": {}, 732 | "outputs": [ 733 | { 734 | "data": { 735 | "text/plain": [ 736 | "array([[13, 16, 11, 16, 19],\n", 737 | " [14, 12, 15, 18, 17],\n", 738 | " [11, 19, 12, 17, 18],\n", 739 | " [19, 10, 10, 17, 19],\n", 740 | " [16, 18, 17, 12, 16]])" 741 | ] 742 | }, 743 | "execution_count": 67, 744 | "metadata": {}, 745 | "output_type": "execute_result" 746 | } 747 | ], 748 | "source": [ 749 | "B = np.random.uniform(10, 20, size = (5,5))\n", 750 | "my = B.astype(int)\n", 751 | "my" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 66, 757 | "id": "9ec23679", 758 | "metadata": {}, 759 | "outputs": [ 760 | { 761 | "data": { 762 | "text/plain": [ 763 | "numpy.ndarray" 764 | ] 765 | }, 766 | "execution_count": 66, 767 | "metadata": {}, 768 | "output_type": "execute_result" 769 | } 770 | ], 771 | "source": [ 772 | "type(my)" 773 | ] 774 | }, 775 | { 776 | "cell_type": "code", 777 | "execution_count": 47, 778 | "id": "fdc82c04", 779 | "metadata": {}, 780 | "outputs": [ 781 | { 782 | "data": { 783 | "text/plain": [ 784 | "array([[ 1, 2, 3, 4],\n", 785 | " [ 5, 6, 7, 8],\n", 786 | " [10, 11, 12, 13],\n", 787 | " [14, 15, 16, 17]])" 788 | ] 789 | }, 790 | "execution_count": 47, 791 | "metadata": {}, 792 | "output_type": "execute_result" 793 | } 794 | ], 795 | "source": [ 796 | "A = np.array([[1,2,3,4], [5,6,7,8], [10,11,12,13], [14,15,16,17]])\n", 797 | "A" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": 48, 803 | "id": "131fca35", 804 | "metadata": {}, 805 | "outputs": [ 806 | { 807 | "data": { 808 | "text/plain": [ 809 | "array([ 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17])" 810 | ] 811 | }, 812 | "execution_count": 48, 813 | "metadata": {}, 814 | "output_type": "execute_result" 815 | } 816 | ], 817 | "source": [ 818 | "A.flatten()" 819 | ] 820 | }, 821 | { 822 | "cell_type": "code", 823 | "execution_count": 49, 824 | "id": "55e11d79", 825 | "metadata": {}, 826 | "outputs": [ 827 | { 828 | "data": { 829 | "text/plain": [ 830 | "array([ 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17])" 831 | ] 832 | }, 833 | "execution_count": 49, 834 | "metadata": {}, 835 | "output_type": "execute_result" 836 | } 837 | ], 838 | "source": [ 839 | "A.ravel()" 840 | ] 841 | }, 842 | { 843 | "cell_type": "code", 844 | "execution_count": 68, 845 | "id": "4a854506", 846 | "metadata": {}, 847 | "outputs": [ 848 | { 849 | "data": { 850 | "text/plain": [ 851 | "array([13, 16, 11, 16, 19, 14, 12, 15, 18, 17, 11, 19, 12, 17, 18, 19, 10,\n", 852 | " 10, 17, 19, 16, 18, 17, 12, 16])" 853 | ] 854 | }, 855 | "execution_count": 68, 856 | "metadata": {}, 857 | "output_type": "execute_result" 858 | } 859 | ], 860 | "source": [ 861 | "my.flatten()" 862 | ] 863 | }, 864 | { 865 | "cell_type": "code", 866 | "execution_count": 69, 867 | "id": "842e8c86", 868 | "metadata": {}, 869 | "outputs": [ 870 | { 871 | "data": { 872 | "text/plain": [ 873 | "array([13, 16, 11, 16, 19, 14, 12, 15, 18, 17, 11, 19, 12, 17, 18, 19, 10,\n", 874 | " 10, 17, 19, 16, 18, 17, 12, 16])" 875 | ] 876 | }, 877 | "execution_count": 69, 878 | "metadata": {}, 879 | "output_type": "execute_result" 880 | } 881 | ], 882 | "source": [ 883 | "my.ravel()" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 70, 889 | "id": "0ad41fd7", 890 | "metadata": {}, 891 | "outputs": [ 892 | { 893 | "data": { 894 | "text/plain": [ 895 | "array([[1, 2],\n", 896 | " [4, 5]])" 897 | ] 898 | }, 899 | "execution_count": 70, 900 | "metadata": {}, 901 | "output_type": "execute_result" 902 | } 903 | ], 904 | "source": [ 905 | "M1 = np.array([[1,2], [4,5]])\n", 906 | "M1" 907 | ] 908 | }, 909 | { 910 | "cell_type": "code", 911 | "execution_count": 71, 912 | "id": "4274913c", 913 | "metadata": {}, 914 | "outputs": [ 915 | { 916 | "data": { 917 | "text/plain": [ 918 | "array([[ 57, 78],\n", 919 | " [156, 213]])" 920 | ] 921 | }, 922 | "execution_count": 71, 923 | "metadata": {}, 924 | "output_type": "execute_result" 925 | } 926 | ], 927 | "source": [ 928 | "M1@M1@M1" 929 | ] 930 | }, 931 | { 932 | "cell_type": "code", 933 | "execution_count": 72, 934 | "id": "d8a968af", 935 | "metadata": {}, 936 | "outputs": [ 937 | { 938 | "data": { 939 | "text/plain": [ 940 | "array([[ 57, 78],\n", 941 | " [156, 213]])" 942 | ] 943 | }, 944 | "execution_count": 72, 945 | "metadata": {}, 946 | "output_type": "execute_result" 947 | } 948 | ], 949 | "source": [ 950 | "np.linalg.matrix_power(M1, 3)" 951 | ] 952 | }, 953 | { 954 | "cell_type": "code", 955 | "execution_count": null, 956 | "id": "0a581ea3", 957 | "metadata": {}, 958 | "outputs": [], 959 | "source": [ 960 | "#Tensor" 961 | ] 962 | } 963 | ], 964 | "metadata": { 965 | "kernelspec": { 966 | "display_name": "Python 3", 967 | "language": "python", 968 | "name": "python3" 969 | }, 970 | "language_info": { 971 | "codemirror_mode": { 972 | "name": "ipython", 973 | "version": 3 974 | }, 975 | "file_extension": ".py", 976 | "mimetype": "text/x-python", 977 | "name": "python", 978 | "nbconvert_exporter": "python", 979 | "pygments_lexer": "ipython3", 980 | "version": "3.8.8" 981 | } 982 | }, 983 | "nbformat": 4, 984 | "nbformat_minor": 5 985 | } 986 | -------------------------------------------------------------------------------- /25062023 - DEB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "9005df46", 7 | "metadata": {}, 8 | "outputs": [ 9 | { 10 | "name": "stdout", 11 | "output_type": "stream", 12 | "text": [ 13 | "Requirement already satisfied: findspark in c:\\users\\itu\\anaconda3\\lib\\site-packages (2.0.1)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "!pip install findspark" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "id": "8dfeefaa", 25 | "metadata": {}, 26 | "outputs": [], 27 | "source": [ 28 | "import findspark\n", 29 | "findspark.init()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 3, 35 | "id": "6b6dce5d", 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "from pyspark.sql import SparkSession\n", 40 | "from pyspark.conf import SparkConf" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "id": "fdfdb2fe", 47 | "metadata": {}, 48 | "outputs": [], 49 | "source": [ 50 | "spark = SparkSession.builder \\\n", 51 | ".master(\"local[4]\") \\\n", 52 | ".appName(\"giveatry\") \\\n", 53 | ".getOrCreate()" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 7, 59 | "id": "400c0c9e", 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "sc = spark.sparkContext" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 9, 69 | "id": "2c88fde8", 70 | "metadata": {}, 71 | "outputs": [], 72 | "source": [ 73 | "veri_seti = \"C:\\\\Users\\\\ITU\\\\mydata\\\\ibb_lojistik.txt\"" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 10, 79 | "id": "4b999354", 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "istac_rdd = sc.textFile(veri_seti)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 11, 89 | "id": "1eb7a9e2", 90 | "metadata": {}, 91 | "outputs": [ 92 | { 93 | "data": { 94 | "text/plain": [ 95 | "97" 96 | ] 97 | }, 98 | "execution_count": 11, 99 | "metadata": {}, 100 | "output_type": "execute_result" 101 | } 102 | ], 103 | "source": [ 104 | "istac_rdd.count()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "code", 109 | "execution_count": 13, 110 | "id": "16290c98", 111 | "metadata": {}, 112 | "outputs": [ 113 | { 114 | "data": { 115 | "text/plain": [ 116 | "['İlçeler bazında firmaların kullandıkları ortalama alanlara bakıldığında ',\n", 117 | " 'en fazla alan kullanımının Ümraniye, Çatalca, Silivri, Şile, Arnavutköy gibi',\n", 118 | " 'İstanbul’un alan bakımından daha büyük ve yerleşim alanlarının daha kısıtlı olduğu',\n", 119 | " 'ilçeleri ön plana çıkmaktadır. Bunda arazi fiyatlarının daha düşük olması, karayolu, ',\n", 120 | " 'demiryolu ve kısmen de olsa denizyolu ana bağlantı noktalarına yakınlık, kullanılabilir ']" 121 | ] 122 | }, 123 | "execution_count": 13, 124 | "metadata": {}, 125 | "output_type": "execute_result" 126 | } 127 | ], 128 | "source": [ 129 | "istac_rdd.take(5)" 130 | ] 131 | }, 132 | { 133 | "cell_type": "code", 134 | "execution_count": 18, 135 | "id": "55fca24e", 136 | "metadata": {}, 137 | "outputs": [], 138 | "source": [ 139 | "istac_rdd_kelimeler = istac_rdd.flatMap(lambda satir : satir.split(\" \"))" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 20, 145 | "id": "1ca8904e", 146 | "metadata": {}, 147 | "outputs": [ 148 | { 149 | "data": { 150 | "text/plain": [ 151 | "['İlçeler',\n", 152 | " '',\n", 153 | " 'bazında',\n", 154 | " '',\n", 155 | " 'firmaların',\n", 156 | " '',\n", 157 | " 'kullandıkları',\n", 158 | " '',\n", 159 | " 'ortalama',\n", 160 | " '',\n", 161 | " 'alanlara',\n", 162 | " '',\n", 163 | " 'bakıldığında',\n", 164 | " '',\n", 165 | " '',\n", 166 | " 'en',\n", 167 | " '',\n", 168 | " 'fazla',\n", 169 | " '',\n", 170 | " 'alan']" 171 | ] 172 | }, 173 | "execution_count": 20, 174 | "metadata": {}, 175 | "output_type": "execute_result" 176 | } 177 | ], 178 | "source": [ 179 | "istac_rdd_kelimeler.take(20)" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 22, 185 | "id": "f3c69479", 186 | "metadata": {}, 187 | "outputs": [], 188 | "source": [ 189 | "istac_rdd_kelimeler_sayilari = istac_rdd_kelimeler.map(lambda kelime : (kelime, 1))" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 25, 195 | "id": "62840d06", 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "data": { 200 | "text/plain": [ 201 | "[('İlçeler', 1),\n", 202 | " ('', 1),\n", 203 | " ('bazında', 1),\n", 204 | " ('', 1),\n", 205 | " ('firmaların', 1),\n", 206 | " ('', 1),\n", 207 | " ('kullandıkları', 1),\n", 208 | " ('', 1),\n", 209 | " ('ortalama', 1),\n", 210 | " ('', 1),\n", 211 | " ('alanlara', 1),\n", 212 | " ('', 1),\n", 213 | " ('bakıldığında', 1),\n", 214 | " ('', 1),\n", 215 | " ('', 1),\n", 216 | " ('en', 1),\n", 217 | " ('', 1),\n", 218 | " ('fazla', 1),\n", 219 | " ('', 1),\n", 220 | " ('alan', 1)]" 221 | ] 222 | }, 223 | "execution_count": 25, 224 | "metadata": {}, 225 | "output_type": "execute_result" 226 | } 227 | ], 228 | "source": [ 229 | "istac_rdd_kelimeler_sayilari.take(20)" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 26, 235 | "id": "f45015bd", 236 | "metadata": {}, 237 | "outputs": [], 238 | "source": [ 239 | "istac_rdd_kelimeler_sayilari_reduce = istac_rdd_kelimeler_sayilari.reduceByKey(lambda x, y : (x + y))" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 27, 245 | "id": "3cf53d29", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "data": { 250 | "text/plain": [ 251 | "[('', 112),\n", 252 | " ('bazında', 3),\n", 253 | " ('kullandıkları', 1),\n", 254 | " ('ortalama', 3),\n", 255 | " ('en', 4),\n", 256 | " ('fazla', 3),\n", 257 | " ('alan', 6),\n", 258 | " ('Çatalca,', 4),\n", 259 | " ('Arnavutköy', 3),\n", 260 | " ('İstanbul’un', 5),\n", 261 | " ('daha', 6),\n", 262 | " ('ve', 42),\n", 263 | " ('alanlarının', 4),\n", 264 | " ('ilçeleri', 3),\n", 265 | " ('plana', 3),\n", 266 | " ('Bunda', 1),\n", 267 | " ('fiyatlarının', 1),\n", 268 | " ('düşük', 1),\n", 269 | " ('karayolu,', 1),\n", 270 | " ('demiryolu', 1)]" 271 | ] 272 | }, 273 | "execution_count": 27, 274 | "metadata": {}, 275 | "output_type": "execute_result" 276 | } 277 | ], 278 | "source": [ 279 | "istac_rdd_kelimeler_sayilari_reduce.take(20)" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 28, 285 | "id": "73df3ede", 286 | "metadata": {}, 287 | "outputs": [], 288 | "source": [ 289 | "istac_rdd_kelimeler_sayilari_reduce_sort = istac_rdd_kelimeler_sayilari_reduce.map(lambda x: (x[1], x[0]))" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 29, 295 | "id": "0a20e465", 296 | "metadata": {}, 297 | "outputs": [ 298 | { 299 | "data": { 300 | "text/plain": [ 301 | "[(112, ''),\n", 302 | " (3, 'bazında'),\n", 303 | " (1, 'kullandıkları'),\n", 304 | " (3, 'ortalama'),\n", 305 | " (4, 'en'),\n", 306 | " (3, 'fazla'),\n", 307 | " (6, 'alan'),\n", 308 | " (4, 'Çatalca,'),\n", 309 | " (3, 'Arnavutköy'),\n", 310 | " (5, 'İstanbul’un'),\n", 311 | " (6, 'daha'),\n", 312 | " (42, 've'),\n", 313 | " (4, 'alanlarının'),\n", 314 | " (3, 'ilçeleri'),\n", 315 | " (3, 'plana'),\n", 316 | " (1, 'Bunda'),\n", 317 | " (1, 'fiyatlarının'),\n", 318 | " (1, 'düşük'),\n", 319 | " (1, 'karayolu,'),\n", 320 | " (1, 'demiryolu')]" 321 | ] 322 | }, 323 | "execution_count": 29, 324 | "metadata": {}, 325 | "output_type": "execute_result" 326 | } 327 | ], 328 | "source": [ 329 | "istac_rdd_kelimeler_sayilari_reduce_sort.take(20)" 330 | ] 331 | }, 332 | { 333 | "cell_type": "code", 334 | "execution_count": 30, 335 | "id": "1066fed8", 336 | "metadata": {}, 337 | "outputs": [ 338 | { 339 | "data": { 340 | "text/plain": [ 341 | "[(112, ''),\n", 342 | " (42, 've'),\n", 343 | " (12, 'lojistik'),\n", 344 | " (11, 'olarak'),\n", 345 | " (10, 'bir'),\n", 346 | " (9, 'yük'),\n", 347 | " (9, 'gibi'),\n", 348 | " (8, 'ile'),\n", 349 | " (6, 'alan'),\n", 350 | " (6, 'daha'),\n", 351 | " (6, 'Lojistik'),\n", 352 | " (6, 'ilçeler'),\n", 353 | " (6, 'alanları'),\n", 354 | " (6, 'bu'),\n", 355 | " (5, 'İstanbul’un'),\n", 356 | " (5, 'depolama'),\n", 357 | " (5, 'kentsel'),\n", 358 | " (5, 'Şile,'),\n", 359 | " (5, 'Ana'),\n", 360 | " (5, 'İstanbul')]" 361 | ] 362 | }, 363 | "execution_count": 30, 364 | "metadata": {}, 365 | "output_type": "execute_result" 366 | } 367 | ], 368 | "source": [ 369 | "istac_rdd_kelimeler_sayilari_reduce_sort.sortByKey(False).take(20)" 370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": null, 375 | "id": "f9c53d29", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [] 379 | } 380 | ], 381 | "metadata": { 382 | "kernelspec": { 383 | "display_name": "Python 3", 384 | "language": "python", 385 | "name": "python3" 386 | }, 387 | "language_info": { 388 | "codemirror_mode": { 389 | "name": "ipython", 390 | "version": 3 391 | }, 392 | "file_extension": ".py", 393 | "mimetype": "text/x-python", 394 | "name": "python", 395 | "nbconvert_exporter": "python", 396 | "pygments_lexer": "ipython3", 397 | "version": "3.8.8" 398 | } 399 | }, 400 | "nbformat": 4, 401 | "nbformat_minor": 5 402 | } 403 | -------------------------------------------------------------------------------- /27042023 - DEB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 3, 6 | "id": "5aca6e0d", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "#Tensörler\n", 11 | "import numpy as np" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 6, 17 | "id": "f2ab7891", 18 | "metadata": {}, 19 | "outputs": [ 20 | { 21 | "data": { 22 | "text/plain": [ 23 | "array([[[ 1, 2, 3],\n", 24 | " [ 4, 5, 6],\n", 25 | " [ 7, 8, 9]],\n", 26 | "\n", 27 | " [[ 10, 20, 30],\n", 28 | " [ 40, 50, 60],\n", 29 | " [ 70, 80, 90]],\n", 30 | "\n", 31 | " [[100, 200, 300],\n", 32 | " [400, 500, 600],\n", 33 | " [700, 800, 900]]])" 34 | ] 35 | }, 36 | "execution_count": 6, 37 | "metadata": {}, 38 | "output_type": "execute_result" 39 | } 40 | ], 41 | "source": [ 42 | "T1 = np.array([\n", 43 | " [[1,2,3], [4,5,6], [7,8,9]],\n", 44 | " [[10,20,30], [40,50,60], [70,80,90]],\n", 45 | " [[100, 200, 300], [400, 500, 600], [700, 800, 900]],\n", 46 | "])\n", 47 | "\n", 48 | "T1" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 5, 54 | "id": "f2ad73b3", 55 | "metadata": {}, 56 | "outputs": [ 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "array([[[0, 0, 0],\n", 61 | " [0, 0, 0],\n", 62 | " [0, 0, 0]],\n", 63 | "\n", 64 | " [[1, 1, 1],\n", 65 | " [1, 1, 1],\n", 66 | " [1, 1, 1]],\n", 67 | "\n", 68 | " [[2, 2, 2],\n", 69 | " [2, 2, 2],\n", 70 | " [2, 2, 2]]])" 71 | ] 72 | }, 73 | "execution_count": 5, 74 | "metadata": {}, 75 | "output_type": "execute_result" 76 | } 77 | ], 78 | "source": [ 79 | "T2= np.array([\n", 80 | " [[0,0,0], [0,0,0], [0,0,0]],\n", 81 | " [[1,1,1], [1,1,1], [1,1,1]],\n", 82 | " [[2,2,2], [2,2,2], [2,2,2]],\n", 83 | " \n", 84 | "])\n", 85 | "\n", 86 | "T2" 87 | ] 88 | }, 89 | { 90 | "cell_type": "code", 91 | "execution_count": 7, 92 | "id": "e6180340", 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "data": { 97 | "text/plain": [ 98 | "array([[[ 1, 2, 3],\n", 99 | " [ 4, 5, 6],\n", 100 | " [ 7, 8, 9]],\n", 101 | "\n", 102 | " [[ 11, 21, 31],\n", 103 | " [ 41, 51, 61],\n", 104 | " [ 71, 81, 91]],\n", 105 | "\n", 106 | " [[102, 202, 302],\n", 107 | " [402, 502, 602],\n", 108 | " [702, 802, 902]]])" 109 | ] 110 | }, 111 | "execution_count": 7, 112 | "metadata": {}, 113 | "output_type": "execute_result" 114 | } 115 | ], 116 | "source": [ 117 | "A = T1 + T2\n", 118 | "A" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 8, 124 | "id": "bbb6a509", 125 | "metadata": {}, 126 | "outputs": [ 127 | { 128 | "data": { 129 | "text/plain": [ 130 | "array([[[ 1, 2, 3],\n", 131 | " [ 4, 5, 6],\n", 132 | " [ 7, 8, 9]],\n", 133 | "\n", 134 | " [[ 11, 21, 31],\n", 135 | " [ 41, 51, 61],\n", 136 | " [ 71, 81, 91]],\n", 137 | "\n", 138 | " [[102, 202, 302],\n", 139 | " [402, 502, 602],\n", 140 | " [702, 802, 902]]])" 141 | ] 142 | }, 143 | "execution_count": 8, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "np.add(T1, T2)" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 9, 155 | "id": "353fed0a", 156 | "metadata": {}, 157 | "outputs": [ 158 | { 159 | "data": { 160 | "text/plain": [ 161 | "array([[[ 1, 2, 3],\n", 162 | " [ 4, 5, 6],\n", 163 | " [ 7, 8, 9]],\n", 164 | "\n", 165 | " [[ 9, 19, 29],\n", 166 | " [ 39, 49, 59],\n", 167 | " [ 69, 79, 89]],\n", 168 | "\n", 169 | " [[ 98, 198, 298],\n", 170 | " [398, 498, 598],\n", 171 | " [698, 798, 898]]])" 172 | ] 173 | }, 174 | "execution_count": 9, 175 | "metadata": {}, 176 | "output_type": "execute_result" 177 | } 178 | ], 179 | "source": [ 180 | "np.subtract(T1, T2)" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 10, 186 | "id": "c06fb5f2", 187 | "metadata": {}, 188 | "outputs": [ 189 | { 190 | "data": { 191 | "text/plain": [ 192 | "array([[[ 0, 0, 0],\n", 193 | " [ 0, 0, 0],\n", 194 | " [ 0, 0, 0]],\n", 195 | "\n", 196 | " [[ 10, 20, 30],\n", 197 | " [ 40, 50, 60],\n", 198 | " [ 70, 80, 90]],\n", 199 | "\n", 200 | " [[ 200, 400, 600],\n", 201 | " [ 800, 1000, 1200],\n", 202 | " [1400, 1600, 1800]]])" 203 | ] 204 | }, 205 | "execution_count": 10, 206 | "metadata": {}, 207 | "output_type": "execute_result" 208 | } 209 | ], 210 | "source": [ 211 | "np.multiply(T1, T2)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 13, 217 | "id": "05263bb5", 218 | "metadata": {}, 219 | "outputs": [], 220 | "source": [ 221 | "import warnings\n", 222 | "warnings.filterwarnings('ignore')" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 14, 228 | "id": "3b4bc8e7", 229 | "metadata": {}, 230 | "outputs": [ 231 | { 232 | "data": { 233 | "text/plain": [ 234 | "array([[[ inf, inf, inf],\n", 235 | " [ inf, inf, inf],\n", 236 | " [ inf, inf, inf]],\n", 237 | "\n", 238 | " [[ 10., 20., 30.],\n", 239 | " [ 40., 50., 60.],\n", 240 | " [ 70., 80., 90.]],\n", 241 | "\n", 242 | " [[ 50., 100., 150.],\n", 243 | " [200., 250., 300.],\n", 244 | " [350., 400., 450.]]])" 245 | ] 246 | }, 247 | "execution_count": 14, 248 | "metadata": {}, 249 | "output_type": "execute_result" 250 | } 251 | ], 252 | "source": [ 253 | "D = T1 / T2 # np.divide(T1, T2)\n", 254 | "D" 255 | ] 256 | }, 257 | { 258 | "cell_type": "code", 259 | "execution_count": 15, 260 | "id": "1f321034", 261 | "metadata": {}, 262 | "outputs": [], 263 | "source": [ 264 | "from scipy import linalg" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 16, 270 | "id": "97dabd14", 271 | "metadata": {}, 272 | "outputs": [ 273 | { 274 | "name": "stdout", 275 | "output_type": "stream", 276 | "text": [ 277 | "[[1 3 5]\n", 278 | " [2 5 1]\n", 279 | " [2 3 8]]\n", 280 | "[[10]\n", 281 | " [ 8]\n", 282 | " [ 3]]\n" 283 | ] 284 | } 285 | ], 286 | "source": [ 287 | "A = np.array([[1,3,5], [2,5,1], [2,3,8]])\n", 288 | "print(A)\n", 289 | "b = np.array([[10], [8], [3]])\n", 290 | "print(b)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 17, 296 | "id": "4ea2b7e2", 297 | "metadata": {}, 298 | "outputs": [ 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "linalg.inv(A).dot(b) Matrix Inverse ile Denklem Takımı Çözümü\n", 304 | "[[-9.28]\n", 305 | " [ 5.16]\n", 306 | " [ 0.76]]\n" 307 | ] 308 | } 309 | ], 310 | "source": [ 311 | "print(\"linalg.inv(A).dot(b) Matrix Inverse ile Denklem Takımı Çözümü\")\n", 312 | "print(linalg.inv(A).dot(b)) #slow" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 18, 318 | "id": "ca0a4359", 319 | "metadata": {}, 320 | "outputs": [ 321 | { 322 | "name": "stdout", 323 | "output_type": "stream", 324 | "text": [ 325 | "np.linalg.solve(A, b) ile Denklem Takımı Çözümü\n", 326 | "[[-9.28]\n", 327 | " [ 5.16]\n", 328 | " [ 0.76]]\n" 329 | ] 330 | } 331 | ], 332 | "source": [ 333 | "print(\"np.linalg.solve(A, b) ile Denklem Takımı Çözümü\")\n", 334 | "print(np.linalg.solve(A, b)) #fast" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 19, 340 | "id": "16487b46", 341 | "metadata": {}, 342 | "outputs": [ 343 | { 344 | "name": "stdout", 345 | "output_type": "stream", 346 | "text": [ 347 | "(72.00000000000001, 7.993605777301129e-13)\n" 348 | ] 349 | } 350 | ], 351 | "source": [ 352 | "from scipy import integrate\n", 353 | "x2 = lambda x : x**2\n", 354 | "I = integrate.quad(x2, 0, 6)\n", 355 | "print(I)" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": 20, 361 | "id": "62a783fd", 362 | "metadata": {}, 363 | "outputs": [ 364 | { 365 | "name": "stdout", 366 | "output_type": "stream", 367 | "text": [ 368 | "4.999999999921734\n" 369 | ] 370 | } 371 | ], 372 | "source": [ 373 | "from scipy.misc import derivative\n", 374 | "def f(x):\n", 375 | " return x**3 + x**2\n", 376 | "D = derivative(f, 1.0, dx = 1e-6)\n", 377 | "print(D)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": 21, 383 | "id": "ebab2888", 384 | "metadata": {}, 385 | "outputs": [], 386 | "source": [ 387 | "import pandas as pd" 388 | ] 389 | }, 390 | { 391 | "cell_type": "code", 392 | "execution_count": 22, 393 | "id": "a8818866", 394 | "metadata": {}, 395 | "outputs": [], 396 | "source": [ 397 | "#Seriler ve Veri Çerçeveleri (DataFrame)" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 23, 403 | "id": "0aa2bb8d", 404 | "metadata": {}, 405 | "outputs": [ 406 | { 407 | "data": { 408 | "text/plain": [ 409 | "0 1\n", 410 | "1 2\n", 411 | "2 3\n", 412 | "3 4\n", 413 | "4 5\n", 414 | "5 6\n", 415 | "6 7\n", 416 | "dtype: int32" 417 | ] 418 | }, 419 | "execution_count": 23, 420 | "metadata": {}, 421 | "output_type": "execute_result" 422 | } 423 | ], 424 | "source": [ 425 | "v = np.array([1,2,3,4,5,6,7])\n", 426 | "s1 = pd.Series(v)\n", 427 | "s1" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": null, 433 | "id": "31914d3a", 434 | "metadata": {}, 435 | "outputs": [], 436 | "source": [] 437 | } 438 | ], 439 | "metadata": { 440 | "kernelspec": { 441 | "display_name": "Python 3", 442 | "language": "python", 443 | "name": "python3" 444 | }, 445 | "language_info": { 446 | "codemirror_mode": { 447 | "name": "ipython", 448 | "version": 3 449 | }, 450 | "file_extension": ".py", 451 | "mimetype": "text/x-python", 452 | "name": "python", 453 | "nbconvert_exporter": "python", 454 | "pygments_lexer": "ipython3", 455 | "version": "3.8.8" 456 | } 457 | }, 458 | "nbformat": 4, 459 | "nbformat_minor": 5 460 | } 461 | -------------------------------------------------------------------------------- /Advertising.csv: -------------------------------------------------------------------------------- 1 | "","TV","Radio","Newspaper","Sales" 2 | "1",230.1,37.8,69.2,22.1 3 | "2",44.5,39.3,45.1,10.4 4 | "3",17.2,45.9,69.3,9.3 5 | "4",151.5,41.3,58.5,18.5 6 | "5",180.8,10.8,58.4,12.9 7 | "6",8.7,48.9,75,7.2 8 | "7",57.5,32.8,23.5,11.8 9 | "8",120.2,19.6,11.6,13.2 10 | "9",8.6,2.1,1,4.8 11 | "10",199.8,2.6,21.2,10.6 12 | "11",66.1,5.8,24.2,8.6 13 | "12",214.7,24,4,17.4 14 | "13",23.8,35.1,65.9,9.2 15 | "14",97.5,7.6,7.2,9.7 16 | "15",204.1,32.9,46,19 17 | "16",195.4,47.7,52.9,22.4 18 | "17",67.8,36.6,114,12.5 19 | "18",281.4,39.6,55.8,24.4 20 | "19",69.2,20.5,18.3,11.3 21 | "20",147.3,23.9,19.1,14.6 22 | "21",218.4,27.7,53.4,18 23 | "22",237.4,5.1,23.5,12.5 24 | "23",13.2,15.9,49.6,5.6 25 | "24",228.3,16.9,26.2,15.5 26 | "25",62.3,12.6,18.3,9.7 27 | "26",262.9,3.5,19.5,12 28 | "27",142.9,29.3,12.6,15 29 | "28",240.1,16.7,22.9,15.9 30 | "29",248.8,27.1,22.9,18.9 31 | "30",70.6,16,40.8,10.5 32 | "31",292.9,28.3,43.2,21.4 33 | "32",112.9,17.4,38.6,11.9 34 | "33",97.2,1.5,30,9.6 35 | "34",265.6,20,0.3,17.4 36 | "35",95.7,1.4,7.4,9.5 37 | "36",290.7,4.1,8.5,12.8 38 | "37",266.9,43.8,5,25.4 39 | "38",74.7,49.4,45.7,14.7 40 | "39",43.1,26.7,35.1,10.1 41 | "40",228,37.7,32,21.5 42 | "41",202.5,22.3,31.6,16.6 43 | "42",177,33.4,38.7,17.1 44 | "43",293.6,27.7,1.8,20.7 45 | "44",206.9,8.4,26.4,12.9 46 | "45",25.1,25.7,43.3,8.5 47 | "46",175.1,22.5,31.5,14.9 48 | "47",89.7,9.9,35.7,10.6 49 | "48",239.9,41.5,18.5,23.2 50 | "49",227.2,15.8,49.9,14.8 51 | "50",66.9,11.7,36.8,9.7 52 | "51",199.8,3.1,34.6,11.4 53 | "52",100.4,9.6,3.6,10.7 54 | "53",216.4,41.7,39.6,22.6 55 | "54",182.6,46.2,58.7,21.2 56 | "55",262.7,28.8,15.9,20.2 57 | "56",198.9,49.4,60,23.7 58 | "57",7.3,28.1,41.4,5.5 59 | "58",136.2,19.2,16.6,13.2 60 | "59",210.8,49.6,37.7,23.8 61 | "60",210.7,29.5,9.3,18.4 62 | "61",53.5,2,21.4,8.1 63 | "62",261.3,42.7,54.7,24.2 64 | "63",239.3,15.5,27.3,15.7 65 | "64",102.7,29.6,8.4,14 66 | "65",131.1,42.8,28.9,18 67 | "66",69,9.3,0.9,9.3 68 | "67",31.5,24.6,2.2,9.5 69 | "68",139.3,14.5,10.2,13.4 70 | "69",237.4,27.5,11,18.9 71 | "70",216.8,43.9,27.2,22.3 72 | "71",199.1,30.6,38.7,18.3 73 | "72",109.8,14.3,31.7,12.4 74 | "73",26.8,33,19.3,8.8 75 | "74",129.4,5.7,31.3,11 76 | "75",213.4,24.6,13.1,17 77 | "76",16.9,43.7,89.4,8.7 78 | "77",27.5,1.6,20.7,6.9 79 | "78",120.5,28.5,14.2,14.2 80 | "79",5.4,29.9,9.4,5.3 81 | "80",116,7.7,23.1,11 82 | "81",76.4,26.7,22.3,11.8 83 | "82",239.8,4.1,36.9,12.3 84 | "83",75.3,20.3,32.5,11.3 85 | "84",68.4,44.5,35.6,13.6 86 | "85",213.5,43,33.8,21.7 87 | "86",193.2,18.4,65.7,15.2 88 | "87",76.3,27.5,16,12 89 | "88",110.7,40.6,63.2,16 90 | "89",88.3,25.5,73.4,12.9 91 | "90",109.8,47.8,51.4,16.7 92 | "91",134.3,4.9,9.3,11.2 93 | "92",28.6,1.5,33,7.3 94 | "93",217.7,33.5,59,19.4 95 | "94",250.9,36.5,72.3,22.2 96 | "95",107.4,14,10.9,11.5 97 | "96",163.3,31.6,52.9,16.9 98 | "97",197.6,3.5,5.9,11.7 99 | "98",184.9,21,22,15.5 100 | "99",289.7,42.3,51.2,25.4 101 | "100",135.2,41.7,45.9,17.2 102 | "101",222.4,4.3,49.8,11.7 103 | "102",296.4,36.3,100.9,23.8 104 | "103",280.2,10.1,21.4,14.8 105 | "104",187.9,17.2,17.9,14.7 106 | "105",238.2,34.3,5.3,20.7 107 | "106",137.9,46.4,59,19.2 108 | "107",25,11,29.7,7.2 109 | "108",90.4,0.3,23.2,8.7 110 | "109",13.1,0.4,25.6,5.3 111 | "110",255.4,26.9,5.5,19.8 112 | "111",225.8,8.2,56.5,13.4 113 | "112",241.7,38,23.2,21.8 114 | "113",175.7,15.4,2.4,14.1 115 | "114",209.6,20.6,10.7,15.9 116 | "115",78.2,46.8,34.5,14.6 117 | "116",75.1,35,52.7,12.6 118 | "117",139.2,14.3,25.6,12.2 119 | "118",76.4,0.8,14.8,9.4 120 | "119",125.7,36.9,79.2,15.9 121 | "120",19.4,16,22.3,6.6 122 | "121",141.3,26.8,46.2,15.5 123 | "122",18.8,21.7,50.4,7 124 | "123",224,2.4,15.6,11.6 125 | "124",123.1,34.6,12.4,15.2 126 | "125",229.5,32.3,74.2,19.7 127 | "126",87.2,11.8,25.9,10.6 128 | "127",7.8,38.9,50.6,6.6 129 | "128",80.2,0,9.2,8.8 130 | "129",220.3,49,3.2,24.7 131 | "130",59.6,12,43.1,9.7 132 | "131",0.7,39.6,8.7,1.6 133 | "132",265.2,2.9,43,12.7 134 | "133",8.4,27.2,2.1,5.7 135 | "134",219.8,33.5,45.1,19.6 136 | "135",36.9,38.6,65.6,10.8 137 | "136",48.3,47,8.5,11.6 138 | "137",25.6,39,9.3,9.5 139 | "138",273.7,28.9,59.7,20.8 140 | "139",43,25.9,20.5,9.6 141 | "140",184.9,43.9,1.7,20.7 142 | "141",73.4,17,12.9,10.9 143 | "142",193.7,35.4,75.6,19.2 144 | "143",220.5,33.2,37.9,20.1 145 | "144",104.6,5.7,34.4,10.4 146 | "145",96.2,14.8,38.9,11.4 147 | "146",140.3,1.9,9,10.3 148 | "147",240.1,7.3,8.7,13.2 149 | "148",243.2,49,44.3,25.4 150 | "149",38,40.3,11.9,10.9 151 | "150",44.7,25.8,20.6,10.1 152 | "151",280.7,13.9,37,16.1 153 | "152",121,8.4,48.7,11.6 154 | "153",197.6,23.3,14.2,16.6 155 | "154",171.3,39.7,37.7,19 156 | "155",187.8,21.1,9.5,15.6 157 | "156",4.1,11.6,5.7,3.2 158 | "157",93.9,43.5,50.5,15.3 159 | "158",149.8,1.3,24.3,10.1 160 | "159",11.7,36.9,45.2,7.3 161 | "160",131.7,18.4,34.6,12.9 162 | "161",172.5,18.1,30.7,14.4 163 | "162",85.7,35.8,49.3,13.3 164 | "163",188.4,18.1,25.6,14.9 165 | "164",163.5,36.8,7.4,18 166 | "165",117.2,14.7,5.4,11.9 167 | "166",234.5,3.4,84.8,11.9 168 | "167",17.9,37.6,21.6,8 169 | "168",206.8,5.2,19.4,12.2 170 | "169",215.4,23.6,57.6,17.1 171 | "170",284.3,10.6,6.4,15 172 | "171",50,11.6,18.4,8.4 173 | "172",164.5,20.9,47.4,14.5 174 | "173",19.6,20.1,17,7.6 175 | "174",168.4,7.1,12.8,11.7 176 | "175",222.4,3.4,13.1,11.5 177 | "176",276.9,48.9,41.8,27 178 | "177",248.4,30.2,20.3,20.2 179 | "178",170.2,7.8,35.2,11.7 180 | "179",276.7,2.3,23.7,11.8 181 | "180",165.6,10,17.6,12.6 182 | "181",156.6,2.6,8.3,10.5 183 | "182",218.5,5.4,27.4,12.2 184 | "183",56.2,5.7,29.7,8.7 185 | "184",287.6,43,71.8,26.2 186 | "185",253.8,21.3,30,17.6 187 | "186",205,45.1,19.6,22.6 188 | "187",139.5,2.1,26.6,10.3 189 | "188",191.1,28.7,18.2,17.3 190 | "189",286,13.9,3.7,15.9 191 | "190",18.7,12.1,23.4,6.7 192 | "191",39.5,41.1,5.8,10.8 193 | "192",75.5,10.8,6,9.9 194 | "193",17.2,4.1,31.6,5.9 195 | "194",166.8,42,3.6,19.6 196 | "195",149.7,35.6,6,17.3 197 | "196",38.2,3.7,13.8,7.6 198 | "197",94.2,4.9,8.1,9.7 199 | "198",177,9.3,6.4,12.8 200 | "199",283.6,42,66.2,25.5 201 | "200",232.1,8.6,8.7,13.4 202 | -------------------------------------------------------------------------------- /DecisionTreeClassification.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyNmlTARULSS42Up02f/RGGx", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 21, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/", 35 | "height": 547 36 | }, 37 | "id": "GpoywBK8fUaG", 38 | "outputId": "1e47a80a-a9ee-414a-9327-f694697492ba" 39 | }, 40 | "outputs": [ 41 | { 42 | "output_type": "stream", 43 | "name": "stdout", 44 | "text": [ 45 | "Collecting delta-spark==2.4.0\n", 46 | " Downloading delta_spark-2.4.0-py3-none-any.whl.metadata (1.9 kB)\n", 47 | "Collecting pyspark<3.5.0,>=3.4.0 (from delta-spark==2.4.0)\n", 48 | " Downloading pyspark-3.4.4.tar.gz (311.4 MB)\n", 49 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.4/311.4 MB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", 50 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 51 | "Requirement already satisfied: importlib-metadata>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from delta-spark==2.4.0) (8.5.0)\n", 52 | "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.11/dist-packages (from importlib-metadata>=1.0.0->delta-spark==2.4.0) (3.21.0)\n", 53 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.11/dist-packages (from pyspark<3.5.0,>=3.4.0->delta-spark==2.4.0) (0.10.9.7)\n", 54 | "Downloading delta_spark-2.4.0-py3-none-any.whl (20 kB)\n", 55 | "Building wheels for collected packages: pyspark\n", 56 | " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 57 | " Created wheel for pyspark: filename=pyspark-3.4.4-py2.py3-none-any.whl size=311905460 sha256=49df0176ba3e140cb0fbacc113713ea216769259423fc3b261a3c15a8a5eac5d\n", 58 | " Stored in directory: /root/.cache/pip/wheels/6b/0a/a1/2b8f5f192c7df9fdceb8e5a62873d64e46b101f980519bcf55\n", 59 | "Successfully built pyspark\n", 60 | "Installing collected packages: pyspark, delta-spark\n", 61 | " Attempting uninstall: pyspark\n", 62 | " Found existing installation: pyspark 3.5.4\n", 63 | " Uninstalling pyspark-3.5.4:\n", 64 | " Successfully uninstalled pyspark-3.5.4\n", 65 | " Attempting uninstall: delta-spark\n", 66 | " Found existing installation: delta-spark 3.3.0\n", 67 | " Uninstalling delta-spark-3.3.0:\n", 68 | " Successfully uninstalled delta-spark-3.3.0\n", 69 | "Successfully installed delta-spark-2.4.0 pyspark-3.4.4\n" 70 | ] 71 | }, 72 | { 73 | "output_type": "display_data", 74 | "data": { 75 | "application/vnd.colab-display-data+json": { 76 | "pip_warning": { 77 | "packages": [ 78 | "delta", 79 | "pyspark" 80 | ] 81 | }, 82 | "id": "28824af9be4a403d8555baebe58b07da" 83 | } 84 | }, 85 | "metadata": {} 86 | } 87 | ], 88 | "source": [ 89 | "#Decision Tree Classification\n", 90 | "!pip install delta-spark==2.4.0\n" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "source": [ 96 | "from delta import *" 97 | ], 98 | "metadata": { 99 | "id": "VIPtM0dQgNpZ" 100 | }, 101 | "execution_count": 19, 102 | "outputs": [] 103 | }, 104 | { 105 | "cell_type": "code", 106 | "source": [ 107 | "from pyspark.sql import SparkSession\n", 108 | "\n", 109 | "# Add Delta Lake package dependency\n", 110 | "spark = SparkSession \\\n", 111 | " .builder \\\n", 112 | " .appName('DT Classification with Pyspark') \\\n", 113 | " .config(\"spark.jars.packages\", \"io.delta:delta-core_2.12:2.4.0\") \\\n", 114 | " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n", 115 | " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n", 116 | " .getOrCreate()\n", 117 | "\n", 118 | "# Read the CSV file using the correct format\n", 119 | "df = spark.read.format('csv').\\\n", 120 | " options(header = 'true', inferschema = 'true').\\\n", 121 | " load(\"/content/WineData.csv\")\n", 122 | "\n", 123 | "df.printSchema()\n", 124 | "df.show(5)" 125 | ], 126 | "metadata": { 127 | "colab": { 128 | "base_uri": "https://localhost:8080/" 129 | }, 130 | "id": "2drqxnp7fdpf", 131 | "outputId": "18b6aa25-3446-422b-d62c-ae5f13fb50fa" 132 | }, 133 | "execution_count": 1, 134 | "outputs": [ 135 | { 136 | "output_type": "stream", 137 | "name": "stdout", 138 | "text": [ 139 | "root\n", 140 | " |-- fixed acidity: double (nullable = true)\n", 141 | " |-- volatile acidity: double (nullable = true)\n", 142 | " |-- citric acid: double (nullable = true)\n", 143 | " |-- residual sugar: double (nullable = true)\n", 144 | " |-- chlorides: double (nullable = true)\n", 145 | " |-- free sulfur dioxide: double (nullable = true)\n", 146 | " |-- total sulfur dioxide: double (nullable = true)\n", 147 | " |-- density: double (nullable = true)\n", 148 | " |-- pH: double (nullable = true)\n", 149 | " |-- sulphates: double (nullable = true)\n", 150 | " |-- alcohol: double (nullable = true)\n", 151 | " |-- quality: integer (nullable = true)\n", 152 | "\n", 153 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n", 154 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n", 155 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n", 156 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n", 157 | "| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| 5|\n", 158 | "| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| 5|\n", 159 | "| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| 6|\n", 160 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n", 161 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n", 162 | "only showing top 5 rows\n", 163 | "\n" 164 | ] 165 | } 166 | ] 167 | }, 168 | { 169 | "cell_type": "code", 170 | "source": [ 171 | "def condition(r):\n", 172 | "\n", 173 | " if (0 <= r <= 4):\n", 174 | " label = 'low'\n", 175 | "\n", 176 | " elif (4 < r <= 6):\n", 177 | " label = 'medium'\n", 178 | "\n", 179 | " else:\n", 180 | " label = 'high'\n", 181 | "\n", 182 | " return label\n", 183 | "\n", 184 | "def string_to_float(x):\n", 185 | " return float(x)" 186 | ], 187 | "metadata": { 188 | "id": "lrxWEoqAik9U" 189 | }, 190 | "execution_count": 2, 191 | "outputs": [] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "source": [ 196 | "from pyspark.sql.functions import udf\n", 197 | "from pyspark.sql.types import StringType, DoubleType\n", 198 | "string_to_float_udf = udf(string_to_float, DoubleType())\n", 199 | "quality_udf = udf(lambda x : condition(x), StringType())\n", 200 | "\n", 201 | "df = df.withColumn(\"quality\", quality_udf(\"quality\"))\n", 202 | "df.show(5)\n", 203 | "df.printSchema()" 204 | ], 205 | "metadata": { 206 | "colab": { 207 | "base_uri": "https://localhost:8080/" 208 | }, 209 | "id": "GBMr3ZBripCP", 210 | "outputId": "3a6fbc8b-20ca-40a3-d3fa-7f263af8ff8e" 211 | }, 212 | "execution_count": 3, 213 | "outputs": [ 214 | { 215 | "output_type": "stream", 216 | "name": "stdout", 217 | "text": [ 218 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n", 219 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n", 220 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n", 221 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n", 222 | "| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| medium|\n", 223 | "| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| medium|\n", 224 | "| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| medium|\n", 225 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n", 226 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n", 227 | "only showing top 5 rows\n", 228 | "\n", 229 | "root\n", 230 | " |-- fixed acidity: double (nullable = true)\n", 231 | " |-- volatile acidity: double (nullable = true)\n", 232 | " |-- citric acid: double (nullable = true)\n", 233 | " |-- residual sugar: double (nullable = true)\n", 234 | " |-- chlorides: double (nullable = true)\n", 235 | " |-- free sulfur dioxide: double (nullable = true)\n", 236 | " |-- total sulfur dioxide: double (nullable = true)\n", 237 | " |-- density: double (nullable = true)\n", 238 | " |-- pH: double (nullable = true)\n", 239 | " |-- sulphates: double (nullable = true)\n", 240 | " |-- alcohol: double (nullable = true)\n", 241 | " |-- quality: string (nullable = true)\n", 242 | "\n" 243 | ] 244 | } 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "source": [ 250 | "from pyspark.ml.linalg import Vectors\n", 251 | "from pyspark.ml import Pipeline\n", 252 | "from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString\n", 253 | "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n", 254 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator" 255 | ], 256 | "metadata": { 257 | "id": "pAOSSQCRivCa" 258 | }, 259 | "execution_count": 4, 260 | "outputs": [] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "source": [ 265 | "def transData(data):\n", 266 | " return data.rdd.map(lambda r : [Vectors.dense(r[:-1]), r[-1]]).toDF(['features', 'label'])\n", 267 | "\n", 268 | "transformed = transData(df)\n", 269 | "transformed.show(5)" 270 | ], 271 | "metadata": { 272 | "colab": { 273 | "base_uri": "https://localhost:8080/" 274 | }, 275 | "id": "qv8tliDciyWn", 276 | "outputId": "77bc7def-09ce-479b-e50d-07861d5c87b3" 277 | }, 278 | "execution_count": 5, 279 | "outputs": [ 280 | { 281 | "output_type": "stream", 282 | "name": "stdout", 283 | "text": [ 284 | "+--------------------+------+\n", 285 | "| features| label|\n", 286 | "+--------------------+------+\n", 287 | "|[7.4,0.7,0.0,1.9,...|medium|\n", 288 | "|[7.8,0.88,0.0,2.6...|medium|\n", 289 | "|[7.8,0.76,0.04,2....|medium|\n", 290 | "|[11.2,0.28,0.56,1...|medium|\n", 291 | "|[7.4,0.7,0.0,1.9,...|medium|\n", 292 | "+--------------------+------+\n", 293 | "only showing top 5 rows\n", 294 | "\n" 295 | ] 296 | } 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "source": [ 302 | "from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString\n", 303 | "labelIndexer = StringIndexer(inputCol = 'label', outputCol = 'indexedLabel').fit(transformed)\n", 304 | "labelIndexer.transform(transformed).show(5, True)\n", 305 | "\n", 306 | "featureIndexer = VectorIndexer(inputCol = 'features', outputCol = 'indexedFeatures', maxCategories = 4).fit(transformed)\n", 307 | "featureIndexer.transform(transformed).show(5)" 308 | ], 309 | "metadata": { 310 | "colab": { 311 | "base_uri": "https://localhost:8080/" 312 | }, 313 | "id": "nB6Kjtawi16g", 314 | "outputId": "492b6d4c-3a53-4eff-b493-bf5341cbdb5b" 315 | }, 316 | "execution_count": 6, 317 | "outputs": [ 318 | { 319 | "output_type": "stream", 320 | "name": "stdout", 321 | "text": [ 322 | "+--------------------+------+------------+\n", 323 | "| features| label|indexedLabel|\n", 324 | "+--------------------+------+------------+\n", 325 | "|[7.4,0.7,0.0,1.9,...|medium| 0.0|\n", 326 | "|[7.8,0.88,0.0,2.6...|medium| 0.0|\n", 327 | "|[7.8,0.76,0.04,2....|medium| 0.0|\n", 328 | "|[11.2,0.28,0.56,1...|medium| 0.0|\n", 329 | "|[7.4,0.7,0.0,1.9,...|medium| 0.0|\n", 330 | "+--------------------+------+------------+\n", 331 | "only showing top 5 rows\n", 332 | "\n", 333 | "+--------------------+------+--------------------+\n", 334 | "| features| label| indexedFeatures|\n", 335 | "+--------------------+------+--------------------+\n", 336 | "|[7.4,0.7,0.0,1.9,...|medium|[7.4,0.7,0.0,1.9,...|\n", 337 | "|[7.8,0.88,0.0,2.6...|medium|[7.8,0.88,0.0,2.6...|\n", 338 | "|[7.8,0.76,0.04,2....|medium|[7.8,0.76,0.04,2....|\n", 339 | "|[11.2,0.28,0.56,1...|medium|[11.2,0.28,0.56,1...|\n", 340 | "|[7.4,0.7,0.0,1.9,...|medium|[7.4,0.7,0.0,1.9,...|\n", 341 | "+--------------------+------+--------------------+\n", 342 | "only showing top 5 rows\n", 343 | "\n" 344 | ] 345 | } 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "source": [ 351 | "(trainingData, testData) = transformed.randomSplit([0.6, 0.4])\n", 352 | "trainingData.show(5)\n", 353 | "testData.show(5)" 354 | ], 355 | "metadata": { 356 | "colab": { 357 | "base_uri": "https://localhost:8080/" 358 | }, 359 | "id": "goV-vM1Ci_l3", 360 | "outputId": "4687e794-cf73-472d-8126-4ef3b769b203" 361 | }, 362 | "execution_count": 7, 363 | "outputs": [ 364 | { 365 | "output_type": "stream", 366 | "name": "stdout", 367 | "text": [ 368 | "+--------------------+------+\n", 369 | "| features| label|\n", 370 | "+--------------------+------+\n", 371 | "|[4.6,0.52,0.15,2....| low|\n", 372 | "|[4.7,0.6,0.17,2.3...|medium|\n", 373 | "|[4.9,0.42,0.0,2.1...| high|\n", 374 | "|[5.0,0.38,0.01,1....|medium|\n", 375 | "|[5.0,0.4,0.5,4.3,...|medium|\n", 376 | "+--------------------+------+\n", 377 | "only showing top 5 rows\n", 378 | "\n", 379 | "+--------------------+------+\n", 380 | "| features| label|\n", 381 | "+--------------------+------+\n", 382 | "|[5.0,0.42,0.24,2....| high|\n", 383 | "|[5.0,1.04,0.24,1....|medium|\n", 384 | "|[5.1,0.42,0.0,1.8...| high|\n", 385 | "|[5.1,0.47,0.02,1....|medium|\n", 386 | "|[5.1,0.51,0.18,2....| high|\n", 387 | "+--------------------+------+\n", 388 | "only showing top 5 rows\n", 389 | "\n" 390 | ] 391 | } 392 | ] 393 | }, 394 | { 395 | "cell_type": "code", 396 | "source": [ 397 | "from pyspark.ml.classification import DecisionTreeClassifier\n", 398 | "\n", 399 | "dTree = DecisionTreeClassifier(labelCol = 'indexedLabel', featuresCol = 'indexedFeatures')" 400 | ], 401 | "metadata": { 402 | "id": "U6haQ1LAjEko" 403 | }, 404 | "execution_count": 8, 405 | "outputs": [] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "source": [ 410 | "#Pipeline Architecture\n", 411 | "\n", 412 | "labelConverter = IndexToString(inputCol = \"prediction\", outputCol = 'predictedLabel', labels = labelIndexer.labels)\n", 413 | "\n", 414 | "pipeline = Pipeline(stages = [labelIndexer, featureIndexer, dTree, labelConverter])\n", 415 | "\n", 416 | "model = pipeline.fit(trainingData)\n", 417 | "\n", 418 | "predictions = model.transform(testData)\n", 419 | "\n", 420 | "predictions.select('features', 'label', \"predictedLabel\").show(5)" 421 | ], 422 | "metadata": { 423 | "colab": { 424 | "base_uri": "https://localhost:8080/" 425 | }, 426 | "id": "eObhaarujHf1", 427 | "outputId": "95a07ba5-7c0f-44ac-cfc5-964829e995f6" 428 | }, 429 | "execution_count": 9, 430 | "outputs": [ 431 | { 432 | "output_type": "stream", 433 | "name": "stdout", 434 | "text": [ 435 | "+--------------------+------+--------------+\n", 436 | "| features| label|predictedLabel|\n", 437 | "+--------------------+------+--------------+\n", 438 | "|[5.0,0.42,0.24,2....| high| medium|\n", 439 | "|[5.0,1.04,0.24,1....|medium| medium|\n", 440 | "|[5.1,0.42,0.0,1.8...| high| high|\n", 441 | "|[5.1,0.47,0.02,1....|medium| medium|\n", 442 | "|[5.1,0.51,0.18,2....| high| high|\n", 443 | "+--------------------+------+--------------+\n", 444 | "only showing top 5 rows\n", 445 | "\n" 446 | ] 447 | } 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "source": [ 453 | "#Evaluation\n", 454 | "\n", 455 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 456 | "\n", 457 | "evaluator = MulticlassClassificationEvaluator(labelCol = 'indexedLabel', predictionCol = 'prediction', metricName = 'accuracy')\n", 458 | "accuracy = evaluator.evaluate(predictions)\n", 459 | "print(\"Test Error = %g\" % (1.0 - accuracy))\n", 460 | "\n", 461 | "rfModel = model.stages[-2]\n", 462 | "print(rfModel)" 463 | ], 464 | "metadata": { 465 | "colab": { 466 | "base_uri": "https://localhost:8080/" 467 | }, 468 | "id": "BYLCy5FtjN9-", 469 | "outputId": "0181a515-4268-4531-805a-926afe0ddb92" 470 | }, 471 | "execution_count": 10, 472 | "outputs": [ 473 | { 474 | "output_type": "stream", 475 | "name": "stdout", 476 | "text": [ 477 | "Test Error = 0.143345\n", 478 | "DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5939946d6e55, depth=5, numNodes=49, numClasses=3, numFeatures=11\n" 479 | ] 480 | } 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "source": [ 486 | "from sklearn.metrics import confusion_matrix\n", 487 | "\n", 488 | "y_true = predictions.select(\"label\")\n", 489 | "y_true = y_true.toPandas()\n", 490 | "\n", 491 | "y_pred = predictions.select(\"predictedLabel\")\n", 492 | "y_pred = y_pred.toPandas()\n", 493 | "\n", 494 | "cnf_matrix = confusion_matrix(y_true, y_pred)\n", 495 | "cnf_matrix" 496 | ], 497 | "metadata": { 498 | "colab": { 499 | "base_uri": "https://localhost:8080/" 500 | }, 501 | "id": "yEc5idwAjSu_", 502 | "outputId": "2ce298f0-0994-4515-9733-17365362ef8e" 503 | }, 504 | "execution_count": 11, 505 | "outputs": [ 506 | { 507 | "output_type": "execute_result", 508 | "data": { 509 | "text/plain": [ 510 | "array([[ 34, 0, 35],\n", 511 | " [ 1, 0, 22],\n", 512 | " [ 26, 0, 468]])" 513 | ] 514 | }, 515 | "metadata": {}, 516 | "execution_count": 11 517 | } 518 | ] 519 | } 520 | ] 521 | } -------------------------------------------------------------------------------- /IBBLojistikWordCount.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "nbformat": 4, 3 | "nbformat_minor": 0, 4 | "metadata": { 5 | "colab": { 6 | "provenance": [], 7 | "authorship_tag": "ABX9TyOHR4+l0bFxeRYwqRPmcehg", 8 | "include_colab_link": true 9 | }, 10 | "kernelspec": { 11 | "name": "python3", 12 | "display_name": "Python 3" 13 | }, 14 | "language_info": { 15 | "name": "python" 16 | } 17 | }, 18 | "cells": [ 19 | { 20 | "cell_type": "markdown", 21 | "metadata": { 22 | "id": "view-in-github", 23 | "colab_type": "text" 24 | }, 25 | "source": [ 26 | "\"Open" 27 | ] 28 | }, 29 | { 30 | "cell_type": "code", 31 | "execution_count": 2, 32 | "metadata": { 33 | "colab": { 34 | "base_uri": "https://localhost:8080/" 35 | }, 36 | "id": "xldbqsBeP6qM", 37 | "outputId": "8bcc146c-ea14-44f3-82eb-16fe559cd26f" 38 | }, 39 | "outputs": [ 40 | { 41 | "output_type": "stream", 42 | "name": "stdout", 43 | "text": [ 44 | "Requirement already satisfied: findspark in /usr/local/lib/python3.11/dist-packages (2.0.1)\n" 45 | ] 46 | } 47 | ], 48 | "source": [ 49 | "!pip install findspark" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "source": [ 55 | "import findspark\n", 56 | "findspark.init()" 57 | ], 58 | "metadata": { 59 | "id": "-JmlMpBkQCzl" 60 | }, 61 | "execution_count": 3, 62 | "outputs": [] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "source": [ 67 | "from pyspark.sql import SparkSession\n", 68 | "from pyspark.conf import SparkConf" 69 | ], 70 | "metadata": { 71 | "id": "iEhUEVSJQEM9" 72 | }, 73 | "execution_count": 4, 74 | "outputs": [] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "source": [ 79 | "spark = SparkSession.builder \\\n", 80 | ".master(\"local[4]\") \\\n", 81 | ".appName(\"giveatry\") \\\n", 82 | ".getOrCreate()" 83 | ], 84 | "metadata": { 85 | "id": "l50Sk6ecQGhB" 86 | }, 87 | "execution_count": 5, 88 | "outputs": [] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "source": [ 93 | "sc = spark.sparkContext" 94 | ], 95 | "metadata": { 96 | "id": "j8OOyvOJQU-c" 97 | }, 98 | "execution_count": 6, 99 | "outputs": [] 100 | }, 101 | { 102 | "cell_type": "code", 103 | "source": [ 104 | "veri_seti = \"/content/ibb_lojistik2.txt\"" 105 | ], 106 | "metadata": { 107 | "id": "dahhlWG5QWBr" 108 | }, 109 | "execution_count": 7, 110 | "outputs": [] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "source": [ 115 | "istac_rdd = sc.textFile(veri_seti)" 116 | ], 117 | "metadata": { 118 | "id": "okuhb24OQtPO" 119 | }, 120 | "execution_count": 8, 121 | "outputs": [] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "source": [ 126 | "istac_rdd.count()" 127 | ], 128 | "metadata": { 129 | "colab": { 130 | "base_uri": "https://localhost:8080/" 131 | }, 132 | "id": "iRBbC6uXQv0V", 133 | "outputId": "f384fb2b-1cc1-4301-8797-181f58d419a0" 134 | }, 135 | "execution_count": 10, 136 | "outputs": [ 137 | { 138 | "output_type": "execute_result", 139 | "data": { 140 | "text/plain": [ 141 | "4" 142 | ] 143 | }, 144 | "metadata": {}, 145 | "execution_count": 10 146 | } 147 | ] 148 | }, 149 | { 150 | "cell_type": "code", 151 | "source": [ 152 | "istac_rdd.take(5)" 153 | ], 154 | "metadata": { 155 | "colab": { 156 | "base_uri": "https://localhost:8080/" 157 | }, 158 | "id": "iZEvmgOaQ0TB", 159 | "outputId": "41fbcf06-9bd5-41ce-ef3c-4201da8d0aa9" 160 | }, 161 | "execution_count": 11, 162 | "outputs": [ 163 | { 164 | "output_type": "execute_result", 165 | "data": { 166 | "text/plain": [ 167 | "[\"Günümüz yaşam koşullarının etkisiyle insan ve yük hareketliliği hızla artmaktadır. Kentsel yaşamdaki günlük ihtiyaçların karşılanması bakımından zorunlu hale gelen yük taşımacılığı, insan yaşamı üzerinde çeşitli sosyal ve çevresel etkilere neden olmaktadır. Özellikle büyük şehirlerde hızlı nüfus artışıyla ortaya çıkan ihtiyaçların, kısa sürelerde giderilmesi çabası ulaştırma hizmetlerini zorlaştırmaktadır. Karayolu yolcu ve yük taşımacılığında, son otuz yılda diğer türler ile yapılan taşımacılıklara kıyasla hızlı bir gelişme olmuş, bu dengesiz gelişmenin sonucu olarak karayolu taşımasının payı her iki taşıma için hızla artarak %90'ın üzerine çıkmıştır. Özellikle İstanbul, sahip olduğu nüfus ve ekonomik potansiyel nedeniyle bu hızlı gelişimden en çok etkilenen illerden biridir. Bu gerekçeyle; şehirdeki yük hareketlerinin oluşturduğu trafik sıkışıklıkları, erişim kısıtlamaları, park sorunları, emisyonlar gibi ekonomik, sosyal ve çevresel etkiler göz önüne alınarak bir kentsel lojistik planlama yapılması ihtiyacı doğmuştur. \",\n", 168 | " 'Lojistik alanında yapılan yatırımlardan en fazla yararın elde edilmesi. Ancak hedef ve amaçların iyi belirlenmesi, Yapılan yatırımlar ile elde edilecek yararların maliyetlerle dengesinin sağlanması, Yatırımları yapacak olan kuruluşların yatırım güçlerinin göz önünde tutulması, Lojistik unsurlar içindeki entegrasyonun sağlanması, Lojistik konusunda geleceğe yönelik taleplerin doğru kestirilebilmesine bağlıdır. Diğer yanda limanlar, demiryolları, karayolları ve bunların tesisleri ile depolama alanları kent coğrafyası içinde yer bulmakta, kent içinde birçok kentsel kullanım ile birlikte yer almaktadır. Diğer kentsel kullanımlarla birlikte kent içinde yer alan lojistik yatırımların diğer kentsel kullanımlara ve kentsel yaşama olumsuz etkilerinin bertaraf edilmesi kapsamlı bir mekânsal planlama ile sağlanabilir. Bu amacın gerçekleşmesi ise ancak kapsamlı bir “Lojistik Ana Planı” ile mümkün görünmektedir. Böyle bir Planın gerektirdiği araştırma, analiz, eylem planı ve değerlendirme raporuyla bu çalışmanın altyapısının oluşturulması hedeflenmektedir. ',\n", 169 | " 'İstanbul Lojistik Ana Planı İşi Kapsamında Araç Sayımına Yönelik Saha Çalışması Yapılarak Veri Alımı Hizmeti İşinin amacı, İstanbul’un lojistik hareketliliğine dair elde edilecek verilerle İSTLAP’a bilimsel altlık oluşturmaktır. Çalışmada Teknik Şartname ile lokasyonları belirlenen lojistik odaklarda, perde-kordonlarda trafik sayımı ve anketler gerçekleştirilmiştir. ',\n", 170 | " 'İstanbul Lojistik Ana Planı İşi Kapsamında Araç Sayımına Yönelik Saha Çalışması Yapılarak Veri Alımı Hizmeti İşinin amacı, İstanbul’un lojistik hareketliliğine dair elde edilecek verilerle İSTLAP’a bilimsel altlık oluşturmaktır. Çalışmada Teknik Şartname ile lokasyonları belirlenen lojistik odaklarda, perde-kordonlarda trafik sayımı ve anketler gerçekleştirilmiştir. 1 Bu şekilde İstanbul’un lojistik altyapısının kente ve kentlilere olan sosyal, ekonomik, politik ve teknolojik etkileri belirlenerek tespit edilen sorunların çözümüne yönelik strateji ve öneriler geliştirilecektir. Kısa, orta ve uzun dönemli oluşturulacak eylem planlarıyla bir yandan sağlıklı mekânsal gelişim sağlanırken diğer yandan İstanbul Ulaşım Ana Planı’na entegre bir gelişim sağlanacaktır. Çalışmada hızlı ve doğru bir veri kümesine ulaşmak için hazırlık aşamasından başlayarak veri teslimine kadar olan süreçler önceden belirlenmiş metodoloji ve iş planına uygun şekilde gerçekleştirilmiştir. Çalışma kapsamında, İstanbul’da yük taşıyan araç trafiği düzeyinin belirlenerek lojistik atama modelinde kullanılmak üzere, İstanbul, Gebze ve Dilovası’nda yer alan belirli lojistik odak ve terminallerin yoğunlaştığı bölgelerin giriş- çıkışlarında, İstanbul trafiğini temsil eden perdekordon hatlarında gerçekleştirilmiştir THIS IS THE NEW CONTENT APPENDED IN THE FILE']" 171 | ] 172 | }, 173 | "metadata": {}, 174 | "execution_count": 11 175 | } 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "source": [ 181 | "istac_rdd_kelimeler = istac_rdd.flatMap(lambda satir : satir.split(\" \"))" 182 | ], 183 | "metadata": { 184 | "id": "baMJl3qoQ-3l" 185 | }, 186 | "execution_count": 12, 187 | "outputs": [] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "source": [ 192 | "istac_rdd_kelimeler.take(20)" 193 | ], 194 | "metadata": { 195 | "colab": { 196 | "base_uri": "https://localhost:8080/" 197 | }, 198 | "id": "za-RrUr2RBCI", 199 | "outputId": "4ccc0bab-b427-4e32-833b-b41a1dac4b4b" 200 | }, 201 | "execution_count": 14, 202 | "outputs": [ 203 | { 204 | "output_type": "execute_result", 205 | "data": { 206 | "text/plain": [ 207 | "['Günümüz',\n", 208 | " 'yaşam',\n", 209 | " 'koşullarının',\n", 210 | " 'etkisiyle',\n", 211 | " 'insan',\n", 212 | " 've',\n", 213 | " 'yük',\n", 214 | " 'hareketliliği',\n", 215 | " 'hızla',\n", 216 | " 'artmaktadır.',\n", 217 | " 'Kentsel',\n", 218 | " 'yaşamdaki',\n", 219 | " 'günlük',\n", 220 | " 'ihtiyaçların',\n", 221 | " 'karşılanması',\n", 222 | " 'bakımından',\n", 223 | " 'zorunlu',\n", 224 | " 'hale',\n", 225 | " 'gelen',\n", 226 | " 'yük']" 227 | ] 228 | }, 229 | "metadata": {}, 230 | "execution_count": 14 231 | } 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "source": [ 237 | "istac_rdd_kelimeler_sayilari = istac_rdd_kelimeler.map(lambda kelime : (kelime, 1))" 238 | ], 239 | "metadata": { 240 | "id": "2sNxw8-xRCn0" 241 | }, 242 | "execution_count": 15, 243 | "outputs": [] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "source": [ 248 | "istac_rdd_kelimeler_sayilari.take(20)" 249 | ], 250 | "metadata": { 251 | "colab": { 252 | "base_uri": "https://localhost:8080/" 253 | }, 254 | "id": "iH3fR2FuRFIe", 255 | "outputId": "0dd3b278-5bfe-4c75-ddc6-d29bdd9081ad" 256 | }, 257 | "execution_count": 16, 258 | "outputs": [ 259 | { 260 | "output_type": "execute_result", 261 | "data": { 262 | "text/plain": [ 263 | "[('Günümüz', 1),\n", 264 | " ('yaşam', 1),\n", 265 | " ('koşullarının', 1),\n", 266 | " ('etkisiyle', 1),\n", 267 | " ('insan', 1),\n", 268 | " ('ve', 1),\n", 269 | " ('yük', 1),\n", 270 | " ('hareketliliği', 1),\n", 271 | " ('hızla', 1),\n", 272 | " ('artmaktadır.', 1),\n", 273 | " ('Kentsel', 1),\n", 274 | " ('yaşamdaki', 1),\n", 275 | " ('günlük', 1),\n", 276 | " ('ihtiyaçların', 1),\n", 277 | " ('karşılanması', 1),\n", 278 | " ('bakımından', 1),\n", 279 | " ('zorunlu', 1),\n", 280 | " ('hale', 1),\n", 281 | " ('gelen', 1),\n", 282 | " ('yük', 1)]" 283 | ] 284 | }, 285 | "metadata": {}, 286 | "execution_count": 16 287 | } 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "source": [ 293 | "istac_rdd_kelimeler_sayilari_reduce = istac_rdd_kelimeler_sayilari.reduceByKey(lambda x, y : (x + y))" 294 | ], 295 | "metadata": { 296 | "id": "imB4GRsGRH5p" 297 | }, 298 | "execution_count": 17, 299 | "outputs": [] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "source": [ 304 | "istac_rdd_kelimeler_sayilari_reduce.take(20)" 305 | ], 306 | "metadata": { 307 | "colab": { 308 | "base_uri": "https://localhost:8080/" 309 | }, 310 | "id": "lP_Xc5AiRKee", 311 | "outputId": "227ffec0-16db-4f70-ca29-7602d1839c60" 312 | }, 313 | "execution_count": 19, 314 | "outputs": [ 315 | { 316 | "output_type": "execute_result", 317 | "data": { 318 | "text/plain": [ 319 | "[('Günümüz', 1),\n", 320 | " ('yaşam', 1),\n", 321 | " ('koşullarının', 1),\n", 322 | " ('etkisiyle', 1),\n", 323 | " ('artmaktadır.', 1),\n", 324 | " ('yaşamdaki', 1),\n", 325 | " ('günlük', 1),\n", 326 | " ('ihtiyaçların', 1),\n", 327 | " ('zorunlu', 1),\n", 328 | " ('gelen', 1),\n", 329 | " ('yaşamı', 1),\n", 330 | " ('sosyal', 2),\n", 331 | " ('çevresel', 2),\n", 332 | " ('nüfus', 2),\n", 333 | " ('ortaya', 1),\n", 334 | " ('çıkan', 1),\n", 335 | " ('ihtiyaçların,', 1),\n", 336 | " ('sürelerde', 1),\n", 337 | " ('giderilmesi', 1),\n", 338 | " ('çabası', 1)]" 339 | ] 340 | }, 341 | "metadata": {}, 342 | "execution_count": 19 343 | } 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "source": [ 349 | "istac_rdd_kelimeler_sayilari_reduce_sort = istac_rdd_kelimeler_sayilari_reduce.map(lambda x: (x[1], x[0]))" 350 | ], 351 | "metadata": { 352 | "id": "uTlupiemRNtr" 353 | }, 354 | "execution_count": 20, 355 | "outputs": [] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "source": [ 360 | "istac_rdd_kelimeler_sayilari_reduce_sort.take(20)" 361 | ], 362 | "metadata": { 363 | "colab": { 364 | "base_uri": "https://localhost:8080/" 365 | }, 366 | "id": "5-vG9o5_RRNk", 367 | "outputId": "9597e334-22f7-4178-d53e-c0f7380b1a98" 368 | }, 369 | "execution_count": 21, 370 | "outputs": [ 371 | { 372 | "output_type": "execute_result", 373 | "data": { 374 | "text/plain": [ 375 | "[(1, 'Günümüz'),\n", 376 | " (1, 'yaşam'),\n", 377 | " (1, 'koşullarının'),\n", 378 | " (1, 'etkisiyle'),\n", 379 | " (1, 'artmaktadır.'),\n", 380 | " (1, 'yaşamdaki'),\n", 381 | " (1, 'günlük'),\n", 382 | " (1, 'ihtiyaçların'),\n", 383 | " (1, 'zorunlu'),\n", 384 | " (1, 'gelen'),\n", 385 | " (1, 'yaşamı'),\n", 386 | " (2, 'sosyal'),\n", 387 | " (2, 'çevresel'),\n", 388 | " (2, 'nüfus'),\n", 389 | " (1, 'ortaya'),\n", 390 | " (1, 'çıkan'),\n", 391 | " (1, 'ihtiyaçların,'),\n", 392 | " (1, 'sürelerde'),\n", 393 | " (1, 'giderilmesi'),\n", 394 | " (1, 'çabası')]" 395 | ] 396 | }, 397 | "metadata": {}, 398 | "execution_count": 21 399 | } 400 | ] 401 | }, 402 | { 403 | "cell_type": "code", 404 | "source": [ 405 | "istac_rdd_kelimeler_sayilari_reduce_sort.sortByKey(False).take(20)" 406 | ], 407 | "metadata": { 408 | "colab": { 409 | "base_uri": "https://localhost:8080/" 410 | }, 411 | "id": "Ixs3sovURVXO", 412 | "outputId": "def8d340-e79f-4774-b14f-c91ad79bd345" 413 | }, 414 | "execution_count": 22, 415 | "outputs": [ 416 | { 417 | "output_type": "execute_result", 418 | "data": { 419 | "text/plain": [ 420 | "[(19, 've'),\n", 421 | " (9, 'lojistik'),\n", 422 | " (8, 'ile'),\n", 423 | " (8, ''),\n", 424 | " (8, 'bir'),\n", 425 | " (5, 'Lojistik'),\n", 426 | " (5, 'yük'),\n", 427 | " (5, 'kentsel'),\n", 428 | " (4, 'elde'),\n", 429 | " (4, 'İstanbul'),\n", 430 | " (4, 'hızlı'),\n", 431 | " (4, 'yer'),\n", 432 | " (4, 'Ana'),\n", 433 | " (3, 'diğer'),\n", 434 | " (3, 'içinde'),\n", 435 | " (3, 'İstanbul’un'),\n", 436 | " (3, 'gerçekleştirilmiştir.'),\n", 437 | " (3, 'bu'),\n", 438 | " (3, 'Bu'),\n", 439 | " (3, 'trafik')]" 440 | ] 441 | }, 442 | "metadata": {}, 443 | "execution_count": 22 444 | } 445 | ] 446 | } 447 | ] 448 | } -------------------------------------------------------------------------------- /loan_sanction_test.csv: -------------------------------------------------------------------------------- 1 | Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area 2 | LP001015,Male,Yes,0,Graduate,No,5720,0,110,360,1,Urban 3 | LP001022,Male,Yes,1,Graduate,No,3076,1500,126,360,1,Urban 4 | LP001031,Male,Yes,2,Graduate,No,5000,1800,208,360,1,Urban 5 | LP001035,Male,Yes,2,Graduate,No,2340,2546,100,360,,Urban 6 | LP001051,Male,No,0,Not Graduate,No,3276,0,78,360,1,Urban 7 | LP001054,Male,Yes,0,Not Graduate,Yes,2165,3422,152,360,1,Urban 8 | LP001055,Female,No,1,Not Graduate,No,2226,0,59,360,1,Semiurban 9 | LP001056,Male,Yes,2,Not Graduate,No,3881,0,147,360,0,Rural 10 | LP001059,Male,Yes,2,Graduate,,13633,0,280,240,1,Urban 11 | LP001067,Male,No,0,Not Graduate,No,2400,2400,123,360,1,Semiurban 12 | LP001078,Male,No,0,Not Graduate,No,3091,0,90,360,1,Urban 13 | LP001082,Male,Yes,1,Graduate,,2185,1516,162,360,1,Semiurban 14 | LP001083,Male,No,3+,Graduate,No,4166,0,40,180,,Urban 15 | LP001094,Male,Yes,2,Graduate,,12173,0,166,360,0,Semiurban 16 | LP001096,Female,No,0,Graduate,No,4666,0,124,360,1,Semiurban 17 | LP001099,Male,No,1,Graduate,No,5667,0,131,360,1,Urban 18 | LP001105,Male,Yes,2,Graduate,No,4583,2916,200,360,1,Urban 19 | LP001107,Male,Yes,3+,Graduate,No,3786,333,126,360,1,Semiurban 20 | LP001108,Male,Yes,0,Graduate,No,9226,7916,300,360,1,Urban 21 | LP001115,Male,No,0,Graduate,No,1300,3470,100,180,1,Semiurban 22 | LP001121,Male,Yes,1,Not Graduate,No,1888,1620,48,360,1,Urban 23 | LP001124,Female,No,3+,Not Graduate,No,2083,0,28,180,1,Urban 24 | LP001128,,No,0,Graduate,No,3909,0,101,360,1,Urban 25 | LP001135,Female,No,0,Not Graduate,No,3765,0,125,360,1,Urban 26 | LP001149,Male,Yes,0,Graduate,No,5400,4380,290,360,1,Urban 27 | LP001153,Male,No,0,Graduate,No,0,24000,148,360,0,Rural 28 | LP001163,Male,Yes,2,Graduate,No,4363,1250,140,360,,Urban 29 | LP001169,Male,Yes,0,Graduate,No,7500,3750,275,360,1,Urban 30 | LP001174,Male,Yes,0,Graduate,No,3772,833,57,360,,Semiurban 31 | LP001176,Male,No,0,Graduate,No,2942,2382,125,180,1,Urban 32 | LP001177,Female,No,0,Not Graduate,No,2478,0,75,360,1,Semiurban 33 | LP001183,Male,Yes,2,Graduate,No,6250,820,192,360,1,Urban 34 | LP001185,Male,No,0,Graduate,No,3268,1683,152,360,1,Semiurban 35 | LP001187,Male,Yes,0,Graduate,No,2783,2708,158,360,1,Urban 36 | LP001190,Male,Yes,0,Graduate,No,2740,1541,101,360,1,Urban 37 | LP001203,Male,No,0,Graduate,No,3150,0,176,360,0,Semiurban 38 | LP001208,Male,Yes,2,Graduate,,7350,4029,185,180,1,Urban 39 | LP001210,Male,Yes,0,Graduate,Yes,2267,2792,90,360,1,Urban 40 | LP001211,Male,No,0,Graduate,Yes,5833,0,116,360,1,Urban 41 | LP001219,Male,No,0,Graduate,No,3643,1963,138,360,1,Urban 42 | LP001220,Male,Yes,0,Graduate,No,5629,818,100,360,1,Urban 43 | LP001221,Female,No,0,Graduate,No,3644,0,110,360,1,Urban 44 | LP001226,Male,Yes,0,Not Graduate,No,1750,2024,90,360,1,Semiurban 45 | LP001230,Male,No,0,Graduate,No,6500,2600,200,360,1,Semiurban 46 | LP001231,Female,No,0,Graduate,No,3666,0,84,360,1,Urban 47 | LP001232,Male,Yes,0,Graduate,No,4260,3900,185,,,Urban 48 | LP001237,Male,Yes,,Not Graduate,No,4163,1475,162,360,1,Urban 49 | LP001242,Male,No,0,Not Graduate,No,2356,1902,108,360,1,Semiurban 50 | LP001268,Male,No,0,Graduate,No,6792,3338,187,,1,Urban 51 | LP001270,Male,Yes,3+,Not Graduate,Yes,8000,250,187,360,1,Semiurban 52 | LP001284,Male,Yes,1,Graduate,No,2419,1707,124,360,1,Urban 53 | LP001287,,Yes,3+,Not Graduate,No,3500,833,120,360,1,Semiurban 54 | LP001291,Male,Yes,1,Graduate,No,3500,3077,160,360,1,Semiurban 55 | LP001298,Male,Yes,2,Graduate,No,4116,1000,30,180,1,Urban 56 | LP001312,Male,Yes,0,Not Graduate,Yes,5293,0,92,360,1,Urban 57 | LP001313,Male,No,0,Graduate,No,2750,0,130,360,0,Urban 58 | LP001317,Female,No,0,Not Graduate,No,4402,0,130,360,1,Rural 59 | LP001321,Male,Yes,2,Graduate,No,3613,3539,134,180,1,Semiurban 60 | LP001323,Female,Yes,2,Graduate,No,2779,3664,176,360,0,Semiurban 61 | LP001324,Male,Yes,3+,Graduate,No,4720,0,90,180,1,Semiurban 62 | LP001332,Male,Yes,0,Not Graduate,No,2415,1721,110,360,1,Semiurban 63 | LP001335,Male,Yes,0,Graduate,Yes,7016,292,125,360,1,Urban 64 | LP001338,Female,No,2,Graduate,No,4968,0,189,360,1,Semiurban 65 | LP001347,Female,No,0,Graduate,No,2101,1500,108,360,0,Rural 66 | LP001348,Male,Yes,3+,Not Graduate,No,4490,0,125,360,1,Urban 67 | LP001351,Male,Yes,0,Graduate,No,2917,3583,138,360,1,Semiurban 68 | LP001352,Male,Yes,0,Not Graduate,No,4700,0,135,360,0,Semiurban 69 | LP001358,Male,Yes,0,Graduate,No,3445,0,130,360,0,Semiurban 70 | LP001359,Male,Yes,0,Graduate,No,7666,0,187,360,1,Semiurban 71 | LP001361,Male,Yes,0,Graduate,No,2458,5105,188,360,0,Rural 72 | LP001366,Female,No,,Graduate,No,3250,0,95,360,1,Semiurban 73 | LP001368,Male,No,0,Graduate,No,4463,0,65,360,1,Semiurban 74 | LP001375,Male,Yes,1,Graduate,,4083,1775,139,60,1,Urban 75 | LP001380,Male,Yes,0,Graduate,Yes,3900,2094,232,360,1,Rural 76 | LP001386,Male,Yes,0,Not Graduate,No,4750,3583,144,360,1,Semiurban 77 | LP001400,Male,No,0,Graduate,No,3583,3435,155,360,1,Urban 78 | LP001407,Male,Yes,0,Graduate,No,3189,2367,186,360,1,Urban 79 | LP001413,Male,No,0,Graduate,Yes,6356,0,50,360,1,Rural 80 | LP001415,Male,Yes,1,Graduate,No,3413,4053,,360,1,Semiurban 81 | LP001419,Female,Yes,0,Graduate,No,7950,0,185,360,1,Urban 82 | LP001420,Male,Yes,3+,Graduate,No,3829,1103,163,360,0,Urban 83 | LP001428,Male,Yes,3+,Graduate,No,72529,0,360,360,1,Urban 84 | LP001445,Male,Yes,2,Not Graduate,No,4136,0,149,480,0,Rural 85 | LP001446,Male,Yes,0,Graduate,No,8449,0,257,360,1,Rural 86 | LP001450,Male,Yes,0,Graduate,No,4456,0,131,180,0,Semiurban 87 | LP001452,Male,Yes,2,Graduate,No,4635,8000,102,180,1,Rural 88 | LP001455,Male,Yes,0,Graduate,No,3571,1917,135,360,1,Urban 89 | LP001466,Male,No,0,Graduate,No,3066,0,95,360,1,Semiurban 90 | LP001471,Male,No,2,Not Graduate,No,3235,2015,77,360,1,Semiurban 91 | LP001472,Female,No,0,Graduate,,5058,0,200,360,1,Rural 92 | LP001475,Male,Yes,0,Graduate,Yes,3188,2286,130,360,,Rural 93 | LP001483,Male,Yes,3+,Graduate,No,13518,0,390,360,1,Rural 94 | LP001486,Male,Yes,1,Graduate,No,4364,2500,185,360,1,Semiurban 95 | LP001490,Male,Yes,2,Not Graduate,No,4766,1646,100,360,1,Semiurban 96 | LP001496,Male,Yes,1,Graduate,No,4609,2333,123,360,0,Semiurban 97 | LP001499,Female,Yes,3+,Graduate,No,6260,0,110,360,1,Semiurban 98 | LP001500,Male,Yes,1,Graduate,No,3333,4200,256,360,1,Urban 99 | LP001501,Male,Yes,0,Graduate,No,3500,3250,140,360,1,Semiurban 100 | LP001517,Male,Yes,3+,Graduate,No,9719,0,61,360,1,Urban 101 | LP001527,Male,Yes,3+,Graduate,No,6835,0,188,360,,Semiurban 102 | LP001534,Male,No,0,Graduate,No,4452,0,131,360,1,Rural 103 | LP001542,Female,Yes,0,Graduate,No,2262,0,,480,0,Semiurban 104 | LP001547,Male,Yes,1,Graduate,No,3901,0,116,360,1,Urban 105 | LP001548,Male,Yes,2,Not Graduate,No,2687,0,50,180,1,Rural 106 | LP001558,Male,No,0,Graduate,No,2243,2233,107,360,,Semiurban 107 | LP001561,Female,Yes,0,Graduate,No,3417,1287,200,360,1,Semiurban 108 | LP001563,,No,0,Graduate,No,1596,1760,119,360,0,Urban 109 | LP001567,Male,Yes,3+,Graduate,No,4513,0,120,360,1,Rural 110 | LP001568,Male,Yes,0,Graduate,No,4500,0,140,360,1,Semiurban 111 | LP001573,Male,Yes,0,Not Graduate,No,4523,1350,165,360,1,Urban 112 | LP001584,Female,No,0,Graduate,Yes,4742,0,108,360,1,Semiurban 113 | LP001587,Male,Yes,,Graduate,No,4082,0,93,360,1,Semiurban 114 | LP001589,Female,No,0,Graduate,No,3417,0,102,360,1,Urban 115 | LP001591,Female,Yes,2,Graduate,No,2922,3396,122,360,1,Semiurban 116 | LP001599,Male,Yes,0,Graduate,No,4167,4754,160,360,1,Rural 117 | LP001601,Male,No,3+,Graduate,No,4243,4123,157,360,,Semiurban 118 | LP001607,Female,No,0,Not Graduate,No,0,1760,180,360,1,Semiurban 119 | LP001611,Male,Yes,1,Graduate,No,1516,2900,80,,0,Rural 120 | LP001613,Female,No,0,Graduate,No,1762,2666,104,360,0,Urban 121 | LP001622,Male,Yes,2,Graduate,No,724,3510,213,360,0,Rural 122 | LP001627,Male,No,0,Graduate,No,3125,0,65,360,1,Urban 123 | LP001650,Male,Yes,0,Graduate,No,2333,3803,146,360,1,Rural 124 | LP001651,Male,Yes,3+,Graduate,No,3350,1560,135,360,1,Urban 125 | LP001652,Male,No,0,Graduate,No,2500,6414,187,360,0,Rural 126 | LP001655,Female,No,0,Graduate,No,12500,0,300,360,0,Urban 127 | LP001660,Male,No,0,Graduate,No,4667,0,120,360,1,Semiurban 128 | LP001662,Male,No,0,Graduate,No,6500,0,71,360,0,Urban 129 | LP001663,Male,Yes,2,Graduate,No,7500,0,225,360,1,Urban 130 | LP001667,Male,No,0,Graduate,No,3073,0,70,180,1,Urban 131 | LP001695,Male,Yes,1,Not Graduate,No,3321,2088,70,,1,Semiurban 132 | LP001703,Male,Yes,0,Graduate,No,3333,1270,124,360,1,Urban 133 | LP001718,Male,No,0,Graduate,No,3391,0,132,360,1,Rural 134 | LP001728,Male,Yes,1,Graduate,Yes,3343,1517,105,360,1,Rural 135 | LP001735,Female,No,1,Graduate,No,3620,0,90,360,1,Urban 136 | LP001737,Male,No,0,Graduate,No,4000,0,83,84,1,Urban 137 | LP001739,Male,Yes,0,Graduate,No,4258,0,125,360,1,Urban 138 | LP001742,Male,Yes,2,Graduate,No,4500,0,147,360,1,Rural 139 | LP001757,Male,Yes,1,Graduate,No,2014,2925,120,360,1,Rural 140 | LP001769,,No,,Graduate,No,3333,1250,110,360,1,Semiurban 141 | LP001771,Female,No,3+,Graduate,No,4083,0,103,360,,Semiurban 142 | LP001785,Male,No,0,Graduate,No,4727,0,150,360,0,Rural 143 | LP001787,Male,Yes,3+,Graduate,No,3089,2999,100,240,1,Rural 144 | LP001789,Male,Yes,3+,Not Graduate,,6794,528,139,360,0,Urban 145 | LP001791,Male,Yes,0,Graduate,Yes,32000,0,550,360,,Semiurban 146 | LP001794,Male,Yes,2,Graduate,Yes,10890,0,260,12,1,Rural 147 | LP001797,Female,No,0,Graduate,No,12941,0,150,300,1,Urban 148 | LP001815,Male,No,0,Not Graduate,No,3276,0,90,360,1,Semiurban 149 | LP001817,Male,No,0,Not Graduate,Yes,8703,0,199,360,0,Rural 150 | LP001818,Male,Yes,1,Graduate,No,4742,717,139,360,1,Semiurban 151 | LP001822,Male,No,0,Graduate,No,5900,0,150,360,1,Urban 152 | LP001827,Male,No,0,Graduate,No,3071,4309,180,360,1,Urban 153 | LP001831,Male,Yes,0,Graduate,No,2783,1456,113,360,1,Urban 154 | LP001842,Male,No,0,Graduate,No,5000,0,148,360,1,Rural 155 | LP001853,Male,Yes,1,Not Graduate,No,2463,2360,117,360,0,Urban 156 | LP001855,Male,Yes,2,Graduate,No,4855,0,72,360,1,Rural 157 | LP001857,Male,No,0,Not Graduate,Yes,1599,2474,125,300,1,Semiurban 158 | LP001862,Male,Yes,2,Graduate,Yes,4246,4246,214,360,1,Urban 159 | LP001867,Male,Yes,0,Graduate,No,4333,2291,133,350,1,Rural 160 | LP001878,Male,No,1,Graduate,No,5823,2529,187,360,1,Semiurban 161 | LP001881,Male,Yes,0,Not Graduate,No,7895,0,143,360,1,Rural 162 | LP001886,Male,No,0,Graduate,No,4150,4256,209,360,1,Rural 163 | LP001906,Male,No,0,Graduate,,2964,0,84,360,0,Semiurban 164 | LP001909,Male,No,0,Graduate,No,5583,0,116,360,1,Urban 165 | LP001911,Female,No,0,Graduate,No,2708,0,65,360,1,Rural 166 | LP001921,Male,No,1,Graduate,No,3180,2370,80,240,,Rural 167 | LP001923,Male,No,0,Not Graduate,No,2268,0,170,360,0,Semiurban 168 | LP001933,Male,No,2,Not Graduate,No,1141,2017,120,360,0,Urban 169 | LP001943,Male,Yes,0,Graduate,No,3042,3167,135,360,1,Urban 170 | LP001950,Female,Yes,3+,Graduate,,1750,2935,94,360,0,Semiurban 171 | LP001959,Female,Yes,1,Graduate,No,3564,0,79,360,1,Rural 172 | LP001961,Female,No,0,Graduate,No,3958,0,110,360,1,Rural 173 | LP001973,Male,Yes,2,Not Graduate,No,4483,0,130,360,1,Rural 174 | LP001975,Male,Yes,0,Graduate,No,5225,0,143,360,1,Rural 175 | LP001979,Male,No,0,Graduate,No,3017,2845,159,180,0,Urban 176 | LP001995,Male,Yes,0,Not Graduate,No,2431,1820,110,360,0,Rural 177 | LP001999,Male,Yes,2,Graduate,,4912,4614,160,360,1,Rural 178 | LP002007,Male,Yes,2,Not Graduate,No,2500,3333,131,360,1,Urban 179 | LP002009,Female,No,0,Graduate,No,2918,0,65,360,,Rural 180 | LP002016,Male,Yes,2,Graduate,No,5128,0,143,360,1,Rural 181 | LP002017,Male,Yes,3+,Graduate,No,15312,0,187,360,,Urban 182 | LP002018,Male,Yes,2,Graduate,No,3958,2632,160,360,1,Semiurban 183 | LP002027,Male,Yes,0,Graduate,No,4334,2945,165,360,1,Semiurban 184 | LP002028,Male,Yes,2,Graduate,No,4358,0,110,360,1,Urban 185 | LP002042,Female,Yes,1,Graduate,No,4000,3917,173,360,1,Rural 186 | LP002045,Male,Yes,3+,Graduate,No,10166,750,150,,1,Urban 187 | LP002046,Male,Yes,0,Not Graduate,No,4483,0,135,360,,Semiurban 188 | LP002047,Male,Yes,2,Not Graduate,No,4521,1184,150,360,1,Semiurban 189 | LP002056,Male,Yes,2,Graduate,No,9167,0,235,360,1,Semiurban 190 | LP002057,Male,Yes,0,Not Graduate,No,13083,0,,360,1,Rural 191 | LP002059,Male,Yes,2,Graduate,No,7874,3967,336,360,1,Rural 192 | LP002062,Female,Yes,1,Graduate,No,4333,0,132,84,1,Rural 193 | LP002064,Male,No,0,Graduate,No,4083,0,96,360,1,Urban 194 | LP002069,Male,Yes,2,Not Graduate,,3785,2912,180,360,0,Rural 195 | LP002070,Male,Yes,3+,Not Graduate,No,2654,1998,128,360,0,Rural 196 | LP002077,Male,Yes,1,Graduate,No,10000,2690,412,360,1,Semiurban 197 | LP002083,Male,No,0,Graduate,Yes,5833,0,116,360,1,Urban 198 | LP002090,Male,Yes,1,Graduate,No,4796,0,114,360,0,Semiurban 199 | LP002096,Male,Yes,0,Not Graduate,No,2000,1600,115,360,1,Rural 200 | LP002099,Male,Yes,2,Graduate,No,2540,700,104,360,0,Urban 201 | LP002102,Male,Yes,0,Graduate,Yes,1900,1442,88,360,1,Rural 202 | LP002105,Male,Yes,0,Graduate,Yes,8706,0,108,480,1,Rural 203 | LP002107,Male,Yes,3+,Not Graduate,No,2855,542,90,360,1,Urban 204 | LP002111,Male,Yes,,Graduate,No,3016,1300,100,360,,Urban 205 | LP002117,Female,Yes,0,Graduate,No,3159,2374,108,360,1,Semiurban 206 | LP002118,Female,No,0,Graduate,No,1937,1152,78,360,1,Semiurban 207 | LP002123,Male,Yes,0,Graduate,No,2613,2417,123,360,1,Semiurban 208 | LP002125,Male,Yes,1,Graduate,No,4960,2600,187,360,1,Semiurban 209 | LP002148,Male,Yes,1,Graduate,No,3074,1083,146,360,1,Semiurban 210 | LP002152,Female,No,0,Graduate,No,4213,0,80,360,1,Urban 211 | LP002165,,No,1,Not Graduate,No,2038,4027,100,360,1,Rural 212 | LP002167,Female,No,0,Graduate,No,2362,0,55,360,1,Urban 213 | LP002168,Male,No,0,Graduate,No,5333,2400,200,360,0,Rural 214 | LP002172,Male,Yes,3+,Graduate,Yes,5384,0,150,360,1,Semiurban 215 | LP002176,Male,No,0,Graduate,No,5708,0,150,360,1,Rural 216 | LP002183,Male,Yes,0,Not Graduate,No,3754,3719,118,,1,Rural 217 | LP002184,Male,Yes,0,Not Graduate,No,2914,2130,150,300,1,Urban 218 | LP002186,Male,Yes,0,Not Graduate,No,2747,2458,118,36,1,Semiurban 219 | LP002192,Male,Yes,0,Graduate,No,7830,2183,212,360,1,Rural 220 | LP002195,Male,Yes,1,Graduate,Yes,3507,3148,212,360,1,Rural 221 | LP002208,Male,Yes,1,Graduate,No,3747,2139,125,360,1,Urban 222 | LP002212,Male,Yes,0,Graduate,No,2166,2166,108,360,,Urban 223 | LP002240,Male,Yes,0,Not Graduate,No,3500,2168,149,360,1,Rural 224 | LP002245,Male,Yes,2,Not Graduate,No,2896,0,80,480,1,Urban 225 | LP002253,Female,No,1,Graduate,No,5062,0,152,300,1,Rural 226 | LP002256,Female,No,2,Graduate,Yes,5184,0,187,360,0,Semiurban 227 | LP002257,Female,No,0,Graduate,No,2545,0,74,360,1,Urban 228 | LP002264,Male,Yes,0,Graduate,No,2553,1768,102,360,1,Urban 229 | LP002270,Male,Yes,1,Graduate,No,3436,3809,100,360,1,Rural 230 | LP002279,Male,No,0,Graduate,No,2412,2755,130,360,1,Rural 231 | LP002286,Male,Yes,3+,Not Graduate,No,5180,0,125,360,0,Urban 232 | LP002294,Male,No,0,Graduate,No,14911,14507,130,360,1,Semiurban 233 | LP002298,,No,0,Graduate,Yes,2860,2988,138,360,1,Urban 234 | LP002306,Male,Yes,0,Graduate,No,1173,1594,28,180,1,Rural 235 | LP002310,Female,No,1,Graduate,No,7600,0,92,360,1,Semiurban 236 | LP002311,Female,Yes,0,Graduate,No,2157,1788,104,360,1,Urban 237 | LP002316,Male,No,0,Graduate,No,2231,2774,176,360,0,Urban 238 | LP002321,Female,No,0,Graduate,No,2274,5211,117,360,0,Semiurban 239 | LP002325,Male,Yes,2,Not Graduate,No,6166,13983,102,360,1,Rural 240 | LP002326,Male,Yes,2,Not Graduate,No,2513,1110,107,360,1,Semiurban 241 | LP002329,Male,No,0,Graduate,No,4333,0,66,480,1,Urban 242 | LP002333,Male,No,0,Not Graduate,No,3844,0,105,360,1,Urban 243 | LP002339,Male,Yes,0,Graduate,No,3887,1517,105,360,0,Semiurban 244 | LP002344,Male,Yes,0,Graduate,No,3510,828,105,360,1,Semiurban 245 | LP002346,Male,Yes,0,Graduate,,2539,1704,125,360,0,Rural 246 | LP002354,Female,No,0,Not Graduate,No,2107,0,64,360,1,Semiurban 247 | LP002355,,Yes,0,Graduate,No,3186,3145,150,180,0,Semiurban 248 | LP002358,Male,Yes,2,Graduate,Yes,5000,2166,150,360,1,Urban 249 | LP002360,Male,Yes,,Graduate,No,10000,0,,360,1,Urban 250 | LP002375,Male,Yes,0,Not Graduate,Yes,3943,0,64,360,1,Semiurban 251 | LP002376,Male,No,0,Graduate,No,2925,0,40,180,1,Rural 252 | LP002383,Male,Yes,3+,Graduate,No,3242,437,142,480,0,Urban 253 | LP002385,Male,Yes,,Graduate,No,3863,0,70,300,1,Semiurban 254 | LP002389,Female,No,1,Graduate,No,4028,0,131,360,1,Semiurban 255 | LP002394,Male,Yes,2,Graduate,No,4010,1025,120,360,1,Urban 256 | LP002397,Female,Yes,1,Graduate,No,3719,1585,114,360,1,Urban 257 | LP002399,Male,No,0,Graduate,,2858,0,123,360,0,Rural 258 | LP002400,Female,Yes,0,Graduate,No,3833,0,92,360,1,Rural 259 | LP002402,Male,Yes,0,Graduate,No,3333,4288,160,360,1,Urban 260 | LP002412,Male,Yes,0,Graduate,No,3007,3725,151,360,1,Rural 261 | LP002415,Female,No,1,Graduate,,1850,4583,81,360,,Rural 262 | LP002417,Male,Yes,3+,Not Graduate,No,2792,2619,171,360,1,Semiurban 263 | LP002420,Male,Yes,0,Graduate,No,2982,1550,110,360,1,Semiurban 264 | LP002425,Male,No,0,Graduate,No,3417,738,100,360,,Rural 265 | LP002433,Male,Yes,1,Graduate,No,18840,0,234,360,1,Rural 266 | LP002440,Male,Yes,2,Graduate,No,2995,1120,184,360,1,Rural 267 | LP002441,Male,No,,Graduate,No,3579,3308,138,360,,Semiurban 268 | LP002442,Female,Yes,1,Not Graduate,No,3835,1400,112,480,0,Urban 269 | LP002445,Female,No,1,Not Graduate,No,3854,3575,117,360,1,Rural 270 | LP002450,Male,Yes,2,Graduate,No,5833,750,49,360,0,Rural 271 | LP002471,Male,No,0,Graduate,No,3508,0,99,360,1,Rural 272 | LP002476,Female,Yes,3+,Not Graduate,No,1635,2444,99,360,1,Urban 273 | LP002482,Female,No,0,Graduate,Yes,3333,3916,212,360,1,Rural 274 | LP002485,Male,No,1,Graduate,No,24797,0,240,360,1,Semiurban 275 | LP002495,Male,Yes,2,Graduate,No,5667,440,130,360,0,Semiurban 276 | LP002496,Female,No,0,Graduate,No,3500,0,94,360,0,Semiurban 277 | LP002523,Male,Yes,3+,Graduate,No,2773,1497,108,360,1,Semiurban 278 | LP002542,Male,Yes,0,Graduate,,6500,0,144,360,1,Urban 279 | LP002550,Female,No,0,Graduate,No,5769,0,110,180,1,Semiurban 280 | LP002551,Male,Yes,3+,Not Graduate,,3634,910,176,360,0,Semiurban 281 | LP002553,,No,0,Graduate,No,29167,0,185,360,1,Semiurban 282 | LP002554,Male,No,0,Graduate,No,2166,2057,122,360,1,Semiurban 283 | LP002561,Male,Yes,0,Graduate,No,5000,0,126,360,1,Rural 284 | LP002566,Female,No,0,Graduate,No,5530,0,135,360,,Urban 285 | LP002568,Male,No,0,Not Graduate,No,9000,0,122,360,1,Rural 286 | LP002570,Female,Yes,2,Graduate,No,10000,11666,460,360,1,Urban 287 | LP002572,Male,Yes,1,Graduate,,8750,0,297,360,1,Urban 288 | LP002581,Male,Yes,0,Not Graduate,No,2157,2730,140,360,,Rural 289 | LP002584,Male,No,0,Graduate,,1972,4347,106,360,1,Rural 290 | LP002592,Male,No,0,Graduate,No,4983,0,141,360,1,Urban 291 | LP002593,Male,Yes,1,Graduate,No,8333,4000,,360,1,Urban 292 | LP002599,Male,Yes,0,Graduate,No,3667,2000,170,360,1,Semiurban 293 | LP002604,Male,Yes,2,Graduate,No,3166,2833,145,360,1,Urban 294 | LP002605,Male,No,0,Not Graduate,No,3271,0,90,360,1,Rural 295 | LP002609,Female,Yes,0,Graduate,No,2241,2000,88,360,0,Urban 296 | LP002610,Male,Yes,1,Not Graduate,,1792,2565,128,360,1,Urban 297 | LP002612,Female,Yes,0,Graduate,No,2666,0,84,480,1,Semiurban 298 | LP002614,,No,0,Graduate,No,6478,0,108,360,1,Semiurban 299 | LP002630,Male,No,0,Not Graduate,,3808,0,83,360,1,Rural 300 | LP002635,Female,Yes,2,Not Graduate,No,3729,0,117,360,1,Semiurban 301 | LP002639,Male,Yes,2,Graduate,No,4120,0,128,360,1,Rural 302 | LP002644,Male,Yes,1,Graduate,Yes,7500,0,75,360,1,Urban 303 | LP002651,Male,Yes,1,Graduate,,6300,0,125,360,0,Urban 304 | LP002654,Female,No,,Graduate,Yes,14987,0,177,360,1,Rural 305 | LP002657,,Yes,1,Not Graduate,Yes,570,2125,68,360,1,Rural 306 | LP002711,Male,Yes,0,Graduate,No,2600,700,96,360,1,Semiurban 307 | LP002712,Male,No,2,Not Graduate,No,2733,1083,180,360,,Semiurban 308 | LP002721,Male,Yes,2,Graduate,Yes,7500,0,183,360,1,Rural 309 | LP002735,Male,Yes,2,Not Graduate,No,3859,0,121,360,1,Rural 310 | LP002744,Male,Yes,1,Graduate,No,6825,0,162,360,1,Rural 311 | LP002745,Male,Yes,0,Graduate,No,3708,4700,132,360,1,Semiurban 312 | LP002746,Male,No,0,Graduate,No,5314,0,147,360,1,Urban 313 | LP002747,Female,No,3+,Graduate,No,2366,5272,153,360,0,Rural 314 | LP002754,Male,No,,Graduate,No,2066,2108,104,84,1,Urban 315 | LP002759,Male,Yes,2,Graduate,No,5000,0,149,360,1,Rural 316 | LP002760,Female,No,0,Graduate,No,3767,0,134,300,1,Urban 317 | LP002766,Female,Yes,0,Graduate,No,7859,879,165,180,1,Semiurban 318 | LP002769,Female,Yes,0,Graduate,No,4283,0,120,360,1,Rural 319 | LP002774,Male,Yes,0,Not Graduate,No,1700,2900,67,360,0,Urban 320 | LP002775,,No,0,Not Graduate,No,4768,0,125,360,1,Rural 321 | LP002781,Male,No,0,Graduate,No,3083,2738,120,360,1,Urban 322 | LP002782,Male,Yes,1,Graduate,No,2667,1542,148,360,1,Rural 323 | LP002786,Female,Yes,0,Not Graduate,No,1647,1762,181,360,1,Urban 324 | LP002790,Male,Yes,3+,Graduate,No,3400,0,80,120,1,Urban 325 | LP002791,Male,No,1,Graduate,,16000,5000,40,360,1,Semiurban 326 | LP002793,Male,Yes,0,Graduate,No,5333,0,90,360,1,Rural 327 | LP002802,Male,No,0,Graduate,No,2875,2416,95,6,0,Semiurban 328 | LP002803,Male,Yes,1,Not Graduate,,2600,618,122,360,1,Semiurban 329 | LP002805,Male,Yes,2,Graduate,No,5041,700,150,360,1,Urban 330 | LP002806,Male,Yes,3+,Graduate,Yes,6958,1411,150,360,1,Rural 331 | LP002816,Male,Yes,1,Graduate,No,3500,1658,104,360,,Semiurban 332 | LP002823,Male,Yes,0,Graduate,No,5509,0,143,360,1,Rural 333 | LP002825,Male,Yes,3+,Graduate,No,9699,0,300,360,1,Urban 334 | LP002826,Female,Yes,1,Not Graduate,No,3621,2717,171,360,1,Urban 335 | LP002843,Female,Yes,0,Graduate,No,4709,0,113,360,1,Semiurban 336 | LP002849,Male,Yes,0,Graduate,No,1516,1951,35,360,1,Semiurban 337 | LP002850,Male,No,2,Graduate,No,2400,0,46,360,1,Urban 338 | LP002853,Female,No,0,Not Graduate,No,3015,2000,145,360,,Urban 339 | LP002856,Male,Yes,0,Graduate,No,2292,1558,119,360,1,Urban 340 | LP002857,Male,Yes,1,Graduate,Yes,2360,3355,87,240,1,Rural 341 | LP002858,Female,No,0,Graduate,No,4333,2333,162,360,0,Rural 342 | LP002860,Male,Yes,0,Graduate,Yes,2623,4831,122,180,1,Semiurban 343 | LP002867,Male,No,0,Graduate,Yes,3972,4275,187,360,1,Rural 344 | LP002869,Male,Yes,3+,Not Graduate,No,3522,0,81,180,1,Rural 345 | LP002870,Male,Yes,1,Graduate,No,4700,0,80,360,1,Urban 346 | LP002876,Male,No,0,Graduate,No,6858,0,176,360,1,Rural 347 | LP002878,Male,Yes,3+,Graduate,No,8334,0,260,360,1,Urban 348 | LP002879,Male,Yes,0,Graduate,No,3391,1966,133,360,0,Rural 349 | LP002885,Male,No,0,Not Graduate,No,2868,0,70,360,1,Urban 350 | LP002890,Male,Yes,2,Not Graduate,No,3418,1380,135,360,1,Urban 351 | LP002891,Male,Yes,0,Graduate,Yes,2500,296,137,300,1,Rural 352 | LP002899,Male,Yes,2,Graduate,No,8667,0,254,360,1,Rural 353 | LP002901,Male,No,0,Graduate,No,2283,15000,106,360,,Rural 354 | LP002907,Male,Yes,0,Graduate,No,5817,910,109,360,1,Urban 355 | LP002920,Male,Yes,0,Graduate,No,5119,3769,120,360,1,Rural 356 | LP002921,Male,Yes,3+,Not Graduate,No,5316,187,158,180,0,Semiurban 357 | LP002932,Male,Yes,3+,Graduate,No,7603,1213,197,360,1,Urban 358 | LP002935,Male,Yes,1,Graduate,No,3791,1936,85,360,1,Urban 359 | LP002952,Male,No,0,Graduate,No,2500,0,60,360,1,Urban 360 | LP002954,Male,Yes,2,Not Graduate,No,3132,0,76,360,,Rural 361 | LP002962,Male,No,0,Graduate,No,4000,2667,152,360,1,Semiurban 362 | LP002965,Female,Yes,0,Graduate,No,8550,4255,96,360,,Urban 363 | LP002969,Male,Yes,1,Graduate,No,2269,2167,99,360,1,Semiurban 364 | LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113,360,1,Urban 365 | LP002975,Male,Yes,0,Graduate,No,4158,709,115,360,1,Urban 366 | LP002980,Male,No,0,Graduate,No,3250,1993,126,360,,Semiurban 367 | LP002986,Male,Yes,0,Graduate,No,5000,2393,158,360,1,Rural 368 | LP002989,Male,No,0,Graduate,Yes,9200,0,98,180,1,Rural -------------------------------------------------------------------------------- /sample.txt: -------------------------------------------------------------------------------- 1 | 24 29 88 2 | 1 0 8 3 | 33 7 99 4 | 39 11 98 5 | 22 76 87 -------------------------------------------------------------------------------- /sample0.txt: -------------------------------------------------------------------------------- 1 | Dogu Turkey Football 2 | John USA Hockey 3 | Paul Canada Basketball --------------------------------------------------------------------------------