├── 01062023 - DEB.ipynb
├── 02062023 - DEB.ipynb
├── 04052023 - DEB.ipynb
├── 04072023 - DEB.ipynb
├── 05052023 - DEB.ipynb
├── 06062023 - DEB.ipynb
├── 06072023 - DEB.ipynb
├── 07072023 - DEB.ipynb
├── 08062023 - DEB.ipynb
├── 09062023 - DEB.ipynb
├── 11072023 DEB v2.ipynb
├── 13072023 - DEB.ipynb
├── 14072023 - DEB.ipynb
├── 16052023 - DEB.ipynb
├── 18052023 - DEB.ipynb
├── 19042023 - DEB.ipynb
├── 23052023 - DEB (1).ipynb
├── 25042023 - DEB.ipynb
├── 25052023 - DEB.ipynb
├── 25062023 - DEB.ipynb
├── 26052023 - DEB.ipynb
├── 27042023 - DEB.ipynb
├── 28042023 - DEB.ipynb
├── 30052023 - DEB.ipynb
├── Advertising.csv
├── DecisionTreeClassification.ipynb
├── IBBLojistikWordCount.ipynb
├── KMeansClustering.ipynb
├── LineerRegression_Albaraka.ipynb
├── WineData.csv
├── loan_sanction_test.csv
├── loan_sanction_train.csv
├── sample.txt
└── sample0.txt
/04072023 - DEB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "b1d043ea-8c32-4e77-9859-30369a3637c8",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [
19 | {
20 | "output_type": "stream",
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "root\n |-- firstname: string (nullable = true)\n |-- middlename: string (nullable = true)\n |-- lastname: string (nullable = true)\n |-- id: string (nullable = true)\n |-- gender: string (nullable = true)\n |-- salary: integer (nullable = true)\n\n+---------+----------+--------+-----+------+------+\n|firstname|middlename|lastname|id |gender|salary|\n+---------+----------+--------+-----+------+------+\n|James | |William |36636|M |3000 |\n|Michael |Smith | |40288|M |4000 |\n|Robert | |Dawson |42114|M |4000 |\n|Maria | |Jones |39192|F |4000 |\n+---------+----------+--------+-----+------+------+\n\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "import pyspark \n",
30 | "from pyspark.sql import SparkSession\n",
31 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n",
32 | "\n",
33 | "spark = SparkSession.builder \\\n",
34 | " .master(\"local[1]\") \\\n",
35 | " .appName('ProjectFirst') \\\n",
36 | " .getOrCreate()\n",
37 | "\n",
38 | "data = [(\"James\", \"\", \"William\", \"36636\", \"M\", 3000), (\"Michael\", \"Smith\", \"\", \"40288\", \"M\", 4000), (\"Robert\", \"\", \"Dawson\", \"42114\", \"M\", 4000), \n",
39 | " (\"Maria\", \"\", \"Jones\", \"39192\", \"F\", 4000)]\n",
40 | "\n",
41 | "schema = StructType([\n",
42 | " StructField(\"firstname\", StringType(), True),\\\n",
43 | " StructField(\"middlename\", StringType(), True),\\\n",
44 | " StructField(\"lastname\", StringType(), True),\\\n",
45 | " StructField(\"id\", StringType(), True),\\\n",
46 | " StructField(\"gender\", StringType(), True),\\\n",
47 | " StructField(\"salary\", IntegerType(), True)\\\n",
48 | " ])\n",
49 | "\n",
50 | "df = spark.createDataFrame(data = data, schema = schema)\n",
51 | "df.printSchema()\n",
52 | "df.show(truncate = False)"
53 | ]
54 | },
55 | {
56 | "cell_type": "code",
57 | "execution_count": 0,
58 | "metadata": {
59 | "application/vnd.databricks.v1+cell": {
60 | "cellMetadata": {
61 | "byteLimit": 2048000,
62 | "rowLimit": 10000
63 | },
64 | "inputWidgets": {},
65 | "nuid": "394b5bc2-e1d4-4771-91d4-d8f24ee39c57",
66 | "showTitle": false,
67 | "title": ""
68 | }
69 | },
70 | "outputs": [
71 | {
72 | "output_type": "stream",
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "root\n |-- employee_name: string (nullable = true)\n |-- department: string (nullable = true)\n |-- salary: long (nullable = true)\n\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|James |Sales |3000 |\n|Michael |Sales |4600 |\n|Robert |Sales |4100 |\n|Maria |Finance |3000 |\n|James |Sales |3000 |\n|Scott |Finance |3300 |\n|Jen |Finance |3900 |\n|Jeff |Marketing |3000 |\n|Kumar |Marketing |2000 |\n|Dogu |Sales |4100 |\n+-------------+----------+------+\n\n"
77 | ]
78 | }
79 | ],
80 | "source": [
81 | "import pyspark \n",
82 | "from pyspark.sql import SparkSession\n",
83 | "from pyspark.sql.functions import expr\n",
84 | "\n",
85 | "spark = SparkSession.builder \\\n",
86 | " .master(\"local[1]\") \\\n",
87 | " .appName('ProjectSecond') \\\n",
88 | " .getOrCreate()\n",
89 | "\n",
90 | "data = [(\"James\", \"Sales\", 3000),\\\n",
91 | " (\"Michael\", \"Sales\", 4600),\\\n",
92 | " (\"Robert\", \"Sales\", 4100),\\\n",
93 | " (\"Maria\", \"Finance\", 3000),\\\n",
94 | " (\"James\", \"Sales\", 3000),\\\n",
95 | " (\"Scott\", \"Finance\", 3300),\\\n",
96 | " (\"Jen\", \"Finance\", 3900),\\\n",
97 | " (\"Jeff\", \"Marketing\", 3000),\\\n",
98 | " (\"Kumar\", \"Marketing\", 2000),\\\n",
99 | " (\"Dogu\", \"Sales\", 4100)]\n",
100 | "\n",
101 | "column = [\"employee_name\", \"department\", \"salary\"]\n",
102 | "df = spark.createDataFrame(data = data, schema = column)\n",
103 | "df.printSchema()\n",
104 | "df.show(truncate = False)"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 0,
110 | "metadata": {
111 | "application/vnd.databricks.v1+cell": {
112 | "cellMetadata": {
113 | "byteLimit": 2048000,
114 | "rowLimit": 10000
115 | },
116 | "inputWidgets": {},
117 | "nuid": "033c7a6b-db54-4457-8762-51fd44b788e5",
118 | "showTitle": false,
119 | "title": ""
120 | }
121 | },
122 | "outputs": [
123 | {
124 | "output_type": "stream",
125 | "name": "stdout",
126 | "output_type": "stream",
127 | "text": [
128 | "Distinct Count: 9\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|Michael |Sales |4600 |\n|James |Sales |3000 |\n|Robert |Sales |4100 |\n|Maria |Finance |3000 |\n|Jen |Finance |3900 |\n|Scott |Finance |3300 |\n|Kumar |Marketing |2000 |\n|Jeff |Marketing |3000 |\n|Dogu |Sales |4100 |\n+-------------+----------+------+\n\nDistinct Count: 9\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|Michael |Sales |4600 |\n|James |Sales |3000 |\n|Robert |Sales |4100 |\n|Maria |Finance |3000 |\n|Jen |Finance |3900 |\n|Scott |Finance |3300 |\n|Kumar |Marketing |2000 |\n|Jeff |Marketing |3000 |\n|Dogu |Sales |4100 |\n+-------------+----------+------+\n\nDistinct Count: 8\n+-------------+----------+------+\n|employee_name|department|salary|\n+-------------+----------+------+\n|Maria |Finance |3000 |\n|Scott |Finance |3300 |\n|Jen |Finance |3900 |\n|Kumar |Marketing |2000 |\n|Jeff |Marketing |3000 |\n|James |Sales |3000 |\n|Robert |Sales |4100 |\n|Michael |Sales |4600 |\n+-------------+----------+------+\n\n"
129 | ]
130 | }
131 | ],
132 | "source": [
133 | "#Distinct\n",
134 | "distinctDF = df.distinct()\n",
135 | "print(\"Distinct Count: \" + str(distinctDF.count()))\n",
136 | "distinctDF.show(truncate = False)\n",
137 | "\n",
138 | "#Drop Duplicates\n",
139 | "df2 = df.dropDuplicates()\n",
140 | "print(\"Distinct Count: \" + str(df2.count()))\n",
141 | "df2.show(truncate = False)\n",
142 | "\n",
143 | "dropDisDF = df.dropDuplicates([\"department\", \"salary\"])\n",
144 | "print(\"Distinct Count: \" + str(dropDisDF.count()))\n",
145 | "dropDisDF.show(truncate = False)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 0,
151 | "metadata": {
152 | "application/vnd.databricks.v1+cell": {
153 | "cellMetadata": {
154 | "byteLimit": 2048000,
155 | "rowLimit": 10000
156 | },
157 | "inputWidgets": {},
158 | "nuid": "c0f829d6-bfd6-44da-b5a9-b29bafe573d4",
159 | "showTitle": false,
160 | "title": ""
161 | }
162 | },
163 | "outputs": [
164 | {
165 | "output_type": "stream",
166 | "name": "stdout",
167 | "output_type": "stream",
168 | "text": [
169 | "root\n |-- firstname: string (nullable = true)\n |-- middlename: string (nullable = true)\n |-- lastname: string (nullable = true)\n |-- id: string (nullable = true)\n |-- gender: string (nullable = true)\n |-- salary: integer (nullable = true)\n\n+---------+----------+--------+-----+------+------+\n|firstname|middlename|lastname|id |gender|salary|\n+---------+----------+--------+-----+------+------+\n|James | |William |36636|M |3000 |\n|Michael |Smith | |40288|M |4000 |\n|Robert | |Dawson |42114|M |4000 |\n|Maria | |Jones |39192|F |4000 |\n+---------+----------+--------+-----+------+------+\n\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "import pyspark \n",
175 | "from pyspark.sql import SparkSession\n",
176 | "from pyspark.sql.types import StructType, StructField, StringType, IntegerType\n",
177 | "\n",
178 | "spark = SparkSession.builder \\\n",
179 | " .master(\"local[1]\") \\\n",
180 | " .appName('ProjectFirst') \\\n",
181 | " .getOrCreate()\n",
182 | "\n",
183 | "data = [(\"James\", \"\", \"William\", \"36636\", \"M\", 3000), (\"Michael\", \"Smith\", \"\", \"40288\", \"M\", 4000), (\"Robert\", \"\", \"Dawson\", \"42114\", \"M\", 4000), \n",
184 | " (\"Maria\", \"\", \"Jones\", \"39192\", \"F\", 4000)]\n",
185 | "\n",
186 | "schema = StructType([\n",
187 | " StructField(\"firstname\", StringType(), True),\\\n",
188 | " StructField(\"middlename\", StringType(), True),\\\n",
189 | " StructField(\"lastname\", StringType(), True),\\\n",
190 | " StructField(\"id\", StringType(), True),\\\n",
191 | " StructField(\"gender\", StringType(), True),\\\n",
192 | " StructField(\"salary\", IntegerType(), True)\\\n",
193 | " ])\n",
194 | "\n",
195 | "df = spark.createDataFrame(data = data, schema = schema)\n",
196 | "df.printSchema()\n",
197 | "df.show(truncate = False)"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": 0,
203 | "metadata": {
204 | "application/vnd.databricks.v1+cell": {
205 | "cellMetadata": {
206 | "byteLimit": 2048000,
207 | "rowLimit": 10000
208 | },
209 | "inputWidgets": {},
210 | "nuid": "e22f9b94-b38d-4f4f-9020-4d4284479b3b",
211 | "showTitle": false,
212 | "title": ""
213 | }
214 | },
215 | "outputs": [
216 | {
217 | "output_type": "stream",
218 | "name": "stdout",
219 | "output_type": "stream",
220 | "text": [
221 | " firstname middlename lastname id gender salary\n0 James William 36636 M 3000\n1 Michael Smith 40288 M 4000\n2 Robert Dawson 42114 M 4000\n3 Maria Jones 39192 F 4000\n"
222 | ]
223 | }
224 | ],
225 | "source": [
226 | "PandasDF = df.toPandas()\n",
227 | "print(PandasDF)"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": 0,
233 | "metadata": {
234 | "application/vnd.databricks.v1+cell": {
235 | "cellMetadata": {
236 | "byteLimit": 2048000,
237 | "rowLimit": 10000
238 | },
239 | "inputWidgets": {},
240 | "nuid": "ca514ad4-b095-4345-b589-31a0d3eda120",
241 | "showTitle": false,
242 | "title": ""
243 | }
244 | },
245 | "outputs": [
246 | {
247 | "output_type": "stream",
248 | "name": "stdout",
249 | "output_type": "stream",
250 | "text": [
251 | "root\n |-- Product: string (nullable = true)\n |-- Amount: long (nullable = true)\n |-- Country: string (nullable = true)\n\n+-------+------+-------+\n|Product|Amount|Country|\n+-------+------+-------+\n|Banana |1000 |USA |\n|Carrots|1500 |USA |\n|Beans |1600 |USA |\n|Orange |2000 |USA |\n|Orange |2000 |USA |\n|Banana |4000 |China |\n|Carrots|1200 |China |\n|Beans |1500 |China |\n|Orange |4000 |China |\n|Banana |2000 |Canada |\n|Carrots|2000 |Canada |\n|Beans |2000 |Mexico |\n+-------+------+-------+\n\n"
252 | ]
253 | }
254 | ],
255 | "source": [
256 | "import pyspark \n",
257 | "from pyspark.sql import SparkSession\n",
258 | "from pyspark.sql.functions import expr\n",
259 | "\n",
260 | "data = [(\"Banana\", 1000, \"USA\"), (\"Carrots\", 1500, \"USA\"), (\"Beans\", 1600, \"USA\"),\\\n",
261 | " (\"Orange\", 2000, \"USA\"), (\"Orange\", 2000, \"USA\"), (\"Banana\", 4000, \"China\"),\\\n",
262 | " (\"Carrots\", 1200, \"China\"), (\"Beans\", 1500, \"China\"), (\"Orange\", 4000, \"China\"),\\\n",
263 | " (\"Banana\", 2000, \"Canada\"), (\"Carrots\", 2000, \"Canada\"), (\"Beans\", 2000, \"Mexico\")\\\n",
264 | " ]\n",
265 | "\n",
266 | "columns = ['Product', 'Amount', 'Country']\n",
267 | "\n",
268 | "df = spark.createDataFrame(data = data, schema = columns)\n",
269 | "df.printSchema()\n",
270 | "df.show(truncate = False)"
271 | ]
272 | },
273 | {
274 | "cell_type": "code",
275 | "execution_count": 0,
276 | "metadata": {
277 | "application/vnd.databricks.v1+cell": {
278 | "cellMetadata": {
279 | "byteLimit": 2048000,
280 | "rowLimit": 10000
281 | },
282 | "inputWidgets": {},
283 | "nuid": "d4afd5bd-c8ca-4af8-bb36-55aac49cd161",
284 | "showTitle": false,
285 | "title": ""
286 | }
287 | },
288 | "outputs": [
289 | {
290 | "output_type": "stream",
291 | "name": "stdout",
292 | "output_type": "stream",
293 | "text": [
294 | "root\n |-- Product: string (nullable = true)\n |-- Canada: long (nullable = true)\n |-- China: long (nullable = true)\n |-- Mexico: long (nullable = true)\n |-- USA: long (nullable = true)\n\n+-------+------+-----+------+----+\n|Product|Canada|China|Mexico|USA |\n+-------+------+-----+------+----+\n|Orange |null |4000 |null |4000|\n|Beans |null |1500 |2000 |1600|\n|Banana |2000 |4000 |null |1000|\n|Carrots|2000 |1200 |null |1500|\n+-------+------+-----+------+----+\n\n"
295 | ]
296 | }
297 | ],
298 | "source": [
299 | "pivotDF = df.groupBy(\"Product\").pivot(\"Country\").sum(\"Amount\")\n",
300 | "pivotDF.printSchema()\n",
301 | "pivotDF.show(truncate = False)"
302 | ]
303 | },
304 | {
305 | "cell_type": "code",
306 | "execution_count": 0,
307 | "metadata": {
308 | "application/vnd.databricks.v1+cell": {
309 | "cellMetadata": {
310 | "byteLimit": 2048000,
311 | "rowLimit": 10000
312 | },
313 | "inputWidgets": {},
314 | "nuid": "5c264134-47b5-4a93-ab93-370312c9d58e",
315 | "showTitle": false,
316 | "title": ""
317 | }
318 | },
319 | "outputs": [
320 | {
321 | "output_type": "stream",
322 | "name": "stdout",
323 | "output_type": "stream",
324 | "text": [
325 | "+-----+-----+---------+-----+\n| TV|Radio|Newspaper|Sales|\n+-----+-----+---------+-----+\n|230.1| 37.8| 69.2| 22.1|\n| 44.5| 39.3| 45.1| 10.4|\n| 17.2| 45.9| 69.3| 9.3|\n|151.5| 41.3| 58.5| 18.5|\n|180.8| 10.8| 58.4| 12.9|\n+-----+-----+---------+-----+\nonly showing top 5 rows\n\nroot\n |-- TV: double (nullable = true)\n |-- Radio: double (nullable = true)\n |-- Newspaper: double (nullable = true)\n |-- Sales: double (nullable = true)\n\n"
326 | ]
327 | }
328 | ],
329 | "source": [
330 | "from pyspark.sql import SparkSession\n",
331 | "\n",
332 | "spark = SparkSession.builder \\\n",
333 | " .master(\"local[1]\") \\\n",
334 | " .appName('ProjectThird') \\\n",
335 | " .getOrCreate()\n",
336 | "\n",
337 | "df = spark.read.format('delta') \\\n",
338 | " .options(header = 'True', inferschema = 'True')\\\n",
339 | " .load(\"/user/hive/warehouse/advertising\", header = True)\n",
340 | "\n",
341 | "df.show(5)\n",
342 | "df.printSchema()"
343 | ]
344 | },
345 | {
346 | "cell_type": "code",
347 | "execution_count": 0,
348 | "metadata": {
349 | "application/vnd.databricks.v1+cell": {
350 | "cellMetadata": {
351 | "byteLimit": 2048000,
352 | "rowLimit": 10000
353 | },
354 | "inputWidgets": {},
355 | "nuid": "53ba210d-4f66-44d6-a595-547c62e58bee",
356 | "showTitle": false,
357 | "title": ""
358 | }
359 | },
360 | "outputs": [
361 | {
362 | "output_type": "stream",
363 | "name": "stdout",
364 | "output_type": "stream",
365 | "text": [
366 | "+----+----+----+-----+\n|col1|col2|col3| col4|\n+----+----+----+-----+\n| 1| 2| 3|a b c|\n| 4| 5| 6|d e f|\n| 7| 8| 9|g h i|\n+----+----+----+-----+\n\n"
367 | ]
368 | }
369 | ],
370 | "source": [
371 | "#Create an RDD with parallelize and convert it to a DataFrame\n",
372 | "\n",
373 | "from pyspark.sql import SparkSession\n",
374 | "\n",
375 | "spark = SparkSession.builder \\\n",
376 | " .master(\"local[1]\") \\\n",
377 | " .appName('ProjectRDDCreation') \\\n",
378 | " .getOrCreate()\n",
379 | "\n",
380 | "df = spark.sparkContext.parallelize([(1,2,3, 'a b c'), (4,5,6, 'd e f'), (7,8,9, 'g h i')]).toDF(['col1', 'col2', 'col3', 'col4'])\n",
381 | "df.show()"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 0,
387 | "metadata": {
388 | "application/vnd.databricks.v1+cell": {
389 | "cellMetadata": {},
390 | "inputWidgets": {},
391 | "nuid": "fea6c861-1141-46bb-8284-f26f9b0aff59",
392 | "showTitle": false,
393 | "title": ""
394 | }
395 | },
396 | "outputs": [],
397 | "source": [
398 | "#Transformations & Actions"
399 | ]
400 | }
401 | ],
402 | "metadata": {
403 | "application/vnd.databricks.v1+notebook": {
404 | "dashboards": [],
405 | "language": "python",
406 | "notebookMetadata": {
407 | "pythonIndentUnit": 4
408 | },
409 | "notebookName": "04072023 - DEB",
410 | "widgets": {}
411 | }
412 | },
413 | "nbformat": 4,
414 | "nbformat_minor": 0
415 | }
416 |
--------------------------------------------------------------------------------
/06072023 - DEB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "2368cd69-5209-43e5-b5f5-d2946a9ed25b",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [
19 | {
20 | "output_type": "stream",
21 | "name": "stdout",
22 | "output_type": "stream",
23 | "text": [
24 | "root\n |-- TV: double (nullable = true)\n |-- Radio: double (nullable = true)\n |-- Newspaper: double (nullable = true)\n |-- Sales: double (nullable = true)\n\n+-----+-----+---------+-----+\n| TV|Radio|Newspaper|Sales|\n+-----+-----+---------+-----+\n|230.1| 37.8| 69.2| 22.1|\n| 44.5| 39.3| 45.1| 10.4|\n| 17.2| 45.9| 69.3| 9.3|\n|151.5| 41.3| 58.5| 18.5|\n|180.8| 10.8| 58.4| 12.9|\n+-----+-----+---------+-----+\nonly showing top 5 rows\n\n"
25 | ]
26 | }
27 | ],
28 | "source": [
29 | "#y = b0 + b1*x1 + b2*x2 + b3*x3 (x1..x3 = TV, Radio, Newspaper; y = Sales)\n",
30 | "\n",
31 | "from pyspark.sql import SparkSession\n",
32 | "\n",
33 | "spark = SparkSession \\\n",
34 | " .builder \\\n",
35 | " .appName('Linear Regression with PySpark') \\\n",
36 | " .getOrCreate()\n",
37 | "\n",
38 | "df = spark.read.format('delta').\\\n",
39 | " options(header = 'true', inferschema = 'true').\\\n",
40 | " load(\"/user/hive/warehouse/advertising\", header = True)\n",
41 | "\n",
42 | "df.printSchema()\n",
43 | "df.show(5)"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 0,
49 | "metadata": {
50 | "application/vnd.databricks.v1+cell": {
51 | "cellMetadata": {
52 | "byteLimit": 2048000,
53 | "rowLimit": 10000
54 | },
55 | "inputWidgets": {},
56 | "nuid": "2df871da-5142-42e4-8234-49ddae8ebb4c",
57 | "showTitle": false,
58 | "title": ""
59 | }
60 | },
61 | "outputs": [
62 | {
63 | "output_type": "stream",
64 | "name": "stdout",
65 | "output_type": "stream",
66 | "text": [
67 | "+-------+-----------------+------------------+------------------+------------------+\n|summary| TV| Radio| Newspaper| Sales|\n+-------+-----------------+------------------+------------------+------------------+\n| count| 200| 200| 200| 200|\n| mean| 147.0425|23.264000000000024|30.553999999999995|14.022500000000003|\n| stddev|85.85423631490805|14.846809176168728| 21.77862083852283| 5.217456565710477|\n| min| 0.7| 0.0| 0.3| 1.6|\n| max| 296.4| 49.6| 114.0| 27.0|\n+-------+-----------------+------------------+------------------+------------------+\n\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "df.describe().show()"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": 0,
78 | "metadata": {
79 | "application/vnd.databricks.v1+cell": {
80 | "cellMetadata": {
81 | "byteLimit": 2048000,
82 | "rowLimit": 10000
83 | },
84 | "inputWidgets": {},
85 | "nuid": "3510b437-2a07-4830-8133-c8069353684f",
86 | "showTitle": false,
87 | "title": ""
88 | }
89 | },
90 | "outputs": [
91 | {
92 | "output_type": "stream",
93 | "name": "stdout",
94 | "output_type": "stream",
95 | "text": [
96 | "+-----------------+-----+\n| features|label|\n+-----------------+-----+\n|[230.1,37.8,69.2]| 22.1|\n| [44.5,39.3,45.1]| 10.4|\n| [17.2,45.9,69.3]| 9.3|\n|[151.5,41.3,58.5]| 18.5|\n|[180.8,10.8,58.4]| 12.9|\n+-----------------+-----+\nonly showing top 5 rows\n\n"
97 | ]
98 | }
99 | ],
100 | "source": [
101 | "from pyspark.sql import Row\n",
102 | "from pyspark.ml.linalg import Vectors\n",
103 | "\n",
104 | "def transData(data):\n",
105 | " return data.rdd.map(lambda r : [Vectors.dense(r[:-1]), r[-1]]).toDF(['features', 'label'])\n",
106 | "\n",
107 | "transformed = transData(df)\n",
108 | "transformed.show(5)"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": 0,
114 | "metadata": {
115 | "application/vnd.databricks.v1+cell": {
116 | "cellMetadata": {
117 | "byteLimit": 2048000,
118 | "rowLimit": 10000
119 | },
120 | "inputWidgets": {},
121 | "nuid": "7d404853-88c2-415d-bce7-9fd33e0b2360",
122 | "showTitle": false,
123 | "title": ""
124 | }
125 | },
126 | "outputs": [
127 | {
128 | "output_type": "stream",
129 | "name": "stdout",
130 | "output_type": "stream",
131 | "text": [
132 | "+-----------------+-----+-----------------+\n| features|label| indexedFeatures|\n+-----------------+-----+-----------------+\n|[230.1,37.8,69.2]| 22.1|[230.1,37.8,69.2]|\n| [44.5,39.3,45.1]| 10.4| [44.5,39.3,45.1]|\n| [17.2,45.9,69.3]| 9.3| [17.2,45.9,69.3]|\n|[151.5,41.3,58.5]| 18.5|[151.5,41.3,58.5]|\n|[180.8,10.8,58.4]| 12.9|[180.8,10.8,58.4]|\n+-----------------+-----+-----------------+\nonly showing top 5 rows\n\n"
133 | ]
134 | }
135 | ],
136 | "source": [
137 | "from pyspark.ml import Pipeline\n",
138 | "from pyspark.ml.regression import LinearRegression\n",
139 | "from pyspark.ml.feature import VectorIndexer\n",
140 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
141 | "\n",
142 | "featureIndexer = VectorIndexer(inputCol = \"features\", outputCol = \"indexedFeatures\", maxCategories = 4).fit(transformed)\n",
143 | "\n",
144 | "data = featureIndexer.transform(transformed)\n",
145 | "\n",
146 | "data.show(5)"
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "execution_count": 0,
152 | "metadata": {
153 | "application/vnd.databricks.v1+cell": {
154 | "cellMetadata": {
155 | "byteLimit": 2048000,
156 | "rowLimit": 10000
157 | },
158 | "inputWidgets": {},
159 | "nuid": "b2bc613d-7b82-40fe-87d9-194d2718f380",
160 | "showTitle": false,
161 | "title": ""
162 | }
163 | },
164 | "outputs": [
165 | {
166 | "output_type": "stream",
167 | "name": "stdout",
168 | "output_type": "stream",
169 | "text": [
170 | "+---------------+-----+---------------+\n| features|label|indexedFeatures|\n+---------------+-----+---------------+\n| [0.7,39.6,8.7]| 1.6| [0.7,39.6,8.7]|\n| [4.1,11.6,5.7]| 3.2| [4.1,11.6,5.7]|\n| [5.4,29.9,9.4]| 5.3| [5.4,29.9,9.4]|\n|[7.3,28.1,41.4]| 5.5|[7.3,28.1,41.4]|\n|[7.8,38.9,50.6]| 6.6|[7.8,38.9,50.6]|\n+---------------+-----+---------------+\nonly showing top 5 rows\n\n+----------------+-----+----------------+\n| features|label| indexedFeatures|\n+----------------+-----+----------------+\n| [8.4,27.2,2.1]| 5.7| [8.4,27.2,2.1]|\n| [8.6,2.1,1.0]| 4.8| [8.6,2.1,1.0]|\n| [8.7,48.9,75.0]| 7.2| [8.7,48.9,75.0]|\n|[13.2,15.9,49.6]| 5.6|[13.2,15.9,49.6]|\n|[18.7,12.1,23.4]| 6.7|[18.7,12.1,23.4]|\n+----------------+-----+----------------+\nonly showing top 5 rows\n\n"
171 | ]
172 | }
173 | ],
174 | "source": [
175 | "(trainingData, testData) = data.randomSplit([0.6, 0.4])\n",
176 | "trainingData.show(5)\n",
177 | "testData.show(5)"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 0,
183 | "metadata": {
184 | "application/vnd.databricks.v1+cell": {
185 | "cellMetadata": {
186 | "byteLimit": 2048000,
187 | "rowLimit": 10000
188 | },
189 | "inputWidgets": {},
190 | "nuid": "bedc939e-99b7-4f47-9a40-b6964d9c8947",
191 | "showTitle": false,
192 | "title": ""
193 | }
194 | },
195 | "outputs": [],
196 | "source": [
197 | "lr = LinearRegression()\n",
198 | "\n",
199 | "pipeline = Pipeline(stages = [featureIndexer, lr])\n",
200 | "\n",
201 | "model = pipeline.fit(trainingData)"
202 | ]
203 | },
204 | {
205 | "cell_type": "code",
206 | "execution_count": 0,
207 | "metadata": {
208 | "application/vnd.databricks.v1+cell": {
209 | "cellMetadata": {
210 | "byteLimit": 2048000,
211 | "rowLimit": 10000
212 | },
213 | "inputWidgets": {},
214 | "nuid": "e5f5102d-c0f5-4d72-9e27-8983a4c76876",
215 | "showTitle": false,
216 | "title": ""
217 | }
218 | },
219 | "outputs": [
220 | {
221 | "output_type": "stream",
222 | "name": "stdout",
223 | "output_type": "stream",
224 | "text": [
225 | "+----------------+-----+------------------+\n| features|label| prediction|\n+----------------+-----+------------------+\n| [8.4,27.2,2.1]| 5.7| 8.229211181760627|\n| [8.6,2.1,1.0]| 4.8| 3.644851163299394|\n| [8.7,48.9,75.0]| 7.2|12.020689858644344|\n|[13.2,15.9,49.6]| 5.6| 6.25831749737122|\n|[18.7,12.1,23.4]| 6.7| 5.893004667299868|\n+----------------+-----+------------------+\nonly showing top 5 rows\n\n"
226 | ]
227 | }
228 | ],
229 | "source": [
230 | "predictions = model.transform(testData)\n",
231 | "\n",
232 | "predictions.select(\"features\", \"label\", \"prediction\").show(5)"
233 | ]
234 | },
235 | {
236 | "cell_type": "code",
237 | "execution_count": 0,
238 | "metadata": {
239 | "application/vnd.databricks.v1+cell": {
240 | "cellMetadata": {
241 | "byteLimit": 2048000,
242 | "rowLimit": 10000
243 | },
244 | "inputWidgets": {},
245 | "nuid": "8c828be1-7ec4-4fc8-adf3-67e20b583020",
246 | "showTitle": false,
247 | "title": ""
248 | }
249 | },
250 | "outputs": [
251 | {
252 | "output_type": "stream",
253 | "name": "stdout",
254 | "output_type": "stream",
255 | "text": [
256 | "Root Mean Square Error (RMSE) on test data = 1.55713\n"
257 | ]
258 | }
259 | ],
260 | "source": [
261 | "from pyspark.ml.evaluation import RegressionEvaluator\n",
262 | "\n",
263 | "evaluator = RegressionEvaluator(labelCol = 'label', predictionCol = 'prediction', metricName = 'rmse')\n",
264 | "\n",
265 | "rmse = evaluator.evaluate(predictions)\n",
266 | "print('Root Mean Square Error (RMSE) on test data = %g' % rmse)"
267 | ]
268 | },
269 | {
270 | "cell_type": "code",
271 | "execution_count": 0,
272 | "metadata": {
273 | "application/vnd.databricks.v1+cell": {
274 | "cellMetadata": {
275 | "byteLimit": 2048000,
276 | "rowLimit": 10000
277 | },
278 | "inputWidgets": {},
279 | "nuid": "c87da307-16aa-41b2-b429-7cc53d7ee3e0",
280 | "showTitle": false,
281 | "title": ""
282 | }
283 | },
284 | "outputs": [
285 | {
286 | "output_type": "stream",
287 | "name": "stdout",
288 | "output_type": "stream",
289 | "text": [
290 | "r2_score: 0.8899337308584387\n"
291 | ]
292 | }
293 | ],
294 | "source": [
295 | "#r2-score calculation\n",
296 | "\n",
297 | "y_true = predictions.select('label').toPandas()\n",
298 | "y_pred = predictions.select('prediction').toPandas()\n",
299 | "\n",
300 | "import sklearn.metrics\n",
301 | "\n",
302 | "r2_score = sklearn.metrics.r2_score(y_true, y_pred)\n",
303 | "print('r2_score: {0}'.format(r2_score))"
304 | ]
305 | },
306 | {
307 | "cell_type": "code",
308 | "execution_count": 0,
309 | "metadata": {
310 | "application/vnd.databricks.v1+cell": {
311 | "cellMetadata": {
312 | "byteLimit": 2048000,
313 | "rowLimit": 10000
314 | },
315 | "inputWidgets": {},
316 | "nuid": "84eea05d-8018-4f69-93b5-ba9eaea5b6dc",
317 | "showTitle": false,
318 | "title": ""
319 | }
320 | },
321 | "outputs": [],
322 | "source": [
323 | "#Decision Tree Classification"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 0,
329 | "metadata": {
330 | "application/vnd.databricks.v1+cell": {
331 | "cellMetadata": {
332 | "byteLimit": 2048000,
333 | "rowLimit": 10000
334 | },
335 | "inputWidgets": {},
336 | "nuid": "06824127-5c6a-48c3-9b9d-f728a07682c4",
337 | "showTitle": false,
338 | "title": ""
339 | }
340 | },
341 | "outputs": [],
342 | "source": [
343 | "from pyspark.sql import SparkSession\n",
344 | "\n",
345 | "spark = SparkSession \\\n",
346 | " .builder \\\n",
347 | " .appName('DT Classification with Pyspark') \\\n",
348 | " .getOrCreate()"
349 | ]
350 | },
351 | {
352 | "cell_type": "code",
353 | "execution_count": 0,
354 | "metadata": {
355 | "application/vnd.databricks.v1+cell": {
356 | "cellMetadata": {
357 | "byteLimit": 2048000,
358 | "rowLimit": 10000
359 | },
360 | "inputWidgets": {},
361 | "nuid": "b2a61147-fe4b-462b-9b04-1d979140ef88",
362 | "showTitle": false,
363 | "title": ""
364 | }
365 | },
366 | "outputs": [
367 | {
368 | "output_type": "stream",
369 | "name": "stdout",
370 | "output_type": "stream",
371 | "text": [
372 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| 5|\n| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| 5|\n| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| 6|\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\nonly showing top 5 rows\n\n"
373 | ]
374 | }
375 | ],
376 | "source": [
377 | "df = spark.read.format('delta').\\\n",
378 | " options(header = 'true', inferschema = 'true')\\\n",
379 | " .load(\"/user/hive/warehouse/wine_data\", header = 'True')\n",
380 | "\n",
381 | "df.show(5, True)"
382 | ]
383 | },
384 | {
385 | "cell_type": "code",
386 | "execution_count": 0,
387 | "metadata": {
388 | "application/vnd.databricks.v1+cell": {
389 | "cellMetadata": {
390 | "byteLimit": 2048000,
391 | "rowLimit": 10000
392 | },
393 | "inputWidgets": {},
394 | "nuid": "acea6e1c-fae9-41c0-b071-58a429e8e7d0",
395 | "showTitle": false,
396 | "title": ""
397 | }
398 | },
399 | "outputs": [],
400 | "source": [
401 | "def condition(r):\n",
402 | "\n",
403 | " if (0 <= r <= 4):\n",
404 | " label = 'low'\n",
405 | " \n",
406 | " elif (4 < r <= 6):\n",
407 | " label = 'medium'\n",
408 | "\n",
409 | " else:\n",
410 | " label = 'high'\n",
411 | " \n",
412 | " return label\n",
413 | "\n",
414 | "def string_to_float(x):\n",
415 | " return float(x)"
416 | ]
417 | },
418 | {
419 | "cell_type": "code",
420 | "execution_count": 0,
421 | "metadata": {
422 | "application/vnd.databricks.v1+cell": {
423 | "cellMetadata": {
424 | "byteLimit": 2048000,
425 | "rowLimit": 10000
426 | },
427 | "inputWidgets": {},
428 | "nuid": "77ea88fe-f733-4139-9eb5-12780aafc42b",
429 | "showTitle": false,
430 | "title": ""
431 | }
432 | },
433 | "outputs": [
434 | {
435 | "output_type": "stream",
436 | "name": "stdout",
437 | "output_type": "stream",
438 | "text": [
439 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| medium|\n| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| medium|\n| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| medium|\n| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\nonly showing top 5 rows\n\nroot\n |-- fixed acidity: double (nullable = true)\n |-- volatile acidity: double (nullable = true)\n |-- citric acid: double (nullable = true)\n |-- residual sugar: double (nullable = true)\n |-- chlorides: double (nullable = true)\n |-- free sulfur dioxide: double (nullable = true)\n |-- total sulfur dioxide: double (nullable = true)\n |-- density: double (nullable = true)\n |-- pH: double (nullable = true)\n |-- sulphates: double (nullable = true)\n |-- alcohol: double (nullable = true)\n |-- quality: string (nullable = true)\n\n"
440 | ]
441 | }
442 | ],
443 | "source": [
444 | "from pyspark.sql.functions import udf\n",
445 | "from pyspark.sql.types import StringType, DoubleType\n",
446 | "string_to_float_udf = udf(string_to_float, DoubleType())\n",
447 | "quality_udf = udf(lambda x : condition(x), StringType())\n",
448 | "\n",
449 | "df = df.withColumn(\"quality\", quality_udf(\"quality\"))\n",
450 | "df.show(5)\n",
451 | "df.printSchema()"
452 | ]
453 | },
454 | {
455 | "cell_type": "code",
456 | "execution_count": 0,
457 | "metadata": {
458 | "application/vnd.databricks.v1+cell": {
459 | "cellMetadata": {
460 | "byteLimit": 2048000,
461 | "rowLimit": 10000
462 | },
463 | "inputWidgets": {},
464 | "nuid": "0148bf86-6375-4a52-bc48-2d12298a7ca0",
465 | "showTitle": false,
466 | "title": ""
467 | }
468 | },
469 | "outputs": [],
470 | "source": [
471 | "from pyspark.ml.linalg import Vectors\n",
472 | "from pyspark.ml import Pipeline\n",
473 | "from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString\n",
474 | "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n",
475 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator"
476 | ]
477 | },
478 | {
479 | "cell_type": "code",
480 | "execution_count": 0,
481 | "metadata": {
482 | "application/vnd.databricks.v1+cell": {
483 | "cellMetadata": {
484 | "byteLimit": 2048000,
485 | "rowLimit": 10000
486 | },
487 | "inputWidgets": {},
488 | "nuid": "b9746597-215f-4a67-bdde-9ae9f7572597",
489 | "showTitle": false,
490 | "title": ""
491 | }
492 | },
493 | "outputs": [
494 | {
495 | "output_type": "stream",
496 | "name": "stdout",
497 | "output_type": "stream",
498 | "text": [
499 | "+--------------------+------+\n| features| label|\n+--------------------+------+\n|[7.4,0.7,0.0,1.9,...|medium|\n|[7.8,0.88,0.0,2.6...|medium|\n|[7.8,0.76,0.04,2....|medium|\n|[11.2,0.28,0.56,1...|medium|\n|[7.4,0.7,0.0,1.9,...|medium|\n+--------------------+------+\nonly showing top 5 rows\n\n"
500 | ]
501 | }
502 | ],
503 | "source": [
504 | "def transData(data):\n",
505 | " return data.rdd.map(lambda r : [Vectors.dense(r[:-1]), r[-1]]).toDF(['features', 'label'])\n",
506 | "\n",
507 | "transformed = transData(df)\n",
508 | "transformed.show(5)"
509 | ]
510 | },
511 | {
512 | "cell_type": "code",
513 | "execution_count": 0,
514 | "metadata": {
515 | "application/vnd.databricks.v1+cell": {
516 | "cellMetadata": {},
517 | "inputWidgets": {},
518 | "nuid": "6b1add08-df27-4be3-a9ab-b08ff21c1f9c",
519 | "showTitle": false,
520 | "title": ""
521 | }
522 | },
523 | "outputs": [],
524 | "source": []
525 | }
526 | ],
527 | "metadata": {
528 | "application/vnd.databricks.v1+notebook": {
529 | "dashboards": [],
530 | "language": "python",
531 | "notebookMetadata": {
532 | "pythonIndentUnit": 4
533 | },
534 | "notebookName": "06072023 - DEB",
535 | "widgets": {}
536 | }
537 | },
538 | "nbformat": 4,
539 | "nbformat_minor": 0
540 | }
541 |
--------------------------------------------------------------------------------
/11072023 DEB v2.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "40b070d9-8cee-4d5b-ae3d-ad44614f153c",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [],
19 | "source": [
20 | "from pyspark.sql import SparkSession \n",
21 | "\n",
22 | "spark = SparkSession \\\n",
23 | " .builder \\\n",
24 | " .appName(\"RFM Customer Segmentation with PySpark\") \\\n",
25 | " .getOrCreate()"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 0,
31 | "metadata": {
32 | "application/vnd.databricks.v1+cell": {
33 | "cellMetadata": {
34 | "byteLimit": 2048000,
35 | "rowLimit": 10000
36 | },
37 | "inputWidgets": {},
38 | "nuid": "09fbf487-e2cd-439e-a07a-19ccfdb2c159",
39 | "showTitle": false,
40 | "title": ""
41 | }
42 | },
43 | "outputs": [],
44 | "source": [
45 | "df_raw = spark.read.format('delta').\\\n",
46 | " options(header = 'true', inferschema = 'true').\\\n",
47 | " load(\"/user/hive/warehouse/online_retail2\", header = True)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 0,
53 | "metadata": {
54 | "application/vnd.databricks.v1+cell": {
55 | "cellMetadata": {
56 | "byteLimit": 2048000,
57 | "rowLimit": 10000
58 | },
59 | "inputWidgets": {},
60 | "nuid": "9dde06e7-64c1-431c-88a7-39401b30a751",
61 | "showTitle": false,
62 | "title": ""
63 | }
64 | },
65 | "outputs": [
66 | {
67 | "output_type": "stream",
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\nonly showing top 5 rows\n\nroot\n |-- InvoiceNo: string (nullable = true)\n |-- StockCode: string (nullable = true)\n |-- Description: string (nullable = true)\n |-- Quantity: long (nullable = true)\n |-- InvoiceDate: string (nullable = true)\n |-- UnitPrice: string (nullable = true)\n |-- CustomerID: long (nullable = true)\n |-- Country: string (nullable = true)\n\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "df_raw.show(5)\n",
77 | "df_raw.printSchema()"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 0,
83 | "metadata": {
84 | "application/vnd.databricks.v1+cell": {
85 | "cellMetadata": {
86 | "byteLimit": 2048000,
87 | "rowLimit": 10000
88 | },
89 | "inputWidgets": {},
90 | "nuid": "d987ba2c-dda9-4bb5-96e5-8081409408ce",
91 | "showTitle": false,
92 | "title": ""
93 | }
94 | },
95 | "outputs": [
96 | {
97 | "output_type": "stream",
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "from pyspark.sql.functions import count\n",
107 | "\n",
108 | "def my_count(df_in):\n",
109 | " df_in.agg(*[count(c).alias(c) for c in df_in.columns]).show()\n",
110 | "\n",
111 | "my_count(df_raw)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 0,
117 | "metadata": {
118 | "application/vnd.databricks.v1+cell": {
119 | "cellMetadata": {
120 | "byteLimit": 2048000,
121 | "rowLimit": 10000
122 | },
123 | "inputWidgets": {},
124 | "nuid": "f21365b1-3e8a-44be-a21e-75a90fb16efa",
125 | "showTitle": false,
126 | "title": ""
127 | }
128 | },
129 | "outputs": [
130 | {
131 | "output_type": "stream",
132 | "name": "stdout",
133 | "output_type": "stream",
134 | "text": [
135 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "df = df_raw.dropna(how = \"any\")\n",
141 | "my_count(df)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 0,
147 | "metadata": {
148 | "application/vnd.databricks.v1+cell": {
149 | "cellMetadata": {
150 | "byteLimit": 2048000,
151 | "rowLimit": 10000
152 | },
153 | "inputWidgets": {},
154 | "nuid": "76ca2c15-48f7-4541-b1df-0e0443b5eb3e",
155 | "showTitle": false,
156 | "title": ""
157 | }
158 | },
159 | "outputs": [
160 | {
161 | "output_type": "stream",
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country|NewInvoiceDate|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom| null|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom| null|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom| null|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom| null|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom| null|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+--------------+\nonly showing top 5 rows\n\n"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col, when\n",
171 | "\n",
172 | "timeFmt = \"d.M.yyyy HH:mm\"  # InvoiceDate is day.month.year, e.g. '1.12.2010 08:26'; the old MM/dd/yy pattern parsed every row to null\n",
173 | "\n",
174 | "df = df.withColumn('NewInvoiceDate', when(col('InvoiceDate').isNotNull(), to_utc_timestamp(unix_timestamp(col('InvoiceDate'), timeFmt).cast('timestamp'), 'UTC')).otherwise(col('InvoiceDate')))\n",
175 | "\n",
176 | "df.show(5)"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 0,
182 | "metadata": {
183 | "application/vnd.databricks.v1+cell": {
184 | "cellMetadata": {
185 | "byteLimit": 2048000,
186 | "rowLimit": 10000
187 | },
188 | "inputWidgets": {},
189 | "nuid": "2d5f96fc-394f-44bd-87bd-4364c67d92c9",
190 | "showTitle": false,
191 | "title": ""
192 | }
193 | },
194 | "outputs": [
195 | {
196 | "output_type": "stream",
197 | "name": "stdout",
198 | "output_type": "stream",
199 | "text": [
200 | "+----------+-------+---------+--------+\n|CustomerID|Recency|Frequency|Monetary|\n+----------+-------+---------+--------+\n| 15194| null| 22| null|\n| 17703| null| 3| null|\n| 13452| null| 2| 590.0|\n| 13098| null| 41| null|\n| 17048| null| 6| null|\n| 13638| null| 1| null|\n| 15322| null| 2| null|\n| 13723| null| 1| null|\n| 16597| null| 1| null|\n| 15237| null| 4| null|\n| 13248| null| 2| null|\n| 16742| null| 2| null|\n| 14719| null| 6| null|\n| 17043| null| 4| null|\n| 14117| null| 1| null|\n| 15057| null| 2| null|\n| 17979| null| 5| null|\n| 13460| null| 2| null|\n| 13518| null| 1| null|\n| 15432| null| 1| null|\n+----------+-------+---------+--------+\nonly showing top 20 rows\n\n"
201 | ]
202 | }
203 | ],
204 | "source": [
205 | "from pyspark.sql.functions import round, regexp_replace\n",
206 | "# UnitPrice is a string with a decimal comma (e.g. '2,55'); convert it to a double before multiplying, otherwise TotalPrice is null.\n",
207 | "df = df.withColumn('TotalPrice', round(df.Quantity * regexp_replace('UnitPrice', ',', '.').cast('double'), 2) )\n",
208 | "\n",
209 | "from pyspark.sql.functions import mean, min, max, sum, datediff, to_date\n",
210 | "\n",
211 | "date_max = df.select(max('NewInvoiceDate')).toPandas()\n",
212 | "# The stringified max timestamp has the form yyyy-MM-dd HH:mm:ss, so parse it with a matching four-digit-year pattern.\n",
213 | "current = to_utc_timestamp(unix_timestamp(lit(str(date_max.iloc[0, 0])), 'yyyy-MM-dd HH:mm:ss').cast('timestamp'), 'UTC')\n",
214 | "\n",
215 | "df = df.withColumn('Duration', datediff(lit(current), 'NewInvoiceDate'))\n",
216 | "\n",
217 | "#Recency, Frequency, Monetary\n",
218 | "\n",
219 | "recency = df.groupBy('CustomerID').agg(min('Duration').alias('Recency'))\n",
220 | "\n",
221 | "frequency = df.groupBy('CustomerID', 'InvoiceNo').count()\\\n",
222 | " .groupBy('CustomerID')\\\n",
223 | " .agg(count('*').alias(\"Frequency\"))\n",
224 | "\n",
225 | "monetary = df.groupBy('CustomerID').agg(round(sum('TotalPrice'), 2).alias('Monetary'))\n",
226 | "\n",
227 | "rfm = recency.join(frequency, 'CustomerID', how = 'inner')\\\n",
228 | " .join(monetary, 'CustomerID', how = 'inner')\n",
229 | "\n",
230 | "rfm.show()"
231 | ]
232 | }
233 | ],
234 | "metadata": {
235 | "application/vnd.databricks.v1+notebook": {
236 | "dashboards": [],
237 | "language": "python",
238 | "notebookMetadata": {
239 | "pythonIndentUnit": 4
240 | },
241 | "notebookName": "11072023 DEB v2",
242 | "widgets": {}
243 | }
244 | },
245 | "nbformat": 4,
246 | "nbformat_minor": 0
247 | }
248 |
--------------------------------------------------------------------------------
/13072023 - DEB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 0,
6 | "metadata": {
7 | "application/vnd.databricks.v1+cell": {
8 | "cellMetadata": {
9 | "byteLimit": 2048000,
10 | "rowLimit": 10000
11 | },
12 | "inputWidgets": {},
13 | "nuid": "40b070d9-8cee-4d5b-ae3d-ad44614f153c",
14 | "showTitle": false,
15 | "title": ""
16 | }
17 | },
18 | "outputs": [],
19 | "source": [
20 | "from pyspark.sql import SparkSession \n",
21 | "\n",
22 | "spark = SparkSession \\\n",
23 | " .builder \\\n",
24 | " .appName(\"RFM Customer Segmentation with PySpark\") \\\n",
25 | " .getOrCreate()"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": 0,
31 | "metadata": {
32 | "application/vnd.databricks.v1+cell": {
33 | "cellMetadata": {
34 | "byteLimit": 2048000,
35 | "rowLimit": 10000
36 | },
37 | "inputWidgets": {},
38 | "nuid": "09fbf487-e2cd-439e-a07a-19ccfdb2c159",
39 | "showTitle": false,
40 | "title": ""
41 | }
42 | },
43 | "outputs": [],
44 | "source": [
45 | "df_raw = spark.read.format('delta').\\\n",
46 | " options(header = 'true', inferschema = 'true').\\\n",
47 | " load(\"/user/hive/warehouse/online_retail2\", header = True)"
48 | ]
49 | },
50 | {
51 | "cell_type": "code",
52 | "execution_count": 0,
53 | "metadata": {
54 | "application/vnd.databricks.v1+cell": {
55 | "cellMetadata": {
56 | "byteLimit": 2048000,
57 | "rowLimit": 10000
58 | },
59 | "inputWidgets": {},
60 | "nuid": "9dde06e7-64c1-431c-88a7-39401b30a751",
61 | "showTitle": false,
62 | "title": ""
63 | }
64 | },
65 | "outputs": [
66 | {
67 | "output_type": "stream",
68 | "name": "stdout",
69 | "output_type": "stream",
70 | "text": [
71 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+\nonly showing top 5 rows\n\nroot\n |-- InvoiceNo: string (nullable = true)\n |-- StockCode: string (nullable = true)\n |-- Description: string (nullable = true)\n |-- Quantity: long (nullable = true)\n |-- InvoiceDate: string (nullable = true)\n |-- UnitPrice: string (nullable = true)\n |-- CustomerID: long (nullable = true)\n |-- Country: string (nullable = true)\n\n"
72 | ]
73 | }
74 | ],
75 | "source": [
76 | "df_raw.show(5)\n",
77 | "df_raw.printSchema()"
78 | ]
79 | },
80 | {
81 | "cell_type": "code",
82 | "execution_count": 0,
83 | "metadata": {
84 | "application/vnd.databricks.v1+cell": {
85 | "cellMetadata": {
86 | "byteLimit": 2048000,
87 | "rowLimit": 10000
88 | },
89 | "inputWidgets": {},
90 | "nuid": "d987ba2c-dda9-4bb5-96e5-8081409408ce",
91 | "showTitle": false,
92 | "title": ""
93 | }
94 | },
95 | "outputs": [
96 | {
97 | "output_type": "stream",
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "from pyspark.sql.functions import count\n",
107 | "\n",
108 | "def my_count(df_in):\n",
109 | " df_in.agg(*[count(c).alias(c) for c in df_in.columns]).show()\n",
110 | "\n",
111 | "my_count(df_raw)"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 0,
117 | "metadata": {
118 | "application/vnd.databricks.v1+cell": {
119 | "cellMetadata": {
120 | "byteLimit": 2048000,
121 | "rowLimit": 10000
122 | },
123 | "inputWidgets": {},
124 | "nuid": "f21365b1-3e8a-44be-a21e-75a90fb16efa",
125 | "showTitle": false,
126 | "title": ""
127 | }
128 | },
129 | "outputs": [
130 | {
131 | "output_type": "stream",
132 | "name": "stdout",
133 | "output_type": "stream",
134 | "text": [
135 | "+---------+---------+-----------+--------+-----------+---------+----------+-------+\n|InvoiceNo|StockCode|Description|Quantity|InvoiceDate|UnitPrice|CustomerID|Country|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n| 541909| 541909| 541909| 541909| 541909| 541909| 541909| 541909|\n+---------+---------+-----------+--------+-----------+---------+----------+-------+\n\n"
136 | ]
137 | }
138 | ],
139 | "source": [
140 | "df = df_raw.dropna(how = \"any\")\n",
141 | "my_count(df)"
142 | ]
143 | },
144 | {
145 | "cell_type": "code",
146 | "execution_count": 0,
147 | "metadata": {
148 | "application/vnd.databricks.v1+cell": {
149 | "cellMetadata": {
150 | "byteLimit": 2048000,
151 | "rowLimit": 10000
152 | },
153 | "inputWidgets": {},
154 | "nuid": "76ca2c15-48f7-4541-b1df-0e0443b5eb3e",
155 | "showTitle": false,
156 | "title": ""
157 | }
158 | },
159 | "outputs": [
160 | {
161 | "output_type": "stream",
162 | "name": "stdout",
163 | "output_type": "stream",
164 | "text": [
165 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| NewInvoiceDate|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2,55| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2,75| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|2010-12-01 08:26:00|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3,39| 17850|United Kingdom|2010-12-01 08:26:00|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+\nonly showing top 5 rows\n\nOut[6]: '\\n\\nfrom pyspark.sql.functions import *\\ntimeFmt = \"MM/dd/yy HH:mm\"\\ndf = df.withColumn(\"NewInvoiceData2\", from_unixtime(unix_timestamp(to_timestamp(\"InvoiceDate\",\"d.M.yyyy HH:mm\").cast(\"timestamp\"),timeFmt),timeFmt))\\ndf.show()\\n\\n'"
166 | ]
167 | }
168 | ],
169 | "source": [
170 | "from pyspark.sql.functions import to_utc_timestamp, unix_timestamp, lit, datediff, col, to_timestamp\n",
171 | "\n",
172 | "df = df.withColumn('NewInvoiceDate', to_timestamp(\"InvoiceDate\",\"d.M.yyyy HH:mm\"))\n",
173 | "\n",
174 | "df.show(5)\n"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 0,
180 | "metadata": {
181 | "application/vnd.databricks.v1+cell": {
182 | "cellMetadata": {
183 | "byteLimit": 2048000,
184 | "rowLimit": 10000
185 | },
186 | "inputWidgets": {},
187 | "nuid": "3cb5fa14-a12f-43f5-b02f-4a5852e626eb",
188 | "showTitle": false,
189 | "title": ""
190 | }
191 | },
192 | "outputs": [
193 | {
194 | "output_type": "stream",
195 | "name": "stdout",
196 | "output_type": "stream",
197 | "text": [
198 | "+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| NewInvoiceDate|TotalPrice|Duration|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2.55| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2.75| 17850|United Kingdom|2010-12-01 08:26:00| 22.0| 373|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 22752|SET 7 BABUSHKA NE...| 2|1.12.2010 08:26| 7.65| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 21730|GLASS STAR FROSTE...| 6|1.12.2010 08:26| 4.25| 17850|United Kingdom|2010-12-01 08:26:00| 25.5| 373|\n| 536366| 22633|HAND WARMER UNION...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536366| 22632|HAND WARMER RED P...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536367| 84879|ASSORTED COLOUR B...| 32|1.12.2010 08:34| 1.69| 13047|United Kingdom|2010-12-01 08:34:00| 54.08| 373|\n| 536367| 22745|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22748|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22749|FELTCRAFT PRINCES...| 8|1.12.2010 08:34| 3.75| 13047|United Kingdom|2010-12-01 08:34:00| 30.0| 373|\n| 536367| 22310|IVORY KNITTED MUG...| 6|1.12.2010 
08:34| 1.65| 13047|United Kingdom|2010-12-01 08:34:00| 9.9| 373|\n| 536367| 84969|BOX OF 6 ASSORTED...| 6|1.12.2010 08:34| 4.25| 13047|United Kingdom|2010-12-01 08:34:00| 25.5| 373|\n| 536367| 22623|BOX OF VINTAGE JI...| 3|1.12.2010 08:34| 4.95| 13047|United Kingdom|2010-12-01 08:34:00| 14.85| 373|\n| 536367| 22622|BOX OF VINTAGE AL...| 2|1.12.2010 08:34| 9.95| 13047|United Kingdom|2010-12-01 08:34:00| 19.9| 373|\n| 536367| 21754|HOME BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21755|LOVE BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21777|RECIPE BOX WITH M...| 4|1.12.2010 08:34| 7.95| 13047|United Kingdom|2010-12-01 08:34:00| 31.8| 373|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\nonly showing top 20 rows\n\n"
199 | ]
200 | }
201 | ],
202 | "source": [
203 | "from pyspark.sql.functions import round\n",
204 | "\n",
205 | "from pyspark.sql import functions as F\n",
206 | "\n",
207 | "df = df.withColumn(\"UnitPrice\", F.regexp_replace(\"UnitPrice\", \",\", \".\").cast(\"double\"))\n",
208 | "\n",
209 | "df = df.withColumn('TotalPrice', round(df.Quantity * df.UnitPrice, 2) )\n",
210 | "\n",
211 | "df.show()"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 0,
217 | "metadata": {
218 | "application/vnd.databricks.v1+cell": {
219 | "cellMetadata": {
220 | "byteLimit": 2048000,
221 | "rowLimit": 10000
222 | },
223 | "inputWidgets": {},
224 | "nuid": "2d5f96fc-394f-44bd-87bd-4364c67d92c9",
225 | "showTitle": false,
226 | "title": ""
227 | }
228 | },
229 | "outputs": [
230 | {
231 | "output_type": "stream",
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "+----------+-------+---------+--------+\n|CustomerID|Recency|Frequency|Monetary|\n+----------+-------+---------+--------+\n| 15194| 3| 22| 7521.17|\n| 17703| 35| 3| 798.74|\n| 13452| 259| 2| 590.0|\n| 13098| 1| 41|28658.88|\n| 17048| 115| 6| 864.32|\n| 13638| 15| 1| 122.64|\n| 15322| 64| 2| 602.97|\n| 13723| 217| 1| 199.85|\n| 16597| 4| 1| 90.04|\n| 15237| 1| 4| 1412.32|\n| 13248| 124| 2| 465.68|\n| 16742| 46| 2| 0.0|\n| 14719| 1| 6| 1592.18|\n| 17043| 32| 4| 1735.18|\n| 14117| 143| 1| 90.0|\n| 15057| 275| 2| 1489.5|\n| 17979| 35| 5| 737.81|\n| 13460| 29| 2| 183.44|\n| 13518| 85| 1| 659.44|\n| 15432| 23| 1| 171.19|\n| 18196| 95| 2| 689.13|\n| 15437| 262| 1| 200.16|\n| 18147| 45| 2| 179.34|\n| 17499| 289| 1| 622.88|\n| 15663| 106| 1| 138.14|\n| 13658| 9| 7| 2421.47|\n| 12936| 17| 5| 1012.9|\n| 14029| 63| 2| 467.66|\n| 15221| 366| 2| 114.1|\n| 17370| 72| 4| 446.18|\n| 12967| 3| 4| 1194.75|\n| 13240| 96| 4| 663.65|\n| 15758| 24| 1| 205.25|\n| 14805| 15| 4| 554.65|\n| 16781| 365| 2| 294.65|\n| 17595| 12| 2| 388.79|\n| 16441| 67| 2| 381.2|\n| 13299| 268| 1| 142.5|\n| 14259| 141| 1| 120.0|\n| 15921| 172| 1| 336.03|\n| 18117| 25| 1| 320.72|\n| 14543| 3| 21| 2916.17|\n| 17757| 1| 31| 5585.49|\n| 14178| 8| 7| 1620.93|\n| 14562| 3| 22| 4709.22|\n| 14215| 11| 7| 1777.92|\n| 15197| 8| 5| 656.44|\n| 13509| 8| 7| 979.72|\n| 15133| 127| 3| 982.42|\n| 14067| 63| 2| 374.7|\n| 17061| 73| 12| 5116.13|\n| 17135| 16| 7| 1139.73|\n| 17647| 65| 1| 133.06|\n| 15894| 253| 2| 263.55|\n| 14064| 29| 7| 1188.32|\n| 13659| 197| 3| 1550.85|\n| 16499| 360| 5| 319.1|\n| 13068| 10| 2| 344.0|\n| 14242| 234| 2| 280.55|\n| 17111| 47| 1| 248.61|\n| 17201| 53| 1| 342.63|\n| 15689| 119| 2| 254.1|\n| 13832| 17| 2| 40.95|\n| 16145| 8| 18| 3741.98|\n| 14779| 280| 3| 386.15|\n| 13527| 33| 8| 2263.76|\n| 13035| 57| 3| 886.63|\n| 17205| 53| 1| 384.08|\n| 17454| 192| 4| 517.53|\n| 14329| 8| 14| 4928.74|\n| 14267| 150| 7| 1279.09|\n| 12422| 95| 3| 803.56|\n| 15569| 103| 5| 1375.71|\n| 14626| 8| 7| 
2757.07|\n| 12402| 323| 1| 225.6|\n| 12472| 30| 13| 6229.48|\n| 13764| 70| 3| 1521.76|\n| 15198| 92| 2| 193.64|\n| 15709| 283| 1| 133.25|\n| 16754| 372| 1| 2002.4|\n| 14496| 311| 2| 538.81|\n| 17334| 301| 3| 306.6|\n| 14639| 52| 8| 2952.34|\n| 13656| 164| 2| 379.65|\n| 13447| 23| 4| 1104.23|\n| 15312| 75| 5| 921.1|\n| 16007| 47| 4| 1701.94|\n| 15374| 128| 1| 168.0|\n| 18233| 325| 1| 440.0|\n| 14352| 157| 4| 1078.96|\n| 15799| 75| 3| 884.14|\n| 14438| 306| 1| 131.9|\n| 15274| 4| 2| 716.57|\n| 15992| 3| 1| 41.99|\n| 17022| 31| 1| 71.0|\n| 12873| 282| 1| 374.0|\n| 16828| 93| 2| 128.5|\n| 15652| 91| 1| 337.74|\n| 12390| 79| 1| 549.84|\n| 16276| 176| 1| 810.6|\n+----------+-------+---------+--------+\nonly showing top 100 rows\n\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n|InvoiceNo|StockCode| Description|Quantity| InvoiceDate|UnitPrice|CustomerID| Country| NewInvoiceDate|TotalPrice|Duration|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\n| 536365| 85123A|WHITE HANGING HEA...| 6|1.12.2010 08:26| 2.55| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 71053| WHITE METAL LANTERN| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84406B|CREAM CUPID HEART...| 8|1.12.2010 08:26| 2.75| 17850|United Kingdom|2010-12-01 08:26:00| 22.0| 373|\n| 536365| 84029G|KNITTED UNION FLA...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 84029E|RED WOOLLY HOTTIE...| 6|1.12.2010 08:26| 3.39| 17850|United Kingdom|2010-12-01 08:26:00| 20.34| 373|\n| 536365| 22752|SET 7 BABUSHKA NE...| 2|1.12.2010 08:26| 7.65| 17850|United Kingdom|2010-12-01 08:26:00| 15.3| 373|\n| 536365| 21730|GLASS STAR FROSTE...| 6|1.12.2010 08:26| 4.25| 17850|United Kingdom|2010-12-01 08:26:00| 25.5| 373|\n| 536366| 22633|HAND 
WARMER UNION...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536366| 22632|HAND WARMER RED P...| 6|1.12.2010 08:28| 1.85| 17850|United Kingdom|2010-12-01 08:28:00| 11.1| 373|\n| 536367| 84879|ASSORTED COLOUR B...| 32|1.12.2010 08:34| 1.69| 13047|United Kingdom|2010-12-01 08:34:00| 54.08| 373|\n| 536367| 22745|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22748|POPPY'S PLAYHOUSE...| 6|1.12.2010 08:34| 2.1| 13047|United Kingdom|2010-12-01 08:34:00| 12.6| 373|\n| 536367| 22749|FELTCRAFT PRINCES...| 8|1.12.2010 08:34| 3.75| 13047|United Kingdom|2010-12-01 08:34:00| 30.0| 373|\n| 536367| 22310|IVORY KNITTED MUG...| 6|1.12.2010 08:34| 1.65| 13047|United Kingdom|2010-12-01 08:34:00| 9.9| 373|\n| 536367| 84969|BOX OF 6 ASSORTED...| 6|1.12.2010 08:34| 4.25| 13047|United Kingdom|2010-12-01 08:34:00| 25.5| 373|\n| 536367| 22623|BOX OF VINTAGE JI...| 3|1.12.2010 08:34| 4.95| 13047|United Kingdom|2010-12-01 08:34:00| 14.85| 373|\n| 536367| 22622|BOX OF VINTAGE AL...| 2|1.12.2010 08:34| 9.95| 13047|United Kingdom|2010-12-01 08:34:00| 19.9| 373|\n| 536367| 21754|HOME BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21755|LOVE BUILDING BLO...| 3|1.12.2010 08:34| 5.95| 13047|United Kingdom|2010-12-01 08:34:00| 17.85| 373|\n| 536367| 21777|RECIPE BOX WITH M...| 4|1.12.2010 08:34| 7.95| 13047|United Kingdom|2010-12-01 08:34:00| 31.8| 373|\n+---------+---------+--------------------+--------+---------------+---------+----------+--------------+-------------------+----------+--------+\nonly showing top 20 rows\n\n"
236 | ]
237 | }
238 | ],
239 | "source": [
240 | "spark.sql(\"set spark.sql.legacy.timeParserPolicy=LEGACY\")\n",
241 | "\n",
242 | "from pyspark.sql.functions import mean, min, max, sum, datediff, to_date\n",
243 | "\n",
244 | "date_max = df.select(max('NewInvoiceDate')).toPandas()\n",
245 | "# The stringified max timestamp has the form yyyy-MM-dd HH:mm:ss; use a pattern that matches it instead of relying on lenient legacy parsing.\n",
246 | "current = to_utc_timestamp(unix_timestamp(lit(str(date_max.iloc[0, 0])), 'yyyy-MM-dd HH:mm:ss').cast('timestamp'), 'UTC')\n",
247 | "\n",
248 | "df = df.withColumn('Duration', datediff(lit(current), 'NewInvoiceDate'))\n",
249 | "\n",
250 | "#Recency, Frequency, Monetary\n",
251 | "\n",
252 | "recency = df.groupBy('CustomerID').agg(min('Duration').alias('Recency'))\n",
253 | "\n",
254 | "frequency = df.groupBy('CustomerID', 'InvoiceNo').count()\\\n",
255 | " .groupBy('CustomerID')\\\n",
256 | " .agg(count('*').alias(\"Frequency\"))\n",
257 | "\n",
258 | "monetary = df.groupBy('CustomerID').agg(round(sum('TotalPrice'), 2).alias('Monetary'))\n",
259 | "\n",
260 | "rfm = recency.join(frequency, 'CustomerID', how = 'inner')\\\n",
261 | " .join(monetary, 'CustomerID', how = 'inner')\n",
262 | "\n",
263 | "rfm.show(100)\n",
264 | "df.show()"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 0,
270 | "metadata": {
271 | "application/vnd.databricks.v1+cell": {
272 | "cellMetadata": {
273 | "byteLimit": 2048000,
274 | "rowLimit": 10000
275 | },
276 | "inputWidgets": {},
277 | "nuid": "23464001-02f0-4989-a185-5d7dccae6ba7",
278 | "showTitle": false,
279 | "title": ""
280 | }
281 | },
282 | "outputs": [],
283 | "source": [
284 | "import numpy as np\n",
285 | "import pandas as pd\n",
286 | "\n",
287 | "def describe_pd(df_input, columns, deciles = False):\n",
288 | "    # Return the describe() statistics for `columns` with 25/50/75 percentile rows appended.\n",
289 | "    # Bound unconditionally: it used to be set only when `deciles` was truthy, so the default call raised NameError.\n",
290 | "    percentiles = [25, 50, 75]\n",
291 | "    pcs = np.transpose([np.percentile(df_input.select(x).collect(),percentiles) for x in columns])\n",
292 | "    pcs = pd.DataFrame(pcs, columns = columns)\n",
293 | "    pcs['summary'] = [str(p) + \"%\" for p in percentiles]\n",
294 | "    mydescribe = df_input.describe().toPandas()\n",
295 | "    new_df = pd.concat([mydescribe, pcs], ignore_index = True)\n",
296 | "    new_df = new_df.round(2)\n",
297 | "    return new_df[['summary'] + columns]"
298 | ]
299 | },
300 | {
301 | "cell_type": "code",
302 | "execution_count": 0,
303 | "metadata": {
304 | "application/vnd.databricks.v1+cell": {
305 | "cellMetadata": {
306 | "byteLimit": 2048000,
307 | "rowLimit": 10000
308 | },
309 | "inputWidgets": {},
310 | "nuid": "0444e164-dab4-4f60-98be-a8f1344dc4fe",
311 | "showTitle": false,
312 | "title": ""
313 | }
314 | },
315 | "outputs": [
316 | {
317 | "output_type": "display_data",
318 | "data": {
319 | "text/html": [
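320 | "<div>\n",
321 | "<style scoped>\n",
322 | "    .dataframe tbody tr th:only-of-type {\n",
323 | "        vertical-align: middle;\n",
324 | "    }\n",
325 | "\n",
326 | "    .dataframe tbody tr th {\n",
327 | "        vertical-align: top;\n",
328 | "    }\n",
329 | "\n",
330 | "    .dataframe thead th {\n",
331 | "        text-align: right;\n",
332 | "    }\n",
333 | "</style>\n",
334 | "<table border=\"1\" class=\"dataframe\">\n",
335 | "  <thead>\n",
336 | "    <tr style=\"text-align: right;\">\n",
337 | "      <th></th>\n",
338 | "      <th>summary</th>\n",
339 | "      <th>Recency</th>\n",
340 | "      <th>Frequency</th>\n",
341 | "      <th>Monetary</th>\n",
342 | "    </tr>\n",
343 | "  </thead>\n",
344 | "  <tbody>\n",
345 | "    <tr>\n",
346 | "      <th>0</th>\n",
347 | "      <td>count</td>\n",
348 | "      <td>4373</td>\n",
349 | "      <td>4373</td>\n",
350 | "      <td>4373</td>\n",
351 | "    </tr>\n",
352 | "    <tr>\n",
353 | "      <th>1</th>\n",
354 | "      <td>mean</td>\n",
355 | "      <td>91.56025611708209</td>\n",
356 | "      <td>5.9227075234392865</td>\n",
357 | "      <td>2229.0756757374743</td>\n",
358 | "    </tr>\n",
359 | "    <tr>\n",
360 | "      <th>2</th>\n",
361 | "      <td>stddev</td>\n",
362 | "      <td>100.7701307562583</td>\n",
363 | "      <td>56.79881324857276</td>\n",
364 | "      <td>23356.82678007453</td>\n",
365 | "    </tr>\n",
366 | "    <tr>\n",
367 | "      <th>3</th>\n",
368 | "      <td>min</td>\n",
369 | "      <td>0</td>\n",
370 | "      <td>1</td>\n",
371 | "      <td>-4287.63</td>\n",
372 | "    </tr>\n",
373 | "    <tr>\n",
374 | "      <th>4</th>\n",
375 | "      <td>max</td>\n",
376 | "      <td>373</td>\n",
377 | "      <td>3710</td>\n",
378 | "      <td>1447682.12</td>\n",
379 | "    </tr>\n",
380 | "    <tr>\n",
381 | "      <th>5</th>\n",
382 | "      <td>25%</td>\n",
383 | "      <td>16.0</td>\n",
384 | "      <td>1.0</td>\n",
385 | "      <td>293.45</td>\n",
386 | "    </tr>\n",
387 | "    <tr>\n",
388 | "      <th>6</th>\n",
389 | "      <td>50%</td>\n",
390 | "      <td>50.0</td>\n",
391 | "      <td>3.0</td>\n",
392 | "      <td>648.41</td>\n",
393 | "    </tr>\n",
394 | "    <tr>\n",
395 | "      <th>7</th>\n",
396 | "      <td>75%</td>\n",
397 | "      <td>143.0</td>\n",
398 | "      <td>5.0</td>\n",
399 | "      <td>1612.13</td>\n",
400 | "    </tr>\n",
401 | "  </tbody>\n",
402 | "</table>\n",
403 | "</div>"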
404 | ]
405 | },
406 | "metadata": {
407 | "application/vnd.databricks.v1+output": {
408 | "addedWidgets": {},
409 | "arguments": {},
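410 | "data": "<div>\n<style scoped>\n    .dataframe tbody tr th:only-of-type {\n        vertical-align: middle;\n    }\n\n    .dataframe tbody tr th {\n        vertical-align: top;\n    }\n\n    .dataframe thead th {\n        text-align: right;\n    }\n</style>\n<table border=\"1\" class=\"dataframe\">\n  <thead>\n    <tr style=\"text-align: right;\">\n      <th></th>\n      <th>summary</th>\n      <th>Recency</th>\n      <th>Frequency</th>\n      <th>Monetary</th>\n    </tr>\n  </thead>\n  <tbody>\n    <tr>\n      <th>0</th>\n      <td>count</td>\n      <td>4373</td>\n      <td>4373</td>\n      <td>4373</td>\n    </tr>\n    <tr>\n      <th>1</th>\n      <td>mean</td>\n      <td>91.56025611708209</td>\n      <td>5.9227075234392865</td>\n      <td>2229.0756757374743</td>\n    </tr>\n    <tr>\n      <th>2</th>\n      <td>stddev</td>\n      <td>100.7701307562583</td>\n      <td>56.79881324857276</td>\n      <td>23356.82678007453</td>\n    </tr>\n    <tr>\n      <th>3</th>\n      <td>min</td>\n      <td>0</td>\n      <td>1</td>\n      <td>-4287.63</td>\n    </tr>\n    <tr>\n      <th>4</th>\n      <td>max</td>\n      <td>373</td>\n      <td>3710</td>\n      <td>1447682.12</td>\n    </tr>\n    <tr>\n      <th>5</th>\n      <td>25%</td>\n      <td>16.0</td>\n      <td>1.0</td>\n      <td>293.45</td>\n    </tr>\n    <tr>\n      <th>6</th>\n      <td>50%</td>\n      <td>50.0</td>\n      <td>3.0</td>\n      <td>648.41</td>\n    </tr>\n    <tr>\n      <th>7</th>\n      <td>75%</td>\n      <td>143.0</td>\n      <td>5.0</td>\n      <td>1612.13</td>\n    </tr>\n  </tbody>\n</table>\n</div>",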
411 | "datasetInfos": [],
412 | "metadata": {},
413 | "removedWidgets": [],
414 | "textData": null,
415 | "type": "htmlSandbox"
416 | }
417 | },
418 | "output_type": "display_data"
419 | }
420 | ],
421 | "source": [
422 | "cols = ['Recency', 'Frequency', 'Monetary']\n",
423 | "describe_pd(rfm, cols, 1)"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 0,
429 | "metadata": {
430 | "application/vnd.databricks.v1+cell": {
431 | "cellMetadata": {
432 | "byteLimit": 2048000,
433 | "rowLimit": 10000
434 | },
435 | "inputWidgets": {},
436 | "nuid": "0e8498c0-a7a3-47d5-8d87-2fba3a40458c",
437 | "showTitle": false,
438 | "title": ""
439 | }
440 | },
441 | "outputs": [],
442 | "source": [
443 | "def RScore(x):\n",
444 | " if x <= 16:\n",
445 | " return 1\n",
446 | " elif x<= 50:\n",
447 | " return 2\n",
448 | " elif x <= 143:\n",
449 | " return 3\n",
450 | " else:\n",
451 | " return 4\n",
452 | "\n",
453 | "def FScore(x):\n",
454 | " if x <= 1:\n",
455 | " return 4\n",
456 | " elif x <= 3:\n",
457 | " return 3\n",
458 | " elif x <= 5:\n",
459 | " return 2\n",
460 | " else:\n",
461 | " return 1\n",
462 | " \n",
463 | "def MScore(x):\n",
464 | " if x <= 293:\n",
465 | " return 4\n",
466 | " elif x <= 648:\n",
467 | " return 3\n",
468 | " elif x <= 1612:\n",
469 | " return 2\n",
470 | " else:\n",
471 | " return 1\n",
472 | " \n",
473 | "from pyspark.sql.functions import udf\n",
474 | "from pyspark.sql.types import StringType, DoubleType\n",
475 | "\n",
476 | "R_udf = udf(lambda x : RScore(x), StringType())\n",
477 | "F_udf = udf(lambda x : FScore(x), StringType())\n",
478 | "M_udf = udf(lambda x : MScore(x), StringType())"
479 | ]
480 | },
481 | {
482 | "cell_type": "code",
483 | "execution_count": 0,
484 | "metadata": {
485 | "application/vnd.databricks.v1+cell": {
486 | "cellMetadata": {
487 | "byteLimit": 2048000,
488 | "rowLimit": 10000
489 | },
490 | "inputWidgets": {},
491 | "nuid": "d0df6142-7497-402b-a259-9b1cb5a389a4",
492 | "showTitle": false,
493 | "title": ""
494 | }
495 | },
496 | "outputs": [
497 | {
498 | "output_type": "stream",
499 | "name": "stdout",
500 | "output_type": "stream",
501 | "text": [
502 | "+----------+-------+---------+----------+-----+-----+-----+--------+\n|CustomerID|Recency|Frequency| Monetary|r_seg|f_seg|m_seg|RFMScore|\n+----------+-------+---------+----------+-----+-----+-----+--------+\n| 0| 0| 3710|1447682.12| 1| 1| 1| 111|\n| 13004| 11| 22| 5613.43| 1| 1| 1| 111|\n| 17602| 2| 8| 5050.77| 1| 1| 1| 111|\n| 13098| 1| 41| 28658.88| 1| 1| 1| 111|\n| 13924| 1| 11| 1682.08| 1| 1| 1| 111|\n| 13658| 9| 7| 2421.47| 1| 1| 1| 111|\n| 15061| 3| 55| 54228.74| 1| 1| 1| 111|\n| 15838| 11| 21| 33350.76| 1| 1| 1| 111|\n| 15194| 3| 22| 7521.17| 1| 1| 1| 111|\n| 14415| 1| 18| 5811.56| 1| 1| 1| 111|\n| 13798| 1| 63| 36351.42| 1| 1| 1| 111|\n| 15993| 8| 10| 2756.82| 1| 1| 1| 111|\n| 14178| 8| 7| 1620.93| 1| 1| 1| 111|\n| 17949| 1| 52| 52750.84| 1| 1| 1| 111|\n| 14329| 8| 14| 4928.74| 1| 1| 1| 111|\n| 14825| 3| 12| 2226.91| 1| 1| 1| 111|\n| 14215| 11| 7| 1777.92| 1| 1| 1| 111|\n| 12683| 4| 20| 8221.09| 1| 1| 1| 111|\n| 14543| 3| 21| 2916.17| 1| 1| 1| 111|\n| 13230| 4| 15| 2763.41| 1| 1| 1| 111|\n+----------+-------+---------+----------+-----+-----+-----+--------+\nonly showing top 20 rows\n\n"
503 | ]
504 | }
505 | ],
506 | "source": [
507 | "rfm_seg = rfm.withColumn(\"r_seg\", R_udf(\"Recency\"))\n",
508 | "rfm_seg = rfm_seg.withColumn(\"f_seg\", F_udf(\"Frequency\"))\n",
509 | "rfm_seg = rfm_seg.withColumn(\"m_seg\", M_udf(\"Monetary\"))\n",
510 | "rfm_seg = rfm_seg.withColumn(\"RFMScore\", F.concat(F.col('r_seg'), F.col('f_seg'), F.col('m_seg')))\n",
511 | "\n",
512 | "rfm_seg.sort(F.col('RFMScore')).show(20)"
513 | ]
514 | },
515 | {
516 | "cell_type": "code",
517 | "execution_count": 0,
518 | "metadata": {
519 | "application/vnd.databricks.v1+cell": {
520 | "cellMetadata": {
521 | "byteLimit": 2048000,
522 | "rowLimit": 10000
523 | },
524 | "inputWidgets": {},
525 | "nuid": "69d49f7d-a504-4b3b-b0fd-b44de363d046",
526 | "showTitle": false,
527 | "title": ""
528 | }
529 | },
530 | "outputs": [
531 | {
532 | "output_type": "stream",
533 | "name": "stdout",
534 | "output_type": "stream",
535 | "text": [
536 | "+--------+-----------------+------------------+------------------+\n|RFMScore| avg(Recency)| avg(Monetary)| avg(Frequency)|\n+--------+-----------------+------------------+------------------+\n| 111|6.022680412371134|11795.596288659783|26.492783505154637|\n| 112|7.237113402061856|1223.3604123711343| 7.752577319587629|\n| 113| 8.0|505.97749999999996| 7.5|\n| 114| 11.0| 191.17| 8.0|\n| 121|6.472727272727273|2569.0619999999994| 4.636363636363637|\n+--------+-----------------+------------------+------------------+\nonly showing top 5 rows\n\n"
537 | ]
538 | }
539 | ],
540 | "source": [
541 | "rfm_seg.groupBy('RFMScore').agg({'Recency': 'mean', 'Frequency' : 'mean', 'Monetary' : 'mean'}).sort(F.col('RFMScore')).show(5)"
542 | ]
543 | },
544 | {
545 | "cell_type": "code",
546 | "execution_count": 0,
547 | "metadata": {
548 | "application/vnd.databricks.v1+cell": {
549 | "cellMetadata": {
550 | "byteLimit": 2048000,
551 | "rowLimit": 10000
552 | },
553 | "inputWidgets": {},
554 | "nuid": "34da44d3-d725-48d5-87dc-2989e0302c94",
555 | "showTitle": false,
556 | "title": ""
557 | }
558 | },
559 | "outputs": [
560 | {
561 | "output_type": "display_data",
562 | "data": {
563 | "text/plain": [
564 | "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m\n",
565 | "\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)\n",
566 | "File \u001B[0;32m:5\u001B[0m\n",
567 | "\u001B[1;32m 3\u001B[0m grp \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRFMScore\u001B[39m\u001B[38;5;124m'\u001B[39m\n",
568 | "\u001B[1;32m 4\u001B[0m num_cols \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRecency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mFrequency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mMonetary\u001B[39m\u001B[38;5;124m'\u001B[39m]\n",
569 | "\u001B[0;32m----> 5\u001B[0m rfm_seg\u001B[38;5;241m.\u001B[39mtoPandas()\u001B[38;5;241m.\u001B[39mto_csv(output_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mquantile_grouped.csv\u001B[39m\u001B[38;5;124m'\u001B[39m)\n",
570 | "\n",
571 | "\u001B[0;31mNameError\u001B[0m: name 'output_dir' is not defined"
572 | ]
573 | },
574 | "metadata": {
575 | "application/vnd.databricks.v1+output": {
576 | "arguments": {},
577 | "data": "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m\n\u001B[0;31mNameError\u001B[0m Traceback (most recent call last)\nFile \u001B[0;32m:5\u001B[0m\n\u001B[1;32m 3\u001B[0m grp \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRFMScore\u001B[39m\u001B[38;5;124m'\u001B[39m\n\u001B[1;32m 4\u001B[0m num_cols \u001B[38;5;241m=\u001B[39m [\u001B[38;5;124m'\u001B[39m\u001B[38;5;124mRecency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mFrequency\u001B[39m\u001B[38;5;124m'\u001B[39m, \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mMonetary\u001B[39m\u001B[38;5;124m'\u001B[39m]\n\u001B[0;32m----> 5\u001B[0m rfm_seg\u001B[38;5;241m.\u001B[39mtoPandas()\u001B[38;5;241m.\u001B[39mto_csv(output_dir \u001B[38;5;241m+\u001B[39m \u001B[38;5;124m'\u001B[39m\u001B[38;5;124mquantile_grouped.csv\u001B[39m\u001B[38;5;124m'\u001B[39m)\n\n\u001B[0;31mNameError\u001B[0m: name 'output_dir' is not defined",
578 | "errorSummary": "NameError: name 'output_dir' is not defined",
579 | "errorTraceType": "ansi",
580 | "metadata": {},
581 | "type": "ipynbError"
582 | }
583 | },
584 | "output_type": "display_data"
585 | }
586 | ],
587 | "source": [
588 | "#Detailed summary\n",
589 | "\n",
590 | "grp = 'RFMScore'\n",
591 | "num_cols = ['Recency', 'Frequency', 'Monetary']\n",
592 | "df_myinput = rfm_seg\n",
593 | "\n",
594 | "quantile_grouped = quantile_agg(df_myinput, grp, num_cols)\n",
595 | "quantile_grouped.toPandas().to_csv(output_dir + 'quantile_grouped.csv')"
596 | ]
597 | },
598 | {
599 | "cell_type": "code",
600 | "execution_count": 0,
601 | "metadata": {
602 | "application/vnd.databricks.v1+cell": {
603 | "cellMetadata": {},
604 | "inputWidgets": {},
605 | "nuid": "1151a35d-85ff-4fc4-80dc-e9bd051df73d",
606 | "showTitle": false,
607 | "title": ""
608 | }
609 | },
610 | "outputs": [],
611 | "source": []
612 | }
613 | ],
614 | "metadata": {
615 | "application/vnd.databricks.v1+notebook": {
616 | "dashboards": [],
617 | "language": "python",
618 | "notebookMetadata": {
619 | "pythonIndentUnit": 4
620 | },
621 | "notebookName": "11072023 DEB v2",
622 | "widgets": {}
623 | }
624 | },
625 | "nbformat": 4,
626 | "nbformat_minor": 0
627 | }
628 |
--------------------------------------------------------------------------------
/25042023 - DEB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 2,
6 | "id": "311f3e23",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "import numpy as np"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 3,
16 | "id": "a4aeeb7b",
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "p7 = np.arange(10, 200, 11)"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 4,
26 | "id": "373f96e4",
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "np.savetxt('mydata/1904DEB.csv', p7, delimiter = ',')"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 5,
36 | "id": "2a3ae98b",
37 | "metadata": {},
38 | "outputs": [],
39 | "source": [
40 | "p8 = np.arange(0, 121).reshape(11, 11)"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 6,
46 | "id": "e0c4a649",
47 | "metadata": {},
48 | "outputs": [
49 | {
50 | "data": {
51 | "text/plain": [
52 | "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
53 | " [ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],\n",
54 | " [ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],\n",
55 | " [ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],\n",
56 | " [ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],\n",
57 | " [ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65],\n",
58 | " [ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76],\n",
59 | " [ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],\n",
60 | " [ 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],\n",
61 | " [ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109],\n",
62 | " [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]])"
63 | ]
64 | },
65 | "execution_count": 6,
66 | "metadata": {},
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
71 | "p8"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 7,
77 | "id": "82abc61a",
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "np.save('test2504.npy', p8)"
82 | ]
83 | },
84 | {
85 | "cell_type": "code",
86 | "execution_count": 8,
87 | "id": "50fbf18a",
88 | "metadata": {},
89 | "outputs": [
90 | {
91 | "data": {
92 | "text/plain": [
93 | "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
94 | " [ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],\n",
95 | " [ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],\n",
96 | " [ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],\n",
97 | " [ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],\n",
98 | " [ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65],\n",
99 | " [ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76],\n",
100 | " [ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],\n",
101 | " [ 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],\n",
102 | " [ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109],\n",
103 | " [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]])"
104 | ]
105 | },
106 | "execution_count": 8,
107 | "metadata": {},
108 | "output_type": "execute_result"
109 | }
110 | ],
111 | "source": [
112 | "p9 = np.load('test2504.npy')\n",
113 | "p9"
114 | ]
115 | },
116 | {
117 | "cell_type": "code",
118 | "execution_count": 9,
119 | "id": "c529193c",
120 | "metadata": {},
121 | "outputs": [],
122 | "source": [
123 | "np.save('numpyfile2504', p8)"
124 | ]
125 | },
126 | {
127 | "cell_type": "code",
128 | "execution_count": 10,
129 | "id": "2edeafee",
130 | "metadata": {},
131 | "outputs": [
132 | {
133 | "data": {
134 | "text/plain": [
135 | "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n",
136 | " [ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21],\n",
137 | " [ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32],\n",
138 | " [ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43],\n",
139 | " [ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54],\n",
140 | " [ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65],\n",
141 | " [ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76],\n",
142 | " [ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87],\n",
143 | " [ 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98],\n",
144 | " [ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109],\n",
145 | " [110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120]])"
146 | ]
147 | },
148 | "execution_count": 10,
149 | "metadata": {},
150 | "output_type": "execute_result"
151 | }
152 | ],
153 | "source": [
154 | "p10 = np.load('numpyfile2504.npy')\n",
155 | "p10"
156 | ]
157 | },
158 | {
159 | "cell_type": "code",
160 | "execution_count": 11,
161 | "id": "c12ffdee",
162 | "metadata": {},
163 | "outputs": [
164 | {
165 | "data": {
166 | "text/plain": [
167 | "array([[ 0, 1, 2, ..., 997, 998, 999],\n",
168 | " [ 1000, 1001, 1002, ..., 1997, 1998, 1999],\n",
169 | " [ 2000, 2001, 2002, ..., 2997, 2998, 2999],\n",
170 | " ...,\n",
171 | " [997000, 997001, 997002, ..., 997997, 997998, 997999],\n",
172 | " [998000, 998001, 998002, ..., 998997, 998998, 998999],\n",
173 | " [999000, 999001, 999002, ..., 999997, 999998, 999999]])"
174 | ]
175 | },
176 | "execution_count": 11,
177 | "metadata": {},
178 | "output_type": "execute_result"
179 | }
180 | ],
181 | "source": [
182 | "p11 = np.arange(0, 1000000).reshape(1000, 1000)\n",
183 | "p11"
184 | ]
185 | },
186 | {
187 | "cell_type": "code",
188 | "execution_count": 12,
189 | "id": "ec42190b",
190 | "metadata": {},
191 | "outputs": [],
192 | "source": [
193 | "np.savez_compressed('mytest2504.npz', p11)"
194 | ]
195 | },
196 | {
197 | "cell_type": "code",
198 | "execution_count": 13,
199 | "id": "50a1bbb8",
200 | "metadata": {},
201 | "outputs": [],
202 | "source": [
203 | "np.save('mytest2504-1.npy', p11)"
204 | ]
205 | },
206 | {
207 | "cell_type": "code",
208 | "execution_count": 14,
209 | "id": "da4b8726",
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "from IPython.display import Image"
214 | ]
215 | },
216 | {
217 | "cell_type": "code",
218 | "execution_count": 15,
219 | "id": "7fbb404b",
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "image/png": "iVBORw0KGgoAAAANSUhEUgAABJgAAABLCAIAAACz06p1AAAAAXNSR0IArs4c6QAAAARnQU1BAACxjwv8YQUAAAAJcEhZcwAADsMAAA7DAcdvqGQAABIjSURBVHhe7d1Nax1XmsBxfZ18G6GQrTaCTm/shdNtBFp45FYQyAxaNbORYkbCITLZWExrko4Dg9MoCU4nluKEQXScGUVx1A4MZsAwNALPea3zPOfUqXuvpWuprv4/Lm7dejlVp64W9aeu0lM//PDDPwAAAAAAfWAK7uXLl4QcAAAAAPQGIQcAAAAAPUPIAQAAAEDPEHIAAAAA0DP9DrnnP+7/9eFfq6//fPri/8KWAAAAADAxeh1y+7fnZueuXLv2u/J1ZW7WmvvDv/03LQcAAABgsvQ85GZnbz8KbzS36i/7t6/N0XIAAAAAJsxEh5xZ9ZyWAwAAADBpxhhyR0dHa2tr8/Pz1+tWV1cfPWpPsSEMEXJGaLmPn9JyAAAAACbCGENuZWVlc3Pz4ODADF6zs7Njcu7Jkydhn9EMCLk//uX58/9xr6OPb83O3d4P6wAAAACg10xMjSXk9vb2FhYWTk5OzOjd1tbW7t27F3YbTUfIPf34D/4/d5Is/vvTsHIMHq5vbx2Fn1/F0e5v1r8NP5uz/9P2G1fvhpdYbqhV9nX/YVhj/Ly1FJcv7bbP1hzo6t3lr8O7LnZLOTgAAMAk+nFjempq8bPwTnuwWFvVtdeFUp8CTu8zc3WnN34M7zR75c21fxDeSocbb05Nvblx6N+536XM9GZY2WFcIffll18uLy/7VOu2tbX1wQcfhN1G0xFymacf/9M4Q+7r+29cPU3IuQCLweZSrRnt22VTZaLlHq7naRfJQdSAQii9IULOHZeQAwAAE+/Vkqw3IXe2XIHcMG1iK2WY2JhoPtW6Qm76zekUbNHhpgs3HXLqd8ktGXh5CbmzcJYhZ3/+zZ9+dj87avBibSM7B/s8LT+l5mnegJCzQ/kne4QcAACYdIQcXsmDGzbHBoTcjcXpfAOzfHHR7NsRcn5wG8xdCDmXN0u7D9NXFm3/pG8wxu8ouiXF9xjXv01bXk2JZZ+bhYW6plIjNSP7Z1/+1RZOKsnMxnmeeX4WYoZF8vlxvh701Up3hmbHYr6a3ez+QzGdZkx/JuJ6NoPYmdYbFQAA4Dz42+hN+68nbqntvbh/a5+i3Niwz6MMc4etb779PX186x5bNfTteHga49QeucRCsPQ2tZH9edp/g3xVSy24M2m++Cf2LZ4gSZXztyc2vfkgnZ4epDprd1bBoG65WOyXKhcfDPpqpb8maspmlxsP7EdMyJ1NyJneCA/EYlbJt/7n7K/L5FsdJK7iYr241GnbLH8KF3/OqaAS4eReKbSKr1xmY8a32Szqhgi5VLlyyuF6tq2yP4vazN4CAACcg9AS8Xbc3po3d9WxgmKKpDtycfPtuqu5m2++fOi4zZq93JaqndQtvlO/xe8Y2WdY9xT07uqt2z2u0iepqJmqE3CjNQcdctblCJ0NeZHEWQwOuUP7myPmZa6GuUr1T9mxS9o/AomQ8+FRyTAVGyqNir6KIxSl1IxQT6OOkBMlGRupGVyeqv1ZjaDHtGfotjzLkBMP08Sw2fVUD+LUXsUDOgAAgNdPV4chbrKbCsqeX6Wbb7dc3Mrb5fLOPu8ldb/eIsstYdDIYgrtB1VTiOdv5FPLR4vELp7YMT/t1muYUTFjFONfWHbi/syHCDn9wZmF9oqVIVeoDZsQcnmx2LeVp0ZiyzKTQp+09I+OKFliUS3k3PKOmhIt1BlydrNw0LMMObE2Czn9nE2cWy3qAAAAzom6z3bs3bnvE11BRXXY/5RFrT3k3bkvnK6b/sTVkSHLSitHzmOpUo9uxzSdSoMZNjP0Eqs8/zSgHaQ9hquzjp2T5INcUHbWcUbDhJyYqb3s7sKWIad/i+wu6vetDSGXF0vWITLkUhFlOZSHnK01
/WrGtyPEhU3GtIbcwIoz0o4dIadWnXPIpbX6wgIAAJwTeV/uDR1y5v582lSQii7XRZ69ZReZlIYdxG4ZpYPWRxbn6VRXxUKTG4hhpbBWKM9/2JBrnbVrlcKFDzk9Uzu7gSGnLrv/LAaFXPfIASGXF0vWIVlv+CbJIycPue768nzR+S3LkBum4oy0Y3bafpV79iXTUbzsXmpVVncDJjJCyOnZ2S3NtbKH5nuVAADg/BUhJ55WDQg5t8oFSZM9+f23SKYhbs1zdpfa7mJkXWtdq+wgiw9UOciNO5Xnny6dHaQecq2zTp3TJ+7SldomIiaYLntoWkLuHELOB8yyeMRkiZBTPztF2EQhacxPWchVK654imVzKARYdtw0uCaKq9upQk4dWpyk5Qpz/X776QEAALxmxW20uMlOFVQPOR9+4Z67/cGdz6TW+/VBmgG7Rh4+5PzbG/afJt1UVFh5lQXF+buJ+3HqIVeddRmQ2an2QVdu2emEa+Iugv0ibpzvcCHX+iQzGVfI7e3tLSwsnJyc+FrrsLa2du/evbDbaM4j5OJTLNVCOqLsLs2AeeGkA4mRVcip3TNutPggy+2Vzk2+VQMqrynk0okVFzDfAAAA4Dz5Impux+0NdHNXndKitaPiZq5J/Fq1u1tuxB3t7Xu6Oy9LxsgWihjoGjlLIDlIXkfuHFoKqlki8izn9o0XysdJiDd7xPaQC3u1zVqNoHbpjSFDLn5ezQehJtsScnLfqnGFnLGysrK5uXlwcGAGr9nZ2bl+/fqTJ0/CPqOxITd35dq13w1+/XZudvHPv4b9tKxY7FvRGGWHtIWWCycTJyrGwktWU2iY8BKD2C4yS7a39mwRiW3CKw3ikiksz1sonkbLqugUIaeuzICQu7/VzLQ8E7cx36sEAAAXgr2Nnt74zN5MO/K+PFVQZ8iFyvJ33i6EArNEZ4xaq2uqESPNE9vUR85qrSvkdBA27GZRe8V58hyyI8rwyKqsOmt3GYPeVZwxbMj5K5AubBlyGXkxa0xMjSvkjo6O1tbW5ufnTarVrK6uPno01DO1Vk93t27/6+2hXh98+rfnYa/TqT/sglMWYE4/wAQAALhczI37+UVLVhTorzGG3GSyT5OIkC4DQ84+rqSEAQDAZWYfwpxLTeWPztBfhNzw2v46DoWOkHOr+Os4AABwiTXfo2v/XuUYua9insNxMSaEHAAAAAD0DCEHAAAAAD1DyAEAAABAzxByAAAAANAzhBwAAAAA9AwhBwAAAAA9Q8gBAAAAQM8QcgAAAADQM4QcAAAAAPQMIQcAAAAAPUPIAQAAAEDPpJADAAAAAPRFCDnzPwAAAACAi4+QAwAAAICeIeQAAAAAoGcIOQAAAADoGUIOAAAAAHqm3yH34qfvHn39qPr627N/nIQtAQAAAGBi9DrkvtuYm5278s47vy9fV+ZmrbmlnZ9pOQAAAACTpechNzu78W14o7lVn3+3cW2OlgMAAAAwYSY65MyqF7QcAAAAgEkzxpA7Pj5eX1+fn5+/Xre6urq/vx92GNkQIWeElvvkGS0HAAAAYCKMMeRu3bq1ubl5cHBgBq/Z2dkxOXd4eBj2Gc2AkPuXz1+8+F/3Ov7kn2fnNr4P6wAAAACg10xMmX/PPuQeP368sLBwcjL4Kdja2tr29nZ4M5qOkHv2yZL/z50kN//8LKwEAAAAgD4bV8h99dVXy8vL4U2nLSe8GU1HyGWefbI43pD75r3tD4/Dz6/i+Iu330tPDH/9aPuNq3fDSyw31Cr7+vSbsMb4+4fvxuXvfvFrWKiZA129u7IX3gEAAODs7C5NTc28/0t4p/xy563aqk4/3ZmZmlr6PLyr+eV9u9VueCfZU/K7796cmnrrzuhnMKE+XxpwNcwGUf7BuQ8luNl21S175Ssfiv1lSEeXo0XD/KoQcmdh79M3rp4m5FyAxWBzqdaM9v2KqTLRct+8l6ddJAdRAwqh9Ag5AACAMegIuVd12pBLziLkXIHY
dBnDTF8n304dV0NedvezmKycu6u19pZzm701Ux7FfVh5yKmPOD9iO0LuLJxlyNmf3/7o7+5nRw1erG1k52CfvOWn1DzNI+QAAADGYOJDbhKEjpIpVciuld0lvs0vtX1wV30WOnNzaWZq5s5PYZFjli8tyfHbPmJ7AtVnfQEh5/Lm3S++SV9ZtP2TvsEYv6PolhTfY3zv+7Tl1ZRY9rlZWKhryuZWXBVGds/cwkuOH6kkMxvneeb5WYivUxbJ58fZ6/hqpd/lezsvdYaGPcmVvXSqaWQ5o/giFAEAwKXkQ+6O+dcTUSe+Wum+1HfH3KkbrS3h7uy9pff1Xb5YZTTLdV3Y04hv7c9+MxUnfpzsbdTRjamCWma3a5/XeXpelb30cQd1y5nwZ2ImOFLWyo1HCzl3TdSUzfY3d8vPIrvmhFxjUMiZ/AgPxGKryLf+5+yvy+Rb/TTMVVxMMtc5bZuFDix+zqmAzKsphV/xlctszPg2m4XitmnWui1jsIXL0rZKciPU/jwPAABgwvmCijfl7gY93sTrkDNqt+lqr5YBmzt+VxThaY+oC7dLSpS2kPP51JxAfdiMHaFZVcwuO8+mXtxeKjLDqnKEocvq9FRKddPXR03BX+32jzJsZq+nOJA5rhlKHT0f3y9pjUOFkPOlVMkw/zaUiUqjoq/iCEUpNSOoXZQsuiRRkrE5m8HlqQ4IOXuGbsuBIScGEXNXp2G0zqU+QQAAgMsgv60XfZWFXHspGXlguOrzd/lZEsgAiAcqc8iekt8mjtx2kpVhlWJ5Njs5pphFOoFMPtPacccjP3o7d62MfMsQrkYsupLd166182o+brPQXjF1dDfxQvU3pEHI5flh34pnSiJm5JZlJoWQa4kZHVGyxKJayLnlXWlk+8o/GesMObuZfJjWEXLyOVsWcmovO06qX8s9LayMDAAAcBkU0ZJu4rOQqz1vEZt5bXnjCirwq9ySmRlbF9nI6ZRcPMzYPSsBUw6rlP2Zzi0/7RQq1WqNnZMUcx+n4UIuEMmq0tqfc2WcZoJppnYcl7tlyOkLbncZeHqE3Aghl3omy6E85Gyt6Vczvnu0FV5NCLWG3MCKM9KOHSGnVo0r5OwGbV+2BAAAuDxSNQUjh1yZN2JMd8fv2SUiAEKD+U5T3/RLu9t4iNuoQ9SHVcrTTlt2h1zrZF2rFPTcx2ikkPNn687NlZu8wrXLJT5Keyy7i903fRZdIdcRwAkhN1LIhSjKdhnwRK6dLzq/ZRlyw1SckXbMTltUmUxH8VKH80YJOTHlcBrq6AAAAJeQiC4v3Y4PGXJ5Ecm7/Lw9xKr0yMgOLs9Bh5zb3W2cIqFjWKVMC7tlMTsnjVkNktQ55yKf9QAq5PRp1yYilvtP3F6u8Lmro7de8Op1S8YVco8fP15YWDg5OQnv69bW1ra3t8Ob0ZxDyPnvSa64nAtLDFk1qnCsIrGi9FAr9Zjj3rZVXH4ysq+y45ZfffRO8UROTllOKruAAAAAl1V+Ty++kjdkyBWBYTf2d/l5QvincD4AsgOJEewpZSHnF8bHSl3DKkVvtM/OSceqZaE/z8rDw9egM+SKc0uzGP60xS+D293+f8rFHYcLueoviTeukDNu3bq1ubl5cHBgBq/Z2dm5fv364eFh2Gc0NuTmrrzzzu8Hv347N3vz0+dhP220kIsPuOqPp9wuzYAinLIDiZFVyKndM2602FrZczD5Vg2onCbkmh3lIHbutQEBAAAuFRdIzU25v30PeSNSp/seXe2lBrR3/82ObjPDbymaqmWEtHsTD6kPu4bNuC3jY6La7Bx5LDW+DCE1gj698RtwOHF9wjm3Xbp8lSJCzm+W9tJHd9ehWeXIfatMTJl/xxJyx8fH6+vr8/PzJtVqVldX9/f3ww6je/blhxubG0O97v7Hf70Ie2VGDLnW0HLhJJ5ZuW3CS0aOO1azSgzicsjW4He2kcQ24ZUGcREVlusTS6fRsirKQ05+Z3JAyL390Rc+58xL
7hKOKF5yEAAAgEvDV1OoL0PcnYvUGfiwJdaUof9/5EIPOCao3IFcFKmQ850QisufUlyYksMP5XepDlvyz+u89tk5WSbJvdTIYqaVHBqXMuTsEnluLtiC7GoMddr2MjbXpOUDanaUo0UDK84YY8hNKJcurQ+7JpkPOfIMAAAAuBAIuRHZJ1ptf3s24Qg5AAAA4AIh5Ian/07sciHkAAAAgAuEkAMAAACAniHkAAAAAKBnCDkAAAAA6BlCDgAAAAB6hpADAAAAgJ4h5AAAAACgZwg5AAAAAOgZQg4AAAAAeoaQAwAAAICeIeQAAAAAoGdswb18+f8sLLlPuNPebAAAAABJRU5ErkJggg==\n",
225 | "text/plain": [
226 | ""
227 | ]
228 | },
229 | "execution_count": 15,
230 | "metadata": {},
231 | "output_type": "execute_result"
232 | }
233 | ],
234 | "source": [
235 | "Image(filename = 'karsilastirma.PNG')"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 16,
241 | "id": "ef8ffab5",
242 | "metadata": {},
243 | "outputs": [],
244 | "source": [
245 | "#printing options"
246 | ]
247 | },
248 | {
249 | "cell_type": "code",
250 | "execution_count": 17,
251 | "id": "d97193dd",
252 | "metadata": {},
253 | "outputs": [
254 | {
255 | "data": {
256 | "text/plain": [
257 | "array([12.6544, 90.7864])"
258 | ]
259 | },
260 | "execution_count": 17,
261 | "metadata": {},
262 | "output_type": "execute_result"
263 | }
264 | ],
265 | "source": [
266 | "np.set_printoptions(precision = 4)\n",
267 | "a = np.array([12.654398765, 90.7864098354674])\n",
268 | "a"
269 | ]
270 | },
271 | {
272 | "cell_type": "code",
273 | "execution_count": 18,
274 | "id": "299df070",
275 | "metadata": {},
276 | "outputs": [
277 | {
278 | "data": {
279 | "text/plain": [
280 | "array([12.65, 90.79])"
281 | ]
282 | },
283 | "execution_count": 18,
284 | "metadata": {},
285 | "output_type": "execute_result"
286 | }
287 | ],
288 | "source": [
289 | "np.set_printoptions(precision = 2)\n",
290 | "a = np.array([12.654398765, 90.7864098354674])\n",
291 | "a"
292 | ]
293 | },
294 | {
295 | "cell_type": "code",
296 | "execution_count": 19,
297 | "id": "ccec6685",
298 | "metadata": {},
299 | "outputs": [
300 | {
301 | "data": {
302 | "text/plain": [
303 | "array([13., 91.])"
304 | ]
305 | },
306 | "execution_count": 19,
307 | "metadata": {},
308 | "output_type": "execute_result"
309 | }
310 | ],
311 | "source": [
312 | "np.set_printoptions(precision = 0)\n",
313 | "a = np.array([12.654398765, 90.7864098354674])\n",
314 | "a"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 22,
320 | "id": "92078b13",
321 | "metadata": {},
322 | "outputs": [
323 | {
324 | "data": {
325 | "text/plain": [
326 | "array([ 0, 1, 2, ..., 197, 198, 199])"
327 | ]
328 | },
329 | "execution_count": 22,
330 | "metadata": {},
331 | "output_type": "execute_result"
332 | }
333 | ],
334 | "source": [
335 | "np.set_printoptions(threshold = 10)\n",
336 | "np.arange(200)"
337 | ]
338 | },
339 | {
340 | "cell_type": "code",
341 | "execution_count": 23,
342 | "id": "709ecd91",
343 | "metadata": {},
344 | "outputs": [
345 | {
346 | "data": {
347 | "text/plain": [
348 | "array([12.65439876, 90.78640984])"
349 | ]
350 | },
351 | "execution_count": 23,
352 | "metadata": {},
353 | "output_type": "execute_result"
354 | }
355 | ],
356 | "source": [
357 | "np.set_printoptions(precision = 8, suppress = False, threshold = 1000, formatter = None)\n",
358 | "a = np.array([12.654398765, 90.7864098354674])\n",
359 | "a"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 24,
365 | "id": "641b0b8b",
366 | "metadata": {},
367 | "outputs": [
368 | {
369 | "name": "stdout",
370 | "output_type": "stream",
371 | "text": [
372 | "[20 42 72]\n"
373 | ]
374 | }
375 | ],
376 | "source": [
377 | "a1 = [5, 6, 8]\n",
378 | "a2 = [4, 7, 9]\n",
379 | "print(np.multiply(a1, a2))"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 27,
385 | "id": "7a5cd367",
386 | "metadata": {},
387 | "outputs": [
388 | {
389 | "name": "stdout",
390 | "output_type": "stream",
391 | "text": [
392 | "Dot product - 134\n",
393 | "Dot product using np.dot - 134\n",
394 | "Dot product using np.inner - 134\n",
395 | "Dot product using np.multiply & sum - 134\n",
396 | "Dot product using np.matmul - 134\n",
397 | "Dot product using for loop - 134\n"
398 | ]
399 | }
400 | ],
401 | "source": [
402 | "a1 = np.array([5, 6, 8])\n",
403 | "a2 = np.array([4, 7, 9])\n",
404 | "\n",
405 | "#Dot product\n",
406 | "dotp = a1@a2\n",
407 | "print(\"Dot product - \", dotp)\n",
408 | "\n",
409 | "dotp = np.dot(a1, a2)\n",
410 | "print(\"Dot product using np.dot - \", dotp)\n",
411 | "\n",
412 | "dotp = np.inner(a1, a2)\n",
413 | "print(\"Dot product using np.inner - \", dotp)\n",
414 | "\n",
415 | "dotp = sum(np.multiply(a1, a2))\n",
416 | "print(\"Dot product using np.multiply & sum - \", dotp)\n",
417 | "\n",
418 | "dotp = np.matmul(a1, a2)\n",
419 | "print(\"Dot product using np.matmul - \", dotp)\n",
420 | "\n",
421 | "dotp = 0\n",
422 | "for i in range(len(a1)):\n",
423 | " dotp = dotp + a1[i]*a2[i]\n",
424 | "print(\"Dot product using for loop - \", dotp)"
425 | ]
426 | },
427 | {
428 | "cell_type": "code",
429 | "execution_count": 28,
430 | "id": "5b0752b7",
431 | "metadata": {},
432 | "outputs": [
433 | {
434 | "data": {
435 | "text/plain": [
436 | "9.539392014169456"
437 | ]
438 | },
439 | "execution_count": 28,
440 | "metadata": {},
441 | "output_type": "execute_result"
442 | }
443 | ],
444 | "source": [
445 | "v3 = np.array([1,2,3,4,5,6])\n",
446 | "length = np.sqrt(np.dot(v3, v3))\n",
447 | "length"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": 29,
453 | "id": "e484f99f",
454 | "metadata": {},
455 | "outputs": [
456 | {
457 | "data": {
458 | "text/plain": [
459 | "array([0.5547002 , 0.83205029])"
460 | ]
461 | },
462 | "execution_count": 29,
463 | "metadata": {},
464 | "output_type": "execute_result"
465 | }
466 | ],
467 | "source": [
468 | "v1 = [2,3]\n",
469 | "norm_v1 = v1 / np.linalg.norm(v1)\n",
470 | "norm_v1"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 30,
476 | "id": "97b57e42",
477 | "metadata": {},
478 | "outputs": [
479 | {
480 | "data": {
481 | "text/plain": [
482 | "(3.605551275463989, array([0.5547002 , 0.83205029]))"
483 | ]
484 | },
485 | "execution_count": 30,
486 | "metadata": {},
487 | "output_type": "execute_result"
488 | }
489 | ],
490 | "source": [
491 | "v1 = [2,3]\n",
492 | "length_v1 = np.sqrt(np.dot(v1,v1))\n",
493 | "norm_v1 = v1 / length_v1\n",
494 | "length_v1, norm_v1"
495 | ]
496 | },
497 | {
498 | "cell_type": "code",
499 | "execution_count": 31,
500 | "id": "440809b3",
501 | "metadata": {},
502 | "outputs": [],
503 | "source": [
504 | "#Matrices"
505 | ]
506 | },
507 | {
508 | "cell_type": "code",
509 | "execution_count": 32,
510 | "id": "70a2991d",
511 | "metadata": {},
512 | "outputs": [
513 | {
514 | "data": {
515 | "text/plain": [
516 | "array([[805, 768, 403, 305, 725, 540, 179, 230, 984, 325],\n",
517 | " [433, 723, 319, 21, 873, 774, 732, 618, 806, 908],\n",
518 | " [442, 457, 533, 491, 44, 171, 64, 838, 35, 491],\n",
519 | " [832, 455, 586, 443, 159, 810, 423, 110, 796, 89],\n",
520 | " [957, 966, 438, 463, 665, 8, 394, 141, 960, 945],\n",
521 | " [ 38, 463, 719, 142, 421, 35, 561, 737, 728, 245],\n",
522 | " [861, 68, 486, 298, 180, 358, 709, 896, 932, 793],\n",
523 | " [494, 869, 472, 267, 37, 950, 168, 303, 41, 577],\n",
524 | " [983, 88, 321, 659, 569, 423, 785, 651, 591, 982],\n",
525 | " [276, 77, 193, 898, 157, 944, 924, 811, 314, 340]])"
526 | ]
527 | },
528 | "execution_count": 32,
529 | "metadata": {},
530 | "output_type": "execute_result"
531 | }
532 | ],
533 | "source": [
534 | "mat1 = np.random.randint(0, 1000, 100).reshape(10, 10)\n",
535 | "mat1"
536 | ]
537 | },
538 | {
539 | "cell_type": "code",
540 | "execution_count": 33,
541 | "id": "272bdafc",
542 | "metadata": {},
543 | "outputs": [
544 | {
545 | "data": {
546 | "text/plain": [
547 | "array([805, 768, 725, 540, 984, 723, 873, 774, 732, 618, 806, 908, 533,\n",
548 | " 838, 832, 586, 810, 796, 957, 966, 665, 960, 945, 719, 561, 737,\n",
549 | " 728, 861, 709, 896, 932, 793, 869, 950, 577, 983, 659, 569, 785,\n",
550 | " 651, 591, 982, 898, 944, 924, 811])"
551 | ]
552 | },
553 | "execution_count": 33,
554 | "metadata": {},
555 | "output_type": "execute_result"
556 | }
557 | ],
558 | "source": [
559 | "mat1[mat1 > 500]"
560 | ]
561 | },
562 | {
563 | "cell_type": "code",
564 | "execution_count": 34,
565 | "id": "abb62282",
566 | "metadata": {},
567 | "outputs": [
568 | {
569 | "data": {
570 | "text/plain": [
571 | "array([[1., 0., 0., 0., 0., 0., 0., 0., 0.],\n",
572 | " [0., 1., 0., 0., 0., 0., 0., 0., 0.],\n",
573 | " [0., 0., 1., 0., 0., 0., 0., 0., 0.],\n",
574 | " [0., 0., 0., 1., 0., 0., 0., 0., 0.],\n",
575 | " [0., 0., 0., 0., 1., 0., 0., 0., 0.],\n",
576 | " [0., 0., 0., 0., 0., 1., 0., 0., 0.],\n",
577 | " [0., 0., 0., 0., 0., 0., 1., 0., 0.],\n",
578 | " [0., 0., 0., 0., 0., 0., 0., 1., 0.],\n",
579 | " [0., 0., 0., 0., 0., 0., 0., 0., 1.]])"
580 | ]
581 | },
582 | "execution_count": 34,
583 | "metadata": {},
584 | "output_type": "execute_result"
585 | }
586 | ],
587 | "source": [
588 | "I = np.eye(9)\n",
589 | "I"
590 | ]
591 | },
592 | {
593 | "cell_type": "code",
594 | "execution_count": 35,
595 | "id": "deec1e09",
596 | "metadata": {},
597 | "outputs": [
598 | {
599 | "data": {
600 | "text/plain": [
601 | "array([[1, 0, 0, 0, 0, 0, 0, 0],\n",
602 | " [0, 2, 0, 0, 0, 0, 0, 0],\n",
603 | " [0, 0, 3, 0, 0, 0, 0, 0],\n",
604 | " [0, 0, 0, 4, 0, 0, 0, 0],\n",
605 | " [0, 0, 0, 0, 5, 0, 0, 0],\n",
606 | " [0, 0, 0, 0, 0, 6, 0, 0],\n",
607 | " [0, 0, 0, 0, 0, 0, 7, 0],\n",
608 | " [0, 0, 0, 0, 0, 0, 0, 8]])"
609 | ]
610 | },
611 | "execution_count": 35,
612 | "metadata": {},
613 | "output_type": "execute_result"
614 | }
615 | ],
616 | "source": [
617 | "D = np.diag([1,2,3,4,5,6,7,8])\n",
618 | "D"
619 | ]
620 | },
621 | {
622 | "cell_type": "code",
623 | "execution_count": 36,
624 | "id": "83769873",
625 | "metadata": {},
626 | "outputs": [
627 | {
628 | "name": "stdout",
629 | "output_type": "stream",
630 | "text": [
631 | "[[ 1.68035435 0.80688763 0.40287747 1.6854456 -1.32882951]\n",
632 | " [ 0.84596645 -1.65468951 0.00817708 3.19750108 -1.22710021]\n",
633 | " [-0.77196501 0.15724556 0.99325008 -0.23573474 -0.33010289]\n",
634 | " [ 1.40566255 -1.8546334 -0.19371157 -1.05308007 -1.92281742]\n",
635 | " [-2.09141583 0.2703231 -0.47696008 -1.30183295 -1.01514764]]\n"
636 | ]
637 | }
638 | ],
639 | "source": [
640 | "M = np.random.randn(5, 5)\n",
641 | "print(M) # entries are samples from the standard normal distribution (mean 0, variance 1)"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 38,
647 | "id": "f803591e",
648 | "metadata": {},
649 | "outputs": [
650 | {
651 | "name": "stdout",
652 | "output_type": "stream",
653 | "text": [
654 | "[[ 1.68035435 0.80688763 0.40287747 1.6854456 -1.32882951]\n",
655 | " [ 0. -1.65468951 0.00817708 3.19750108 -1.22710021]\n",
656 | " [ 0. 0. 0.99325008 -0.23573474 -0.33010289]\n",
657 | " [ 0. 0. 0. -1.05308007 -1.92281742]\n",
658 | " [ 0. 0. 0. 0. -1.01514764]]\n",
659 | "\n",
660 | "\n",
661 | "[[ 1.68035435 0. 0. 0. 0. ]\n",
662 | " [ 0.84596645 -1.65468951 0. 0. 0. ]\n",
663 | " [-0.77196501 0.15724556 0.99325008 0. 0. ]\n",
664 | " [ 1.40566255 -1.8546334 -0.19371157 -1.05308007 0. ]\n",
665 | " [-2.09141583 0.2703231 -0.47696008 -1.30183295 -1.01514764]]\n"
666 | ]
667 | }
668 | ],
669 | "source": [
670 | "U = np.triu(M)\n",
671 | "L = np.tril(M)\n",
672 | "print(U)\n",
673 | "print('\\n')\n",
674 | "print(L)"
675 | ]
676 | },
677 | {
678 | "cell_type": "code",
679 | "execution_count": 39,
680 | "id": "9d9c0b6b",
681 | "metadata": {},
682 | "outputs": [
683 | {
684 | "data": {
685 | "text/plain": [
686 | "array([[8, 8, 8, 8, 8],\n",
687 | " [8, 8, 8, 8, 8],\n",
688 | " [8, 8, 8, 8, 8],\n",
689 | " [8, 8, 8, 8, 8],\n",
690 | " [8, 8, 8, 8, 8]])"
691 | ]
692 | },
693 | "execution_count": 39,
694 | "metadata": {},
695 | "output_type": "execute_result"
696 | }
697 | ],
698 | "source": [
699 | "np.full((5,5), 8)"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": 40,
705 | "id": "42bf2701",
706 | "metadata": {},
707 | "outputs": [
708 | {
709 | "data": {
710 | "text/plain": [
711 | "array([[17.32618265, 15.23235106, 13.93764801, 14.99742616, 17.67296604],\n",
712 | " [18.42662307, 14.39780883, 12.88474242, 16.76242604, 17.03254757],\n",
713 | " [12.73783671, 11.1772208 , 11.40142099, 15.55447744, 16.50116265],\n",
714 | " [19.4232284 , 17.93541716, 11.25968528, 15.97049384, 17.20164701],\n",
715 | " [19.91417029, 12.71935513, 17.96887011, 18.39930945, 13.90331467]])"
716 | ]
717 | },
718 | "execution_count": 40,
719 | "metadata": {},
720 | "output_type": "execute_result"
721 | }
722 | ],
723 | "source": [
724 | "np.random.uniform(10, 20, size = (5,5))"
725 | ]
726 | },
727 | {
728 | "cell_type": "code",
729 | "execution_count": 67,
730 | "id": "d62f900b",
731 | "metadata": {},
732 | "outputs": [
733 | {
734 | "data": {
735 | "text/plain": [
736 | "array([[13, 16, 11, 16, 19],\n",
737 | " [14, 12, 15, 18, 17],\n",
738 | " [11, 19, 12, 17, 18],\n",
739 | " [19, 10, 10, 17, 19],\n",
740 | " [16, 18, 17, 12, 16]])"
741 | ]
742 | },
743 | "execution_count": 67,
744 | "metadata": {},
745 | "output_type": "execute_result"
746 | }
747 | ],
748 | "source": [
749 | "B = np.random.uniform(10, 20, size = (5,5))\n",
750 | "my = B.astype(int)\n",
751 | "my"
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": 66,
757 | "id": "9ec23679",
758 | "metadata": {},
759 | "outputs": [
760 | {
761 | "data": {
762 | "text/plain": [
763 | "numpy.ndarray"
764 | ]
765 | },
766 | "execution_count": 66,
767 | "metadata": {},
768 | "output_type": "execute_result"
769 | }
770 | ],
771 | "source": [
772 | "type(my)"
773 | ]
774 | },
775 | {
776 | "cell_type": "code",
777 | "execution_count": 47,
778 | "id": "fdc82c04",
779 | "metadata": {},
780 | "outputs": [
781 | {
782 | "data": {
783 | "text/plain": [
784 | "array([[ 1, 2, 3, 4],\n",
785 | " [ 5, 6, 7, 8],\n",
786 | " [10, 11, 12, 13],\n",
787 | " [14, 15, 16, 17]])"
788 | ]
789 | },
790 | "execution_count": 47,
791 | "metadata": {},
792 | "output_type": "execute_result"
793 | }
794 | ],
795 | "source": [
796 | "A = np.array([[1,2,3,4], [5,6,7,8], [10,11,12,13], [14,15,16,17]])\n",
797 | "A"
798 | ]
799 | },
800 | {
801 | "cell_type": "code",
802 | "execution_count": 48,
803 | "id": "131fca35",
804 | "metadata": {},
805 | "outputs": [
806 | {
807 | "data": {
808 | "text/plain": [
809 | "array([ 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17])"
810 | ]
811 | },
812 | "execution_count": 48,
813 | "metadata": {},
814 | "output_type": "execute_result"
815 | }
816 | ],
817 | "source": [
818 | "A.flatten()"
819 | ]
820 | },
821 | {
822 | "cell_type": "code",
823 | "execution_count": 49,
824 | "id": "55e11d79",
825 | "metadata": {},
826 | "outputs": [
827 | {
828 | "data": {
829 | "text/plain": [
830 | "array([ 1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17])"
831 | ]
832 | },
833 | "execution_count": 49,
834 | "metadata": {},
835 | "output_type": "execute_result"
836 | }
837 | ],
838 | "source": [
839 | "A.ravel()"
840 | ]
841 | },
842 | {
843 | "cell_type": "code",
844 | "execution_count": 68,
845 | "id": "4a854506",
846 | "metadata": {},
847 | "outputs": [
848 | {
849 | "data": {
850 | "text/plain": [
851 | "array([13, 16, 11, 16, 19, 14, 12, 15, 18, 17, 11, 19, 12, 17, 18, 19, 10,\n",
852 | " 10, 17, 19, 16, 18, 17, 12, 16])"
853 | ]
854 | },
855 | "execution_count": 68,
856 | "metadata": {},
857 | "output_type": "execute_result"
858 | }
859 | ],
860 | "source": [
861 | "my.flatten()"
862 | ]
863 | },
864 | {
865 | "cell_type": "code",
866 | "execution_count": 69,
867 | "id": "842e8c86",
868 | "metadata": {},
869 | "outputs": [
870 | {
871 | "data": {
872 | "text/plain": [
873 | "array([13, 16, 11, 16, 19, 14, 12, 15, 18, 17, 11, 19, 12, 17, 18, 19, 10,\n",
874 | " 10, 17, 19, 16, 18, 17, 12, 16])"
875 | ]
876 | },
877 | "execution_count": 69,
878 | "metadata": {},
879 | "output_type": "execute_result"
880 | }
881 | ],
882 | "source": [
883 | "my.ravel()"
884 | ]
885 | },
886 | {
887 | "cell_type": "code",
888 | "execution_count": 70,
889 | "id": "0ad41fd7",
890 | "metadata": {},
891 | "outputs": [
892 | {
893 | "data": {
894 | "text/plain": [
895 | "array([[1, 2],\n",
896 | " [4, 5]])"
897 | ]
898 | },
899 | "execution_count": 70,
900 | "metadata": {},
901 | "output_type": "execute_result"
902 | }
903 | ],
904 | "source": [
905 | "M1 = np.array([[1,2], [4,5]])\n",
906 | "M1"
907 | ]
908 | },
909 | {
910 | "cell_type": "code",
911 | "execution_count": 71,
912 | "id": "4274913c",
913 | "metadata": {},
914 | "outputs": [
915 | {
916 | "data": {
917 | "text/plain": [
918 | "array([[ 57, 78],\n",
919 | " [156, 213]])"
920 | ]
921 | },
922 | "execution_count": 71,
923 | "metadata": {},
924 | "output_type": "execute_result"
925 | }
926 | ],
927 | "source": [
928 | "M1@M1@M1"
929 | ]
930 | },
931 | {
932 | "cell_type": "code",
933 | "execution_count": 72,
934 | "id": "d8a968af",
935 | "metadata": {},
936 | "outputs": [
937 | {
938 | "data": {
939 | "text/plain": [
940 | "array([[ 57, 78],\n",
941 | " [156, 213]])"
942 | ]
943 | },
944 | "execution_count": 72,
945 | "metadata": {},
946 | "output_type": "execute_result"
947 | }
948 | ],
949 | "source": [
950 | "np.linalg.matrix_power(M1, 3)"
951 | ]
952 | },
953 | {
954 | "cell_type": "code",
955 | "execution_count": null,
956 | "id": "0a581ea3",
957 | "metadata": {},
958 | "outputs": [],
959 | "source": [
960 | "#Tensor"
961 | ]
962 | }
963 | ],
964 | "metadata": {
965 | "kernelspec": {
966 | "display_name": "Python 3",
967 | "language": "python",
968 | "name": "python3"
969 | },
970 | "language_info": {
971 | "codemirror_mode": {
972 | "name": "ipython",
973 | "version": 3
974 | },
975 | "file_extension": ".py",
976 | "mimetype": "text/x-python",
977 | "name": "python",
978 | "nbconvert_exporter": "python",
979 | "pygments_lexer": "ipython3",
980 | "version": "3.8.8"
981 | }
982 | },
983 | "nbformat": 4,
984 | "nbformat_minor": 5
985 | }
986 |
--------------------------------------------------------------------------------
/25062023 - DEB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "id": "9005df46",
7 | "metadata": {},
8 | "outputs": [
9 | {
10 | "name": "stdout",
11 | "output_type": "stream",
12 | "text": [
13 | "Requirement already satisfied: findspark in c:\\users\\itu\\anaconda3\\lib\\site-packages (2.0.1)\n"
14 | ]
15 | }
16 | ],
17 | "source": [
18 | "!pip install findspark"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 2,
24 | "id": "8dfeefaa",
25 | "metadata": {},
26 | "outputs": [],
27 | "source": [
28 | "import findspark\n",
29 | "findspark.init()"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": 3,
35 | "id": "6b6dce5d",
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "from pyspark.sql import SparkSession\n",
40 | "from pyspark.conf import SparkConf"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 5,
46 | "id": "fdfdb2fe",
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "spark = SparkSession.builder \\\n",
51 | ".master(\"local[4]\") \\\n",
52 | ".appName(\"giveatry\") \\\n",
53 | ".getOrCreate()"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 7,
59 | "id": "400c0c9e",
60 | "metadata": {},
61 | "outputs": [],
62 | "source": [
63 | "sc = spark.sparkContext"
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 9,
69 | "id": "2c88fde8",
70 | "metadata": {},
71 | "outputs": [],
72 | "source": [
73 | "veri_seti = \"C:\\\\Users\\\\ITU\\\\mydata\\\\ibb_lojistik.txt\""
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 10,
79 | "id": "4b999354",
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "istac_rdd = sc.textFile(veri_seti)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 11,
89 | "id": "1eb7a9e2",
90 | "metadata": {},
91 | "outputs": [
92 | {
93 | "data": {
94 | "text/plain": [
95 | "97"
96 | ]
97 | },
98 | "execution_count": 11,
99 | "metadata": {},
100 | "output_type": "execute_result"
101 | }
102 | ],
103 | "source": [
104 | "istac_rdd.count()"
105 | ]
106 | },
107 | {
108 | "cell_type": "code",
109 | "execution_count": 13,
110 | "id": "16290c98",
111 | "metadata": {},
112 | "outputs": [
113 | {
114 | "data": {
115 | "text/plain": [
116 | "['İlçeler bazında firmaların kullandıkları ortalama alanlara bakıldığında ',\n",
117 | " 'en fazla alan kullanımının Ümraniye, Çatalca, Silivri, Şile, Arnavutköy gibi',\n",
118 | " 'İstanbul’un alan bakımından daha büyük ve yerleşim alanlarının daha kısıtlı olduğu',\n",
119 | " 'ilçeleri ön plana çıkmaktadır. Bunda arazi fiyatlarının daha düşük olması, karayolu, ',\n",
120 | " 'demiryolu ve kısmen de olsa denizyolu ana bağlantı noktalarına yakınlık, kullanılabilir ']"
121 | ]
122 | },
123 | "execution_count": 13,
124 | "metadata": {},
125 | "output_type": "execute_result"
126 | }
127 | ],
128 | "source": [
129 | "istac_rdd.take(5)"
130 | ]
131 | },
132 | {
133 | "cell_type": "code",
134 | "execution_count": 18,
135 | "id": "55fca24e",
136 | "metadata": {},
137 | "outputs": [],
138 | "source": [
139 | "istac_rdd_kelimeler = istac_rdd.flatMap(lambda satir : satir.split(\" \"))"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 20,
145 | "id": "1ca8904e",
146 | "metadata": {},
147 | "outputs": [
148 | {
149 | "data": {
150 | "text/plain": [
151 | "['İlçeler',\n",
152 | " '',\n",
153 | " 'bazında',\n",
154 | " '',\n",
155 | " 'firmaların',\n",
156 | " '',\n",
157 | " 'kullandıkları',\n",
158 | " '',\n",
159 | " 'ortalama',\n",
160 | " '',\n",
161 | " 'alanlara',\n",
162 | " '',\n",
163 | " 'bakıldığında',\n",
164 | " '',\n",
165 | " '',\n",
166 | " 'en',\n",
167 | " '',\n",
168 | " 'fazla',\n",
169 | " '',\n",
170 | " 'alan']"
171 | ]
172 | },
173 | "execution_count": 20,
174 | "metadata": {},
175 | "output_type": "execute_result"
176 | }
177 | ],
178 | "source": [
179 | "istac_rdd_kelimeler.take(20)"
180 | ]
181 | },
182 | {
183 | "cell_type": "code",
184 | "execution_count": 22,
185 | "id": "f3c69479",
186 | "metadata": {},
187 | "outputs": [],
188 | "source": [
189 | "istac_rdd_kelimeler_sayilari = istac_rdd_kelimeler.map(lambda kelime : (kelime, 1))"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 25,
195 | "id": "62840d06",
196 | "metadata": {},
197 | "outputs": [
198 | {
199 | "data": {
200 | "text/plain": [
201 | "[('İlçeler', 1),\n",
202 | " ('', 1),\n",
203 | " ('bazında', 1),\n",
204 | " ('', 1),\n",
205 | " ('firmaların', 1),\n",
206 | " ('', 1),\n",
207 | " ('kullandıkları', 1),\n",
208 | " ('', 1),\n",
209 | " ('ortalama', 1),\n",
210 | " ('', 1),\n",
211 | " ('alanlara', 1),\n",
212 | " ('', 1),\n",
213 | " ('bakıldığında', 1),\n",
214 | " ('', 1),\n",
215 | " ('', 1),\n",
216 | " ('en', 1),\n",
217 | " ('', 1),\n",
218 | " ('fazla', 1),\n",
219 | " ('', 1),\n",
220 | " ('alan', 1)]"
221 | ]
222 | },
223 | "execution_count": 25,
224 | "metadata": {},
225 | "output_type": "execute_result"
226 | }
227 | ],
228 | "source": [
229 | "istac_rdd_kelimeler_sayilari.take(20)"
230 | ]
231 | },
232 | {
233 | "cell_type": "code",
234 | "execution_count": 26,
235 | "id": "f45015bd",
236 | "metadata": {},
237 | "outputs": [],
238 | "source": [
239 | "istac_rdd_kelimeler_sayilari_reduce = istac_rdd_kelimeler_sayilari.reduceByKey(lambda x, y : (x + y))"
240 | ]
241 | },
242 | {
243 | "cell_type": "code",
244 | "execution_count": 27,
245 | "id": "3cf53d29",
246 | "metadata": {},
247 | "outputs": [
248 | {
249 | "data": {
250 | "text/plain": [
251 | "[('', 112),\n",
252 | " ('bazında', 3),\n",
253 | " ('kullandıkları', 1),\n",
254 | " ('ortalama', 3),\n",
255 | " ('en', 4),\n",
256 | " ('fazla', 3),\n",
257 | " ('alan', 6),\n",
258 | " ('Çatalca,', 4),\n",
259 | " ('Arnavutköy', 3),\n",
260 | " ('İstanbul’un', 5),\n",
261 | " ('daha', 6),\n",
262 | " ('ve', 42),\n",
263 | " ('alanlarının', 4),\n",
264 | " ('ilçeleri', 3),\n",
265 | " ('plana', 3),\n",
266 | " ('Bunda', 1),\n",
267 | " ('fiyatlarının', 1),\n",
268 | " ('düşük', 1),\n",
269 | " ('karayolu,', 1),\n",
270 | " ('demiryolu', 1)]"
271 | ]
272 | },
273 | "execution_count": 27,
274 | "metadata": {},
275 | "output_type": "execute_result"
276 | }
277 | ],
278 | "source": [
279 | "istac_rdd_kelimeler_sayilari_reduce.take(20)"
280 | ]
281 | },
282 | {
283 | "cell_type": "code",
284 | "execution_count": 28,
285 | "id": "73df3ede",
286 | "metadata": {},
287 | "outputs": [],
288 | "source": [
289 | "istac_rdd_kelimeler_sayilari_reduce_sort = istac_rdd_kelimeler_sayilari_reduce.map(lambda x: (x[1], x[0]))"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": 29,
295 | "id": "0a20e465",
296 | "metadata": {},
297 | "outputs": [
298 | {
299 | "data": {
300 | "text/plain": [
301 | "[(112, ''),\n",
302 | " (3, 'bazında'),\n",
303 | " (1, 'kullandıkları'),\n",
304 | " (3, 'ortalama'),\n",
305 | " (4, 'en'),\n",
306 | " (3, 'fazla'),\n",
307 | " (6, 'alan'),\n",
308 | " (4, 'Çatalca,'),\n",
309 | " (3, 'Arnavutköy'),\n",
310 | " (5, 'İstanbul’un'),\n",
311 | " (6, 'daha'),\n",
312 | " (42, 've'),\n",
313 | " (4, 'alanlarının'),\n",
314 | " (3, 'ilçeleri'),\n",
315 | " (3, 'plana'),\n",
316 | " (1, 'Bunda'),\n",
317 | " (1, 'fiyatlarının'),\n",
318 | " (1, 'düşük'),\n",
319 | " (1, 'karayolu,'),\n",
320 | " (1, 'demiryolu')]"
321 | ]
322 | },
323 | "execution_count": 29,
324 | "metadata": {},
325 | "output_type": "execute_result"
326 | }
327 | ],
328 | "source": [
329 | "istac_rdd_kelimeler_sayilari_reduce_sort.take(20)"
330 | ]
331 | },
332 | {
333 | "cell_type": "code",
334 | "execution_count": 30,
335 | "id": "1066fed8",
336 | "metadata": {},
337 | "outputs": [
338 | {
339 | "data": {
340 | "text/plain": [
341 | "[(112, ''),\n",
342 | " (42, 've'),\n",
343 | " (12, 'lojistik'),\n",
344 | " (11, 'olarak'),\n",
345 | " (10, 'bir'),\n",
346 | " (9, 'yük'),\n",
347 | " (9, 'gibi'),\n",
348 | " (8, 'ile'),\n",
349 | " (6, 'alan'),\n",
350 | " (6, 'daha'),\n",
351 | " (6, 'Lojistik'),\n",
352 | " (6, 'ilçeler'),\n",
353 | " (6, 'alanları'),\n",
354 | " (6, 'bu'),\n",
355 | " (5, 'İstanbul’un'),\n",
356 | " (5, 'depolama'),\n",
357 | " (5, 'kentsel'),\n",
358 | " (5, 'Şile,'),\n",
359 | " (5, 'Ana'),\n",
360 | " (5, 'İstanbul')]"
361 | ]
362 | },
363 | "execution_count": 30,
364 | "metadata": {},
365 | "output_type": "execute_result"
366 | }
367 | ],
368 | "source": [
369 | "istac_rdd_kelimeler_sayilari_reduce_sort.sortByKey(False).take(20)"
370 | ]
371 | },
372 | {
373 | "cell_type": "code",
374 | "execution_count": null,
375 | "id": "f9c53d29",
376 | "metadata": {},
377 | "outputs": [],
378 | "source": []
379 | }
380 | ],
381 | "metadata": {
382 | "kernelspec": {
383 | "display_name": "Python 3",
384 | "language": "python",
385 | "name": "python3"
386 | },
387 | "language_info": {
388 | "codemirror_mode": {
389 | "name": "ipython",
390 | "version": 3
391 | },
392 | "file_extension": ".py",
393 | "mimetype": "text/x-python",
394 | "name": "python",
395 | "nbconvert_exporter": "python",
396 | "pygments_lexer": "ipython3",
397 | "version": "3.8.8"
398 | }
399 | },
400 | "nbformat": 4,
401 | "nbformat_minor": 5
402 | }
403 |
--------------------------------------------------------------------------------
/27042023 - DEB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 3,
6 | "id": "5aca6e0d",
7 | "metadata": {},
8 | "outputs": [],
9 | "source": [
10 | "#Tensörler\n",
11 | "import numpy as np"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 6,
17 | "id": "f2ab7891",
18 | "metadata": {},
19 | "outputs": [
20 | {
21 | "data": {
22 | "text/plain": [
23 | "array([[[ 1, 2, 3],\n",
24 | " [ 4, 5, 6],\n",
25 | " [ 7, 8, 9]],\n",
26 | "\n",
27 | " [[ 10, 20, 30],\n",
28 | " [ 40, 50, 60],\n",
29 | " [ 70, 80, 90]],\n",
30 | "\n",
31 | " [[100, 200, 300],\n",
32 | " [400, 500, 600],\n",
33 | " [700, 800, 900]]])"
34 | ]
35 | },
36 | "execution_count": 6,
37 | "metadata": {},
38 | "output_type": "execute_result"
39 | }
40 | ],
41 | "source": [
42 | "T1 = np.array([\n",
43 | " [[1,2,3], [4,5,6], [7,8,9]],\n",
44 | " [[10,20,30], [40,50,60], [70,80,90]],\n",
45 | " [[100, 200, 300], [400, 500, 600], [700, 800, 900]],\n",
46 | "])\n",
47 | "\n",
48 | "T1"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 5,
54 | "id": "f2ad73b3",
55 | "metadata": {},
56 | "outputs": [
57 | {
58 | "data": {
59 | "text/plain": [
60 | "array([[[0, 0, 0],\n",
61 | " [0, 0, 0],\n",
62 | " [0, 0, 0]],\n",
63 | "\n",
64 | " [[1, 1, 1],\n",
65 | " [1, 1, 1],\n",
66 | " [1, 1, 1]],\n",
67 | "\n",
68 | " [[2, 2, 2],\n",
69 | " [2, 2, 2],\n",
70 | " [2, 2, 2]]])"
71 | ]
72 | },
73 | "execution_count": 5,
74 | "metadata": {},
75 | "output_type": "execute_result"
76 | }
77 | ],
78 | "source": [
79 | "T2= np.array([\n",
80 | " [[0,0,0], [0,0,0], [0,0,0]],\n",
81 | " [[1,1,1], [1,1,1], [1,1,1]],\n",
82 | " [[2,2,2], [2,2,2], [2,2,2]],\n",
83 | " \n",
84 | "])\n",
85 | "\n",
86 | "T2"
87 | ]
88 | },
89 | {
90 | "cell_type": "code",
91 | "execution_count": 7,
92 | "id": "e6180340",
93 | "metadata": {},
94 | "outputs": [
95 | {
96 | "data": {
97 | "text/plain": [
98 | "array([[[ 1, 2, 3],\n",
99 | " [ 4, 5, 6],\n",
100 | " [ 7, 8, 9]],\n",
101 | "\n",
102 | " [[ 11, 21, 31],\n",
103 | " [ 41, 51, 61],\n",
104 | " [ 71, 81, 91]],\n",
105 | "\n",
106 | " [[102, 202, 302],\n",
107 | " [402, 502, 602],\n",
108 | " [702, 802, 902]]])"
109 | ]
110 | },
111 | "execution_count": 7,
112 | "metadata": {},
113 | "output_type": "execute_result"
114 | }
115 | ],
116 | "source": [
117 | "A = T1 + T2\n",
118 | "A"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 8,
124 | "id": "bbb6a509",
125 | "metadata": {},
126 | "outputs": [
127 | {
128 | "data": {
129 | "text/plain": [
130 | "array([[[ 1, 2, 3],\n",
131 | " [ 4, 5, 6],\n",
132 | " [ 7, 8, 9]],\n",
133 | "\n",
134 | " [[ 11, 21, 31],\n",
135 | " [ 41, 51, 61],\n",
136 | " [ 71, 81, 91]],\n",
137 | "\n",
138 | " [[102, 202, 302],\n",
139 | " [402, 502, 602],\n",
140 | " [702, 802, 902]]])"
141 | ]
142 | },
143 | "execution_count": 8,
144 | "metadata": {},
145 | "output_type": "execute_result"
146 | }
147 | ],
148 | "source": [
149 | "np.add(T1, T2)"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 9,
155 | "id": "353fed0a",
156 | "metadata": {},
157 | "outputs": [
158 | {
159 | "data": {
160 | "text/plain": [
161 | "array([[[ 1, 2, 3],\n",
162 | " [ 4, 5, 6],\n",
163 | " [ 7, 8, 9]],\n",
164 | "\n",
165 | " [[ 9, 19, 29],\n",
166 | " [ 39, 49, 59],\n",
167 | " [ 69, 79, 89]],\n",
168 | "\n",
169 | " [[ 98, 198, 298],\n",
170 | " [398, 498, 598],\n",
171 | " [698, 798, 898]]])"
172 | ]
173 | },
174 | "execution_count": 9,
175 | "metadata": {},
176 | "output_type": "execute_result"
177 | }
178 | ],
179 | "source": [
180 | "np.subtract(T1, T2)"
181 | ]
182 | },
183 | {
184 | "cell_type": "code",
185 | "execution_count": 10,
186 | "id": "c06fb5f2",
187 | "metadata": {},
188 | "outputs": [
189 | {
190 | "data": {
191 | "text/plain": [
192 | "array([[[ 0, 0, 0],\n",
193 | " [ 0, 0, 0],\n",
194 | " [ 0, 0, 0]],\n",
195 | "\n",
196 | " [[ 10, 20, 30],\n",
197 | " [ 40, 50, 60],\n",
198 | " [ 70, 80, 90]],\n",
199 | "\n",
200 | " [[ 200, 400, 600],\n",
201 | " [ 800, 1000, 1200],\n",
202 | " [1400, 1600, 1800]]])"
203 | ]
204 | },
205 | "execution_count": 10,
206 | "metadata": {},
207 | "output_type": "execute_result"
208 | }
209 | ],
210 | "source": [
211 | "np.multiply(T1, T2)"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 13,
217 | "id": "05263bb5",
218 | "metadata": {},
219 | "outputs": [],
220 | "source": [
221 | "import warnings\n",
222 | "warnings.filterwarnings('ignore')"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 14,
228 | "id": "3b4bc8e7",
229 | "metadata": {},
230 | "outputs": [
231 | {
232 | "data": {
233 | "text/plain": [
234 | "array([[[ inf, inf, inf],\n",
235 | " [ inf, inf, inf],\n",
236 | " [ inf, inf, inf]],\n",
237 | "\n",
238 | " [[ 10., 20., 30.],\n",
239 | " [ 40., 50., 60.],\n",
240 | " [ 70., 80., 90.]],\n",
241 | "\n",
242 | " [[ 50., 100., 150.],\n",
243 | " [200., 250., 300.],\n",
244 | " [350., 400., 450.]]])"
245 | ]
246 | },
247 | "execution_count": 14,
248 | "metadata": {},
249 | "output_type": "execute_result"
250 | }
251 | ],
252 | "source": [
253 | "D = T1 / T2 # np.divide(T1, T2)\n",
254 | "D"
255 | ]
256 | },
257 | {
258 | "cell_type": "code",
259 | "execution_count": 15,
260 | "id": "1f321034",
261 | "metadata": {},
262 | "outputs": [],
263 | "source": [
264 | "from scipy import linalg"
265 | ]
266 | },
267 | {
268 | "cell_type": "code",
269 | "execution_count": 16,
270 | "id": "97dabd14",
271 | "metadata": {},
272 | "outputs": [
273 | {
274 | "name": "stdout",
275 | "output_type": "stream",
276 | "text": [
277 | "[[1 3 5]\n",
278 | " [2 5 1]\n",
279 | " [2 3 8]]\n",
280 | "[[10]\n",
281 | " [ 8]\n",
282 | " [ 3]]\n"
283 | ]
284 | }
285 | ],
286 | "source": [
287 | "A = np.array([[1,3,5], [2,5,1], [2,3,8]])\n",
288 | "print(A)\n",
289 | "b = np.array([[10], [8], [3]])\n",
290 | "print(b)"
291 | ]
292 | },
293 | {
294 | "cell_type": "code",
295 | "execution_count": 17,
296 | "id": "4ea2b7e2",
297 | "metadata": {},
298 | "outputs": [
299 | {
300 | "name": "stdout",
301 | "output_type": "stream",
302 | "text": [
303 | "linalg.inv(A).dot(b) Matrix Inverse ile Denklem Takımı Çözümü\n",
304 | "[[-9.28]\n",
305 | " [ 5.16]\n",
306 | " [ 0.76]]\n"
307 | ]
308 | }
309 | ],
310 | "source": [
311 | "print(\"linalg.inv(A).dot(b) Matrix Inverse ile Denklem Takımı Çözümü\")\n",
312 | "print(linalg.inv(A).dot(b)) # slow: builds the full inverse of A before multiplying by b"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 18,
318 | "id": "ca0a4359",
319 | "metadata": {},
320 | "outputs": [
321 | {
322 | "name": "stdout",
323 | "output_type": "stream",
324 | "text": [
325 | "np.linalg.solve(A, b) ile Denklem Takımı Çözümü\n",
326 | "[[-9.28]\n",
327 | " [ 5.16]\n",
328 | " [ 0.76]]\n"
329 | ]
330 | }
331 | ],
332 | "source": [
333 | "print(\"np.linalg.solve(A, b) ile Denklem Takımı Çözümü\")\n",
334 | "print(np.linalg.solve(A, b)) # fast: solves A x = b directly, without forming the inverse"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 19,
340 | "id": "16487b46",
341 | "metadata": {},
342 | "outputs": [
343 | {
344 | "name": "stdout",
345 | "output_type": "stream",
346 | "text": [
347 | "(72.00000000000001, 7.993605777301129e-13)\n"
348 | ]
349 | }
350 | ],
351 | "source": [
352 | "from scipy import integrate\n",
353 | "x2 = lambda x : x**2\n",
354 | "I = integrate.quad(x2, 0, 6)\n",
355 | "print(I)"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": 20,
361 | "id": "62a783fd",
362 | "metadata": {},
363 | "outputs": [
364 | {
365 | "name": "stdout",
366 | "output_type": "stream",
367 | "text": [
368 | "4.999999999921734\n"
369 | ]
370 | }
371 | ],
372 | "source": [
373 | "from scipy.misc import derivative\n",
374 | "def f(x):\n",
375 | " return x**3 + x**2\n",
376 | "D = derivative(f, 1.0, dx = 1e-6)\n",
377 | "print(D)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": 21,
383 | "id": "ebab2888",
384 | "metadata": {},
385 | "outputs": [],
386 | "source": [
387 | "import pandas as pd"
388 | ]
389 | },
390 | {
391 | "cell_type": "code",
392 | "execution_count": 22,
393 | "id": "a8818866",
394 | "metadata": {},
395 | "outputs": [],
396 | "source": [
397 | "#Seriler ve Veri Çerçeveleri (DataFrame)"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": 23,
403 | "id": "0aa2bb8d",
404 | "metadata": {},
405 | "outputs": [
406 | {
407 | "data": {
408 | "text/plain": [
409 | "0 1\n",
410 | "1 2\n",
411 | "2 3\n",
412 | "3 4\n",
413 | "4 5\n",
414 | "5 6\n",
415 | "6 7\n",
416 | "dtype: int32"
417 | ]
418 | },
419 | "execution_count": 23,
420 | "metadata": {},
421 | "output_type": "execute_result"
422 | }
423 | ],
424 | "source": [
425 | "v = np.array([1,2,3,4,5,6,7])\n",
426 | "s1 = pd.Series(v)\n",
427 | "s1"
428 | ]
429 | },
430 | {
431 | "cell_type": "code",
432 | "execution_count": null,
433 | "id": "31914d3a",
434 | "metadata": {},
435 | "outputs": [],
436 | "source": []
437 | }
438 | ],
439 | "metadata": {
440 | "kernelspec": {
441 | "display_name": "Python 3",
442 | "language": "python",
443 | "name": "python3"
444 | },
445 | "language_info": {
446 | "codemirror_mode": {
447 | "name": "ipython",
448 | "version": 3
449 | },
450 | "file_extension": ".py",
451 | "mimetype": "text/x-python",
452 | "name": "python",
453 | "nbconvert_exporter": "python",
454 | "pygments_lexer": "ipython3",
455 | "version": "3.8.8"
456 | }
457 | },
458 | "nbformat": 4,
459 | "nbformat_minor": 5
460 | }
461 |
--------------------------------------------------------------------------------
/Advertising.csv:
--------------------------------------------------------------------------------
1 | "","TV","Radio","Newspaper","Sales"
2 | "1",230.1,37.8,69.2,22.1
3 | "2",44.5,39.3,45.1,10.4
4 | "3",17.2,45.9,69.3,9.3
5 | "4",151.5,41.3,58.5,18.5
6 | "5",180.8,10.8,58.4,12.9
7 | "6",8.7,48.9,75,7.2
8 | "7",57.5,32.8,23.5,11.8
9 | "8",120.2,19.6,11.6,13.2
10 | "9",8.6,2.1,1,4.8
11 | "10",199.8,2.6,21.2,10.6
12 | "11",66.1,5.8,24.2,8.6
13 | "12",214.7,24,4,17.4
14 | "13",23.8,35.1,65.9,9.2
15 | "14",97.5,7.6,7.2,9.7
16 | "15",204.1,32.9,46,19
17 | "16",195.4,47.7,52.9,22.4
18 | "17",67.8,36.6,114,12.5
19 | "18",281.4,39.6,55.8,24.4
20 | "19",69.2,20.5,18.3,11.3
21 | "20",147.3,23.9,19.1,14.6
22 | "21",218.4,27.7,53.4,18
23 | "22",237.4,5.1,23.5,12.5
24 | "23",13.2,15.9,49.6,5.6
25 | "24",228.3,16.9,26.2,15.5
26 | "25",62.3,12.6,18.3,9.7
27 | "26",262.9,3.5,19.5,12
28 | "27",142.9,29.3,12.6,15
29 | "28",240.1,16.7,22.9,15.9
30 | "29",248.8,27.1,22.9,18.9
31 | "30",70.6,16,40.8,10.5
32 | "31",292.9,28.3,43.2,21.4
33 | "32",112.9,17.4,38.6,11.9
34 | "33",97.2,1.5,30,9.6
35 | "34",265.6,20,0.3,17.4
36 | "35",95.7,1.4,7.4,9.5
37 | "36",290.7,4.1,8.5,12.8
38 | "37",266.9,43.8,5,25.4
39 | "38",74.7,49.4,45.7,14.7
40 | "39",43.1,26.7,35.1,10.1
41 | "40",228,37.7,32,21.5
42 | "41",202.5,22.3,31.6,16.6
43 | "42",177,33.4,38.7,17.1
44 | "43",293.6,27.7,1.8,20.7
45 | "44",206.9,8.4,26.4,12.9
46 | "45",25.1,25.7,43.3,8.5
47 | "46",175.1,22.5,31.5,14.9
48 | "47",89.7,9.9,35.7,10.6
49 | "48",239.9,41.5,18.5,23.2
50 | "49",227.2,15.8,49.9,14.8
51 | "50",66.9,11.7,36.8,9.7
52 | "51",199.8,3.1,34.6,11.4
53 | "52",100.4,9.6,3.6,10.7
54 | "53",216.4,41.7,39.6,22.6
55 | "54",182.6,46.2,58.7,21.2
56 | "55",262.7,28.8,15.9,20.2
57 | "56",198.9,49.4,60,23.7
58 | "57",7.3,28.1,41.4,5.5
59 | "58",136.2,19.2,16.6,13.2
60 | "59",210.8,49.6,37.7,23.8
61 | "60",210.7,29.5,9.3,18.4
62 | "61",53.5,2,21.4,8.1
63 | "62",261.3,42.7,54.7,24.2
64 | "63",239.3,15.5,27.3,15.7
65 | "64",102.7,29.6,8.4,14
66 | "65",131.1,42.8,28.9,18
67 | "66",69,9.3,0.9,9.3
68 | "67",31.5,24.6,2.2,9.5
69 | "68",139.3,14.5,10.2,13.4
70 | "69",237.4,27.5,11,18.9
71 | "70",216.8,43.9,27.2,22.3
72 | "71",199.1,30.6,38.7,18.3
73 | "72",109.8,14.3,31.7,12.4
74 | "73",26.8,33,19.3,8.8
75 | "74",129.4,5.7,31.3,11
76 | "75",213.4,24.6,13.1,17
77 | "76",16.9,43.7,89.4,8.7
78 | "77",27.5,1.6,20.7,6.9
79 | "78",120.5,28.5,14.2,14.2
80 | "79",5.4,29.9,9.4,5.3
81 | "80",116,7.7,23.1,11
82 | "81",76.4,26.7,22.3,11.8
83 | "82",239.8,4.1,36.9,12.3
84 | "83",75.3,20.3,32.5,11.3
85 | "84",68.4,44.5,35.6,13.6
86 | "85",213.5,43,33.8,21.7
87 | "86",193.2,18.4,65.7,15.2
88 | "87",76.3,27.5,16,12
89 | "88",110.7,40.6,63.2,16
90 | "89",88.3,25.5,73.4,12.9
91 | "90",109.8,47.8,51.4,16.7
92 | "91",134.3,4.9,9.3,11.2
93 | "92",28.6,1.5,33,7.3
94 | "93",217.7,33.5,59,19.4
95 | "94",250.9,36.5,72.3,22.2
96 | "95",107.4,14,10.9,11.5
97 | "96",163.3,31.6,52.9,16.9
98 | "97",197.6,3.5,5.9,11.7
99 | "98",184.9,21,22,15.5
100 | "99",289.7,42.3,51.2,25.4
101 | "100",135.2,41.7,45.9,17.2
102 | "101",222.4,4.3,49.8,11.7
103 | "102",296.4,36.3,100.9,23.8
104 | "103",280.2,10.1,21.4,14.8
105 | "104",187.9,17.2,17.9,14.7
106 | "105",238.2,34.3,5.3,20.7
107 | "106",137.9,46.4,59,19.2
108 | "107",25,11,29.7,7.2
109 | "108",90.4,0.3,23.2,8.7
110 | "109",13.1,0.4,25.6,5.3
111 | "110",255.4,26.9,5.5,19.8
112 | "111",225.8,8.2,56.5,13.4
113 | "112",241.7,38,23.2,21.8
114 | "113",175.7,15.4,2.4,14.1
115 | "114",209.6,20.6,10.7,15.9
116 | "115",78.2,46.8,34.5,14.6
117 | "116",75.1,35,52.7,12.6
118 | "117",139.2,14.3,25.6,12.2
119 | "118",76.4,0.8,14.8,9.4
120 | "119",125.7,36.9,79.2,15.9
121 | "120",19.4,16,22.3,6.6
122 | "121",141.3,26.8,46.2,15.5
123 | "122",18.8,21.7,50.4,7
124 | "123",224,2.4,15.6,11.6
125 | "124",123.1,34.6,12.4,15.2
126 | "125",229.5,32.3,74.2,19.7
127 | "126",87.2,11.8,25.9,10.6
128 | "127",7.8,38.9,50.6,6.6
129 | "128",80.2,0,9.2,8.8
130 | "129",220.3,49,3.2,24.7
131 | "130",59.6,12,43.1,9.7
132 | "131",0.7,39.6,8.7,1.6
133 | "132",265.2,2.9,43,12.7
134 | "133",8.4,27.2,2.1,5.7
135 | "134",219.8,33.5,45.1,19.6
136 | "135",36.9,38.6,65.6,10.8
137 | "136",48.3,47,8.5,11.6
138 | "137",25.6,39,9.3,9.5
139 | "138",273.7,28.9,59.7,20.8
140 | "139",43,25.9,20.5,9.6
141 | "140",184.9,43.9,1.7,20.7
142 | "141",73.4,17,12.9,10.9
143 | "142",193.7,35.4,75.6,19.2
144 | "143",220.5,33.2,37.9,20.1
145 | "144",104.6,5.7,34.4,10.4
146 | "145",96.2,14.8,38.9,11.4
147 | "146",140.3,1.9,9,10.3
148 | "147",240.1,7.3,8.7,13.2
149 | "148",243.2,49,44.3,25.4
150 | "149",38,40.3,11.9,10.9
151 | "150",44.7,25.8,20.6,10.1
152 | "151",280.7,13.9,37,16.1
153 | "152",121,8.4,48.7,11.6
154 | "153",197.6,23.3,14.2,16.6
155 | "154",171.3,39.7,37.7,19
156 | "155",187.8,21.1,9.5,15.6
157 | "156",4.1,11.6,5.7,3.2
158 | "157",93.9,43.5,50.5,15.3
159 | "158",149.8,1.3,24.3,10.1
160 | "159",11.7,36.9,45.2,7.3
161 | "160",131.7,18.4,34.6,12.9
162 | "161",172.5,18.1,30.7,14.4
163 | "162",85.7,35.8,49.3,13.3
164 | "163",188.4,18.1,25.6,14.9
165 | "164",163.5,36.8,7.4,18
166 | "165",117.2,14.7,5.4,11.9
167 | "166",234.5,3.4,84.8,11.9
168 | "167",17.9,37.6,21.6,8
169 | "168",206.8,5.2,19.4,12.2
170 | "169",215.4,23.6,57.6,17.1
171 | "170",284.3,10.6,6.4,15
172 | "171",50,11.6,18.4,8.4
173 | "172",164.5,20.9,47.4,14.5
174 | "173",19.6,20.1,17,7.6
175 | "174",168.4,7.1,12.8,11.7
176 | "175",222.4,3.4,13.1,11.5
177 | "176",276.9,48.9,41.8,27
178 | "177",248.4,30.2,20.3,20.2
179 | "178",170.2,7.8,35.2,11.7
180 | "179",276.7,2.3,23.7,11.8
181 | "180",165.6,10,17.6,12.6
182 | "181",156.6,2.6,8.3,10.5
183 | "182",218.5,5.4,27.4,12.2
184 | "183",56.2,5.7,29.7,8.7
185 | "184",287.6,43,71.8,26.2
186 | "185",253.8,21.3,30,17.6
187 | "186",205,45.1,19.6,22.6
188 | "187",139.5,2.1,26.6,10.3
189 | "188",191.1,28.7,18.2,17.3
190 | "189",286,13.9,3.7,15.9
191 | "190",18.7,12.1,23.4,6.7
192 | "191",39.5,41.1,5.8,10.8
193 | "192",75.5,10.8,6,9.9
194 | "193",17.2,4.1,31.6,5.9
195 | "194",166.8,42,3.6,19.6
196 | "195",149.7,35.6,6,17.3
197 | "196",38.2,3.7,13.8,7.6
198 | "197",94.2,4.9,8.1,9.7
199 | "198",177,9.3,6.4,12.8
200 | "199",283.6,42,66.2,25.5
201 | "200",232.1,8.6,8.7,13.4
202 |
--------------------------------------------------------------------------------
/DecisionTreeClassification.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyNmlTARULSS42Up02f/RGGx",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 21,
32 | "metadata": {
33 | "colab": {
34 | "base_uri": "https://localhost:8080/",
35 | "height": 547
36 | },
37 | "id": "GpoywBK8fUaG",
38 | "outputId": "1e47a80a-a9ee-414a-9327-f694697492ba"
39 | },
40 | "outputs": [
41 | {
42 | "output_type": "stream",
43 | "name": "stdout",
44 | "text": [
45 | "Collecting delta-spark==2.4.0\n",
46 | " Downloading delta_spark-2.4.0-py3-none-any.whl.metadata (1.9 kB)\n",
47 | "Collecting pyspark<3.5.0,>=3.4.0 (from delta-spark==2.4.0)\n",
48 | " Downloading pyspark-3.4.4.tar.gz (311.4 MB)\n",
49 | "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m311.4/311.4 MB\u001b[0m \u001b[31m3.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
50 | "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
51 | "Requirement already satisfied: importlib-metadata>=1.0.0 in /usr/local/lib/python3.11/dist-packages (from delta-spark==2.4.0) (8.5.0)\n",
52 | "Requirement already satisfied: zipp>=3.20 in /usr/local/lib/python3.11/dist-packages (from importlib-metadata>=1.0.0->delta-spark==2.4.0) (3.21.0)\n",
53 | "Requirement already satisfied: py4j==0.10.9.7 in /usr/local/lib/python3.11/dist-packages (from pyspark<3.5.0,>=3.4.0->delta-spark==2.4.0) (0.10.9.7)\n",
54 | "Downloading delta_spark-2.4.0-py3-none-any.whl (20 kB)\n",
55 | "Building wheels for collected packages: pyspark\n",
56 | " Building wheel for pyspark (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
57 | " Created wheel for pyspark: filename=pyspark-3.4.4-py2.py3-none-any.whl size=311905460 sha256=49df0176ba3e140cb0fbacc113713ea216769259423fc3b261a3c15a8a5eac5d\n",
58 | " Stored in directory: /root/.cache/pip/wheels/6b/0a/a1/2b8f5f192c7df9fdceb8e5a62873d64e46b101f980519bcf55\n",
59 | "Successfully built pyspark\n",
60 | "Installing collected packages: pyspark, delta-spark\n",
61 | " Attempting uninstall: pyspark\n",
62 | " Found existing installation: pyspark 3.5.4\n",
63 | " Uninstalling pyspark-3.5.4:\n",
64 | " Successfully uninstalled pyspark-3.5.4\n",
65 | " Attempting uninstall: delta-spark\n",
66 | " Found existing installation: delta-spark 3.3.0\n",
67 | " Uninstalling delta-spark-3.3.0:\n",
68 | " Successfully uninstalled delta-spark-3.3.0\n",
69 | "Successfully installed delta-spark-2.4.0 pyspark-3.4.4\n"
70 | ]
71 | },
72 | {
73 | "output_type": "display_data",
74 | "data": {
75 | "application/vnd.colab-display-data+json": {
76 | "pip_warning": {
77 | "packages": [
78 | "delta",
79 | "pyspark"
80 | ]
81 | },
82 | "id": "28824af9be4a403d8555baebe58b07da"
83 | }
84 | },
85 | "metadata": {}
86 | }
87 | ],
88 | "source": [
89 | "# Decision Tree Classification: install delta-spark 2.4.0 (this also pins pyspark to 3.4.x)\n",
90 | "!pip install delta-spark==2.4.0\n"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "source": [
96 | "from delta import *"
97 | ],
98 | "metadata": {
99 | "id": "VIPtM0dQgNpZ"
100 | },
101 | "execution_count": 19,
102 | "outputs": []
103 | },
104 | {
105 | "cell_type": "code",
106 | "source": [
107 | "from pyspark.sql import SparkSession\n",
108 | "\n",
109 | "# Create the Spark session with the Delta Lake package, SQL extension and catalog configured\n",
110 | "spark = SparkSession \\\n",
111 | " .builder \\\n",
112 | " .appName('DT Classification with Pyspark') \\\n",
113 | " .config(\"spark.jars.packages\", \"io.delta:delta-core_2.12:2.4.0\") \\\n",
114 | " .config(\"spark.sql.extensions\", \"io.delta.sql.DeltaSparkSessionExtension\") \\\n",
115 | " .config(\"spark.sql.catalog.spark_catalog\", \"org.apache.spark.sql.delta.catalog.DeltaCatalog\") \\\n",
116 | " .getOrCreate()\n",
117 | "\n",
118 | "# Read the wine CSV, treating the first row as the header and inferring column types\n",
119 | "df = spark.read.format('csv').\\\n",
120 | " options(header = 'true', inferschema = 'true').\\\n",
121 | " load(\"/content/WineData.csv\")\n",
122 | "\n",
123 | "df.printSchema()\n",
124 | "df.show(5)"
125 | ],
126 | "metadata": {
127 | "colab": {
128 | "base_uri": "https://localhost:8080/"
129 | },
130 | "id": "2drqxnp7fdpf",
131 | "outputId": "18b6aa25-3446-422b-d62c-ae5f13fb50fa"
132 | },
133 | "execution_count": 1,
134 | "outputs": [
135 | {
136 | "output_type": "stream",
137 | "name": "stdout",
138 | "text": [
139 | "root\n",
140 | " |-- fixed acidity: double (nullable = true)\n",
141 | " |-- volatile acidity: double (nullable = true)\n",
142 | " |-- citric acid: double (nullable = true)\n",
143 | " |-- residual sugar: double (nullable = true)\n",
144 | " |-- chlorides: double (nullable = true)\n",
145 | " |-- free sulfur dioxide: double (nullable = true)\n",
146 | " |-- total sulfur dioxide: double (nullable = true)\n",
147 | " |-- density: double (nullable = true)\n",
148 | " |-- pH: double (nullable = true)\n",
149 | " |-- sulphates: double (nullable = true)\n",
150 | " |-- alcohol: double (nullable = true)\n",
151 | " |-- quality: integer (nullable = true)\n",
152 | "\n",
153 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n",
154 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n",
155 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n",
156 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n",
157 | "| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| 5|\n",
158 | "| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| 5|\n",
159 | "| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| 6|\n",
160 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| 5|\n",
161 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n",
162 | "only showing top 5 rows\n",
163 | "\n"
164 | ]
165 | }
166 | ]
167 | },
168 | {
169 | "cell_type": "code",
170 | "source": [
171 | "def condition(r):\n",
172 | "\n",
173 | " if (0 <= r <= 4):\n",
174 | " label = 'low'\n",
175 | "\n",
176 | " elif (4 < r <= 6):\n",
177 | " label = 'medium'\n",
178 | "\n",
179 | " else:\n",
180 | " label = 'high'\n",
181 | "\n",
182 | " return label\n",
183 | "\n",
184 | "def string_to_float(x):\n",
185 | " return float(x)"
186 | ],
187 | "metadata": {
188 | "id": "lrxWEoqAik9U"
189 | },
190 | "execution_count": 2,
191 | "outputs": []
192 | },
193 | {
194 | "cell_type": "code",
195 | "source": [
196 | "from pyspark.sql.functions import udf\n",
197 | "from pyspark.sql.types import StringType, DoubleType\n",
198 | "string_to_float_udf = udf(string_to_float, DoubleType())\n",
199 | "quality_udf = udf(lambda x : condition(x), StringType())\n",
200 | "\n",
201 | "df = df.withColumn(\"quality\", quality_udf(\"quality\"))\n",
202 | "df.show(5)\n",
203 | "df.printSchema()"
204 | ],
205 | "metadata": {
206 | "colab": {
207 | "base_uri": "https://localhost:8080/"
208 | },
209 | "id": "GBMr3ZBripCP",
210 | "outputId": "3a6fbc8b-20ca-40a3-d3fa-7f263af8ff8e"
211 | },
212 | "execution_count": 3,
213 | "outputs": [
214 | {
215 | "output_type": "stream",
216 | "name": "stdout",
217 | "text": [
218 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n",
219 | "|fixed acidity|volatile acidity|citric acid|residual sugar|chlorides|free sulfur dioxide|total sulfur dioxide|density| pH|sulphates|alcohol|quality|\n",
220 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n",
221 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n",
222 | "| 7.8| 0.88| 0.0| 2.6| 0.098| 25.0| 67.0| 0.9968| 3.2| 0.68| 9.8| medium|\n",
223 | "| 7.8| 0.76| 0.04| 2.3| 0.092| 15.0| 54.0| 0.997|3.26| 0.65| 9.8| medium|\n",
224 | "| 11.2| 0.28| 0.56| 1.9| 0.075| 17.0| 60.0| 0.998|3.16| 0.58| 9.8| medium|\n",
225 | "| 7.4| 0.7| 0.0| 1.9| 0.076| 11.0| 34.0| 0.9978|3.51| 0.56| 9.4| medium|\n",
226 | "+-------------+----------------+-----------+--------------+---------+-------------------+--------------------+-------+----+---------+-------+-------+\n",
227 | "only showing top 5 rows\n",
228 | "\n",
229 | "root\n",
230 | " |-- fixed acidity: double (nullable = true)\n",
231 | " |-- volatile acidity: double (nullable = true)\n",
232 | " |-- citric acid: double (nullable = true)\n",
233 | " |-- residual sugar: double (nullable = true)\n",
234 | " |-- chlorides: double (nullable = true)\n",
235 | " |-- free sulfur dioxide: double (nullable = true)\n",
236 | " |-- total sulfur dioxide: double (nullable = true)\n",
237 | " |-- density: double (nullable = true)\n",
238 | " |-- pH: double (nullable = true)\n",
239 | " |-- sulphates: double (nullable = true)\n",
240 | " |-- alcohol: double (nullable = true)\n",
241 | " |-- quality: string (nullable = true)\n",
242 | "\n"
243 | ]
244 | }
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "source": [
250 | "from pyspark.ml.linalg import Vectors\n",
251 | "from pyspark.ml import Pipeline\n",
252 | "from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString\n",
253 | "from pyspark.ml.tuning import CrossValidator, ParamGridBuilder\n",
254 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator"
255 | ],
256 | "metadata": {
257 | "id": "pAOSSQCRivCa"
258 | },
259 | "execution_count": 4,
260 | "outputs": []
261 | },
262 | {
263 | "cell_type": "code",
264 | "source": [
265 | "def transData(data):\n",
266 | " return data.rdd.map(lambda r : [Vectors.dense(r[:-1]), r[-1]]).toDF(['features', 'label'])\n",
267 | "\n",
268 | "transformed = transData(df)\n",
269 | "transformed.show(5)"
270 | ],
271 | "metadata": {
272 | "colab": {
273 | "base_uri": "https://localhost:8080/"
274 | },
275 | "id": "qv8tliDciyWn",
276 | "outputId": "77bc7def-09ce-479b-e50d-07861d5c87b3"
277 | },
278 | "execution_count": 5,
279 | "outputs": [
280 | {
281 | "output_type": "stream",
282 | "name": "stdout",
283 | "text": [
284 | "+--------------------+------+\n",
285 | "| features| label|\n",
286 | "+--------------------+------+\n",
287 | "|[7.4,0.7,0.0,1.9,...|medium|\n",
288 | "|[7.8,0.88,0.0,2.6...|medium|\n",
289 | "|[7.8,0.76,0.04,2....|medium|\n",
290 | "|[11.2,0.28,0.56,1...|medium|\n",
291 | "|[7.4,0.7,0.0,1.9,...|medium|\n",
292 | "+--------------------+------+\n",
293 | "only showing top 5 rows\n",
294 | "\n"
295 | ]
296 | }
297 | ]
298 | },
299 | {
300 | "cell_type": "code",
301 | "source": [
302 | "from pyspark.ml.feature import VectorIndexer, StringIndexer, IndexToString\n",
303 | "labelIndexer = StringIndexer(inputCol = 'label', outputCol = 'indexedLabel').fit(transformed)\n",
304 | "labelIndexer.transform(transformed).show(5, True)\n",
305 | "\n",
306 | "featureIndexer = VectorIndexer(inputCol = 'features', outputCol = 'indexedFeatures', maxCategories = 4).fit(transformed)\n",
307 | "featureIndexer.transform(transformed).show(5)"
308 | ],
309 | "metadata": {
310 | "colab": {
311 | "base_uri": "https://localhost:8080/"
312 | },
313 | "id": "nB6Kjtawi16g",
314 | "outputId": "492b6d4c-3a53-4eff-b493-bf5341cbdb5b"
315 | },
316 | "execution_count": 6,
317 | "outputs": [
318 | {
319 | "output_type": "stream",
320 | "name": "stdout",
321 | "text": [
322 | "+--------------------+------+------------+\n",
323 | "| features| label|indexedLabel|\n",
324 | "+--------------------+------+------------+\n",
325 | "|[7.4,0.7,0.0,1.9,...|medium| 0.0|\n",
326 | "|[7.8,0.88,0.0,2.6...|medium| 0.0|\n",
327 | "|[7.8,0.76,0.04,2....|medium| 0.0|\n",
328 | "|[11.2,0.28,0.56,1...|medium| 0.0|\n",
329 | "|[7.4,0.7,0.0,1.9,...|medium| 0.0|\n",
330 | "+--------------------+------+------------+\n",
331 | "only showing top 5 rows\n",
332 | "\n",
333 | "+--------------------+------+--------------------+\n",
334 | "| features| label| indexedFeatures|\n",
335 | "+--------------------+------+--------------------+\n",
336 | "|[7.4,0.7,0.0,1.9,...|medium|[7.4,0.7,0.0,1.9,...|\n",
337 | "|[7.8,0.88,0.0,2.6...|medium|[7.8,0.88,0.0,2.6...|\n",
338 | "|[7.8,0.76,0.04,2....|medium|[7.8,0.76,0.04,2....|\n",
339 | "|[11.2,0.28,0.56,1...|medium|[11.2,0.28,0.56,1...|\n",
340 | "|[7.4,0.7,0.0,1.9,...|medium|[7.4,0.7,0.0,1.9,...|\n",
341 | "+--------------------+------+--------------------+\n",
342 | "only showing top 5 rows\n",
343 | "\n"
344 | ]
345 | }
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "source": [
351 | "(trainingData, testData) = transformed.randomSplit([0.6, 0.4])\n",
352 | "trainingData.show(5)\n",
353 | "testData.show(5)"
354 | ],
355 | "metadata": {
356 | "colab": {
357 | "base_uri": "https://localhost:8080/"
358 | },
359 | "id": "goV-vM1Ci_l3",
360 | "outputId": "4687e794-cf73-472d-8126-4ef3b769b203"
361 | },
362 | "execution_count": 7,
363 | "outputs": [
364 | {
365 | "output_type": "stream",
366 | "name": "stdout",
367 | "text": [
368 | "+--------------------+------+\n",
369 | "| features| label|\n",
370 | "+--------------------+------+\n",
371 | "|[4.6,0.52,0.15,2....| low|\n",
372 | "|[4.7,0.6,0.17,2.3...|medium|\n",
373 | "|[4.9,0.42,0.0,2.1...| high|\n",
374 | "|[5.0,0.38,0.01,1....|medium|\n",
375 | "|[5.0,0.4,0.5,4.3,...|medium|\n",
376 | "+--------------------+------+\n",
377 | "only showing top 5 rows\n",
378 | "\n",
379 | "+--------------------+------+\n",
380 | "| features| label|\n",
381 | "+--------------------+------+\n",
382 | "|[5.0,0.42,0.24,2....| high|\n",
383 | "|[5.0,1.04,0.24,1....|medium|\n",
384 | "|[5.1,0.42,0.0,1.8...| high|\n",
385 | "|[5.1,0.47,0.02,1....|medium|\n",
386 | "|[5.1,0.51,0.18,2....| high|\n",
387 | "+--------------------+------+\n",
388 | "only showing top 5 rows\n",
389 | "\n"
390 | ]
391 | }
392 | ]
393 | },
394 | {
395 | "cell_type": "code",
396 | "source": [
397 | "from pyspark.ml.classification import DecisionTreeClassifier\n",
398 | "\n",
399 | "dTree = DecisionTreeClassifier(labelCol = 'indexedLabel', featuresCol = 'indexedFeatures')"
400 | ],
401 | "metadata": {
402 | "id": "U6haQ1LAjEko"
403 | },
404 | "execution_count": 8,
405 | "outputs": []
406 | },
407 | {
408 | "cell_type": "code",
409 | "source": [
410 | "# Pipeline: index labels -> index features -> decision tree -> convert predictions back to label strings\n",
411 | "\n",
412 | "labelConverter = IndexToString(inputCol = \"prediction\", outputCol = 'predictedLabel', labels = labelIndexer.labels)\n",
413 | "\n",
414 | "pipeline = Pipeline(stages = [labelIndexer, featureIndexer, dTree, labelConverter])\n",
415 | "\n",
416 | "model = pipeline.fit(trainingData)\n",
417 | "\n",
418 | "predictions = model.transform(testData)\n",
419 | "\n",
420 | "predictions.select('features', 'label', \"predictedLabel\").show(5)"
421 | ],
422 | "metadata": {
423 | "colab": {
424 | "base_uri": "https://localhost:8080/"
425 | },
426 | "id": "eObhaarujHf1",
427 | "outputId": "95a07ba5-7c0f-44ac-cfc5-964829e995f6"
428 | },
429 | "execution_count": 9,
430 | "outputs": [
431 | {
432 | "output_type": "stream",
433 | "name": "stdout",
434 | "text": [
435 | "+--------------------+------+--------------+\n",
436 | "| features| label|predictedLabel|\n",
437 | "+--------------------+------+--------------+\n",
438 | "|[5.0,0.42,0.24,2....| high| medium|\n",
439 | "|[5.0,1.04,0.24,1....|medium| medium|\n",
440 | "|[5.1,0.42,0.0,1.8...| high| high|\n",
441 | "|[5.1,0.47,0.02,1....|medium| medium|\n",
442 | "|[5.1,0.51,0.18,2....| high| high|\n",
443 | "+--------------------+------+--------------+\n",
444 | "only showing top 5 rows\n",
445 | "\n"
446 | ]
447 | }
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "source": [
453 | "# Evaluation: accuracy of the fitted pipeline on the held-out test split\n",
454 | "\n",
455 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
456 | "\n",
457 | "evaluator = MulticlassClassificationEvaluator(labelCol = 'indexedLabel', predictionCol = 'prediction', metricName = 'accuracy')\n",
458 | "accuracy = evaluator.evaluate(predictions)\n",
459 | "print(\"Test Error = %g\" % (1.0 - accuracy))\n",
460 | "\n",
461 | "rfModel = model.stages[-2]\n",
462 | "print(rfModel)"
463 | ],
464 | "metadata": {
465 | "colab": {
466 | "base_uri": "https://localhost:8080/"
467 | },
468 | "id": "BYLCy5FtjN9-",
469 | "outputId": "0181a515-4268-4531-805a-926afe0ddb92"
470 | },
471 | "execution_count": 10,
472 | "outputs": [
473 | {
474 | "output_type": "stream",
475 | "name": "stdout",
476 | "text": [
477 | "Test Error = 0.143345\n",
478 | "DecisionTreeClassificationModel: uid=DecisionTreeClassifier_5939946d6e55, depth=5, numNodes=49, numClasses=3, numFeatures=11\n"
479 | ]
480 | }
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "source": [
486 | "from sklearn.metrics import confusion_matrix\n",
487 | "\n",
488 | "y_true = predictions.select(\"label\")\n",
489 | "y_true = y_true.toPandas()\n",
490 | "\n",
491 | "y_pred = predictions.select(\"predictedLabel\")\n",
492 | "y_pred = y_pred.toPandas()\n",
493 | "\n",
494 | "cnf_matrix = confusion_matrix(y_true, y_pred)\n",
495 | "cnf_matrix"
496 | ],
497 | "metadata": {
498 | "colab": {
499 | "base_uri": "https://localhost:8080/"
500 | },
501 | "id": "yEc5idwAjSu_",
502 | "outputId": "2ce298f0-0994-4515-9733-17365362ef8e"
503 | },
504 | "execution_count": 11,
505 | "outputs": [
506 | {
507 | "output_type": "execute_result",
508 | "data": {
509 | "text/plain": [
510 | "array([[ 34, 0, 35],\n",
511 | " [ 1, 0, 22],\n",
512 | " [ 26, 0, 468]])"
513 | ]
514 | },
515 | "metadata": {},
516 | "execution_count": 11
517 | }
518 | ]
519 | }
520 | ]
521 | }
--------------------------------------------------------------------------------
/IBBLojistikWordCount.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "nbformat": 4,
3 | "nbformat_minor": 0,
4 | "metadata": {
5 | "colab": {
6 | "provenance": [],
7 | "authorship_tag": "ABX9TyOHR4+l0bFxeRYwqRPmcehg",
8 | "include_colab_link": true
9 | },
10 | "kernelspec": {
11 | "name": "python3",
12 | "display_name": "Python 3"
13 | },
14 | "language_info": {
15 | "name": "python"
16 | }
17 | },
18 | "cells": [
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {
22 | "id": "view-in-github",
23 | "colab_type": "text"
24 | },
25 | "source": [
26 | "
"
27 | ]
28 | },
29 | {
30 | "cell_type": "code",
31 | "execution_count": 2,
32 | "metadata": {
33 | "colab": {
34 | "base_uri": "https://localhost:8080/"
35 | },
36 | "id": "xldbqsBeP6qM",
37 | "outputId": "8bcc146c-ea14-44f3-82eb-16fe559cd26f"
38 | },
39 | "outputs": [
40 | {
41 | "output_type": "stream",
42 | "name": "stdout",
43 | "text": [
44 | "Requirement already satisfied: findspark in /usr/local/lib/python3.11/dist-packages (2.0.1)\n"
45 | ]
46 | }
47 | ],
48 | "source": [
49 | "!pip install findspark"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "source": [
55 | "import findspark\n",
56 | "findspark.init()"
57 | ],
58 | "metadata": {
59 | "id": "-JmlMpBkQCzl"
60 | },
61 | "execution_count": 3,
62 | "outputs": []
63 | },
64 | {
65 | "cell_type": "code",
66 | "source": [
67 | "from pyspark.sql import SparkSession\n",
68 | "from pyspark.conf import SparkConf"
69 | ],
70 | "metadata": {
71 | "id": "iEhUEVSJQEM9"
72 | },
73 | "execution_count": 4,
74 | "outputs": []
75 | },
76 | {
77 | "cell_type": "code",
78 | "source": [
79 | "spark = SparkSession.builder \\\n",
80 | ".master(\"local[4]\") \\\n",
81 | ".appName(\"giveatry\") \\\n",
82 | ".getOrCreate()"
83 | ],
84 | "metadata": {
85 | "id": "l50Sk6ecQGhB"
86 | },
87 | "execution_count": 5,
88 | "outputs": []
89 | },
90 | {
91 | "cell_type": "code",
92 | "source": [
93 | "sc = spark.sparkContext"
94 | ],
95 | "metadata": {
96 | "id": "j8OOyvOJQU-c"
97 | },
98 | "execution_count": 6,
99 | "outputs": []
100 | },
101 | {
102 | "cell_type": "code",
103 | "source": [
104 | "veri_seti = \"/content/ibb_lojistik2.txt\""
105 | ],
106 | "metadata": {
107 | "id": "dahhlWG5QWBr"
108 | },
109 | "execution_count": 7,
110 | "outputs": []
111 | },
112 | {
113 | "cell_type": "code",
114 | "source": [
115 | "istac_rdd = sc.textFile(veri_seti)"
116 | ],
117 | "metadata": {
118 | "id": "okuhb24OQtPO"
119 | },
120 | "execution_count": 8,
121 | "outputs": []
122 | },
123 | {
124 | "cell_type": "code",
125 | "source": [
126 | "istac_rdd.count()"
127 | ],
128 | "metadata": {
129 | "colab": {
130 | "base_uri": "https://localhost:8080/"
131 | },
132 | "id": "iRBbC6uXQv0V",
133 | "outputId": "f384fb2b-1cc1-4301-8797-181f58d419a0"
134 | },
135 | "execution_count": 10,
136 | "outputs": [
137 | {
138 | "output_type": "execute_result",
139 | "data": {
140 | "text/plain": [
141 | "4"
142 | ]
143 | },
144 | "metadata": {},
145 | "execution_count": 10
146 | }
147 | ]
148 | },
149 | {
150 | "cell_type": "code",
151 | "source": [
152 | "istac_rdd.take(5)"
153 | ],
154 | "metadata": {
155 | "colab": {
156 | "base_uri": "https://localhost:8080/"
157 | },
158 | "id": "iZEvmgOaQ0TB",
159 | "outputId": "41fbcf06-9bd5-41ce-ef3c-4201da8d0aa9"
160 | },
161 | "execution_count": 11,
162 | "outputs": [
163 | {
164 | "output_type": "execute_result",
165 | "data": {
166 | "text/plain": [
167 | "[\"Günümüz yaşam koşullarının etkisiyle insan ve yük hareketliliği hızla artmaktadır. Kentsel yaşamdaki günlük ihtiyaçların karşılanması bakımından zorunlu hale gelen yük taşımacılığı, insan yaşamı üzerinde çeşitli sosyal ve çevresel etkilere neden olmaktadır. Özellikle büyük şehirlerde hızlı nüfus artışıyla ortaya çıkan ihtiyaçların, kısa sürelerde giderilmesi çabası ulaştırma hizmetlerini zorlaştırmaktadır. Karayolu yolcu ve yük taşımacılığında, son otuz yılda diğer türler ile yapılan taşımacılıklara kıyasla hızlı bir gelişme olmuş, bu dengesiz gelişmenin sonucu olarak karayolu taşımasının payı her iki taşıma için hızla artarak %90'ın üzerine çıkmıştır. Özellikle İstanbul, sahip olduğu nüfus ve ekonomik potansiyel nedeniyle bu hızlı gelişimden en çok etkilenen illerden biridir. Bu gerekçeyle; şehirdeki yük hareketlerinin oluşturduğu trafik sıkışıklıkları, erişim kısıtlamaları, park sorunları, emisyonlar gibi ekonomik, sosyal ve çevresel etkiler göz önüne alınarak bir kentsel lojistik planlama yapılması ihtiyacı doğmuştur. \",\n",
168 | " 'Lojistik alanında yapılan yatırımlardan en fazla yararın elde edilmesi. Ancak hedef ve amaçların iyi belirlenmesi, Yapılan yatırımlar ile elde edilecek yararların maliyetlerle dengesinin sağlanması, Yatırımları yapacak olan kuruluşların yatırım güçlerinin göz önünde tutulması, Lojistik unsurlar içindeki entegrasyonun sağlanması, Lojistik konusunda geleceğe yönelik taleplerin doğru kestirilebilmesine bağlıdır. Diğer yanda limanlar, demiryolları, karayolları ve bunların tesisleri ile depolama alanları kent coğrafyası içinde yer bulmakta, kent içinde birçok kentsel kullanım ile birlikte yer almaktadır. Diğer kentsel kullanımlarla birlikte kent içinde yer alan lojistik yatırımların diğer kentsel kullanımlara ve kentsel yaşama olumsuz etkilerinin bertaraf edilmesi kapsamlı bir mekânsal planlama ile sağlanabilir. Bu amacın gerçekleşmesi ise ancak kapsamlı bir “Lojistik Ana Planı” ile mümkün görünmektedir. Böyle bir Planın gerektirdiği araştırma, analiz, eylem planı ve değerlendirme raporuyla bu çalışmanın altyapısının oluşturulması hedeflenmektedir. ',\n",
169 | " 'İstanbul Lojistik Ana Planı İşi Kapsamında Araç Sayımına Yönelik Saha Çalışması Yapılarak Veri Alımı Hizmeti İşinin amacı, İstanbul’un lojistik hareketliliğine dair elde edilecek verilerle İSTLAP’a bilimsel altlık oluşturmaktır. Çalışmada Teknik Şartname ile lokasyonları belirlenen lojistik odaklarda, perde-kordonlarda trafik sayımı ve anketler gerçekleştirilmiştir. ',\n",
170 | " 'İstanbul Lojistik Ana Planı İşi Kapsamında Araç Sayımına Yönelik Saha Çalışması Yapılarak Veri Alımı Hizmeti İşinin amacı, İstanbul’un lojistik hareketliliğine dair elde edilecek verilerle İSTLAP’a bilimsel altlık oluşturmaktır. Çalışmada Teknik Şartname ile lokasyonları belirlenen lojistik odaklarda, perde-kordonlarda trafik sayımı ve anketler gerçekleştirilmiştir. 1 Bu şekilde İstanbul’un lojistik altyapısının kente ve kentlilere olan sosyal, ekonomik, politik ve teknolojik etkileri belirlenerek tespit edilen sorunların çözümüne yönelik strateji ve öneriler geliştirilecektir. Kısa, orta ve uzun dönemli oluşturulacak eylem planlarıyla bir yandan sağlıklı mekânsal gelişim sağlanırken diğer yandan İstanbul Ulaşım Ana Planı’na entegre bir gelişim sağlanacaktır. Çalışmada hızlı ve doğru bir veri kümesine ulaşmak için hazırlık aşamasından başlayarak veri teslimine kadar olan süreçler önceden belirlenmiş metodoloji ve iş planına uygun şekilde gerçekleştirilmiştir. Çalışma kapsamında, İstanbul’da yük taşıyan araç trafiği düzeyinin belirlenerek lojistik atama modelinde kullanılmak üzere, İstanbul, Gebze ve Dilovası’nda yer alan belirli lojistik odak ve terminallerin yoğunlaştığı bölgelerin giriş- çıkışlarında, İstanbul trafiğini temsil eden perdekordon hatlarında gerçekleştirilmiştir THIS IS THE NEW CONTENT APPENDED IN THE FILE']"
171 | ]
172 | },
173 | "metadata": {},
174 | "execution_count": 11
175 | }
176 | ]
177 | },
178 | {
179 | "cell_type": "code",
180 | "source": [
181 | "istac_rdd_kelimeler = istac_rdd.flatMap(lambda satir : satir.split(\" \"))"
182 | ],
183 | "metadata": {
184 | "id": "baMJl3qoQ-3l"
185 | },
186 | "execution_count": 12,
187 | "outputs": []
188 | },
189 | {
190 | "cell_type": "code",
191 | "source": [
192 | "istac_rdd_kelimeler.take(20)"
193 | ],
194 | "metadata": {
195 | "colab": {
196 | "base_uri": "https://localhost:8080/"
197 | },
198 | "id": "za-RrUr2RBCI",
199 | "outputId": "4ccc0bab-b427-4e32-833b-b41a1dac4b4b"
200 | },
201 | "execution_count": 14,
202 | "outputs": [
203 | {
204 | "output_type": "execute_result",
205 | "data": {
206 | "text/plain": [
207 | "['Günümüz',\n",
208 | " 'yaşam',\n",
209 | " 'koşullarının',\n",
210 | " 'etkisiyle',\n",
211 | " 'insan',\n",
212 | " 've',\n",
213 | " 'yük',\n",
214 | " 'hareketliliği',\n",
215 | " 'hızla',\n",
216 | " 'artmaktadır.',\n",
217 | " 'Kentsel',\n",
218 | " 'yaşamdaki',\n",
219 | " 'günlük',\n",
220 | " 'ihtiyaçların',\n",
221 | " 'karşılanması',\n",
222 | " 'bakımından',\n",
223 | " 'zorunlu',\n",
224 | " 'hale',\n",
225 | " 'gelen',\n",
226 | " 'yük']"
227 | ]
228 | },
229 | "metadata": {},
230 | "execution_count": 14
231 | }
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "source": [
237 | "istac_rdd_kelimeler_sayilari = istac_rdd_kelimeler.map(lambda kelime : (kelime, 1))"
238 | ],
239 | "metadata": {
240 | "id": "2sNxw8-xRCn0"
241 | },
242 | "execution_count": 15,
243 | "outputs": []
244 | },
245 | {
246 | "cell_type": "code",
247 | "source": [
248 | "istac_rdd_kelimeler_sayilari.take(20)"
249 | ],
250 | "metadata": {
251 | "colab": {
252 | "base_uri": "https://localhost:8080/"
253 | },
254 | "id": "iH3fR2FuRFIe",
255 | "outputId": "0dd3b278-5bfe-4c75-ddc6-d29bdd9081ad"
256 | },
257 | "execution_count": 16,
258 | "outputs": [
259 | {
260 | "output_type": "execute_result",
261 | "data": {
262 | "text/plain": [
263 | "[('Günümüz', 1),\n",
264 | " ('yaşam', 1),\n",
265 | " ('koşullarının', 1),\n",
266 | " ('etkisiyle', 1),\n",
267 | " ('insan', 1),\n",
268 | " ('ve', 1),\n",
269 | " ('yük', 1),\n",
270 | " ('hareketliliği', 1),\n",
271 | " ('hızla', 1),\n",
272 | " ('artmaktadır.', 1),\n",
273 | " ('Kentsel', 1),\n",
274 | " ('yaşamdaki', 1),\n",
275 | " ('günlük', 1),\n",
276 | " ('ihtiyaçların', 1),\n",
277 | " ('karşılanması', 1),\n",
278 | " ('bakımından', 1),\n",
279 | " ('zorunlu', 1),\n",
280 | " ('hale', 1),\n",
281 | " ('gelen', 1),\n",
282 | " ('yük', 1)]"
283 | ]
284 | },
285 | "metadata": {},
286 | "execution_count": 16
287 | }
288 | ]
289 | },
290 | {
291 | "cell_type": "code",
292 | "source": [
293 | "istac_rdd_kelimeler_sayilari_reduce = istac_rdd_kelimeler_sayilari.reduceByKey(lambda x, y : (x + y))"
294 | ],
295 | "metadata": {
296 | "id": "imB4GRsGRH5p"
297 | },
298 | "execution_count": 17,
299 | "outputs": []
300 | },
301 | {
302 | "cell_type": "code",
303 | "source": [
304 | "istac_rdd_kelimeler_sayilari_reduce.take(20)"
305 | ],
306 | "metadata": {
307 | "colab": {
308 | "base_uri": "https://localhost:8080/"
309 | },
310 | "id": "lP_Xc5AiRKee",
311 | "outputId": "227ffec0-16db-4f70-ca29-7602d1839c60"
312 | },
313 | "execution_count": 19,
314 | "outputs": [
315 | {
316 | "output_type": "execute_result",
317 | "data": {
318 | "text/plain": [
319 | "[('Günümüz', 1),\n",
320 | " ('yaşam', 1),\n",
321 | " ('koşullarının', 1),\n",
322 | " ('etkisiyle', 1),\n",
323 | " ('artmaktadır.', 1),\n",
324 | " ('yaşamdaki', 1),\n",
325 | " ('günlük', 1),\n",
326 | " ('ihtiyaçların', 1),\n",
327 | " ('zorunlu', 1),\n",
328 | " ('gelen', 1),\n",
329 | " ('yaşamı', 1),\n",
330 | " ('sosyal', 2),\n",
331 | " ('çevresel', 2),\n",
332 | " ('nüfus', 2),\n",
333 | " ('ortaya', 1),\n",
334 | " ('çıkan', 1),\n",
335 | " ('ihtiyaçların,', 1),\n",
336 | " ('sürelerde', 1),\n",
337 | " ('giderilmesi', 1),\n",
338 | " ('çabası', 1)]"
339 | ]
340 | },
341 | "metadata": {},
342 | "execution_count": 19
343 | }
344 | ]
345 | },
346 | {
347 | "cell_type": "code",
348 | "source": [
349 | "istac_rdd_kelimeler_sayilari_reduce_sort = istac_rdd_kelimeler_sayilari_reduce.map(lambda x: (x[1], x[0]))"
350 | ],
351 | "metadata": {
352 | "id": "uTlupiemRNtr"
353 | },
354 | "execution_count": 20,
355 | "outputs": []
356 | },
357 | {
358 | "cell_type": "code",
359 | "source": [
360 | "istac_rdd_kelimeler_sayilari_reduce_sort.take(20)"
361 | ],
362 | "metadata": {
363 | "colab": {
364 | "base_uri": "https://localhost:8080/"
365 | },
366 | "id": "5-vG9o5_RRNk",
367 | "outputId": "9597e334-22f7-4178-d53e-c0f7380b1a98"
368 | },
369 | "execution_count": 21,
370 | "outputs": [
371 | {
372 | "output_type": "execute_result",
373 | "data": {
374 | "text/plain": [
375 | "[(1, 'Günümüz'),\n",
376 | " (1, 'yaşam'),\n",
377 | " (1, 'koşullarının'),\n",
378 | " (1, 'etkisiyle'),\n",
379 | " (1, 'artmaktadır.'),\n",
380 | " (1, 'yaşamdaki'),\n",
381 | " (1, 'günlük'),\n",
382 | " (1, 'ihtiyaçların'),\n",
383 | " (1, 'zorunlu'),\n",
384 | " (1, 'gelen'),\n",
385 | " (1, 'yaşamı'),\n",
386 | " (2, 'sosyal'),\n",
387 | " (2, 'çevresel'),\n",
388 | " (2, 'nüfus'),\n",
389 | " (1, 'ortaya'),\n",
390 | " (1, 'çıkan'),\n",
391 | " (1, 'ihtiyaçların,'),\n",
392 | " (1, 'sürelerde'),\n",
393 | " (1, 'giderilmesi'),\n",
394 | " (1, 'çabası')]"
395 | ]
396 | },
397 | "metadata": {},
398 | "execution_count": 21
399 | }
400 | ]
401 | },
402 | {
403 | "cell_type": "code",
404 | "source": [
405 | "istac_rdd_kelimeler_sayilari_reduce_sort.sortByKey(False).take(20)"
406 | ],
407 | "metadata": {
408 | "colab": {
409 | "base_uri": "https://localhost:8080/"
410 | },
411 | "id": "Ixs3sovURVXO",
412 | "outputId": "def8d340-e79f-4774-b14f-c91ad79bd345"
413 | },
414 | "execution_count": 22,
415 | "outputs": [
416 | {
417 | "output_type": "execute_result",
418 | "data": {
419 | "text/plain": [
420 | "[(19, 've'),\n",
421 | " (9, 'lojistik'),\n",
422 | " (8, 'ile'),\n",
423 | " (8, ''),\n",
424 | " (8, 'bir'),\n",
425 | " (5, 'Lojistik'),\n",
426 | " (5, 'yük'),\n",
427 | " (5, 'kentsel'),\n",
428 | " (4, 'elde'),\n",
429 | " (4, 'İstanbul'),\n",
430 | " (4, 'hızlı'),\n",
431 | " (4, 'yer'),\n",
432 | " (4, 'Ana'),\n",
433 | " (3, 'diğer'),\n",
434 | " (3, 'içinde'),\n",
435 | " (3, 'İstanbul’un'),\n",
436 | " (3, 'gerçekleştirilmiştir.'),\n",
437 | " (3, 'bu'),\n",
438 | " (3, 'Bu'),\n",
439 | " (3, 'trafik')]"
440 | ]
441 | },
442 | "metadata": {},
443 | "execution_count": 22
444 | }
445 | ]
446 | }
447 | ]
448 | }
--------------------------------------------------------------------------------
/loan_sanction_test.csv:
--------------------------------------------------------------------------------
1 | Loan_ID,Gender,Married,Dependents,Education,Self_Employed,ApplicantIncome,CoapplicantIncome,LoanAmount,Loan_Amount_Term,Credit_History,Property_Area
2 | LP001015,Male,Yes,0,Graduate,No,5720,0,110,360,1,Urban
3 | LP001022,Male,Yes,1,Graduate,No,3076,1500,126,360,1,Urban
4 | LP001031,Male,Yes,2,Graduate,No,5000,1800,208,360,1,Urban
5 | LP001035,Male,Yes,2,Graduate,No,2340,2546,100,360,,Urban
6 | LP001051,Male,No,0,Not Graduate,No,3276,0,78,360,1,Urban
7 | LP001054,Male,Yes,0,Not Graduate,Yes,2165,3422,152,360,1,Urban
8 | LP001055,Female,No,1,Not Graduate,No,2226,0,59,360,1,Semiurban
9 | LP001056,Male,Yes,2,Not Graduate,No,3881,0,147,360,0,Rural
10 | LP001059,Male,Yes,2,Graduate,,13633,0,280,240,1,Urban
11 | LP001067,Male,No,0,Not Graduate,No,2400,2400,123,360,1,Semiurban
12 | LP001078,Male,No,0,Not Graduate,No,3091,0,90,360,1,Urban
13 | LP001082,Male,Yes,1,Graduate,,2185,1516,162,360,1,Semiurban
14 | LP001083,Male,No,3+,Graduate,No,4166,0,40,180,,Urban
15 | LP001094,Male,Yes,2,Graduate,,12173,0,166,360,0,Semiurban
16 | LP001096,Female,No,0,Graduate,No,4666,0,124,360,1,Semiurban
17 | LP001099,Male,No,1,Graduate,No,5667,0,131,360,1,Urban
18 | LP001105,Male,Yes,2,Graduate,No,4583,2916,200,360,1,Urban
19 | LP001107,Male,Yes,3+,Graduate,No,3786,333,126,360,1,Semiurban
20 | LP001108,Male,Yes,0,Graduate,No,9226,7916,300,360,1,Urban
21 | LP001115,Male,No,0,Graduate,No,1300,3470,100,180,1,Semiurban
22 | LP001121,Male,Yes,1,Not Graduate,No,1888,1620,48,360,1,Urban
23 | LP001124,Female,No,3+,Not Graduate,No,2083,0,28,180,1,Urban
24 | LP001128,,No,0,Graduate,No,3909,0,101,360,1,Urban
25 | LP001135,Female,No,0,Not Graduate,No,3765,0,125,360,1,Urban
26 | LP001149,Male,Yes,0,Graduate,No,5400,4380,290,360,1,Urban
27 | LP001153,Male,No,0,Graduate,No,0,24000,148,360,0,Rural
28 | LP001163,Male,Yes,2,Graduate,No,4363,1250,140,360,,Urban
29 | LP001169,Male,Yes,0,Graduate,No,7500,3750,275,360,1,Urban
30 | LP001174,Male,Yes,0,Graduate,No,3772,833,57,360,,Semiurban
31 | LP001176,Male,No,0,Graduate,No,2942,2382,125,180,1,Urban
32 | LP001177,Female,No,0,Not Graduate,No,2478,0,75,360,1,Semiurban
33 | LP001183,Male,Yes,2,Graduate,No,6250,820,192,360,1,Urban
34 | LP001185,Male,No,0,Graduate,No,3268,1683,152,360,1,Semiurban
35 | LP001187,Male,Yes,0,Graduate,No,2783,2708,158,360,1,Urban
36 | LP001190,Male,Yes,0,Graduate,No,2740,1541,101,360,1,Urban
37 | LP001203,Male,No,0,Graduate,No,3150,0,176,360,0,Semiurban
38 | LP001208,Male,Yes,2,Graduate,,7350,4029,185,180,1,Urban
39 | LP001210,Male,Yes,0,Graduate,Yes,2267,2792,90,360,1,Urban
40 | LP001211,Male,No,0,Graduate,Yes,5833,0,116,360,1,Urban
41 | LP001219,Male,No,0,Graduate,No,3643,1963,138,360,1,Urban
42 | LP001220,Male,Yes,0,Graduate,No,5629,818,100,360,1,Urban
43 | LP001221,Female,No,0,Graduate,No,3644,0,110,360,1,Urban
44 | LP001226,Male,Yes,0,Not Graduate,No,1750,2024,90,360,1,Semiurban
45 | LP001230,Male,No,0,Graduate,No,6500,2600,200,360,1,Semiurban
46 | LP001231,Female,No,0,Graduate,No,3666,0,84,360,1,Urban
47 | LP001232,Male,Yes,0,Graduate,No,4260,3900,185,,,Urban
48 | LP001237,Male,Yes,,Not Graduate,No,4163,1475,162,360,1,Urban
49 | LP001242,Male,No,0,Not Graduate,No,2356,1902,108,360,1,Semiurban
50 | LP001268,Male,No,0,Graduate,No,6792,3338,187,,1,Urban
51 | LP001270,Male,Yes,3+,Not Graduate,Yes,8000,250,187,360,1,Semiurban
52 | LP001284,Male,Yes,1,Graduate,No,2419,1707,124,360,1,Urban
53 | LP001287,,Yes,3+,Not Graduate,No,3500,833,120,360,1,Semiurban
54 | LP001291,Male,Yes,1,Graduate,No,3500,3077,160,360,1,Semiurban
55 | LP001298,Male,Yes,2,Graduate,No,4116,1000,30,180,1,Urban
56 | LP001312,Male,Yes,0,Not Graduate,Yes,5293,0,92,360,1,Urban
57 | LP001313,Male,No,0,Graduate,No,2750,0,130,360,0,Urban
58 | LP001317,Female,No,0,Not Graduate,No,4402,0,130,360,1,Rural
59 | LP001321,Male,Yes,2,Graduate,No,3613,3539,134,180,1,Semiurban
60 | LP001323,Female,Yes,2,Graduate,No,2779,3664,176,360,0,Semiurban
61 | LP001324,Male,Yes,3+,Graduate,No,4720,0,90,180,1,Semiurban
62 | LP001332,Male,Yes,0,Not Graduate,No,2415,1721,110,360,1,Semiurban
63 | LP001335,Male,Yes,0,Graduate,Yes,7016,292,125,360,1,Urban
64 | LP001338,Female,No,2,Graduate,No,4968,0,189,360,1,Semiurban
65 | LP001347,Female,No,0,Graduate,No,2101,1500,108,360,0,Rural
66 | LP001348,Male,Yes,3+,Not Graduate,No,4490,0,125,360,1,Urban
67 | LP001351,Male,Yes,0,Graduate,No,2917,3583,138,360,1,Semiurban
68 | LP001352,Male,Yes,0,Not Graduate,No,4700,0,135,360,0,Semiurban
69 | LP001358,Male,Yes,0,Graduate,No,3445,0,130,360,0,Semiurban
70 | LP001359,Male,Yes,0,Graduate,No,7666,0,187,360,1,Semiurban
71 | LP001361,Male,Yes,0,Graduate,No,2458,5105,188,360,0,Rural
72 | LP001366,Female,No,,Graduate,No,3250,0,95,360,1,Semiurban
73 | LP001368,Male,No,0,Graduate,No,4463,0,65,360,1,Semiurban
74 | LP001375,Male,Yes,1,Graduate,,4083,1775,139,60,1,Urban
75 | LP001380,Male,Yes,0,Graduate,Yes,3900,2094,232,360,1,Rural
76 | LP001386,Male,Yes,0,Not Graduate,No,4750,3583,144,360,1,Semiurban
77 | LP001400,Male,No,0,Graduate,No,3583,3435,155,360,1,Urban
78 | LP001407,Male,Yes,0,Graduate,No,3189,2367,186,360,1,Urban
79 | LP001413,Male,No,0,Graduate,Yes,6356,0,50,360,1,Rural
80 | LP001415,Male,Yes,1,Graduate,No,3413,4053,,360,1,Semiurban
81 | LP001419,Female,Yes,0,Graduate,No,7950,0,185,360,1,Urban
82 | LP001420,Male,Yes,3+,Graduate,No,3829,1103,163,360,0,Urban
83 | LP001428,Male,Yes,3+,Graduate,No,72529,0,360,360,1,Urban
84 | LP001445,Male,Yes,2,Not Graduate,No,4136,0,149,480,0,Rural
85 | LP001446,Male,Yes,0,Graduate,No,8449,0,257,360,1,Rural
86 | LP001450,Male,Yes,0,Graduate,No,4456,0,131,180,0,Semiurban
87 | LP001452,Male,Yes,2,Graduate,No,4635,8000,102,180,1,Rural
88 | LP001455,Male,Yes,0,Graduate,No,3571,1917,135,360,1,Urban
89 | LP001466,Male,No,0,Graduate,No,3066,0,95,360,1,Semiurban
90 | LP001471,Male,No,2,Not Graduate,No,3235,2015,77,360,1,Semiurban
91 | LP001472,Female,No,0,Graduate,,5058,0,200,360,1,Rural
92 | LP001475,Male,Yes,0,Graduate,Yes,3188,2286,130,360,,Rural
93 | LP001483,Male,Yes,3+,Graduate,No,13518,0,390,360,1,Rural
94 | LP001486,Male,Yes,1,Graduate,No,4364,2500,185,360,1,Semiurban
95 | LP001490,Male,Yes,2,Not Graduate,No,4766,1646,100,360,1,Semiurban
96 | LP001496,Male,Yes,1,Graduate,No,4609,2333,123,360,0,Semiurban
97 | LP001499,Female,Yes,3+,Graduate,No,6260,0,110,360,1,Semiurban
98 | LP001500,Male,Yes,1,Graduate,No,3333,4200,256,360,1,Urban
99 | LP001501,Male,Yes,0,Graduate,No,3500,3250,140,360,1,Semiurban
100 | LP001517,Male,Yes,3+,Graduate,No,9719,0,61,360,1,Urban
101 | LP001527,Male,Yes,3+,Graduate,No,6835,0,188,360,,Semiurban
102 | LP001534,Male,No,0,Graduate,No,4452,0,131,360,1,Rural
103 | LP001542,Female,Yes,0,Graduate,No,2262,0,,480,0,Semiurban
104 | LP001547,Male,Yes,1,Graduate,No,3901,0,116,360,1,Urban
105 | LP001548,Male,Yes,2,Not Graduate,No,2687,0,50,180,1,Rural
106 | LP001558,Male,No,0,Graduate,No,2243,2233,107,360,,Semiurban
107 | LP001561,Female,Yes,0,Graduate,No,3417,1287,200,360,1,Semiurban
108 | LP001563,,No,0,Graduate,No,1596,1760,119,360,0,Urban
109 | LP001567,Male,Yes,3+,Graduate,No,4513,0,120,360,1,Rural
110 | LP001568,Male,Yes,0,Graduate,No,4500,0,140,360,1,Semiurban
111 | LP001573,Male,Yes,0,Not Graduate,No,4523,1350,165,360,1,Urban
112 | LP001584,Female,No,0,Graduate,Yes,4742,0,108,360,1,Semiurban
113 | LP001587,Male,Yes,,Graduate,No,4082,0,93,360,1,Semiurban
114 | LP001589,Female,No,0,Graduate,No,3417,0,102,360,1,Urban
115 | LP001591,Female,Yes,2,Graduate,No,2922,3396,122,360,1,Semiurban
116 | LP001599,Male,Yes,0,Graduate,No,4167,4754,160,360,1,Rural
117 | LP001601,Male,No,3+,Graduate,No,4243,4123,157,360,,Semiurban
118 | LP001607,Female,No,0,Not Graduate,No,0,1760,180,360,1,Semiurban
119 | LP001611,Male,Yes,1,Graduate,No,1516,2900,80,,0,Rural
120 | LP001613,Female,No,0,Graduate,No,1762,2666,104,360,0,Urban
121 | LP001622,Male,Yes,2,Graduate,No,724,3510,213,360,0,Rural
122 | LP001627,Male,No,0,Graduate,No,3125,0,65,360,1,Urban
123 | LP001650,Male,Yes,0,Graduate,No,2333,3803,146,360,1,Rural
124 | LP001651,Male,Yes,3+,Graduate,No,3350,1560,135,360,1,Urban
125 | LP001652,Male,No,0,Graduate,No,2500,6414,187,360,0,Rural
126 | LP001655,Female,No,0,Graduate,No,12500,0,300,360,0,Urban
127 | LP001660,Male,No,0,Graduate,No,4667,0,120,360,1,Semiurban
128 | LP001662,Male,No,0,Graduate,No,6500,0,71,360,0,Urban
129 | LP001663,Male,Yes,2,Graduate,No,7500,0,225,360,1,Urban
130 | LP001667,Male,No,0,Graduate,No,3073,0,70,180,1,Urban
131 | LP001695,Male,Yes,1,Not Graduate,No,3321,2088,70,,1,Semiurban
132 | LP001703,Male,Yes,0,Graduate,No,3333,1270,124,360,1,Urban
133 | LP001718,Male,No,0,Graduate,No,3391,0,132,360,1,Rural
134 | LP001728,Male,Yes,1,Graduate,Yes,3343,1517,105,360,1,Rural
135 | LP001735,Female,No,1,Graduate,No,3620,0,90,360,1,Urban
136 | LP001737,Male,No,0,Graduate,No,4000,0,83,84,1,Urban
137 | LP001739,Male,Yes,0,Graduate,No,4258,0,125,360,1,Urban
138 | LP001742,Male,Yes,2,Graduate,No,4500,0,147,360,1,Rural
139 | LP001757,Male,Yes,1,Graduate,No,2014,2925,120,360,1,Rural
140 | LP001769,,No,,Graduate,No,3333,1250,110,360,1,Semiurban
141 | LP001771,Female,No,3+,Graduate,No,4083,0,103,360,,Semiurban
142 | LP001785,Male,No,0,Graduate,No,4727,0,150,360,0,Rural
143 | LP001787,Male,Yes,3+,Graduate,No,3089,2999,100,240,1,Rural
144 | LP001789,Male,Yes,3+,Not Graduate,,6794,528,139,360,0,Urban
145 | LP001791,Male,Yes,0,Graduate,Yes,32000,0,550,360,,Semiurban
146 | LP001794,Male,Yes,2,Graduate,Yes,10890,0,260,12,1,Rural
147 | LP001797,Female,No,0,Graduate,No,12941,0,150,300,1,Urban
148 | LP001815,Male,No,0,Not Graduate,No,3276,0,90,360,1,Semiurban
149 | LP001817,Male,No,0,Not Graduate,Yes,8703,0,199,360,0,Rural
150 | LP001818,Male,Yes,1,Graduate,No,4742,717,139,360,1,Semiurban
151 | LP001822,Male,No,0,Graduate,No,5900,0,150,360,1,Urban
152 | LP001827,Male,No,0,Graduate,No,3071,4309,180,360,1,Urban
153 | LP001831,Male,Yes,0,Graduate,No,2783,1456,113,360,1,Urban
154 | LP001842,Male,No,0,Graduate,No,5000,0,148,360,1,Rural
155 | LP001853,Male,Yes,1,Not Graduate,No,2463,2360,117,360,0,Urban
156 | LP001855,Male,Yes,2,Graduate,No,4855,0,72,360,1,Rural
157 | LP001857,Male,No,0,Not Graduate,Yes,1599,2474,125,300,1,Semiurban
158 | LP001862,Male,Yes,2,Graduate,Yes,4246,4246,214,360,1,Urban
159 | LP001867,Male,Yes,0,Graduate,No,4333,2291,133,350,1,Rural
160 | LP001878,Male,No,1,Graduate,No,5823,2529,187,360,1,Semiurban
161 | LP001881,Male,Yes,0,Not Graduate,No,7895,0,143,360,1,Rural
162 | LP001886,Male,No,0,Graduate,No,4150,4256,209,360,1,Rural
163 | LP001906,Male,No,0,Graduate,,2964,0,84,360,0,Semiurban
164 | LP001909,Male,No,0,Graduate,No,5583,0,116,360,1,Urban
165 | LP001911,Female,No,0,Graduate,No,2708,0,65,360,1,Rural
166 | LP001921,Male,No,1,Graduate,No,3180,2370,80,240,,Rural
167 | LP001923,Male,No,0,Not Graduate,No,2268,0,170,360,0,Semiurban
168 | LP001933,Male,No,2,Not Graduate,No,1141,2017,120,360,0,Urban
169 | LP001943,Male,Yes,0,Graduate,No,3042,3167,135,360,1,Urban
170 | LP001950,Female,Yes,3+,Graduate,,1750,2935,94,360,0,Semiurban
171 | LP001959,Female,Yes,1,Graduate,No,3564,0,79,360,1,Rural
172 | LP001961,Female,No,0,Graduate,No,3958,0,110,360,1,Rural
173 | LP001973,Male,Yes,2,Not Graduate,No,4483,0,130,360,1,Rural
174 | LP001975,Male,Yes,0,Graduate,No,5225,0,143,360,1,Rural
175 | LP001979,Male,No,0,Graduate,No,3017,2845,159,180,0,Urban
176 | LP001995,Male,Yes,0,Not Graduate,No,2431,1820,110,360,0,Rural
177 | LP001999,Male,Yes,2,Graduate,,4912,4614,160,360,1,Rural
178 | LP002007,Male,Yes,2,Not Graduate,No,2500,3333,131,360,1,Urban
179 | LP002009,Female,No,0,Graduate,No,2918,0,65,360,,Rural
180 | LP002016,Male,Yes,2,Graduate,No,5128,0,143,360,1,Rural
181 | LP002017,Male,Yes,3+,Graduate,No,15312,0,187,360,,Urban
182 | LP002018,Male,Yes,2,Graduate,No,3958,2632,160,360,1,Semiurban
183 | LP002027,Male,Yes,0,Graduate,No,4334,2945,165,360,1,Semiurban
184 | LP002028,Male,Yes,2,Graduate,No,4358,0,110,360,1,Urban
185 | LP002042,Female,Yes,1,Graduate,No,4000,3917,173,360,1,Rural
186 | LP002045,Male,Yes,3+,Graduate,No,10166,750,150,,1,Urban
187 | LP002046,Male,Yes,0,Not Graduate,No,4483,0,135,360,,Semiurban
188 | LP002047,Male,Yes,2,Not Graduate,No,4521,1184,150,360,1,Semiurban
189 | LP002056,Male,Yes,2,Graduate,No,9167,0,235,360,1,Semiurban
190 | LP002057,Male,Yes,0,Not Graduate,No,13083,0,,360,1,Rural
191 | LP002059,Male,Yes,2,Graduate,No,7874,3967,336,360,1,Rural
192 | LP002062,Female,Yes,1,Graduate,No,4333,0,132,84,1,Rural
193 | LP002064,Male,No,0,Graduate,No,4083,0,96,360,1,Urban
194 | LP002069,Male,Yes,2,Not Graduate,,3785,2912,180,360,0,Rural
195 | LP002070,Male,Yes,3+,Not Graduate,No,2654,1998,128,360,0,Rural
196 | LP002077,Male,Yes,1,Graduate,No,10000,2690,412,360,1,Semiurban
197 | LP002083,Male,No,0,Graduate,Yes,5833,0,116,360,1,Urban
198 | LP002090,Male,Yes,1,Graduate,No,4796,0,114,360,0,Semiurban
199 | LP002096,Male,Yes,0,Not Graduate,No,2000,1600,115,360,1,Rural
200 | LP002099,Male,Yes,2,Graduate,No,2540,700,104,360,0,Urban
201 | LP002102,Male,Yes,0,Graduate,Yes,1900,1442,88,360,1,Rural
202 | LP002105,Male,Yes,0,Graduate,Yes,8706,0,108,480,1,Rural
203 | LP002107,Male,Yes,3+,Not Graduate,No,2855,542,90,360,1,Urban
204 | LP002111,Male,Yes,,Graduate,No,3016,1300,100,360,,Urban
205 | LP002117,Female,Yes,0,Graduate,No,3159,2374,108,360,1,Semiurban
206 | LP002118,Female,No,0,Graduate,No,1937,1152,78,360,1,Semiurban
207 | LP002123,Male,Yes,0,Graduate,No,2613,2417,123,360,1,Semiurban
208 | LP002125,Male,Yes,1,Graduate,No,4960,2600,187,360,1,Semiurban
209 | LP002148,Male,Yes,1,Graduate,No,3074,1083,146,360,1,Semiurban
210 | LP002152,Female,No,0,Graduate,No,4213,0,80,360,1,Urban
211 | LP002165,,No,1,Not Graduate,No,2038,4027,100,360,1,Rural
212 | LP002167,Female,No,0,Graduate,No,2362,0,55,360,1,Urban
213 | LP002168,Male,No,0,Graduate,No,5333,2400,200,360,0,Rural
214 | LP002172,Male,Yes,3+,Graduate,Yes,5384,0,150,360,1,Semiurban
215 | LP002176,Male,No,0,Graduate,No,5708,0,150,360,1,Rural
216 | LP002183,Male,Yes,0,Not Graduate,No,3754,3719,118,,1,Rural
217 | LP002184,Male,Yes,0,Not Graduate,No,2914,2130,150,300,1,Urban
218 | LP002186,Male,Yes,0,Not Graduate,No,2747,2458,118,36,1,Semiurban
219 | LP002192,Male,Yes,0,Graduate,No,7830,2183,212,360,1,Rural
220 | LP002195,Male,Yes,1,Graduate,Yes,3507,3148,212,360,1,Rural
221 | LP002208,Male,Yes,1,Graduate,No,3747,2139,125,360,1,Urban
222 | LP002212,Male,Yes,0,Graduate,No,2166,2166,108,360,,Urban
223 | LP002240,Male,Yes,0,Not Graduate,No,3500,2168,149,360,1,Rural
224 | LP002245,Male,Yes,2,Not Graduate,No,2896,0,80,480,1,Urban
225 | LP002253,Female,No,1,Graduate,No,5062,0,152,300,1,Rural
226 | LP002256,Female,No,2,Graduate,Yes,5184,0,187,360,0,Semiurban
227 | LP002257,Female,No,0,Graduate,No,2545,0,74,360,1,Urban
228 | LP002264,Male,Yes,0,Graduate,No,2553,1768,102,360,1,Urban
229 | LP002270,Male,Yes,1,Graduate,No,3436,3809,100,360,1,Rural
230 | LP002279,Male,No,0,Graduate,No,2412,2755,130,360,1,Rural
231 | LP002286,Male,Yes,3+,Not Graduate,No,5180,0,125,360,0,Urban
232 | LP002294,Male,No,0,Graduate,No,14911,14507,130,360,1,Semiurban
233 | LP002298,,No,0,Graduate,Yes,2860,2988,138,360,1,Urban
234 | LP002306,Male,Yes,0,Graduate,No,1173,1594,28,180,1,Rural
235 | LP002310,Female,No,1,Graduate,No,7600,0,92,360,1,Semiurban
236 | LP002311,Female,Yes,0,Graduate,No,2157,1788,104,360,1,Urban
237 | LP002316,Male,No,0,Graduate,No,2231,2774,176,360,0,Urban
238 | LP002321,Female,No,0,Graduate,No,2274,5211,117,360,0,Semiurban
239 | LP002325,Male,Yes,2,Not Graduate,No,6166,13983,102,360,1,Rural
240 | LP002326,Male,Yes,2,Not Graduate,No,2513,1110,107,360,1,Semiurban
241 | LP002329,Male,No,0,Graduate,No,4333,0,66,480,1,Urban
242 | LP002333,Male,No,0,Not Graduate,No,3844,0,105,360,1,Urban
243 | LP002339,Male,Yes,0,Graduate,No,3887,1517,105,360,0,Semiurban
244 | LP002344,Male,Yes,0,Graduate,No,3510,828,105,360,1,Semiurban
245 | LP002346,Male,Yes,0,Graduate,,2539,1704,125,360,0,Rural
246 | LP002354,Female,No,0,Not Graduate,No,2107,0,64,360,1,Semiurban
247 | LP002355,,Yes,0,Graduate,No,3186,3145,150,180,0,Semiurban
248 | LP002358,Male,Yes,2,Graduate,Yes,5000,2166,150,360,1,Urban
249 | LP002360,Male,Yes,,Graduate,No,10000,0,,360,1,Urban
250 | LP002375,Male,Yes,0,Not Graduate,Yes,3943,0,64,360,1,Semiurban
251 | LP002376,Male,No,0,Graduate,No,2925,0,40,180,1,Rural
252 | LP002383,Male,Yes,3+,Graduate,No,3242,437,142,480,0,Urban
253 | LP002385,Male,Yes,,Graduate,No,3863,0,70,300,1,Semiurban
254 | LP002389,Female,No,1,Graduate,No,4028,0,131,360,1,Semiurban
255 | LP002394,Male,Yes,2,Graduate,No,4010,1025,120,360,1,Urban
256 | LP002397,Female,Yes,1,Graduate,No,3719,1585,114,360,1,Urban
257 | LP002399,Male,No,0,Graduate,,2858,0,123,360,0,Rural
258 | LP002400,Female,Yes,0,Graduate,No,3833,0,92,360,1,Rural
259 | LP002402,Male,Yes,0,Graduate,No,3333,4288,160,360,1,Urban
260 | LP002412,Male,Yes,0,Graduate,No,3007,3725,151,360,1,Rural
261 | LP002415,Female,No,1,Graduate,,1850,4583,81,360,,Rural
262 | LP002417,Male,Yes,3+,Not Graduate,No,2792,2619,171,360,1,Semiurban
263 | LP002420,Male,Yes,0,Graduate,No,2982,1550,110,360,1,Semiurban
264 | LP002425,Male,No,0,Graduate,No,3417,738,100,360,,Rural
265 | LP002433,Male,Yes,1,Graduate,No,18840,0,234,360,1,Rural
266 | LP002440,Male,Yes,2,Graduate,No,2995,1120,184,360,1,Rural
267 | LP002441,Male,No,,Graduate,No,3579,3308,138,360,,Semiurban
268 | LP002442,Female,Yes,1,Not Graduate,No,3835,1400,112,480,0,Urban
269 | LP002445,Female,No,1,Not Graduate,No,3854,3575,117,360,1,Rural
270 | LP002450,Male,Yes,2,Graduate,No,5833,750,49,360,0,Rural
271 | LP002471,Male,No,0,Graduate,No,3508,0,99,360,1,Rural
272 | LP002476,Female,Yes,3+,Not Graduate,No,1635,2444,99,360,1,Urban
273 | LP002482,Female,No,0,Graduate,Yes,3333,3916,212,360,1,Rural
274 | LP002485,Male,No,1,Graduate,No,24797,0,240,360,1,Semiurban
275 | LP002495,Male,Yes,2,Graduate,No,5667,440,130,360,0,Semiurban
276 | LP002496,Female,No,0,Graduate,No,3500,0,94,360,0,Semiurban
277 | LP002523,Male,Yes,3+,Graduate,No,2773,1497,108,360,1,Semiurban
278 | LP002542,Male,Yes,0,Graduate,,6500,0,144,360,1,Urban
279 | LP002550,Female,No,0,Graduate,No,5769,0,110,180,1,Semiurban
280 | LP002551,Male,Yes,3+,Not Graduate,,3634,910,176,360,0,Semiurban
281 | LP002553,,No,0,Graduate,No,29167,0,185,360,1,Semiurban
282 | LP002554,Male,No,0,Graduate,No,2166,2057,122,360,1,Semiurban
283 | LP002561,Male,Yes,0,Graduate,No,5000,0,126,360,1,Rural
284 | LP002566,Female,No,0,Graduate,No,5530,0,135,360,,Urban
285 | LP002568,Male,No,0,Not Graduate,No,9000,0,122,360,1,Rural
286 | LP002570,Female,Yes,2,Graduate,No,10000,11666,460,360,1,Urban
287 | LP002572,Male,Yes,1,Graduate,,8750,0,297,360,1,Urban
288 | LP002581,Male,Yes,0,Not Graduate,No,2157,2730,140,360,,Rural
289 | LP002584,Male,No,0,Graduate,,1972,4347,106,360,1,Rural
290 | LP002592,Male,No,0,Graduate,No,4983,0,141,360,1,Urban
291 | LP002593,Male,Yes,1,Graduate,No,8333,4000,,360,1,Urban
292 | LP002599,Male,Yes,0,Graduate,No,3667,2000,170,360,1,Semiurban
293 | LP002604,Male,Yes,2,Graduate,No,3166,2833,145,360,1,Urban
294 | LP002605,Male,No,0,Not Graduate,No,3271,0,90,360,1,Rural
295 | LP002609,Female,Yes,0,Graduate,No,2241,2000,88,360,0,Urban
296 | LP002610,Male,Yes,1,Not Graduate,,1792,2565,128,360,1,Urban
297 | LP002612,Female,Yes,0,Graduate,No,2666,0,84,480,1,Semiurban
298 | LP002614,,No,0,Graduate,No,6478,0,108,360,1,Semiurban
299 | LP002630,Male,No,0,Not Graduate,,3808,0,83,360,1,Rural
300 | LP002635,Female,Yes,2,Not Graduate,No,3729,0,117,360,1,Semiurban
301 | LP002639,Male,Yes,2,Graduate,No,4120,0,128,360,1,Rural
302 | LP002644,Male,Yes,1,Graduate,Yes,7500,0,75,360,1,Urban
303 | LP002651,Male,Yes,1,Graduate,,6300,0,125,360,0,Urban
304 | LP002654,Female,No,,Graduate,Yes,14987,0,177,360,1,Rural
305 | LP002657,,Yes,1,Not Graduate,Yes,570,2125,68,360,1,Rural
306 | LP002711,Male,Yes,0,Graduate,No,2600,700,96,360,1,Semiurban
307 | LP002712,Male,No,2,Not Graduate,No,2733,1083,180,360,,Semiurban
308 | LP002721,Male,Yes,2,Graduate,Yes,7500,0,183,360,1,Rural
309 | LP002735,Male,Yes,2,Not Graduate,No,3859,0,121,360,1,Rural
310 | LP002744,Male,Yes,1,Graduate,No,6825,0,162,360,1,Rural
311 | LP002745,Male,Yes,0,Graduate,No,3708,4700,132,360,1,Semiurban
312 | LP002746,Male,No,0,Graduate,No,5314,0,147,360,1,Urban
313 | LP002747,Female,No,3+,Graduate,No,2366,5272,153,360,0,Rural
314 | LP002754,Male,No,,Graduate,No,2066,2108,104,84,1,Urban
315 | LP002759,Male,Yes,2,Graduate,No,5000,0,149,360,1,Rural
316 | LP002760,Female,No,0,Graduate,No,3767,0,134,300,1,Urban
317 | LP002766,Female,Yes,0,Graduate,No,7859,879,165,180,1,Semiurban
318 | LP002769,Female,Yes,0,Graduate,No,4283,0,120,360,1,Rural
319 | LP002774,Male,Yes,0,Not Graduate,No,1700,2900,67,360,0,Urban
320 | LP002775,,No,0,Not Graduate,No,4768,0,125,360,1,Rural
321 | LP002781,Male,No,0,Graduate,No,3083,2738,120,360,1,Urban
322 | LP002782,Male,Yes,1,Graduate,No,2667,1542,148,360,1,Rural
323 | LP002786,Female,Yes,0,Not Graduate,No,1647,1762,181,360,1,Urban
324 | LP002790,Male,Yes,3+,Graduate,No,3400,0,80,120,1,Urban
325 | LP002791,Male,No,1,Graduate,,16000,5000,40,360,1,Semiurban
326 | LP002793,Male,Yes,0,Graduate,No,5333,0,90,360,1,Rural
327 | LP002802,Male,No,0,Graduate,No,2875,2416,95,6,0,Semiurban
328 | LP002803,Male,Yes,1,Not Graduate,,2600,618,122,360,1,Semiurban
329 | LP002805,Male,Yes,2,Graduate,No,5041,700,150,360,1,Urban
330 | LP002806,Male,Yes,3+,Graduate,Yes,6958,1411,150,360,1,Rural
331 | LP002816,Male,Yes,1,Graduate,No,3500,1658,104,360,,Semiurban
332 | LP002823,Male,Yes,0,Graduate,No,5509,0,143,360,1,Rural
333 | LP002825,Male,Yes,3+,Graduate,No,9699,0,300,360,1,Urban
334 | LP002826,Female,Yes,1,Not Graduate,No,3621,2717,171,360,1,Urban
335 | LP002843,Female,Yes,0,Graduate,No,4709,0,113,360,1,Semiurban
336 | LP002849,Male,Yes,0,Graduate,No,1516,1951,35,360,1,Semiurban
337 | LP002850,Male,No,2,Graduate,No,2400,0,46,360,1,Urban
338 | LP002853,Female,No,0,Not Graduate,No,3015,2000,145,360,,Urban
339 | LP002856,Male,Yes,0,Graduate,No,2292,1558,119,360,1,Urban
340 | LP002857,Male,Yes,1,Graduate,Yes,2360,3355,87,240,1,Rural
341 | LP002858,Female,No,0,Graduate,No,4333,2333,162,360,0,Rural
342 | LP002860,Male,Yes,0,Graduate,Yes,2623,4831,122,180,1,Semiurban
343 | LP002867,Male,No,0,Graduate,Yes,3972,4275,187,360,1,Rural
344 | LP002869,Male,Yes,3+,Not Graduate,No,3522,0,81,180,1,Rural
345 | LP002870,Male,Yes,1,Graduate,No,4700,0,80,360,1,Urban
346 | LP002876,Male,No,0,Graduate,No,6858,0,176,360,1,Rural
347 | LP002878,Male,Yes,3+,Graduate,No,8334,0,260,360,1,Urban
348 | LP002879,Male,Yes,0,Graduate,No,3391,1966,133,360,0,Rural
349 | LP002885,Male,No,0,Not Graduate,No,2868,0,70,360,1,Urban
350 | LP002890,Male,Yes,2,Not Graduate,No,3418,1380,135,360,1,Urban
351 | LP002891,Male,Yes,0,Graduate,Yes,2500,296,137,300,1,Rural
352 | LP002899,Male,Yes,2,Graduate,No,8667,0,254,360,1,Rural
353 | LP002901,Male,No,0,Graduate,No,2283,15000,106,360,,Rural
354 | LP002907,Male,Yes,0,Graduate,No,5817,910,109,360,1,Urban
355 | LP002920,Male,Yes,0,Graduate,No,5119,3769,120,360,1,Rural
356 | LP002921,Male,Yes,3+,Not Graduate,No,5316,187,158,180,0,Semiurban
357 | LP002932,Male,Yes,3+,Graduate,No,7603,1213,197,360,1,Urban
358 | LP002935,Male,Yes,1,Graduate,No,3791,1936,85,360,1,Urban
359 | LP002952,Male,No,0,Graduate,No,2500,0,60,360,1,Urban
360 | LP002954,Male,Yes,2,Not Graduate,No,3132,0,76,360,,Rural
361 | LP002962,Male,No,0,Graduate,No,4000,2667,152,360,1,Semiurban
362 | LP002965,Female,Yes,0,Graduate,No,8550,4255,96,360,,Urban
363 | LP002969,Male,Yes,1,Graduate,No,2269,2167,99,360,1,Semiurban
364 | LP002971,Male,Yes,3+,Not Graduate,Yes,4009,1777,113,360,1,Urban
365 | LP002975,Male,Yes,0,Graduate,No,4158,709,115,360,1,Urban
366 | LP002980,Male,No,0,Graduate,No,3250,1993,126,360,,Semiurban
367 | LP002986,Male,Yes,0,Graduate,No,5000,2393,158,360,1,Rural
368 | LP002989,Male,No,0,Graduate,Yes,9200,0,98,180,1,Rural
--------------------------------------------------------------------------------
/sample.txt:
--------------------------------------------------------------------------------
1 | 24 29 88
2 | 1 0 8
3 | 33 7 99
4 | 39 11 98
5 | 22 76 87
--------------------------------------------------------------------------------
/sample0.txt:
--------------------------------------------------------------------------------
1 | Dogu Turkey Football
2 | John USA Hockey
3 | Paul Canada Basketball
--------------------------------------------------------------------------------