├── chap_2
│   └── .DS_Store
├── chap_3
│   ├── .DS_Store
│   ├── .ipynb_checkpoints
│   │   ├── Spark Structured Streaming-checkpoint.ipynb
│   │   ├── Spark Structured Streaming-ver_1-checkpoint.ipynb
│   │   ├── Spark Structured Streaming app-checkpoint.ipynb
│   │   ├── Spark Structured Streaming demo-checkpoint.ipynb
│   │   ├── Logistic_resgression_pyspark-checkpoint.ipynb
│   │   └── pyspark_basics-checkpoint.ipynb
│   └── Spark Structured Streaming demo.ipynb
├── chap_4
│   ├── .DS_Store
│   └── pramod_dag.py
├── chap_5
│   ├── .DS_Store
│   ├── Classification_using_MLlib.ipynb
│   └── .ipynb_checkpoints
│       └── Classification_using_MLlib-checkpoint.ipynb
├── chap_6
│   └── .DS_Store
├── chap_7
│   └── .DS_Store
├── chap_8
│   ├── .DS_Store
│   ├── .ipynb_checkpoints
│   │   └── multilayer perceptron-checkpoint.ipynb
│   └── Multilayer_perceptron_spark.ipynb
├── 9781484249604.jpg
├── errata.md
├── README.md
├── Contributing.md
└── LICENSE.txt
/9781484249604.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/learn-pyspark/HEAD/9781484249604.jpg
--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
1 | # Errata for *Learn PySpark*
2 |
3 | On **page xx** [Summary of error]:
4 |
5 | Details of error here. Highlight key pieces in **bold**.
6 |
7 | ***
8 |
9 | On **page xx** [Summary of error]:
10 |
11 | Details of error here. Highlight key pieces in **bold**.
12 |
13 | ***
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Apress Source Code
2 |
3 | This repository accompanies [*Learn PySpark*](https://www.apress.com/9781484249604) by Pramod Singh (Apress, 2019).
4 |
5 | [comment]: #cover
6 | ![Cover image](9781484249604.jpg)
7 |
8 | Download the files as a zip using the green button, or clone the repository to your machine using Git.
9 |
10 | ## Releases
11 |
12 | Release v1.0 corresponds to the code in the published book, without corrections or updates.
13 |
14 | ## Contributions
15 |
16 | See the file Contributing.md for more information on how you can contribute to this repository.
--------------------------------------------------------------------------------
/Contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to Apress Source Code
2 |
3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers.
4 |
5 | ## How to Contribute
6 |
7 | 1. Make sure you have a GitHub account.
8 | 2. Fork the repository for the relevant book.
9 | 3. Create a new branch on which to make your change, e.g.
10 | `git checkout -b my_code_contribution`
11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted.
12 | 5. Submit a pull request.
13 |
14 | Thank you for your contribution!
--------------------------------------------------------------------------------
/chap_4/pramod_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import airflow
4 | from airflow import DAG
5 | from airflow.operators.bash_operator import BashOperator
6 |
7 |
8 |
9 | args = {
10 |     'owner': 'Pramod',
11 |     'start_date': airflow.utils.dates.days_ago(3),
12 |     # 'end_date': datetime(2018, 12, 30),
13 |     'depends_on_past': False,
14 |     'email': ['airflow@example.com'],
15 |     'email_on_failure': False,
16 |     'email_on_retry': False,
17 |     # If a task fails, retry it once after waiting
18 |     # at least 5 minutes
19 |     'retries': 1,
20 |     'retry_delay': timedelta(minutes=5),
21 | }
22 |
23 |
24 | dag = DAG(
25 |     'pramod_airflow_dag',
26 |     default_args=args,
27 |     description='A simple DAG',
28 |     # Continue to run DAG once per day
29 |     schedule_interval=timedelta(days=1)
30 | )
31 |
32 |
33 | # t1 and t2 are examples of tasks created by instantiating operators
34 | t1 = BashOperator(
35 |     task_id='print_date',
36 |     bash_command='date',
37 |     dag=dag,
38 | )
39 |
40 | t2 = BashOperator(
41 |     task_id='sleep',
42 |     depends_on_past=False,
43 |     bash_command='sleep 5',
44 |     dag=dag,
45 | )
46 |
47 |
48 |
49 | t1 >> t2
50 |
51 |
--------------------------------------------------------------------------------
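The `>>` in `t1 >> t2` is Airflow's dependency operator: `t2` is scheduled only after `t1` succeeds. As a minimal sketch of how the chain could be extended, the lines below append a hypothetical third task, `t3` (not part of the published file), to the DAG defined above:

    # Hypothetical extension of pramod_dag.py: a third task downstream of t2.
    t3 = BashOperator(
        task_id='print_done',
        # {{ ds }} is Airflow's built-in template for the execution date
        bash_command='echo "run for {{ ds }} finished"',
        dag=dag,
    )

    # Chained form of the dependency operator: t1, then t2, then t3.
    t1 >> t2 >> t3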
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Freeware License, some rights reserved
2 |
3 | Copyright (c) 2019 Pramod Singh
4 |
5 | Permission is hereby granted, free of charge, to anyone obtaining a copy
6 | of this software and associated documentation files (the "Software"),
7 | to work with the Software within the limits of freeware distribution and fair use.
8 | This includes the rights to use, copy, and modify the Software for personal use.
9 | Users are also allowed and encouraged to submit corrections and modifications
10 | to the Software for the benefit of other users.
11 |
12 | It is not allowed to reuse, modify, or redistribute the Software for
13 | commercial use in any way, or for a user’s educational materials such as books
14 | or blog articles without prior permission from the copyright holder.
15 |
16 | The above copyright notice and this permission notice need to be included
17 | in all copies or substantial portions of the software.
18 |
19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
25 | SOFTWARE.
26 |
27 |
28 |
--------------------------------------------------------------------------------
/chap_3/.ipynb_checkpoints/Spark Structured Streaming-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import pyspark"
10 | ]
11 | },
12 | {
13 | "cell_type": "code",
14 | "execution_count": 2,
15 | "metadata": {},
16 | "outputs": [],
17 | "source": [
18 | "#import SparkSession\n",
19 | "from pyspark.sql import SparkSession\n",
20 | "spark=SparkSession.builder.appName('ss').getOrCreate()"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 20,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "from pyspark.sql.functions import *\n",
30 | "from pyspark.sql.types import *"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 22,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "#create sample dataset\n",
40 | "df_1=spark.createDataFrame([('pramod neha',),('pramod ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 23,
46 | "metadata": {},
47 | "outputs": [],
48 | "source": [
49 | "#define schema for input data\n",
50 | "schema=StructType().add('name','string')\n",
51 | "name_list=spark.readStream.schema(schema).format('parquet').load(\"new_folder\")"
52 | ]
53 | },
54 | {
55 | "cell_type": "code",
56 | "execution_count": 24,
57 | "metadata": {},
58 | "outputs": [],
59 | "source": [
60 | "#split the names into individual names\n",
61 | "names=name_list.select(explode(split(name_list.name,' ')).alias('name'))"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": 25,
67 | "metadata": {},
68 | "outputs": [],
69 | "source": [
70 | "name_count=names.groupBy('name').count()"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 27,
76 | "metadata": {},
77 | "outputs": [],
78 | "source": [
79 | "#query to write the results into memory sink\n",
80 | "query=(name_count.writeStream.queryName('new_query').outputMode('complete').format('memory').start())"
81 | ]
82 | },
83 | {
84 | "cell_type": "code",
85 | "execution_count": 31,
86 | "metadata": {},
87 | "outputs": [
88 | {
89 | "data": {
90 | "text/html": [
91 | "
\n",
92 | "
\n",
93 | " \n",
94 | " \n",
95 | " | \n",
96 | " name | \n",
97 | " count | \n",
98 | "
\n",
99 | " \n",
100 | " \n",
101 | " \n",
102 | " | 0 | \n",
103 | " ziaan | \n",
104 | " 1 | \n",
105 | "
\n",
106 | " \n",
107 | " | 1 | \n",
108 | " neha | \n",
109 | " 1 | \n",
110 | "
\n",
111 | " \n",
112 | " | 2 | \n",
113 | " pramod | \n",
114 | " 2 | \n",
115 | "
\n",
116 | " \n",
117 | "
\n",
118 | "
"
119 | ],
120 | "text/plain": [
121 | " name count\n",
122 | "0 ziaan 1\n",
123 | "1 neha 1\n",
124 | "2 pramod 2"
125 | ]
126 | },
127 | "execution_count": 31,
128 | "metadata": {},
129 | "output_type": "execute_result"
130 | }
131 | ],
132 | "source": [
133 | "spark.sql(\"select * from new_query order by count \").toPandas().head(5)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": 32,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "df_2=spark.createDataFrame([('ziaan neha',),('ziaan ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')"
143 | ]
144 | },
145 | {
146 | "cell_type": "code",
147 | "execution_count": 35,
148 | "metadata": {},
149 | "outputs": [
150 | {
151 | "data": {
152 | "text/html": [
153 | "\n",
154 | "
\n",
155 | " \n",
156 | " \n",
157 | " | \n",
158 | " name | \n",
159 | " count | \n",
160 | "
\n",
161 | " \n",
162 | " \n",
163 | " \n",
164 | " | 0 | \n",
165 | " pramod | \n",
166 | " 2 | \n",
167 | "
\n",
168 | " \n",
169 | " | 1 | \n",
170 | " neha | \n",
171 | " 3 | \n",
172 | "
\n",
173 | " \n",
174 | " | 2 | \n",
175 | " ziaan | \n",
176 | " 5 | \n",
177 | "
\n",
178 | " \n",
179 | "
\n",
180 | "
"
181 | ],
182 | "text/plain": [
183 | " name count\n",
184 | "0 pramod 2\n",
185 | "1 neha 3\n",
186 | "2 ziaan 5"
187 | ]
188 | },
189 | "execution_count": 35,
190 | "metadata": {},
191 | "output_type": "execute_result"
192 | }
193 | ],
194 | "source": [
195 | "spark.sql(\"select * from new_query order by count \").toPandas().head(50)"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 34,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "df_3=spark.createDataFrame([('neha',),('ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": null,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": []
213 | }
214 | ],
215 | "metadata": {
216 | "kernelspec": {
217 | "display_name": "Python 3",
218 | "language": "python",
219 | "name": "python3"
220 | },
221 | "language_info": {
222 | "codemirror_mode": {
223 | "name": "ipython",
224 | "version": 3
225 | },
226 | "file_extension": ".py",
227 | "mimetype": "text/x-python",
228 | "name": "python",
229 | "nbconvert_exporter": "python",
230 | "pygments_lexer": "ipython3",
231 | "version": "3.6.3"
232 | }
233 | },
234 | "nbformat": 4,
235 | "nbformat_minor": 2
236 | }
237 |
--------------------------------------------------------------------------------
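The checkpoint notebook above starts a memory-sink streaming query named `new_query` but never shows stopping it. A minimal sketch of managing that query's lifecycle, assuming the notebook's `spark` session and `query` handle are still live:

    # Inspect and stop the streaming query started in the notebook above.
    print(query.status)        # is a trigger currently active?
    print(query.lastProgress)  # metrics for the most recent micro-batch

    # The memory sink keeps accumulating state while the query runs;
    # stop it explicitly once the in-memory table is no longer needed.
    query.stop()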
/chap_8/.ipynb_checkpoints/multilayer perceptron-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load the libraries"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import os\n",
17 | "import numpy as np\n",
18 | "import pandas as pd\n",
19 | "from pyspark.sql.types import *\n",
20 | "from pyspark.ml import Pipeline\n",
21 | "from pyspark.sql import functions as f\n",
22 | "from pyspark.sql.functions import udf, StringType\n",
23 | "from pyspark.sql import SparkSession, functions as F\n",
24 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
25 | "from pyspark.ml.classification import MultilayerPerceptronClassifier\n",
26 | "from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer"
27 | ]
28 | },
29 | {
30 | "cell_type": "markdown",
31 | "metadata": {},
32 | "source": [
33 | "# Initialize Spark Session"
34 | ]
35 | },
36 | {
37 | "cell_type": "code",
38 | "execution_count": 2,
39 | "metadata": {},
40 | "outputs": [],
41 | "source": [
42 | "spark = SparkSession.builder.appName('pyspark-dl').getOrCreate()"
43 | ]
44 | },
45 | {
46 | "cell_type": "markdown",
47 | "metadata": {},
48 | "source": [
49 | "# Read the Dataset"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": 3,
55 | "metadata": {},
56 | "outputs": [],
57 | "source": [
58 | "web_data = spark.read.csv('data_set.csv', header=True, inferSchema=True)"
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": 4,
64 | "metadata": {},
65 | "outputs": [
66 | {
67 | "name": "stdout",
68 | "output_type": "stream",
69 | "text": [
70 | "root\n",
71 | " |-- Visit_Number_Bucket: string (nullable = true)\n",
72 | " |-- Page_Views_Normalized: double (nullable = true)\n",
73 | " |-- Orders_Normalized: integer (nullable = true)\n",
74 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n",
75 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n",
76 | " |-- Email_Signup_Normalized: double (nullable = true)\n",
77 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n",
78 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n",
79 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n",
80 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n",
81 | " |-- Mapped_Browser_Type: string (nullable = true)\n",
82 | " |-- Mapped_Entry_Pages: string (nullable = true)\n",
83 | " |-- Mapped_Site_Section: string (nullable = true)\n",
84 | " |-- Mapped_Promo_Code: string (nullable = true)\n",
85 | " |-- Maped_Product_Name: string (nullable = true)\n",
86 | " |-- Mapped_Search_Term: string (nullable = true)\n",
87 | " |-- Mapped_Product_Collection: string (nullable = true)\n",
88 | "\n"
89 | ]
90 | }
91 | ],
92 | "source": [
93 | "web_data.printSchema()"
94 | ]
95 | },
96 | {
97 | "cell_type": "markdown",
98 | "metadata": {},
99 | "source": [
100 | "# Rename Target Column"
101 | ]
102 | },
103 | {
104 | "cell_type": "code",
105 | "execution_count": 5,
106 | "metadata": {},
107 | "outputs": [],
108 | "source": [
109 | "web_data_renamed = web_data.withColumnRenamed('Orders_Normalized', 'label')"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": 6,
115 | "metadata": {},
116 | "outputs": [
117 | {
118 | "name": "stdout",
119 | "output_type": "stream",
120 | "text": [
121 | "root\n",
122 | " |-- Visit_Number_Bucket: string (nullable = true)\n",
123 | " |-- Page_Views_Normalized: double (nullable = true)\n",
124 | " |-- label: integer (nullable = true)\n",
125 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n",
126 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n",
127 | " |-- Email_Signup_Normalized: double (nullable = true)\n",
128 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n",
129 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n",
130 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n",
131 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n",
132 | " |-- Mapped_Browser_Type: string (nullable = true)\n",
133 | " |-- Mapped_Entry_Pages: string (nullable = true)\n",
134 | " |-- Mapped_Site_Section: string (nullable = true)\n",
135 | " |-- Mapped_Promo_Code: string (nullable = true)\n",
136 | " |-- Maped_Product_Name: string (nullable = true)\n",
137 | " |-- Mapped_Search_Term: string (nullable = true)\n",
138 | " |-- Mapped_Product_Collection: string (nullable = true)\n",
139 | "\n"
140 | ]
141 | }
142 | ],
143 | "source": [
144 | "web_data_renamed.printSchema()"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {},
150 | "source": [
151 | "# Split the dataset into Train, Validation and Test"
152 | ]
153 | },
154 | {
155 | "cell_type": "code",
156 | "execution_count": 7,
157 | "metadata": {},
158 | "outputs": [],
159 | "source": [
160 | "train, validation, test = web_data_renamed.randomSplit([0.7, 0.2, 0.1], 1234)"
161 | ]
162 | },
163 | {
164 | "cell_type": "markdown",
165 | "metadata": {},
166 | "source": [
167 | "# Build Pipeline"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": 8,
173 | "metadata": {},
174 | "outputs": [],
175 | "source": [
176 | "categorical_columns = [item[0] for item in web_data_renamed.dtypes if item[1].startswith('string')]\n",
177 | "numeric_columns = [item[0] for item in web_data_renamed.dtypes if item[1].startswith('double')]"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 9,
183 | "metadata": {},
184 | "outputs": [],
185 | "source": [
186 | "indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]\n"
187 | ]
188 | },
189 | {
190 | "cell_type": "code",
191 | "execution_count": 10,
192 | "metadata": {},
193 | "outputs": [],
194 | "source": [
195 | "featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol=\"features\")\n"
196 | ]
197 | },
198 | {
199 | "cell_type": "code",
200 | "execution_count": 11,
201 | "metadata": {},
202 | "outputs": [],
203 | "source": [
204 | "layers = [len(featuresCreator.getInputCols()), 4, 2, 2]\n",
205 | "\n",
206 | "classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)\n"
207 | ]
208 | },
209 | {
210 | "cell_type": "code",
211 | "execution_count": 12,
212 | "metadata": {},
213 | "outputs": [],
214 | "source": [
215 | "pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])"
216 | ]
217 | },
218 | {
219 | "cell_type": "markdown",
220 | "metadata": {},
221 | "source": [
222 | "# Fit Pipeline"
223 | ]
224 | },
225 | {
226 | "cell_type": "code",
227 | "execution_count": 13,
228 | "metadata": {},
229 | "outputs": [],
230 | "source": [
231 | "model = pipeline.fit(train)"
232 | ]
233 | },
234 | {
235 | "cell_type": "markdown",
236 | "metadata": {},
237 | "source": [
238 | "# Get Pipeline Output"
239 | ]
240 | },
241 | {
242 | "cell_type": "code",
243 | "execution_count": 14,
244 | "metadata": {},
245 | "outputs": [],
246 | "source": [
247 | "train_output_df = model.transform(train)\n",
248 | "validation_output_df = model.transform(validation)\n",
249 | "test_output_df = model.transform(test)"
250 | ]
251 | },
252 | {
253 | "cell_type": "markdown",
254 | "metadata": {},
255 | "source": [
256 | "# Evaluate the Predictions"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": 15,
262 | "metadata": {
263 | "scrolled": true
264 | },
265 | "outputs": [
266 | {
267 | "name": "stdout",
268 | "output_type": "stream",
269 | "text": [
270 | "Train weightedPrecision = 0.976101874447846\n",
271 | "Validation weightedPrecision = 0.9765821626938243\n",
272 | "Test weightedPrecision = 0.9747324280445043\n",
273 | "Train weightedRecall = 0.9755751041220662\n",
274 | "Validation weightedRecall = 0.9761613691931541\n",
275 | "Test weightedRecall = 0.9742582305920606\n",
276 | "Train accuracy = 0.975575104122066\n",
277 | "Validation accuracy = 0.976161369193154\n",
278 | "Test accuracy = 0.9742582305920607\n"
279 | ]
280 | }
281 | ],
282 | "source": [
283 | "train_predictionAndLabels = train_output_df.select(\"prediction\", \"label\")\n",
284 | "validation_predictionAndLabels = validation_output_df.select(\"prediction\", \"label\")\n",
285 | "test_predictionAndLabels = test_output_df.select(\"prediction\", \"label\")\n",
286 | "\n",
287 | "metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']\n",
288 | "\n",
289 | "for metric in metrics:\n",
290 | " evaluator = MulticlassClassificationEvaluator(metricName=metric)\n",
291 | " print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))\n",
292 | " print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))\n",
293 | " print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))"
294 | ]
295 | }
296 | ],
297 | "metadata": {
298 | "kernelspec": {
299 | "display_name": "Python 3",
300 | "language": "python",
301 | "name": "python3"
302 | },
303 | "language_info": {
304 | "codemirror_mode": {
305 | "name": "ipython",
306 | "version": 3
307 | },
308 | "file_extension": ".py",
309 | "mimetype": "text/x-python",
310 | "name": "python",
311 | "nbconvert_exporter": "python",
312 | "pygments_lexer": "ipython3",
313 | "version": "3.6.3"
314 | }
315 | },
316 | "nbformat": 4,
317 | "nbformat_minor": 2
318 | }
319 |
--------------------------------------------------------------------------------
/chap_8/Multilayer_perceptron_spark.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Load the libraries"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {
14 | "collapsed": true
15 | },
16 | "outputs": [],
17 | "source": [
18 | "import os\n",
19 | "import numpy as np\n",
20 | "import pandas as pd\n",
21 | "from pyspark.sql.types import *\n",
22 | "from pyspark.ml import Pipeline\n",
23 | "from pyspark.sql import functions as f\n",
24 | "from pyspark.sql.functions import udf, StringType\n",
25 | "from pyspark.sql import SparkSession, functions as F\n",
26 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n",
27 | "from pyspark.ml.classification import MultilayerPerceptronClassifier\n",
28 | "from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer"
29 | ]
30 | },
31 | {
32 | "cell_type": "markdown",
33 | "metadata": {},
34 | "source": [
35 | "# Initialize Spark Session"
36 | ]
37 | },
38 | {
39 | "cell_type": "code",
40 | "execution_count": 2,
41 | "metadata": {
42 | "collapsed": true
43 | },
44 | "outputs": [],
45 | "source": [
46 | "spark = SparkSession.builder.appName('deep_learning').getOrCreate()"
47 | ]
48 | },
49 | {
50 | "cell_type": "markdown",
51 | "metadata": {},
52 | "source": [
53 | "# Read the Dataset"
54 | ]
55 | },
56 | {
57 | "cell_type": "code",
58 | "execution_count": 3,
59 | "metadata": {
60 | "collapsed": true
61 | },
62 | "outputs": [],
63 | "source": [
64 | "data = spark.read.csv('dl_data.csv', header=True, inferSchema=True)"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": 4,
70 | "metadata": {},
71 | "outputs": [
72 | {
73 | "name": "stdout",
74 | "output_type": "stream",
75 | "text": [
76 | "root\n",
77 | " |-- Visit_Number_Bucket: string (nullable = true)\n",
78 | " |-- Page_Views_Normalized: double (nullable = true)\n",
79 | " |-- Orders_Normalized: integer (nullable = true)\n",
80 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n",
81 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n",
82 | " |-- Email_Signup_Normalized: double (nullable = true)\n",
83 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n",
84 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n",
85 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n",
86 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n",
87 | " |-- Mapped_Browser_Type: string (nullable = true)\n",
88 | " |-- Mapped_Entry_Pages: string (nullable = true)\n",
89 | " |-- Mapped_Site_Section: string (nullable = true)\n",
90 | " |-- Mapped_Promo_Code: string (nullable = true)\n",
91 | " |-- Maped_Product_Name: string (nullable = true)\n",
92 | " |-- Mapped_Search_Term: string (nullable = true)\n",
93 | " |-- Mapped_Product_Collection: string (nullable = true)\n",
94 | "\n"
95 | ]
96 | }
97 | ],
98 | "source": [
99 | "data.printSchema()"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "# Rename Target Column"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": 5,
112 | "metadata": {
113 | "collapsed": true
114 | },
115 | "outputs": [],
116 | "source": [
117 | "data = data.withColumnRenamed('Orders_Normalized', 'label')"
118 | ]
119 | },
120 | {
121 | "cell_type": "code",
122 | "execution_count": 6,
123 | "metadata": {},
124 | "outputs": [
125 | {
126 | "name": "stdout",
127 | "output_type": "stream",
128 | "text": [
129 | "root\n",
130 | " |-- Visit_Number_Bucket: string (nullable = true)\n",
131 | " |-- Page_Views_Normalized: double (nullable = true)\n",
132 | " |-- label: integer (nullable = true)\n",
133 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n",
134 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n",
135 | " |-- Email_Signup_Normalized: double (nullable = true)\n",
136 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n",
137 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n",
138 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n",
139 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n",
140 | " |-- Mapped_Browser_Type: string (nullable = true)\n",
141 | " |-- Mapped_Entry_Pages: string (nullable = true)\n",
142 | " |-- Mapped_Site_Section: string (nullable = true)\n",
143 | " |-- Mapped_Promo_Code: string (nullable = true)\n",
144 | " |-- Maped_Product_Name: string (nullable = true)\n",
145 | " |-- Mapped_Search_Term: string (nullable = true)\n",
146 | " |-- Mapped_Product_Collection: string (nullable = true)\n",
147 | "\n"
148 | ]
149 | }
150 | ],
151 | "source": [
152 | "data.printSchema()"
153 | ]
154 | },
155 | {
156 | "cell_type": "markdown",
157 | "metadata": {},
158 | "source": [
159 | "# Split the dataset into Train, Validation and Test"
160 | ]
161 | },
162 | {
163 | "cell_type": "code",
164 | "execution_count": 7,
165 | "metadata": {
166 | "collapsed": true
167 | },
168 | "outputs": [],
169 | "source": [
170 | "train, validation, test = data.randomSplit([0.7, 0.2, 0.1], 1234)"
171 | ]
172 | },
173 | {
174 | "cell_type": "markdown",
175 | "metadata": {},
176 | "source": [
177 | "# Build Pipeline"
178 | ]
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": 8,
183 | "metadata": {
184 | "collapsed": true
185 | },
186 | "outputs": [],
187 | "source": [
188 | "categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]\n",
189 | "numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('double')]"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 9,
195 | "metadata": {
196 | "collapsed": true
197 | },
198 | "outputs": [],
199 | "source": [
200 | "indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]\n"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 10,
206 | "metadata": {
207 | "collapsed": true
208 | },
209 | "outputs": [],
210 | "source": [
211 | "featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol=\"features\")\n"
212 | ]
213 | },
214 | {
215 | "cell_type": "code",
216 | "execution_count": 11,
217 | "metadata": {
218 | "collapsed": true
219 | },
220 | "outputs": [],
221 | "source": [
222 | "layers = [len(featuresCreator.getInputCols()), 4, 2, 2]\n",
223 | "\n",
224 | "classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)\n"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 12,
230 | "metadata": {
231 | "collapsed": true
232 | },
233 | "outputs": [],
234 | "source": [
235 | "pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])"
236 | ]
237 | },
238 | {
239 | "cell_type": "markdown",
240 | "metadata": {},
241 | "source": [
242 | "# Fit Pipeline"
243 | ]
244 | },
245 | {
246 | "cell_type": "code",
247 | "execution_count": 13,
248 | "metadata": {
249 | "collapsed": true
250 | },
251 | "outputs": [],
252 | "source": [
253 | "model = pipeline.fit(train)"
254 | ]
255 | },
256 | {
257 | "cell_type": "markdown",
258 | "metadata": {},
259 | "source": [
260 | "# Get Pipeline Output"
261 | ]
262 | },
263 | {
264 | "cell_type": "code",
265 | "execution_count": 14,
266 | "metadata": {
267 | "collapsed": true
268 | },
269 | "outputs": [],
270 | "source": [
271 | "train_output_df = model.transform(train)\n",
272 | "validation_output_df = model.transform(validation)\n",
273 | "test_output_df = model.transform(test)"
274 | ]
275 | },
276 | {
277 | "cell_type": "markdown",
278 | "metadata": {},
279 | "source": [
280 | "# Evaluate the Predictions"
281 | ]
282 | },
283 | {
284 | "cell_type": "code",
285 | "execution_count": 15,
286 | "metadata": {
287 | "scrolled": true
288 | },
289 | "outputs": [
290 | {
291 | "name": "stdout",
292 | "output_type": "stream",
293 | "text": [
294 | "Train weightedPrecision = 0.976101874447846\n",
295 | "Validation weightedPrecision = 0.9765821626938243\n",
296 | "Test weightedPrecision = 0.9747324280445043\n",
297 | "Train weightedRecall = 0.9755751041220662\n",
298 | "Validation weightedRecall = 0.9761613691931541\n",
299 | "Test weightedRecall = 0.9742582305920606\n",
300 | "Train accuracy = 0.975575104122066\n",
301 | "Validation accuracy = 0.976161369193154\n",
302 | "Test accuracy = 0.9742582305920607\n"
303 | ]
304 | }
305 | ],
306 | "source": [
307 | "train_predictionAndLabels = train_output_df.select(\"prediction\", \"label\")\n",
308 | "validation_predictionAndLabels = validation_output_df.select(\"prediction\", \"label\")\n",
309 | "test_predictionAndLabels = test_output_df.select(\"prediction\", \"label\")\n",
310 | "\n",
311 | "metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']\n",
312 | "\n",
313 | "for metric in metrics:\n",
314 | " evaluator = MulticlassClassificationEvaluator(metricName=metric)\n",
315 | " print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))\n",
316 | " print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))\n",
317 | " print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))"
318 | ]
319 | }
320 | ],
321 | "metadata": {
322 | "kernelspec": {
323 | "display_name": "Python 3",
324 | "language": "python",
325 | "name": "python3"
326 | },
327 | "language_info": {
328 | "codemirror_mode": {
329 | "name": "ipython",
330 | "version": 3
331 | },
332 | "file_extension": ".py",
333 | "mimetype": "text/x-python",
334 | "name": "python",
335 | "nbconvert_exporter": "python",
336 | "pygments_lexer": "ipython3",
337 | "version": "3.7.0"
338 | }
339 | },
340 | "nbformat": 4,
341 | "nbformat_minor": 2
342 | }
343 |
--------------------------------------------------------------------------------
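In both chap_8 notebooks, the first entry of `layers` must equal the width of the assembled feature vector and the last entry the number of label classes; `len(featuresCreator.getInputCols())` works here only because each input column (an indexed categorical or a raw double) contributes exactly one element to the vector. A minimal sketch of deriving the sizes instead of hard-coding them, assuming the notebook's `featuresCreator` and `train` variables are in scope:

    # Derive the input/output layer widths rather than hard-coding them
    # (assumes the notebook's featuresCreator and train are in scope).
    n_features = len(featuresCreator.getInputCols())       # input layer
    n_classes = train.select('label').distinct().count()   # output layer

    # Same two hidden layers (4 and 2) as used in the notebooks.
    layers = [n_features, 4, 2, n_classes]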
/chap_5/Classification_using_MLlib.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "collapsed": true
8 | },
9 | "outputs": [],
10 | "source": [
11 | "#import SparkSession\n",
12 | "from pyspark.sql import SparkSession\n",
13 | "spark=SparkSession.builder.appName('binary_class').getOrCreate()"
14 | ]
15 | },
16 | {
17 | "cell_type": "code",
18 | "execution_count": 2,
19 | "metadata": {
20 | "collapsed": true
21 | },
22 | "outputs": [],
23 | "source": [
24 | "#read the dataset\n",
25 | "df=spark.read.csv('classification_data.csv',inferSchema=True,header=True)"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "collapsed": true
33 | },
34 | "outputs": [],
35 | "source": [
36 | "#check the shape of the data \n",
37 | "print((df.count(),len(df.columns)))"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": null,
43 | "metadata": {
44 | "collapsed": true
45 | },
46 | "outputs": [],
47 | "source": [
48 | "#printSchema\n",
49 | "df.printSchema()"
50 | ]
51 | },
52 | {
53 | "cell_type": "code",
54 | "execution_count": null,
55 | "metadata": {
56 | "collapsed": true
57 | },
58 | "outputs": [],
59 | "source": [
60 | "#number of columns in dataset\n",
61 | "df.columns"
62 | ]
63 | },
64 | {
65 | "cell_type": "code",
66 | "execution_count": null,
67 | "metadata": {
68 | "collapsed": true
69 | },
70 | "outputs": [],
71 | "source": [
72 | "#view the dataset\n",
73 | "df.show(5)"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": null,
79 | "metadata": {
80 | "collapsed": true
81 | },
82 | "outputs": [],
83 | "source": [
84 | "#Exploratory Data Analysis\n",
85 | "df.describe().show()\n"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "collapsed": true
93 | },
94 | "outputs": [],
95 | "source": [
96 | "df.groupBy('label').count().show()"
97 | ]
98 | },
99 | {
100 | "cell_type": "code",
101 | "execution_count": null,
102 | "metadata": {
103 | "collapsed": true
104 | },
105 | "outputs": [],
106 | "source": [
107 | "df.groupBy('loan_purpose').count().show()"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": null,
113 | "metadata": {
114 | "collapsed": true
115 | },
116 | "outputs": [],
117 | "source": [
118 | "#converting categorical data to numerical form"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": 3,
124 | "metadata": {
125 | "collapsed": true
126 | },
127 | "outputs": [],
128 | "source": [
129 | "#import required libraries\n",
130 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n",
131 | "\n"
132 | ]
133 | },
134 | {
135 | "cell_type": "code",
136 | "execution_count": 4,
137 | "metadata": {
138 | "collapsed": true
139 | },
140 | "outputs": [],
141 | "source": [
142 | "loan_purpose_indexer = StringIndexer(inputCol=\"loan_purpose\", outputCol=\"loan_index\").fit(df)\n",
143 | "df = loan_purpose_indexer.transform(df)\n",
144 | "loan_encoder = OneHotEncoder(inputCol=\"loan_index\", outputCol=\"loan_purpose_vec\")\n",
145 | "df = loan_encoder.transform(df)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {
152 | "collapsed": true
153 | },
154 | "outputs": [],
155 | "source": [
156 | "df.select(['loan_purpose','loan_index','loan_purpose_vec']).show(3,False)"
157 | ]
158 | },
159 | {
160 | "cell_type": "code",
161 | "execution_count": 5,
162 | "metadata": {
163 | "collapsed": true
164 | },
165 | "outputs": [],
166 | "source": [
167 | "from pyspark.ml.feature import VectorAssembler"
168 | ]
169 | },
170 | {
171 | "cell_type": "code",
172 | "execution_count": null,
173 | "metadata": {
174 | "collapsed": true
175 | },
176 | "outputs": [],
177 | "source": [
178 | "df.columns"
179 | ]
180 | },
181 | {
182 | "cell_type": "code",
183 | "execution_count": 6,
184 | "metadata": {
185 | "collapsed": true
186 | },
187 | "outputs": [],
188 | "source": [
189 | "df_assembler = VectorAssembler(inputCols=['is_first_loan',\n",
190 | " 'total_credit_card_limit',\n",
191 | " 'avg_percentage_credit_card_limit_used_last_year',\n",
192 | " 'saving_amount',\n",
193 | " 'checking_amount',\n",
194 | " 'is_employed',\n",
195 | " 'yearly_salary',\n",
196 | " 'age',\n",
197 | " 'dependent_number',\n",
198 | " 'loan_purpose_vec'], outputCol=\"features\")\n",
199 | "df = df_assembler.transform(df)"
200 | ]
201 | },
202 | {
203 | "cell_type": "code",
204 | "execution_count": null,
205 | "metadata": {
206 | "collapsed": true
207 | },
208 | "outputs": [],
209 | "source": [
210 | "df.printSchema()"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": null,
216 | "metadata": {
217 | "collapsed": true
218 | },
219 | "outputs": [],
220 | "source": [
221 | "df.select(['features','label']).show(10,False)"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 7,
227 | "metadata": {
228 | "collapsed": true
229 | },
230 | "outputs": [],
231 | "source": [
232 | "#select data for building model\n",
233 | "model_df=df.select(['features','label'])"
234 | ]
235 | },
236 | {
237 | "cell_type": "code",
238 | "execution_count": null,
239 | "metadata": {
240 | "collapsed": true
241 | },
242 | "outputs": [],
243 | "source": [
244 | "from pyspark.ml.classification import LogisticRegression"
245 | ]
246 | },
247 | {
248 | "cell_type": "code",
249 | "execution_count": 8,
250 | "metadata": {
251 | "collapsed": true
252 | },
253 | "outputs": [],
254 | "source": [
255 | "#split the data \n",
256 | "training_df,test_df=model_df.randomSplit([0.75,0.25])"
257 | ]
258 | },
259 | {
260 | "cell_type": "code",
261 | "execution_count": null,
262 | "metadata": {
263 | "collapsed": true
264 | },
265 | "outputs": [],
266 | "source": [
267 | "training_df.count()"
268 | ]
269 | },
270 | {
271 | "cell_type": "code",
272 | "execution_count": null,
273 | "metadata": {
274 | "collapsed": true
275 | },
276 | "outputs": [],
277 | "source": [
278 | "training_df.groupBy('label').count().show()"
279 | ]
280 | },
281 | {
282 | "cell_type": "code",
283 | "execution_count": null,
284 | "metadata": {
285 | "collapsed": true
286 | },
287 | "outputs": [],
288 | "source": [
289 | "test_df.count()"
290 | ]
291 | },
292 | {
293 | "cell_type": "code",
294 | "execution_count": null,
295 | "metadata": {
296 | "collapsed": true
297 | },
298 | "outputs": [],
299 | "source": [
300 | "test_df.groupBy('label').count().show()"
301 | ]
302 | },
303 | {
304 | "cell_type": "code",
305 | "execution_count": null,
306 | "metadata": {
307 | "collapsed": true
308 | },
309 | "outputs": [],
310 | "source": [
311 | "log_reg=LogisticRegression().fit(training_df)"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {
318 | "collapsed": true
319 | },
320 | "outputs": [],
321 | "source": [
322 | "#Training Results"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {
329 | "collapsed": true
330 | },
331 | "outputs": [],
332 | "source": [
333 | "lr_summary=log_reg.summary"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "lr_summary.accuracy"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {
351 | "collapsed": true
352 | },
353 | "outputs": [],
354 | "source": [
355 | "lr_summary.areaUnderROC"
356 | ]
357 | },
358 | {
359 | "cell_type": "code",
360 | "execution_count": null,
361 | "metadata": {
362 | "collapsed": true
363 | },
364 | "outputs": [],
365 | "source": [
366 | "print(lr_summary.precisionByLabel)"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": null,
372 | "metadata": {
373 | "collapsed": true
374 | },
375 | "outputs": [],
376 | "source": [
377 | "print(lr_summary.recallByLabel)"
378 | ]
379 | },
380 | {
381 | "cell_type": "code",
382 | "execution_count": null,
383 | "metadata": {
384 | "collapsed": true
385 | },
386 | "outputs": [],
387 | "source": [
388 | "predictions = log_reg.transform(test_df)\n",
389 | "predictions.show(10)\n"
390 | ]
391 | },
392 | {
393 | "cell_type": "code",
394 | "execution_count": null,
395 | "metadata": {
396 | "collapsed": true
397 | },
398 | "outputs": [],
399 | "source": [
400 | "model_predictions = log_reg.transform(test_df)\n"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": null,
406 | "metadata": {
407 | "collapsed": true
408 | },
409 | "outputs": [],
410 | "source": [
411 | "model_predictions = log_reg.evaluate(test_df)\n"
412 | ]
413 | },
414 | {
415 | "cell_type": "code",
416 | "execution_count": null,
417 | "metadata": {
418 | "collapsed": true
419 | },
420 | "outputs": [],
421 | "source": [
422 | "model_predictions.accuracy"
423 | ]
424 | },
425 | {
426 | "cell_type": "code",
427 | "execution_count": null,
428 | "metadata": {
429 | "collapsed": true
430 | },
431 | "outputs": [],
432 | "source": [
433 | "model_predictions.weightedPrecision"
434 | ]
435 | },
436 | {
437 | "cell_type": "code",
438 | "execution_count": null,
439 | "metadata": {
440 | "collapsed": true
441 | },
442 | "outputs": [],
443 | "source": [
444 | "model_predictions.recallByLabel"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": null,
450 | "metadata": {
451 | "collapsed": true
452 | },
453 | "outputs": [],
454 | "source": [
455 | "print(model_predictions.precisionByLabel)"
456 | ]
457 | },
458 | {
459 | "cell_type": "code",
460 | "execution_count": null,
461 | "metadata": {
462 | "collapsed": true
463 | },
464 | "outputs": [],
465 | "source": [
466 | "model_predictions.areaUnderROC"
467 | ]
468 | },
469 | {
470 | "cell_type": "code",
471 | "execution_count": 9,
472 | "metadata": {
473 | "collapsed": true
474 | },
475 | "outputs": [],
476 | "source": [
477 | "from pyspark.ml.classification import RandomForestClassifier\n",
478 | "rf = RandomForestClassifier()\n",
479 | "rf_model = rf.fit(training_df)\n"
480 | ]
481 | },
482 | {
483 | "cell_type": "code",
484 | "execution_count": 10,
485 | "metadata": {
486 | "collapsed": true
487 | },
488 | "outputs": [],
489 | "source": [
490 | "model_predictions = rf_model.transform(test_df)\n"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 11,
496 | "metadata": {
497 | "collapsed": true
498 | },
499 | "outputs": [],
500 | "source": [
501 | "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n",
502 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
503 | "\n",
504 | "evaluator = BinaryClassificationEvaluator()\n",
505 | "\n",
506 | "rf = RandomForestClassifier()\n",
507 | "paramGrid = (ParamGridBuilder()\n",
508 | " .addGrid(rf.maxDepth, [5,10,20,25,30])\n",
509 | " .addGrid(rf.maxBins, [20,30,40 ])\n",
510 | " .addGrid(rf.numTrees, [5, 20,50])\n",
511 | " .build())\n",
512 | "cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)\n",
513 | "cv_model = cv.fit(training_df)"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 12,
519 | "metadata": {
520 | "collapsed": true
521 | },
522 | "outputs": [],
523 | "source": [
524 | "best_rf_model = cv_model.bestModel"
525 | ]
526 | },
527 | {
528 | "cell_type": "code",
529 | "execution_count": 13,
530 | "metadata": {
531 | "collapsed": true
532 | },
533 | "outputs": [],
534 | "source": [
535 | "# Generate predictions for entire dataset\n",
536 | "model_predictions = best_rf_model.transform(test_df)"
537 | ]
538 | },
539 | {
540 | "cell_type": "code",
541 | "execution_count": 14,
542 | "metadata": {
543 | "collapsed": true
544 | },
545 | "outputs": [],
546 | "source": [
547 | "true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()\n",
548 | "actual_pos=model_predictions.filter(model_predictions['label']==1).count()\n",
549 | "pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()"
550 | ]
551 | },
552 | {
553 | "cell_type": "code",
554 | "execution_count": 15,
555 | "metadata": {},
556 | "outputs": [
557 | {
558 | "data": {
559 | "text/plain": [
560 | "0.912426614481409"
561 | ]
562 | },
563 | "execution_count": 15,
564 | "metadata": {},
565 | "output_type": "execute_result"
566 | }
567 | ],
568 | "source": [
569 | "#Recall \n",
570 | "float(true_pos)/(actual_pos)"
571 | ]
572 | },
573 | {
574 | "cell_type": "code",
575 | "execution_count": 16,
576 | "metadata": {},
577 | "outputs": [
578 | {
579 | "data": {
580 | "text/plain": [
581 | "0.8562901744719926"
582 | ]
583 | },
584 | "execution_count": 16,
585 | "metadata": {},
586 | "output_type": "execute_result"
587 | }
588 | ],
589 | "source": [
590 | "#Precision on test Data \n",
591 | "float(true_pos)/(pred_pos)"
592 | ]
593 | },
594 | {
595 | "cell_type": "code",
596 | "execution_count": null,
597 | "metadata": {
598 | "collapsed": true
599 | },
600 | "outputs": [],
601 | "source": []
602 | }
603 | ],
604 | "metadata": {
605 | "kernelspec": {
606 | "display_name": "Python 3",
607 | "language": "python",
608 | "name": "python3"
609 | },
610 | "language_info": {
611 | "codemirror_mode": {
612 | "name": "ipython",
613 | "version": 3
614 | },
615 | "file_extension": ".py",
616 | "mimetype": "text/x-python",
617 | "name": "python",
618 | "nbconvert_exporter": "python",
619 | "pygments_lexer": "ipython3",
620 | "version": "3.7.0"
621 | }
622 | },
623 | "nbformat": 4,
624 | "nbformat_minor": 2
625 | }
626 |
--------------------------------------------------------------------------------
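The notebook above derives recall and precision by hand from filtered counts. A minimal sketch of the evaluator-based equivalent for the tuned random forest, assuming the notebook's `model_predictions` DataFrame (the transformed test set, which carries the default `rawPrediction` and `label` columns):

    # Evaluator-based test AUC for the tuned random forest
    # (assumes the notebook's model_predictions DataFrame).
    from pyspark.ml.evaluation import BinaryClassificationEvaluator

    auc = BinaryClassificationEvaluator(metricName='areaUnderROC').evaluate(model_predictions)
    print('Test AUC = {0:.4f}'.format(auc))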
/chap_3/.ipynb_checkpoints/Spark Structured Streaming app-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#import SparkSession\n",
10 | "from pyspark.sql import SparkSession\n",
11 | "spark=SparkSession.builder.appName('structured_streaming').getOrCreate()"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import pyspark.sql.functions as F\n",
21 | "from pyspark.sql.types import *"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 3,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#create sample dataset\n",
31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n",
32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "#define schema for input data\n",
42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n",
43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"csv_folder\")"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 5,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "root\n",
56 | " |-- user_id: string (nullable = true)\n",
57 | " |-- app: string (nullable = true)\n",
58 | " |-- time_in_secs: integer (nullable = true)\n",
59 | " |-- age: integer (nullable = true)\n",
60 | "\n"
61 | ]
62 | }
63 | ],
64 | "source": [
65 | "data.printSchema()"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 6,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "app_count=data.groupBy('app').count()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 7,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 10,
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "data": {
93 | "text/html": [
94 | "\n",
95 | "
\n",
96 | " \n",
97 | " \n",
98 | " | \n",
99 | " app | \n",
100 | " count | \n",
101 | "
\n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " | 0 | \n",
106 | " Insta | \n",
107 | " 1 | \n",
108 | "
\n",
109 | " \n",
110 | " | 1 | \n",
111 | " FB | \n",
112 | " 1 | \n",
113 | "
\n",
114 | " \n",
115 | " | 2 | \n",
116 | " Twitter | \n",
117 | " 1 | \n",
118 | "
\n",
119 | " \n",
120 | "
\n",
121 | "
"
122 | ],
123 | "text/plain": [
124 | " app count\n",
125 | "0 Insta 1\n",
126 | "1 FB 1\n",
127 | "2 Twitter 1"
128 | ]
129 | },
130 | "execution_count": 10,
131 | "metadata": {},
132 | "output_type": "execute_result"
133 | }
134 | ],
135 | "source": [
136 | "spark.sql(\"select * from count_query \").toPandas().head(5)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 9,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "fb_data=data.filter(data['app']=='FB')"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 10,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 11,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 12,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/html": [
174 | "\n",
175 | "
\n",
176 | " \n",
177 | " \n",
178 | " | \n",
179 | " user_id | \n",
180 | " avg(time_in_secs) | \n",
181 | "
\n",
182 | " \n",
183 | " \n",
184 | " \n",
185 | "
\n",
186 | "
"
187 | ],
188 | "text/plain": [
189 | "Empty DataFrame\n",
190 | "Columns: [user_id, avg(time_in_secs)]\n",
191 | "Index: []"
192 | ]
193 | },
194 | "execution_count": 12,
195 | "metadata": {},
196 | "output_type": "execute_result"
197 | }
198 | ],
199 | "source": [
200 | "spark.sql(\"select * from fb_query \").toPandas().head(5)"
201 | ]
202 | },
203 | {
204 | "cell_type": "code",
205 | "execution_count": 5,
206 | "metadata": {},
207 | "outputs": [],
208 | "source": [
209 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n",
210 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
211 | ]
212 | },
213 | {
214 | "cell_type": "code",
215 | "execution_count": 14,
216 | "metadata": {},
217 | "outputs": [
218 | {
219 | "data": {
220 | "text/html": [
221 | "\n",
222 | "
\n",
223 | " \n",
224 | " \n",
225 | " | \n",
226 | " user_id | \n",
227 | " avg(time_in_secs) | \n",
228 | "
\n",
229 | " \n",
230 | " \n",
231 | " \n",
232 | " | 0 | \n",
233 | " XN203 | \n",
234 | " 300.0 | \n",
235 | "
\n",
236 | " \n",
237 | "
\n",
238 | "
"
239 | ],
240 | "text/plain": [
241 | " user_id avg(time_in_secs)\n",
242 | "0 XN203 300.0"
243 | ]
244 | },
245 | "execution_count": 14,
246 | "metadata": {},
247 | "output_type": "execute_result"
248 | }
249 | ],
250 | "source": [
251 | "spark.sql(\"select * from fb_query \").toPandas().head(5)"
252 | ]
253 | },
254 | {
255 | "cell_type": "code",
256 | "execution_count": 12,
257 | "metadata": {},
258 | "outputs": [],
259 | "source": [
260 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n",
261 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 16,
267 | "metadata": {},
268 | "outputs": [
269 | {
270 | "data": {
271 | "text/html": [
272 | "\n",
273 | "
\n",
274 | " \n",
275 | " \n",
276 | " | \n",
277 | " user_id | \n",
278 | " avg(time_in_secs) | \n",
279 | "
\n",
280 | " \n",
281 | " \n",
282 | " \n",
283 | " | 0 | \n",
284 | " XN203 | \n",
285 | " 300.0 | \n",
286 | "
\n",
287 | " \n",
288 | "
\n",
289 | "
"
290 | ],
291 | "text/plain": [
292 | " user_id avg(time_in_secs)\n",
293 | "0 XN203 300.0"
294 | ]
295 | },
296 | "execution_count": 16,
297 | "metadata": {},
298 | "output_type": "execute_result"
299 | }
300 | ],
301 | "source": [
302 | "spark.sql(\"select * from fb_query \").toPandas().head(5)"
303 | ]
304 | },
305 | {
306 | "cell_type": "code",
307 | "execution_count": 13,
308 | "metadata": {},
309 | "outputs": [],
310 | "source": [
311 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n",
312 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
313 | ]
314 | },
315 | {
316 | "cell_type": "code",
317 | "execution_count": 18,
318 | "metadata": {},
319 | "outputs": [],
320 | "source": [
321 | "#app wise time spent\n",
322 | "\n",
323 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)"
324 | ]
325 | },
326 | {
327 | "cell_type": "code",
328 | "execution_count": 19,
329 | "metadata": {},
330 | "outputs": [],
331 | "source": [
332 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())"
333 | ]
334 | },
335 | {
336 | "cell_type": "code",
337 | "execution_count": 27,
338 | "metadata": {},
339 | "outputs": [
340 | {
341 | "data": {
342 | "text/html": [
343 | "\n",
344 | "
\n",
345 | " \n",
346 | " \n",
347 | " | \n",
348 | " app | \n",
349 | " total_time | \n",
350 | "
\n",
351 | " \n",
352 | " \n",
353 | " \n",
354 | " | 0 | \n",
355 | " FB | \n",
356 | " 3410 | \n",
357 | "
\n",
358 | " \n",
359 | " | 1 | \n",
360 | " Insta | \n",
361 | " 560 | \n",
362 | "
\n",
363 | " \n",
364 | " | 2 | \n",
365 | " Twitter | \n",
366 | " 210 | \n",
367 | "
\n",
368 | " \n",
369 | "
\n",
370 | "
"
371 | ],
372 | "text/plain": [
373 | " app total_time\n",
374 | "0 FB 3410\n",
375 | "1 Insta 560\n",
376 | "2 Twitter 210"
377 | ]
378 | },
379 | "execution_count": 27,
380 | "metadata": {},
381 | "output_type": "execute_result"
382 | }
383 | ],
384 | "source": [
385 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": 11,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n",
395 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
396 | ]
397 | },
398 | {
399 | "cell_type": "code",
400 | "execution_count": 26,
401 | "metadata": {},
402 | "outputs": [
403 | {
404 | "data": {
405 | "text/html": [
406 | "\n",
407 | "
\n",
408 | " \n",
409 | " \n",
410 | " | \n",
411 | " app | \n",
412 | " total_time | \n",
413 | "
\n",
414 | " \n",
415 | " \n",
416 | " \n",
417 | " | 0 | \n",
418 | " FB | \n",
419 | " 3410 | \n",
420 | "
\n",
421 | " \n",
422 | " | 1 | \n",
423 | " Insta | \n",
424 | " 560 | \n",
425 | "
\n",
426 | " \n",
427 | " | 2 | \n",
428 | " Twitter | \n",
429 | " 210 | \n",
430 | "
\n",
431 | " \n",
432 | "
\n",
433 | "
"
434 | ],
435 | "text/plain": [
436 | " app total_time\n",
437 | "0 FB 3410\n",
438 | "1 Insta 560\n",
439 | "2 Twitter 210"
440 | ]
441 | },
442 | "execution_count": 26,
443 | "metadata": {},
444 | "output_type": "execute_result"
445 | }
446 | ],
447 | "source": [
448 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)"
449 | ]
450 | },
451 | {
452 | "cell_type": "code",
453 | "execution_count": 28,
454 | "metadata": {},
455 | "outputs": [],
456 | "source": [
457 | "# app wise mean age \n",
458 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": null,
464 | "metadata": {
465 | "scrolled": true
466 | },
467 | "outputs": [],
468 | "source": [
469 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())"
470 | ]
471 | },
472 | {
473 | "cell_type": "code",
474 | "execution_count": 30,
475 | "metadata": {},
476 | "outputs": [
477 | {
478 | "data": {
479 | "text/html": [
480 | "\n",
481 | "
\n",
482 | " \n",
483 | " \n",
484 | " | \n",
485 | " app | \n",
486 | " mean_age | \n",
487 | "
\n",
488 | " \n",
489 | " \n",
490 | " \n",
491 | " | 0 | \n",
492 | " Twitter | \n",
493 | " 38.500000 | \n",
494 | "
\n",
495 | " \n",
496 | " | 1 | \n",
497 | " FB | \n",
498 | " 30.571429 | \n",
499 | "
\n",
500 | " \n",
501 | " | 2 | \n",
502 | " Insta | \n",
503 | " 25.500000 | \n",
504 | "
\n",
505 | " \n",
506 | "
\n",
507 | "
"
508 | ],
509 | "text/plain": [
510 | " app mean_age\n",
511 | "0 Twitter 38.500000\n",
512 | "1 FB 30.571429\n",
513 | "2 Insta 25.500000"
514 | ]
515 | },
516 | "execution_count": 30,
517 | "metadata": {},
518 | "output_type": "execute_result"
519 | }
520 | ],
521 | "source": [
522 | "spark.sql(\"select * from age_query \").toPandas().head(5)"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": 15,
528 | "metadata": {},
529 | "outputs": [],
530 | "source": [
531 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n",
532 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
533 | ]
534 | },
535 | {
536 | "cell_type": "code",
537 | "execution_count": 32,
538 | "metadata": {},
539 | "outputs": [
540 | {
541 | "data": {
542 | "text/html": [
543 | "\n",
544 | "
\n",
545 | " \n",
546 | " \n",
547 | " | \n",
548 | " app | \n",
549 | " mean_age | \n",
550 | "
\n",
551 | " \n",
552 | " \n",
553 | " \n",
554 | " | 0 | \n",
555 | " Twitter | \n",
556 | " 38.500000 | \n",
557 | "
\n",
558 | " \n",
559 | " | 1 | \n",
560 | " FB | \n",
561 | " 30.571429 | \n",
562 | "
\n",
563 | " \n",
564 | " | 2 | \n",
565 | " Insta | \n",
566 | " 25.500000 | \n",
567 | "
\n",
568 | " \n",
569 | "
\n",
570 | "
"
571 | ],
572 | "text/plain": [
573 | " app mean_age\n",
574 | "0 Twitter 38.500000\n",
575 | "1 FB 30.571429\n",
576 | "2 Insta 25.500000"
577 | ]
578 | },
579 | "execution_count": 32,
580 | "metadata": {},
581 | "output_type": "execute_result"
582 | }
583 | ],
584 | "source": [
585 | "spark.sql(\"select * from age_query \").toPandas().head(5)"
586 | ]
587 | },
588 | {
589 | "cell_type": "code",
590 | "execution_count": 6,
591 | "metadata": {},
592 | "outputs": [
593 | {
594 | "name": "stdout",
595 | "output_type": "stream",
596 | "text": [
597 | "+-------+---------+\n",
598 | "| app|full_name|\n",
599 | "+-------+---------+\n",
600 | "| FB| FACEBOOK|\n",
601 | "| Insta|INSTAGRAM|\n",
602 | "|Twitter| TWITTER|\n",
603 | "+-------+---------+\n",
604 | "\n"
605 | ]
606 | }
607 | ],
608 | "source": [
609 | "# Join static dataframe with streaming dataframe\n",
610 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n",
611 | "app_df.show()"
612 | ]
613 | },
614 | {
615 | "cell_type": "code",
616 | "execution_count": 7,
617 | "metadata": {},
618 | "outputs": [],
619 | "source": [
620 | "app_stream_df=data.join(app_df,'app')"
621 | ]
622 | },
623 | {
624 | "cell_type": "code",
625 | "execution_count": 8,
626 | "metadata": {},
627 | "outputs": [],
628 | "source": [
629 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())"
630 | ]
631 | },
632 | {
633 | "cell_type": "code",
634 | "execution_count": 14,
635 | "metadata": {},
636 | "outputs": [
637 | {
638 | "data": {
639 | "text/html": [
640 | "\n",
641 | "
\n",
642 | " \n",
643 | " \n",
644 | " | \n",
645 | " app | \n",
646 | " user_id | \n",
647 | " time_in_secs | \n",
648 | " age | \n",
649 | " full_name | \n",
650 | "
\n",
651 | " \n",
652 | " \n",
653 | " \n",
654 | " | 0 | \n",
655 | " FB | \n",
656 | " XN201 | \n",
657 | " 10 | \n",
658 | " 19 | \n",
659 | " FACEBOOK | \n",
660 | "
\n",
661 | " \n",
662 | " | 1 | \n",
663 | " FB | \n",
664 | " XN203 | \n",
665 | " 100 | \n",
666 | " 30 | \n",
667 | " FACEBOOK | \n",
668 | "
\n",
669 | " \n",
670 | " | 2 | \n",
671 | " FB | \n",
672 | " XN203 | \n",
673 | " 300 | \n",
674 | " 30 | \n",
675 | " FACEBOOK | \n",
676 | "
\n",
677 | " \n",
678 | " | 3 | \n",
679 | " FB | \n",
680 | " XN202 | \n",
681 | " 2000 | \n",
682 | " 45 | \n",
683 | " FACEBOOK | \n",
684 | "
\n",
685 | " \n",
686 | " | 4 | \n",
687 | " Insta | \n",
688 | " XN202 | \n",
689 | " 500 | \n",
690 | " 45 | \n",
691 | " INSTAGRAM | \n",
692 | "
\n",
693 | " \n",
694 | " | 5 | \n",
695 | " Twitter | \n",
696 | " XN201 | \n",
697 | " 10 | \n",
698 | " 19 | \n",
699 | " TWITTER | \n",
700 | "
\n",
701 | " \n",
702 | " | 6 | \n",
703 | " FB | \n",
704 | " XN203 | \n",
705 | " 500 | \n",
706 | " 30 | \n",
707 | " FACEBOOK | \n",
708 | "
\n",
709 | " \n",
710 | " | 7 | \n",
711 | " Insta | \n",
712 | " XN201 | \n",
713 | " 30 | \n",
714 | " 19 | \n",
715 | " INSTAGRAM | \n",
716 | "
\n",
717 | " \n",
718 | " | 8 | \n",
719 | " Twitter | \n",
720 | " XN202 | \n",
721 | " 100 | \n",
722 | " 45 | \n",
723 | " TWITTER | \n",
724 | "
\n",
725 | " \n",
726 | " | 9 | \n",
727 | " FB | \n",
728 | " XN203 | \n",
729 | " 500 | \n",
730 | " 30 | \n",
731 | " FACEBOOK | \n",
732 | "
\n",
733 | " \n",
734 | " | 10 | \n",
735 | " Insta | \n",
736 | " XN201 | \n",
737 | " 30 | \n",
738 | " 19 | \n",
739 | " INSTAGRAM | \n",
740 | "
\n",
741 | " \n",
742 | " | 11 | \n",
743 | " Twitter | \n",
744 | " XN202 | \n",
745 | " 100 | \n",
746 | " 45 | \n",
747 | " TWITTER | \n",
748 | "
\n",
749 | " \n",
750 | "
\n",
751 | "
"
752 | ],
753 | "text/plain": [
754 | " app user_id time_in_secs age full_name\n",
755 | "0 FB XN201 10 19 FACEBOOK\n",
756 | "1 FB XN203 100 30 FACEBOOK\n",
757 | "2 FB XN203 300 30 FACEBOOK\n",
758 | "3 FB XN202 2000 45 FACEBOOK\n",
759 | "4 Insta XN202 500 45 INSTAGRAM\n",
760 | "5 Twitter XN201 10 19 TWITTER\n",
761 | "6 FB XN203 500 30 FACEBOOK\n",
762 | "7 Insta XN201 30 19 INSTAGRAM\n",
763 | "8 Twitter XN202 100 45 TWITTER\n",
764 | "9 FB XN203 500 30 FACEBOOK\n",
765 | "10 Insta XN201 30 19 INSTAGRAM\n",
766 | "11 Twitter XN202 100 45 TWITTER"
767 | ]
768 | },
769 | "execution_count": 14,
770 | "metadata": {},
771 | "output_type": "execute_result"
772 | }
773 | ],
774 | "source": [
775 | "spark.sql(\"select * from join_query \").toPandas().head(50)"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": null,
781 | "metadata": {},
782 | "outputs": [],
783 | "source": []
784 | },
785 | {
786 | "cell_type": "code",
787 | "execution_count": null,
788 | "metadata": {},
789 | "outputs": [],
790 | "source": []
791 | }
792 | ],
793 | "metadata": {
794 | "kernelspec": {
795 | "display_name": "Python 3",
796 | "language": "python",
797 | "name": "python3"
798 | },
799 | "language_info": {
800 | "codemirror_mode": {
801 | "name": "ipython",
802 | "version": 3
803 | },
804 | "file_extension": ".py",
805 | "mimetype": "text/x-python",
806 | "name": "python",
807 | "nbconvert_exporter": "python",
808 | "pygments_lexer": "ipython3",
809 | "version": "3.6.3"
810 | }
811 | },
812 | "nbformat": 4,
813 | "nbformat_minor": 2
814 | }
815 |
--------------------------------------------------------------------------------
/chap_3/Spark Structured Streaming demo.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#import SparkSession\n",
10 | "from pyspark.sql import SparkSession\n",
11 | "spark=SparkSession.builder.appName('structured_streaming').getOrCreate()"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "import pyspark.sql.functions as F\n",
21 | "from pyspark.sql.types import *"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 3,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "#create sample dataset\n",
31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n",
32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')"
33 | ]
34 | },
35 | {
36 | "cell_type": "code",
37 | "execution_count": 4,
38 | "metadata": {},
39 | "outputs": [],
40 | "source": [
41 | "#define schema for input data\n",
42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n",
43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"demo\")"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 5,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "name": "stdout",
53 | "output_type": "stream",
54 | "text": [
55 | "root\n",
56 | " |-- user_id: string (nullable = true)\n",
57 | " |-- app: string (nullable = true)\n",
58 | " |-- time_in_secs: integer (nullable = true)\n",
59 | " |-- age: integer (nullable = true)\n",
60 | "\n"
61 | ]
62 | }
63 | ],
64 | "source": [
65 | "data.printSchema()"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 6,
71 | "metadata": {},
72 | "outputs": [],
73 | "source": [
74 | "app_count=data.groupBy('app').count()"
75 | ]
76 | },
77 | {
78 | "cell_type": "code",
79 | "execution_count": 7,
80 | "metadata": {},
81 | "outputs": [],
82 | "source": [
83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 8,
89 | "metadata": {},
90 | "outputs": [
91 | {
92 | "data": {
93 | "text/html": [
94 | "\n",
95 | "
\n",
96 | " \n",
97 | " \n",
98 | " | \n",
99 | " app | \n",
100 | " count | \n",
101 | "
\n",
102 | " \n",
103 | " \n",
104 | " \n",
105 | " | 0 | \n",
106 | " Insta | \n",
107 | " 1 | \n",
108 | "
\n",
109 | " \n",
110 | " | 1 | \n",
111 | " FB | \n",
112 | " 1 | \n",
113 | "
\n",
114 | " \n",
115 | " | 2 | \n",
116 | " Twitter | \n",
117 | " 1 | \n",
118 | "
\n",
119 | " \n",
120 | "
\n",
121 | "
"
122 | ],
123 | "text/plain": [
124 | " app count\n",
125 | "0 Insta 1\n",
126 | "1 FB 1\n",
127 | "2 Twitter 1"
128 | ]
129 | },
130 | "execution_count": 8,
131 | "metadata": {},
132 | "output_type": "execute_result"
133 | }
134 | ],
135 | "source": [
136 | "spark.sql(\"select * from count_query \").toPandas().head(5)"
137 | ]
138 | },
139 | {
140 | "cell_type": "code",
141 | "execution_count": 9,
142 | "metadata": {},
143 | "outputs": [],
144 | "source": [
145 | "fb_data=data.filter(data['app']=='FB')"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 10,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": 11,
160 | "metadata": {},
161 | "outputs": [],
162 | "source": [
163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())"
164 | ]
165 | },
166 | {
167 | "cell_type": "code",
168 | "execution_count": 13,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "data": {
173 | "text/html": [
174 | "\n",
175 | "
\n",
176 | " \n",
177 | " \n",
178 | " | \n",
179 | " user_id | \n",
180 | " avg(time_in_secs) | \n",
181 | "
\n",
182 | " \n",
183 | " \n",
184 | " \n",
185 | " | 0 | \n",
186 | " XN203 | \n",
187 | " 300.0 | \n",
188 | "
\n",
189 | " \n",
190 | "
\n",
191 | "
"
192 | ],
193 | "text/plain": [
194 | " user_id avg(time_in_secs)\n",
195 | "0 XN203 300.0"
196 | ]
197 | },
198 | "execution_count": 13,
199 | "metadata": {},
200 | "output_type": "execute_result"
201 | }
202 | ],
203 | "source": [
204 | "spark.sql(\"select * from fb_query \").toPandas().head(5)"
205 | ]
206 | },
207 | {
208 | "cell_type": "code",
209 | "execution_count": 21,
210 | "metadata": {},
211 | "outputs": [],
212 | "source": [
213 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n",
214 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 23,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/html": [
225 | "\n",
226 | "
\n",
227 | " \n",
228 | " \n",
229 | " | \n",
230 | " user_id | \n",
231 | " avg(time_in_secs) | \n",
232 | "
\n",
233 | " \n",
234 | " \n",
235 | " \n",
236 | " | 0 | \n",
237 | " XN203 | \n",
238 | " 200.0 | \n",
239 | "
\n",
240 | " \n",
241 | " | 1 | \n",
242 | " XN201 | \n",
243 | " 10.0 | \n",
244 | "
\n",
245 | " \n",
246 | "
\n",
247 | "
"
248 | ],
249 | "text/plain": [
250 | " user_id avg(time_in_secs)\n",
251 | "0 XN203 200.0\n",
252 | "1 XN201 10.0"
253 | ]
254 | },
255 | "execution_count": 23,
256 | "metadata": {},
257 | "output_type": "execute_result"
258 | }
259 | ],
260 | "source": [
261 | "spark.sql(\"select * from fb_query \").toPandas().head(5)"
262 | ]
263 | },
264 | {
265 | "cell_type": "code",
266 | "execution_count": 24,
267 | "metadata": {},
268 | "outputs": [],
269 | "source": [
270 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n",
271 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')"
272 | ]
273 | },
274 | {
275 | "cell_type": "code",
276 | "execution_count": 25,
277 | "metadata": {},
278 | "outputs": [
279 | {
280 | "data": {
281 | "text/html": [
282 | "\n",
283 | "
\n",
284 | " \n",
285 | " \n",
286 | " | \n",
287 | " user_id | \n",
288 | " avg(time_in_secs) | \n",
289 | "
\n",
290 | " \n",
291 | " \n",
292 | " \n",
293 | " | 0 | \n",
294 | " XN203 | \n",
295 | " 200.0 | \n",
296 | "
\n",
297 | " \n",
298 | " | 1 | \n",
299 | " XN201 | \n",
300 | " 10.0 | \n",
301 | "
\n",
302 | " \n",
303 | " | 2 | \n",
304 | " XN202 | \n",
305 | " 2000.0 | \n",
306 | "
\n",
307 | " \n",
308 | "
\n",
309 | "
"
310 | ],
311 | "text/plain": [
312 | " user_id avg(time_in_secs)\n",
313 | "0 XN203 200.0\n",
314 | "1 XN201 10.0\n",
315 | "2 XN202 2000.0"
316 | ]
317 | },
318 | "execution_count": 25,
319 | "metadata": {},
320 | "output_type": "execute_result"
321 | }
322 | ],
323 | "source": [
324 | "spark.sql(\"select * from fb_query \").toPandas().head(5)"
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 26,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n",
334 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 18,
340 | "metadata": {},
341 | "outputs": [],
342 | "source": [
343 | "#app wise time spent\n",
344 | "\n",
345 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)"
346 | ]
347 | },
348 | {
349 | "cell_type": "code",
350 | "execution_count": 19,
351 | "metadata": {},
352 | "outputs": [],
353 | "source": [
354 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())"
355 | ]
356 | },
357 | {
358 | "cell_type": "code",
359 | "execution_count": 27,
360 | "metadata": {},
361 | "outputs": [
362 | {
363 | "data": {
364 | "text/html": [
365 | "\n",
366 | "
\n",
367 | " \n",
368 | " \n",
369 | " | \n",
370 | " app | \n",
371 | " total_time | \n",
372 | "
\n",
373 | " \n",
374 | " \n",
375 | " \n",
376 | " | 0 | \n",
377 | " FB | \n",
378 | " 3410 | \n",
379 | "
\n",
380 | " \n",
381 | " | 1 | \n",
382 | " Insta | \n",
383 | " 560 | \n",
384 | "
\n",
385 | " \n",
386 | " | 2 | \n",
387 | " Twitter | \n",
388 | " 210 | \n",
389 | "
\n",
390 | " \n",
391 | "
\n",
392 | "
"
393 | ],
394 | "text/plain": [
395 | " app total_time\n",
396 | "0 FB 3410\n",
397 | "1 Insta 560\n",
398 | "2 Twitter 210"
399 | ]
400 | },
401 | "execution_count": 27,
402 | "metadata": {},
403 | "output_type": "execute_result"
404 | }
405 | ],
406 | "source": [
407 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)"
408 | ]
409 | },
410 | {
411 | "cell_type": "code",
412 | "execution_count": 11,
413 | "metadata": {},
414 | "outputs": [],
415 | "source": [
416 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n",
417 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 26,
423 | "metadata": {},
424 | "outputs": [
425 | {
426 | "data": {
427 | "text/html": [
428 | "\n",
429 | "
\n",
430 | " \n",
431 | " \n",
432 | " | \n",
433 | " app | \n",
434 | " total_time | \n",
435 | "
\n",
436 | " \n",
437 | " \n",
438 | " \n",
439 | " | 0 | \n",
440 | " FB | \n",
441 | " 3410 | \n",
442 | "
\n",
443 | " \n",
444 | " | 1 | \n",
445 | " Insta | \n",
446 | " 560 | \n",
447 | "
\n",
448 | " \n",
449 | " | 2 | \n",
450 | " Twitter | \n",
451 | " 210 | \n",
452 | "
\n",
453 | " \n",
454 | "
\n",
455 | "
"
456 | ],
457 | "text/plain": [
458 | " app total_time\n",
459 | "0 FB 3410\n",
460 | "1 Insta 560\n",
461 | "2 Twitter 210"
462 | ]
463 | },
464 | "execution_count": 26,
465 | "metadata": {},
466 | "output_type": "execute_result"
467 | }
468 | ],
469 | "source": [
470 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 28,
476 | "metadata": {},
477 | "outputs": [],
478 | "source": [
479 | "# app wise mean age \n",
480 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n"
481 | ]
482 | },
483 | {
484 | "cell_type": "code",
485 | "execution_count": null,
486 | "metadata": {
487 | "scrolled": true
488 | },
489 | "outputs": [],
490 | "source": [
491 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())"
492 | ]
493 | },
494 | {
495 | "cell_type": "code",
496 | "execution_count": 30,
497 | "metadata": {},
498 | "outputs": [
499 | {
500 | "data": {
501 | "text/html": [
502 | "\n",
503 | "
\n",
504 | " \n",
505 | " \n",
506 | " | \n",
507 | " app | \n",
508 | " mean_age | \n",
509 | "
\n",
510 | " \n",
511 | " \n",
512 | " \n",
513 | " | 0 | \n",
514 | " Twitter | \n",
515 | " 38.500000 | \n",
516 | "
\n",
517 | " \n",
518 | " | 1 | \n",
519 | " FB | \n",
520 | " 30.571429 | \n",
521 | "
\n",
522 | " \n",
523 | " | 2 | \n",
524 | " Insta | \n",
525 | " 25.500000 | \n",
526 | "
\n",
527 | " \n",
528 | "
\n",
529 | "
"
530 | ],
531 | "text/plain": [
532 | " app mean_age\n",
533 | "0 Twitter 38.500000\n",
534 | "1 FB 30.571429\n",
535 | "2 Insta 25.500000"
536 | ]
537 | },
538 | "execution_count": 30,
539 | "metadata": {},
540 | "output_type": "execute_result"
541 | }
542 | ],
543 | "source": [
544 | "spark.sql(\"select * from age_query \").toPandas().head(5)"
545 | ]
546 | },
547 | {
548 | "cell_type": "code",
549 | "execution_count": 15,
550 | "metadata": {},
551 | "outputs": [],
552 | "source": [
553 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n",
554 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')"
555 | ]
556 | },
557 | {
558 | "cell_type": "code",
559 | "execution_count": 32,
560 | "metadata": {},
561 | "outputs": [
562 | {
563 | "data": {
564 | "text/html": [
565 | "\n",
566 | "
\n",
567 | " \n",
568 | " \n",
569 | " | \n",
570 | " app | \n",
571 | " mean_age | \n",
572 | "
\n",
573 | " \n",
574 | " \n",
575 | " \n",
576 | " | 0 | \n",
577 | " Twitter | \n",
578 | " 38.500000 | \n",
579 | "
\n",
580 | " \n",
581 | " | 1 | \n",
582 | " FB | \n",
583 | " 30.571429 | \n",
584 | "
\n",
585 | " \n",
586 | " | 2 | \n",
587 | " Insta | \n",
588 | " 25.500000 | \n",
589 | "
\n",
590 | " \n",
591 | "
\n",
592 | "
"
593 | ],
594 | "text/plain": [
595 | " app mean_age\n",
596 | "0 Twitter 38.500000\n",
597 | "1 FB 30.571429\n",
598 | "2 Insta 25.500000"
599 | ]
600 | },
601 | "execution_count": 32,
602 | "metadata": {},
603 | "output_type": "execute_result"
604 | }
605 | ],
606 | "source": [
607 | "spark.sql(\"select * from age_query \").toPandas().head(5)"
608 | ]
609 | },
610 | {
611 | "cell_type": "code",
612 | "execution_count": 27,
613 | "metadata": {},
614 | "outputs": [
615 | {
616 | "name": "stdout",
617 | "output_type": "stream",
618 | "text": [
619 | "+-------+---------+\n",
620 | "| app|full_name|\n",
621 | "+-------+---------+\n",
622 | "| FB| FACEBOOK|\n",
623 | "| Insta|INSTAGRAM|\n",
624 | "|Twitter| TWITTER|\n",
625 | "+-------+---------+\n",
626 | "\n"
627 | ]
628 | }
629 | ],
630 | "source": [
631 | "# Join static dataframe with streaming dataframe\n",
632 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n",
633 | "app_df.show()"
634 | ]
635 | },
636 | {
637 | "cell_type": "code",
638 | "execution_count": 28,
639 | "metadata": {},
640 | "outputs": [],
641 | "source": [
642 | "app_stream_df=data.join(app_df,'app')"
643 | ]
644 | },
645 | {
646 | "cell_type": "code",
647 | "execution_count": 29,
648 | "metadata": {},
649 | "outputs": [],
650 | "source": [
651 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())"
652 | ]
653 | },
654 | {
655 | "cell_type": "code",
656 | "execution_count": 30,
657 | "metadata": {},
658 | "outputs": [
659 | {
660 | "data": {
661 | "text/html": [
662 | "\n",
663 | "
\n",
664 | " \n",
665 | " \n",
666 | " | \n",
667 | " app | \n",
668 | " user_id | \n",
669 | " time_in_secs | \n",
670 | " age | \n",
671 | " full_name | \n",
672 | "
\n",
673 | " \n",
674 | " \n",
675 | " \n",
676 | " | 0 | \n",
677 | " FB | \n",
678 | " XN201 | \n",
679 | " 10 | \n",
680 | " 19 | \n",
681 | " FACEBOOK | \n",
682 | "
\n",
683 | " \n",
684 | " | 1 | \n",
685 | " FB | \n",
686 | " XN203 | \n",
687 | " 500 | \n",
688 | " 30 | \n",
689 | " FACEBOOK | \n",
690 | "
\n",
691 | " \n",
692 | " | 2 | \n",
693 | " FB | \n",
694 | " XN203 | \n",
695 | " 500 | \n",
696 | " 30 | \n",
697 | " FACEBOOK | \n",
698 | "
\n",
699 | " \n",
700 | " | 3 | \n",
701 | " FB | \n",
702 | " XN203 | \n",
703 | " 100 | \n",
704 | " 30 | \n",
705 | " FACEBOOK | \n",
706 | "
\n",
707 | " \n",
708 | " | 4 | \n",
709 | " FB | \n",
710 | " XN203 | \n",
711 | " 300 | \n",
712 | " 30 | \n",
713 | " FACEBOOK | \n",
714 | "
\n",
715 | " \n",
716 | " | 5 | \n",
717 | " FB | \n",
718 | " XN202 | \n",
719 | " 2000 | \n",
720 | " 45 | \n",
721 | " FACEBOOK | \n",
722 | "
\n",
723 | " \n",
724 | " | 6 | \n",
725 | " Insta | \n",
726 | " XN201 | \n",
727 | " 30 | \n",
728 | " 19 | \n",
729 | " INSTAGRAM | \n",
730 | "
\n",
731 | " \n",
732 | " | 7 | \n",
733 | " Insta | \n",
734 | " XN201 | \n",
735 | " 30 | \n",
736 | " 19 | \n",
737 | " INSTAGRAM | \n",
738 | "
\n",
739 | " \n",
740 | " | 8 | \n",
741 | " Insta | \n",
742 | " XN202 | \n",
743 | " 500 | \n",
744 | " 45 | \n",
745 | " INSTAGRAM | \n",
746 | "
\n",
747 | " \n",
748 | " | 9 | \n",
749 | " Twitter | \n",
750 | " XN201 | \n",
751 | " 10 | \n",
752 | " 19 | \n",
753 | " TWITTER | \n",
754 | "
\n",
755 | " \n",
756 | " | 10 | \n",
757 | " Twitter | \n",
758 | " XN202 | \n",
759 | " 100 | \n",
760 | " 45 | \n",
761 | " TWITTER | \n",
762 | "
\n",
763 | " \n",
764 | " | 11 | \n",
765 | " Twitter | \n",
766 | " XN202 | \n",
767 | " 100 | \n",
768 | " 45 | \n",
769 | " TWITTER | \n",
770 | "
\n",
771 | " \n",
772 | "
\n",
773 | "
"
774 | ],
775 | "text/plain": [
776 | " app user_id time_in_secs age full_name\n",
777 | "0 FB XN201 10 19 FACEBOOK\n",
778 | "1 FB XN203 500 30 FACEBOOK\n",
779 | "2 FB XN203 500 30 FACEBOOK\n",
780 | "3 FB XN203 100 30 FACEBOOK\n",
781 | "4 FB XN203 300 30 FACEBOOK\n",
782 | "5 FB XN202 2000 45 FACEBOOK\n",
783 | "6 Insta XN201 30 19 INSTAGRAM\n",
784 | "7 Insta XN201 30 19 INSTAGRAM\n",
785 | "8 Insta XN202 500 45 INSTAGRAM\n",
786 | "9 Twitter XN201 10 19 TWITTER\n",
787 | "10 Twitter XN202 100 45 TWITTER\n",
788 | "11 Twitter XN202 100 45 TWITTER"
789 | ]
790 | },
791 | "execution_count": 30,
792 | "metadata": {},
793 | "output_type": "execute_result"
794 | }
795 | ],
796 | "source": [
797 | "spark.sql(\"select * from join_query \").toPandas().head(50)"
798 | ]
799 | },
800 | {
801 | "cell_type": "code",
802 | "execution_count": null,
803 | "metadata": {},
804 | "outputs": [],
805 | "source": []
806 | },
807 | {
808 | "cell_type": "code",
809 | "execution_count": null,
810 | "metadata": {},
811 | "outputs": [],
812 | "source": []
813 | }
814 | ],
815 | "metadata": {
816 | "kernelspec": {
817 | "display_name": "Python 3",
818 | "language": "python",
819 | "name": "python3"
820 | },
821 | "language_info": {
822 | "codemirror_mode": {
823 | "name": "ipython",
824 | "version": 3
825 | },
826 | "file_extension": ".py",
827 | "mimetype": "text/x-python",
828 | "name": "python",
829 | "nbconvert_exporter": "python",
830 | "pygments_lexer": "ipython3",
831 | "version": "3.6.3"
832 | }
833 | },
834 | "nbformat": 4,
835 | "nbformat_minor": 2
836 | }
837 |
--------------------------------------------------------------------------------
/chap_5/.ipynb_checkpoints/Classification_using_MLlib-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#import SparkSession\n",
10 | "from pyspark.sql import SparkSession\n",
11 | "spark=SparkSession.builder.appName('binary_class').getOrCreate()"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 11,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "#read the dataset\n",
21 | "df=spark.read.csv('classification_data.csv',inferSchema=True,header=True)"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 3,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "from pyspark.sql.functions import *\n"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 12,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "(46751, 12)\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "#check the shape of the data \n",
48 | "print((df.count(),len(df.columns)))"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 13,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | "root\n",
61 | " |-- loan_id: string (nullable = true)\n",
62 | " |-- loan_purpose: string (nullable = true)\n",
63 | " |-- is_first_loan: integer (nullable = true)\n",
64 | " |-- total_credit_card_limit: integer (nullable = true)\n",
65 | " |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)\n",
66 | " |-- saving_amount: integer (nullable = true)\n",
67 | " |-- checking_amount: integer (nullable = true)\n",
68 | " |-- is_employed: integer (nullable = true)\n",
69 | " |-- yearly_salary: integer (nullable = true)\n",
70 | " |-- age: integer (nullable = true)\n",
71 | " |-- dependent_number: integer (nullable = true)\n",
72 | " |-- loan_defaulter: integer (nullable = true)\n",
73 | "\n"
74 | ]
75 | }
76 | ],
77 | "source": [
78 | "#printSchema\n",
79 | "df.printSchema()"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": 14,
85 | "metadata": {},
86 | "outputs": [
87 | {
88 | "data": {
89 | "text/plain": [
90 | "['loan_id',\n",
91 | " 'loan_purpose',\n",
92 | " 'is_first_loan',\n",
93 | " 'total_credit_card_limit',\n",
94 | " 'avg_percentage_credit_card_limit_used_last_year',\n",
95 | " 'saving_amount',\n",
96 | " 'checking_amount',\n",
97 | " 'is_employed',\n",
98 | " 'yearly_salary',\n",
99 | " 'age',\n",
100 | " 'dependent_number',\n",
101 | " 'loan_defaulter']"
102 | ]
103 | },
104 | "execution_count": 14,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "#number of columns in dataset\n",
111 | "df.columns"
112 | ]
113 | },
114 | {
115 | "cell_type": "code",
116 | "execution_count": 15,
117 | "metadata": {},
118 | "outputs": [
119 | {
120 | "name": "stdout",
121 | "output_type": "stream",
122 | "text": [
123 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n",
124 | "|loan_id|loan_purpose|is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|saving_amount|checking_amount|is_employed|yearly_salary|age|dependent_number|loan_defaulter|\n",
125 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n",
126 | "| A_1| personal| 1| 7900| 0.8| 1103| 6393| 1| 16400| 42| 4| 0|\n",
127 | "| A_2| personal| 0| 3300| 0.29| 2588| 832| 1| 75500| 56| 1| 0|\n",
128 | "| A_3| personal| 0| 7600| 0.9| 1651| 8868| 1| 59000| 46| 1| 0|\n",
129 | "| A_4| personal| 1| 3400| 0.38| 1269| 6863| 1| 26000| 55| 8| 0|\n",
130 | "| A_5| emergency| 0| 2600| 0.89| 1310| 3423| 1| 9700| 41| 4| 1|\n",
131 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n",
132 | "only showing top 5 rows\n",
133 | "\n"
134 | ]
135 | }
136 | ],
137 | "source": [
138 | "#view the dataset\n",
139 | "df.show(5)"
140 | ]
141 | },
142 | {
143 | "cell_type": "code",
144 | "execution_count": 16,
145 | "metadata": {},
146 | "outputs": [
147 | {
148 | "name": "stdout",
149 | "output_type": "stream",
150 | "text": [
151 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n",
152 | "|summary|loan_id|loan_purpose| is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year| saving_amount| checking_amount| is_employed| yearly_salary| age| dependent_number| loan_defaulter|\n",
153 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n",
154 | "| count| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751|\n",
155 | "| mean| null| null|0.5414429637868708| 4615.304485465552| 0.700091121045545| 2037.636585313683|3520.6714294881394|0.9173279715941905| 29527.62079955509| 41.53979594019379|3.7448396825736348|0.34653804196701676|\n",
156 | "| stddev| null| null|0.4982848498677868| 1890.194453628314| 0.1777288093267152|1498.6710906030362|2160.9332423713727|0.2753887911928983|16149.757703029438|12.817646350266434|2.6191527902107667|0.47587211651314887|\n",
157 | "| min| A_1| emergency| 0| 500| 0.0| 0| 0| 0| 0| 18| 0| 0|\n",
158 | "| max| A_9999| property| 1| 13500| 1.09| 10641| 13165| 1| 97200| 79| 8| 1|\n",
159 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n",
160 | "\n"
161 | ]
162 | }
163 | ],
164 | "source": [
165 | "#Exploratory Data Analysis\n",
166 | "df.describe().show()\n"
167 | ]
168 | },
169 | {
170 | "cell_type": "code",
171 | "execution_count": 17,
172 | "metadata": {},
173 | "outputs": [
174 | {
175 | "name": "stdout",
176 | "output_type": "stream",
177 | "text": [
178 | "+--------------+-----+\n",
179 | "|loan_defaulter|count|\n",
180 | "+--------------+-----+\n",
181 | "| 1|16201|\n",
182 | "| 0|30550|\n",
183 | "+--------------+-----+\n",
184 | "\n"
185 | ]
186 | }
187 | ],
188 | "source": [
189 | "df.groupBy('loan_defaulter').count().show()"
190 | ]
191 | },
192 | {
193 | "cell_type": "code",
194 | "execution_count": 18,
195 | "metadata": {},
196 | "outputs": [
197 | {
198 | "name": "stdout",
199 | "output_type": "stream",
200 | "text": [
201 | "+------------+-----+\n",
202 | "|loan_purpose|count|\n",
203 | "+------------+-----+\n",
204 | "| others| 6763|\n",
205 | "| emergency| 7562|\n",
206 | "| property|11388|\n",
207 | "| operations|10580|\n",
208 | "| personal|10458|\n",
209 | "+------------+-----+\n",
210 | "\n"
211 | ]
212 | }
213 | ],
214 | "source": [
215 | "df.groupBy('loan_purpose').count().show()"
216 | ]
217 | },
218 | {
219 | "cell_type": "code",
220 | "execution_count": 120,
221 | "metadata": {},
222 | "outputs": [],
223 | "source": [
224 | "#converting categorical data to numerical form"
225 | ]
226 | },
227 | {
228 | "cell_type": "code",
229 | "execution_count": 21,
230 | "metadata": {},
231 | "outputs": [],
232 | "source": [
233 | "#import required libraries\n",
234 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n",
235 | "\n"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 22,
241 | "metadata": {},
242 | "outputs": [],
243 | "source": [
244 | "loan_purpose_indexer = StringIndexer(inputCol=\"loan_purpose\", outputCol=\"loan_purpose\").fit(df)\n",
245 | "df = loan_purpose_indexer.transform(df)\n",
246 | "loan_encoder = OneHotEncoder(inputCol=\"loan_index\", outputCol=\"loan_purpose_vec\")\n",
247 | "df = loan_encoder.transform(df)"
248 | ]
249 | },
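> Note: the cell above relies on Spark 2.x behavior, where `OneHotEncoder` is a plain transformer. On Spark 3.x it is an estimator and has to be fitted first; a minimal sketch of the equivalent step, assuming the same column names as above:

```python
# Spark 3.x variant (assumption: running on Spark >= 3.0, where
# OneHotEncoder must be fitted before it can transform).
from pyspark.ml.feature import OneHotEncoder

loan_encoder = OneHotEncoder(inputCols=["loan_index"],
                             outputCols=["loan_purpose_vec"])
df = loan_encoder.fit(df).transform(df)
```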
250 | {
251 | "cell_type": "code",
252 | "execution_count": 63,
253 | "metadata": {},
254 | "outputs": [
255 | {
256 | "name": "stdout",
257 | "output_type": "stream",
258 | "text": [
259 | "+------------+------------+----------------+\n",
260 | "|loan_purpose|loan_purpose|loan_purpose_vec|\n",
261 | "+------------+------------+----------------+\n",
262 | "|personal |personal |(4,[2],[1.0]) |\n",
263 | "|personal |personal |(4,[2],[1.0]) |\n",
264 | "|personal |personal |(4,[2],[1.0]) |\n",
265 | "+------------+------------+----------------+\n",
266 | "only showing top 3 rows\n",
267 | "\n"
268 | ]
269 | }
270 | ],
271 | "source": [
272 | "df.select(['loan_purpose','loan_purpose','loan_purpose_vec']).show(3,False)"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 24,
278 | "metadata": {},
279 | "outputs": [],
280 | "source": [
281 | "from pyspark.ml.feature import VectorAssembler"
282 | ]
283 | },
284 | {
285 | "cell_type": "code",
286 | "execution_count": 25,
287 | "metadata": {},
288 | "outputs": [
289 | {
290 | "data": {
291 | "text/plain": [
292 | "['loan_id',\n",
293 | " 'loan_purpose',\n",
294 | " 'is_first_loan',\n",
295 | " 'total_credit_card_limit',\n",
296 | " 'avg_percentage_credit_card_limit_used_last_year',\n",
297 | " 'saving_amount',\n",
298 | " 'checking_amount',\n",
299 | " 'is_employed',\n",
300 | " 'yearly_salary',\n",
301 | " 'age',\n",
302 | " 'dependent_number',\n",
303 | " 'loan_defaulter',\n",
304 | " 'loan_index',\n",
305 | " 'loan_purpose_vec']"
306 | ]
307 | },
308 | "execution_count": 25,
309 | "metadata": {},
310 | "output_type": "execute_result"
311 | }
312 | ],
313 | "source": [
314 | "df.columns"
315 | ]
316 | },
317 | {
318 | "cell_type": "code",
319 | "execution_count": 28,
320 | "metadata": {},
321 | "outputs": [],
322 | "source": [
323 | "df_assembler = VectorAssembler(inputCols=['is_first_loan',\n",
324 | " 'total_credit_card_limit',\n",
325 | " 'avg_percentage_credit_card_limit_used_last_year',\n",
326 | " 'saving_amount',\n",
327 | " 'checking_amount',\n",
328 | " 'is_employed',\n",
329 | " 'yearly_salary',\n",
330 | " 'age',\n",
331 | " 'dependent_number',\n",
332 | " 'loan_purpose_vec'], outputCol=\"features\")\n",
333 | "df = df_assembler.transform(df)"
334 | ]
335 | },
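> The indexing, encoding, and assembling steps can also be chained into a single `Pipeline`, so the exact same feature preparation can be reapplied to new data in one call. A sketch under the same column names; here `raw_df` is a hypothetical name for a freshly loaded copy of the dataset, before any of the transformations above:

```python
# Sketch: one Pipeline covering all three feature-preparation stages.
from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler

indexer = StringIndexer(inputCol="loan_purpose", outputCol="loan_index")
encoder = OneHotEncoder(inputCol="loan_index", outputCol="loan_purpose_vec")
assembler = VectorAssembler(inputCols=['is_first_loan',
                                       'total_credit_card_limit',
                                       'avg_percentage_credit_card_limit_used_last_year',
                                       'saving_amount',
                                       'checking_amount',
                                       'is_employed',
                                       'yearly_salary',
                                       'age',
                                       'dependent_number',
                                       'loan_purpose_vec'], outputCol="features")

pipeline_model = Pipeline(stages=[indexer, encoder, assembler]).fit(raw_df)
features_df = pipeline_model.transform(raw_df)
```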
336 | {
337 | "cell_type": "code",
338 | "execution_count": 29,
339 | "metadata": {},
340 | "outputs": [
341 | {
342 | "name": "stdout",
343 | "output_type": "stream",
344 | "text": [
345 | "root\n",
346 | " |-- loan_id: string (nullable = true)\n",
347 | " |-- loan_purpose: string (nullable = true)\n",
348 | " |-- is_first_loan: integer (nullable = true)\n",
349 | " |-- total_credit_card_limit: integer (nullable = true)\n",
350 | " |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)\n",
351 | " |-- saving_amount: integer (nullable = true)\n",
352 | " |-- checking_amount: integer (nullable = true)\n",
353 | " |-- is_employed: integer (nullable = true)\n",
354 | " |-- yearly_salary: integer (nullable = true)\n",
355 | " |-- age: integer (nullable = true)\n",
356 | " |-- dependent_number: integer (nullable = true)\n",
357 | " |-- loan_defaulter: integer (nullable = true)\n",
358 | " |-- loan_index: double (nullable = false)\n",
359 | " |-- loan_purpose_vec: vector (nullable = true)\n",
360 | " |-- features: vector (nullable = true)\n",
361 | "\n"
362 | ]
363 | }
364 | ],
365 | "source": [
366 | "df.printSchema()"
367 | ]
368 | },
369 | {
370 | "cell_type": "code",
371 | "execution_count": 30,
372 | "metadata": {},
373 | "outputs": [
374 | {
375 | "name": "stdout",
376 | "output_type": "stream",
377 | "text": [
378 | "+--------------------------------------------------------------------+--------------+\n",
379 | "|features |loan_defaulter|\n",
380 | "+--------------------------------------------------------------------+--------------+\n",
381 | "|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,0.0,0.0,1.0,0.0] |0 |\n",
382 | "|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,0.0,0.0,1.0,0.0] |0 |\n",
383 | "|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,0.0,0.0,1.0,0.0] |0 |\n",
384 | "|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,0.0,0.0,1.0,0.0]|0 |\n",
385 | "|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,0.0,0.0,0.0,1.0] |1 |\n",
386 | "|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,0.0,1.0,0.0,0.0]|0 |\n",
387 | "|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,0.0,1.0,0.0,0.0]|0 |\n",
388 | "|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,0.0,0.0,1.0,0.0]|0 |\n",
389 | "|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,0.0,0.0,1.0,0.0]|0 |\n",
390 | "|[0.0,2900.0,0.91,88.0,2725.0,1.0,21100.0,52.0,1.0,0.0,0.0,1.0,0.0] |1 |\n",
391 | "+--------------------------------------------------------------------+--------------+\n",
392 | "only showing top 10 rows\n",
393 | "\n"
394 | ]
395 | }
396 | ],
397 | "source": [
398 | "df.select(['features','loan_defaulter']).show(10,False)"
399 | ]
400 | },
401 | {
402 | "cell_type": "code",
403 | "execution_count": 31,
404 | "metadata": {},
405 | "outputs": [],
406 | "source": [
407 | "#select data for building model\n",
408 | "model_df=df.select(['features','loan_defaulter'])"
409 | ]
410 | },
411 | {
412 | "cell_type": "code",
413 | "execution_count": 32,
414 | "metadata": {},
415 | "outputs": [],
416 | "source": [
417 | "from pyspark.ml.classification import LogisticRegression"
418 | ]
419 | },
420 | {
421 | "cell_type": "code",
422 | "execution_count": 33,
423 | "metadata": {},
424 | "outputs": [],
425 | "source": [
426 | "#split the data \n",
427 | "training_df,test_df=model_df.randomSplit([0.75,0.25])"
428 | ]
429 | },
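> `randomSplit` shuffles the data differently on every run, so the train/test counts in the next cells will vary slightly each time the notebook is executed. Passing a seed makes the split reproducible:

```python
# Reproducible 75/25 split (the seed value itself is arbitrary).
training_df, test_df = model_df.randomSplit([0.75, 0.25], seed=42)
```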
430 | {
431 | "cell_type": "code",
432 | "execution_count": 34,
433 | "metadata": {},
434 | "outputs": [
435 | {
436 | "data": {
437 | "text/plain": [
438 | "34958"
439 | ]
440 | },
441 | "execution_count": 34,
442 | "metadata": {},
443 | "output_type": "execute_result"
444 | }
445 | ],
446 | "source": [
447 | "training_df.count()"
448 | ]
449 | },
450 | {
451 | "cell_type": "code",
452 | "execution_count": 35,
453 | "metadata": {},
454 | "outputs": [
455 | {
456 | "name": "stdout",
457 | "output_type": "stream",
458 | "text": [
459 | "+--------------+-----+\n",
460 | "|loan_defaulter|count|\n",
461 | "+--------------+-----+\n",
462 | "| 1|12048|\n",
463 | "| 0|22910|\n",
464 | "+--------------+-----+\n",
465 | "\n"
466 | ]
467 | }
468 | ],
469 | "source": [
470 | "training_df.groupBy('loan_defaulter').count().show()"
471 | ]
472 | },
473 | {
474 | "cell_type": "code",
475 | "execution_count": 36,
476 | "metadata": {},
477 | "outputs": [
478 | {
479 | "data": {
480 | "text/plain": [
481 | "11793"
482 | ]
483 | },
484 | "execution_count": 36,
485 | "metadata": {},
486 | "output_type": "execute_result"
487 | }
488 | ],
489 | "source": [
490 | "test_df.count()"
491 | ]
492 | },
493 | {
494 | "cell_type": "code",
495 | "execution_count": 37,
496 | "metadata": {},
497 | "outputs": [
498 | {
499 | "name": "stdout",
500 | "output_type": "stream",
501 | "text": [
502 | "+--------------+-----+\n",
503 | "|loan_defaulter|count|\n",
504 | "+--------------+-----+\n",
505 | "| 1| 4153|\n",
506 | "| 0| 7640|\n",
507 | "+--------------+-----+\n",
508 | "\n"
509 | ]
510 | }
511 | ],
512 | "source": [
513 | "test_df.groupBy('loan_defaulter').count().show()"
514 | ]
515 | },
516 | {
517 | "cell_type": "code",
518 | "execution_count": 38,
519 | "metadata": {},
520 | "outputs": [],
521 | "source": [
522 | "log_reg=LogisticRegression(labelCol='loan_defaulter').fit(training_df)"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": null,
528 | "metadata": {},
529 | "outputs": [],
530 | "source": [
531 | "#Training Results"
532 | ]
533 | },
534 | {
535 | "cell_type": "code",
536 | "execution_count": 39,
537 | "metadata": {},
538 | "outputs": [],
539 | "source": [
540 | "lr_summary=log_reg.summary"
541 | ]
542 | },
543 | {
544 | "cell_type": "code",
545 | "execution_count": 40,
546 | "metadata": {},
547 | "outputs": [
548 | {
549 | "data": {
550 | "text/plain": [
551 | "0.8939298586875679"
552 | ]
553 | },
554 | "execution_count": 40,
555 | "metadata": {},
556 | "output_type": "execute_result"
557 | }
558 | ],
559 | "source": [
560 | "lr_summary.accuracy"
561 | ]
562 | },
563 | {
564 | "cell_type": "code",
565 | "execution_count": 41,
566 | "metadata": {},
567 | "outputs": [
568 | {
569 | "data": {
570 | "text/plain": [
571 | "0.9587456481363935"
572 | ]
573 | },
574 | "execution_count": 41,
575 | "metadata": {},
576 | "output_type": "execute_result"
577 | }
578 | ],
579 | "source": [
580 | "lr_summary.areaUnderROC"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 42,
586 | "metadata": {},
587 | "outputs": [
588 | {
589 | "name": "stdout",
590 | "output_type": "stream",
591 | "text": [
592 | "[0.9233245149911816, 0.8396318618667535]\n"
593 | ]
594 | }
595 | ],
596 | "source": [
597 | "print(lr_summary.precisionByLabel)"
598 | ]
599 | },
600 | {
601 | "cell_type": "code",
602 | "execution_count": 43,
603 | "metadata": {},
604 | "outputs": [
605 | {
606 | "name": "stdout",
607 | "output_type": "stream",
608 | "text": [
609 | "[0.914054997817547, 0.8556606905710491]\n"
610 | ]
611 | }
612 | ],
613 | "source": [
614 | "print(lr_summary.recallByLabel)"
615 | ]
616 | },
617 | {
618 | "cell_type": "code",
619 | "execution_count": 45,
620 | "metadata": {},
621 | "outputs": [
622 | {
623 | "name": "stdout",
624 | "output_type": "stream",
625 | "text": [
626 | "+--------------------+--------------+--------------------+--------------------+----------+\n",
627 | "| features|loan_defaulter| rawPrediction| probability|prediction|\n",
628 | "+--------------------+--------------+--------------------+--------------------+----------+\n",
629 | "|(13,[0,1,2,3,4,7]...| 1|[-3.4630360774167...|[0.03038246469741...| 1.0|\n",
630 | "|(13,[0,1,2,3,4,7]...| 1|[-5.5391195110590...|[0.00391460129742...| 1.0|\n",
631 | "|(13,[0,1,2,3,4,7]...| 0|[1.00238593296486...|[0.73152742283114...| 0.0|\n",
632 | "|(13,[0,1,2,3,4,7]...| 1|[-1.8290704519648...|[0.13834904603406...| 1.0|\n",
633 | "|(13,[0,1,2,3,4,7]...| 1|[-1.5501728962289...|[0.17506129798003...| 1.0|\n",
634 | "|(13,[0,1,2,3,4,7]...| 0|[6.60737916543425...|[0.99865145442765...| 0.0|\n",
635 | "|(13,[0,1,2,3,4,7]...| 0|[7.50587822302399...|[0.99945045940723...| 0.0|\n",
636 | "|(13,[0,1,2,3,4,7,...| 1|[-4.4555325192703...|[0.01148079400371...| 1.0|\n",
637 | "|(13,[0,1,2,3,4,7,...| 1|[-4.5326784954285...|[0.01063746639570...| 1.0|\n",
638 | "|(13,[0,1,2,3,4,7,...| 1|[-4.9717163244463...|[0.00688353015038...| 1.0|\n",
639 | "+--------------------+--------------+--------------------+--------------------+----------+\n",
640 | "only showing top 10 rows\n",
641 | "\n"
642 | ]
643 | }
644 | ],
645 | "source": [
646 | "predictions = log_reg.transform(test_df)\n",
647 | "predictions.show(10)\n"
648 | ]
649 | },
650 | {
651 | "cell_type": "code",
652 | "execution_count": 47,
653 | "metadata": {},
654 | "outputs": [
655 | {
656 | "data": {
657 | "text/plain": [
658 | "['features', 'loan_defaulter', 'rawPrediction', 'probability', 'prediction']"
659 | ]
660 | },
661 | "execution_count": 47,
662 | "metadata": {},
663 | "output_type": "execute_result"
664 | }
665 | ],
666 | "source": [
667 | "model_predictions = log_reg.transform(test_df)\n",
668 | "model_predictions.columns"
669 | ]
670 | },
671 | {
672 | "cell_type": "code",
673 | "execution_count": 48,
674 | "metadata": {},
675 | "outputs": [],
676 | "source": [
677 | "model_predictions = log_reg.evaluate(test_df)\n"
678 | ]
679 | },
680 | {
681 | "cell_type": "code",
682 | "execution_count": 49,
683 | "metadata": {},
684 | "outputs": [
685 | {
686 | "data": {
687 | "text/plain": [
688 | "0.8945984906300347"
689 | ]
690 | },
691 | "execution_count": 49,
692 | "metadata": {},
693 | "output_type": "execute_result"
694 | }
695 | ],
696 | "source": [
697 | "model_predictions.accuracy"
698 | ]
699 | },
700 | {
701 | "cell_type": "code",
702 | "execution_count": 50,
703 | "metadata": {},
704 | "outputs": [
705 | {
706 | "data": {
707 | "text/plain": [
708 | "0.8951909857782705"
709 | ]
710 | },
711 | "execution_count": 50,
712 | "metadata": {},
713 | "output_type": "execute_result"
714 | }
715 | ],
716 | "source": [
717 | "model_predictions.weightedPrecision"
718 | ]
719 | },
720 | {
721 | "cell_type": "code",
722 | "execution_count": 52,
723 | "metadata": {},
724 | "outputs": [
725 | {
726 | "data": {
727 | "text/plain": [
728 | "[0.9129581151832461, 0.8608235010835541]"
729 | ]
730 | },
731 | "execution_count": 52,
732 | "metadata": {},
733 | "output_type": "execute_result"
734 | }
735 | ],
736 | "source": [
737 | "model_predictions.recallByLabel"
738 | ]
739 | },
740 | {
741 | "cell_type": "code",
742 | "execution_count": 53,
743 | "metadata": {},
744 | "outputs": [
745 | {
746 | "name": "stdout",
747 | "output_type": "stream",
748 | "text": [
749 | "[0.9234741162452006, 0.8431603773584906]\n"
750 | ]
751 | }
752 | ],
753 | "source": [
754 | "print(model_predictions.precisionByLabel)"
755 | ]
756 | },
757 | {
758 | "cell_type": "code",
759 | "execution_count": 54,
760 | "metadata": {},
761 | "outputs": [
762 | {
763 | "data": {
764 | "text/plain": [
765 | "0.9594316478468224"
766 | ]
767 | },
768 | "execution_count": 54,
769 | "metadata": {},
770 | "output_type": "execute_result"
771 | }
772 | ],
773 | "source": [
774 | "model_predictions.areaUnderROC"
775 | ]
776 | },
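> The `evaluate()` summary used above is specific to `LogisticRegression`. For models that do not expose such a summary, the same metrics can be computed from the predictions dataframe with Spark's generic evaluators; a sketch for the test-set AUC:

```python
# Model-agnostic AUC on the test set (works for any binary classifier
# whose output includes a rawPrediction column).
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(labelCol='loan_defaulter')
print(evaluator.evaluate(log_reg.transform(test_df)))
```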
777 | {
778 | "cell_type": "code",
779 | "execution_count": 56,
780 | "metadata": {},
781 | "outputs": [],
782 | "source": [
783 | "from pyspark.ml.classification import RandomForestClassifier\n",
784 | "rf = RandomForestClassifier(numTrees=50,maxDepth=30,labelCol='loan_defaulter')\n",
785 | "rf_model = rf.fit(training_df)\n"
786 | ]
787 | },
788 | {
789 | "cell_type": "code",
790 | "execution_count": 57,
791 | "metadata": {},
792 | "outputs": [],
793 | "source": [
794 | "model_predictions = rf_model.transform(test_df)\n"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": 59,
800 | "metadata": {},
801 | "outputs": [],
802 | "source": [
803 | "true_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).filter(model_predictions['prediction']==1).count()\n",
804 | "actual_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).count()\n",
805 | "pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()"
806 | ]
807 | },
808 | {
809 | "cell_type": "code",
810 | "execution_count": 60,
811 | "metadata": {},
812 | "outputs": [
813 | {
814 | "data": {
815 | "text/plain": [
816 | "0.8979051288225379"
817 | ]
818 | },
819 | "execution_count": 60,
820 | "metadata": {},
821 | "output_type": "execute_result"
822 | }
823 | ],
824 | "source": [
825 | "#Recall \n",
826 | "float(true_pos)/(actual_pos)"
827 | ]
828 | },
829 | {
830 | "cell_type": "code",
831 | "execution_count": 61,
832 | "metadata": {},
833 | "outputs": [
834 | {
835 | "data": {
836 | "text/plain": [
837 | "0.8660009289363678"
838 | ]
839 | },
840 | "execution_count": 61,
841 | "metadata": {},
842 | "output_type": "execute_result"
843 | }
844 | ],
845 | "source": [
846 | "#Precision on test Data \n",
847 | "float(true_pos)/(pred_pos)"
848 | ]
849 | },
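> The manual true-positive bookkeeping above can also be delegated to `MulticlassClassificationEvaluator`, which computes accuracy (and weighted precision/recall, via `metricName`) directly from the random forest predictions:

```python
# Evaluator-based accuracy for the random forest predictions above.
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

evaluator = MulticlassClassificationEvaluator(labelCol='loan_defaulter',
                                              metricName='accuracy')
print(evaluator.evaluate(model_predictions))
```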
850 | {
851 | "cell_type": "code",
852 | "execution_count": null,
853 | "metadata": {},
854 | "outputs": [],
855 | "source": []
856 | }
857 | ],
858 | "metadata": {
859 | "kernelspec": {
860 | "display_name": "Python 3",
861 | "language": "python",
862 | "name": "python3"
863 | },
864 | "language_info": {
865 | "codemirror_mode": {
866 | "name": "ipython",
867 | "version": 3
868 | },
869 | "file_extension": ".py",
870 | "mimetype": "text/x-python",
871 | "name": "python",
872 | "nbconvert_exporter": "python",
873 | "pygments_lexer": "ipython3",
874 | "version": "3.6.3"
875 | }
876 | },
877 | "nbformat": 4,
878 | "nbformat_minor": 2
879 | }
880 |
--------------------------------------------------------------------------------
/chap_3/.ipynb_checkpoints/Logistic_resgression_pyspark-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "#import SparkSession\n",
10 | "from pyspark.sql import SparkSession\n",
11 | "spark=SparkSession.builder.appName('log_reg').getOrCreate()"
12 | ]
13 | },
14 | {
15 | "cell_type": "code",
16 | "execution_count": 2,
17 | "metadata": {},
18 | "outputs": [],
19 | "source": [
20 | "#read the dataset\n",
21 | "df=spark.read.csv('Log_Reg_dataset.csv',inferSchema=True,header=True)"
22 | ]
23 | },
24 | {
25 | "cell_type": "code",
26 | "execution_count": 14,
27 | "metadata": {},
28 | "outputs": [],
29 | "source": [
30 | "from pyspark.sql.functions import *\n"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "(20000, 6)\n"
43 | ]
44 | }
45 | ],
46 | "source": [
47 | "#check the shape of the data \n",
48 | "print((df.count(),len(df.columns)))"
49 | ]
50 | },
51 | {
52 | "cell_type": "code",
53 | "execution_count": 4,
54 | "metadata": {},
55 | "outputs": [
56 | {
57 | "name": "stdout",
58 | "output_type": "stream",
59 | "text": [
60 | "root\n",
61 | " |-- Country: string (nullable = true)\n",
62 | " |-- Age: integer (nullable = true)\n",
63 | " |-- Repeat_Visitor: integer (nullable = true)\n",
64 | " |-- Platform: string (nullable = true)\n",
65 | " |-- Web_pages_viewed: integer (nullable = true)\n",
66 | " |-- Status: integer (nullable = true)\n",
67 | "\n"
68 | ]
69 | }
70 | ],
71 | "source": [
72 | "#printSchema\n",
73 | "df.printSchema()"
74 | ]
75 | },
76 | {
77 | "cell_type": "code",
78 | "execution_count": 5,
79 | "metadata": {},
80 | "outputs": [
81 | {
82 | "data": {
83 | "text/plain": [
84 | "['Country', 'Age', 'Repeat_Visitor', 'Platform', 'Web_pages_viewed', 'Status']"
85 | ]
86 | },
87 | "execution_count": 5,
88 | "metadata": {},
89 | "output_type": "execute_result"
90 | }
91 | ],
92 | "source": [
93 | "#number of columns in dataset\n",
94 | "df.columns"
95 | ]
96 | },
97 | {
98 | "cell_type": "code",
99 | "execution_count": 6,
100 | "metadata": {},
101 | "outputs": [
102 | {
103 | "name": "stdout",
104 | "output_type": "stream",
105 | "text": [
106 | "+---------+---+--------------+--------+----------------+------+\n",
107 | "| Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|\n",
108 | "+---------+---+--------------+--------+----------------+------+\n",
109 | "| India| 41| 1| Yahoo| 21| 1|\n",
110 | "| Brazil| 28| 1| Yahoo| 5| 0|\n",
111 | "| Brazil| 40| 0| Google| 3| 0|\n",
112 | "|Indonesia| 31| 1| Bing| 15| 1|\n",
113 | "| Malaysia| 32| 0| Google| 15| 1|\n",
114 | "+---------+---+--------------+--------+----------------+------+\n",
115 | "only showing top 5 rows\n",
116 | "\n"
117 | ]
118 | }
119 | ],
120 | "source": [
121 | "#view the dataset\n",
122 | "df.show(5)"
123 | ]
124 | },
125 | {
126 | "cell_type": "code",
127 | "execution_count": 19,
128 | "metadata": {},
129 | "outputs": [
130 | {
131 | "name": "stdout",
132 | "output_type": "stream",
133 | "text": [
134 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n",
135 | "|summary| Country| Age| Repeat_Visitor|Platform| Web_pages_viewed| Status|\n",
136 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n",
137 | "| count| 20000| 20000| 20000| 20000| 20000| 20000|\n",
138 | "| mean| null| 28.53955| 0.5029| null| 9.5533| 0.5|\n",
139 | "| stddev| null|7.888912950773227|0.500004090187782| null|6.073903499824976|0.5000125004687693|\n",
140 | "| min| Brazil| 17| 0| Bing| 1| 0|\n",
141 | "| max|Malaysia| 111| 1| Yahoo| 29| 1|\n",
142 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n",
143 | "\n"
144 | ]
145 | }
146 | ],
147 | "source": [
148 | "#Exploratory Data Analysis\n",
149 | "df.describe().show()\n"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": 22,
155 | "metadata": {},
156 | "outputs": [
157 | {
158 | "name": "stdout",
159 | "output_type": "stream",
160 | "text": [
161 | "+---------+-----+\n",
162 | "| Country|count|\n",
163 | "+---------+-----+\n",
164 | "| Malaysia| 1218|\n",
165 | "| India| 4018|\n",
166 | "|Indonesia|12178|\n",
167 | "| Brazil| 2586|\n",
168 | "+---------+-----+\n",
169 | "\n"
170 | ]
171 | }
172 | ],
173 | "source": [
174 | "df.groupBy('Country').count().show()"
175 | ]
176 | },
177 | {
178 | "cell_type": "code",
179 | "execution_count": 118,
180 | "metadata": {},
181 | "outputs": [
182 | {
183 | "name": "stdout",
184 | "output_type": "stream",
185 | "text": [
186 | "+--------+-----+\n",
187 | "|Platform|count|\n",
188 | "+--------+-----+\n",
189 | "| Yahoo| 9859|\n",
190 | "| Bing| 4360|\n",
191 | "| Google| 5781|\n",
192 | "+--------+-----+\n",
193 | "\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "df.groupBy('Platform').count().show()"
199 | ]
200 | },
201 | {
202 | "cell_type": "code",
203 | "execution_count": 119,
204 | "metadata": {},
205 | "outputs": [
206 | {
207 | "name": "stdout",
208 | "output_type": "stream",
209 | "text": [
210 | "+------+-----+\n",
211 | "|Status|count|\n",
212 | "+------+-----+\n",
213 | "| 1|10000|\n",
214 | "| 0|10000|\n",
215 | "+------+-----+\n",
216 | "\n"
217 | ]
218 | }
219 | ],
220 | "source": [
221 | "df.groupBy('Status').count().show()"
222 | ]
223 | },
224 | {
225 | "cell_type": "code",
226 | "execution_count": 23,
227 | "metadata": {
228 | "scrolled": true
229 | },
230 | "outputs": [
231 | {
232 | "name": "stdout",
233 | "output_type": "stream",
234 | "text": [
235 | "+---------+------------------+-------------------+---------------------+--------------------+\n",
236 | "| Country| avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)| avg(Status)|\n",
237 | "+---------+------------------+-------------------+---------------------+--------------------+\n",
238 | "| Malaysia|27.792282430213465| 0.5730706075533661| 11.192118226600986| 0.6568144499178982|\n",
239 | "| India|27.976854156296664| 0.5433051269288203| 10.727227476356397| 0.6212045793927327|\n",
240 | "|Indonesia| 28.43159796354081| 0.5207751683363442| 9.985711939563148| 0.5422893742814913|\n",
241 | "| Brazil|30.274168600154677| 0.322892498066512| 4.921113689095128|0.038669760247486466|\n",
242 | "+---------+------------------+-------------------+---------------------+--------------------+\n",
243 | "\n"
244 | ]
245 | }
246 | ],
247 | "source": [
248 | "df.groupBy('Country').mean().show()"
249 | ]
250 | },
251 | {
252 | "cell_type": "code",
253 | "execution_count": 24,
254 | "metadata": {},
255 | "outputs": [
256 | {
257 | "name": "stdout",
258 | "output_type": "stream",
259 | "text": [
260 | "+--------+------------------+-------------------+---------------------+------------------+\n",
261 | "|Platform| avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)| avg(Status)|\n",
262 | "+--------+------------------+-------------------+---------------------+------------------+\n",
263 | "| Yahoo|28.569226087838523| 0.5094837204584644| 9.599655137437875|0.5071508266558474|\n",
264 | "| Bing| 28.68394495412844| 0.4720183486238532| 9.114908256880733|0.4559633027522936|\n",
265 | "| Google|28.380038055699707| 0.5149628092025601| 9.804878048780488|0.5210171250648676|\n",
266 | "+--------+------------------+-------------------+---------------------+------------------+\n",
267 | "\n"
268 | ]
269 | }
270 | ],
271 | "source": [
272 | "df.groupBy('Platform').mean().show()"
273 | ]
274 | },
275 | {
276 | "cell_type": "code",
277 | "execution_count": 25,
278 | "metadata": {},
279 | "outputs": [
280 | {
281 | "name": "stdout",
282 | "output_type": "stream",
283 | "text": [
284 | "+------+--------+-------------------+---------------------+-----------+\n",
285 | "|Status|avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|avg(Status)|\n",
286 | "+------+--------+-------------------+---------------------+-----------+\n",
287 | "| 1| 26.5435| 0.7019| 14.5617| 1.0|\n",
288 | "| 0| 30.5356| 0.3039| 4.5449| 0.0|\n",
289 | "+------+--------+-------------------+---------------------+-----------+\n",
290 | "\n"
291 | ]
292 | }
293 | ],
294 | "source": [
295 | "df.groupBy('Status').mean().show()"
296 | ]
297 | },
298 | {
299 | "cell_type": "code",
300 | "execution_count": 120,
301 | "metadata": {},
302 | "outputs": [],
303 | "source": [
304 | "#converting categorical data to numerical form"
305 | ]
306 | },
307 | {
308 | "cell_type": "code",
309 | "execution_count": 121,
310 | "metadata": {},
311 | "outputs": [],
312 | "source": [
313 | "#import required libraries\n",
314 | "\n",
315 | "from pyspark.ml.feature import StringIndexer\n"
316 | ]
317 | },
318 | {
319 | "cell_type": "code",
320 | "execution_count": 122,
321 | "metadata": {},
322 | "outputs": [],
323 | "source": [
324 | "#Indexing "
325 | ]
326 | },
327 | {
328 | "cell_type": "code",
329 | "execution_count": 123,
330 | "metadata": {},
331 | "outputs": [],
332 | "source": [
333 | "platform_indexer = StringIndexer(inputCol=\"Platform\", outputCol=\"platform_num\").fit(df)\n",
334 | "df = platform_indexer.transform(df)"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 124,
340 | "metadata": {},
341 | "outputs": [
342 | {
343 | "name": "stdout",
344 | "output_type": "stream",
345 | "text": [
346 | "+-------+---+--------------+--------+----------------+------+------------+\n",
347 | "|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|\n",
348 | "+-------+---+--------------+--------+----------------+------+------------+\n",
349 | "|India |41 |1 |Yahoo |21 |1 |0.0 |\n",
350 | "|Brazil |28 |1 |Yahoo |5 |0 |0.0 |\n",
351 | "|Brazil |40 |0 |Google |3 |0 |1.0 |\n",
352 | "+-------+---+--------------+--------+----------------+------+------------+\n",
353 | "only showing top 3 rows\n",
354 | "\n"
355 | ]
356 | }
357 | ],
358 | "source": [
359 | "df.show(3,False)"
360 | ]
361 | },
362 | {
363 | "cell_type": "code",
364 | "execution_count": 125,
365 | "metadata": {},
366 | "outputs": [],
367 | "source": [
368 | "from pyspark.ml.feature import OneHotEncoder"
369 | ]
370 | },
371 | {
372 | "cell_type": "code",
373 | "execution_count": 126,
374 | "metadata": {},
375 | "outputs": [],
376 | "source": [
377 | "#one hot encoding\n",
378 | "platform_encoder = OneHotEncoder(inputCol=\"platform_num\", outputCol=\"platform_vector\")\n",
379 | "df = platform_encoder.transform(df)"
380 | ]
381 | },
382 | {
383 | "cell_type": "code",
384 | "execution_count": 129,
385 | "metadata": {},
386 | "outputs": [
387 | {
388 | "name": "stdout",
389 | "output_type": "stream",
390 | "text": [
391 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n",
392 | "|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|platform_vector|\n",
393 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n",
394 | "|India |41 |1 |Yahoo |21 |1 |0.0 |(2,[0],[1.0]) |\n",
395 | "|Brazil |28 |1 |Yahoo |5 |0 |0.0 |(2,[0],[1.0]) |\n",
396 | "|Brazil |40 |0 |Google |3 |0 |1.0 |(2,[1],[1.0]) |\n",
397 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n",
398 | "only showing top 3 rows\n",
399 | "\n"
400 | ]
401 | }
402 | ],
403 | "source": [
404 | "df.show(3,False)"
405 | ]
406 | },
407 | {
408 | "cell_type": "code",
409 | "execution_count": 134,
410 | "metadata": {},
411 | "outputs": [
412 | {
413 | "name": "stdout",
414 | "output_type": "stream",
415 | "text": [
416 | "+--------+-----+\n",
417 | "|Platform|count|\n",
418 | "+--------+-----+\n",
419 | "|Yahoo |9859 |\n",
420 | "|Google |5781 |\n",
421 | "|Bing |4360 |\n",
422 | "+--------+-----+\n",
423 | "\n"
424 | ]
425 | }
426 | ],
427 | "source": [
428 | "df.groupBy('Platform').count().orderBy('count',ascending=False).show(5,False)"
429 | ]
430 | },
431 | {
432 | "cell_type": "code",
433 | "execution_count": 135,
434 | "metadata": {},
435 | "outputs": [
436 | {
437 | "name": "stdout",
438 | "output_type": "stream",
439 | "text": [
440 | "+------------+-----+\n",
441 | "|platform_num|count|\n",
442 | "+------------+-----+\n",
443 | "|0.0 |9859 |\n",
444 | "|1.0 |5781 |\n",
445 | "|2.0 |4360 |\n",
446 | "+------------+-----+\n",
447 | "\n"
448 | ]
449 | }
450 | ],
451 | "source": [
452 | "df.groupBy('platform_num').count().orderBy('count',ascending=False).show(5,False)"
453 | ]
454 | },
455 | {
456 | "cell_type": "code",
457 | "execution_count": 136,
458 | "metadata": {},
459 | "outputs": [
460 | {
461 | "name": "stdout",
462 | "output_type": "stream",
463 | "text": [
464 | "+---------------+-----+\n",
465 | "|platform_vector|count|\n",
466 | "+---------------+-----+\n",
467 | "|(2,[0],[1.0]) |9859 |\n",
468 | "|(2,[1],[1.0]) |5781 |\n",
469 | "|(2,[],[]) |4360 |\n",
470 | "+---------------+-----+\n",
471 | "\n"
472 | ]
473 | }
474 | ],
475 | "source": [
476 | "df.groupBy('platform_vector').count().orderBy('count',ascending=False).show(5,False)"
477 | ]
478 | },
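> The `(2,[],[])` row above is not a missing value: `OneHotEncoder` drops the last category by default (`dropLast=True`), so the least frequent platform (Bing, index 2.0) is represented by the all-zeros vector. To keep one indicator column per category instead:

```python
# Keep all three platform indicator columns (no dropped category).
platform_encoder = OneHotEncoder(inputCol="platform_num",
                                 outputCol="platform_vector",
                                 dropLast=False)
```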
479 | {
480 | "cell_type": "code",
481 | "execution_count": 137,
482 | "metadata": {},
483 | "outputs": [],
484 | "source": [
485 | "country_indexer = StringIndexer(inputCol=\"Country\", outputCol=\"country_num\").fit(df)\n",
486 | "df = country_indexer.transform(df)"
487 | ]
488 | },
489 | {
490 | "cell_type": "code",
491 | "execution_count": 139,
492 | "metadata": {},
493 | "outputs": [
494 | {
495 | "name": "stdout",
496 | "output_type": "stream",
497 | "text": [
498 | "+-------+-----------+\n",
499 | "|Country|country_num|\n",
500 | "+-------+-----------+\n",
501 | "|India |1.0 |\n",
502 | "|Brazil |2.0 |\n",
503 | "|Brazil |2.0 |\n",
504 | "+-------+-----------+\n",
505 | "only showing top 3 rows\n",
506 | "\n"
507 | ]
508 | }
509 | ],
510 | "source": [
511 | "df.select(['Country','country_num']).show(3,False)"
512 | ]
513 | },
514 | {
515 | "cell_type": "code",
516 | "execution_count": 140,
517 | "metadata": {},
518 | "outputs": [],
519 | "source": [
520 | "#one hot encoding\n",
521 | "country_encoder = OneHotEncoder(inputCol=\"country_num\", outputCol=\"country_vector\")\n",
522 | "df = country_encoder.transform(df)"
523 | ]
524 | },
525 | {
526 | "cell_type": "code",
527 | "execution_count": 141,
528 | "metadata": {},
529 | "outputs": [
530 | {
531 | "name": "stdout",
532 | "output_type": "stream",
533 | "text": [
534 | "+-------+-----------+--------------+\n",
535 | "|Country|country_num|country_vector|\n",
536 | "+-------+-----------+--------------+\n",
537 | "|India |1.0 |(3,[1],[1.0]) |\n",
538 | "|Brazil |2.0 |(3,[2],[1.0]) |\n",
539 | "|Brazil |2.0 |(3,[2],[1.0]) |\n",
540 | "+-------+-----------+--------------+\n",
541 | "only showing top 3 rows\n",
542 | "\n"
543 | ]
544 | }
545 | ],
546 | "source": [
547 | "df.select(['Country','country_num','country_vector']).show(3,False)"
548 | ]
549 | },
550 | {
551 | "cell_type": "code",
552 | "execution_count": 142,
553 | "metadata": {},
554 | "outputs": [
555 | {
556 | "name": "stdout",
557 | "output_type": "stream",
558 | "text": [
559 | "+---------+-----+\n",
560 | "|Country |count|\n",
561 | "+---------+-----+\n",
562 | "|Indonesia|12178|\n",
563 | "|India |4018 |\n",
564 | "|Brazil |2586 |\n",
565 | "|Malaysia |1218 |\n",
566 | "+---------+-----+\n",
567 | "\n"
568 | ]
569 | }
570 | ],
571 | "source": [
572 | "df.groupBy('Country').count().orderBy('count',ascending=False).show(5,False)"
573 | ]
574 | },
575 | {
576 | "cell_type": "code",
577 | "execution_count": 143,
578 | "metadata": {},
579 | "outputs": [
580 | {
581 | "name": "stdout",
582 | "output_type": "stream",
583 | "text": [
584 | "+-----------+-----+\n",
585 | "|country_num|count|\n",
586 | "+-----------+-----+\n",
587 | "|0.0 |12178|\n",
588 | "|1.0 |4018 |\n",
589 | "|2.0 |2586 |\n",
590 | "|3.0 |1218 |\n",
591 | "+-----------+-----+\n",
592 | "\n"
593 | ]
594 | }
595 | ],
596 | "source": [
597 | "df.groupBy('country_num').count().orderBy('count',ascending=False).show(5,False)"
598 | ]
599 | },
600 | {
601 | "cell_type": "code",
602 | "execution_count": 144,
603 | "metadata": {},
604 | "outputs": [
605 | {
606 | "name": "stdout",
607 | "output_type": "stream",
608 | "text": [
609 | "+--------------+-----+\n",
610 | "|country_vector|count|\n",
611 | "+--------------+-----+\n",
612 | "|(3,[0],[1.0]) |12178|\n",
613 | "|(3,[1],[1.0]) |4018 |\n",
614 | "|(3,[2],[1.0]) |2586 |\n",
615 | "|(3,[],[]) |1218 |\n",
616 | "+--------------+-----+\n",
617 | "\n"
618 | ]
619 | }
620 | ],
621 | "source": [
622 | "df.groupBy('country_vector').count().orderBy('count',ascending=False).show(5,False)"
623 | ]
624 | },
625 | {
626 | "cell_type": "code",
627 | "execution_count": 145,
628 | "metadata": {},
629 | "outputs": [],
630 | "source": [
631 | "from pyspark.ml.feature import VectorAssembler"
632 | ]
633 | },
634 | {
635 | "cell_type": "code",
636 | "execution_count": 146,
637 | "metadata": {},
638 | "outputs": [],
639 | "source": [
640 | "df_assembler = VectorAssembler(inputCols=['platform_vector','country_vector','Age', 'Repeat_Visitor','Web_pages_viewed'], outputCol=\"features\")\n",
641 | "df = df_assembler.transform(df)"
642 | ]
643 | },
644 | {
645 | "cell_type": "code",
646 | "execution_count": 147,
647 | "metadata": {},
648 | "outputs": [
649 | {
650 | "name": "stdout",
651 | "output_type": "stream",
652 | "text": [
653 | "root\n",
654 | " |-- Country: string (nullable = true)\n",
655 | " |-- Age: integer (nullable = true)\n",
656 | " |-- Repeat_Visitor: integer (nullable = true)\n",
657 | " |-- Platform: string (nullable = true)\n",
658 | " |-- Web_pages_viewed: integer (nullable = true)\n",
659 | " |-- Status: integer (nullable = true)\n",
660 | " |-- platform_num: double (nullable = false)\n",
661 | " |-- platform_vector: vector (nullable = true)\n",
662 | " |-- country_num: double (nullable = false)\n",
663 | " |-- country_vector: vector (nullable = true)\n",
664 | " |-- features: vector (nullable = true)\n",
665 | "\n"
666 | ]
667 | }
668 | ],
669 | "source": [
670 | "df.printSchema()"
671 | ]
672 | },
673 | {
674 | "cell_type": "code",
675 | "execution_count": 148,
676 | "metadata": {},
677 | "outputs": [
678 | {
679 | "name": "stdout",
680 | "output_type": "stream",
681 | "text": [
682 | "+-----------------------------------+------+\n",
683 | "|features |Status|\n",
684 | "+-----------------------------------+------+\n",
685 | "|[1.0,0.0,0.0,1.0,0.0,41.0,1.0,21.0]|1 |\n",
686 | "|[1.0,0.0,0.0,0.0,1.0,28.0,1.0,5.0] |0 |\n",
687 | "|(8,[1,4,5,7],[1.0,1.0,40.0,3.0]) |0 |\n",
688 | "|(8,[2,5,6,7],[1.0,31.0,1.0,15.0]) |1 |\n",
689 | "|(8,[1,5,7],[1.0,32.0,15.0]) |1 |\n",
690 | "|(8,[1,4,5,7],[1.0,1.0,32.0,3.0]) |0 |\n",
691 | "|(8,[1,4,5,7],[1.0,1.0,32.0,6.0]) |0 |\n",
692 | "|(8,[1,2,5,7],[1.0,1.0,27.0,9.0]) |0 |\n",
693 | "|(8,[0,2,5,7],[1.0,1.0,32.0,2.0]) |0 |\n",
694 | "|(8,[2,5,6,7],[1.0,31.0,1.0,16.0]) |1 |\n",
695 | "+-----------------------------------+------+\n",
696 | "only showing top 10 rows\n",
697 | "\n"
698 | ]
699 | }
700 | ],
701 | "source": [
702 | "df.select(['features','Status']).show(10,False)"
703 | ]
704 | },
705 | {
706 | "cell_type": "code",
707 | "execution_count": 149,
708 | "metadata": {},
709 | "outputs": [],
710 | "source": [
711 | "#select data for building model\n",
712 | "model_df=df.select(['features','Status'])"
713 | ]
714 | },
715 | {
716 | "cell_type": "code",
717 | "execution_count": 150,
718 | "metadata": {},
719 | "outputs": [],
720 | "source": [
721 | "from pyspark.ml.classification import LogisticRegression"
722 | ]
723 | },
724 | {
725 | "cell_type": "code",
726 | "execution_count": 151,
727 | "metadata": {},
728 | "outputs": [],
729 | "source": [
730 | "#split the data \n",
731 | "training_df,test_df=model_df.randomSplit([0.75,0.25])"
732 | ]
733 | },
734 | {
735 | "cell_type": "code",
736 | "execution_count": 152,
737 | "metadata": {},
738 | "outputs": [
739 | {
740 | "data": {
741 | "text/plain": [
742 | "14907"
743 | ]
744 | },
745 | "execution_count": 152,
746 | "metadata": {},
747 | "output_type": "execute_result"
748 | }
749 | ],
750 | "source": [
751 | "training_df.count()"
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": 160,
757 | "metadata": {},
758 | "outputs": [
759 | {
760 | "name": "stdout",
761 | "output_type": "stream",
762 | "text": [
763 | "+------+-----+\n",
764 | "|Status|count|\n",
765 | "+------+-----+\n",
766 | "| 1| 7417|\n",
767 | "| 0| 7490|\n",
768 | "+------+-----+\n",
769 | "\n"
770 | ]
771 | }
772 | ],
773 | "source": [
774 | "training_df.groupBy('Status').count().show()"
775 | ]
776 | },
777 | {
778 | "cell_type": "code",
779 | "execution_count": 153,
780 | "metadata": {},
781 | "outputs": [
782 | {
783 | "data": {
784 | "text/plain": [
785 | "5093"
786 | ]
787 | },
788 | "execution_count": 153,
789 | "metadata": {},
790 | "output_type": "execute_result"
791 | }
792 | ],
793 | "source": [
794 | "test_df.count()"
795 | ]
796 | },
797 | {
798 | "cell_type": "code",
799 | "execution_count": 161,
800 | "metadata": {},
801 | "outputs": [
802 | {
803 | "name": "stdout",
804 | "output_type": "stream",
805 | "text": [
806 | "+------+-----+\n",
807 | "|Status|count|\n",
808 | "+------+-----+\n",
809 | "| 1| 2583|\n",
810 | "| 0| 2510|\n",
811 | "+------+-----+\n",
812 | "\n"
813 | ]
814 | }
815 | ],
816 | "source": [
817 | "test_df.groupBy('Status').count().show()"
818 | ]
819 | },
820 | {
821 | "cell_type": "code",
822 | "execution_count": 154,
823 | "metadata": {},
824 | "outputs": [],
825 | "source": [
826 | "log_reg=LogisticRegression(labelCol='Status').fit(training_df)"
827 | ]
828 | },
829 | {
830 | "cell_type": "code",
831 | "execution_count": null,
832 | "metadata": {},
833 | "outputs": [],
834 | "source": [
835 | "#Training Results"
836 | ]
837 | },
838 | {
839 | "cell_type": "code",
840 | "execution_count": 155,
841 | "metadata": {},
842 | "outputs": [],
843 | "source": [
844 | "train_results=log_reg.evaluate(training_df).predictions"
845 | ]
846 | },
847 | {
848 | "cell_type": "code",
849 | "execution_count": 168,
850 | "metadata": {},
851 | "outputs": [
852 | {
853 | "name": "stdout",
854 | "output_type": "stream",
855 | "text": [
856 | "+------+----------+----------------------------------------+\n",
857 | "|Status|prediction|probability |\n",
858 | "+------+----------+----------------------------------------+\n",
859 | "|1 |1.0 |[0.2978572628475072,0.7021427371524929] |\n",
860 | "|1 |1.0 |[0.2978572628475072,0.7021427371524929] |\n",
861 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n",
862 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n",
863 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n",
864 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n",
865 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n",
866 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n",
867 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n",
868 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n",
869 | "+------+----------+----------------------------------------+\n",
870 | "only showing top 10 rows\n",
871 | "\n"
872 | ]
873 | }
874 | ],
875 | "source": [
876 | "train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).select(['Status','prediction','probability']).show(10,False)"
877 | ]
878 | },
879 | {
880 | "cell_type": "markdown",
881 | "metadata": {},
882 | "source": [
883 | "Probability at 0 index is for 0 class and probabilty as 1 index is for 1 class"
884 | ]
885 | },
886 | {
887 | "cell_type": "code",
888 | "execution_count": 177,
889 | "metadata": {},
890 | "outputs": [],
891 | "source": [
892 | "correct_preds=train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).count()\n"
893 | ]
894 | },
895 | {
896 | "cell_type": "code",
897 | "execution_count": 174,
898 | "metadata": {},
899 | "outputs": [
900 | {
901 | "data": {
902 | "text/plain": [
903 | "7417"
904 | ]
905 | },
906 | "execution_count": 174,
907 | "metadata": {},
908 | "output_type": "execute_result"
909 | }
910 | ],
911 | "source": [
912 | "training_df.filter(training_df['Status']==1).count()"
913 | ]
914 | },
915 | {
916 | "cell_type": "code",
917 | "execution_count": 178,
918 | "metadata": {},
919 | "outputs": [
920 | {
921 | "data": {
922 | "text/plain": [
923 | "0.9366320614803829"
924 | ]
925 | },
926 | "execution_count": 178,
927 | "metadata": {},
928 | "output_type": "execute_result"
929 | }
930 | ],
931 | "source": [
932 | "#accuracy on training dataset \n",
933 | "float(correct_preds)/(training_df.filter(training_df['Status']==1).count())"
934 | ]
935 | },
936 | {
937 | "cell_type": "code",
938 | "execution_count": null,
939 | "metadata": {},
940 | "outputs": [],
941 | "source": [
942 | "#Test Set results"
943 | ]
944 | },
945 | {
946 | "cell_type": "code",
947 | "execution_count": 170,
948 | "metadata": {},
949 | "outputs": [],
950 | "source": [
951 | "results=log_reg.evaluate(test_df).predictions"
952 | ]
953 | },
954 | {
955 | "cell_type": "code",
956 | "execution_count": 93,
957 | "metadata": {},
958 | "outputs": [
959 | {
960 | "name": "stdout",
961 | "output_type": "stream",
962 | "text": [
963 | "+------+----------+\n",
964 | "|Status|prediction|\n",
965 | "+------+----------+\n",
966 | "|0 |0.0 |\n",
967 | "|0 |0.0 |\n",
968 | "|0 |0.0 |\n",
969 | "|0 |0.0 |\n",
970 | "|1 |0.0 |\n",
971 | "|0 |0.0 |\n",
972 | "|1 |1.0 |\n",
973 | "|0 |1.0 |\n",
974 | "|1 |1.0 |\n",
975 | "|1 |1.0 |\n",
976 | "+------+----------+\n",
977 | "only showing top 10 rows\n",
978 | "\n"
979 | ]
980 | }
981 | ],
982 | "source": [
983 | "results.select(['Status','prediction']).show(10,False)"
984 | ]
985 | },
986 | {
987 | "cell_type": "code",
988 | "execution_count": 91,
989 | "metadata": {},
990 | "outputs": [
991 | {
992 | "name": "stdout",
993 | "output_type": "stream",
994 | "text": [
995 | "root\n",
996 | " |-- features: vector (nullable = true)\n",
997 | " |-- Status: integer (nullable = true)\n",
998 | " |-- rawPrediction: vector (nullable = true)\n",
999 | " |-- probability: vector (nullable = true)\n",
1000 | " |-- prediction: double (nullable = false)\n",
1001 | "\n"
1002 | ]
1003 | }
1004 | ],
1005 | "source": [
1006 | "results.printSchema()"
1007 | ]
1008 | },
1009 | {
1010 | "cell_type": "code",
1011 | "execution_count": 92,
1012 | "metadata": {},
1013 | "outputs": [],
1014 | "source": [
1015 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "code",
1020 | "execution_count": 94,
1021 | "metadata": {},
1022 | "outputs": [],
1023 | "source": [
1024 | "#confusion matrix\n",
1025 | "true_postives = results[(results.Status == 1) & (results.prediction == 1)].count()\n",
1026 | "true_negatives = results[(results.Status == 0) & (results.prediction == 0)].count()\n",
1027 | "false_positives = results[(results.Status == 0) & (results.prediction == 1)].count()\n",
1028 | "false_negatives = results[(results.Status == 1) & (results.prediction == 0)].count()"
1029 | ]
1030 | },
1031 | {
1032 | "cell_type": "code",
1033 | "execution_count": 98,
1034 | "metadata": {},
1035 | "outputs": [
1036 | {
1037 | "name": "stdout",
1038 | "output_type": "stream",
1039 | "text": [
1040 | "2356\n",
1041 | "2363\n",
1042 | "158\n",
1043 | "157\n",
1044 | "5034\n",
1045 | "5034\n"
1046 | ]
1047 | }
1048 | ],
1049 | "source": [
1050 | "print (true_postives)\n",
1051 | "print (true_negatives)\n",
1052 | "print (false_positives)\n",
1053 | "print (false_negatives)\n",
1054 | "print(true_postives+true_negatives+false_positives+false_negatives)\n",
1055 | "print (results.count())"
1056 | ]
1057 | },
1058 | {
1059 | "cell_type": "code",
1060 | "execution_count": 99,
1061 | "metadata": {},
1062 | "outputs": [
1063 | {
1064 | "name": "stdout",
1065 | "output_type": "stream",
1066 | "text": [
1067 | "0.937524870672503\n"
1068 | ]
1069 | }
1070 | ],
1071 | "source": [
1072 | "recall = float(true_postives)/(true_postives + false_negatives)\n",
1073 | "print(recall)"
1074 | ]
1075 | },
1076 | {
1077 | "cell_type": "code",
1078 | "execution_count": 100,
1079 | "metadata": {},
1080 | "outputs": [
1081 | {
1082 | "name": "stdout",
1083 | "output_type": "stream",
1084 | "text": [
1085 | "0.9371519490851233\n"
1086 | ]
1087 | }
1088 | ],
1089 | "source": [
1090 | "precision = float(true_postives) / (true_postives + false_positives)\n",
1091 | "print(precision)"
1092 | ]
1093 | },
1094 | {
1095 | "cell_type": "code",
1096 | "execution_count": 103,
1097 | "metadata": {},
1098 | "outputs": [
1099 | {
1100 | "name": "stdout",
1101 | "output_type": "stream",
1102 | "text": [
1103 | "0.9374255065554231\n"
1104 | ]
1105 | }
1106 | ],
1107 | "source": [
1108 | "accuracy=float((true_postives+true_negatives) /(results.count()))\n",
1109 | "print(accuracy)"
1110 | ]
1111 | },
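> Precision and recall can be folded into a single F1 score, and the `BinaryClassificationEvaluator` imported earlier (but never used above) gives the AUC from the same predictions dataframe:

```python
# F1 from the precision/recall computed above.
f1 = 2 * precision * recall / (precision + recall)
print(f1)

# AUC via the evaluator imported earlier (metricName defaults to areaUnderROC).
evaluator = BinaryClassificationEvaluator(labelCol='Status')
print(evaluator.evaluate(results))
```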
1112 | {
1113 | "cell_type": "code",
1114 | "execution_count": null,
1115 | "metadata": {},
1116 | "outputs": [],
1117 | "source": []
1118 | }
1119 | ],
1120 | "metadata": {
1121 | "kernelspec": {
1122 | "display_name": "Python 3",
1123 | "language": "python",
1124 | "name": "python3"
1125 | },
1126 | "language_info": {
1127 | "codemirror_mode": {
1128 | "name": "ipython",
1129 | "version": 3
1130 | },
1131 | "file_extension": ".py",
1132 | "mimetype": "text/x-python",
1133 | "name": "python",
1134 | "nbconvert_exporter": "python",
1135 | "pygments_lexer": "ipython3",
1136 | "version": "3.6.3"
1137 | }
1138 | },
1139 | "nbformat": 4,
1140 | "nbformat_minor": 2
1141 | }
1142 |
--------------------------------------------------------------------------------
/chap_3/.ipynb_checkpoints/pyspark_basics-checkpoint.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Pyspark Basics"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 2,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "#create pyspark session\n",
17 | "from pyspark.sql import SparkSession\n",
18 | "spark=SparkSession.builder.appName('pyspark').getOrCreate()"
19 | ]
20 | },
21 | {
22 | "cell_type": "code",
23 | "execution_count": 121,
24 | "metadata": {},
25 | "outputs": [],
26 | "source": [
27 | "#read the data file\n",
28 | "df=spark.read.csv('conversion_data.csv')"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 122,
34 | "metadata": {},
35 | "outputs": [
36 | {
37 | "name": "stdout",
38 | "output_type": "stream",
39 | "text": [
40 | "+-------+---+--------+------+-------------------+---------+\n",
41 | "| _c0|_c1| _c2| _c3| _c4| _c5|\n",
42 | "+-------+---+--------+------+-------------------+---------+\n",
43 | "|country|age|new_user|source|total_pages_visited|converted|\n",
44 | "| UK| 25| 1| Ads| 1| 0|\n",
45 | "| US| 23| 1| Seo| 5| 0|\n",
46 | "| US| 28| 1| Seo| 4| 0|\n",
47 | "| China| 39| 1| Seo| 5| 0|\n",
48 | "+-------+---+--------+------+-------------------+---------+\n",
49 | "only showing top 5 rows\n",
50 | "\n"
51 | ]
52 | }
53 | ],
54 | "source": [
55 | "df.show(5)"
56 | ]
57 | },
58 | {
59 | "cell_type": "code",
60 | "execution_count": 123,
61 | "metadata": {},
62 | "outputs": [],
63 | "source": [
64 | "#read the data file\n",
65 | "df=spark.read.csv('conversion_data.csv',header=True)"
66 | ]
67 | },
68 | {
69 | "cell_type": "code",
70 | "execution_count": 124,
71 | "metadata": {},
72 | "outputs": [
73 | {
74 | "name": "stdout",
75 | "output_type": "stream",
76 | "text": [
77 | "+-------+---+--------+------+-------------------+---------+\n",
78 | "|country|age|new_user|source|total_pages_visited|converted|\n",
79 | "+-------+---+--------+------+-------------------+---------+\n",
80 | "| UK| 25| 1| Ads| 1| 0|\n",
81 | "| US| 23| 1| Seo| 5| 0|\n",
82 | "| US| 28| 1| Seo| 4| 0|\n",
83 | "| China| 39| 1| Seo| 5| 0|\n",
84 | "| US| 30| 1| Seo| 6| 0|\n",
85 | "+-------+---+--------+------+-------------------+---------+\n",
86 | "only showing top 5 rows\n",
87 | "\n"
88 | ]
89 | }
90 | ],
91 | "source": [
92 | "df.show(5)"
93 | ]
94 | },
95 | {
96 | "cell_type": "code",
97 | "execution_count": 125,
98 | "metadata": {},
99 | "outputs": [
100 | {
101 | "name": "stdout",
102 | "output_type": "stream",
103 | "text": [
104 | "root\n",
105 | " |-- country: string (nullable = true)\n",
106 | " |-- age: string (nullable = true)\n",
107 | " |-- new_user: string (nullable = true)\n",
108 | " |-- source: string (nullable = true)\n",
109 | " |-- total_pages_visited: string (nullable = true)\n",
110 | " |-- converted: string (nullable = true)\n",
111 | "\n"
112 | ]
113 | }
114 | ],
115 | "source": [
116 | "df.printSchema()"
117 | ]
118 | },
119 | {
120 | "cell_type": "code",
121 | "execution_count": 126,
122 | "metadata": {},
123 | "outputs": [
124 | {
125 | "name": "stdout",
126 | "output_type": "stream",
127 | "text": [
128 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n",
129 | "|summary|country| age| new_user|source|total_pages_visited| converted|\n",
130 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n",
131 | "| count| 316200| 316200| 316200|316200| 316200| 316200|\n",
132 | "| mean| null|30.569857685009488| 0.6854648956356736| null| 4.872966476913346|0.03225806451612903|\n",
133 | "| stddev| null| 8.271801801807728|0.46433119036384723| null| 3.341103757948214|0.17668497535763514|\n",
134 | "| min| China| 111| 0| Ads| 1| 0|\n",
135 | "| max| US| 79| 1| Seo| 9| 1|\n",
136 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n",
137 | "\n"
138 | ]
139 | }
140 | ],
141 | "source": [
142 | "#statistical summary for data numerical columns\n",
143 | "df.describe().show()"
144 | ]
145 | },
146 | {
147 | "cell_type": "markdown",
148 | "metadata": {},
149 | "source": [
150 | "## Datatypes "
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": 130,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": [
159 | "from pyspark.sql.functions import col , column\n",
160 | "df = df.withColumn(\"age\", col(\"age\").cast(\"Int\"))\\\n",
161 | " .withColumn(\"new_user\", col(\"new_user\").cast(\"Int\"))\\\n",
162 | " .withColumn(\"total_pages_visited\", col(\"total_pages_visited\").cast(\"Int\"))\\\n",
163 | " .withColumn(\"converted\", col(\"converted\").cast(\"Int\"))"
164 | ]
165 | },
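> Casting column by column works, but the all-string schema can be avoided at the source: either let Spark infer the types during the read, or declare them explicitly, which skips the extra inference pass over the file. A sketch of both options:

```python
# Option 1: infer numeric types while reading.
df = spark.read.csv('conversion_data.csv', header=True, inferSchema=True)

# Option 2: declare the schema up front (no inference pass needed).
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

schema = StructType([
    StructField('country', StringType()),
    StructField('age', IntegerType()),
    StructField('new_user', IntegerType()),
    StructField('source', StringType()),
    StructField('total_pages_visited', IntegerType()),
    StructField('converted', IntegerType()),
])
df = spark.read.csv('conversion_data.csv', header=True, schema=schema)
```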
166 | {
167 | "cell_type": "code",
168 | "execution_count": 131,
169 | "metadata": {},
170 | "outputs": [
171 | {
172 | "name": "stdout",
173 | "output_type": "stream",
174 | "text": [
175 | "root\n",
176 | " |-- country: string (nullable = true)\n",
177 | " |-- age: integer (nullable = true)\n",
178 | " |-- new_user: integer (nullable = true)\n",
179 | " |-- source: string (nullable = true)\n",
180 | " |-- total_pages_visited: integer (nullable = true)\n",
181 | " |-- converted: integer (nullable = true)\n",
182 | "\n"
183 | ]
184 | }
185 | ],
186 | "source": [
187 | "df.printSchema()"
188 | ]
189 | },
190 | {
191 | "cell_type": "code",
192 | "execution_count": 132,
193 | "metadata": {},
194 | "outputs": [
195 | {
196 | "name": "stdout",
197 | "output_type": "stream",
198 | "text": [
199 | "+-------+---+--------+------+-------------------+---------+\n",
200 | "|country|age|new_user|source|total_pages_visited|converted|\n",
201 | "+-------+---+--------+------+-------------------+---------+\n",
202 | "| UK| 25| 1| Ads| 1| 0|\n",
203 | "| US| 23| 1| Seo| 5| 0|\n",
204 | "| US| 28| 1| Seo| 4| 0|\n",
205 | "| China| 39| 1| Seo| 5| 0|\n",
206 | "| US| 30| 1| Seo| 6| 0|\n",
207 | "+-------+---+--------+------+-------------------+---------+\n",
208 | "only showing top 5 rows\n",
209 | "\n"
210 | ]
211 | }
212 | ],
213 | "source": [
214 | "df.show(5)"
215 | ]
216 | },
217 | {
218 | "cell_type": "code",
219 | "execution_count": 133,
220 | "metadata": {},
221 | "outputs": [
222 | {
223 | "data": {
224 | "text/plain": [
225 | "Column"
226 | ]
227 | },
228 | "execution_count": 133,
229 | "metadata": {},
230 | "output_type": "execute_result"
231 | }
232 | ],
233 | "source": [
234 | "#acess dataframe column , we get column object \n",
235 | "df['country']"
236 | ]
237 | },
238 | {
239 | "cell_type": "code",
240 | "execution_count": 134,
241 | "metadata": {},
242 | "outputs": [
243 | {
244 | "data": {
245 | "text/plain": [
246 | "pyspark.sql.column.Column"
247 | ]
248 | },
249 | "execution_count": 134,
250 | "metadata": {},
251 | "output_type": "execute_result"
252 | }
253 | ],
254 | "source": [
255 | "type(df['country'])"
256 | ]
257 | },
258 | {
259 | "cell_type": "code",
260 | "execution_count": 135,
261 | "metadata": {},
262 | "outputs": [
263 | {
264 | "name": "stdout",
265 | "output_type": "stream",
266 | "text": [
267 | "+-------+\n",
268 | "|country|\n",
269 | "+-------+\n",
270 | "| UK|\n",
271 | "| US|\n",
272 | "| US|\n",
273 | "| China|\n",
274 | "| US|\n",
275 | "+-------+\n",
276 | "only showing top 5 rows\n",
277 | "\n"
278 | ]
279 | }
280 | ],
281 | "source": [
282 | "#access content of colum\n",
283 | "df.select('country').show(5)"
284 | ]
285 | },
286 | {
287 | "cell_type": "code",
288 | "execution_count": 136,
289 | "metadata": {},
290 | "outputs": [
291 | {
292 | "name": "stdout",
293 | "output_type": "stream",
294 | "text": [
295 | "+-------+------+\n",
296 | "|country|source|\n",
297 | "+-------+------+\n",
298 | "| UK| Ads|\n",
299 | "| US| Seo|\n",
300 | "| US| Seo|\n",
301 | "| China| Seo|\n",
302 | "| US| Seo|\n",
303 | "+-------+------+\n",
304 | "only showing top 5 rows\n",
305 | "\n"
306 | ]
307 | }
308 | ],
309 | "source": [
310 | "#acess multiple columns\n",
311 | "df.select(['country','source']).show(5)"
312 | ]
313 | },
314 | {
315 | "cell_type": "markdown",
316 | "metadata": {},
317 | "source": [
318 | "### Add or Remove column "
319 | ]
320 | },
321 | {
322 | "cell_type": "markdown",
323 | "metadata": {},
324 | "source": [
325 | "#### using udf (user defined functions)"
326 | ]
327 | },
328 | {
329 | "cell_type": "code",
330 | "execution_count": 137,
331 | "metadata": {},
332 | "outputs": [],
333 | "source": [
334 | "from pyspark.sql.types import StringType\n",
335 | "from pyspark.sql.functions import udf\n",
336 | "\n",
337 | "def country_udf(country):\n",
338 | " if country =='UK':\n",
339 | " return 'Britain'\n",
340 | " elif country =='US':\n",
341 | " return 'USA'\n",
342 | " elif country =='China':\n",
343 | " return 'Asia'\n",
344 | " elif country =='Germany':\n",
345 | " return 'Deustche'\n",
346 | " else:\n",
347 | " return country\n",
348 | " \n",
349 | "spark_udf = udf(country_udf, StringType())\n",
350 | "\n",
351 | "df=df.withColumn(\"country_new\", spark_udf(df.country))"
352 | ]
353 | },
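> The same mapping can be written with the built-in `when`/`otherwise` column expressions, which stay inside the JVM and avoid the Python serialization overhead of a UDF:

```python
# UDF-free version of the country mapping above.
from pyspark.sql.functions import when, col

df = df.withColumn(
    'country_new',
    when(col('country') == 'UK', 'Britain')
    .when(col('country') == 'US', 'USA')
    .when(col('country') == 'China', 'Asia')
    .when(col('country') == 'Germany', 'Deutsche')
    .otherwise(col('country')))
```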
354 | {
355 | "cell_type": "code",
356 | "execution_count": 138,
357 | "metadata": {},
358 | "outputs": [
359 | {
360 | "name": "stdout",
361 | "output_type": "stream",
362 | "text": [
363 | "+-------+---+--------+------+-------------------+---------+-----------+\n",
364 | "|country|age|new_user|source|total_pages_visited|converted|country_new|\n",
365 | "+-------+---+--------+------+-------------------+---------+-----------+\n",
366 | "| UK| 25| 1| Ads| 1| 0| Britain|\n",
367 | "| US| 23| 1| Seo| 5| 0| USA|\n",
368 | "| US| 28| 1| Seo| 4| 0| USA|\n",
369 | "| China| 39| 1| Seo| 5| 0| Asia|\n",
370 | "| US| 30| 1| Seo| 6| 0| USA|\n",
371 | "| US| 31| 0| Seo| 1| 0| USA|\n",
372 | "| China| 27| 1| Seo| 4| 0| Asia|\n",
373 | "| US| 23| 0| Ads| 4| 0| USA|\n",
374 | "| UK| 29| 0|Direct| 4| 0| Britain|\n",
375 | "| US| 25| 0| Ads| 2| 0| USA|\n",
376 | "+-------+---+--------+------+-------------------+---------+-----------+\n",
377 | "only showing top 10 rows\n",
378 | "\n"
379 | ]
380 | }
381 | ],
382 | "source": [
383 | "df.show(10)"
384 | ]
385 | },
386 | {
387 | "cell_type": "markdown",
388 | "metadata": {},
389 | "source": [
390 | "#### without using udf "
391 | ]
392 | },
393 | {
394 | "cell_type": "code",
395 | "execution_count": 139,
396 | "metadata": {},
397 | "outputs": [],
398 | "source": [
399 | "#create new column with age +2 value\n",
400 | "df=df.withColumn('new_age',df['age'] +2)"
401 | ]
402 | },
403 | {
404 | "cell_type": "code",
405 | "execution_count": 140,
406 | "metadata": {},
407 | "outputs": [
408 | {
409 | "name": "stdout",
410 | "output_type": "stream",
411 | "text": [
412 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n",
413 | "|country|age|new_user|source|total_pages_visited|converted|country_new|new_age|\n",
414 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n",
415 | "| UK| 25| 1| Ads| 1| 0| Britain| 27|\n",
416 | "| US| 23| 1| Seo| 5| 0| USA| 25|\n",
417 | "| US| 28| 1| Seo| 4| 0| USA| 30|\n",
418 | "| China| 39| 1| Seo| 5| 0| Asia| 41|\n",
419 | "| US| 30| 1| Seo| 6| 0| USA| 32|\n",
420 | "| US| 31| 0| Seo| 1| 0| USA| 33|\n",
421 | "| China| 27| 1| Seo| 4| 0| Asia| 29|\n",
422 | "| US| 23| 0| Ads| 4| 0| USA| 25|\n",
423 | "| UK| 29| 0|Direct| 4| 0| Britain| 31|\n",
424 | "| US| 25| 0| Ads| 2| 0| USA| 27|\n",
425 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n",
426 | "only showing top 10 rows\n",
427 | "\n"
428 | ]
429 | }
430 | ],
431 | "source": [
432 | "df.show(10)"
433 | ]
434 | },
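435 | {
436 | "cell_type": "markdown",
437 | "metadata": {},
438 | "source": [
439 | "The `country_udf` mapping above can also be written without a Python UDF, using the built-in `when`/`otherwise` column functions, which avoid Python serialization overhead; a minimal sketch, assuming the same `df` (the `mapped` name is just illustrative):"
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": null,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "from pyspark.sql.functions import when\n",
449 | "\n",
450 | "#sketch: same country mapping as country_udf, using built-in expressions\n",
451 | "mapped = (when(df['country'] == 'UK', 'Britain')\n",
452 | "          .when(df['country'] == 'US', 'USA')\n",
453 | "          .when(df['country'] == 'China', 'Asia')\n",
454 | "          .otherwise(df['country']))\n",
455 | "df.withColumn('country_new', mapped).show(5)"
456 | ]
457 | },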
435 | {
436 | "cell_type": "markdown",
437 | "metadata": {},
438 | "source": [
439 | "### Drop /Delete columns "
440 | ]
441 | },
442 | {
443 | "cell_type": "code",
444 | "execution_count": 141,
445 | "metadata": {},
446 | "outputs": [],
447 | "source": [
448 | "#delete the new_age column\n",
449 | "df=df.drop('new_age')"
450 | ]
451 | },
452 | {
453 | "cell_type": "code",
454 | "execution_count": 142,
455 | "metadata": {},
456 | "outputs": [],
457 | "source": [
458 | "#delete the country_new column\n",
459 | "df=df.drop('country_new')"
460 | ]
461 | },
462 | {
463 | "cell_type": "code",
464 | "execution_count": 143,
465 | "metadata": {},
466 | "outputs": [
467 | {
468 | "name": "stdout",
469 | "output_type": "stream",
470 | "text": [
471 | "+-------+---+--------+------+-------------------+---------+\n",
472 | "|country|age|new_user|source|total_pages_visited|converted|\n",
473 | "+-------+---+--------+------+-------------------+---------+\n",
474 | "| UK| 25| 1| Ads| 1| 0|\n",
475 | "| US| 23| 1| Seo| 5| 0|\n",
476 | "| US| 28| 1| Seo| 4| 0|\n",
477 | "| China| 39| 1| Seo| 5| 0|\n",
478 | "| US| 30| 1| Seo| 6| 0|\n",
479 | "+-------+---+--------+------+-------------------+---------+\n",
480 | "only showing top 5 rows\n",
481 | "\n"
482 | ]
483 | }
484 | ],
485 | "source": [
486 | "df.show(5)"
487 | ]
488 | },
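489 | {
490 | "cell_type": "markdown",
491 | "metadata": {},
492 | "source": [
493 | "`drop` also accepts several column names at once, so the two cells above can be combined into one call; names that are no longer present are silently ignored. A minimal sketch:"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": null,
499 | "metadata": {},
500 | "outputs": [],
501 | "source": [
502 | "#sketch: the two drops above combined into a single call (a no-op at this point)\n",
503 | "df = df.drop('country_new', 'new_age')"
504 | ]
505 | },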
489 | {
490 | "cell_type": "markdown",
491 | "metadata": {},
492 | "source": [
493 | "# Acess row objects of dataframe"
494 | ]
495 | },
496 | {
497 | "cell_type": "code",
498 | "execution_count": 144,
499 | "metadata": {},
500 | "outputs": [
501 | {
502 | "data": {
503 | "text/plain": [
504 | "[Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0),\n",
505 | " Row(country='US', age=23, new_user=1, source='Seo', total_pages_visited=5, converted=0),\n",
506 | " Row(country='US', age=28, new_user=1, source='Seo', total_pages_visited=4, converted=0)]"
507 | ]
508 | },
509 | "execution_count": 144,
510 | "metadata": {},
511 | "output_type": "execute_result"
512 | }
513 | ],
514 | "source": [
515 | "#access first 3 rows\n",
516 | "df.head(3)"
517 | ]
518 | },
519 | {
520 | "cell_type": "code",
521 | "execution_count": 145,
522 | "metadata": {},
523 | "outputs": [
524 | {
525 | "data": {
526 | "text/plain": [
527 | "Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0)"
528 | ]
529 | },
530 | "execution_count": 145,
531 | "metadata": {},
532 | "output_type": "execute_result"
533 | }
534 | ],
535 | "source": [
536 | "#access first row object \n",
537 | "df.head(3)[0]"
538 | ]
539 | },
540 | {
541 | "cell_type": "code",
542 | "execution_count": 146,
543 | "metadata": {},
544 | "outputs": [
545 | {
546 | "data": {
547 | "text/plain": [
548 | "'UK'"
549 | ]
550 | },
551 | "execution_count": 146,
552 | "metadata": {},
553 | "output_type": "execute_result"
554 | }
555 | ],
556 | "source": [
557 | "#access first row object\n",
558 | "df.head(3)[0][0]"
559 | ]
560 | },
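561 | {
562 | "cell_type": "markdown",
563 | "metadata": {},
564 | "source": [
565 | "Row objects also support access by field name, which is less brittle than positional indexing; a minimal sketch (`first_row` is just an illustrative name):"
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": null,
571 | "metadata": {},
572 | "outputs": [],
573 | "source": [
574 | "#sketch: access Row fields by name instead of position\n",
575 | "first_row = df.head(1)[0]\n",
576 | "first_row['country'], first_row.age"
577 | ]
578 | },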
561 | {
562 | "cell_type": "markdown",
563 | "metadata": {},
564 | "source": [
565 | "## Filtering "
566 | ]
567 | },
568 | {
569 | "cell_type": "code",
570 | "execution_count": 147,
571 | "metadata": {},
572 | "outputs": [
573 | {
574 | "name": "stdout",
575 | "output_type": "stream",
576 | "text": [
577 | "+-------+---+--------+------+-------------------+---------+\n",
578 | "|country|age|new_user|source|total_pages_visited|converted|\n",
579 | "+-------+---+--------+------+-------------------+---------+\n",
580 | "|Germany|123| 0| Seo| 15| 1|\n",
581 | "| US| 77| 0|Direct| 4| 0|\n",
582 | "| US| 79| 1|Direct| 1| 0|\n",
583 | "| UK|111| 0| Ads| 10| 1|\n",
584 | "+-------+---+--------+------+-------------------+---------+\n",
585 | "\n"
586 | ]
587 | }
588 | ],
589 | "source": [
590 | "#filter records where age of user is more than 75 years\n",
591 | "df.filter(df['age'] >75).show(5)"
592 | ]
593 | },
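594 | {
595 | "cell_type": "markdown",
596 | "metadata": {},
597 | "source": [
598 | "`where` is an alias of `filter`, and both also accept a SQL-style string condition; a minimal sketch of the same age filter:"
599 | ]
600 | },
601 | {
602 | "cell_type": "code",
603 | "execution_count": null,
604 | "metadata": {},
605 | "outputs": [],
606 | "source": [
607 | "#sketch: the same filter expressed as a SQL-style string\n",
608 | "df.where('age > 75').show(5)"
609 | ]
610 | },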
594 | {
595 | "cell_type": "code",
596 | "execution_count": 148,
597 | "metadata": {},
598 | "outputs": [
599 | {
600 | "name": "stdout",
601 | "output_type": "stream",
602 | "text": [
603 | "+-------+---------+---+\n",
604 | "|country|converted|age|\n",
605 | "+-------+---------+---+\n",
606 | "|Germany| 1|123|\n",
607 | "| US| 0| 77|\n",
608 | "| US| 0| 79|\n",
609 | "| UK| 1|111|\n",
610 | "+-------+---------+---+\n",
611 | "\n"
612 | ]
613 | }
614 | ],
615 | "source": [
616 | "#filter records and show only country and converted status of that user\n",
617 | "df.filter(df['age'] > 75).select(['country','converted','age']).show(5)"
618 | ]
619 | },
620 | {
621 | "cell_type": "markdown",
622 | "metadata": {},
623 | "source": [
624 | "### Multiple filter conditions"
625 | ]
626 | },
627 | {
628 | "cell_type": "code",
629 | "execution_count": 149,
630 | "metadata": {},
631 | "outputs": [
632 | {
633 | "name": "stdout",
634 | "output_type": "stream",
635 | "text": [
636 | "+-------+---+--------+------+-------------------+---------+\n",
637 | "|country|age|new_user|source|total_pages_visited|converted|\n",
638 | "+-------+---+--------+------+-------------------+---------+\n",
639 | "| US| 77| 0|Direct| 4| 0|\n",
640 | "| US| 79| 1|Direct| 1| 0|\n",
641 | "+-------+---+--------+------+-------------------+---------+\n",
642 | "\n"
643 | ]
644 | }
645 | ],
646 | "source": [
647 | "#select people over 75 years only from US\n",
648 | "df.filter(df['age'] > 75).filter(df['country'] =='US').show(5)"
649 | ]
650 | },
651 | {
652 | "cell_type": "code",
653 | "execution_count": 150,
654 | "metadata": {},
655 | "outputs": [
656 | {
657 | "name": "stdout",
658 | "output_type": "stream",
659 | "text": [
660 | "+-------+---+--------+------+-------------------+---------+\n",
661 | "|country|age|new_user|source|total_pages_visited|converted|\n",
662 | "+-------+---+--------+------+-------------------+---------+\n",
663 | "|Germany| 31| 0|Direct| 2| 1|\n",
664 | "+-------+---+--------+------+-------------------+---------+\n",
665 | "\n"
666 | ]
667 | }
668 | ],
669 | "source": [
670 | "#selet users who have more less than 3 visited pages and are still converted from Germany \n",
671 | "df.filter(df['total_pages_visited'] < 3).filter(df['converted']==1).filter(df['country'] =='Germany').show(5)"
672 | ]
673 | },
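674 | {
675 | "cell_type": "markdown",
676 | "metadata": {},
677 | "source": [
678 | "Chained `filter` calls like the ones above can also be collapsed into a single call by combining conditions with `&` (and) or `|` (or); each condition needs its own parentheses, because `&` binds tighter than the comparison operators. A minimal sketch of the same Germany query:"
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": null,
684 | "metadata": {},
685 | "outputs": [],
686 | "source": [
687 | "#sketch: one filter call with conditions combined via &\n",
688 | "df.filter((df['total_pages_visited'] < 3)\n",
689 | "          & (df['converted'] == 1)\n",
690 | "          & (df['country'] == 'Germany')).show(5)"
691 | ]
692 | },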
674 | {
675 | "cell_type": "markdown",
676 | "metadata": {},
677 | "source": [
678 | "## Count Records "
679 | ]
680 | },
681 | {
682 | "cell_type": "code",
683 | "execution_count": 151,
684 | "metadata": {},
685 | "outputs": [
686 | {
687 | "data": {
688 | "text/plain": [
689 | "316200"
690 | ]
691 | },
692 | "execution_count": 151,
693 | "metadata": {},
694 | "output_type": "execute_result"
695 | }
696 | ],
697 | "source": [
698 | "#total records in df \n",
699 | "df.count()"
700 | ]
701 | },
702 | {
703 | "cell_type": "code",
704 | "execution_count": 152,
705 | "metadata": {},
706 | "outputs": [
707 | {
708 | "name": "stdout",
709 | "output_type": "stream",
710 | "text": [
711 | "+-------+------+\n",
712 | "|country| count|\n",
713 | "+-------+------+\n",
714 | "|Germany| 13056|\n",
715 | "| China| 76602|\n",
716 | "| US|178092|\n",
717 | "| UK| 48450|\n",
718 | "+-------+------+\n",
719 | "\n"
720 | ]
721 | }
722 | ],
723 | "source": [
724 | "# Frequency count of column values\n",
725 | "df.groupBy('country').count().show(5)"
726 | ]
727 | },
728 | {
729 | "cell_type": "code",
730 | "execution_count": 153,
731 | "metadata": {},
732 | "outputs": [
733 | {
734 | "name": "stdout",
735 | "output_type": "stream",
736 | "text": [
737 | "+-------+------+\n",
738 | "|country| count|\n",
739 | "+-------+------+\n",
740 | "| US|178092|\n",
741 | "| China| 76602|\n",
742 | "| UK| 48450|\n",
743 | "|Germany| 13056|\n",
744 | "+-------+------+\n",
745 | "\n"
746 | ]
747 | }
748 | ],
749 | "source": [
750 | "## Ordered Frequency count \n",
751 | "df.groupBy('country').count().orderBy('count',ascending=False).show(5)"
752 | ]
753 | },
754 | {
755 | "cell_type": "code",
756 | "execution_count": 154,
757 | "metadata": {},
758 | "outputs": [
759 | {
760 | "name": "stdout",
761 | "output_type": "stream",
762 | "text": [
763 | "+---------+------+\n",
764 | "|converted| count|\n",
765 | "+---------+------+\n",
766 | "| 1| 10200|\n",
767 | "| 0|306000|\n",
768 | "+---------+------+\n",
769 | "\n"
770 | ]
771 | }
772 | ],
773 | "source": [
774 | "#Total converted vs non converted user counts\n",
775 | "df.groupBy('converted').count().show(2)"
776 | ]
777 | },
778 | {
779 | "cell_type": "code",
780 | "execution_count": 156,
781 | "metadata": {},
782 | "outputs": [
783 | {
784 | "name": "stdout",
785 | "output_type": "stream",
786 | "text": [
787 | "+---------+------------------+------------------+------------------------+--------------+\n",
788 | "|converted| avg(age)| avg(new_user)|avg(total_pages_visited)|avg(converted)|\n",
789 | "+---------+------------------+------------------+------------------------+--------------+\n",
790 | "| 1|26.546764705882353|0.2979411764705882| 14.553529411764705| 1.0|\n",
791 | "| 0|30.703960784313725|0.6983823529411765| 4.550281045751634| 0.0|\n",
792 | "+---------+------------------+------------------+------------------------+--------------+\n",
793 | "\n"
794 | ]
795 | }
796 | ],
797 | "source": [
798 | "#Mean value of conversion for each source\n",
799 | "df.groupBy('converted').mean().show()"
800 | ]
801 | },
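802 | {
803 | "cell_type": "markdown",
804 | "metadata": {},
805 | "source": [
806 | "Because `converted` is a 0/1 flag, its mean is simply the conversion rate, so grouping by country gives per-country rates directly; a minimal sketch using `pyspark.sql.functions` (the `conversion_rate` alias is just illustrative):"
807 | ]
808 | },
809 | {
810 | "cell_type": "code",
811 | "execution_count": null,
812 | "metadata": {},
813 | "outputs": [],
814 | "source": [
815 | "from pyspark.sql import functions as F\n",
816 | "\n",
817 | "#sketch: per-country conversion rate (mean of the 0/1 converted flag)\n",
818 | "df.groupBy('country').agg(F.mean('converted').alias('conversion_rate')).show()"
819 | ]
820 | },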
802 | {
803 | "cell_type": "markdown",
804 | "metadata": {},
805 | "source": [
806 | "## Collect "
807 | ]
808 | },
809 | {
810 | "cell_type": "markdown",
811 | "metadata": {},
812 | "source": [
813 | "Save the results as a list with row objects\n"
814 | ]
815 | },
816 | {
817 | "cell_type": "code",
818 | "execution_count": 57,
819 | "metadata": {},
820 | "outputs": [],
821 | "source": [
822 | "# create a list with only converted users data from China\n",
823 | "china_data=df.filter((df['country']=='China') & (df['converted'] ==1)).collect()"
824 | ]
825 | },
826 | {
827 | "cell_type": "code",
828 | "execution_count": 60,
829 | "metadata": {},
830 | "outputs": [
831 | {
832 | "data": {
833 | "text/plain": [
834 | "[Row(country='China', age='24', new_user='0', source='Seo', total_pages_visited='18', converted='1'),\n",
835 | " Row(country='China', age='26', new_user='1', source='Ads', total_pages_visited='18', converted='1'),\n",
836 | " Row(country='China', age='30', new_user='0', source='Ads', total_pages_visited='17', converted='1'),\n",
837 | " Row(country='China', age='26', new_user='0', source='Seo', total_pages_visited='8', converted='1'),\n",
838 | " Row(country='China', age='33', new_user='1', source='Direct', total_pages_visited='13', converted='1')]"
839 | ]
840 | },
841 | "execution_count": 60,
842 | "metadata": {},
843 | "output_type": "execute_result"
844 | }
845 | ],
846 | "source": [
847 | "#view the new list \n",
848 | "china_data[:5]"
849 | ]
850 | },
851 | {
852 | "cell_type": "code",
853 | "execution_count": 67,
854 | "metadata": {},
855 | "outputs": [],
856 | "source": [
857 | "#view the list object as a dictionary\n",
858 | "china_dict=china_data[0].asDict()"
859 | ]
860 | },
861 | {
862 | "cell_type": "code",
863 | "execution_count": 64,
864 | "metadata": {},
865 | "outputs": [
866 | {
867 | "data": {
868 | "text/plain": [
869 | "'24'"
870 | ]
871 | },
872 | "execution_count": 64,
873 | "metadata": {},
874 | "output_type": "execute_result"
875 | }
876 | ],
877 | "source": [
878 | "china_dict['age']"
879 | ]
880 | },
881 | {
882 | "cell_type": "code",
883 | "execution_count": 65,
884 | "metadata": {},
885 | "outputs": [
886 | {
887 | "data": {
888 | "text/plain": [
889 | "'18'"
890 | ]
891 | },
892 | "execution_count": 65,
893 | "metadata": {},
894 | "output_type": "execute_result"
895 | }
896 | ],
897 | "source": [
898 | "china_dict['total_pages_visited']"
899 | ]
900 | },
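901 | {
902 | "cell_type": "markdown",
903 | "metadata": {},
904 | "source": [
905 | "Keep in mind that `collect` pulls every matching row onto the driver, which can exhaust its memory on large results; when only a handful of rows is needed, `take` (or `limit` followed by `collect`) bounds the transfer. A minimal sketch:"
906 | ]
907 | },
908 | {
909 | "cell_type": "code",
910 | "execution_count": null,
911 | "metadata": {},
912 | "outputs": [],
913 | "source": [
914 | "#sketch: bounded alternative to collect - fetch at most 5 matching rows\n",
915 | "df.filter((df['country'] == 'China') & (df['converted'] == 1)).take(5)"
916 | ]
917 | },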
901 | {
902 | "cell_type": "markdown",
903 | "metadata": {},
904 | "source": [
905 | "## Aggregate Functions"
906 | ]
907 | },
908 | {
909 | "cell_type": "code",
910 | "execution_count": 158,
911 | "metadata": {},
912 | "outputs": [
913 | {
914 | "name": "stdout",
915 | "output_type": "stream",
916 | "text": [
917 | "+------------------+\n",
918 | "| avg(age)|\n",
919 | "+------------------+\n",
920 | "|30.569857685009488|\n",
921 | "+------------------+\n",
922 | "\n"
923 | ]
924 | }
925 | ],
926 | "source": [
927 | "df.agg({'age':'mean'}).show()"
928 | ]
929 | },
930 | {
931 | "cell_type": "code",
932 | "execution_count": 159,
933 | "metadata": {},
934 | "outputs": [
935 | {
936 | "name": "stdout",
937 | "output_type": "stream",
938 | "text": [
939 | "+-------------------+\n",
940 | "| avg(converted)|\n",
941 | "+-------------------+\n",
942 | "|0.03225806451612903|\n",
943 | "+-------------------+\n",
944 | "\n"
945 | ]
946 | }
947 | ],
948 | "source": [
949 | "df.agg({'converted':'mean'}).show()"
950 | ]
951 | },
952 | {
953 | "cell_type": "code",
954 | "execution_count": 160,
955 | "metadata": {},
956 | "outputs": [
957 | {
958 | "name": "stdout",
959 | "output_type": "stream",
960 | "text": [
961 | "+--------+\n",
962 | "|max(age)|\n",
963 | "+--------+\n",
964 | "| 123|\n",
965 | "+--------+\n",
966 | "\n"
967 | ]
968 | }
969 | ],
970 | "source": [
971 | "df.agg({'age':'max'}).show()"
972 | ]
973 | },
974 | {
975 | "cell_type": "code",
976 | "execution_count": 161,
977 | "metadata": {},
978 | "outputs": [
979 | {
980 | "name": "stdout",
981 | "output_type": "stream",
982 | "text": [
983 | "+--------------+\n",
984 | "|count(country)|\n",
985 | "+--------------+\n",
986 | "| 316200|\n",
987 | "+--------------+\n",
988 | "\n"
989 | ]
990 | }
991 | ],
992 | "source": [
993 | "df.agg({'country':'count'}).show()"
994 | ]
995 | },
996 | {
997 | "cell_type": "code",
998 | "execution_count": 162,
999 | "metadata": {},
1000 | "outputs": [
1001 | {
1002 | "name": "stdout",
1003 | "output_type": "stream",
1004 | "text": [
1005 | "+--------+\n",
1006 | "|min(age)|\n",
1007 | "+--------+\n",
1008 | "| 17|\n",
1009 | "+--------+\n",
1010 | "\n"
1011 | ]
1012 | }
1013 | ],
1014 | "source": [
1015 | "df.agg({'age':'min'}).show()"
1016 | ]
1017 | },
1018 | {
1019 | "cell_type": "code",
1020 | "execution_count": 163,
1021 | "metadata": {},
1022 | "outputs": [
1023 | {
1024 | "name": "stdout",
1025 | "output_type": "stream",
1026 | "text": [
1027 | "+-------+--------+\n",
1028 | "|country|max(age)|\n",
1029 | "+-------+--------+\n",
1030 | "|Germany| 123|\n",
1031 | "| China| 69|\n",
1032 | "| US| 79|\n",
1033 | "| UK| 111|\n",
1034 | "+-------+--------+\n",
1035 | "\n"
1036 | ]
1037 | }
1038 | ],
1039 | "source": [
1040 | "## aggregation on grouped data by country\n",
1041 | "df.groupBy('country').agg({'age':'max'}).show()"
1042 | ]
1043 | },
1044 | {
1045 | "cell_type": "code",
1046 | "execution_count": 164,
1047 | "metadata": {},
1048 | "outputs": [
1049 | {
1050 | "name": "stdout",
1051 | "output_type": "stream",
1052 | "text": [
1053 | "+-------+------+--------+\n",
1054 | "|country|source|max(age)|\n",
1055 | "+-------+------+--------+\n",
1056 | "|Germany|Direct| 61|\n",
1057 | "| China|Direct| 65|\n",
1058 | "| UK| Ads| 111|\n",
1059 | "| US| Seo| 73|\n",
1060 | "| UK| Seo| 66|\n",
1061 | "|Germany| Seo| 123|\n",
1062 | "|Germany| Ads| 64|\n",
1063 | "| China| Seo| 68|\n",
1064 | "| UK|Direct| 69|\n",
1065 | "| China| Ads| 69|\n",
1066 | "| US| Ads| 70|\n",
1067 | "| US|Direct| 79|\n",
1068 | "+-------+------+--------+\n",
1069 | "\n"
1070 | ]
1071 | }
1072 | ],
1073 | "source": [
1074 | "## aggregation on grouped data by country,source\n",
1075 | "df.groupBy(['country','source']).agg({'age':'max'}).show()"
1076 | ]
1077 | },
1078 | {
1079 | "cell_type": "code",
1080 | "execution_count": 170,
1081 | "metadata": {},
1082 | "outputs": [
1083 | {
1084 | "name": "stdout",
1085 | "output_type": "stream",
1086 | "text": [
1087 | "+-------+---------+------------------------+\n",
1088 | "|country|converted|avg(total_pages_visited)|\n",
1089 | "+-------+---------+------------------------+\n",
1090 | "|Germany| 0| 4.565277777777778|\n",
1091 | "| China| 1| 14.352941176470589|\n",
1092 | "| China| 0| 4.5404575163398695|\n",
1093 | "| US| 0| 4.551785714285714|\n",
1094 | "| UK| 0| 4.557037037037037|\n",
1095 | "|Germany| 1| 14.572303921568627|\n",
1096 | "| UK| 1| 14.53450980392157|\n",
1097 | "| US| 1| 14.561497326203208|\n",
1098 | "+-------+---------+------------------------+\n",
1099 | "\n"
1100 | ]
1101 | }
1102 | ],
1103 | "source": [
1104 | "## aggregation on grouped data by country,converted\n",
1105 | "df.groupBy(['country','converted']).agg({'total_pages_visited':'mean'}).show()"
1106 | ]
1107 | },
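1108 | {
1109 | "cell_type": "markdown",
1110 | "metadata": {},
1111 | "source": [
1112 | "The dictionary form of `agg` allows only one aggregation per column; for several named aggregations in a single pass, the functions in `pyspark.sql.functions` can be passed directly, with `alias` naming the output columns. A minimal sketch (the alias names are just illustrative):"
1113 | ]
1114 | },
1115 | {
1116 | "cell_type": "code",
1117 | "execution_count": null,
1118 | "metadata": {},
1119 | "outputs": [],
1120 | "source": [
1121 | "from pyspark.sql import functions as F\n",
1122 | "\n",
1123 | "#sketch: several named aggregations per group in a single pass\n",
1124 | "df.groupBy('country').agg(\n",
1125 | "    F.max('age').alias('max_age'),\n",
1126 | "    F.avg('total_pages_visited').alias('avg_pages'),\n",
1127 | "    F.mean('converted').alias('conversion_rate')\n",
1128 | ").show()"
1129 | ]
1130 | },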
1108 | {
1109 | "cell_type": "code",
1110 | "execution_count": null,
1111 | "metadata": {},
1112 | "outputs": [],
1113 | "source": []
1114 | }
1115 | ],
1116 | "metadata": {
1117 | "kernelspec": {
1118 | "display_name": "Python 3",
1119 | "language": "python",
1120 | "name": "python3"
1121 | },
1122 | "language_info": {
1123 | "codemirror_mode": {
1124 | "name": "ipython",
1125 | "version": 3
1126 | },
1127 | "file_extension": ".py",
1128 | "mimetype": "text/x-python",
1129 | "name": "python",
1130 | "nbconvert_exporter": "python",
1131 | "pygments_lexer": "ipython3",
1132 | "version": "3.6.3"
1133 | }
1134 | },
1135 | "nbformat": 4,
1136 | "nbformat_minor": 2
1137 | }
1138 |
--------------------------------------------------------------------------------