├── chap_2
├── chap_3
│   ├── .ipynb_checkpoints
│   │   ├── Spark Structured Streaming-checkpoint.ipynb
│   │   ├── Spark Structured Streaming-ver_1-checkpoint.ipynb
│   │   ├── Spark Structured Streaming app-checkpoint.ipynb
│   │   ├── Spark Structured Streaming demo-checkpoint.ipynb
│   │   ├── Logistic_resgression_pyspark-checkpoint.ipynb
│   │   └── pyspark_basics-checkpoint.ipynb
│   └── Spark Structured Streaming demo.ipynb
├── chap_4
│   └── pramod_dag.py
├── chap_5
│   ├── Classification_using_MLlib.ipynb
│   └── .ipynb_checkpoints
│       └── Classification_using_MLlib-checkpoint.ipynb
├── chap_6
├── chap_7
├── chap_8
│   ├── .ipynb_checkpoints
│   │   └── multilayer perceptron-checkpoint.ipynb
│   └── Multilayer_perceptron_spark.ipynb
├── 9781484249604.jpg
├── errata.md
├── README.md
├── Contributing.md
└── LICENSE.txt

/9781484249604.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/learn-pyspark/HEAD/9781484249604.jpg
--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
1 | # Errata for *Learn PySpark*
2 |
3 | On **page xx** [Summary of error]:
4 |
5 | Details of error here. Highlight key pieces in **bold**.
6 |
7 | ***
8 |
9 | On **page xx** [Summary of error]:
10 |
11 | Details of error here. Highlight key pieces in **bold**.
12 |
13 | ***
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Apress Source Code
2 |
3 | This repository accompanies [*Learn PySpark*](https://www.apress.com/9781484249604) by Pramod Singh (Apress, 2019).
4 |
5 | [comment]: #cover
6 | ![Cover image](9781484249604.jpg)
7 |
8 | Download the files as a zip using the green button, or clone the repository to your machine using Git.
9 |
10 | ## Releases
11 |
12 | Release v1.0 corresponds to the code in the published book, without corrections or updates.
13 |
14 | ## Contributions
15 |
16 | See the file Contributing.md for more information on how you can contribute to this repository.
--------------------------------------------------------------------------------
/Contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to Apress Source Code
2 |
3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers.
4 |
5 | ## How to Contribute
6 |
7 | 1. Make sure you have a GitHub account.
8 | 2. Fork the repository for the relevant book.
9 | 3. Create a new branch on which to make your change, e.g.
10 |    `git checkout -b my_code_contribution`
11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted.
12 | 5. Submit a pull request.
13 |
14 | Thank you for your contribution!
--------------------------------------------------------------------------------
/chap_4/pramod_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import airflow
4 | from airflow import DAG
5 | from airflow.operators.bash_operator import BashOperator
6 |
7 |
8 |
9 | args = {
10 |     'owner': 'Pramod',
11 |     'start_date': airflow.utils.dates.days_ago(3),
12 |     # 'end_date': datetime(2018, 12, 30),
13 |     'depends_on_past': False,
14 |     'email': ['airflow@example.com'],
15 |     'email_on_failure': False,
16 |     'email_on_retry': False,
17 |     # If a task fails, retry it once after waiting
18 |     # at least 5 minutes
19 |     'retries': 1,
20 |     'retry_delay': timedelta(minutes=5),
21 | }
22 |
23 |
24 | dag = DAG(
25 |     'pramod_airflow_dag',
26 |     default_args=args,
27 |     description='A simple DAG',
28 |     # Continue to run DAG once per day
29 |     schedule_interval=timedelta(days=1)
30 | )
31 |
32 |
33 | # t1 and t2 are examples of tasks created by instantiating operators
34 | t1 = BashOperator(
35 |     task_id='print_date',
36 |     bash_command='date',
37 |     dag=dag,
38 | )
39 |
40 | t2 = BashOperator(
41 |     task_id='sleep',
42 |     depends_on_past=False,
43 |     bash_command='sleep 5',
44 |     dag=dag,
45 | )
46 |
47 |
48 |
49 | t1 >> t2
50 |
51 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Freeware License, some rights reserved
2 |
3 | Copyright (c) 2019 Pramod Singh
4 |
5 | Permission is hereby granted, free of charge, to anyone obtaining a copy
6 | of this software and associated documentation files (the "Software"),
7 | to work with the Software within the limits of freeware distribution and fair use.
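
A note on pramod_dag.py above: `t1 >> t2` is Airflow's bit-shift syntax for "t2 runs after t1". As a hedged sketch of how the same syntax fans out to several downstream tasks at once (the DAG id, the third task, and its command are illustrative, not part of the repo):

from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

dag = DAG(
    'fanout_example_dag',          # hypothetical DAG id
    default_args={'owner': 'Pramod',
                  'start_date': airflow.utils.dates.days_ago(3)},
    schedule_interval=timedelta(days=1),
)

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)
t2 = BashOperator(task_id='sleep', bash_command='sleep 5', dag=dag)
t3 = BashOperator(task_id='echo_done', bash_command='echo done', dag=dag)

# t1 runs first; t2 and t3 both depend on it and can run in parallel
t1 >> [t2, t3]
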
8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#import SparkSession\n", 19 | "from pyspark.sql import SparkSession\n", 20 | "spark=SparkSession.builder.appName('ss').getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 20, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql.functions import *\n", 30 | "from pyspark.sql.types import *" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 22, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "#create sample dataset\n", 40 | "df_1=spark.createDataFrame([('pramod neha',),('pramod ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 23, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "#define schema for input data\n", 50 | "schema=StructType().add('name','string')\n", 51 | "name_list=spark.readStream.schema(schema).format('parquet').load(\"new_folder\")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 24, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#split the names into individual names\n", 61 | "names=name_list.select(explode(split(name_list.name,' ')).alias('name'))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 25, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "name_count=names.groupBy('name').count()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 27, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "#query to write the results into memory sink\n", 80 | "query=(name_count.writeStream.queryName('new_query').outputMode('complete').format('memory').start())" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 31, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
namecount
0ziaan1
1neha1
2pramod2
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " name count\n", 122 | "0 ziaan 1\n", 123 | "1 neha 1\n", 124 | "2 pramod 2" 125 | ] 126 | }, 127 | "execution_count": 31, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "spark.sql(\"select * from new_query order by count \").toPandas().head(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 32, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df_2=spark.createDataFrame([('ziaan neha',),('ziaan ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 35, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/html": [ 153 | "
\n", 154 | "\n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
namecount
0pramod2
1neha3
2ziaan5
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " name count\n", 184 | "0 pramod 2\n", 185 | "1 neha 3\n", 186 | "2 ziaan 5" 187 | ] 188 | }, 189 | "execution_count": 35, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "spark.sql(\"select * from new_query order by count \").toPandas().head(50)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 34, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "df_3=spark.createDataFrame([('neha',),('ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.3" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming-ver_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#import SparkSession\n", 19 | "from pyspark.sql import SparkSession\n", 20 | "spark=SparkSession.builder.appName('ss').getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 20, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql.functions import *\n", 30 | "from pyspark.sql.types import *" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 22, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "#create sample dataset\n", 40 | "df_1=spark.createDataFrame([('pramod neha',),('pramod ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 23, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "#define schema for input data\n", 50 | "schema=StructType().add('name','string')\n", 51 | "name_list=spark.readStream.schema(schema).format('parquet').load(\"new_folder\")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 24, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#split the names into individual names\n", 61 | "names=name_list.select(explode(split(name_list.name,' ')).alias('name'))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 25, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "name_count=names.groupBy('name').count()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 27, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "#query to write the results into memory sink\n", 80 | "query=(name_count.writeStream.queryName('new_query').outputMode('complete').format('memory').start())" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | 
"execution_count": 31, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
namecount
0ziaan1
1neha1
2pramod2
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " name count\n", 122 | "0 ziaan 1\n", 123 | "1 neha 1\n", 124 | "2 pramod 2" 125 | ] 126 | }, 127 | "execution_count": 31, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "spark.sql(\"select * from new_query order by count \").toPandas().head(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 32, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df_2=spark.createDataFrame([('ziaan neha',),('ziaan ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 35, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/html": [ 153 | "
\n", 154 | "\n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
namecount
0pramod2
1neha3
2ziaan5
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " name count\n", 184 | "0 pramod 2\n", 185 | "1 neha 3\n", 186 | "2 ziaan 5" 187 | ] 188 | }, 189 | "execution_count": 35, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "spark.sql(\"select * from new_query order by count \").toPandas().head(50)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 34, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "df_3=spark.createDataFrame([('neha',),('ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.3" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /chap_8/.ipynb_checkpoints/multilayer perceptron-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load the libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import numpy as np\n", 18 | "import pandas as pd\n", 19 | "from pyspark.sql.types import *\n", 20 | "from pyspark.ml import Pipeline\n", 21 | "from pyspark.sql import functions as f\n", 22 | "from pyspark.sql.functions import udf, StringType\n", 23 | "from pyspark.sql import SparkSession, functions as F\n", 24 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 25 | "from pyspark.ml.classification import MultilayerPerceptronClassifier\n", 26 | "from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# Initialize Spark Session" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "spark = SparkSession.builder.appName('pyspark-dl').getOrCreate()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Read the Dataset" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "web_data = spark.read.csv('data_set.csv', header=True, inferSchema=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "root\n", 71 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 72 | " |-- Page_Views_Normalized: double (nullable = true)\n", 73 | " |-- Orders_Normalized: integer (nullable = true)\n", 74 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 75 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 76 | " |-- Email_Signup_Normalized: 
double (nullable = true)\n", 77 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 78 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 79 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 80 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 81 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 82 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 83 | " |-- Mapped_Site_Section: string (nullable = true)\n", 84 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 85 | " |-- Maped_Product_Name: string (nullable = true)\n", 86 | " |-- Mapped_Search_Term: string (nullable = true)\n", 87 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 88 | "\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "web_data.printSchema()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "# Rename Target Column" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "web_data_renamed = web_data.withColumnRenamed('Orders_Normalized', 'label')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "root\n", 122 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 123 | " |-- Page_Views_Normalized: double (nullable = true)\n", 124 | " |-- label: integer (nullable = true)\n", 125 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 126 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 127 | " |-- Email_Signup_Normalized: double (nullable = true)\n", 128 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 129 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 130 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 131 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 132 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 133 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 134 | " |-- Mapped_Site_Section: string (nullable = true)\n", 135 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 136 | " |-- Maped_Product_Name: string (nullable = true)\n", 137 | " |-- Mapped_Search_Term: string (nullable = true)\n", 138 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 139 | "\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "web_data_renamed.printSchema()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# Split the dataset into Train, Validation and Test" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 7, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "train, validation, test = web_data_renamed.randomSplit([0.7, 0.2, 0.1], 1234)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Build Pipeline" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 8, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "categorical_columns = [item[0] for item in web_data_renamed.dtypes if item[1].startswith('string')]\n", 177 | "numeric_columns = [item[0] for item in web_data_renamed.dtypes if item[1].startswith('double')]" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 9, 183 | "metadata": {}, 184 | 
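
The two list comprehensions above split the columns by their Spark type string: `string` columns are treated as categorical and `double` columns as numeric (the integer `label` column falls into neither list). A toy sketch with hypothetical columns:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('dtypes_demo').getOrCreate()
toy = spark.createDataFrame([('Chrome', 0.1, 1)],
                            ['browser', 'page_views', 'label'])

# df.dtypes is a list of (column_name, type_string) pairs
categorical = [c for c, t in toy.dtypes if t.startswith('string')]  # ['browser']
numeric = [c for c, t in toy.dtypes if t.startswith('double')]      # ['page_views']
print(categorical, numeric)
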
"outputs": [], 185 | "source": [ 186 | "indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 10, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol=\"features\")\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 11, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "layers = [len(featuresCreator.getInputCols()), 4, 2, 2]\n", 205 | "\n", 206 | "classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 12, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "# Fit Pipeline" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "model = pipeline.fit(train)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "# Get Pipeline Output" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 14, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "train_output_df = model.transform(train)\n", 248 | "validation_output_df = model.transform(validation)\n", 249 | "test_output_df = model.transform(test)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Evaluate the Predictions" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 15, 262 | "metadata": { 263 | "scrolled": true 264 | }, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "Train weightedPrecision = 0.976101874447846\n", 271 | "Validation weightedPrecision = 0.9765821626938243\n", 272 | "Test weightedPrecision = 0.9747324280445043\n", 273 | "Train weightedRecall = 0.9755751041220662\n", 274 | "Validation weightedRecall = 0.9761613691931541\n", 275 | "Test weightedRecall = 0.9742582305920606\n", 276 | "Train accuracy = 0.975575104122066\n", 277 | "Validation accuracy = 0.976161369193154\n", 278 | "Test accuracy = 0.9742582305920607\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "train_predictionAndLabels = train_output_df.select(\"prediction\", \"label\")\n", 284 | "validation_predictionAndLabels = validation_output_df.select(\"prediction\", \"label\")\n", 285 | "test_predictionAndLabels = test_output_df.select(\"prediction\", \"label\")\n", 286 | "\n", 287 | "metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']\n", 288 | "\n", 289 | "for metric in metrics:\n", 290 | " evaluator = MulticlassClassificationEvaluator(metricName=metric)\n", 291 | " print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))\n", 292 | " print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))\n", 293 | " print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))" 294 | ] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3", 300 | "language": "python", 
301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.6.3" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 2 318 | } 319 | -------------------------------------------------------------------------------- /chap_8/Multilayer_perceptron_spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load the libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "from pyspark.sql.types import *\n", 22 | "from pyspark.ml import Pipeline\n", 23 | "from pyspark.sql import functions as f\n", 24 | "from pyspark.sql.functions import udf, StringType\n", 25 | "from pyspark.sql import SparkSession, functions as F\n", 26 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 27 | "from pyspark.ml.classification import MultilayerPerceptronClassifier\n", 28 | "from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Initialize Spark Session" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "spark = SparkSession.builder.appName('deep_learning').getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Read the Dataset" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "data = spark.read.csv('dl_data.csv', header=True, inferSchema=True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "root\n", 77 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 78 | " |-- Page_Views_Normalized: double (nullable = true)\n", 79 | " |-- Orders_Normalized: integer (nullable = true)\n", 80 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 81 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 82 | " |-- Email_Signup_Normalized: double (nullable = true)\n", 83 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 84 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 85 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 86 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 87 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 88 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 89 | " |-- Mapped_Site_Section: string (nullable = true)\n", 90 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 91 | " |-- Maped_Product_Name: string (nullable = true)\n", 92 | " |-- Mapped_Search_Term: string (nullable = true)\n", 93 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 94 | "\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | 
"data.printSchema()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Rename Target Column" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "data = data.withColumnRenamed('Orders_Normalized', 'label')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "root\n", 130 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 131 | " |-- Page_Views_Normalized: double (nullable = true)\n", 132 | " |-- label: integer (nullable = true)\n", 133 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 134 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 135 | " |-- Email_Signup_Normalized: double (nullable = true)\n", 136 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 137 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 138 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 139 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 140 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 141 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 142 | " |-- Mapped_Site_Section: string (nullable = true)\n", 143 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 144 | " |-- Maped_Product_Name: string (nullable = true)\n", 145 | " |-- Mapped_Search_Term: string (nullable = true)\n", 146 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "data.printSchema()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Split the dataset into Train, Validation and Test" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 7, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "train, validation, test = data.randomSplit([0.7, 0.2, 0.1], 1234)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "# Build Pipeline" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]\n", 189 | "numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('double')]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 10, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol=\"features\")\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 11, 217 | "metadata": { 218 | "collapsed": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "layers = [len(featuresCreator.getInputCols()), 4, 2, 2]\n", 223 | "\n", 
224 | "classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)\n" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 12, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "# Fit Pipeline" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 13, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "model = pipeline.fit(train)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "# Get Pipeline Output" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 14, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "train_output_df = model.transform(train)\n", 272 | "validation_output_df = model.transform(validation)\n", 273 | "test_output_df = model.transform(test)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "# Evaluate the Predictions" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 15, 286 | "metadata": { 287 | "scrolled": true 288 | }, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Train weightedPrecision = 0.976101874447846\n", 295 | "Validation weightedPrecision = 0.9765821626938243\n", 296 | "Test weightedPrecision = 0.9747324280445043\n", 297 | "Train weightedRecall = 0.9755751041220662\n", 298 | "Validation weightedRecall = 0.9761613691931541\n", 299 | "Test weightedRecall = 0.9742582305920606\n", 300 | "Train accuracy = 0.975575104122066\n", 301 | "Validation accuracy = 0.976161369193154\n", 302 | "Test accuracy = 0.9742582305920607\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "train_predictionAndLabels = train_output_df.select(\"prediction\", \"label\")\n", 308 | "validation_predictionAndLabels = validation_output_df.select(\"prediction\", \"label\")\n", 309 | "test_predictionAndLabels = test_output_df.select(\"prediction\", \"label\")\n", 310 | "\n", 311 | "metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']\n", 312 | "\n", 313 | "for metric in metrics:\n", 314 | " evaluator = MulticlassClassificationEvaluator(metricName=metric)\n", 315 | " print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))\n", 316 | " print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))\n", 317 | " print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.7.0" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 2 342 | } 343 | -------------------------------------------------------------------------------- /chap_5/Classification_using_MLlib.ipynb: 
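
The fitted chap_8 pipeline above (`model = pipeline.fit(train)`) can be persisted and reloaded like any PipelineModel; a minimal sketch (the save path is hypothetical):

from pyspark.ml import PipelineModel

model.write().overwrite().save('mlp_pipeline_model')
reloaded = PipelineModel.load('mlp_pipeline_model')
reloaded.transform(test).select('prediction', 'label').show(5)
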
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#import SparkSession\n", 12 | "from pyspark.sql import SparkSession\n", 13 | "spark=SparkSession.builder.appName('binary_class').getOrCreate()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#read the dataset\n", 25 | "df=spark.read.csv('classification_data.csv',inferSchema=True,header=True)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "#check the shape of the data \n", 37 | "print((df.count(),len(df.columns)))" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "#printSchema\n", 49 | "df.printSchema()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "#number of columns in dataset\n", 61 | "df.columns" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "#view the dataset\n", 73 | "df.show(5)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "#Exploratory Data Analysis\n", 85 | "df.describe().show()\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "df.groupBy('label').count().show()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "df.groupBy('loan_purpose').count().show()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "#converting categorical data to numerical form" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 3, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "#import required libraries\n", 130 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", 131 | "\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 4, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "loan_purpose_indexer = StringIndexer(inputCol=\"loan_purpose\", outputCol=\"loan_index\").fit(df)\n", 143 | "df = loan_purpose_indexer.transform(df)\n", 144 | "loan_encoder = OneHotEncoder(inputCol=\"loan_index\", outputCol=\"loan_purpose_vec\")\n", 145 | "df = loan_encoder.transform(df)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "df.select(['loan_purpose','loan_index','loan_purpose_vec']).show(3,False)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | 
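
The two steps above turn `loan_purpose` into model-ready numbers: `StringIndexer` assigns each category a frequency-ranked index, and `OneHotEncoder` (the Spark 2.x transformer API used in this notebook) expands that index into a sparse vector. A toy sketch with hypothetical values:

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer

spark = SparkSession.builder.appName('ohe_demo').getOrCreate()
toy = spark.createDataFrame([('home',), ('car',), ('home',), ('education',)],
                            ['loan_purpose'])

indexed = (StringIndexer(inputCol='loan_purpose', outputCol='loan_index')
           .fit(toy).transform(toy))
encoded = OneHotEncoder(inputCol='loan_index',
                        outputCol='loan_purpose_vec').transform(indexed)
encoded.show(truncate=False)
# 'home' (most frequent) -> index 0.0 -> sparse vector (2,[0],[1.0]);
# the last index is dropped by default, so 3 categories give length-2 vectors
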
"execution_count": 5, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "from pyspark.ml.feature import VectorAssembler" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": true 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "df.columns" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 6, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "df_assembler = VectorAssembler(inputCols=['is_first_loan',\n", 190 | " 'total_credit_card_limit',\n", 191 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 192 | " 'saving_amount',\n", 193 | " 'checking_amount',\n", 194 | " 'is_employed',\n", 195 | " 'yearly_salary',\n", 196 | " 'age',\n", 197 | " 'dependent_number',\n", 198 | " 'loan_purpose_vec'], outputCol=\"features\")\n", 199 | "df = df_assembler.transform(df)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "df.printSchema()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "df.select(['features','label']).show(10,False)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "#select data for building model\n", 233 | "model_df=df.select(['features','label'])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "from pyspark.ml.classification import LogisticRegression" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 8, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "#split the data \n", 256 | "training_df,test_df=model_df.randomSplit([0.75,0.25])" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "training_df.count()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "training_df.groupBy('label').count().show()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "test_df.count()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "test_df.groupBy('label').count().show()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "log_reg=LogisticRegression().fit(training_df)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "collapsed": true 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "#Training Results" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 
329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "lr_summary=log_reg.summary" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "lr_summary.accuracy" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "lr_summary.areaUnderROC" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "print(lr_summary.precisionByLabel)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "print(lr_summary.recallByLabel)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "predictions = log_reg.transform(test_df)\n", 389 | "predictions.show(10)\n" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "collapsed": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "model_predictions = log_reg.transform(test_df)\n" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "model_predictions = log_reg.evaluate(test_df)\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": { 418 | "collapsed": true 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "model_predictions.accuracy" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": { 429 | "collapsed": true 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "model_predictions.weightedPrecision" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "model_predictions.recallByLabel" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "print(model_predictions.precisionByLabel)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": true 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "model_predictions.areaUnderROC" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 9, 472 | "metadata": { 473 | "collapsed": true 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "from pyspark.ml.classification import RandomForestClassifier\n", 478 | "rf = RandomForestClassifier()\n", 479 | "rf_model = rf.fit(training_df)\n" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 10, 485 | "metadata": { 486 | "collapsed": true 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "model_predictions = rf_model.transform(test_df)\n" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 11, 496 | "metadata": { 497 | "collapsed": true 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "from pyspark.ml.tuning import ParamGridBuilder, 
CrossValidator\n", 502 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 503 | "\n", 504 | "evaluator = BinaryClassificationEvaluator()\n", 505 | "\n", 506 | "rf = RandomForestClassifier()\n", 507 | "paramGrid = (ParamGridBuilder()\n", 508 | " .addGrid(rf.maxDepth, [5,10,20,25,30])\n", 509 | " .addGrid(rf.maxBins, [20,30,40 ])\n", 510 | " .addGrid(rf.numTrees, [5, 20,50])\n", 511 | " .build())\n", 512 | "cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)\n", 513 | "cv_model = cv.fit(training_df)" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 12, 519 | "metadata": { 520 | "collapsed": true 521 | }, 522 | "outputs": [], 523 | "source": [ 524 | "best_rf_model = cv_model.bestModel" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 13, 530 | "metadata": { 531 | "collapsed": true 532 | }, 533 | "outputs": [], 534 | "source": [ 535 | "# Generate predictions for entire dataset\n", 536 | "model_predictions = best_rf_model.transform(test_df)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 14, 542 | "metadata": { 543 | "collapsed": true 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()\n", 548 | "actual_pos=model_predictions.filter(model_predictions['label']==1).count()\n", 549 | "pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 15, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/plain": [ 560 | "0.912426614481409" 561 | ] 562 | }, 563 | "execution_count": 15, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "#Recall \n", 570 | "float(true_pos)/(actual_pos)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 16, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "data": { 580 | "text/plain": [ 581 | "0.8562901744719926" 582 | ] 583 | }, 584 | "execution_count": 16, 585 | "metadata": {}, 586 | "output_type": "execute_result" 587 | } 588 | ], 589 | "source": [ 590 | "#Precision on test Data \n", 591 | "float(true_pos)/(pred_pos)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": { 598 | "collapsed": true 599 | }, 600 | "outputs": [], 601 | "source": [] 602 | } 603 | ], 604 | "metadata": { 605 | "kernelspec": { 606 | "display_name": "Python 3", 607 | "language": "python", 608 | "name": "python3" 609 | }, 610 | "language_info": { 611 | "codemirror_mode": { 612 | "name": "ipython", 613 | "version": 3 614 | }, 615 | "file_extension": ".py", 616 | "mimetype": "text/x-python", 617 | "name": "python", 618 | "nbconvert_exporter": "python", 619 | "pygments_lexer": "ipython3", 620 | "version": "3.7.0" 621 | } 622 | }, 623 | "nbformat": 4, 624 | "nbformat_minor": 2 625 | } 626 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming app-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | 
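
The grid above enumerates 5 x 3 x 3 = 45 parameter combinations, and with `numFolds=5` each one is fitted five times; `cv_model.avgMetrics` then holds one score per combination, averaged over the folds (the `BinaryClassificationEvaluator` default metric is area under ROC). A hedged sketch for inspecting the winner:

import numpy as np

print(len(cv_model.avgMetrics))               # 45 combinations
best_idx = int(np.argmax(cv_model.avgMetrics))
print(paramGrid[best_idx])                    # winning parameter map
print(cv_model.avgMetrics[best_idx])          # its mean AUC across folds
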
"spark=SparkSession.builder.appName('structured_streaming').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark.sql.functions as F\n", 21 | "from pyspark.sql.types import *" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#create sample dataset\n", 31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n", 32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#define schema for input data\n", 42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n", 43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"csv_folder\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "root\n", 56 | " |-- user_id: string (nullable = true)\n", 57 | " |-- app: string (nullable = true)\n", 58 | " |-- time_in_secs: integer (nullable = true)\n", 59 | " |-- age: integer (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "data.printSchema()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "app_count=data.groupBy('app').count()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 10, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
appcount
0Insta1
1FB1
2Twitter1
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " app count\n", 125 | "0 Insta 1\n", 126 | "1 FB 1\n", 127 | "2 Twitter 1" 128 | ] 129 | }, 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "spark.sql(\"select * from count_query \").toPandas().head(5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "fb_data=data.filter(data['app']=='FB')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 11, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 12, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/html": [ 174 | "
\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
user_idavg(time_in_secs)
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | "Empty DataFrame\n", 190 | "Columns: [user_id, avg(time_in_secs)]\n", 191 | "Index: []" 192 | ] 193 | }, 194 | "execution_count": 12, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 5, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n", 210 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 14, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/html": [ 221 | "
\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | " user_id avg(time_in_secs)\n", 242 | "0 XN203 300.0" 243 | ] 244 | }, 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 261 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 16, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " user_id avg(time_in_secs)\n", 293 | "0 XN203 300.0" 294 | ] 295 | }, 296 | "execution_count": 16, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 312 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 18, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "#app wise time spent\n", 322 | "\n", 323 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 19, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 27, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " app total_time\n", 374 | "0 FB 3410\n", 375 | "1 Insta 560\n", 376 | "2 Twitter 210" 377 | ] 378 | }, 379 | "execution_count": 27, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 11, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 395 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 26, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/html": [ 406 | "
\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 433 | "
" 434 | ], 435 | "text/plain": [ 436 | " app total_time\n", 437 | "0 FB 3410\n", 438 | "1 Insta 560\n", 439 | "2 Twitter 210" 440 | ] 441 | }, 442 | "execution_count": 26, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 28, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# app wise mean age \n", 458 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "scrolled": true 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 30, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/html": [ 480 | "
\n", 481 | "\n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 507 | "
" 508 | ], 509 | "text/plain": [ 510 | " app mean_age\n", 511 | "0 Twitter 38.500000\n", 512 | "1 FB 30.571429\n", 513 | "2 Insta 25.500000" 514 | ] 515 | }, 516 | "execution_count": 30, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 15, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n", 532 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 32, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/html": [ 543 | "
\n", 544 | "\n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 570 | "
" 571 | ], 572 | "text/plain": [ 573 | " app mean_age\n", 574 | "0 Twitter 38.500000\n", 575 | "1 FB 30.571429\n", 576 | "2 Insta 25.500000" 577 | ] 578 | }, 579 | "execution_count": 32, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 6, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "name": "stdout", 595 | "output_type": "stream", 596 | "text": [ 597 | "+-------+---------+\n", 598 | "| app|full_name|\n", 599 | "+-------+---------+\n", 600 | "| FB| FACEBOOK|\n", 601 | "| Insta|INSTAGRAM|\n", 602 | "|Twitter| TWITTER|\n", 603 | "+-------+---------+\n", 604 | "\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "# Join static dataframe with streaming dataframe\n", 610 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n", 611 | "app_df.show()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 7, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "app_stream_df=data.join(app_df,'app')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 8, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 14, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "data": { 639 | "text/html": [ 640 | "
\n", 641 | "\n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | "
appuser_idtime_in_secsagefull_name
0FBXN2011019FACEBOOK
1FBXN20310030FACEBOOK
2FBXN20330030FACEBOOK
3FBXN202200045FACEBOOK
4InstaXN20250045INSTAGRAM
5TwitterXN2011019TWITTER
6FBXN20350030FACEBOOK
7InstaXN2013019INSTAGRAM
8TwitterXN20210045TWITTER
9FBXN20350030FACEBOOK
10InstaXN2013019INSTAGRAM
11TwitterXN20210045TWITTER
\n", 751 | "
" 752 | ], 753 | "text/plain": [ 754 | " app user_id time_in_secs age full_name\n", 755 | "0 FB XN201 10 19 FACEBOOK\n", 756 | "1 FB XN203 100 30 FACEBOOK\n", 757 | "2 FB XN203 300 30 FACEBOOK\n", 758 | "3 FB XN202 2000 45 FACEBOOK\n", 759 | "4 Insta XN202 500 45 INSTAGRAM\n", 760 | "5 Twitter XN201 10 19 TWITTER\n", 761 | "6 FB XN203 500 30 FACEBOOK\n", 762 | "7 Insta XN201 30 19 INSTAGRAM\n", 763 | "8 Twitter XN202 100 45 TWITTER\n", 764 | "9 FB XN203 500 30 FACEBOOK\n", 765 | "10 Insta XN201 30 19 INSTAGRAM\n", 766 | "11 Twitter XN202 100 45 TWITTER" 767 | ] 768 | }, 769 | "execution_count": 14, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "spark.sql(\"select * from join_query \").toPandas().head(50)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": null, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [] 791 | } 792 | ], 793 | "metadata": { 794 | "kernelspec": { 795 | "display_name": "Python 3", 796 | "language": "python", 797 | "name": "python3" 798 | }, 799 | "language_info": { 800 | "codemirror_mode": { 801 | "name": "ipython", 802 | "version": 3 803 | }, 804 | "file_extension": ".py", 805 | "mimetype": "text/x-python", 806 | "name": "python", 807 | "nbconvert_exporter": "python", 808 | "pygments_lexer": "ipython3", 809 | "version": "3.6.3" 810 | } 811 | }, 812 | "nbformat": 4, 813 | "nbformat_minor": 2 814 | } 815 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming demo-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('structured_streaming').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark.sql.functions as F\n", 21 | "from pyspark.sql.types import *" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#create sample dataset\n", 31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n", 32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#define schema for input data\n", 42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n", 43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"csv_folder\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "root\n", 56 | " |-- user_id: string (nullable = true)\n", 57 | " |-- app: string (nullable = true)\n", 58 | " |-- time_in_secs: integer (nullable = true)\n", 59 | " |-- age: integer (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | 
"source": [ 65 | "data.printSchema()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "app_count=data.groupBy('app').count()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 10, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
appcount
0Insta1
1FB1
2Twitter1
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " app count\n", 125 | "0 Insta 1\n", 126 | "1 FB 1\n", 127 | "2 Twitter 1" 128 | ] 129 | }, 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "spark.sql(\"select * from count_query \").toPandas().head(5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "fb_data=data.filter(data['app']=='FB')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 11, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 12, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/html": [ 174 | "
\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
user_idavg(time_in_secs)
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | "Empty DataFrame\n", 190 | "Columns: [user_id, avg(time_in_secs)]\n", 191 | "Index: []" 192 | ] 193 | }, 194 | "execution_count": 12, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 5, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n", 210 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 14, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/html": [ 221 | "
\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | " user_id avg(time_in_secs)\n", 242 | "0 XN203 300.0" 243 | ] 244 | }, 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 261 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 16, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " user_id avg(time_in_secs)\n", 293 | "0 XN203 300.0" 294 | ] 295 | }, 296 | "execution_count": 16, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 312 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 18, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "#app wise time spent\n", 322 | "\n", 323 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 19, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 27, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " app total_time\n", 374 | "0 FB 3410\n", 375 | "1 Insta 560\n", 376 | "2 Twitter 210" 377 | ] 378 | }, 379 | "execution_count": 27, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 11, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 395 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 26, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/html": [ 406 | "
\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 433 | "
" 434 | ], 435 | "text/plain": [ 436 | " app total_time\n", 437 | "0 FB 3410\n", 438 | "1 Insta 560\n", 439 | "2 Twitter 210" 440 | ] 441 | }, 442 | "execution_count": 26, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 28, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# app wise mean age \n", 458 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "scrolled": true 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 30, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/html": [ 480 | "
\n", 481 | "\n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 507 | "
" 508 | ], 509 | "text/plain": [ 510 | " app mean_age\n", 511 | "0 Twitter 38.500000\n", 512 | "1 FB 30.571429\n", 513 | "2 Insta 25.500000" 514 | ] 515 | }, 516 | "execution_count": 30, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 15, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n", 532 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 32, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/html": [ 543 | "
\n", 544 | "\n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 570 | "
" 571 | ], 572 | "text/plain": [ 573 | " app mean_age\n", 574 | "0 Twitter 38.500000\n", 575 | "1 FB 30.571429\n", 576 | "2 Insta 25.500000" 577 | ] 578 | }, 579 | "execution_count": 32, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 6, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "name": "stdout", 595 | "output_type": "stream", 596 | "text": [ 597 | "+-------+---------+\n", 598 | "| app|full_name|\n", 599 | "+-------+---------+\n", 600 | "| FB| FACEBOOK|\n", 601 | "| Insta|INSTAGRAM|\n", 602 | "|Twitter| TWITTER|\n", 603 | "+-------+---------+\n", 604 | "\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "# Join static dataframe with streaming dataframe\n", 610 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n", 611 | "app_df.show()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 7, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "app_stream_df=data.join(app_df,'app')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 8, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 14, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "data": { 639 | "text/html": [ 640 | "
\n", 641 | "\n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | "
appuser_idtime_in_secsagefull_name
0FBXN2011019FACEBOOK
1FBXN20310030FACEBOOK
2FBXN20330030FACEBOOK
3FBXN202200045FACEBOOK
4InstaXN20250045INSTAGRAM
5TwitterXN2011019TWITTER
6FBXN20350030FACEBOOK
7InstaXN2013019INSTAGRAM
8TwitterXN20210045TWITTER
9FBXN20350030FACEBOOK
10InstaXN2013019INSTAGRAM
11TwitterXN20210045TWITTER
\n", 751 | "
" 752 | ], 753 | "text/plain": [ 754 | " app user_id time_in_secs age full_name\n", 755 | "0 FB XN201 10 19 FACEBOOK\n", 756 | "1 FB XN203 100 30 FACEBOOK\n", 757 | "2 FB XN203 300 30 FACEBOOK\n", 758 | "3 FB XN202 2000 45 FACEBOOK\n", 759 | "4 Insta XN202 500 45 INSTAGRAM\n", 760 | "5 Twitter XN201 10 19 TWITTER\n", 761 | "6 FB XN203 500 30 FACEBOOK\n", 762 | "7 Insta XN201 30 19 INSTAGRAM\n", 763 | "8 Twitter XN202 100 45 TWITTER\n", 764 | "9 FB XN203 500 30 FACEBOOK\n", 765 | "10 Insta XN201 30 19 INSTAGRAM\n", 766 | "11 Twitter XN202 100 45 TWITTER" 767 | ] 768 | }, 769 | "execution_count": 14, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "spark.sql(\"select * from join_query \").toPandas().head(50)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": null, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [] 791 | } 792 | ], 793 | "metadata": { 794 | "kernelspec": { 795 | "display_name": "Python 3", 796 | "language": "python", 797 | "name": "python3" 798 | }, 799 | "language_info": { 800 | "codemirror_mode": { 801 | "name": "ipython", 802 | "version": 3 803 | }, 804 | "file_extension": ".py", 805 | "mimetype": "text/x-python", 806 | "name": "python", 807 | "nbconvert_exporter": "python", 808 | "pygments_lexer": "ipython3", 809 | "version": "3.6.3" 810 | } 811 | }, 812 | "nbformat": 4, 813 | "nbformat_minor": 2 814 | } 815 | -------------------------------------------------------------------------------- /chap_3/Spark Structured Streaming demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('structured_streaming').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark.sql.functions as F\n", 21 | "from pyspark.sql.types import *" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#create sample dataset\n", 31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n", 32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#define schema for input data\n", 42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n", 43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"demo\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "root\n", 56 | " |-- user_id: string (nullable = true)\n", 57 | " |-- app: string (nullable = true)\n", 58 | " |-- time_in_secs: integer (nullable = true)\n", 59 | " |-- age: integer (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "data.printSchema()" 66 | ] 67 
| }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "app_count=data.groupBy('app').count()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 8, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
appcount
0Insta1
1FB1
2Twitter1
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " app count\n", 125 | "0 Insta 1\n", 126 | "1 FB 1\n", 127 | "2 Twitter 1" 128 | ] 129 | }, 130 | "execution_count": 8, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "spark.sql(\"select * from count_query \").toPandas().head(5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "fb_data=data.filter(data['app']=='FB')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 11, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 13, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/html": [ 174 | "
\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 191 | "
" 192 | ], 193 | "text/plain": [ 194 | " user_id avg(time_in_secs)\n", 195 | "0 XN203 300.0" 196 | ] 197 | }, 198 | "execution_count": 13, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 21, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n", 214 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 23, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/html": [ 225 | "
\n", 226 | "\n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | "
user_idavg(time_in_secs)
0XN203200.0
1XN20110.0
\n", 247 | "
" 248 | ], 249 | "text/plain": [ 250 | " user_id avg(time_in_secs)\n", 251 | "0 XN203 200.0\n", 252 | "1 XN201 10.0" 253 | ] 254 | }, 255 | "execution_count": 23, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 24, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 271 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 25, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "
\n", 283 | "\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | "
user_idavg(time_in_secs)
0XN203200.0
1XN20110.0
2XN2022000.0
\n", 309 | "
" 310 | ], 311 | "text/plain": [ 312 | " user_id avg(time_in_secs)\n", 313 | "0 XN203 200.0\n", 314 | "1 XN201 10.0\n", 315 | "2 XN202 2000.0" 316 | ] 317 | }, 318 | "execution_count": 25, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 26, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 334 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 18, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "#app wise time spent\n", 344 | "\n", 345 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 19, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 27, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 366 | "\n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 392 | "
" 393 | ], 394 | "text/plain": [ 395 | " app total_time\n", 396 | "0 FB 3410\n", 397 | "1 Insta 560\n", 398 | "2 Twitter 210" 399 | ] 400 | }, 401 | "execution_count": 27, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 11, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 417 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 26, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/html": [ 428 | "
\n", 429 | "\n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 455 | "
" 456 | ], 457 | "text/plain": [ 458 | " app total_time\n", 459 | "0 FB 3410\n", 460 | "1 Insta 560\n", 461 | "2 Twitter 210" 462 | ] 463 | }, 464 | "execution_count": 26, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 28, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# app wise mean age \n", 480 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "scrolled": true 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 30, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "data": { 501 | "text/html": [ 502 | "
\n", 503 | "\n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 529 | "
" 530 | ], 531 | "text/plain": [ 532 | " app mean_age\n", 533 | "0 Twitter 38.500000\n", 534 | "1 FB 30.571429\n", 535 | "2 Insta 25.500000" 536 | ] 537 | }, 538 | "execution_count": 30, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 15, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n", 554 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 32, 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/html": [ 565 | "
\n", 566 | "\n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 592 | "
" 593 | ], 594 | "text/plain": [ 595 | " app mean_age\n", 596 | "0 Twitter 38.500000\n", 597 | "1 FB 30.571429\n", 598 | "2 Insta 25.500000" 599 | ] 600 | }, 601 | "execution_count": 32, 602 | "metadata": {}, 603 | "output_type": "execute_result" 604 | } 605 | ], 606 | "source": [ 607 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 27, 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 | "+-------+---------+\n", 620 | "| app|full_name|\n", 621 | "+-------+---------+\n", 622 | "| FB| FACEBOOK|\n", 623 | "| Insta|INSTAGRAM|\n", 624 | "|Twitter| TWITTER|\n", 625 | "+-------+---------+\n", 626 | "\n" 627 | ] 628 | } 629 | ], 630 | "source": [ 631 | "# Join static dataframe with streaming dataframe\n", 632 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n", 633 | "app_df.show()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 28, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "app_stream_df=data.join(app_df,'app')" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 29, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 30, 657 | "metadata": {}, 658 | "outputs": [ 659 | { 660 | "data": { 661 | "text/html": [ 662 | "
\n", 663 | "\n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | "
appuser_idtime_in_secsagefull_name
0FBXN2011019FACEBOOK
1FBXN20350030FACEBOOK
2FBXN20350030FACEBOOK
3FBXN20310030FACEBOOK
4FBXN20330030FACEBOOK
5FBXN202200045FACEBOOK
6InstaXN2013019INSTAGRAM
7InstaXN2013019INSTAGRAM
8InstaXN20250045INSTAGRAM
9TwitterXN2011019TWITTER
10TwitterXN20210045TWITTER
11TwitterXN20210045TWITTER
\n", 773 | "
" 774 | ], 775 | "text/plain": [ 776 | " app user_id time_in_secs age full_name\n", 777 | "0 FB XN201 10 19 FACEBOOK\n", 778 | "1 FB XN203 500 30 FACEBOOK\n", 779 | "2 FB XN203 500 30 FACEBOOK\n", 780 | "3 FB XN203 100 30 FACEBOOK\n", 781 | "4 FB XN203 300 30 FACEBOOK\n", 782 | "5 FB XN202 2000 45 FACEBOOK\n", 783 | "6 Insta XN201 30 19 INSTAGRAM\n", 784 | "7 Insta XN201 30 19 INSTAGRAM\n", 785 | "8 Insta XN202 500 45 INSTAGRAM\n", 786 | "9 Twitter XN201 10 19 TWITTER\n", 787 | "10 Twitter XN202 100 45 TWITTER\n", 788 | "11 Twitter XN202 100 45 TWITTER" 789 | ] 790 | }, 791 | "execution_count": 30, 792 | "metadata": {}, 793 | "output_type": "execute_result" 794 | } 795 | ], 796 | "source": [ 797 | "spark.sql(\"select * from join_query \").toPandas().head(50)" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [] 813 | } 814 | ], 815 | "metadata": { 816 | "kernelspec": { 817 | "display_name": "Python 3", 818 | "language": "python", 819 | "name": "python3" 820 | }, 821 | "language_info": { 822 | "codemirror_mode": { 823 | "name": "ipython", 824 | "version": 3 825 | }, 826 | "file_extension": ".py", 827 | "mimetype": "text/x-python", 828 | "name": "python", 829 | "nbconvert_exporter": "python", 830 | "pygments_lexer": "ipython3", 831 | "version": "3.6.3" 832 | } 833 | }, 834 | "nbformat": 4, 835 | "nbformat_minor": 2 836 | } 837 | -------------------------------------------------------------------------------- /chap_5/.ipynb_checkpoints/Classification_using_MLlib-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('binary_class').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 11, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#read the dataset\n", 21 | "df=spark.read.csv('classification_data.csv',inferSchema=True,header=True)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from pyspark.sql.functions import *\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 12, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "(46751, 12)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "#check the shape of the data \n", 48 | "print((df.count(),len(df.columns)))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 13, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "root\n", 61 | " |-- loan_id: string (nullable = true)\n", 62 | " |-- loan_purpose: string (nullable = true)\n", 63 | " |-- is_first_loan: integer (nullable = true)\n", 64 | " |-- total_credit_card_limit: integer (nullable = true)\n", 65 | " |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)\n", 66 | " |-- saving_amount: integer (nullable = true)\n", 67 | " |-- checking_amount: integer (nullable = true)\n", 68 | " |-- is_employed: integer (nullable = true)\n", 69 | " |-- 
yearly_salary: integer (nullable = true)\n", 70 | " |-- age: integer (nullable = true)\n", 71 | " |-- dependent_number: integer (nullable = true)\n", 72 | " |-- loan_defaulter: integer (nullable = true)\n", 73 | "\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "#printSchema\n", 79 | "df.printSchema()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 14, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "['loan_id',\n", 91 | " 'loan_purpose',\n", 92 | " 'is_first_loan',\n", 93 | " 'total_credit_card_limit',\n", 94 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 95 | " 'saving_amount',\n", 96 | " 'checking_amount',\n", 97 | " 'is_employed',\n", 98 | " 'yearly_salary',\n", 99 | " 'age',\n", 100 | " 'dependent_number',\n", 101 | " 'loan_defaulter']" 102 | ] 103 | }, 104 | "execution_count": 14, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "#number of columns in dataset\n", 111 | "df.columns" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 15, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n", 124 | "|loan_id|loan_purpose|is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|saving_amount|checking_amount|is_employed|yearly_salary|age|dependent_number|loan_defaulter|\n", 125 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n", 126 | "| A_1| personal| 1| 7900| 0.8| 1103| 6393| 1| 16400| 42| 4| 0|\n", 127 | "| A_2| personal| 0| 3300| 0.29| 2588| 832| 1| 75500| 56| 1| 0|\n", 128 | "| A_3| personal| 0| 7600| 0.9| 1651| 8868| 1| 59000| 46| 1| 0|\n", 129 | "| A_4| personal| 1| 3400| 0.38| 1269| 6863| 1| 26000| 55| 8| 0|\n", 130 | "| A_5| emergency| 0| 2600| 0.89| 1310| 3423| 1| 9700| 41| 4| 1|\n", 131 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n", 132 | "only showing top 5 rows\n", 133 | "\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "#view the dataset\n", 139 | "df.show(5)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 16, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n", 152 | "|summary|loan_id|loan_purpose| is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year| saving_amount| checking_amount| is_employed| yearly_salary| age| dependent_number| loan_defaulter|\n", 153 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n", 154 | 
"| count| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751|\n", 155 | "| mean| null| null|0.5414429637868708| 4615.304485465552| 0.700091121045545| 2037.636585313683|3520.6714294881394|0.9173279715941905| 29527.62079955509| 41.53979594019379|3.7448396825736348|0.34653804196701676|\n", 156 | "| stddev| null| null|0.4982848498677868| 1890.194453628314| 0.1777288093267152|1498.6710906030362|2160.9332423713727|0.2753887911928983|16149.757703029438|12.817646350266434|2.6191527902107667|0.47587211651314887|\n", 157 | "| min| A_1| emergency| 0| 500| 0.0| 0| 0| 0| 0| 18| 0| 0|\n", 158 | "| max| A_9999| property| 1| 13500| 1.09| 10641| 13165| 1| 97200| 79| 8| 1|\n", 159 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "#Exploratory Data Analysis\n", 166 | "df.describe().show()\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 17, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "+--------------+-----+\n", 179 | "|loan_defaulter|count|\n", 180 | "+--------------+-----+\n", 181 | "| 1|16201|\n", 182 | "| 0|30550|\n", 183 | "+--------------+-----+\n", 184 | "\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "df.groupBy('loan_defaulter').count().show()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 18, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "+------------+-----+\n", 202 | "|loan_purpose|count|\n", 203 | "+------------+-----+\n", 204 | "| others| 6763|\n", 205 | "| emergency| 7562|\n", 206 | "| property|11388|\n", 207 | "| operations|10580|\n", 208 | "| personal|10458|\n", 209 | "+------------+-----+\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "df.groupBy('loan_purpose').count().show()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 120, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "#converting categorical data to numerical form" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 21, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "#import required libraries\n", 234 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", 235 | "\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 22, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "loan_purpose_indexer = StringIndexer(inputCol=\"loan_purpose\", outputCol=\"loan_purpose\").fit(df)\n", 245 | "df = loan_purpose_indexer.transform(df)\n", 246 | "loan_encoder = OneHotEncoder(inputCol=\"loan_index\", outputCol=\"loan_purpose_vec\")\n", 247 | "df = loan_encoder.transform(df)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 63, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "+------------+------------+----------------+\n", 260 | "|loan_purpose|loan_purpose|loan_purpose_vec|\n", 261 | "+------------+------------+----------------+\n", 262 | "|personal |personal |(4,[2],[1.0]) |\n", 263 | "|personal |personal |(4,[2],[1.0]) |\n", 264 | 
"|personal |personal |(4,[2],[1.0]) |\n", 265 | "+------------+------------+----------------+\n", 266 | "only showing top 3 rows\n", 267 | "\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "df.select(['loan_purpose','loan_purpose','loan_purpose_vec']).show(3,False)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 24, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "from pyspark.ml.feature import VectorAssembler" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 25, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "['loan_id',\n", 293 | " 'loan_purpose',\n", 294 | " 'is_first_loan',\n", 295 | " 'total_credit_card_limit',\n", 296 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 297 | " 'saving_amount',\n", 298 | " 'checking_amount',\n", 299 | " 'is_employed',\n", 300 | " 'yearly_salary',\n", 301 | " 'age',\n", 302 | " 'dependent_number',\n", 303 | " 'loan_defaulter',\n", 304 | " 'loan_index',\n", 305 | " 'loan_purpose_vec']" 306 | ] 307 | }, 308 | "execution_count": 25, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "df.columns" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 28, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "df_assembler = VectorAssembler(inputCols=['is_first_loan',\n", 324 | " 'total_credit_card_limit',\n", 325 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 326 | " 'saving_amount',\n", 327 | " 'checking_amount',\n", 328 | " 'is_employed',\n", 329 | " 'yearly_salary',\n", 330 | " 'age',\n", 331 | " 'dependent_number',\n", 332 | " 'loan_purpose_vec'], outputCol=\"features\")\n", 333 | "df = df_assembler.transform(df)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 29, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "root\n", 346 | " |-- loan_id: string (nullable = true)\n", 347 | " |-- loan_purpose: string (nullable = true)\n", 348 | " |-- is_first_loan: integer (nullable = true)\n", 349 | " |-- total_credit_card_limit: integer (nullable = true)\n", 350 | " |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)\n", 351 | " |-- saving_amount: integer (nullable = true)\n", 352 | " |-- checking_amount: integer (nullable = true)\n", 353 | " |-- is_employed: integer (nullable = true)\n", 354 | " |-- yearly_salary: integer (nullable = true)\n", 355 | " |-- age: integer (nullable = true)\n", 356 | " |-- dependent_number: integer (nullable = true)\n", 357 | " |-- loan_defaulter: integer (nullable = true)\n", 358 | " |-- loan_index: double (nullable = false)\n", 359 | " |-- loan_purpose_vec: vector (nullable = true)\n", 360 | " |-- features: vector (nullable = true)\n", 361 | "\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "df.printSchema()" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 30, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "+--------------------------------------------------------------------+--------------+\n", 379 | "|features |loan_defaulter|\n", 380 | "+--------------------------------------------------------------------+--------------+\n", 381 | "|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,0.0,0.0,1.0,0.0] |0 |\n", 382 | 
"|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,0.0,0.0,1.0,0.0] |0 |\n", 383 | "|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,0.0,0.0,1.0,0.0] |0 |\n", 384 | "|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,0.0,0.0,1.0,0.0]|0 |\n", 385 | "|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,0.0,0.0,0.0,1.0] |1 |\n", 386 | "|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,0.0,1.0,0.0,0.0]|0 |\n", 387 | "|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,0.0,1.0,0.0,0.0]|0 |\n", 388 | "|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,0.0,0.0,1.0,0.0]|0 |\n", 389 | "|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,0.0,0.0,1.0,0.0]|0 |\n", 390 | "|[0.0,2900.0,0.91,88.0,2725.0,1.0,21100.0,52.0,1.0,0.0,0.0,1.0,0.0] |1 |\n", 391 | "+--------------------------------------------------------------------+--------------+\n", 392 | "only showing top 10 rows\n", 393 | "\n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "df.select(['features','loan_defaulter']).show(10,False)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 31, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "#select data for building model\n", 408 | "model_df=df.select(['features','loan_defaulter'])" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 32, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "from pyspark.ml.classification import LogisticRegression" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 33, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "#split the data \n", 427 | "training_df,test_df=model_df.randomSplit([0.75,0.25])" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 34, 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "34958" 439 | ] 440 | }, 441 | "execution_count": 34, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "training_df.count()" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 35, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "+--------------+-----+\n", 460 | "|loan_defaulter|count|\n", 461 | "+--------------+-----+\n", 462 | "| 1|12048|\n", 463 | "| 0|22910|\n", 464 | "+--------------+-----+\n", 465 | "\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "training_df.groupBy('loan_defaulter').count().show()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 36, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "11793" 482 | ] 483 | }, 484 | "execution_count": 36, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "test_df.count()" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 37, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "+--------------+-----+\n", 503 | "|loan_defaulter|count|\n", 504 | "+--------------+-----+\n", 505 | "| 1| 4153|\n", 506 | "| 0| 7640|\n", 507 | "+--------------+-----+\n", 508 | "\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "test_df.groupBy('loan_defaulter').count().show()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 38, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | 
"log_reg=LogisticRegression(labelCol='loan_defaulter').fit(training_df)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "#Training Results" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 39, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "lr_summary=log_reg.summary" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 40, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "data": { 550 | "text/plain": [ 551 | "0.8939298586875679" 552 | ] 553 | }, 554 | "execution_count": 40, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "lr_summary.accuracy" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 41, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "0.9587456481363935" 572 | ] 573 | }, 574 | "execution_count": 41, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "lr_summary.areaUnderROC" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 42, 586 | "metadata": {}, 587 | "outputs": [ 588 | { 589 | "name": "stdout", 590 | "output_type": "stream", 591 | "text": [ 592 | "[0.9233245149911816, 0.8396318618667535]\n" 593 | ] 594 | } 595 | ], 596 | "source": [ 597 | "print(lr_summary.precisionByLabel)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 43, 603 | "metadata": {}, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "[0.914054997817547, 0.8556606905710491]\n" 610 | ] 611 | } 612 | ], 613 | "source": [ 614 | "print(lr_summary.recallByLabel)" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 45, 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "+--------------------+--------------+--------------------+--------------------+----------+\n", 627 | "| features|loan_defaulter| rawPrediction| probability|prediction|\n", 628 | "+--------------------+--------------+--------------------+--------------------+----------+\n", 629 | "|(13,[0,1,2,3,4,7]...| 1|[-3.4630360774167...|[0.03038246469741...| 1.0|\n", 630 | "|(13,[0,1,2,3,4,7]...| 1|[-5.5391195110590...|[0.00391460129742...| 1.0|\n", 631 | "|(13,[0,1,2,3,4,7]...| 0|[1.00238593296486...|[0.73152742283114...| 0.0|\n", 632 | "|(13,[0,1,2,3,4,7]...| 1|[-1.8290704519648...|[0.13834904603406...| 1.0|\n", 633 | "|(13,[0,1,2,3,4,7]...| 1|[-1.5501728962289...|[0.17506129798003...| 1.0|\n", 634 | "|(13,[0,1,2,3,4,7]...| 0|[6.60737916543425...|[0.99865145442765...| 0.0|\n", 635 | "|(13,[0,1,2,3,4,7]...| 0|[7.50587822302399...|[0.99945045940723...| 0.0|\n", 636 | "|(13,[0,1,2,3,4,7,...| 1|[-4.4555325192703...|[0.01148079400371...| 1.0|\n", 637 | "|(13,[0,1,2,3,4,7,...| 1|[-4.5326784954285...|[0.01063746639570...| 1.0|\n", 638 | "|(13,[0,1,2,3,4,7,...| 1|[-4.9717163244463...|[0.00688353015038...| 1.0|\n", 639 | "+--------------------+--------------+--------------------+--------------------+----------+\n", 640 | "only showing top 10 rows\n", 641 | "\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "predictions = log_reg.transform(test_df)\n", 647 | "predictions.show(10)\n" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 47, 653 | "metadata": {}, 654 | "outputs": [ 655 | { 656 
| "data": { 657 | "text/plain": [ 658 | "['features', 'loan_defaulter', 'rawPrediction', 'probability', 'prediction']" 659 | ] 660 | }, 661 | "execution_count": 47, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "model_predictions = log_reg.transform(test_df)\n", 668 | "model_predictions.columns" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 48, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "model_predictions = log_reg.evaluate(test_df)\n" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 49, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "0.8945984906300347" 689 | ] 690 | }, 691 | "execution_count": 49, 692 | "metadata": {}, 693 | "output_type": "execute_result" 694 | } 695 | ], 696 | "source": [ 697 | "model_predictions.accuracy" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 50, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "data": { 707 | "text/plain": [ 708 | "0.8951909857782705" 709 | ] 710 | }, 711 | "execution_count": 50, 712 | "metadata": {}, 713 | "output_type": "execute_result" 714 | } 715 | ], 716 | "source": [ 717 | "model_predictions.weightedPrecision" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 52, 723 | "metadata": {}, 724 | "outputs": [ 725 | { 726 | "data": { 727 | "text/plain": [ 728 | "[0.9129581151832461, 0.8608235010835541]" 729 | ] 730 | }, 731 | "execution_count": 52, 732 | "metadata": {}, 733 | "output_type": "execute_result" 734 | } 735 | ], 736 | "source": [ 737 | "model_predictions.recallByLabel" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 53, 743 | "metadata": {}, 744 | "outputs": [ 745 | { 746 | "name": "stdout", 747 | "output_type": "stream", 748 | "text": [ 749 | "[0.9234741162452006, 0.8431603773584906]\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "print(model_predictions.precisionByLabel)" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 54, 760 | "metadata": {}, 761 | "outputs": [ 762 | { 763 | "data": { 764 | "text/plain": [ 765 | "0.9594316478468224" 766 | ] 767 | }, 768 | "execution_count": 54, 769 | "metadata": {}, 770 | "output_type": "execute_result" 771 | } 772 | ], 773 | "source": [ 774 | "model_predictions.areaUnderROC" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 56, 780 | "metadata": {}, 781 | "outputs": [], 782 | "source": [ 783 | "from pyspark.ml.classification import RandomForestClassifier\n", 784 | "rf = RandomForestClassifier(numTrees=50,maxDepth=30,labelCol='loan_defaulter')\n", 785 | "rf_model = rf.fit(training_df)\n" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 57, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "model_predictions = rf_model.transform(test_df)\n" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 59, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "true_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).filter(model_predictions['prediction']==1).count()\n", 804 | "actual_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).count()\n", 805 | "pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 60, 811 | "metadata": {}, 812 | "outputs": [ 813 | { 
814 | "data": { 815 | "text/plain": [ 816 | "0.8979051288225379" 817 | ] 818 | }, 819 | "execution_count": 60, 820 | "metadata": {}, 821 | "output_type": "execute_result" 822 | } 823 | ], 824 | "source": [ 825 | "#Recall \n", 826 | "float(true_pos)/(actual_pos)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 61, 832 | "metadata": {}, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/plain": [ 837 | "0.8660009289363678" 838 | ] 839 | }, 840 | "execution_count": 61, 841 | "metadata": {}, 842 | "output_type": "execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "#Precision on test Data \n", 847 | "float(true_pos)/(pred_pos)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [] 856 | } 857 | ], 858 | "metadata": { 859 | "kernelspec": { 860 | "display_name": "Python 3", 861 | "language": "python", 862 | "name": "python3" 863 | }, 864 | "language_info": { 865 | "codemirror_mode": { 866 | "name": "ipython", 867 | "version": 3 868 | }, 869 | "file_extension": ".py", 870 | "mimetype": "text/x-python", 871 | "name": "python", 872 | "nbconvert_exporter": "python", 873 | "pygments_lexer": "ipython3", 874 | "version": "3.6.3" 875 | } 876 | }, 877 | "nbformat": 4, 878 | "nbformat_minor": 2 879 | } 880 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Logistic_resgression_pyspark-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('log_reg').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#read the dataset\n", 21 | "df=spark.read.csv('Log_Reg_dataset.csv',inferSchema=True,header=True)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from pyspark.sql.functions import *\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "(20000, 6)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "#check the shape of the data \n", 48 | "print((df.count(),len(df.columns)))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "root\n", 61 | " |-- Country: string (nullable = true)\n", 62 | " |-- Age: integer (nullable = true)\n", 63 | " |-- Repeat_Visitor: integer (nullable = true)\n", 64 | " |-- Platform: string (nullable = true)\n", 65 | " |-- Web_pages_viewed: integer (nullable = true)\n", 66 | " |-- Status: integer (nullable = true)\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "#printSchema\n", 73 | "df.printSchema()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "['Country', 'Age', 'Repeat_Visitor', 'Platform', 'Web_pages_viewed', 'Status']" 85 | ] 86 | }, 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "output_type": "execute_result" 
90 | } 91 | ], 92 | "source": [ 93 | "#number of columns in dataset\n", 94 | "df.columns" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "+---------+---+--------------+--------+----------------+------+\n", 107 | "| Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|\n", 108 | "+---------+---+--------------+--------+----------------+------+\n", 109 | "| India| 41| 1| Yahoo| 21| 1|\n", 110 | "| Brazil| 28| 1| Yahoo| 5| 0|\n", 111 | "| Brazil| 40| 0| Google| 3| 0|\n", 112 | "|Indonesia| 31| 1| Bing| 15| 1|\n", 113 | "| Malaysia| 32| 0| Google| 15| 1|\n", 114 | "+---------+---+--------------+--------+----------------+------+\n", 115 | "only showing top 5 rows\n", 116 | "\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "#view the dataset\n", 122 | "df.show(5)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 19, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n", 135 | "|summary| Country| Age| Repeat_Visitor|Platform| Web_pages_viewed| Status|\n", 136 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n", 137 | "| count| 20000| 20000| 20000| 20000| 20000| 20000|\n", 138 | "| mean| null| 28.53955| 0.5029| null| 9.5533| 0.5|\n", 139 | "| stddev| null|7.888912950773227|0.500004090187782| null|6.073903499824976|0.5000125004687693|\n", 140 | "| min| Brazil| 17| 0| Bing| 1| 0|\n", 141 | "| max|Malaysia| 111| 1| Yahoo| 29| 1|\n", 142 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "#Exploratory Data Analysis\n", 149 | "df.describe().show()\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 22, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "+---------+-----+\n", 162 | "| Country|count|\n", 163 | "+---------+-----+\n", 164 | "| Malaysia| 1218|\n", 165 | "| India| 4018|\n", 166 | "|Indonesia|12178|\n", 167 | "| Brazil| 2586|\n", 168 | "+---------+-----+\n", 169 | "\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "df.groupBy('Country').count().show()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 118, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "+--------+-----+\n", 187 | "|Platform|count|\n", 188 | "+--------+-----+\n", 189 | "| Yahoo| 9859|\n", 190 | "| Bing| 4360|\n", 191 | "| Google| 5781|\n", 192 | "+--------+-----+\n", 193 | "\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "df.groupBy('Platform').count().show()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 119, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "+------+-----+\n", 211 | "|Status|count|\n", 212 | "+------+-----+\n", 213 | "| 1|10000|\n", 214 | "| 0|10000|\n", 215 | "+------+-----+\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "df.groupBy('Status').count().show()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 23, 227 
| "metadata": { 228 | "scrolled": true 229 | }, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "+---------+------------------+-------------------+---------------------+--------------------+\n", 236 | "| Country| avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)| avg(Status)|\n", 237 | "+---------+------------------+-------------------+---------------------+--------------------+\n", 238 | "| Malaysia|27.792282430213465| 0.5730706075533661| 11.192118226600986| 0.6568144499178982|\n", 239 | "| India|27.976854156296664| 0.5433051269288203| 10.727227476356397| 0.6212045793927327|\n", 240 | "|Indonesia| 28.43159796354081| 0.5207751683363442| 9.985711939563148| 0.5422893742814913|\n", 241 | "| Brazil|30.274168600154677| 0.322892498066512| 4.921113689095128|0.038669760247486466|\n", 242 | "+---------+------------------+-------------------+---------------------+--------------------+\n", 243 | "\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "df.groupBy('Country').mean().show()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 24, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "+--------+------------------+-------------------+---------------------+------------------+\n", 261 | "|Platform| avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)| avg(Status)|\n", 262 | "+--------+------------------+-------------------+---------------------+------------------+\n", 263 | "| Yahoo|28.569226087838523| 0.5094837204584644| 9.599655137437875|0.5071508266558474|\n", 264 | "| Bing| 28.68394495412844| 0.4720183486238532| 9.114908256880733|0.4559633027522936|\n", 265 | "| Google|28.380038055699707| 0.5149628092025601| 9.804878048780488|0.5210171250648676|\n", 266 | "+--------+------------------+-------------------+---------------------+------------------+\n", 267 | "\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "df.groupBy('Platform').mean().show()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 25, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "+------+--------+-------------------+---------------------+-----------+\n", 285 | "|Status|avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|avg(Status)|\n", 286 | "+------+--------+-------------------+---------------------+-----------+\n", 287 | "| 1| 26.5435| 0.7019| 14.5617| 1.0|\n", 288 | "| 0| 30.5356| 0.3039| 4.5449| 0.0|\n", 289 | "+------+--------+-------------------+---------------------+-----------+\n", 290 | "\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "df.groupBy('Status').mean().show()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 120, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "#converting categorical data to numerical form" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 121, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "#import required libraries\n", 314 | "\n", 315 | "from pyspark.ml.feature import StringIndexer\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 122, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "#Indexing " 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 123, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "platform_indexer = 
StringIndexer(inputCol=\"Platform\", outputCol=\"platform_num\").fit(df)\n", 334 | "df = platform_indexer.transform(df)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 124, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "+-------+---+--------------+--------+----------------+------+------------+\n", 347 | "|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|\n", 348 | "+-------+---+--------------+--------+----------------+------+------------+\n", 349 | "|India |41 |1 |Yahoo |21 |1 |0.0 |\n", 350 | "|Brazil |28 |1 |Yahoo |5 |0 |0.0 |\n", 351 | "|Brazil |40 |0 |Google |3 |0 |1.0 |\n", 352 | "+-------+---+--------------+--------+----------------+------+------------+\n", 353 | "only showing top 3 rows\n", 354 | "\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "df.show(3,False)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 125, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "from pyspark.ml.feature import OneHotEncoder" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 126, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "#one hot encoding\n", 378 | "platform_encoder = OneHotEncoder(inputCol=\"platform_num\", outputCol=\"platform_vector\")\n", 379 | "df = platform_encoder.transform(df)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 129, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n", 392 | "|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|platform_vector|\n", 393 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n", 394 | "|India |41 |1 |Yahoo |21 |1 |0.0 |(2,[0],[1.0]) |\n", 395 | "|Brazil |28 |1 |Yahoo |5 |0 |0.0 |(2,[0],[1.0]) |\n", 396 | "|Brazil |40 |0 |Google |3 |0 |1.0 |(2,[1],[1.0]) |\n", 397 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n", 398 | "only showing top 3 rows\n", 399 | "\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "df.show(3,False)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 134, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "+--------+-----+\n", 417 | "|Platform|count|\n", 418 | "+--------+-----+\n", 419 | "|Yahoo |9859 |\n", 420 | "|Google |5781 |\n", 421 | "|Bing |4360 |\n", 422 | "+--------+-----+\n", 423 | "\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "df.groupBy('Platform').count().orderBy('count',ascending=False).show(5,False)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 135, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "name": "stdout", 438 | "output_type": "stream", 439 | "text": [ 440 | "+------------+-----+\n", 441 | "|platform_num|count|\n", 442 | "+------------+-----+\n", 443 | "|0.0 |9859 |\n", 444 | "|1.0 |5781 |\n", 445 | "|2.0 |4360 |\n", 446 | "+------------+-----+\n", 447 | "\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "df.groupBy('platform_num').count().orderBy('count',ascending=False).show(5,False)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 136, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 
| "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "+---------------+-----+\n", 465 | "|platform_vector|count|\n", 466 | "+---------------+-----+\n", 467 | "|(2,[0],[1.0]) |9859 |\n", 468 | "|(2,[1],[1.0]) |5781 |\n", 469 | "|(2,[],[]) |4360 |\n", 470 | "+---------------+-----+\n", 471 | "\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "df.groupBy('platform_vector').count().orderBy('count',ascending=False).show(5,False)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 137, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "country_indexer = StringIndexer(inputCol=\"Country\", outputCol=\"country_num\").fit(df)\n", 486 | "df = country_indexer.transform(df)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 139, 492 | "metadata": {}, 493 | "outputs": [ 494 | { 495 | "name": "stdout", 496 | "output_type": "stream", 497 | "text": [ 498 | "+-------+-----------+\n", 499 | "|Country|country_num|\n", 500 | "+-------+-----------+\n", 501 | "|India |1.0 |\n", 502 | "|Brazil |2.0 |\n", 503 | "|Brazil |2.0 |\n", 504 | "+-------+-----------+\n", 505 | "only showing top 3 rows\n", 506 | "\n" 507 | ] 508 | } 509 | ], 510 | "source": [ 511 | "df.select(['Country','country_num']).show(3,False)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 140, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "#one hot encoding\n", 521 | "country_encoder = OneHotEncoder(inputCol=\"country_num\", outputCol=\"country_vector\")\n", 522 | "df = country_encoder.transform(df)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 141, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "name": "stdout", 532 | "output_type": "stream", 533 | "text": [ 534 | "+-------+-----------+--------------+\n", 535 | "|Country|country_num|country_vector|\n", 536 | "+-------+-----------+--------------+\n", 537 | "|India |1.0 |(3,[1],[1.0]) |\n", 538 | "|Brazil |2.0 |(3,[2],[1.0]) |\n", 539 | "|Brazil |2.0 |(3,[2],[1.0]) |\n", 540 | "+-------+-----------+--------------+\n", 541 | "only showing top 3 rows\n", 542 | "\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "df.select(['Country','country_num','country_vector']).show(3,False)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 142, 553 | "metadata": {}, 554 | "outputs": [ 555 | { 556 | "name": "stdout", 557 | "output_type": "stream", 558 | "text": [ 559 | "+---------+-----+\n", 560 | "|Country |count|\n", 561 | "+---------+-----+\n", 562 | "|Indonesia|12178|\n", 563 | "|India |4018 |\n", 564 | "|Brazil |2586 |\n", 565 | "|Malaysia |1218 |\n", 566 | "+---------+-----+\n", 567 | "\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "df.groupBy('Country').count().orderBy('count',ascending=False).show(5,False)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 143, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "name": "stdout", 582 | "output_type": "stream", 583 | "text": [ 584 | "+-----------+-----+\n", 585 | "|country_num|count|\n", 586 | "+-----------+-----+\n", 587 | "|0.0 |12178|\n", 588 | "|1.0 |4018 |\n", 589 | "|2.0 |2586 |\n", 590 | "|3.0 |1218 |\n", 591 | "+-----------+-----+\n", 592 | "\n" 593 | ] 594 | } 595 | ], 596 | "source": [ 597 | "df.groupBy('country_num').count().orderBy('count',ascending=False).show(5,False)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 144, 603 | "metadata": {}, 604 | "outputs": [ 605 | { 
606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "+--------------+-----+\n", 610 | "|country_vector|count|\n", 611 | "+--------------+-----+\n", 612 | "|(3,[0],[1.0]) |12178|\n", 613 | "|(3,[1],[1.0]) |4018 |\n", 614 | "|(3,[2],[1.0]) |2586 |\n", 615 | "|(3,[],[]) |1218 |\n", 616 | "+--------------+-----+\n", 617 | "\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "df.groupBy('country_vector').count().orderBy('count',ascending=False).show(5,False)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 145, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "from pyspark.ml.feature import VectorAssembler" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 146, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "df_assembler = VectorAssembler(inputCols=['platform_vector','country_vector','Age', 'Repeat_Visitor','Web_pages_viewed'], outputCol=\"features\")\n", 641 | "df = df_assembler.transform(df)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 147, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "root\n", 654 | " |-- Country: string (nullable = true)\n", 655 | " |-- Age: integer (nullable = true)\n", 656 | " |-- Repeat_Visitor: integer (nullable = true)\n", 657 | " |-- Platform: string (nullable = true)\n", 658 | " |-- Web_pages_viewed: integer (nullable = true)\n", 659 | " |-- Status: integer (nullable = true)\n", 660 | " |-- platform_num: double (nullable = false)\n", 661 | " |-- platform_vector: vector (nullable = true)\n", 662 | " |-- country_num: double (nullable = false)\n", 663 | " |-- country_vector: vector (nullable = true)\n", 664 | " |-- features: vector (nullable = true)\n", 665 | "\n" 666 | ] 667 | } 668 | ], 669 | "source": [ 670 | "df.printSchema()" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 148, 676 | "metadata": {}, 677 | "outputs": [ 678 | { 679 | "name": "stdout", 680 | "output_type": "stream", 681 | "text": [ 682 | "+-----------------------------------+------+\n", 683 | "|features |Status|\n", 684 | "+-----------------------------------+------+\n", 685 | "|[1.0,0.0,0.0,1.0,0.0,41.0,1.0,21.0]|1 |\n", 686 | "|[1.0,0.0,0.0,0.0,1.0,28.0,1.0,5.0] |0 |\n", 687 | "|(8,[1,4,5,7],[1.0,1.0,40.0,3.0]) |0 |\n", 688 | "|(8,[2,5,6,7],[1.0,31.0,1.0,15.0]) |1 |\n", 689 | "|(8,[1,5,7],[1.0,32.0,15.0]) |1 |\n", 690 | "|(8,[1,4,5,7],[1.0,1.0,32.0,3.0]) |0 |\n", 691 | "|(8,[1,4,5,7],[1.0,1.0,32.0,6.0]) |0 |\n", 692 | "|(8,[1,2,5,7],[1.0,1.0,27.0,9.0]) |0 |\n", 693 | "|(8,[0,2,5,7],[1.0,1.0,32.0,2.0]) |0 |\n", 694 | "|(8,[2,5,6,7],[1.0,31.0,1.0,16.0]) |1 |\n", 695 | "+-----------------------------------+------+\n", 696 | "only showing top 10 rows\n", 697 | "\n" 698 | ] 699 | } 700 | ], 701 | "source": [ 702 | "df.select(['features','Status']).show(10,False)" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 149, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "#select data for building model\n", 712 | "model_df=df.select(['features','Status'])" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 150, 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [ 721 | "from pyspark.ml.classification import LogisticRegression" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": 151, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | 
"#split the data \n", 731 | "training_df,test_df=model_df.randomSplit([0.75,0.25])" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 152, 737 | "metadata": {}, 738 | "outputs": [ 739 | { 740 | "data": { 741 | "text/plain": [ 742 | "14907" 743 | ] 744 | }, 745 | "execution_count": 152, 746 | "metadata": {}, 747 | "output_type": "execute_result" 748 | } 749 | ], 750 | "source": [ 751 | "training_df.count()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 160, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "+------+-----+\n", 764 | "|Status|count|\n", 765 | "+------+-----+\n", 766 | "| 1| 7417|\n", 767 | "| 0| 7490|\n", 768 | "+------+-----+\n", 769 | "\n" 770 | ] 771 | } 772 | ], 773 | "source": [ 774 | "training_df.groupBy('Status').count().show()" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 153, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "data": { 784 | "text/plain": [ 785 | "5093" 786 | ] 787 | }, 788 | "execution_count": 153, 789 | "metadata": {}, 790 | "output_type": "execute_result" 791 | } 792 | ], 793 | "source": [ 794 | "test_df.count()" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 161, 800 | "metadata": {}, 801 | "outputs": [ 802 | { 803 | "name": "stdout", 804 | "output_type": "stream", 805 | "text": [ 806 | "+------+-----+\n", 807 | "|Status|count|\n", 808 | "+------+-----+\n", 809 | "| 1| 2583|\n", 810 | "| 0| 2510|\n", 811 | "+------+-----+\n", 812 | "\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "test_df.groupBy('Status').count().show()" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 154, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [ 826 | "log_reg=LogisticRegression(labelCol='Status').fit(training_df)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "#Training Results" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": 155, 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "train_results=log_reg.evaluate(training_df).predictions" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 168, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "name": "stdout", 854 | "output_type": "stream", 855 | "text": [ 856 | "+------+----------+----------------------------------------+\n", 857 | "|Status|prediction|probability |\n", 858 | "+------+----------+----------------------------------------+\n", 859 | "|1 |1.0 |[0.2978572628475072,0.7021427371524929] |\n", 860 | "|1 |1.0 |[0.2978572628475072,0.7021427371524929] |\n", 861 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n", 862 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n", 863 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n", 864 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 865 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 866 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 867 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 868 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 869 | "+------+----------+----------------------------------------+\n", 870 | "only showing top 10 rows\n", 871 | "\n" 872 | ] 873 | } 874 | ], 875 | "source": [ 876 | 
"train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).select(['Status','prediction','probability']).show(10,False)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "Probability at 0 index is for 0 class and probabilty as 1 index is for 1 class" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 177, 889 | "metadata": {}, 890 | "outputs": [], 891 | "source": [ 892 | "correct_preds=train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).count()\n" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": 174, 898 | "metadata": {}, 899 | "outputs": [ 900 | { 901 | "data": { 902 | "text/plain": [ 903 | "7417" 904 | ] 905 | }, 906 | "execution_count": 174, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "training_df.filter(training_df['Status']==1).count()" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 178, 918 | "metadata": {}, 919 | "outputs": [ 920 | { 921 | "data": { 922 | "text/plain": [ 923 | "0.9366320614803829" 924 | ] 925 | }, 926 | "execution_count": 178, 927 | "metadata": {}, 928 | "output_type": "execute_result" 929 | } 930 | ], 931 | "source": [ 932 | "#accuracy on training dataset \n", 933 | "float(correct_preds)/(training_df.filter(training_df['Status']==1).count())" 934 | ] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": null, 939 | "metadata": {}, 940 | "outputs": [], 941 | "source": [ 942 | "#Test Set results" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 170, 948 | "metadata": {}, 949 | "outputs": [], 950 | "source": [ 951 | "results=log_reg.evaluate(test_df).predictions" 952 | ] 953 | }, 954 | { 955 | "cell_type": "code", 956 | "execution_count": 93, 957 | "metadata": {}, 958 | "outputs": [ 959 | { 960 | "name": "stdout", 961 | "output_type": "stream", 962 | "text": [ 963 | "+------+----------+\n", 964 | "|Status|prediction|\n", 965 | "+------+----------+\n", 966 | "|0 |0.0 |\n", 967 | "|0 |0.0 |\n", 968 | "|0 |0.0 |\n", 969 | "|0 |0.0 |\n", 970 | "|1 |0.0 |\n", 971 | "|0 |0.0 |\n", 972 | "|1 |1.0 |\n", 973 | "|0 |1.0 |\n", 974 | "|1 |1.0 |\n", 975 | "|1 |1.0 |\n", 976 | "+------+----------+\n", 977 | "only showing top 10 rows\n", 978 | "\n" 979 | ] 980 | } 981 | ], 982 | "source": [ 983 | "results.select(['Status','prediction']).show(10,False)" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": 91, 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "name": "stdout", 993 | "output_type": "stream", 994 | "text": [ 995 | "root\n", 996 | " |-- features: vector (nullable = true)\n", 997 | " |-- Status: integer (nullable = true)\n", 998 | " |-- rawPrediction: vector (nullable = true)\n", 999 | " |-- probability: vector (nullable = true)\n", 1000 | " |-- prediction: double (nullable = false)\n", 1001 | "\n" 1002 | ] 1003 | } 1004 | ], 1005 | "source": [ 1006 | "results.printSchema()" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": 92, 1012 | "metadata": {}, 1013 | "outputs": [], 1014 | "source": [ 1015 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 94, 1021 | "metadata": {}, 1022 | "outputs": [], 1023 | "source": [ 1024 | "#confusion matrix\n", 1025 | "true_postives = results[(results.Status == 1) & (results.prediction 
== 1)].count()\n", 1026 | "true_negatives = results[(results.Status == 0) & (results.prediction == 0)].count()\n", 1027 | "false_positives = results[(results.Status == 0) & (results.prediction == 1)].count()\n", 1028 | "false_negatives = results[(results.Status == 1) & (results.prediction == 0)].count()" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": 98, 1034 | "metadata": {}, 1035 | "outputs": [ 1036 | { 1037 | "name": "stdout", 1038 | "output_type": "stream", 1039 | "text": [ 1040 | "2356\n", 1041 | "2363\n", 1042 | "158\n", 1043 | "157\n", 1044 | "5034\n", 1045 | "5034\n" 1046 | ] 1047 | } 1048 | ], 1049 | "source": [ 1050 | "print (true_postives)\n", 1051 | "print (true_negatives)\n", 1052 | "print (false_positives)\n", 1053 | "print (false_negatives)\n", 1054 | "print(true_postives+true_negatives+false_positives+false_negatives)\n", 1055 | "print (results.count())" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": 99, 1061 | "metadata": {}, 1062 | "outputs": [ 1063 | { 1064 | "name": "stdout", 1065 | "output_type": "stream", 1066 | "text": [ 1067 | "0.937524870672503\n" 1068 | ] 1069 | } 1070 | ], 1071 | "source": [ 1072 | "recall = float(true_postives)/(true_postives + false_negatives)\n", 1073 | "print(recall)" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": 100, 1079 | "metadata": {}, 1080 | "outputs": [ 1081 | { 1082 | "name": "stdout", 1083 | "output_type": "stream", 1084 | "text": [ 1085 | "0.9371519490851233\n" 1086 | ] 1087 | } 1088 | ], 1089 | "source": [ 1090 | "precision = float(true_postives) / (true_postives + false_positives)\n", 1091 | "print(precision)" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": 103, 1097 | "metadata": {}, 1098 | "outputs": [ 1099 | { 1100 | "name": "stdout", 1101 | "output_type": "stream", 1102 | "text": [ 1103 | "0.9374255065554231\n" 1104 | ] 1105 | } 1106 | ], 1107 | "source": [ 1108 | "accuracy=float((true_postives+true_negatives) /(results.count()))\n", 1109 | "print(accuracy)" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "execution_count": null, 1115 | "metadata": {}, 1116 | "outputs": [], 1117 | "source": [] 1118 | } 1119 | ], 1120 | "metadata": { 1121 | "kernelspec": { 1122 | "display_name": "Python 3", 1123 | "language": "python", 1124 | "name": "python3" 1125 | }, 1126 | "language_info": { 1127 | "codemirror_mode": { 1128 | "name": "ipython", 1129 | "version": 3 1130 | }, 1131 | "file_extension": ".py", 1132 | "mimetype": "text/x-python", 1133 | "name": "python", 1134 | "nbconvert_exporter": "python", 1135 | "pygments_lexer": "ipython3", 1136 | "version": "3.6.3" 1137 | } 1138 | }, 1139 | "nbformat": 4, 1140 | "nbformat_minor": 2 1141 | } 1142 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/pyspark_basics-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pyspark Basics" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#create pyspark session\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "spark=SparkSession.builder.appName('pyspark').getOrCreate()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 121, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | 
"#read the data file\n", 28 | "df=spark.read.csv('conversion_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 122, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "+-------+---+--------+------+-------------------+---------+\n", 41 | "| _c0|_c1| _c2| _c3| _c4| _c5|\n", 42 | "+-------+---+--------+------+-------------------+---------+\n", 43 | "|country|age|new_user|source|total_pages_visited|converted|\n", 44 | "| UK| 25| 1| Ads| 1| 0|\n", 45 | "| US| 23| 1| Seo| 5| 0|\n", 46 | "| US| 28| 1| Seo| 4| 0|\n", 47 | "| China| 39| 1| Seo| 5| 0|\n", 48 | "+-------+---+--------+------+-------------------+---------+\n", 49 | "only showing top 5 rows\n", 50 | "\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "df.show(5)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 123, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#read the data file\n", 65 | "df=spark.read.csv('conversion_data.csv',header=True)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 124, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "+-------+---+--------+------+-------------------+---------+\n", 78 | "|country|age|new_user|source|total_pages_visited|converted|\n", 79 | "+-------+---+--------+------+-------------------+---------+\n", 80 | "| UK| 25| 1| Ads| 1| 0|\n", 81 | "| US| 23| 1| Seo| 5| 0|\n", 82 | "| US| 28| 1| Seo| 4| 0|\n", 83 | "| China| 39| 1| Seo| 5| 0|\n", 84 | "| US| 30| 1| Seo| 6| 0|\n", 85 | "+-------+---+--------+------+-------------------+---------+\n", 86 | "only showing top 5 rows\n", 87 | "\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "df.show(5)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 125, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "root\n", 105 | " |-- country: string (nullable = true)\n", 106 | " |-- age: string (nullable = true)\n", 107 | " |-- new_user: string (nullable = true)\n", 108 | " |-- source: string (nullable = true)\n", 109 | " |-- total_pages_visited: string (nullable = true)\n", 110 | " |-- converted: string (nullable = true)\n", 111 | "\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "df.printSchema()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 126, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n", 129 | "|summary|country| age| new_user|source|total_pages_visited| converted|\n", 130 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n", 131 | "| count| 316200| 316200| 316200|316200| 316200| 316200|\n", 132 | "| mean| null|30.569857685009488| 0.6854648956356736| null| 4.872966476913346|0.03225806451612903|\n", 133 | "| stddev| null| 8.271801801807728|0.46433119036384723| null| 3.341103757948214|0.17668497535763514|\n", 134 | "| min| China| 111| 0| Ads| 1| 0|\n", 135 | "| max| US| 79| 1| Seo| 9| 1|\n", 136 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n", 137 | "\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "#statistical summary for data numerical columns\n", 143 | "df.describe().show()" 144 | ] 145 | }, 
146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Datatypes " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 130, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "from pyspark.sql.functions import col , column\n", 160 | "df = df.withColumn(\"age\", col(\"age\").cast(\"Int\"))\\\n", 161 | " .withColumn(\"new_user\", col(\"new_user\").cast(\"Int\"))\\\n", 162 | " .withColumn(\"total_pages_visited\", col(\"total_pages_visited\").cast(\"Int\"))\\\n", 163 | " .withColumn(\"converted\", col(\"converted\").cast(\"Int\"))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 131, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "root\n", 176 | " |-- country: string (nullable = true)\n", 177 | " |-- age: integer (nullable = true)\n", 178 | " |-- new_user: integer (nullable = true)\n", 179 | " |-- source: string (nullable = true)\n", 180 | " |-- total_pages_visited: integer (nullable = true)\n", 181 | " |-- converted: integer (nullable = true)\n", 182 | "\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "df.printSchema()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 132, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "+-------+---+--------+------+-------------------+---------+\n", 200 | "|country|age|new_user|source|total_pages_visited|converted|\n", 201 | "+-------+---+--------+------+-------------------+---------+\n", 202 | "| UK| 25| 1| Ads| 1| 0|\n", 203 | "| US| 23| 1| Seo| 5| 0|\n", 204 | "| US| 28| 1| Seo| 4| 0|\n", 205 | "| China| 39| 1| Seo| 5| 0|\n", 206 | "| US| 30| 1| Seo| 6| 0|\n", 207 | "+-------+---+--------+------+-------------------+---------+\n", 208 | "only showing top 5 rows\n", 209 | "\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "df.show(5)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 133, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "Column" 226 | ] 227 | }, 228 | "execution_count": 133, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "#acess dataframe column , we get column object \n", 235 | "df['country']" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 134, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "pyspark.sql.column.Column" 247 | ] 248 | }, 249 | "execution_count": 134, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "type(df['country'])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 135, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "+-------+\n", 268 | "|country|\n", 269 | "+-------+\n", 270 | "| UK|\n", 271 | "| US|\n", 272 | "| US|\n", 273 | "| China|\n", 274 | "| US|\n", 275 | "+-------+\n", 276 | "only showing top 5 rows\n", 277 | "\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "#access content of colum\n", 283 | "df.select('country').show(5)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 136, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "+-------+------+\n", 296 | 
"|country|source|\n", 297 | "+-------+------+\n", 298 | "| UK| Ads|\n", 299 | "| US| Seo|\n", 300 | "| US| Seo|\n", 301 | "| China| Seo|\n", 302 | "| US| Seo|\n", 303 | "+-------+------+\n", 304 | "only showing top 5 rows\n", 305 | "\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "#acess multiple columns\n", 311 | "df.select(['country','source']).show(5)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "### Add or Remove column " 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "#### using udf (user defined functions)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 137, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "from pyspark.sql.types import StringType\n", 335 | "from pyspark.sql.functions import udf\n", 336 | "\n", 337 | "def country_udf(country):\n", 338 | " if country =='UK':\n", 339 | " return 'Britain'\n", 340 | " elif country =='US':\n", 341 | " return 'USA'\n", 342 | " elif country =='China':\n", 343 | " return 'Asia'\n", 344 | " elif country =='Germany':\n", 345 | " return 'Deustche'\n", 346 | " else:\n", 347 | " return country\n", 348 | " \n", 349 | "spark_udf = udf(country_udf, StringType())\n", 350 | "\n", 351 | "df=df.withColumn(\"country_new\", spark_udf(df.country))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 138, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "+-------+---+--------+------+-------------------+---------+-----------+\n", 364 | "|country|age|new_user|source|total_pages_visited|converted|country_new|\n", 365 | "+-------+---+--------+------+-------------------+---------+-----------+\n", 366 | "| UK| 25| 1| Ads| 1| 0| Britain|\n", 367 | "| US| 23| 1| Seo| 5| 0| USA|\n", 368 | "| US| 28| 1| Seo| 4| 0| USA|\n", 369 | "| China| 39| 1| Seo| 5| 0| Asia|\n", 370 | "| US| 30| 1| Seo| 6| 0| USA|\n", 371 | "| US| 31| 0| Seo| 1| 0| USA|\n", 372 | "| China| 27| 1| Seo| 4| 0| Asia|\n", 373 | "| US| 23| 0| Ads| 4| 0| USA|\n", 374 | "| UK| 29| 0|Direct| 4| 0| Britain|\n", 375 | "| US| 25| 0| Ads| 2| 0| USA|\n", 376 | "+-------+---+--------+------+-------------------+---------+-----------+\n", 377 | "only showing top 10 rows\n", 378 | "\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "df.show(10)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "#### without using udf " 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 139, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "#create new column with age +2 value\n", 400 | "df=df.withColumn('new_age',df['age'] +2)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 140, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n", 413 | "|country|age|new_user|source|total_pages_visited|converted|country_new|new_age|\n", 414 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n", 415 | "| UK| 25| 1| Ads| 1| 0| Britain| 27|\n", 416 | "| US| 23| 1| Seo| 5| 0| USA| 25|\n", 417 | "| US| 28| 1| Seo| 4| 0| USA| 30|\n", 418 | "| China| 39| 1| Seo| 5| 0| Asia| 41|\n", 419 | "| US| 30| 1| Seo| 6| 0| USA| 32|\n", 420 | "| US| 31| 0| Seo| 1| 0| USA| 33|\n", 421 | "| 
China| 27| 1| Seo| 4| 0| Asia| 29|\n", 422 | "| US| 23| 0| Ads| 4| 0| USA| 25|\n", 423 | "| UK| 29| 0|Direct| 4| 0| Britain| 31|\n", 424 | "| US| 25| 0| Ads| 2| 0| USA| 27|\n", 425 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n", 426 | "only showing top 10 rows\n", 427 | "\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "df.show(10)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "### Drop /Delete columns " 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 141, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "#delete the new_age column\n", 449 | "df=df.drop('new_age')" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 142, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "#delete the country_new column\n", 459 | "df=df.drop('country_new')" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 143, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "name": "stdout", 469 | "output_type": "stream", 470 | "text": [ 471 | "+-------+---+--------+------+-------------------+---------+\n", 472 | "|country|age|new_user|source|total_pages_visited|converted|\n", 473 | "+-------+---+--------+------+-------------------+---------+\n", 474 | "| UK| 25| 1| Ads| 1| 0|\n", 475 | "| US| 23| 1| Seo| 5| 0|\n", 476 | "| US| 28| 1| Seo| 4| 0|\n", 477 | "| China| 39| 1| Seo| 5| 0|\n", 478 | "| US| 30| 1| Seo| 6| 0|\n", 479 | "+-------+---+--------+------+-------------------+---------+\n", 480 | "only showing top 5 rows\n", 481 | "\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "df.show(5)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "# Acess row objects of dataframe" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 144, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/plain": [ 504 | "[Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0),\n", 505 | " Row(country='US', age=23, new_user=1, source='Seo', total_pages_visited=5, converted=0),\n", 506 | " Row(country='US', age=28, new_user=1, source='Seo', total_pages_visited=4, converted=0)]" 507 | ] 508 | }, 509 | "execution_count": 144, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "#access first 3 rows\n", 516 | "df.head(3)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 145, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0)" 528 | ] 529 | }, 530 | "execution_count": 145, 531 | "metadata": {}, 532 | "output_type": "execute_result" 533 | } 534 | ], 535 | "source": [ 536 | "#access first row object \n", 537 | "df.head(3)[0]" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 146, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "data": { 547 | "text/plain": [ 548 | "'UK'" 549 | ] 550 | }, 551 | "execution_count": 146, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "#access first row object\n", 558 | "df.head(3)[0][0]" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "## Filtering " 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 
147, 571 | "metadata": {}, 572 | "outputs": [ 573 | { 574 | "name": "stdout", 575 | "output_type": "stream", 576 | "text": [ 577 | "+-------+---+--------+------+-------------------+---------+\n", 578 | "|country|age|new_user|source|total_pages_visited|converted|\n", 579 | "+-------+---+--------+------+-------------------+---------+\n", 580 | "|Germany|123| 0| Seo| 15| 1|\n", 581 | "| US| 77| 0|Direct| 4| 0|\n", 582 | "| US| 79| 1|Direct| 1| 0|\n", 583 | "| UK|111| 0| Ads| 10| 1|\n", 584 | "+-------+---+--------+------+-------------------+---------+\n", 585 | "\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "#filter records where age of user is more than 75 years\n", 591 | "df.filter(df['age'] >75).show(5)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 148, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "name": "stdout", 601 | "output_type": "stream", 602 | "text": [ 603 | "+-------+---------+---+\n", 604 | "|country|converted|age|\n", 605 | "+-------+---------+---+\n", 606 | "|Germany| 1|123|\n", 607 | "| US| 0| 77|\n", 608 | "| US| 0| 79|\n", 609 | "| UK| 1|111|\n", 610 | "+-------+---------+---+\n", 611 | "\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "#filter records and show only country and converted status of that user\n", 617 | "df.filter(df['age'] > 75).select(['country','converted','age']).show(5)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "### Multiple filter conditions" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 149, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "name": "stdout", 634 | "output_type": "stream", 635 | "text": [ 636 | "+-------+---+--------+------+-------------------+---------+\n", 637 | "|country|age|new_user|source|total_pages_visited|converted|\n", 638 | "+-------+---+--------+------+-------------------+---------+\n", 639 | "| US| 77| 0|Direct| 4| 0|\n", 640 | "| US| 79| 1|Direct| 1| 0|\n", 641 | "+-------+---+--------+------+-------------------+---------+\n", 642 | "\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "#select people over 75 years only from US\n", 648 | "df.filter(df['age'] > 75).filter(df['country'] =='US').show(5)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 150, 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "name": "stdout", 658 | "output_type": "stream", 659 | "text": [ 660 | "+-------+---+--------+------+-------------------+---------+\n", 661 | "|country|age|new_user|source|total_pages_visited|converted|\n", 662 | "+-------+---+--------+------+-------------------+---------+\n", 663 | "|Germany| 31| 0|Direct| 2| 1|\n", 664 | "+-------+---+--------+------+-------------------+---------+\n", 665 | "\n" 666 | ] 667 | } 668 | ], 669 | "source": [ 670 | "#select users from Germany who visited fewer than 3 pages but still converted \n", 671 | "df.filter(df['total_pages_visited'] < 3).filter(df['converted']==1).filter(df['country'] =='Germany').show(5)" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "## Count Records " 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 151, 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "data": { 688 | "text/plain": [ 689 | "316200" 690 | ] 691 | }, 692 | "execution_count": 151, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "#total records in df \n", 699 | "df.count()"
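# (sketch) count() returns the number of rows; the number of distinct values
# in a column comes from distinct() or the countDistinct aggregate:
from pyspark.sql.functions import countDistinct
df.select(countDistinct('country'), countDistinct('source')).show()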
700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 152, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "+-------+------+\n", 712 | "|country| count|\n", 713 | "+-------+------+\n", 714 | "|Germany| 13056|\n", 715 | "| China| 76602|\n", 716 | "| US|178092|\n", 717 | "| UK| 48450|\n", 718 | "+-------+------+\n", 719 | "\n" 720 | ] 721 | } 722 | ], 723 | "source": [ 724 | "# frequency count of users per country\n", 725 | "df.groupBy('country').count().show(5)" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 153, 731 | "metadata": {}, 732 | "outputs": [ 733 | { 734 | "name": "stdout", 735 | "output_type": "stream", 736 | "text": [ 737 | "+-------+------+\n", 738 | "|country| count|\n", 739 | "+-------+------+\n", 740 | "| US|178092|\n", 741 | "| China| 76602|\n", 742 | "| UK| 48450|\n", 743 | "|Germany| 13056|\n", 744 | "+-------+------+\n", 745 | "\n" 746 | ] 747 | } 748 | ], 749 | "source": [ 750 | "# frequency count per country, ordered by count in descending order\n", 751 | "df.groupBy('country').count().orderBy('count',ascending=False).show(5)" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 154, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "+---------+------+\n", 764 | "|converted| count|\n", 765 | "+---------+------+\n", 766 | "| 1| 10200|\n", 767 | "| 0|306000|\n", 768 | "+---------+------+\n", 769 | "\n" 770 | ] 771 | } 772 | ], 773 | "source": [ 774 | "# total converted vs. non-converted user counts\n", 775 | "df.groupBy('converted').count().show(2)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 156, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "+---------+------------------+------------------+------------------------+--------------+\n", 788 | "|converted| avg(age)| avg(new_user)|avg(total_pages_visited)|avg(converted)|\n", 789 | "+---------+------------------+------------------+------------------------+--------------+\n", 790 | "| 1|26.546764705882353|0.2979411764705882| 14.553529411764705| 1.0|\n", 791 | "| 0|30.703960784313725|0.6983823529411765| 4.550281045751634| 0.0|\n", 792 | "+---------+------------------+------------------+------------------------+--------------+\n", 793 | "\n" 794 | ] 795 | } 796 | ], 797 | "source": [ 798 | "# mean value of each numeric column, grouped by converted status\n", 799 | "df.groupBy('converted').mean().show()" 800 | ] 801 | }, 802 | { 803 | "cell_type": "markdown", 804 | "metadata": {}, 805 | "source": [ 806 | "## Collect " 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": {}, 812 | "source": [ 813 | "Save the results as a list of Row objects\n" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": 57, 819 | "metadata": {}, 820 | "outputs": [], 821 | "source": [ 822 | "# create a list containing only converted users from China\n", 823 | "china_data = df.filter((df['country'] == 'China') & (df['converted'] == 1)).collect()" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 60, 829 | "metadata": {}, 830 | "outputs": [ 831 | { 832 | "data": { 833 | "text/plain": [ 834 | "[Row(country='China', age='24', new_user='0', source='Seo', total_pages_visited='18', converted='1'),\n", 835 | " Row(country='China', age='26', new_user='1', source='Ads', total_pages_visited='18', converted='1'),\n", 836 | " 
Row(country='China', age='30', new_user='0', source='Ads', total_pages_visited='17', converted='1'),\n", 837 | " Row(country='China', age='26', new_user='0', source='Seo', total_pages_visited='8', converted='1'),\n", 838 | " Row(country='China', age='33', new_user='1', source='Direct', total_pages_visited='13', converted='1')]" 839 | ] 840 | }, 841 | "execution_count": 60, 842 | "metadata": {}, 843 | "output_type": "execute_result" 844 | } 845 | ], 846 | "source": [ 847 | "# view the first five entries of the new list\n", 848 | "china_data[:5]" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 67, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "# convert the first Row object into a dictionary\n", 858 | "china_dict = china_data[0].asDict()" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 64, 864 | "metadata": {}, 865 | "outputs": [ 866 | { 867 | "data": { 868 | "text/plain": [ 869 | "'24'" 870 | ] 871 | }, 872 | "execution_count": 64, 873 | "metadata": {}, 874 | "output_type": "execute_result" 875 | } 876 | ], 877 | "source": [ 878 | "# access individual values by column name\n", "china_dict['age']" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": 65, 884 | "metadata": {}, 885 | "outputs": [ 886 | { 887 | "data": { 888 | "text/plain": [ 889 | "'18'" 890 | ] 891 | }, 892 | "execution_count": 65, 893 | "metadata": {}, 894 | "output_type": "execute_result" 895 | } 896 | ], 897 | "source": [ 898 | "china_dict['total_pages_visited']" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "## Aggregate Functions" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": 158, 911 | "metadata": {}, 912 | "outputs": [ 913 | { 914 | "name": "stdout", 915 | "output_type": "stream", 916 | "text": [ 917 | "+------------------+\n", 918 | "| avg(age)|\n", 919 | "+------------------+\n", 920 | "|30.569857685009488|\n", 921 | "+------------------+\n", 922 | "\n" 923 | ] 924 | } 925 | ], 926 | "source": [ 927 | "# mean age across all records\n", "df.agg({'age':'mean'}).show()" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": 159, 933 | "metadata": {}, 934 | "outputs": [ 935 | { 936 | "name": "stdout", 937 | "output_type": "stream", 938 | "text": [ 939 | "+-------------------+\n", 940 | "| avg(converted)|\n", 941 | "+-------------------+\n", 942 | "|0.03225806451612903|\n", 943 | "+-------------------+\n", 944 | "\n" 945 | ] 946 | } 947 | ], 948 | "source": [ 949 | "# overall conversion rate\n", "df.agg({'converted':'mean'}).show()" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": 160, 955 | "metadata": {}, 956 | "outputs": [ 957 | { 958 | "name": "stdout", 959 | "output_type": "stream", 960 | "text": [ 961 | "+--------+\n", 962 | "|max(age)|\n", 963 | "+--------+\n", 964 | "| 123|\n", 965 | "+--------+\n", 966 | "\n" 967 | ] 968 | } 969 | ], 970 | "source": [ 971 | "# maximum age in the data\n", "df.agg({'age':'max'}).show()" 972 | ] 973 | }, 974 | { 975 | "cell_type": "code", 976 | "execution_count": 161, 977 | "metadata": {}, 978 | "outputs": [ 979 | { 980 | "name": "stdout", 981 | "output_type": "stream", 982 | "text": [ 983 | "+--------------+\n", 984 | "|count(country)|\n", 985 | "+--------------+\n", 986 | "| 316200|\n", 987 | "+--------------+\n", 988 | "\n" 989 | ] 990 | } 991 | ], 992 | "source": [ 993 | "# count of non-null country values\n", "df.agg({'country':'count'}).show()" 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": 162, 999 | "metadata": {}, 1000 | "outputs": [ 1001 | { 1002 | "name": "stdout", 1003 | "output_type": "stream", 1004 | "text": [ 1005 | 
"+--------+\n", 1006 | "|min(age)|\n", 1007 | "+--------+\n", 1008 | "| 17|\n", 1009 | "+--------+\n", 1010 | "\n" 1011 | ] 1012 | } 1013 | ], 1014 | "source": [ 1015 | "df.agg({'age':'min'}).show()" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 163, 1021 | "metadata": {}, 1022 | "outputs": [ 1023 | { 1024 | "name": "stdout", 1025 | "output_type": "stream", 1026 | "text": [ 1027 | "+-------+--------+\n", 1028 | "|country|max(age)|\n", 1029 | "+-------+--------+\n", 1030 | "|Germany| 123|\n", 1031 | "| China| 69|\n", 1032 | "| US| 79|\n", 1033 | "| UK| 111|\n", 1034 | "+-------+--------+\n", 1035 | "\n" 1036 | ] 1037 | } 1038 | ], 1039 | "source": [ 1040 | "## aggregation on grouped data by country\n", 1041 | "df.groupBy('country').agg({'age':'max'}).show()" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "execution_count": 164, 1047 | "metadata": {}, 1048 | "outputs": [ 1049 | { 1050 | "name": "stdout", 1051 | "output_type": "stream", 1052 | "text": [ 1053 | "+-------+------+--------+\n", 1054 | "|country|source|max(age)|\n", 1055 | "+-------+------+--------+\n", 1056 | "|Germany|Direct| 61|\n", 1057 | "| China|Direct| 65|\n", 1058 | "| UK| Ads| 111|\n", 1059 | "| US| Seo| 73|\n", 1060 | "| UK| Seo| 66|\n", 1061 | "|Germany| Seo| 123|\n", 1062 | "|Germany| Ads| 64|\n", 1063 | "| China| Seo| 68|\n", 1064 | "| UK|Direct| 69|\n", 1065 | "| China| Ads| 69|\n", 1066 | "| US| Ads| 70|\n", 1067 | "| US|Direct| 79|\n", 1068 | "+-------+------+--------+\n", 1069 | "\n" 1070 | ] 1071 | } 1072 | ], 1073 | "source": [ 1074 | "## aggregation on grouped data by country,source\n", 1075 | "df.groupBy(['country','source']).agg({'age':'max'}).show()" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": 170, 1081 | "metadata": {}, 1082 | "outputs": [ 1083 | { 1084 | "name": "stdout", 1085 | "output_type": "stream", 1086 | "text": [ 1087 | "+-------+---------+------------------------+\n", 1088 | "|country|converted|avg(total_pages_visited)|\n", 1089 | "+-------+---------+------------------------+\n", 1090 | "|Germany| 0| 4.565277777777778|\n", 1091 | "| China| 1| 14.352941176470589|\n", 1092 | "| China| 0| 4.5404575163398695|\n", 1093 | "| US| 0| 4.551785714285714|\n", 1094 | "| UK| 0| 4.557037037037037|\n", 1095 | "|Germany| 1| 14.572303921568627|\n", 1096 | "| UK| 1| 14.53450980392157|\n", 1097 | "| US| 1| 14.561497326203208|\n", 1098 | "+-------+---------+------------------------+\n", 1099 | "\n" 1100 | ] 1101 | } 1102 | ], 1103 | "source": [ 1104 | "## aggregation on grouped data by country,converted\n", 1105 | "df.groupBy(['country','converted']).agg({'total_pages_visited':'mean'}).show()" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": null, 1111 | "metadata": {}, 1112 | "outputs": [], 1113 | "source": [] 1114 | } 1115 | ], 1116 | "metadata": { 1117 | "kernelspec": { 1118 | "display_name": "Python 3", 1119 | "language": "python", 1120 | "name": "python3" 1121 | }, 1122 | "language_info": { 1123 | "codemirror_mode": { 1124 | "name": "ipython", 1125 | "version": 3 1126 | }, 1127 | "file_extension": ".py", 1128 | "mimetype": "text/x-python", 1129 | "name": "python", 1130 | "nbconvert_exporter": "python", 1131 | "pygments_lexer": "ipython3", 1132 | "version": "3.6.3" 1133 | } 1134 | }, 1135 | "nbformat": 4, 1136 | "nbformat_minor": 2 1137 | } 1138 | --------------------------------------------------------------------------------