├── chap_2
├── chap_3
│   ├── .ipynb_checkpoints
│   │   ├── Spark Structured Streaming-checkpoint.ipynb
│   │   ├── Spark Structured Streaming-ver_1-checkpoint.ipynb
│   │   ├── Spark Structured Streaming app-checkpoint.ipynb
│   │   ├── Spark Structured Streaming demo-checkpoint.ipynb
│   │   ├── Logistic_resgression_pyspark-checkpoint.ipynb
│   │   └── pyspark_basics-checkpoint.ipynb
│   └── Spark Structured Streaming demo.ipynb
├── chap_4
│   └── pramod_dag.py
├── chap_5
│   ├── Classification_using_MLlib.ipynb
│   └── .ipynb_checkpoints
│       └── Classification_using_MLlib-checkpoint.ipynb
├── chap_6
├── chap_7
├── chap_8
│   ├── .ipynb_checkpoints
│   │   └── multilayer perceptron-checkpoint.ipynb
│   └── Multilayer_perceptron_spark.ipynb
├── 9781484249604.jpg
├── errata.md
├── README.md
├── Contributing.md
└── LICENSE.txt

/9781484249604.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Apress/learn-pyspark/HEAD/9781484249604.jpg
--------------------------------------------------------------------------------
/errata.md:
--------------------------------------------------------------------------------
1 | # Errata for *Learn PySpark*
2 |
3 | On **page xx** [Summary of error]:
4 |
5 | Details of error here. Highlight key pieces in **bold**.
6 |
7 | ***
8 |
9 | On **page xx** [Summary of error]:
10 |
11 | Details of error here. Highlight key pieces in **bold**.
12 |
13 | ***
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Apress Source Code
2 |
3 | This repository accompanies [*Learn PySpark*](https://www.apress.com/9781484249604) by Pramod Singh (Apress, 2019).
4 |
5 | [comment]: #cover
6 | ![Cover image](9781484249604.jpg)
7 |
8 | Download the files as a zip using the green button, or clone the repository to your machine using Git.
9 |
10 | ## Releases
11 |
12 | Release v1.0 corresponds to the code in the published book, without corrections or updates.
13 |
14 | ## Contributions
15 |
16 | See the file Contributing.md for more information on how you can contribute to this repository.
--------------------------------------------------------------------------------
/Contributing.md:
--------------------------------------------------------------------------------
1 | # Contributing to Apress Source Code
2 |
3 | Copyright for Apress source code belongs to the author(s). However, under fair use you are encouraged to fork and contribute minor corrections and updates for the benefit of the author(s) and other readers.
4 |
5 | ## How to Contribute
6 |
7 | 1. Make sure you have a GitHub account.
8 | 2. Fork the repository for the relevant book.
9 | 3. Create a new branch on which to make your change, e.g.
10 |    `git checkout -b my_code_contribution`
11 | 4. Commit your change. Include a commit message describing the correction. Please note that if your commit message is not clear, the correction will not be accepted.
12 | 5. Submit a pull request.
13 |
14 | Thank you for your contribution!
--------------------------------------------------------------------------------
/chap_4/pramod_dag.py:
--------------------------------------------------------------------------------
1 | from datetime import timedelta
2 |
3 | import airflow
4 | from airflow import DAG
5 | from airflow.operators.bash_operator import BashOperator
6 |
7 |
8 |
9 | args = {
10 |     'owner': 'Pramod',
11 |     'start_date': airflow.utils.dates.days_ago(3),
12 |     # 'end_date': datetime(2018, 12, 30),
13 |     'depends_on_past': False,
14 |     'email': ['airflow@example.com'],
15 |     'email_on_failure': False,
16 |     'email_on_retry': False,
17 |     # If a task fails, retry it once after waiting
18 |     # at least 5 minutes
19 |     'retries': 1,
20 |     'retry_delay': timedelta(minutes=5),
21 | }
22 |
23 |
24 | dag = DAG(
25 |     'pramod_airflow_dag',
26 |     default_args=args,
27 |     description='A simple DAG',
28 |     # Continue to run DAG once per day
29 |     schedule_interval=timedelta(days=1)
30 | )
31 |
32 |
33 | # t1 and t2 are examples of tasks created by instantiating operators
34 | t1 = BashOperator(
35 |     task_id='print_date',
36 |     bash_command='date',
37 |     dag=dag,
38 | )
39 |
40 | t2 = BashOperator(
41 |     task_id='sleep',
42 |     depends_on_past=False,
43 |     bash_command='sleep 5',
44 |     dag=dag,
45 | )
46 |
47 |
48 |
49 | t1 >> t2
50 |
51 |
--------------------------------------------------------------------------------
/LICENSE.txt:
--------------------------------------------------------------------------------
1 | Freeware License, some rights reserved
2 |
3 | Copyright (c) 2019 Pramod Singh
4 |
5 | Permission is hereby granted, free of charge, to anyone obtaining a copy
6 | of this software and associated documentation files (the "Software"),
7 | to work with the Software within the limits of freeware distribution and fair use.
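
A note on pramod_dag.py above: `t1 >> t2` is Airflow's bit-shift syntax for "t2 runs after t1". As a hedged sketch of how the same syntax fans out to several downstream tasks at once (the DAG id, the third task, and its command are illustrative, not part of the repo):

from datetime import timedelta

import airflow
from airflow import DAG
from airflow.operators.bash_operator import BashOperator

dag = DAG(
    'fanout_example_dag',          # hypothetical DAG id
    default_args={'owner': 'Pramod',
                  'start_date': airflow.utils.dates.days_ago(3)},
    schedule_interval=timedelta(days=1),
)

t1 = BashOperator(task_id='print_date', bash_command='date', dag=dag)
t2 = BashOperator(task_id='sleep', bash_command='sleep 5', dag=dag)
t3 = BashOperator(task_id='echo_done', bash_command='echo done', dag=dag)

# t1 runs first; t2 and t3 both depend on it and can run in parallel
t1 >> [t2, t3]
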
8 | This includes the rights to use, copy, and modify the Software for personal use. 9 | Users are also allowed and encouraged to submit corrections and modifications 10 | to the Software for the benefit of other users. 11 | 12 | It is not allowed to reuse, modify, or redistribute the Software for 13 | commercial use in any way, or for a user’s educational materials such as books 14 | or blog articles without prior permission from the copyright holder. 15 | 16 | The above copyright notice and this permission notice need to be included 17 | in all copies or substantial portions of the software. 18 | 19 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 20 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 21 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 22 | AUTHORS OR COPYRIGHT HOLDERS OR APRESS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 23 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 24 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 25 | SOFTWARE. 26 | 27 | 28 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#import SparkSession\n", 19 | "from pyspark.sql import SparkSession\n", 20 | "spark=SparkSession.builder.appName('ss').getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 20, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql.functions import *\n", 30 | "from pyspark.sql.types import *" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 22, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "#create sample dataset\n", 40 | "df_1=spark.createDataFrame([('pramod neha',),('pramod ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 23, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "#define schema for input data\n", 50 | "schema=StructType().add('name','string')\n", 51 | "name_list=spark.readStream.schema(schema).format('parquet').load(\"new_folder\")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 24, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#split the names into individual names\n", 61 | "names=name_list.select(explode(split(name_list.name,' ')).alias('name'))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 25, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "name_count=names.groupBy('name').count()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 27, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "#query to write the results into memory sink\n", 80 | "query=(name_count.writeStream.queryName('new_query').outputMode('complete').format('memory').start())" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 31, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
namecount
0ziaan1
1neha1
2pramod2
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " name count\n", 122 | "0 ziaan 1\n", 123 | "1 neha 1\n", 124 | "2 pramod 2" 125 | ] 126 | }, 127 | "execution_count": 31, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "spark.sql(\"select * from new_query order by count \").toPandas().head(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 32, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df_2=spark.createDataFrame([('ziaan neha',),('ziaan ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 35, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/html": [ 153 | "
\n", 154 | "\n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
namecount
0pramod2
1neha3
2ziaan5
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " name count\n", 184 | "0 pramod 2\n", 185 | "1 neha 3\n", 186 | "2 ziaan 5" 187 | ] 188 | }, 189 | "execution_count": 35, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "spark.sql(\"select * from new_query order by count \").toPandas().head(50)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 34, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "df_3=spark.createDataFrame([('neha',),('ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.3" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming-ver_1-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import pyspark" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 2, 15 | "metadata": {}, 16 | "outputs": [], 17 | "source": [ 18 | "#import SparkSession\n", 19 | "from pyspark.sql import SparkSession\n", 20 | "spark=SparkSession.builder.appName('ss').getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 20, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "from pyspark.sql.functions import *\n", 30 | "from pyspark.sql.types import *" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 22, 36 | "metadata": {}, 37 | "outputs": [], 38 | "source": [ 39 | "#create sample dataset\n", 40 | "df_1=spark.createDataFrame([('pramod neha',),('pramod ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 23, 46 | "metadata": {}, 47 | "outputs": [], 48 | "source": [ 49 | "#define schema for input data\n", 50 | "schema=StructType().add('name','string')\n", 51 | "name_list=spark.readStream.schema(schema).format('parquet').load(\"new_folder\")" 52 | ] 53 | }, 54 | { 55 | "cell_type": "code", 56 | "execution_count": 24, 57 | "metadata": {}, 58 | "outputs": [], 59 | "source": [ 60 | "#split the names into individual names\n", 61 | "names=name_list.select(explode(split(name_list.name,' ')).alias('name'))" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 25, 67 | "metadata": {}, 68 | "outputs": [], 69 | "source": [ 70 | "name_count=names.groupBy('name').count()" 71 | ] 72 | }, 73 | { 74 | "cell_type": "code", 75 | "execution_count": 27, 76 | "metadata": {}, 77 | "outputs": [], 78 | "source": [ 79 | "#query to write the results into memory sink\n", 80 | "query=(name_count.writeStream.queryName('new_query').outputMode('complete').format('memory').start())" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | 
"execution_count": 31, 86 | "metadata": {}, 87 | "outputs": [ 88 | { 89 | "data": { 90 | "text/html": [ 91 | "
\n", 92 | "\n", 93 | " \n", 94 | " \n", 95 | " \n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | "
namecount
0ziaan1
1neha1
2pramod2
\n", 118 | "
" 119 | ], 120 | "text/plain": [ 121 | " name count\n", 122 | "0 ziaan 1\n", 123 | "1 neha 1\n", 124 | "2 pramod 2" 125 | ] 126 | }, 127 | "execution_count": 31, 128 | "metadata": {}, 129 | "output_type": "execute_result" 130 | } 131 | ], 132 | "source": [ 133 | "spark.sql(\"select * from new_query order by count \").toPandas().head(5)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 32, 139 | "metadata": {}, 140 | "outputs": [], 141 | "source": [ 142 | "df_2=spark.createDataFrame([('ziaan neha',),('ziaan ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 143 | ] 144 | }, 145 | { 146 | "cell_type": "code", 147 | "execution_count": 35, 148 | "metadata": {}, 149 | "outputs": [ 150 | { 151 | "data": { 152 | "text/html": [ 153 | "
\n", 154 | "\n", 155 | " \n", 156 | " \n", 157 | " \n", 158 | " \n", 159 | " \n", 160 | " \n", 161 | " \n", 162 | " \n", 163 | " \n", 164 | " \n", 165 | " \n", 166 | " \n", 167 | " \n", 168 | " \n", 169 | " \n", 170 | " \n", 171 | " \n", 172 | " \n", 173 | " \n", 174 | " \n", 175 | " \n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | "
namecount
0pramod2
1neha3
2ziaan5
\n", 180 | "
" 181 | ], 182 | "text/plain": [ 183 | " name count\n", 184 | "0 pramod 2\n", 185 | "1 neha 3\n", 186 | "2 ziaan 5" 187 | ] 188 | }, 189 | "execution_count": 35, 190 | "metadata": {}, 191 | "output_type": "execute_result" 192 | } 193 | ], 194 | "source": [ 195 | "spark.sql(\"select * from new_query order by count \").toPandas().head(50)" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 34, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "df_3=spark.createDataFrame([('neha',),('ziaan',)],[\"name\"]).write.save(\"new_folder\",mode='append')" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": null, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [] 213 | } 214 | ], 215 | "metadata": { 216 | "kernelspec": { 217 | "display_name": "Python 3", 218 | "language": "python", 219 | "name": "python3" 220 | }, 221 | "language_info": { 222 | "codemirror_mode": { 223 | "name": "ipython", 224 | "version": 3 225 | }, 226 | "file_extension": ".py", 227 | "mimetype": "text/x-python", 228 | "name": "python", 229 | "nbconvert_exporter": "python", 230 | "pygments_lexer": "ipython3", 231 | "version": "3.6.3" 232 | } 233 | }, 234 | "nbformat": 4, 235 | "nbformat_minor": 2 236 | } 237 | -------------------------------------------------------------------------------- /chap_8/.ipynb_checkpoints/multilayer perceptron-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load the libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "import os\n", 17 | "import numpy as np\n", 18 | "import pandas as pd\n", 19 | "from pyspark.sql.types import *\n", 20 | "from pyspark.ml import Pipeline\n", 21 | "from pyspark.sql import functions as f\n", 22 | "from pyspark.sql.functions import udf, StringType\n", 23 | "from pyspark.sql import SparkSession, functions as F\n", 24 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 25 | "from pyspark.ml.classification import MultilayerPerceptronClassifier\n", 26 | "from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer" 27 | ] 28 | }, 29 | { 30 | "cell_type": "markdown", 31 | "metadata": {}, 32 | "source": [ 33 | "# Initialize Spark Session" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "outputs": [], 41 | "source": [ 42 | "spark = SparkSession.builder.appName('pyspark-dl').getOrCreate()" 43 | ] 44 | }, 45 | { 46 | "cell_type": "markdown", 47 | "metadata": {}, 48 | "source": [ 49 | "# Read the Dataset" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": 3, 55 | "metadata": {}, 56 | "outputs": [], 57 | "source": [ 58 | "web_data = spark.read.csv('data_set.csv', header=True, inferSchema=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "name": "stdout", 68 | "output_type": "stream", 69 | "text": [ 70 | "root\n", 71 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 72 | " |-- Page_Views_Normalized: double (nullable = true)\n", 73 | " |-- Orders_Normalized: integer (nullable = true)\n", 74 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 75 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 76 | " |-- Email_Signup_Normalized: 
double (nullable = true)\n", 77 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 78 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 79 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 80 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 81 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 82 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 83 | " |-- Mapped_Site_Section: string (nullable = true)\n", 84 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 85 | " |-- Maped_Product_Name: string (nullable = true)\n", 86 | " |-- Mapped_Search_Term: string (nullable = true)\n", 87 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 88 | "\n" 89 | ] 90 | } 91 | ], 92 | "source": [ 93 | "web_data.printSchema()" 94 | ] 95 | }, 96 | { 97 | "cell_type": "markdown", 98 | "metadata": {}, 99 | "source": [ 100 | "# Rename Target Column" 101 | ] 102 | }, 103 | { 104 | "cell_type": "code", 105 | "execution_count": 5, 106 | "metadata": {}, 107 | "outputs": [], 108 | "source": [ 109 | "web_data_renamed = web_data.withColumnRenamed('Orders_Normalized', 'label')" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": {}, 116 | "outputs": [ 117 | { 118 | "name": "stdout", 119 | "output_type": "stream", 120 | "text": [ 121 | "root\n", 122 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 123 | " |-- Page_Views_Normalized: double (nullable = true)\n", 124 | " |-- label: integer (nullable = true)\n", 125 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 126 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 127 | " |-- Email_Signup_Normalized: double (nullable = true)\n", 128 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 129 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 130 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 131 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 132 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 133 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 134 | " |-- Mapped_Site_Section: string (nullable = true)\n", 135 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 136 | " |-- Maped_Product_Name: string (nullable = true)\n", 137 | " |-- Mapped_Search_Term: string (nullable = true)\n", 138 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 139 | "\n" 140 | ] 141 | } 142 | ], 143 | "source": [ 144 | "web_data_renamed.printSchema()" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": {}, 150 | "source": [ 151 | "# Split the dataset into Train, Validation and Test" 152 | ] 153 | }, 154 | { 155 | "cell_type": "code", 156 | "execution_count": 7, 157 | "metadata": {}, 158 | "outputs": [], 159 | "source": [ 160 | "train, validation, test = web_data_renamed.randomSplit([0.7, 0.2, 0.1], 1234)" 161 | ] 162 | }, 163 | { 164 | "cell_type": "markdown", 165 | "metadata": {}, 166 | "source": [ 167 | "# Build Pipeline" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": 8, 173 | "metadata": {}, 174 | "outputs": [], 175 | "source": [ 176 | "categorical_columns = [item[0] for item in web_data_renamed.dtypes if item[1].startswith('string')]\n", 177 | "numeric_columns = [item[0] for item in web_data_renamed.dtypes if item[1].startswith('double')]" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 9, 183 | "metadata": {}, 184 | 
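
The two list comprehensions above split the columns by their Spark type string: `string` columns are treated as categorical and `double` columns as numeric (the integer `label` column falls into neither list). A toy sketch with hypothetical columns:

from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('dtypes_demo').getOrCreate()
toy = spark.createDataFrame([('Chrome', 0.1, 1)],
                            ['browser', 'page_views', 'label'])

# df.dtypes is a list of (column_name, type_string) pairs
categorical = [c for c, t in toy.dtypes if t.startswith('string')]  # ['browser']
numeric = [c for c, t in toy.dtypes if t.startswith('double')]      # ['page_views']
print(categorical, numeric)
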
"outputs": [], 185 | "source": [ 186 | "indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]\n" 187 | ] 188 | }, 189 | { 190 | "cell_type": "code", 191 | "execution_count": 10, 192 | "metadata": {}, 193 | "outputs": [], 194 | "source": [ 195 | "featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol=\"features\")\n" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 11, 201 | "metadata": {}, 202 | "outputs": [], 203 | "source": [ 204 | "layers = [len(featuresCreator.getInputCols()), 4, 2, 2]\n", 205 | "\n", 206 | "classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)\n" 207 | ] 208 | }, 209 | { 210 | "cell_type": "code", 211 | "execution_count": 12, 212 | "metadata": {}, 213 | "outputs": [], 214 | "source": [ 215 | "pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])" 216 | ] 217 | }, 218 | { 219 | "cell_type": "markdown", 220 | "metadata": {}, 221 | "source": [ 222 | "# Fit Pipeline" 223 | ] 224 | }, 225 | { 226 | "cell_type": "code", 227 | "execution_count": 13, 228 | "metadata": {}, 229 | "outputs": [], 230 | "source": [ 231 | "model = pipeline.fit(train)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "markdown", 236 | "metadata": {}, 237 | "source": [ 238 | "# Get Pipeline Output" 239 | ] 240 | }, 241 | { 242 | "cell_type": "code", 243 | "execution_count": 14, 244 | "metadata": {}, 245 | "outputs": [], 246 | "source": [ 247 | "train_output_df = model.transform(train)\n", 248 | "validation_output_df = model.transform(validation)\n", 249 | "test_output_df = model.transform(test)" 250 | ] 251 | }, 252 | { 253 | "cell_type": "markdown", 254 | "metadata": {}, 255 | "source": [ 256 | "# Evaluate the Predictions" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": 15, 262 | "metadata": { 263 | "scrolled": true 264 | }, 265 | "outputs": [ 266 | { 267 | "name": "stdout", 268 | "output_type": "stream", 269 | "text": [ 270 | "Train weightedPrecision = 0.976101874447846\n", 271 | "Validation weightedPrecision = 0.9765821626938243\n", 272 | "Test weightedPrecision = 0.9747324280445043\n", 273 | "Train weightedRecall = 0.9755751041220662\n", 274 | "Validation weightedRecall = 0.9761613691931541\n", 275 | "Test weightedRecall = 0.9742582305920606\n", 276 | "Train accuracy = 0.975575104122066\n", 277 | "Validation accuracy = 0.976161369193154\n", 278 | "Test accuracy = 0.9742582305920607\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "train_predictionAndLabels = train_output_df.select(\"prediction\", \"label\")\n", 284 | "validation_predictionAndLabels = validation_output_df.select(\"prediction\", \"label\")\n", 285 | "test_predictionAndLabels = test_output_df.select(\"prediction\", \"label\")\n", 286 | "\n", 287 | "metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']\n", 288 | "\n", 289 | "for metric in metrics:\n", 290 | " evaluator = MulticlassClassificationEvaluator(metricName=metric)\n", 291 | " print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))\n", 292 | " print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))\n", 293 | " print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))" 294 | ] 295 | } 296 | ], 297 | "metadata": { 298 | "kernelspec": { 299 | "display_name": "Python 3", 300 | "language": "python", 
301 | "name": "python3" 302 | }, 303 | "language_info": { 304 | "codemirror_mode": { 305 | "name": "ipython", 306 | "version": 3 307 | }, 308 | "file_extension": ".py", 309 | "mimetype": "text/x-python", 310 | "name": "python", 311 | "nbconvert_exporter": "python", 312 | "pygments_lexer": "ipython3", 313 | "version": "3.6.3" 314 | } 315 | }, 316 | "nbformat": 4, 317 | "nbformat_minor": 2 318 | } 319 | -------------------------------------------------------------------------------- /chap_8/Multilayer_perceptron_spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Load the libraries" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "import os\n", 19 | "import numpy as np\n", 20 | "import pandas as pd\n", 21 | "from pyspark.sql.types import *\n", 22 | "from pyspark.ml import Pipeline\n", 23 | "from pyspark.sql import functions as f\n", 24 | "from pyspark.sql.functions import udf, StringType\n", 25 | "from pyspark.sql import SparkSession, functions as F\n", 26 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 27 | "from pyspark.ml.classification import MultilayerPerceptronClassifier\n", 28 | "from pyspark.ml.feature import OneHotEncoder, VectorAssembler, StringIndexer" 29 | ] 30 | }, 31 | { 32 | "cell_type": "markdown", 33 | "metadata": {}, 34 | "source": [ 35 | "# Initialize Spark Session" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 2, 41 | "metadata": { 42 | "collapsed": true 43 | }, 44 | "outputs": [], 45 | "source": [ 46 | "spark = SparkSession.builder.appName('deep_learning').getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "markdown", 51 | "metadata": {}, 52 | "source": [ 53 | "# Read the Dataset" 54 | ] 55 | }, 56 | { 57 | "cell_type": "code", 58 | "execution_count": 3, 59 | "metadata": { 60 | "collapsed": true 61 | }, 62 | "outputs": [], 63 | "source": [ 64 | "data = spark.read.csv('dl_data.csv', header=True, inferSchema=True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 4, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "root\n", 77 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 78 | " |-- Page_Views_Normalized: double (nullable = true)\n", 79 | " |-- Orders_Normalized: integer (nullable = true)\n", 80 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 81 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 82 | " |-- Email_Signup_Normalized: double (nullable = true)\n", 83 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 84 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 85 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 86 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 87 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 88 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 89 | " |-- Mapped_Site_Section: string (nullable = true)\n", 90 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 91 | " |-- Maped_Product_Name: string (nullable = true)\n", 92 | " |-- Mapped_Search_Term: string (nullable = true)\n", 93 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 94 | "\n" 95 | ] 96 | } 97 | ], 98 | "source": [ 99 | 
"data.printSchema()" 100 | ] 101 | }, 102 | { 103 | "cell_type": "markdown", 104 | "metadata": {}, 105 | "source": [ 106 | "# Rename Target Column" 107 | ] 108 | }, 109 | { 110 | "cell_type": "code", 111 | "execution_count": 5, 112 | "metadata": { 113 | "collapsed": true 114 | }, 115 | "outputs": [], 116 | "source": [ 117 | "data = data.withColumnRenamed('Orders_Normalized', 'label')" 118 | ] 119 | }, 120 | { 121 | "cell_type": "code", 122 | "execution_count": 6, 123 | "metadata": {}, 124 | "outputs": [ 125 | { 126 | "name": "stdout", 127 | "output_type": "stream", 128 | "text": [ 129 | "root\n", 130 | " |-- Visit_Number_Bucket: string (nullable = true)\n", 131 | " |-- Page_Views_Normalized: double (nullable = true)\n", 132 | " |-- label: integer (nullable = true)\n", 133 | " |-- Internal_Search_Successful_Normalized: double (nullable = true)\n", 134 | " |-- Internal_Search_Null_Normalized: double (nullable = true)\n", 135 | " |-- Email_Signup_Normalized: double (nullable = true)\n", 136 | " |-- Total_Seconds_Spent_Normalized: double (nullable = true)\n", 137 | " |-- Store_Locator_Search_Normalized: double (nullable = true)\n", 138 | " |-- Mapped_Last_Touch_Channel: string (nullable = true)\n", 139 | " |-- Mapped_Mobile_Device_Type: string (nullable = true)\n", 140 | " |-- Mapped_Browser_Type: string (nullable = true)\n", 141 | " |-- Mapped_Entry_Pages: string (nullable = true)\n", 142 | " |-- Mapped_Site_Section: string (nullable = true)\n", 143 | " |-- Mapped_Promo_Code: string (nullable = true)\n", 144 | " |-- Maped_Product_Name: string (nullable = true)\n", 145 | " |-- Mapped_Search_Term: string (nullable = true)\n", 146 | " |-- Mapped_Product_Collection: string (nullable = true)\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "data.printSchema()" 153 | ] 154 | }, 155 | { 156 | "cell_type": "markdown", 157 | "metadata": {}, 158 | "source": [ 159 | "# Split the dataset into Train, Validation and Test" 160 | ] 161 | }, 162 | { 163 | "cell_type": "code", 164 | "execution_count": 7, 165 | "metadata": { 166 | "collapsed": true 167 | }, 168 | "outputs": [], 169 | "source": [ 170 | "train, validation, test = data.randomSplit([0.7, 0.2, 0.1], 1234)" 171 | ] 172 | }, 173 | { 174 | "cell_type": "markdown", 175 | "metadata": {}, 176 | "source": [ 177 | "# Build Pipeline" 178 | ] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": 8, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [ 188 | "categorical_columns = [item[0] for item in data.dtypes if item[1].startswith('string')]\n", 189 | "numeric_columns = [item[0] for item in data.dtypes if item[1].startswith('double')]" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 9, 195 | "metadata": { 196 | "collapsed": true 197 | }, 198 | "outputs": [], 199 | "source": [ 200 | "indexers = [StringIndexer(inputCol=column, outputCol='{0}_index'.format(column)) for column in categorical_columns]\n" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 10, 206 | "metadata": { 207 | "collapsed": true 208 | }, 209 | "outputs": [], 210 | "source": [ 211 | "featuresCreator = VectorAssembler(inputCols=[indexer.getOutputCol() for indexer in indexers] + numeric_columns, outputCol=\"features\")\n" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 11, 217 | "metadata": { 218 | "collapsed": true 219 | }, 220 | "outputs": [], 221 | "source": [ 222 | "layers = [len(featuresCreator.getInputCols()), 4, 2, 2]\n", 223 | "\n", 
224 | "classifier = MultilayerPerceptronClassifier(labelCol='label', featuresCol='features', maxIter=100, layers=layers, blockSize=128, seed=1234)\n" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 12, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "pipeline = Pipeline(stages=indexers + [featuresCreator, classifier])" 236 | ] 237 | }, 238 | { 239 | "cell_type": "markdown", 240 | "metadata": {}, 241 | "source": [ 242 | "# Fit Pipeline" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 13, 248 | "metadata": { 249 | "collapsed": true 250 | }, 251 | "outputs": [], 252 | "source": [ 253 | "model = pipeline.fit(train)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "markdown", 258 | "metadata": {}, 259 | "source": [ 260 | "# Get Pipeline Output" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 14, 266 | "metadata": { 267 | "collapsed": true 268 | }, 269 | "outputs": [], 270 | "source": [ 271 | "train_output_df = model.transform(train)\n", 272 | "validation_output_df = model.transform(validation)\n", 273 | "test_output_df = model.transform(test)" 274 | ] 275 | }, 276 | { 277 | "cell_type": "markdown", 278 | "metadata": {}, 279 | "source": [ 280 | "# Evaluate the Predictions" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 15, 286 | "metadata": { 287 | "scrolled": true 288 | }, 289 | "outputs": [ 290 | { 291 | "name": "stdout", 292 | "output_type": "stream", 293 | "text": [ 294 | "Train weightedPrecision = 0.976101874447846\n", 295 | "Validation weightedPrecision = 0.9765821626938243\n", 296 | "Test weightedPrecision = 0.9747324280445043\n", 297 | "Train weightedRecall = 0.9755751041220662\n", 298 | "Validation weightedRecall = 0.9761613691931541\n", 299 | "Test weightedRecall = 0.9742582305920606\n", 300 | "Train accuracy = 0.975575104122066\n", 301 | "Validation accuracy = 0.976161369193154\n", 302 | "Test accuracy = 0.9742582305920607\n" 303 | ] 304 | } 305 | ], 306 | "source": [ 307 | "train_predictionAndLabels = train_output_df.select(\"prediction\", \"label\")\n", 308 | "validation_predictionAndLabels = validation_output_df.select(\"prediction\", \"label\")\n", 309 | "test_predictionAndLabels = test_output_df.select(\"prediction\", \"label\")\n", 310 | "\n", 311 | "metrics = ['weightedPrecision', 'weightedRecall', 'accuracy']\n", 312 | "\n", 313 | "for metric in metrics:\n", 314 | " evaluator = MulticlassClassificationEvaluator(metricName=metric)\n", 315 | " print('Train ' + metric + ' = ' + str(evaluator.evaluate(train_predictionAndLabels)))\n", 316 | " print('Validation ' + metric + ' = ' + str(evaluator.evaluate(validation_predictionAndLabels)))\n", 317 | " print('Test ' + metric + ' = ' + str(evaluator.evaluate(test_predictionAndLabels)))" 318 | ] 319 | } 320 | ], 321 | "metadata": { 322 | "kernelspec": { 323 | "display_name": "Python 3", 324 | "language": "python", 325 | "name": "python3" 326 | }, 327 | "language_info": { 328 | "codemirror_mode": { 329 | "name": "ipython", 330 | "version": 3 331 | }, 332 | "file_extension": ".py", 333 | "mimetype": "text/x-python", 334 | "name": "python", 335 | "nbconvert_exporter": "python", 336 | "pygments_lexer": "ipython3", 337 | "version": "3.7.0" 338 | } 339 | }, 340 | "nbformat": 4, 341 | "nbformat_minor": 2 342 | } 343 | -------------------------------------------------------------------------------- /chap_5/Classification_using_MLlib.ipynb: 
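
The fitted chap_8 pipeline above (`model = pipeline.fit(train)`) can be persisted and reloaded like any PipelineModel; a minimal sketch (the save path is hypothetical):

from pyspark.ml import PipelineModel

model.write().overwrite().save('mlp_pipeline_model')
reloaded = PipelineModel.load('mlp_pipeline_model')
reloaded.transform(test).select('prediction', 'label').show(5)
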
-------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "#import SparkSession\n", 12 | "from pyspark.sql import SparkSession\n", 13 | "spark=SparkSession.builder.appName('binary_class').getOrCreate()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": { 20 | "collapsed": true 21 | }, 22 | "outputs": [], 23 | "source": [ 24 | "#read the dataset\n", 25 | "df=spark.read.csv('classification_data.csv',inferSchema=True,header=True)" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": true 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "#check the shape of the data \n", 37 | "print((df.count(),len(df.columns)))" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": null, 43 | "metadata": { 44 | "collapsed": true 45 | }, 46 | "outputs": [], 47 | "source": [ 48 | "#printSchema\n", 49 | "df.printSchema()" 50 | ] 51 | }, 52 | { 53 | "cell_type": "code", 54 | "execution_count": null, 55 | "metadata": { 56 | "collapsed": true 57 | }, 58 | "outputs": [], 59 | "source": [ 60 | "#number of columns in dataset\n", 61 | "df.columns" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": null, 67 | "metadata": { 68 | "collapsed": true 69 | }, 70 | "outputs": [], 71 | "source": [ 72 | "#view the dataset\n", 73 | "df.show(5)" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": null, 79 | "metadata": { 80 | "collapsed": true 81 | }, 82 | "outputs": [], 83 | "source": [ 84 | "#Exploratory Data Analysis\n", 85 | "df.describe().show()\n" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": true 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "df.groupBy('label').count().show()" 97 | ] 98 | }, 99 | { 100 | "cell_type": "code", 101 | "execution_count": null, 102 | "metadata": { 103 | "collapsed": true 104 | }, 105 | "outputs": [], 106 | "source": [ 107 | "df.groupBy('loan_purpose').count().show()" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": null, 113 | "metadata": { 114 | "collapsed": true 115 | }, 116 | "outputs": [], 117 | "source": [ 118 | "#converting categorical data to numerical form" 119 | ] 120 | }, 121 | { 122 | "cell_type": "code", 123 | "execution_count": 3, 124 | "metadata": { 125 | "collapsed": true 126 | }, 127 | "outputs": [], 128 | "source": [ 129 | "#import required libraries\n", 130 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", 131 | "\n" 132 | ] 133 | }, 134 | { 135 | "cell_type": "code", 136 | "execution_count": 4, 137 | "metadata": { 138 | "collapsed": true 139 | }, 140 | "outputs": [], 141 | "source": [ 142 | "loan_purpose_indexer = StringIndexer(inputCol=\"loan_purpose\", outputCol=\"loan_index\").fit(df)\n", 143 | "df = loan_purpose_indexer.transform(df)\n", 144 | "loan_encoder = OneHotEncoder(inputCol=\"loan_index\", outputCol=\"loan_purpose_vec\")\n", 145 | "df = loan_encoder.transform(df)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": null, 151 | "metadata": { 152 | "collapsed": true 153 | }, 154 | "outputs": [], 155 | "source": [ 156 | "df.select(['loan_purpose','loan_index','loan_purpose_vec']).show(3,False)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | 
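
The two steps above turn `loan_purpose` into model-ready numbers: `StringIndexer` assigns each category a frequency-ranked index, and `OneHotEncoder` (the Spark 2.x transformer API used in this notebook) expands that index into a sparse vector. A toy sketch with hypothetical values:

from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder, StringIndexer

spark = SparkSession.builder.appName('ohe_demo').getOrCreate()
toy = spark.createDataFrame([('home',), ('car',), ('home',), ('education',)],
                            ['loan_purpose'])

indexed = (StringIndexer(inputCol='loan_purpose', outputCol='loan_index')
           .fit(toy).transform(toy))
encoded = OneHotEncoder(inputCol='loan_index',
                        outputCol='loan_purpose_vec').transform(indexed)
encoded.show(truncate=False)
# 'home' (most frequent) -> index 0.0 -> sparse vector (2,[0],[1.0]);
# the last index is dropped by default, so 3 categories give length-2 vectors
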
"execution_count": 5, 162 | "metadata": { 163 | "collapsed": true 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "from pyspark.ml.feature import VectorAssembler" 168 | ] 169 | }, 170 | { 171 | "cell_type": "code", 172 | "execution_count": null, 173 | "metadata": { 174 | "collapsed": true 175 | }, 176 | "outputs": [], 177 | "source": [ 178 | "df.columns" 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 6, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "df_assembler = VectorAssembler(inputCols=['is_first_loan',\n", 190 | " 'total_credit_card_limit',\n", 191 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 192 | " 'saving_amount',\n", 193 | " 'checking_amount',\n", 194 | " 'is_employed',\n", 195 | " 'yearly_salary',\n", 196 | " 'age',\n", 197 | " 'dependent_number',\n", 198 | " 'loan_purpose_vec'], outputCol=\"features\")\n", 199 | "df = df_assembler.transform(df)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": null, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "df.printSchema()" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": null, 216 | "metadata": { 217 | "collapsed": true 218 | }, 219 | "outputs": [], 220 | "source": [ 221 | "df.select(['features','label']).show(10,False)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 7, 227 | "metadata": { 228 | "collapsed": true 229 | }, 230 | "outputs": [], 231 | "source": [ 232 | "#select data for building model\n", 233 | "model_df=df.select(['features','label'])" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": null, 239 | "metadata": { 240 | "collapsed": true 241 | }, 242 | "outputs": [], 243 | "source": [ 244 | "from pyspark.ml.classification import LogisticRegression" 245 | ] 246 | }, 247 | { 248 | "cell_type": "code", 249 | "execution_count": 8, 250 | "metadata": { 251 | "collapsed": true 252 | }, 253 | "outputs": [], 254 | "source": [ 255 | "#split the data \n", 256 | "training_df,test_df=model_df.randomSplit([0.75,0.25])" 257 | ] 258 | }, 259 | { 260 | "cell_type": "code", 261 | "execution_count": null, 262 | "metadata": { 263 | "collapsed": true 264 | }, 265 | "outputs": [], 266 | "source": [ 267 | "training_df.count()" 268 | ] 269 | }, 270 | { 271 | "cell_type": "code", 272 | "execution_count": null, 273 | "metadata": { 274 | "collapsed": true 275 | }, 276 | "outputs": [], 277 | "source": [ 278 | "training_df.groupBy('label').count().show()" 279 | ] 280 | }, 281 | { 282 | "cell_type": "code", 283 | "execution_count": null, 284 | "metadata": { 285 | "collapsed": true 286 | }, 287 | "outputs": [], 288 | "source": [ 289 | "test_df.count()" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": null, 295 | "metadata": { 296 | "collapsed": true 297 | }, 298 | "outputs": [], 299 | "source": [ 300 | "test_df.groupBy('label').count().show()" 301 | ] 302 | }, 303 | { 304 | "cell_type": "code", 305 | "execution_count": null, 306 | "metadata": { 307 | "collapsed": true 308 | }, 309 | "outputs": [], 310 | "source": [ 311 | "log_reg=LogisticRegression().fit(training_df)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "collapsed": true 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "#Training Results" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 
329 | "collapsed": true 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "lr_summary=log_reg.summary" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "lr_summary.accuracy" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [ 355 | "lr_summary.areaUnderROC" 356 | ] 357 | }, 358 | { 359 | "cell_type": "code", 360 | "execution_count": null, 361 | "metadata": { 362 | "collapsed": true 363 | }, 364 | "outputs": [], 365 | "source": [ 366 | "print(lr_summary.precisionByLabel)" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": null, 372 | "metadata": { 373 | "collapsed": true 374 | }, 375 | "outputs": [], 376 | "source": [ 377 | "print(lr_summary.recallByLabel)" 378 | ] 379 | }, 380 | { 381 | "cell_type": "code", 382 | "execution_count": null, 383 | "metadata": { 384 | "collapsed": true 385 | }, 386 | "outputs": [], 387 | "source": [ 388 | "predictions = log_reg.transform(test_df)\n", 389 | "predictions.show(10)\n" 390 | ] 391 | }, 392 | { 393 | "cell_type": "code", 394 | "execution_count": null, 395 | "metadata": { 396 | "collapsed": true 397 | }, 398 | "outputs": [], 399 | "source": [ 400 | "model_predictions = log_reg.transform(test_df)\n" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": null, 406 | "metadata": { 407 | "collapsed": true 408 | }, 409 | "outputs": [], 410 | "source": [ 411 | "model_predictions = log_reg.evaluate(test_df)\n" 412 | ] 413 | }, 414 | { 415 | "cell_type": "code", 416 | "execution_count": null, 417 | "metadata": { 418 | "collapsed": true 419 | }, 420 | "outputs": [], 421 | "source": [ 422 | "model_predictions.accuracy" 423 | ] 424 | }, 425 | { 426 | "cell_type": "code", 427 | "execution_count": null, 428 | "metadata": { 429 | "collapsed": true 430 | }, 431 | "outputs": [], 432 | "source": [ 433 | "model_predictions.weightedPrecision" 434 | ] 435 | }, 436 | { 437 | "cell_type": "code", 438 | "execution_count": null, 439 | "metadata": { 440 | "collapsed": true 441 | }, 442 | "outputs": [], 443 | "source": [ 444 | "model_predictions.recallByLabel" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": true 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "print(model_predictions.precisionByLabel)" 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": true 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "model_predictions.areaUnderROC" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 9, 472 | "metadata": { 473 | "collapsed": true 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "from pyspark.ml.classification import RandomForestClassifier\n", 478 | "rf = RandomForestClassifier()\n", 479 | "rf_model = rf.fit(training_df)\n" 480 | ] 481 | }, 482 | { 483 | "cell_type": "code", 484 | "execution_count": 10, 485 | "metadata": { 486 | "collapsed": true 487 | }, 488 | "outputs": [], 489 | "source": [ 490 | "model_predictions = rf_model.transform(test_df)\n" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 11, 496 | "metadata": { 497 | "collapsed": true 498 | }, 499 | "outputs": [], 500 | "source": [ 501 | "from pyspark.ml.tuning import ParamGridBuilder, 
CrossValidator\n", 502 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 503 | "\n", 504 | "evaluator = BinaryClassificationEvaluator()\n", 505 | "\n", 506 | "rf = RandomForestClassifier()\n", 507 | "paramGrid = (ParamGridBuilder()\n", 508 | " .addGrid(rf.maxDepth, [5,10,20,25,30])\n", 509 | " .addGrid(rf.maxBins, [20,30,40 ])\n", 510 | " .addGrid(rf.numTrees, [5, 20,50])\n", 511 | " .build())\n", 512 | "cv = CrossValidator(estimator=rf, estimatorParamMaps=paramGrid, evaluator=evaluator, numFolds=5)\n", 513 | "cv_model = cv.fit(training_df)" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 12, 519 | "metadata": { 520 | "collapsed": true 521 | }, 522 | "outputs": [], 523 | "source": [ 524 | "best_rf_model = cv_model.bestModel" 525 | ] 526 | }, 527 | { 528 | "cell_type": "code", 529 | "execution_count": 13, 530 | "metadata": { 531 | "collapsed": true 532 | }, 533 | "outputs": [], 534 | "source": [ 535 | "# Generate predictions for entire dataset\n", 536 | "model_predictions = best_rf_model.transform(test_df)" 537 | ] 538 | }, 539 | { 540 | "cell_type": "code", 541 | "execution_count": 14, 542 | "metadata": { 543 | "collapsed": true 544 | }, 545 | "outputs": [], 546 | "source": [ 547 | "true_pos=model_predictions.filter(model_predictions['label']==1).filter(model_predictions['prediction']==1).count()\n", 548 | "actual_pos=model_predictions.filter(model_predictions['label']==1).count()\n", 549 | "pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()" 550 | ] 551 | }, 552 | { 553 | "cell_type": "code", 554 | "execution_count": 15, 555 | "metadata": {}, 556 | "outputs": [ 557 | { 558 | "data": { 559 | "text/plain": [ 560 | "0.912426614481409" 561 | ] 562 | }, 563 | "execution_count": 15, 564 | "metadata": {}, 565 | "output_type": "execute_result" 566 | } 567 | ], 568 | "source": [ 569 | "#Recall \n", 570 | "float(true_pos)/(actual_pos)" 571 | ] 572 | }, 573 | { 574 | "cell_type": "code", 575 | "execution_count": 16, 576 | "metadata": {}, 577 | "outputs": [ 578 | { 579 | "data": { 580 | "text/plain": [ 581 | "0.8562901744719926" 582 | ] 583 | }, 584 | "execution_count": 16, 585 | "metadata": {}, 586 | "output_type": "execute_result" 587 | } 588 | ], 589 | "source": [ 590 | "#Precision on test Data \n", 591 | "float(true_pos)/(pred_pos)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": null, 597 | "metadata": { 598 | "collapsed": true 599 | }, 600 | "outputs": [], 601 | "source": [] 602 | } 603 | ], 604 | "metadata": { 605 | "kernelspec": { 606 | "display_name": "Python 3", 607 | "language": "python", 608 | "name": "python3" 609 | }, 610 | "language_info": { 611 | "codemirror_mode": { 612 | "name": "ipython", 613 | "version": 3 614 | }, 615 | "file_extension": ".py", 616 | "mimetype": "text/x-python", 617 | "name": "python", 618 | "nbconvert_exporter": "python", 619 | "pygments_lexer": "ipython3", 620 | "version": "3.7.0" 621 | } 622 | }, 623 | "nbformat": 4, 624 | "nbformat_minor": 2 625 | } 626 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming app-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | 
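
The grid above enumerates 5 x 3 x 3 = 45 parameter combinations, and with `numFolds=5` each one is fitted five times; `cv_model.avgMetrics` then holds one score per combination, averaged over the folds (the `BinaryClassificationEvaluator` default metric is area under ROC). A hedged sketch for inspecting the winner:

import numpy as np

print(len(cv_model.avgMetrics))               # 45 combinations
best_idx = int(np.argmax(cv_model.avgMetrics))
print(paramGrid[best_idx])                    # winning parameter map
print(cv_model.avgMetrics[best_idx])          # its mean AUC across folds
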
"spark=SparkSession.builder.appName('structured_streaming').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark.sql.functions as F\n", 21 | "from pyspark.sql.types import *" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#create sample dataset\n", 31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n", 32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#define schema for input data\n", 42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n", 43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"csv_folder\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "root\n", 56 | " |-- user_id: string (nullable = true)\n", 57 | " |-- app: string (nullable = true)\n", 58 | " |-- time_in_secs: integer (nullable = true)\n", 59 | " |-- age: integer (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "data.printSchema()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "app_count=data.groupBy('app').count()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 10, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
appcount
0Insta1
1FB1
2Twitter1
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " app count\n", 125 | "0 Insta 1\n", 126 | "1 FB 1\n", 127 | "2 Twitter 1" 128 | ] 129 | }, 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "spark.sql(\"select * from count_query \").toPandas().head(5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "fb_data=data.filter(data['app']=='FB')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 11, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 12, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/html": [ 174 | "
\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
user_idavg(time_in_secs)
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | "Empty DataFrame\n", 190 | "Columns: [user_id, avg(time_in_secs)]\n", 191 | "Index: []" 192 | ] 193 | }, 194 | "execution_count": 12, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 5, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n", 210 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 14, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/html": [ 221 | "
\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | " user_id avg(time_in_secs)\n", 242 | "0 XN203 300.0" 243 | ] 244 | }, 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 261 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 16, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " user_id avg(time_in_secs)\n", 293 | "0 XN203 300.0" 294 | ] 295 | }, 296 | "execution_count": 16, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 312 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 18, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "#app wise time spent\n", 322 | "\n", 323 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 19, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 27, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " app total_time\n", 374 | "0 FB 3410\n", 375 | "1 Insta 560\n", 376 | "2 Twitter 210" 377 | ] 378 | }, 379 | "execution_count": 27, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 11, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 395 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 26, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/html": [ 406 | "
\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 433 | "
" 434 | ], 435 | "text/plain": [ 436 | " app total_time\n", 437 | "0 FB 3410\n", 438 | "1 Insta 560\n", 439 | "2 Twitter 210" 440 | ] 441 | }, 442 | "execution_count": 26, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 28, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# app wise mean age \n", 458 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "scrolled": true 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 30, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/html": [ 480 | "
\n", 481 | "\n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 507 | "
" 508 | ], 509 | "text/plain": [ 510 | " app mean_age\n", 511 | "0 Twitter 38.500000\n", 512 | "1 FB 30.571429\n", 513 | "2 Insta 25.500000" 514 | ] 515 | }, 516 | "execution_count": 30, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 15, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n", 532 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 32, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/html": [ 543 | "
\n", 544 | "\n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 570 | "
" 571 | ], 572 | "text/plain": [ 573 | " app mean_age\n", 574 | "0 Twitter 38.500000\n", 575 | "1 FB 30.571429\n", 576 | "2 Insta 25.500000" 577 | ] 578 | }, 579 | "execution_count": 32, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 6, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "name": "stdout", 595 | "output_type": "stream", 596 | "text": [ 597 | "+-------+---------+\n", 598 | "| app|full_name|\n", 599 | "+-------+---------+\n", 600 | "| FB| FACEBOOK|\n", 601 | "| Insta|INSTAGRAM|\n", 602 | "|Twitter| TWITTER|\n", 603 | "+-------+---------+\n", 604 | "\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "# Join static dataframe with streaming dataframe\n", 610 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n", 611 | "app_df.show()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 7, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "app_stream_df=data.join(app_df,'app')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 8, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 14, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "data": { 639 | "text/html": [ 640 | "
\n", 641 | "\n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | "
appuser_idtime_in_secsagefull_name
0FBXN2011019FACEBOOK
1FBXN20310030FACEBOOK
2FBXN20330030FACEBOOK
3FBXN202200045FACEBOOK
4InstaXN20250045INSTAGRAM
5TwitterXN2011019TWITTER
6FBXN20350030FACEBOOK
7InstaXN2013019INSTAGRAM
8TwitterXN20210045TWITTER
9FBXN20350030FACEBOOK
10InstaXN2013019INSTAGRAM
11TwitterXN20210045TWITTER
\n", 751 | "
" 752 | ], 753 | "text/plain": [ 754 | " app user_id time_in_secs age full_name\n", 755 | "0 FB XN201 10 19 FACEBOOK\n", 756 | "1 FB XN203 100 30 FACEBOOK\n", 757 | "2 FB XN203 300 30 FACEBOOK\n", 758 | "3 FB XN202 2000 45 FACEBOOK\n", 759 | "4 Insta XN202 500 45 INSTAGRAM\n", 760 | "5 Twitter XN201 10 19 TWITTER\n", 761 | "6 FB XN203 500 30 FACEBOOK\n", 762 | "7 Insta XN201 30 19 INSTAGRAM\n", 763 | "8 Twitter XN202 100 45 TWITTER\n", 764 | "9 FB XN203 500 30 FACEBOOK\n", 765 | "10 Insta XN201 30 19 INSTAGRAM\n", 766 | "11 Twitter XN202 100 45 TWITTER" 767 | ] 768 | }, 769 | "execution_count": 14, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "spark.sql(\"select * from join_query \").toPandas().head(50)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": null, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [] 791 | } 792 | ], 793 | "metadata": { 794 | "kernelspec": { 795 | "display_name": "Python 3", 796 | "language": "python", 797 | "name": "python3" 798 | }, 799 | "language_info": { 800 | "codemirror_mode": { 801 | "name": "ipython", 802 | "version": 3 803 | }, 804 | "file_extension": ".py", 805 | "mimetype": "text/x-python", 806 | "name": "python", 807 | "nbconvert_exporter": "python", 808 | "pygments_lexer": "ipython3", 809 | "version": "3.6.3" 810 | } 811 | }, 812 | "nbformat": 4, 813 | "nbformat_minor": 2 814 | } 815 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Spark Structured Streaming demo-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('structured_streaming').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark.sql.functions as F\n", 21 | "from pyspark.sql.types import *" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#create sample dataset\n", 31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n", 32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#define schema for input data\n", 42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n", 43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"csv_folder\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "root\n", 56 | " |-- user_id: string (nullable = true)\n", 57 | " |-- app: string (nullable = true)\n", 58 | " |-- time_in_secs: integer (nullable = true)\n", 59 | " |-- age: integer (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | 
"source": [ 65 | "data.printSchema()" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "app_count=data.groupBy('app').count()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 10, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
appcount
0Insta1
1FB1
2Twitter1
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " app count\n", 125 | "0 Insta 1\n", 126 | "1 FB 1\n", 127 | "2 Twitter 1" 128 | ] 129 | }, 130 | "execution_count": 10, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "spark.sql(\"select * from count_query \").toPandas().head(5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "fb_data=data.filter(data['app']=='FB')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 11, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 12, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/html": [ 174 | "
\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | "
user_idavg(time_in_secs)
\n", 186 | "
" 187 | ], 188 | "text/plain": [ 189 | "Empty DataFrame\n", 190 | "Columns: [user_id, avg(time_in_secs)]\n", 191 | "Index: []" 192 | ] 193 | }, 194 | "execution_count": 12, 195 | "metadata": {}, 196 | "output_type": "execute_result" 197 | } 198 | ], 199 | "source": [ 200 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 201 | ] 202 | }, 203 | { 204 | "cell_type": "code", 205 | "execution_count": 5, 206 | "metadata": {}, 207 | "outputs": [], 208 | "source": [ 209 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n", 210 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "code", 215 | "execution_count": 14, 216 | "metadata": {}, 217 | "outputs": [ 218 | { 219 | "data": { 220 | "text/html": [ 221 | "
\n", 222 | "\n", 223 | " \n", 224 | " \n", 225 | " \n", 226 | " \n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 238 | "
" 239 | ], 240 | "text/plain": [ 241 | " user_id avg(time_in_secs)\n", 242 | "0 XN203 300.0" 243 | ] 244 | }, 245 | "execution_count": 14, 246 | "metadata": {}, 247 | "output_type": "execute_result" 248 | } 249 | ], 250 | "source": [ 251 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 252 | ] 253 | }, 254 | { 255 | "cell_type": "code", 256 | "execution_count": 12, 257 | "metadata": {}, 258 | "outputs": [], 259 | "source": [ 260 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 261 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 16, 267 | "metadata": {}, 268 | "outputs": [ 269 | { 270 | "data": { 271 | "text/html": [ 272 | "
\n", 273 | "\n", 274 | " \n", 275 | " \n", 276 | " \n", 277 | " \n", 278 | " \n", 279 | " \n", 280 | " \n", 281 | " \n", 282 | " \n", 283 | " \n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 289 | "
" 290 | ], 291 | "text/plain": [ 292 | " user_id avg(time_in_secs)\n", 293 | "0 XN203 300.0" 294 | ] 295 | }, 296 | "execution_count": 16, 297 | "metadata": {}, 298 | "output_type": "execute_result" 299 | } 300 | ], 301 | "source": [ 302 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 303 | ] 304 | }, 305 | { 306 | "cell_type": "code", 307 | "execution_count": 13, 308 | "metadata": {}, 309 | "outputs": [], 310 | "source": [ 311 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 312 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 313 | ] 314 | }, 315 | { 316 | "cell_type": "code", 317 | "execution_count": 18, 318 | "metadata": {}, 319 | "outputs": [], 320 | "source": [ 321 | "#app wise time spent\n", 322 | "\n", 323 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)" 324 | ] 325 | }, 326 | { 327 | "cell_type": "code", 328 | "execution_count": 19, 329 | "metadata": {}, 330 | "outputs": [], 331 | "source": [ 332 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 27, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "data": { 342 | "text/html": [ 343 | "
\n", 344 | "\n", 345 | " \n", 346 | " \n", 347 | " \n", 348 | " \n", 349 | " \n", 350 | " \n", 351 | " \n", 352 | " \n", 353 | " \n", 354 | " \n", 355 | " \n", 356 | " \n", 357 | " \n", 358 | " \n", 359 | " \n", 360 | " \n", 361 | " \n", 362 | " \n", 363 | " \n", 364 | " \n", 365 | " \n", 366 | " \n", 367 | " \n", 368 | " \n", 369 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 370 | "
" 371 | ], 372 | "text/plain": [ 373 | " app total_time\n", 374 | "0 FB 3410\n", 375 | "1 Insta 560\n", 376 | "2 Twitter 210" 377 | ] 378 | }, 379 | "execution_count": 27, 380 | "metadata": {}, 381 | "output_type": "execute_result" 382 | } 383 | ], 384 | "source": [ 385 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 11, 391 | "metadata": {}, 392 | "outputs": [], 393 | "source": [ 394 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 395 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 26, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "data": { 405 | "text/html": [ 406 | "
\n", 407 | "\n", 408 | " \n", 409 | " \n", 410 | " \n", 411 | " \n", 412 | " \n", 413 | " \n", 414 | " \n", 415 | " \n", 416 | " \n", 417 | " \n", 418 | " \n", 419 | " \n", 420 | " \n", 421 | " \n", 422 | " \n", 423 | " \n", 424 | " \n", 425 | " \n", 426 | " \n", 427 | " \n", 428 | " \n", 429 | " \n", 430 | " \n", 431 | " \n", 432 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 433 | "
" 434 | ], 435 | "text/plain": [ 436 | " app total_time\n", 437 | "0 FB 3410\n", 438 | "1 Insta 560\n", 439 | "2 Twitter 210" 440 | ] 441 | }, 442 | "execution_count": 26, 443 | "metadata": {}, 444 | "output_type": "execute_result" 445 | } 446 | ], 447 | "source": [ 448 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 449 | ] 450 | }, 451 | { 452 | "cell_type": "code", 453 | "execution_count": 28, 454 | "metadata": {}, 455 | "outputs": [], 456 | "source": [ 457 | "# app wise mean age \n", 458 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n" 459 | ] 460 | }, 461 | { 462 | "cell_type": "code", 463 | "execution_count": null, 464 | "metadata": { 465 | "scrolled": true 466 | }, 467 | "outputs": [], 468 | "source": [ 469 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())" 470 | ] 471 | }, 472 | { 473 | "cell_type": "code", 474 | "execution_count": 30, 475 | "metadata": {}, 476 | "outputs": [ 477 | { 478 | "data": { 479 | "text/html": [ 480 | "
\n", 481 | "\n", 482 | " \n", 483 | " \n", 484 | " \n", 485 | " \n", 486 | " \n", 487 | " \n", 488 | " \n", 489 | " \n", 490 | " \n", 491 | " \n", 492 | " \n", 493 | " \n", 494 | " \n", 495 | " \n", 496 | " \n", 497 | " \n", 498 | " \n", 499 | " \n", 500 | " \n", 501 | " \n", 502 | " \n", 503 | " \n", 504 | " \n", 505 | " \n", 506 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 507 | "
" 508 | ], 509 | "text/plain": [ 510 | " app mean_age\n", 511 | "0 Twitter 38.500000\n", 512 | "1 FB 30.571429\n", 513 | "2 Insta 25.500000" 514 | ] 515 | }, 516 | "execution_count": 30, 517 | "metadata": {}, 518 | "output_type": "execute_result" 519 | } 520 | ], 521 | "source": [ 522 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 15, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n", 532 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 533 | ] 534 | }, 535 | { 536 | "cell_type": "code", 537 | "execution_count": 32, 538 | "metadata": {}, 539 | "outputs": [ 540 | { 541 | "data": { 542 | "text/html": [ 543 | "
\n", 544 | "\n", 545 | " \n", 546 | " \n", 547 | " \n", 548 | " \n", 549 | " \n", 550 | " \n", 551 | " \n", 552 | " \n", 553 | " \n", 554 | " \n", 555 | " \n", 556 | " \n", 557 | " \n", 558 | " \n", 559 | " \n", 560 | " \n", 561 | " \n", 562 | " \n", 563 | " \n", 564 | " \n", 565 | " \n", 566 | " \n", 567 | " \n", 568 | " \n", 569 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 570 | "
" 571 | ], 572 | "text/plain": [ 573 | " app mean_age\n", 574 | "0 Twitter 38.500000\n", 575 | "1 FB 30.571429\n", 576 | "2 Insta 25.500000" 577 | ] 578 | }, 579 | "execution_count": 32, 580 | "metadata": {}, 581 | "output_type": "execute_result" 582 | } 583 | ], 584 | "source": [ 585 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 586 | ] 587 | }, 588 | { 589 | "cell_type": "code", 590 | "execution_count": 6, 591 | "metadata": {}, 592 | "outputs": [ 593 | { 594 | "name": "stdout", 595 | "output_type": "stream", 596 | "text": [ 597 | "+-------+---------+\n", 598 | "| app|full_name|\n", 599 | "+-------+---------+\n", 600 | "| FB| FACEBOOK|\n", 601 | "| Insta|INSTAGRAM|\n", 602 | "|Twitter| TWITTER|\n", 603 | "+-------+---------+\n", 604 | "\n" 605 | ] 606 | } 607 | ], 608 | "source": [ 609 | "# Join static dataframe with streaming dataframe\n", 610 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n", 611 | "app_df.show()" 612 | ] 613 | }, 614 | { 615 | "cell_type": "code", 616 | "execution_count": 7, 617 | "metadata": {}, 618 | "outputs": [], 619 | "source": [ 620 | "app_stream_df=data.join(app_df,'app')" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 8, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())" 630 | ] 631 | }, 632 | { 633 | "cell_type": "code", 634 | "execution_count": 14, 635 | "metadata": {}, 636 | "outputs": [ 637 | { 638 | "data": { 639 | "text/html": [ 640 | "
\n", 641 | "\n", 642 | " \n", 643 | " \n", 644 | " \n", 645 | " \n", 646 | " \n", 647 | " \n", 648 | " \n", 649 | " \n", 650 | " \n", 651 | " \n", 652 | " \n", 653 | " \n", 654 | " \n", 655 | " \n", 656 | " \n", 657 | " \n", 658 | " \n", 659 | " \n", 660 | " \n", 661 | " \n", 662 | " \n", 663 | " \n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | "
appuser_idtime_in_secsagefull_name
0FBXN2011019FACEBOOK
1FBXN20310030FACEBOOK
2FBXN20330030FACEBOOK
3FBXN202200045FACEBOOK
4InstaXN20250045INSTAGRAM
5TwitterXN2011019TWITTER
6FBXN20350030FACEBOOK
7InstaXN2013019INSTAGRAM
8TwitterXN20210045TWITTER
9FBXN20350030FACEBOOK
10InstaXN2013019INSTAGRAM
11TwitterXN20210045TWITTER
\n", 751 | "
" 752 | ], 753 | "text/plain": [ 754 | " app user_id time_in_secs age full_name\n", 755 | "0 FB XN201 10 19 FACEBOOK\n", 756 | "1 FB XN203 100 30 FACEBOOK\n", 757 | "2 FB XN203 300 30 FACEBOOK\n", 758 | "3 FB XN202 2000 45 FACEBOOK\n", 759 | "4 Insta XN202 500 45 INSTAGRAM\n", 760 | "5 Twitter XN201 10 19 TWITTER\n", 761 | "6 FB XN203 500 30 FACEBOOK\n", 762 | "7 Insta XN201 30 19 INSTAGRAM\n", 763 | "8 Twitter XN202 100 45 TWITTER\n", 764 | "9 FB XN203 500 30 FACEBOOK\n", 765 | "10 Insta XN201 30 19 INSTAGRAM\n", 766 | "11 Twitter XN202 100 45 TWITTER" 767 | ] 768 | }, 769 | "execution_count": 14, 770 | "metadata": {}, 771 | "output_type": "execute_result" 772 | } 773 | ], 774 | "source": [ 775 | "spark.sql(\"select * from join_query \").toPandas().head(50)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": null, 781 | "metadata": {}, 782 | "outputs": [], 783 | "source": [] 784 | }, 785 | { 786 | "cell_type": "code", 787 | "execution_count": null, 788 | "metadata": {}, 789 | "outputs": [], 790 | "source": [] 791 | } 792 | ], 793 | "metadata": { 794 | "kernelspec": { 795 | "display_name": "Python 3", 796 | "language": "python", 797 | "name": "python3" 798 | }, 799 | "language_info": { 800 | "codemirror_mode": { 801 | "name": "ipython", 802 | "version": 3 803 | }, 804 | "file_extension": ".py", 805 | "mimetype": "text/x-python", 806 | "name": "python", 807 | "nbconvert_exporter": "python", 808 | "pygments_lexer": "ipython3", 809 | "version": "3.6.3" 810 | } 811 | }, 812 | "nbformat": 4, 813 | "nbformat_minor": 2 814 | } 815 | -------------------------------------------------------------------------------- /chap_3/Spark Structured Streaming demo.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('structured_streaming').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "import pyspark.sql.functions as F\n", 21 | "from pyspark.sql.types import *" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "#create sample dataset\n", 31 | "df_1=spark.createDataFrame([(\"XN203\",'FB',300,30),(\"XN201\",'Twitter',10,19),(\"XN202\",'Insta',500,45)], \n", 32 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 33 | ] 34 | }, 35 | { 36 | "cell_type": "code", 37 | "execution_count": 4, 38 | "metadata": {}, 39 | "outputs": [], 40 | "source": [ 41 | "#define schema for input data\n", 42 | "schema=StructType().add(\"user_id\", \"string\").add(\"app\", \"string\").add(\"time_in_secs\", \"integer\").add(\"age\", \"integer\")\n", 43 | "data=spark.readStream.option(\"sep\", \",\").schema(schema).csv(\"demo\")" 44 | ] 45 | }, 46 | { 47 | "cell_type": "code", 48 | "execution_count": 5, 49 | "metadata": {}, 50 | "outputs": [ 51 | { 52 | "name": "stdout", 53 | "output_type": "stream", 54 | "text": [ 55 | "root\n", 56 | " |-- user_id: string (nullable = true)\n", 57 | " |-- app: string (nullable = true)\n", 58 | " |-- time_in_secs: integer (nullable = true)\n", 59 | " |-- age: integer (nullable = true)\n", 60 | "\n" 61 | ] 62 | } 63 | ], 64 | "source": [ 65 | "data.printSchema()" 66 | ] 67 
| }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 6, 71 | "metadata": {}, 72 | "outputs": [], 73 | "source": [ 74 | "app_count=data.groupBy('app').count()" 75 | ] 76 | }, 77 | { 78 | "cell_type": "code", 79 | "execution_count": 7, 80 | "metadata": {}, 81 | "outputs": [], 82 | "source": [ 83 | "query=(app_count.writeStream.queryName('count_query').outputMode('complete').format('memory').start())" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 8, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "data": { 93 | "text/html": [ 94 | "
\n", 95 | "\n", 96 | " \n", 97 | " \n", 98 | " \n", 99 | " \n", 100 | " \n", 101 | " \n", 102 | " \n", 103 | " \n", 104 | " \n", 105 | " \n", 106 | " \n", 107 | " \n", 108 | " \n", 109 | " \n", 110 | " \n", 111 | " \n", 112 | " \n", 113 | " \n", 114 | " \n", 115 | " \n", 116 | " \n", 117 | " \n", 118 | " \n", 119 | " \n", 120 | "
appcount
0Insta1
1FB1
2Twitter1
\n", 121 | "
" 122 | ], 123 | "text/plain": [ 124 | " app count\n", 125 | "0 Insta 1\n", 126 | "1 FB 1\n", 127 | "2 Twitter 1" 128 | ] 129 | }, 130 | "execution_count": 8, 131 | "metadata": {}, 132 | "output_type": "execute_result" 133 | } 134 | ], 135 | "source": [ 136 | "spark.sql(\"select * from count_query \").toPandas().head(5)" 137 | ] 138 | }, 139 | { 140 | "cell_type": "code", 141 | "execution_count": 9, 142 | "metadata": {}, 143 | "outputs": [], 144 | "source": [ 145 | "fb_data=data.filter(data['app']=='FB')" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 10, 151 | "metadata": {}, 152 | "outputs": [], 153 | "source": [ 154 | "fb_avg_time=fb_data.groupBy('user_id').agg(F.avg(\"time_in_secs\"))" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": 11, 160 | "metadata": {}, 161 | "outputs": [], 162 | "source": [ 163 | "fb_query=(fb_avg_time.writeStream.queryName('fb_query').outputMode('complete').format('memory').start())" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 13, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "data": { 173 | "text/html": [ 174 | "
\n", 175 | "\n", 176 | " \n", 177 | " \n", 178 | " \n", 179 | " \n", 180 | " \n", 181 | " \n", 182 | " \n", 183 | " \n", 184 | " \n", 185 | " \n", 186 | " \n", 187 | " \n", 188 | " \n", 189 | " \n", 190 | "
user_idavg(time_in_secs)
0XN203300.0
\n", 191 | "
" 192 | ], 193 | "text/plain": [ 194 | " user_id avg(time_in_secs)\n", 195 | "0 XN203 300.0" 196 | ] 197 | }, 198 | "execution_count": 13, 199 | "metadata": {}, 200 | "output_type": "execute_result" 201 | } 202 | ], 203 | "source": [ 204 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 205 | ] 206 | }, 207 | { 208 | "cell_type": "code", 209 | "execution_count": 21, 210 | "metadata": {}, 211 | "outputs": [], 212 | "source": [ 213 | "df_2=spark.createDataFrame([(\"XN203\",'FB',100,30),(\"XN201\",'FB',10,19),(\"XN202\",'FB',2000,45)], \n", 214 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 23, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/html": [ 225 | "
\n", 226 | "\n", 227 | " \n", 228 | " \n", 229 | " \n", 230 | " \n", 231 | " \n", 232 | " \n", 233 | " \n", 234 | " \n", 235 | " \n", 236 | " \n", 237 | " \n", 238 | " \n", 239 | " \n", 240 | " \n", 241 | " \n", 242 | " \n", 243 | " \n", 244 | " \n", 245 | " \n", 246 | "
user_idavg(time_in_secs)
0XN203200.0
1XN20110.0
\n", 247 | "
" 248 | ], 249 | "text/plain": [ 250 | " user_id avg(time_in_secs)\n", 251 | "0 XN203 200.0\n", 252 | "1 XN201 10.0" 253 | ] 254 | }, 255 | "execution_count": 23, 256 | "metadata": {}, 257 | "output_type": "execute_result" 258 | } 259 | ], 260 | "source": [ 261 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 24, 267 | "metadata": {}, 268 | "outputs": [], 269 | "source": [ 270 | "df_3=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 271 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 25, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "data": { 281 | "text/html": [ 282 | "
\n", 283 | "\n", 284 | " \n", 285 | " \n", 286 | " \n", 287 | " \n", 288 | " \n", 289 | " \n", 290 | " \n", 291 | " \n", 292 | " \n", 293 | " \n", 294 | " \n", 295 | " \n", 296 | " \n", 297 | " \n", 298 | " \n", 299 | " \n", 300 | " \n", 301 | " \n", 302 | " \n", 303 | " \n", 304 | " \n", 305 | " \n", 306 | " \n", 307 | " \n", 308 | "
user_idavg(time_in_secs)
0XN203200.0
1XN20110.0
2XN2022000.0
\n", 309 | "
" 310 | ], 311 | "text/plain": [ 312 | " user_id avg(time_in_secs)\n", 313 | "0 XN203 200.0\n", 314 | "1 XN201 10.0\n", 315 | "2 XN202 2000.0" 316 | ] 317 | }, 318 | "execution_count": 25, 319 | "metadata": {}, 320 | "output_type": "execute_result" 321 | } 322 | ], 323 | "source": [ 324 | "spark.sql(\"select * from fb_query \").toPandas().head(5)" 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 26, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "df_4=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 334 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"demo\",mode='append')" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 18, 340 | "metadata": {}, 341 | "outputs": [], 342 | "source": [ 343 | "#app wise time spent\n", 344 | "\n", 345 | "app_df=data.groupBy('app').agg(F.sum('time_in_secs').alias('total_time')).orderBy('total_time',ascending=False)" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 19, 351 | "metadata": {}, 352 | "outputs": [], 353 | "source": [ 354 | "app_query=(app_df.writeStream.queryName('app_wise_query').outputMode('complete').format('memory').start())" 355 | ] 356 | }, 357 | { 358 | "cell_type": "code", 359 | "execution_count": 27, 360 | "metadata": {}, 361 | "outputs": [ 362 | { 363 | "data": { 364 | "text/html": [ 365 | "
\n", 366 | "\n", 367 | " \n", 368 | " \n", 369 | " \n", 370 | " \n", 371 | " \n", 372 | " \n", 373 | " \n", 374 | " \n", 375 | " \n", 376 | " \n", 377 | " \n", 378 | " \n", 379 | " \n", 380 | " \n", 381 | " \n", 382 | " \n", 383 | " \n", 384 | " \n", 385 | " \n", 386 | " \n", 387 | " \n", 388 | " \n", 389 | " \n", 390 | " \n", 391 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 392 | "
" 393 | ], 394 | "text/plain": [ 395 | " app total_time\n", 396 | "0 FB 3410\n", 397 | "1 Insta 560\n", 398 | "2 Twitter 210" 399 | ] 400 | }, 401 | "execution_count": 27, 402 | "metadata": {}, 403 | "output_type": "execute_result" 404 | } 405 | ], 406 | "source": [ 407 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 408 | ] 409 | }, 410 | { 411 | "cell_type": "code", 412 | "execution_count": 11, 413 | "metadata": {}, 414 | "outputs": [], 415 | "source": [ 416 | "df_5=spark.createDataFrame([(\"XN203\",'FB',500,30),(\"XN201\",'Insta',30,19),(\"XN202\",'Twitter',100,45)], \n", 417 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 26, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "data": { 427 | "text/html": [ 428 | "
\n", 429 | "\n", 430 | " \n", 431 | " \n", 432 | " \n", 433 | " \n", 434 | " \n", 435 | " \n", 436 | " \n", 437 | " \n", 438 | " \n", 439 | " \n", 440 | " \n", 441 | " \n", 442 | " \n", 443 | " \n", 444 | " \n", 445 | " \n", 446 | " \n", 447 | " \n", 448 | " \n", 449 | " \n", 450 | " \n", 451 | " \n", 452 | " \n", 453 | " \n", 454 | "
apptotal_time
0FB3410
1Insta560
2Twitter210
\n", 455 | "
" 456 | ], 457 | "text/plain": [ 458 | " app total_time\n", 459 | "0 FB 3410\n", 460 | "1 Insta 560\n", 461 | "2 Twitter 210" 462 | ] 463 | }, 464 | "execution_count": 26, 465 | "metadata": {}, 466 | "output_type": "execute_result" 467 | } 468 | ], 469 | "source": [ 470 | "spark.sql(\"select * from app_wise_query \").toPandas().head(5)" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 28, 476 | "metadata": {}, 477 | "outputs": [], 478 | "source": [ 479 | "# app wise mean age \n", 480 | "age_df=data.groupBy('app').agg(F.avg('age').alias('mean_age')).orderBy('mean_age',ascending=False)\n" 481 | ] 482 | }, 483 | { 484 | "cell_type": "code", 485 | "execution_count": null, 486 | "metadata": { 487 | "scrolled": true 488 | }, 489 | "outputs": [], 490 | "source": [ 491 | "age_query=(age_df.writeStream.queryName('age_query').outputMode('complete').format('memory').start())" 492 | ] 493 | }, 494 | { 495 | "cell_type": "code", 496 | "execution_count": 30, 497 | "metadata": {}, 498 | "outputs": [ 499 | { 500 | "data": { 501 | "text/html": [ 502 | "
\n", 503 | "\n", 504 | " \n", 505 | " \n", 506 | " \n", 507 | " \n", 508 | " \n", 509 | " \n", 510 | " \n", 511 | " \n", 512 | " \n", 513 | " \n", 514 | " \n", 515 | " \n", 516 | " \n", 517 | " \n", 518 | " \n", 519 | " \n", 520 | " \n", 521 | " \n", 522 | " \n", 523 | " \n", 524 | " \n", 525 | " \n", 526 | " \n", 527 | " \n", 528 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 529 | "
" 530 | ], 531 | "text/plain": [ 532 | " app mean_age\n", 533 | "0 Twitter 38.500000\n", 534 | "1 FB 30.571429\n", 535 | "2 Insta 25.500000" 536 | ] 537 | }, 538 | "execution_count": 30, 539 | "metadata": {}, 540 | "output_type": "execute_result" 541 | } 542 | ], 543 | "source": [ 544 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 545 | ] 546 | }, 547 | { 548 | "cell_type": "code", 549 | "execution_count": 15, 550 | "metadata": {}, 551 | "outputs": [], 552 | "source": [ 553 | "df_6=spark.createDataFrame([(\"XN210\",'FB',500,50),(\"XN255\",'Insta',30,23),(\"XN222\",'Twitter',100,30)], \n", 554 | " [\"user_id\", \"app\" ,\"time_in_secs\",\"age\"]).write.csv(\"csv_folder\",mode='append')" 555 | ] 556 | }, 557 | { 558 | "cell_type": "code", 559 | "execution_count": 32, 560 | "metadata": {}, 561 | "outputs": [ 562 | { 563 | "data": { 564 | "text/html": [ 565 | "
\n", 566 | "\n", 567 | " \n", 568 | " \n", 569 | " \n", 570 | " \n", 571 | " \n", 572 | " \n", 573 | " \n", 574 | " \n", 575 | " \n", 576 | " \n", 577 | " \n", 578 | " \n", 579 | " \n", 580 | " \n", 581 | " \n", 582 | " \n", 583 | " \n", 584 | " \n", 585 | " \n", 586 | " \n", 587 | " \n", 588 | " \n", 589 | " \n", 590 | " \n", 591 | "
appmean_age
0Twitter38.500000
1FB30.571429
2Insta25.500000
\n", 592 | "
" 593 | ], 594 | "text/plain": [ 595 | " app mean_age\n", 596 | "0 Twitter 38.500000\n", 597 | "1 FB 30.571429\n", 598 | "2 Insta 25.500000" 599 | ] 600 | }, 601 | "execution_count": 32, 602 | "metadata": {}, 603 | "output_type": "execute_result" 604 | } 605 | ], 606 | "source": [ 607 | "spark.sql(\"select * from age_query \").toPandas().head(5)" 608 | ] 609 | }, 610 | { 611 | "cell_type": "code", 612 | "execution_count": 27, 613 | "metadata": {}, 614 | "outputs": [ 615 | { 616 | "name": "stdout", 617 | "output_type": "stream", 618 | "text": [ 619 | "+-------+---------+\n", 620 | "| app|full_name|\n", 621 | "+-------+---------+\n", 622 | "| FB| FACEBOOK|\n", 623 | "| Insta|INSTAGRAM|\n", 624 | "|Twitter| TWITTER|\n", 625 | "+-------+---------+\n", 626 | "\n" 627 | ] 628 | } 629 | ], 630 | "source": [ 631 | "# Join static dataframe with streaming dataframe\n", 632 | "app_df=spark.createDataFrame([('FB','FACEBOOK'),('Insta','INSTAGRAM'),('Twitter','TWITTER')],[\"app\", \"full_name\"])\n", 633 | "app_df.show()" 634 | ] 635 | }, 636 | { 637 | "cell_type": "code", 638 | "execution_count": 28, 639 | "metadata": {}, 640 | "outputs": [], 641 | "source": [ 642 | "app_stream_df=data.join(app_df,'app')" 643 | ] 644 | }, 645 | { 646 | "cell_type": "code", 647 | "execution_count": 29, 648 | "metadata": {}, 649 | "outputs": [], 650 | "source": [ 651 | "join_query=(app_stream_df.writeStream.queryName('join_query').outputMode('append').format('memory').start())" 652 | ] 653 | }, 654 | { 655 | "cell_type": "code", 656 | "execution_count": 30, 657 | "metadata": {}, 658 | "outputs": [ 659 | { 660 | "data": { 661 | "text/html": [ 662 | "
\n", 663 | "\n", 664 | " \n", 665 | " \n", 666 | " \n", 667 | " \n", 668 | " \n", 669 | " \n", 670 | " \n", 671 | " \n", 672 | " \n", 673 | " \n", 674 | " \n", 675 | " \n", 676 | " \n", 677 | " \n", 678 | " \n", 679 | " \n", 680 | " \n", 681 | " \n", 682 | " \n", 683 | " \n", 684 | " \n", 685 | " \n", 686 | " \n", 687 | " \n", 688 | " \n", 689 | " \n", 690 | " \n", 691 | " \n", 692 | " \n", 693 | " \n", 694 | " \n", 695 | " \n", 696 | " \n", 697 | " \n", 698 | " \n", 699 | " \n", 700 | " \n", 701 | " \n", 702 | " \n", 703 | " \n", 704 | " \n", 705 | " \n", 706 | " \n", 707 | " \n", 708 | " \n", 709 | " \n", 710 | " \n", 711 | " \n", 712 | " \n", 713 | " \n", 714 | " \n", 715 | " \n", 716 | " \n", 717 | " \n", 718 | " \n", 719 | " \n", 720 | " \n", 721 | " \n", 722 | " \n", 723 | " \n", 724 | " \n", 725 | " \n", 726 | " \n", 727 | " \n", 728 | " \n", 729 | " \n", 730 | " \n", 731 | " \n", 732 | " \n", 733 | " \n", 734 | " \n", 735 | " \n", 736 | " \n", 737 | " \n", 738 | " \n", 739 | " \n", 740 | " \n", 741 | " \n", 742 | " \n", 743 | " \n", 744 | " \n", 745 | " \n", 746 | " \n", 747 | " \n", 748 | " \n", 749 | " \n", 750 | " \n", 751 | " \n", 752 | " \n", 753 | " \n", 754 | " \n", 755 | " \n", 756 | " \n", 757 | " \n", 758 | " \n", 759 | " \n", 760 | " \n", 761 | " \n", 762 | " \n", 763 | " \n", 764 | " \n", 765 | " \n", 766 | " \n", 767 | " \n", 768 | " \n", 769 | " \n", 770 | " \n", 771 | " \n", 772 | "
appuser_idtime_in_secsagefull_name
0FBXN2011019FACEBOOK
1FBXN20350030FACEBOOK
2FBXN20350030FACEBOOK
3FBXN20310030FACEBOOK
4FBXN20330030FACEBOOK
5FBXN202200045FACEBOOK
6InstaXN2013019INSTAGRAM
7InstaXN2013019INSTAGRAM
8InstaXN20250045INSTAGRAM
9TwitterXN2011019TWITTER
10TwitterXN20210045TWITTER
11TwitterXN20210045TWITTER
\n", 773 | "
" 774 | ], 775 | "text/plain": [ 776 | " app user_id time_in_secs age full_name\n", 777 | "0 FB XN201 10 19 FACEBOOK\n", 778 | "1 FB XN203 500 30 FACEBOOK\n", 779 | "2 FB XN203 500 30 FACEBOOK\n", 780 | "3 FB XN203 100 30 FACEBOOK\n", 781 | "4 FB XN203 300 30 FACEBOOK\n", 782 | "5 FB XN202 2000 45 FACEBOOK\n", 783 | "6 Insta XN201 30 19 INSTAGRAM\n", 784 | "7 Insta XN201 30 19 INSTAGRAM\n", 785 | "8 Insta XN202 500 45 INSTAGRAM\n", 786 | "9 Twitter XN201 10 19 TWITTER\n", 787 | "10 Twitter XN202 100 45 TWITTER\n", 788 | "11 Twitter XN202 100 45 TWITTER" 789 | ] 790 | }, 791 | "execution_count": 30, 792 | "metadata": {}, 793 | "output_type": "execute_result" 794 | } 795 | ], 796 | "source": [ 797 | "spark.sql(\"select * from join_query \").toPandas().head(50)" 798 | ] 799 | }, 800 | { 801 | "cell_type": "code", 802 | "execution_count": null, 803 | "metadata": {}, 804 | "outputs": [], 805 | "source": [] 806 | }, 807 | { 808 | "cell_type": "code", 809 | "execution_count": null, 810 | "metadata": {}, 811 | "outputs": [], 812 | "source": [] 813 | } 814 | ], 815 | "metadata": { 816 | "kernelspec": { 817 | "display_name": "Python 3", 818 | "language": "python", 819 | "name": "python3" 820 | }, 821 | "language_info": { 822 | "codemirror_mode": { 823 | "name": "ipython", 824 | "version": 3 825 | }, 826 | "file_extension": ".py", 827 | "mimetype": "text/x-python", 828 | "name": "python", 829 | "nbconvert_exporter": "python", 830 | "pygments_lexer": "ipython3", 831 | "version": "3.6.3" 832 | } 833 | }, 834 | "nbformat": 4, 835 | "nbformat_minor": 2 836 | } 837 | -------------------------------------------------------------------------------- /chap_5/.ipynb_checkpoints/Classification_using_MLlib-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('binary_class').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 11, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#read the dataset\n", 21 | "df=spark.read.csv('classification_data.csv',inferSchema=True,header=True)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 3, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from pyspark.sql.functions import *\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 12, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "(46751, 12)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "#check the shape of the data \n", 48 | "print((df.count(),len(df.columns)))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 13, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "root\n", 61 | " |-- loan_id: string (nullable = true)\n", 62 | " |-- loan_purpose: string (nullable = true)\n", 63 | " |-- is_first_loan: integer (nullable = true)\n", 64 | " |-- total_credit_card_limit: integer (nullable = true)\n", 65 | " |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)\n", 66 | " |-- saving_amount: integer (nullable = true)\n", 67 | " |-- checking_amount: integer (nullable = true)\n", 68 | " |-- is_employed: integer (nullable = true)\n", 69 | " |-- 
yearly_salary: integer (nullable = true)\n", 70 | " |-- age: integer (nullable = true)\n", 71 | " |-- dependent_number: integer (nullable = true)\n", 72 | " |-- loan_defaulter: integer (nullable = true)\n", 73 | "\n" 74 | ] 75 | } 76 | ], 77 | "source": [ 78 | "#printSchema\n", 79 | "df.printSchema()" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": 14, 85 | "metadata": {}, 86 | "outputs": [ 87 | { 88 | "data": { 89 | "text/plain": [ 90 | "['loan_id',\n", 91 | " 'loan_purpose',\n", 92 | " 'is_first_loan',\n", 93 | " 'total_credit_card_limit',\n", 94 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 95 | " 'saving_amount',\n", 96 | " 'checking_amount',\n", 97 | " 'is_employed',\n", 98 | " 'yearly_salary',\n", 99 | " 'age',\n", 100 | " 'dependent_number',\n", 101 | " 'loan_defaulter']" 102 | ] 103 | }, 104 | "execution_count": 14, 105 | "metadata": {}, 106 | "output_type": "execute_result" 107 | } 108 | ], 109 | "source": [ 110 | "#number of columns in dataset\n", 111 | "df.columns" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 15, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "name": "stdout", 121 | "output_type": "stream", 122 | "text": [ 123 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n", 124 | "|loan_id|loan_purpose|is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year|saving_amount|checking_amount|is_employed|yearly_salary|age|dependent_number|loan_defaulter|\n", 125 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n", 126 | "| A_1| personal| 1| 7900| 0.8| 1103| 6393| 1| 16400| 42| 4| 0|\n", 127 | "| A_2| personal| 0| 3300| 0.29| 2588| 832| 1| 75500| 56| 1| 0|\n", 128 | "| A_3| personal| 0| 7600| 0.9| 1651| 8868| 1| 59000| 46| 1| 0|\n", 129 | "| A_4| personal| 1| 3400| 0.38| 1269| 6863| 1| 26000| 55| 8| 0|\n", 130 | "| A_5| emergency| 0| 2600| 0.89| 1310| 3423| 1| 9700| 41| 4| 1|\n", 131 | "+-------+------------+-------------+-----------------------+-----------------------------------------------+-------------+---------------+-----------+-------------+---+----------------+--------------+\n", 132 | "only showing top 5 rows\n", 133 | "\n" 134 | ] 135 | } 136 | ], 137 | "source": [ 138 | "#view the dataset\n", 139 | "df.show(5)" 140 | ] 141 | }, 142 | { 143 | "cell_type": "code", 144 | "execution_count": 16, 145 | "metadata": {}, 146 | "outputs": [ 147 | { 148 | "name": "stdout", 149 | "output_type": "stream", 150 | "text": [ 151 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n", 152 | "|summary|loan_id|loan_purpose| is_first_loan|total_credit_card_limit|avg_percentage_credit_card_limit_used_last_year| saving_amount| checking_amount| is_employed| yearly_salary| age| dependent_number| loan_defaulter|\n", 153 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n", 154 | 
"| count| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751| 46751|\n", 155 | "| mean| null| null|0.5414429637868708| 4615.304485465552| 0.700091121045545| 2037.636585313683|3520.6714294881394|0.9173279715941905| 29527.62079955509| 41.53979594019379|3.7448396825736348|0.34653804196701676|\n", 156 | "| stddev| null| null|0.4982848498677868| 1890.194453628314| 0.1777288093267152|1498.6710906030362|2160.9332423713727|0.2753887911928983|16149.757703029438|12.817646350266434|2.6191527902107667|0.47587211651314887|\n", 157 | "| min| A_1| emergency| 0| 500| 0.0| 0| 0| 0| 0| 18| 0| 0|\n", 158 | "| max| A_9999| property| 1| 13500| 1.09| 10641| 13165| 1| 97200| 79| 8| 1|\n", 159 | "+-------+-------+------------+------------------+-----------------------+-----------------------------------------------+------------------+------------------+------------------+------------------+------------------+------------------+-------------------+\n", 160 | "\n" 161 | ] 162 | } 163 | ], 164 | "source": [ 165 | "#Exploratory Data Analysis\n", 166 | "df.describe().show()\n" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 17, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "+--------------+-----+\n", 179 | "|loan_defaulter|count|\n", 180 | "+--------------+-----+\n", 181 | "| 1|16201|\n", 182 | "| 0|30550|\n", 183 | "+--------------+-----+\n", 184 | "\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "df.groupBy('loan_defaulter').count().show()" 190 | ] 191 | }, 192 | { 193 | "cell_type": "code", 194 | "execution_count": 18, 195 | "metadata": {}, 196 | "outputs": [ 197 | { 198 | "name": "stdout", 199 | "output_type": "stream", 200 | "text": [ 201 | "+------------+-----+\n", 202 | "|loan_purpose|count|\n", 203 | "+------------+-----+\n", 204 | "| others| 6763|\n", 205 | "| emergency| 7562|\n", 206 | "| property|11388|\n", 207 | "| operations|10580|\n", 208 | "| personal|10458|\n", 209 | "+------------+-----+\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "df.groupBy('loan_purpose').count().show()" 216 | ] 217 | }, 218 | { 219 | "cell_type": "code", 220 | "execution_count": 120, 221 | "metadata": {}, 222 | "outputs": [], 223 | "source": [ 224 | "#converting categorical data to numerical form" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 21, 230 | "metadata": {}, 231 | "outputs": [], 232 | "source": [ 233 | "#import required libraries\n", 234 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", 235 | "\n" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 22, 241 | "metadata": {}, 242 | "outputs": [], 243 | "source": [ 244 | "loan_purpose_indexer = StringIndexer(inputCol=\"loan_purpose\", outputCol=\"loan_purpose\").fit(df)\n", 245 | "df = loan_purpose_indexer.transform(df)\n", 246 | "loan_encoder = OneHotEncoder(inputCol=\"loan_index\", outputCol=\"loan_purpose_vec\")\n", 247 | "df = loan_encoder.transform(df)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 63, 253 | "metadata": {}, 254 | "outputs": [ 255 | { 256 | "name": "stdout", 257 | "output_type": "stream", 258 | "text": [ 259 | "+------------+------------+----------------+\n", 260 | "|loan_purpose|loan_purpose|loan_purpose_vec|\n", 261 | "+------------+------------+----------------+\n", 262 | "|personal |personal |(4,[2],[1.0]) |\n", 263 | "|personal |personal |(4,[2],[1.0]) |\n", 264 | 
"|personal |personal |(4,[2],[1.0]) |\n", 265 | "+------------+------------+----------------+\n", 266 | "only showing top 3 rows\n", 267 | "\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "df.select(['loan_purpose','loan_purpose','loan_purpose_vec']).show(3,False)" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 24, 278 | "metadata": {}, 279 | "outputs": [], 280 | "source": [ 281 | "from pyspark.ml.feature import VectorAssembler" 282 | ] 283 | }, 284 | { 285 | "cell_type": "code", 286 | "execution_count": 25, 287 | "metadata": {}, 288 | "outputs": [ 289 | { 290 | "data": { 291 | "text/plain": [ 292 | "['loan_id',\n", 293 | " 'loan_purpose',\n", 294 | " 'is_first_loan',\n", 295 | " 'total_credit_card_limit',\n", 296 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 297 | " 'saving_amount',\n", 298 | " 'checking_amount',\n", 299 | " 'is_employed',\n", 300 | " 'yearly_salary',\n", 301 | " 'age',\n", 302 | " 'dependent_number',\n", 303 | " 'loan_defaulter',\n", 304 | " 'loan_index',\n", 305 | " 'loan_purpose_vec']" 306 | ] 307 | }, 308 | "execution_count": 25, 309 | "metadata": {}, 310 | "output_type": "execute_result" 311 | } 312 | ], 313 | "source": [ 314 | "df.columns" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 28, 320 | "metadata": {}, 321 | "outputs": [], 322 | "source": [ 323 | "df_assembler = VectorAssembler(inputCols=['is_first_loan',\n", 324 | " 'total_credit_card_limit',\n", 325 | " 'avg_percentage_credit_card_limit_used_last_year',\n", 326 | " 'saving_amount',\n", 327 | " 'checking_amount',\n", 328 | " 'is_employed',\n", 329 | " 'yearly_salary',\n", 330 | " 'age',\n", 331 | " 'dependent_number',\n", 332 | " 'loan_purpose_vec'], outputCol=\"features\")\n", 333 | "df = df_assembler.transform(df)" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": 29, 339 | "metadata": {}, 340 | "outputs": [ 341 | { 342 | "name": "stdout", 343 | "output_type": "stream", 344 | "text": [ 345 | "root\n", 346 | " |-- loan_id: string (nullable = true)\n", 347 | " |-- loan_purpose: string (nullable = true)\n", 348 | " |-- is_first_loan: integer (nullable = true)\n", 349 | " |-- total_credit_card_limit: integer (nullable = true)\n", 350 | " |-- avg_percentage_credit_card_limit_used_last_year: double (nullable = true)\n", 351 | " |-- saving_amount: integer (nullable = true)\n", 352 | " |-- checking_amount: integer (nullable = true)\n", 353 | " |-- is_employed: integer (nullable = true)\n", 354 | " |-- yearly_salary: integer (nullable = true)\n", 355 | " |-- age: integer (nullable = true)\n", 356 | " |-- dependent_number: integer (nullable = true)\n", 357 | " |-- loan_defaulter: integer (nullable = true)\n", 358 | " |-- loan_index: double (nullable = false)\n", 359 | " |-- loan_purpose_vec: vector (nullable = true)\n", 360 | " |-- features: vector (nullable = true)\n", 361 | "\n" 362 | ] 363 | } 364 | ], 365 | "source": [ 366 | "df.printSchema()" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 30, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "+--------------------------------------------------------------------+--------------+\n", 379 | "|features |loan_defaulter|\n", 380 | "+--------------------------------------------------------------------+--------------+\n", 381 | "|[1.0,7900.0,0.8,1103.0,6393.0,1.0,16400.0,42.0,4.0,0.0,0.0,1.0,0.0] |0 |\n", 382 | 
"|[0.0,3300.0,0.29,2588.0,832.0,1.0,75500.0,56.0,1.0,0.0,0.0,1.0,0.0] |0 |\n", 383 | "|[0.0,7600.0,0.9,1651.0,8868.0,1.0,59000.0,46.0,1.0,0.0,0.0,1.0,0.0] |0 |\n", 384 | "|[1.0,3400.0,0.38,1269.0,6863.0,1.0,26000.0,55.0,8.0,0.0,0.0,1.0,0.0]|0 |\n", 385 | "|[0.0,2600.0,0.89,1310.0,3423.0,1.0,9700.0,41.0,4.0,0.0,0.0,0.0,1.0] |1 |\n", 386 | "|[0.0,7600.0,0.51,1040.0,2406.0,1.0,22900.0,52.0,0.0,0.0,1.0,0.0,0.0]|0 |\n", 387 | "|[1.0,6900.0,0.82,2408.0,5556.0,1.0,34800.0,48.0,4.0,0.0,1.0,0.0,0.0]|0 |\n", 388 | "|[0.0,5700.0,0.56,1933.0,4139.0,1.0,32500.0,64.0,2.0,0.0,0.0,1.0,0.0]|0 |\n", 389 | "|[1.0,3400.0,0.95,3866.0,4131.0,1.0,13300.0,23.0,3.0,0.0,0.0,1.0,0.0]|0 |\n", 390 | "|[0.0,2900.0,0.91,88.0,2725.0,1.0,21100.0,52.0,1.0,0.0,0.0,1.0,0.0] |1 |\n", 391 | "+--------------------------------------------------------------------+--------------+\n", 392 | "only showing top 10 rows\n", 393 | "\n" 394 | ] 395 | } 396 | ], 397 | "source": [ 398 | "df.select(['features','loan_defaulter']).show(10,False)" 399 | ] 400 | }, 401 | { 402 | "cell_type": "code", 403 | "execution_count": 31, 404 | "metadata": {}, 405 | "outputs": [], 406 | "source": [ 407 | "#select data for building model\n", 408 | "model_df=df.select(['features','loan_defaulter'])" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 32, 414 | "metadata": {}, 415 | "outputs": [], 416 | "source": [ 417 | "from pyspark.ml.classification import LogisticRegression" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 33, 423 | "metadata": {}, 424 | "outputs": [], 425 | "source": [ 426 | "#split the data \n", 427 | "training_df,test_df=model_df.randomSplit([0.75,0.25])" 428 | ] 429 | }, 430 | { 431 | "cell_type": "code", 432 | "execution_count": 34, 433 | "metadata": {}, 434 | "outputs": [ 435 | { 436 | "data": { 437 | "text/plain": [ 438 | "34958" 439 | ] 440 | }, 441 | "execution_count": 34, 442 | "metadata": {}, 443 | "output_type": "execute_result" 444 | } 445 | ], 446 | "source": [ 447 | "training_df.count()" 448 | ] 449 | }, 450 | { 451 | "cell_type": "code", 452 | "execution_count": 35, 453 | "metadata": {}, 454 | "outputs": [ 455 | { 456 | "name": "stdout", 457 | "output_type": "stream", 458 | "text": [ 459 | "+--------------+-----+\n", 460 | "|loan_defaulter|count|\n", 461 | "+--------------+-----+\n", 462 | "| 1|12048|\n", 463 | "| 0|22910|\n", 464 | "+--------------+-----+\n", 465 | "\n" 466 | ] 467 | } 468 | ], 469 | "source": [ 470 | "training_df.groupBy('loan_defaulter').count().show()" 471 | ] 472 | }, 473 | { 474 | "cell_type": "code", 475 | "execution_count": 36, 476 | "metadata": {}, 477 | "outputs": [ 478 | { 479 | "data": { 480 | "text/plain": [ 481 | "11793" 482 | ] 483 | }, 484 | "execution_count": 36, 485 | "metadata": {}, 486 | "output_type": "execute_result" 487 | } 488 | ], 489 | "source": [ 490 | "test_df.count()" 491 | ] 492 | }, 493 | { 494 | "cell_type": "code", 495 | "execution_count": 37, 496 | "metadata": {}, 497 | "outputs": [ 498 | { 499 | "name": "stdout", 500 | "output_type": "stream", 501 | "text": [ 502 | "+--------------+-----+\n", 503 | "|loan_defaulter|count|\n", 504 | "+--------------+-----+\n", 505 | "| 1| 4153|\n", 506 | "| 0| 7640|\n", 507 | "+--------------+-----+\n", 508 | "\n" 509 | ] 510 | } 511 | ], 512 | "source": [ 513 | "test_df.groupBy('loan_defaulter').count().show()" 514 | ] 515 | }, 516 | { 517 | "cell_type": "code", 518 | "execution_count": 38, 519 | "metadata": {}, 520 | "outputs": [], 521 | "source": [ 522 | 
"log_reg=LogisticRegression(labelCol='loan_defaulter').fit(training_df)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": null, 528 | "metadata": {}, 529 | "outputs": [], 530 | "source": [ 531 | "#Training Results" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 39, 537 | "metadata": {}, 538 | "outputs": [], 539 | "source": [ 540 | "lr_summary=log_reg.summary" 541 | ] 542 | }, 543 | { 544 | "cell_type": "code", 545 | "execution_count": 40, 546 | "metadata": {}, 547 | "outputs": [ 548 | { 549 | "data": { 550 | "text/plain": [ 551 | "0.8939298586875679" 552 | ] 553 | }, 554 | "execution_count": 40, 555 | "metadata": {}, 556 | "output_type": "execute_result" 557 | } 558 | ], 559 | "source": [ 560 | "lr_summary.accuracy" 561 | ] 562 | }, 563 | { 564 | "cell_type": "code", 565 | "execution_count": 41, 566 | "metadata": {}, 567 | "outputs": [ 568 | { 569 | "data": { 570 | "text/plain": [ 571 | "0.9587456481363935" 572 | ] 573 | }, 574 | "execution_count": 41, 575 | "metadata": {}, 576 | "output_type": "execute_result" 577 | } 578 | ], 579 | "source": [ 580 | "lr_summary.areaUnderROC" 581 | ] 582 | }, 583 | { 584 | "cell_type": "code", 585 | "execution_count": 42, 586 | "metadata": {}, 587 | "outputs": [ 588 | { 589 | "name": "stdout", 590 | "output_type": "stream", 591 | "text": [ 592 | "[0.9233245149911816, 0.8396318618667535]\n" 593 | ] 594 | } 595 | ], 596 | "source": [ 597 | "print(lr_summary.precisionByLabel)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 43, 603 | "metadata": {}, 604 | "outputs": [ 605 | { 606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "[0.914054997817547, 0.8556606905710491]\n" 610 | ] 611 | } 612 | ], 613 | "source": [ 614 | "print(lr_summary.recallByLabel)" 615 | ] 616 | }, 617 | { 618 | "cell_type": "code", 619 | "execution_count": 45, 620 | "metadata": {}, 621 | "outputs": [ 622 | { 623 | "name": "stdout", 624 | "output_type": "stream", 625 | "text": [ 626 | "+--------------------+--------------+--------------------+--------------------+----------+\n", 627 | "| features|loan_defaulter| rawPrediction| probability|prediction|\n", 628 | "+--------------------+--------------+--------------------+--------------------+----------+\n", 629 | "|(13,[0,1,2,3,4,7]...| 1|[-3.4630360774167...|[0.03038246469741...| 1.0|\n", 630 | "|(13,[0,1,2,3,4,7]...| 1|[-5.5391195110590...|[0.00391460129742...| 1.0|\n", 631 | "|(13,[0,1,2,3,4,7]...| 0|[1.00238593296486...|[0.73152742283114...| 0.0|\n", 632 | "|(13,[0,1,2,3,4,7]...| 1|[-1.8290704519648...|[0.13834904603406...| 1.0|\n", 633 | "|(13,[0,1,2,3,4,7]...| 1|[-1.5501728962289...|[0.17506129798003...| 1.0|\n", 634 | "|(13,[0,1,2,3,4,7]...| 0|[6.60737916543425...|[0.99865145442765...| 0.0|\n", 635 | "|(13,[0,1,2,3,4,7]...| 0|[7.50587822302399...|[0.99945045940723...| 0.0|\n", 636 | "|(13,[0,1,2,3,4,7,...| 1|[-4.4555325192703...|[0.01148079400371...| 1.0|\n", 637 | "|(13,[0,1,2,3,4,7,...| 1|[-4.5326784954285...|[0.01063746639570...| 1.0|\n", 638 | "|(13,[0,1,2,3,4,7,...| 1|[-4.9717163244463...|[0.00688353015038...| 1.0|\n", 639 | "+--------------------+--------------+--------------------+--------------------+----------+\n", 640 | "only showing top 10 rows\n", 641 | "\n" 642 | ] 643 | } 644 | ], 645 | "source": [ 646 | "predictions = log_reg.transform(test_df)\n", 647 | "predictions.show(10)\n" 648 | ] 649 | }, 650 | { 651 | "cell_type": "code", 652 | "execution_count": 47, 653 | "metadata": {}, 654 | "outputs": [ 655 | { 656 
| "data": { 657 | "text/plain": [ 658 | "['features', 'loan_defaulter', 'rawPrediction', 'probability', 'prediction']" 659 | ] 660 | }, 661 | "execution_count": 47, 662 | "metadata": {}, 663 | "output_type": "execute_result" 664 | } 665 | ], 666 | "source": [ 667 | "model_predictions = log_reg.transform(test_df)\n", 668 | "model_predictions.columns" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 48, 674 | "metadata": {}, 675 | "outputs": [], 676 | "source": [ 677 | "model_predictions = log_reg.evaluate(test_df)\n" 678 | ] 679 | }, 680 | { 681 | "cell_type": "code", 682 | "execution_count": 49, 683 | "metadata": {}, 684 | "outputs": [ 685 | { 686 | "data": { 687 | "text/plain": [ 688 | "0.8945984906300347" 689 | ] 690 | }, 691 | "execution_count": 49, 692 | "metadata": {}, 693 | "output_type": "execute_result" 694 | } 695 | ], 696 | "source": [ 697 | "model_predictions.accuracy" 698 | ] 699 | }, 700 | { 701 | "cell_type": "code", 702 | "execution_count": 50, 703 | "metadata": {}, 704 | "outputs": [ 705 | { 706 | "data": { 707 | "text/plain": [ 708 | "0.8951909857782705" 709 | ] 710 | }, 711 | "execution_count": 50, 712 | "metadata": {}, 713 | "output_type": "execute_result" 714 | } 715 | ], 716 | "source": [ 717 | "model_predictions.weightedPrecision" 718 | ] 719 | }, 720 | { 721 | "cell_type": "code", 722 | "execution_count": 52, 723 | "metadata": {}, 724 | "outputs": [ 725 | { 726 | "data": { 727 | "text/plain": [ 728 | "[0.9129581151832461, 0.8608235010835541]" 729 | ] 730 | }, 731 | "execution_count": 52, 732 | "metadata": {}, 733 | "output_type": "execute_result" 734 | } 735 | ], 736 | "source": [ 737 | "model_predictions.recallByLabel" 738 | ] 739 | }, 740 | { 741 | "cell_type": "code", 742 | "execution_count": 53, 743 | "metadata": {}, 744 | "outputs": [ 745 | { 746 | "name": "stdout", 747 | "output_type": "stream", 748 | "text": [ 749 | "[0.9234741162452006, 0.8431603773584906]\n" 750 | ] 751 | } 752 | ], 753 | "source": [ 754 | "print(model_predictions.precisionByLabel)" 755 | ] 756 | }, 757 | { 758 | "cell_type": "code", 759 | "execution_count": 54, 760 | "metadata": {}, 761 | "outputs": [ 762 | { 763 | "data": { 764 | "text/plain": [ 765 | "0.9594316478468224" 766 | ] 767 | }, 768 | "execution_count": 54, 769 | "metadata": {}, 770 | "output_type": "execute_result" 771 | } 772 | ], 773 | "source": [ 774 | "model_predictions.areaUnderROC" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 56, 780 | "metadata": {}, 781 | "outputs": [], 782 | "source": [ 783 | "from pyspark.ml.classification import RandomForestClassifier\n", 784 | "rf = RandomForestClassifier(numTrees=50,maxDepth=30,labelCol='loan_defaulter')\n", 785 | "rf_model = rf.fit(training_df)\n" 786 | ] 787 | }, 788 | { 789 | "cell_type": "code", 790 | "execution_count": 57, 791 | "metadata": {}, 792 | "outputs": [], 793 | "source": [ 794 | "model_predictions = rf_model.transform(test_df)\n" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 59, 800 | "metadata": {}, 801 | "outputs": [], 802 | "source": [ 803 | "true_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).filter(model_predictions['prediction']==1).count()\n", 804 | "actual_pos=model_predictions.filter(model_predictions['loan_defaulter']==1).count()\n", 805 | "pred_pos=model_predictions.filter(model_predictions['prediction']==1).count()" 806 | ] 807 | }, 808 | { 809 | "cell_type": "code", 810 | "execution_count": 60, 811 | "metadata": {}, 812 | "outputs": [ 813 | { 
814 | "data": { 815 | "text/plain": [ 816 | "0.8979051288225379" 817 | ] 818 | }, 819 | "execution_count": 60, 820 | "metadata": {}, 821 | "output_type": "execute_result" 822 | } 823 | ], 824 | "source": [ 825 | "#Recall \n", 826 | "float(true_pos)/(actual_pos)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": 61, 832 | "metadata": {}, 833 | "outputs": [ 834 | { 835 | "data": { 836 | "text/plain": [ 837 | "0.8660009289363678" 838 | ] 839 | }, 840 | "execution_count": 61, 841 | "metadata": {}, 842 | "output_type": "execute_result" 843 | } 844 | ], 845 | "source": [ 846 | "#Precision on test Data \n", 847 | "float(true_pos)/(pred_pos)" 848 | ] 849 | }, 850 | { 851 | "cell_type": "code", 852 | "execution_count": null, 853 | "metadata": {}, 854 | "outputs": [], 855 | "source": [] 856 | } 857 | ], 858 | "metadata": { 859 | "kernelspec": { 860 | "display_name": "Python 3", 861 | "language": "python", 862 | "name": "python3" 863 | }, 864 | "language_info": { 865 | "codemirror_mode": { 866 | "name": "ipython", 867 | "version": 3 868 | }, 869 | "file_extension": ".py", 870 | "mimetype": "text/x-python", 871 | "name": "python", 872 | "nbconvert_exporter": "python", 873 | "pygments_lexer": "ipython3", 874 | "version": "3.6.3" 875 | } 876 | }, 877 | "nbformat": 4, 878 | "nbformat_minor": 2 879 | } 880 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/Logistic_resgression_pyspark-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "#import SparkSession\n", 10 | "from pyspark.sql import SparkSession\n", 11 | "spark=SparkSession.builder.appName('log_reg').getOrCreate()" 12 | ] 13 | }, 14 | { 15 | "cell_type": "code", 16 | "execution_count": 2, 17 | "metadata": {}, 18 | "outputs": [], 19 | "source": [ 20 | "#read the dataset\n", 21 | "df=spark.read.csv('Log_Reg_dataset.csv',inferSchema=True,header=True)" 22 | ] 23 | }, 24 | { 25 | "cell_type": "code", 26 | "execution_count": 14, 27 | "metadata": {}, 28 | "outputs": [], 29 | "source": [ 30 | "from pyspark.sql.functions import *\n" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "(20000, 6)\n" 43 | ] 44 | } 45 | ], 46 | "source": [ 47 | "#check the shape of the data \n", 48 | "print((df.count(),len(df.columns)))" 49 | ] 50 | }, 51 | { 52 | "cell_type": "code", 53 | "execution_count": 4, 54 | "metadata": {}, 55 | "outputs": [ 56 | { 57 | "name": "stdout", 58 | "output_type": "stream", 59 | "text": [ 60 | "root\n", 61 | " |-- Country: string (nullable = true)\n", 62 | " |-- Age: integer (nullable = true)\n", 63 | " |-- Repeat_Visitor: integer (nullable = true)\n", 64 | " |-- Platform: string (nullable = true)\n", 65 | " |-- Web_pages_viewed: integer (nullable = true)\n", 66 | " |-- Status: integer (nullable = true)\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "#printSchema\n", 73 | "df.printSchema()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "code", 78 | "execution_count": 5, 79 | "metadata": {}, 80 | "outputs": [ 81 | { 82 | "data": { 83 | "text/plain": [ 84 | "['Country', 'Age', 'Repeat_Visitor', 'Platform', 'Web_pages_viewed', 'Status']" 85 | ] 86 | }, 87 | "execution_count": 5, 88 | "metadata": {}, 89 | "output_type": "execute_result" 
90 | } 91 | ], 92 | "source": [ 93 | "#number of columns in dataset\n", 94 | "df.columns" 95 | ] 96 | }, 97 | { 98 | "cell_type": "code", 99 | "execution_count": 6, 100 | "metadata": {}, 101 | "outputs": [ 102 | { 103 | "name": "stdout", 104 | "output_type": "stream", 105 | "text": [ 106 | "+---------+---+--------------+--------+----------------+------+\n", 107 | "| Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|\n", 108 | "+---------+---+--------------+--------+----------------+------+\n", 109 | "| India| 41| 1| Yahoo| 21| 1|\n", 110 | "| Brazil| 28| 1| Yahoo| 5| 0|\n", 111 | "| Brazil| 40| 0| Google| 3| 0|\n", 112 | "|Indonesia| 31| 1| Bing| 15| 1|\n", 113 | "| Malaysia| 32| 0| Google| 15| 1|\n", 114 | "+---------+---+--------------+--------+----------------+------+\n", 115 | "only showing top 5 rows\n", 116 | "\n" 117 | ] 118 | } 119 | ], 120 | "source": [ 121 | "#view the dataset\n", 122 | "df.show(5)" 123 | ] 124 | }, 125 | { 126 | "cell_type": "code", 127 | "execution_count": 19, 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n", 135 | "|summary| Country| Age| Repeat_Visitor|Platform| Web_pages_viewed| Status|\n", 136 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n", 137 | "| count| 20000| 20000| 20000| 20000| 20000| 20000|\n", 138 | "| mean| null| 28.53955| 0.5029| null| 9.5533| 0.5|\n", 139 | "| stddev| null|7.888912950773227|0.500004090187782| null|6.073903499824976|0.5000125004687693|\n", 140 | "| min| Brazil| 17| 0| Bing| 1| 0|\n", 141 | "| max|Malaysia| 111| 1| Yahoo| 29| 1|\n", 142 | "+-------+--------+-----------------+-----------------+--------+-----------------+------------------+\n", 143 | "\n" 144 | ] 145 | } 146 | ], 147 | "source": [ 148 | "#Exploratory Data Analysis\n", 149 | "df.describe().show()\n" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 22, 155 | "metadata": {}, 156 | "outputs": [ 157 | { 158 | "name": "stdout", 159 | "output_type": "stream", 160 | "text": [ 161 | "+---------+-----+\n", 162 | "| Country|count|\n", 163 | "+---------+-----+\n", 164 | "| Malaysia| 1218|\n", 165 | "| India| 4018|\n", 166 | "|Indonesia|12178|\n", 167 | "| Brazil| 2586|\n", 168 | "+---------+-----+\n", 169 | "\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "df.groupBy('Country').count().show()" 175 | ] 176 | }, 177 | { 178 | "cell_type": "code", 179 | "execution_count": 118, 180 | "metadata": {}, 181 | "outputs": [ 182 | { 183 | "name": "stdout", 184 | "output_type": "stream", 185 | "text": [ 186 | "+--------+-----+\n", 187 | "|Platform|count|\n", 188 | "+--------+-----+\n", 189 | "| Yahoo| 9859|\n", 190 | "| Bing| 4360|\n", 191 | "| Google| 5781|\n", 192 | "+--------+-----+\n", 193 | "\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "df.groupBy('Platform').count().show()" 199 | ] 200 | }, 201 | { 202 | "cell_type": "code", 203 | "execution_count": 119, 204 | "metadata": {}, 205 | "outputs": [ 206 | { 207 | "name": "stdout", 208 | "output_type": "stream", 209 | "text": [ 210 | "+------+-----+\n", 211 | "|Status|count|\n", 212 | "+------+-----+\n", 213 | "| 1|10000|\n", 214 | "| 0|10000|\n", 215 | "+------+-----+\n", 216 | "\n" 217 | ] 218 | } 219 | ], 220 | "source": [ 221 | "df.groupBy('Status').count().show()" 222 | ] 223 | }, 224 | { 225 | "cell_type": "code", 226 | "execution_count": 23, 227 
| "metadata": { 228 | "scrolled": true 229 | }, 230 | "outputs": [ 231 | { 232 | "name": "stdout", 233 | "output_type": "stream", 234 | "text": [ 235 | "+---------+------------------+-------------------+---------------------+--------------------+\n", 236 | "| Country| avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)| avg(Status)|\n", 237 | "+---------+------------------+-------------------+---------------------+--------------------+\n", 238 | "| Malaysia|27.792282430213465| 0.5730706075533661| 11.192118226600986| 0.6568144499178982|\n", 239 | "| India|27.976854156296664| 0.5433051269288203| 10.727227476356397| 0.6212045793927327|\n", 240 | "|Indonesia| 28.43159796354081| 0.5207751683363442| 9.985711939563148| 0.5422893742814913|\n", 241 | "| Brazil|30.274168600154677| 0.322892498066512| 4.921113689095128|0.038669760247486466|\n", 242 | "+---------+------------------+-------------------+---------------------+--------------------+\n", 243 | "\n" 244 | ] 245 | } 246 | ], 247 | "source": [ 248 | "df.groupBy('Country').mean().show()" 249 | ] 250 | }, 251 | { 252 | "cell_type": "code", 253 | "execution_count": 24, 254 | "metadata": {}, 255 | "outputs": [ 256 | { 257 | "name": "stdout", 258 | "output_type": "stream", 259 | "text": [ 260 | "+--------+------------------+-------------------+---------------------+------------------+\n", 261 | "|Platform| avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)| avg(Status)|\n", 262 | "+--------+------------------+-------------------+---------------------+------------------+\n", 263 | "| Yahoo|28.569226087838523| 0.5094837204584644| 9.599655137437875|0.5071508266558474|\n", 264 | "| Bing| 28.68394495412844| 0.4720183486238532| 9.114908256880733|0.4559633027522936|\n", 265 | "| Google|28.380038055699707| 0.5149628092025601| 9.804878048780488|0.5210171250648676|\n", 266 | "+--------+------------------+-------------------+---------------------+------------------+\n", 267 | "\n" 268 | ] 269 | } 270 | ], 271 | "source": [ 272 | "df.groupBy('Platform').mean().show()" 273 | ] 274 | }, 275 | { 276 | "cell_type": "code", 277 | "execution_count": 25, 278 | "metadata": {}, 279 | "outputs": [ 280 | { 281 | "name": "stdout", 282 | "output_type": "stream", 283 | "text": [ 284 | "+------+--------+-------------------+---------------------+-----------+\n", 285 | "|Status|avg(Age)|avg(Repeat_Visitor)|avg(Web_pages_viewed)|avg(Status)|\n", 286 | "+------+--------+-------------------+---------------------+-----------+\n", 287 | "| 1| 26.5435| 0.7019| 14.5617| 1.0|\n", 288 | "| 0| 30.5356| 0.3039| 4.5449| 0.0|\n", 289 | "+------+--------+-------------------+---------------------+-----------+\n", 290 | "\n" 291 | ] 292 | } 293 | ], 294 | "source": [ 295 | "df.groupBy('Status').mean().show()" 296 | ] 297 | }, 298 | { 299 | "cell_type": "code", 300 | "execution_count": 120, 301 | "metadata": {}, 302 | "outputs": [], 303 | "source": [ 304 | "#converting categorical data to numerical form" 305 | ] 306 | }, 307 | { 308 | "cell_type": "code", 309 | "execution_count": 121, 310 | "metadata": {}, 311 | "outputs": [], 312 | "source": [ 313 | "#import required libraries\n", 314 | "\n", 315 | "from pyspark.ml.feature import StringIndexer\n" 316 | ] 317 | }, 318 | { 319 | "cell_type": "code", 320 | "execution_count": 122, 321 | "metadata": {}, 322 | "outputs": [], 323 | "source": [ 324 | "#Indexing " 325 | ] 326 | }, 327 | { 328 | "cell_type": "code", 329 | "execution_count": 123, 330 | "metadata": {}, 331 | "outputs": [], 332 | "source": [ 333 | "platform_indexer = 
StringIndexer(inputCol=\"Platform\", outputCol=\"platform_num\").fit(df)\n", 334 | "df = platform_indexer.transform(df)" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 124, 340 | "metadata": {}, 341 | "outputs": [ 342 | { 343 | "name": "stdout", 344 | "output_type": "stream", 345 | "text": [ 346 | "+-------+---+--------------+--------+----------------+------+------------+\n", 347 | "|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|\n", 348 | "+-------+---+--------------+--------+----------------+------+------------+\n", 349 | "|India |41 |1 |Yahoo |21 |1 |0.0 |\n", 350 | "|Brazil |28 |1 |Yahoo |5 |0 |0.0 |\n", 351 | "|Brazil |40 |0 |Google |3 |0 |1.0 |\n", 352 | "+-------+---+--------------+--------+----------------+------+------------+\n", 353 | "only showing top 3 rows\n", 354 | "\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "df.show(3,False)" 360 | ] 361 | }, 362 | { 363 | "cell_type": "code", 364 | "execution_count": 125, 365 | "metadata": {}, 366 | "outputs": [], 367 | "source": [ 368 | "from pyspark.ml.feature import OneHotEncoder" 369 | ] 370 | }, 371 | { 372 | "cell_type": "code", 373 | "execution_count": 126, 374 | "metadata": {}, 375 | "outputs": [], 376 | "source": [ 377 | "#one hot encoding\n", 378 | "platform_encoder = OneHotEncoder(inputCol=\"platform_num\", outputCol=\"platform_vector\")\n", 379 | "df = platform_encoder.transform(df)" 380 | ] 381 | }, 382 | { 383 | "cell_type": "code", 384 | "execution_count": 129, 385 | "metadata": {}, 386 | "outputs": [ 387 | { 388 | "name": "stdout", 389 | "output_type": "stream", 390 | "text": [ 391 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n", 392 | "|Country|Age|Repeat_Visitor|Platform|Web_pages_viewed|Status|platform_num|platform_vector|\n", 393 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n", 394 | "|India |41 |1 |Yahoo |21 |1 |0.0 |(2,[0],[1.0]) |\n", 395 | "|Brazil |28 |1 |Yahoo |5 |0 |0.0 |(2,[0],[1.0]) |\n", 396 | "|Brazil |40 |0 |Google |3 |0 |1.0 |(2,[1],[1.0]) |\n", 397 | "+-------+---+--------------+--------+----------------+------+------------+---------------+\n", 398 | "only showing top 3 rows\n", 399 | "\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "df.show(3,False)" 405 | ] 406 | }, 407 | { 408 | "cell_type": "code", 409 | "execution_count": 134, 410 | "metadata": {}, 411 | "outputs": [ 412 | { 413 | "name": "stdout", 414 | "output_type": "stream", 415 | "text": [ 416 | "+--------+-----+\n", 417 | "|Platform|count|\n", 418 | "+--------+-----+\n", 419 | "|Yahoo |9859 |\n", 420 | "|Google |5781 |\n", 421 | "|Bing |4360 |\n", 422 | "+--------+-----+\n", 423 | "\n" 424 | ] 425 | } 426 | ], 427 | "source": [ 428 | "df.groupBy('Platform').count().orderBy('count',ascending=False).show(5,False)" 429 | ] 430 | }, 431 | { 432 | "cell_type": "code", 433 | "execution_count": 135, 434 | "metadata": {}, 435 | "outputs": [ 436 | { 437 | "name": "stdout", 438 | "output_type": "stream", 439 | "text": [ 440 | "+------------+-----+\n", 441 | "|platform_num|count|\n", 442 | "+------------+-----+\n", 443 | "|0.0 |9859 |\n", 444 | "|1.0 |5781 |\n", 445 | "|2.0 |4360 |\n", 446 | "+------------+-----+\n", 447 | "\n" 448 | ] 449 | } 450 | ], 451 | "source": [ 452 | "df.groupBy('platform_num').count().orderBy('count',ascending=False).show(5,False)" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 136, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 
| "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "+---------------+-----+\n", 465 | "|platform_vector|count|\n", 466 | "+---------------+-----+\n", 467 | "|(2,[0],[1.0]) |9859 |\n", 468 | "|(2,[1],[1.0]) |5781 |\n", 469 | "|(2,[],[]) |4360 |\n", 470 | "+---------------+-----+\n", 471 | "\n" 472 | ] 473 | } 474 | ], 475 | "source": [ 476 | "df.groupBy('platform_vector').count().orderBy('count',ascending=False).show(5,False)" 477 | ] 478 | }, 479 | { 480 | "cell_type": "code", 481 | "execution_count": 137, 482 | "metadata": {}, 483 | "outputs": [], 484 | "source": [ 485 | "country_indexer = StringIndexer(inputCol=\"Country\", outputCol=\"country_num\").fit(df)\n", 486 | "df = country_indexer.transform(df)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "code", 491 | "execution_count": 139, 492 | "metadata": {}, 493 | "outputs": [ 494 | { 495 | "name": "stdout", 496 | "output_type": "stream", 497 | "text": [ 498 | "+-------+-----------+\n", 499 | "|Country|country_num|\n", 500 | "+-------+-----------+\n", 501 | "|India |1.0 |\n", 502 | "|Brazil |2.0 |\n", 503 | "|Brazil |2.0 |\n", 504 | "+-------+-----------+\n", 505 | "only showing top 3 rows\n", 506 | "\n" 507 | ] 508 | } 509 | ], 510 | "source": [ 511 | "df.select(['Country','country_num']).show(3,False)" 512 | ] 513 | }, 514 | { 515 | "cell_type": "code", 516 | "execution_count": 140, 517 | "metadata": {}, 518 | "outputs": [], 519 | "source": [ 520 | "#one hot encoding\n", 521 | "country_encoder = OneHotEncoder(inputCol=\"country_num\", outputCol=\"country_vector\")\n", 522 | "df = country_encoder.transform(df)" 523 | ] 524 | }, 525 | { 526 | "cell_type": "code", 527 | "execution_count": 141, 528 | "metadata": {}, 529 | "outputs": [ 530 | { 531 | "name": "stdout", 532 | "output_type": "stream", 533 | "text": [ 534 | "+-------+-----------+--------------+\n", 535 | "|Country|country_num|country_vector|\n", 536 | "+-------+-----------+--------------+\n", 537 | "|India |1.0 |(3,[1],[1.0]) |\n", 538 | "|Brazil |2.0 |(3,[2],[1.0]) |\n", 539 | "|Brazil |2.0 |(3,[2],[1.0]) |\n", 540 | "+-------+-----------+--------------+\n", 541 | "only showing top 3 rows\n", 542 | "\n" 543 | ] 544 | } 545 | ], 546 | "source": [ 547 | "df.select(['Country','country_num','country_vector']).show(3,False)" 548 | ] 549 | }, 550 | { 551 | "cell_type": "code", 552 | "execution_count": 142, 553 | "metadata": {}, 554 | "outputs": [ 555 | { 556 | "name": "stdout", 557 | "output_type": "stream", 558 | "text": [ 559 | "+---------+-----+\n", 560 | "|Country |count|\n", 561 | "+---------+-----+\n", 562 | "|Indonesia|12178|\n", 563 | "|India |4018 |\n", 564 | "|Brazil |2586 |\n", 565 | "|Malaysia |1218 |\n", 566 | "+---------+-----+\n", 567 | "\n" 568 | ] 569 | } 570 | ], 571 | "source": [ 572 | "df.groupBy('Country').count().orderBy('count',ascending=False).show(5,False)" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 143, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "name": "stdout", 582 | "output_type": "stream", 583 | "text": [ 584 | "+-----------+-----+\n", 585 | "|country_num|count|\n", 586 | "+-----------+-----+\n", 587 | "|0.0 |12178|\n", 588 | "|1.0 |4018 |\n", 589 | "|2.0 |2586 |\n", 590 | "|3.0 |1218 |\n", 591 | "+-----------+-----+\n", 592 | "\n" 593 | ] 594 | } 595 | ], 596 | "source": [ 597 | "df.groupBy('country_num').count().orderBy('count',ascending=False).show(5,False)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "code", 602 | "execution_count": 144, 603 | "metadata": {}, 604 | "outputs": [ 605 | { 
606 | "name": "stdout", 607 | "output_type": "stream", 608 | "text": [ 609 | "+--------------+-----+\n", 610 | "|country_vector|count|\n", 611 | "+--------------+-----+\n", 612 | "|(3,[0],[1.0]) |12178|\n", 613 | "|(3,[1],[1.0]) |4018 |\n", 614 | "|(3,[2],[1.0]) |2586 |\n", 615 | "|(3,[],[]) |1218 |\n", 616 | "+--------------+-----+\n", 617 | "\n" 618 | ] 619 | } 620 | ], 621 | "source": [ 622 | "df.groupBy('country_vector').count().orderBy('count',ascending=False).show(5,False)" 623 | ] 624 | }, 625 | { 626 | "cell_type": "code", 627 | "execution_count": 145, 628 | "metadata": {}, 629 | "outputs": [], 630 | "source": [ 631 | "from pyspark.ml.feature import VectorAssembler" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 146, 637 | "metadata": {}, 638 | "outputs": [], 639 | "source": [ 640 | "df_assembler = VectorAssembler(inputCols=['platform_vector','country_vector','Age', 'Repeat_Visitor','Web_pages_viewed'], outputCol=\"features\")\n", 641 | "df = df_assembler.transform(df)" 642 | ] 643 | }, 644 | { 645 | "cell_type": "code", 646 | "execution_count": 147, 647 | "metadata": {}, 648 | "outputs": [ 649 | { 650 | "name": "stdout", 651 | "output_type": "stream", 652 | "text": [ 653 | "root\n", 654 | " |-- Country: string (nullable = true)\n", 655 | " |-- Age: integer (nullable = true)\n", 656 | " |-- Repeat_Visitor: integer (nullable = true)\n", 657 | " |-- Platform: string (nullable = true)\n", 658 | " |-- Web_pages_viewed: integer (nullable = true)\n", 659 | " |-- Status: integer (nullable = true)\n", 660 | " |-- platform_num: double (nullable = false)\n", 661 | " |-- platform_vector: vector (nullable = true)\n", 662 | " |-- country_num: double (nullable = false)\n", 663 | " |-- country_vector: vector (nullable = true)\n", 664 | " |-- features: vector (nullable = true)\n", 665 | "\n" 666 | ] 667 | } 668 | ], 669 | "source": [ 670 | "df.printSchema()" 671 | ] 672 | }, 673 | { 674 | "cell_type": "code", 675 | "execution_count": 148, 676 | "metadata": {}, 677 | "outputs": [ 678 | { 679 | "name": "stdout", 680 | "output_type": "stream", 681 | "text": [ 682 | "+-----------------------------------+------+\n", 683 | "|features |Status|\n", 684 | "+-----------------------------------+------+\n", 685 | "|[1.0,0.0,0.0,1.0,0.0,41.0,1.0,21.0]|1 |\n", 686 | "|[1.0,0.0,0.0,0.0,1.0,28.0,1.0,5.0] |0 |\n", 687 | "|(8,[1,4,5,7],[1.0,1.0,40.0,3.0]) |0 |\n", 688 | "|(8,[2,5,6,7],[1.0,31.0,1.0,15.0]) |1 |\n", 689 | "|(8,[1,5,7],[1.0,32.0,15.0]) |1 |\n", 690 | "|(8,[1,4,5,7],[1.0,1.0,32.0,3.0]) |0 |\n", 691 | "|(8,[1,4,5,7],[1.0,1.0,32.0,6.0]) |0 |\n", 692 | "|(8,[1,2,5,7],[1.0,1.0,27.0,9.0]) |0 |\n", 693 | "|(8,[0,2,5,7],[1.0,1.0,32.0,2.0]) |0 |\n", 694 | "|(8,[2,5,6,7],[1.0,31.0,1.0,16.0]) |1 |\n", 695 | "+-----------------------------------+------+\n", 696 | "only showing top 10 rows\n", 697 | "\n" 698 | ] 699 | } 700 | ], 701 | "source": [ 702 | "df.select(['features','Status']).show(10,False)" 703 | ] 704 | }, 705 | { 706 | "cell_type": "code", 707 | "execution_count": 149, 708 | "metadata": {}, 709 | "outputs": [], 710 | "source": [ 711 | "#select data for building model\n", 712 | "model_df=df.select(['features','Status'])" 713 | ] 714 | }, 715 | { 716 | "cell_type": "code", 717 | "execution_count": 150, 718 | "metadata": {}, 719 | "outputs": [], 720 | "source": [ 721 | "from pyspark.ml.classification import LogisticRegression" 722 | ] 723 | }, 724 | { 725 | "cell_type": "code", 726 | "execution_count": 151, 727 | "metadata": {}, 728 | "outputs": [], 729 | "source": [ 730 | 
"#split the data \n", 731 | "training_df,test_df=model_df.randomSplit([0.75,0.25])" 732 | ] 733 | }, 734 | { 735 | "cell_type": "code", 736 | "execution_count": 152, 737 | "metadata": {}, 738 | "outputs": [ 739 | { 740 | "data": { 741 | "text/plain": [ 742 | "14907" 743 | ] 744 | }, 745 | "execution_count": 152, 746 | "metadata": {}, 747 | "output_type": "execute_result" 748 | } 749 | ], 750 | "source": [ 751 | "training_df.count()" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 160, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "+------+-----+\n", 764 | "|Status|count|\n", 765 | "+------+-----+\n", 766 | "| 1| 7417|\n", 767 | "| 0| 7490|\n", 768 | "+------+-----+\n", 769 | "\n" 770 | ] 771 | } 772 | ], 773 | "source": [ 774 | "training_df.groupBy('Status').count().show()" 775 | ] 776 | }, 777 | { 778 | "cell_type": "code", 779 | "execution_count": 153, 780 | "metadata": {}, 781 | "outputs": [ 782 | { 783 | "data": { 784 | "text/plain": [ 785 | "5093" 786 | ] 787 | }, 788 | "execution_count": 153, 789 | "metadata": {}, 790 | "output_type": "execute_result" 791 | } 792 | ], 793 | "source": [ 794 | "test_df.count()" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": 161, 800 | "metadata": {}, 801 | "outputs": [ 802 | { 803 | "name": "stdout", 804 | "output_type": "stream", 805 | "text": [ 806 | "+------+-----+\n", 807 | "|Status|count|\n", 808 | "+------+-----+\n", 809 | "| 1| 2583|\n", 810 | "| 0| 2510|\n", 811 | "+------+-----+\n", 812 | "\n" 813 | ] 814 | } 815 | ], 816 | "source": [ 817 | "test_df.groupBy('Status').count().show()" 818 | ] 819 | }, 820 | { 821 | "cell_type": "code", 822 | "execution_count": 154, 823 | "metadata": {}, 824 | "outputs": [], 825 | "source": [ 826 | "log_reg=LogisticRegression(labelCol='Status').fit(training_df)" 827 | ] 828 | }, 829 | { 830 | "cell_type": "code", 831 | "execution_count": null, 832 | "metadata": {}, 833 | "outputs": [], 834 | "source": [ 835 | "#Training Results" 836 | ] 837 | }, 838 | { 839 | "cell_type": "code", 840 | "execution_count": 155, 841 | "metadata": {}, 842 | "outputs": [], 843 | "source": [ 844 | "train_results=log_reg.evaluate(training_df).predictions" 845 | ] 846 | }, 847 | { 848 | "cell_type": "code", 849 | "execution_count": 168, 850 | "metadata": {}, 851 | "outputs": [ 852 | { 853 | "name": "stdout", 854 | "output_type": "stream", 855 | "text": [ 856 | "+------+----------+----------------------------------------+\n", 857 | "|Status|prediction|probability |\n", 858 | "+------+----------+----------------------------------------+\n", 859 | "|1 |1.0 |[0.2978572628475072,0.7021427371524929] |\n", 860 | "|1 |1.0 |[0.2978572628475072,0.7021427371524929] |\n", 861 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n", 862 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n", 863 | "|1 |1.0 |[0.16704676975730415,0.8329532302426959]|\n", 864 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 865 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 866 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 867 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 868 | "|1 |1.0 |[0.08659913656062515,0.9134008634393749]|\n", 869 | "+------+----------+----------------------------------------+\n", 870 | "only showing top 10 rows\n", 871 | "\n" 872 | ] 873 | } 874 | ], 875 | "source": [ 876 | 
"train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).select(['Status','prediction','probability']).show(10,False)" 877 | ] 878 | }, 879 | { 880 | "cell_type": "markdown", 881 | "metadata": {}, 882 | "source": [ 883 | "Probability at 0 index is for 0 class and probabilty as 1 index is for 1 class" 884 | ] 885 | }, 886 | { 887 | "cell_type": "code", 888 | "execution_count": 177, 889 | "metadata": {}, 890 | "outputs": [], 891 | "source": [ 892 | "correct_preds=train_results.filter(train_results['Status']==1).filter(train_results['prediction']==1).count()\n" 893 | ] 894 | }, 895 | { 896 | "cell_type": "code", 897 | "execution_count": 174, 898 | "metadata": {}, 899 | "outputs": [ 900 | { 901 | "data": { 902 | "text/plain": [ 903 | "7417" 904 | ] 905 | }, 906 | "execution_count": 174, 907 | "metadata": {}, 908 | "output_type": "execute_result" 909 | } 910 | ], 911 | "source": [ 912 | "training_df.filter(training_df['Status']==1).count()" 913 | ] 914 | }, 915 | { 916 | "cell_type": "code", 917 | "execution_count": 178, 918 | "metadata": {}, 919 | "outputs": [ 920 | { 921 | "data": { 922 | "text/plain": [ 923 | "0.9366320614803829" 924 | ] 925 | }, 926 | "execution_count": 178, 927 | "metadata": {}, 928 | "output_type": "execute_result" 929 | } 930 | ], 931 | "source": [ 932 | "#accuracy on training dataset \n", 933 | "float(correct_preds)/(training_df.filter(training_df['Status']==1).count())" 934 | ] 935 | }, 936 | { 937 | "cell_type": "code", 938 | "execution_count": null, 939 | "metadata": {}, 940 | "outputs": [], 941 | "source": [ 942 | "#Test Set results" 943 | ] 944 | }, 945 | { 946 | "cell_type": "code", 947 | "execution_count": 170, 948 | "metadata": {}, 949 | "outputs": [], 950 | "source": [ 951 | "results=log_reg.evaluate(test_df).predictions" 952 | ] 953 | }, 954 | { 955 | "cell_type": "code", 956 | "execution_count": 93, 957 | "metadata": {}, 958 | "outputs": [ 959 | { 960 | "name": "stdout", 961 | "output_type": "stream", 962 | "text": [ 963 | "+------+----------+\n", 964 | "|Status|prediction|\n", 965 | "+------+----------+\n", 966 | "|0 |0.0 |\n", 967 | "|0 |0.0 |\n", 968 | "|0 |0.0 |\n", 969 | "|0 |0.0 |\n", 970 | "|1 |0.0 |\n", 971 | "|0 |0.0 |\n", 972 | "|1 |1.0 |\n", 973 | "|0 |1.0 |\n", 974 | "|1 |1.0 |\n", 975 | "|1 |1.0 |\n", 976 | "+------+----------+\n", 977 | "only showing top 10 rows\n", 978 | "\n" 979 | ] 980 | } 981 | ], 982 | "source": [ 983 | "results.select(['Status','prediction']).show(10,False)" 984 | ] 985 | }, 986 | { 987 | "cell_type": "code", 988 | "execution_count": 91, 989 | "metadata": {}, 990 | "outputs": [ 991 | { 992 | "name": "stdout", 993 | "output_type": "stream", 994 | "text": [ 995 | "root\n", 996 | " |-- features: vector (nullable = true)\n", 997 | " |-- Status: integer (nullable = true)\n", 998 | " |-- rawPrediction: vector (nullable = true)\n", 999 | " |-- probability: vector (nullable = true)\n", 1000 | " |-- prediction: double (nullable = false)\n", 1001 | "\n" 1002 | ] 1003 | } 1004 | ], 1005 | "source": [ 1006 | "results.printSchema()" 1007 | ] 1008 | }, 1009 | { 1010 | "cell_type": "code", 1011 | "execution_count": 92, 1012 | "metadata": {}, 1013 | "outputs": [], 1014 | "source": [ 1015 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 94, 1021 | "metadata": {}, 1022 | "outputs": [], 1023 | "source": [ 1024 | "#confusion matrix\n", 1025 | "true_postives = results[(results.Status == 1) & (results.prediction 
== 1)].count()\n", 1026 | "true_negatives = results[(results.Status == 0) & (results.prediction == 0)].count()\n", 1027 | "false_positives = results[(results.Status == 0) & (results.prediction == 1)].count()\n", 1028 | "false_negatives = results[(results.Status == 1) & (results.prediction == 0)].count()" 1029 | ] 1030 | }, 1031 | { 1032 | "cell_type": "code", 1033 | "execution_count": 98, 1034 | "metadata": {}, 1035 | "outputs": [ 1036 | { 1037 | "name": "stdout", 1038 | "output_type": "stream", 1039 | "text": [ 1040 | "2356\n", 1041 | "2363\n", 1042 | "158\n", 1043 | "157\n", 1044 | "5034\n", 1045 | "5034\n" 1046 | ] 1047 | } 1048 | ], 1049 | "source": [ 1050 | "print (true_postives)\n", 1051 | "print (true_negatives)\n", 1052 | "print (false_positives)\n", 1053 | "print (false_negatives)\n", 1054 | "print(true_postives+true_negatives+false_positives+false_negatives)\n", 1055 | "print (results.count())" 1056 | ] 1057 | }, 1058 | { 1059 | "cell_type": "code", 1060 | "execution_count": 99, 1061 | "metadata": {}, 1062 | "outputs": [ 1063 | { 1064 | "name": "stdout", 1065 | "output_type": "stream", 1066 | "text": [ 1067 | "0.937524870672503\n" 1068 | ] 1069 | } 1070 | ], 1071 | "source": [ 1072 | "recall = float(true_postives)/(true_postives + false_negatives)\n", 1073 | "print(recall)" 1074 | ] 1075 | }, 1076 | { 1077 | "cell_type": "code", 1078 | "execution_count": 100, 1079 | "metadata": {}, 1080 | "outputs": [ 1081 | { 1082 | "name": "stdout", 1083 | "output_type": "stream", 1084 | "text": [ 1085 | "0.9371519490851233\n" 1086 | ] 1087 | } 1088 | ], 1089 | "source": [ 1090 | "precision = float(true_postives) / (true_postives + false_positives)\n", 1091 | "print(precision)" 1092 | ] 1093 | }, 1094 | { 1095 | "cell_type": "code", 1096 | "execution_count": 103, 1097 | "metadata": {}, 1098 | "outputs": [ 1099 | { 1100 | "name": "stdout", 1101 | "output_type": "stream", 1102 | "text": [ 1103 | "0.9374255065554231\n" 1104 | ] 1105 | } 1106 | ], 1107 | "source": [ 1108 | "accuracy=float((true_postives+true_negatives) /(results.count()))\n", 1109 | "print(accuracy)" 1110 | ] 1111 | }, 1112 | { 1113 | "cell_type": "code", 1114 | "execution_count": null, 1115 | "metadata": {}, 1116 | "outputs": [], 1117 | "source": [] 1118 | } 1119 | ], 1120 | "metadata": { 1121 | "kernelspec": { 1122 | "display_name": "Python 3", 1123 | "language": "python", 1124 | "name": "python3" 1125 | }, 1126 | "language_info": { 1127 | "codemirror_mode": { 1128 | "name": "ipython", 1129 | "version": 3 1130 | }, 1131 | "file_extension": ".py", 1132 | "mimetype": "text/x-python", 1133 | "name": "python", 1134 | "nbconvert_exporter": "python", 1135 | "pygments_lexer": "ipython3", 1136 | "version": "3.6.3" 1137 | } 1138 | }, 1139 | "nbformat": 4, 1140 | "nbformat_minor": 2 1141 | } 1142 | -------------------------------------------------------------------------------- /chap_3/.ipynb_checkpoints/pyspark_basics-checkpoint.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Pyspark Basics" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "#create pyspark session\n", 17 | "from pyspark.sql import SparkSession\n", 18 | "spark=SparkSession.builder.appName('pyspark').getOrCreate()" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 121, 24 | "metadata": {}, 25 | "outputs": [], 26 | "source": [ 27 | 
"#read the data file\n", 28 | "df=spark.read.csv('conversion_data.csv')" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 122, 34 | "metadata": {}, 35 | "outputs": [ 36 | { 37 | "name": "stdout", 38 | "output_type": "stream", 39 | "text": [ 40 | "+-------+---+--------+------+-------------------+---------+\n", 41 | "| _c0|_c1| _c2| _c3| _c4| _c5|\n", 42 | "+-------+---+--------+------+-------------------+---------+\n", 43 | "|country|age|new_user|source|total_pages_visited|converted|\n", 44 | "| UK| 25| 1| Ads| 1| 0|\n", 45 | "| US| 23| 1| Seo| 5| 0|\n", 46 | "| US| 28| 1| Seo| 4| 0|\n", 47 | "| China| 39| 1| Seo| 5| 0|\n", 48 | "+-------+---+--------+------+-------------------+---------+\n", 49 | "only showing top 5 rows\n", 50 | "\n" 51 | ] 52 | } 53 | ], 54 | "source": [ 55 | "df.show(5)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 123, 61 | "metadata": {}, 62 | "outputs": [], 63 | "source": [ 64 | "#read the data file\n", 65 | "df=spark.read.csv('conversion_data.csv',header=True)" 66 | ] 67 | }, 68 | { 69 | "cell_type": "code", 70 | "execution_count": 124, 71 | "metadata": {}, 72 | "outputs": [ 73 | { 74 | "name": "stdout", 75 | "output_type": "stream", 76 | "text": [ 77 | "+-------+---+--------+------+-------------------+---------+\n", 78 | "|country|age|new_user|source|total_pages_visited|converted|\n", 79 | "+-------+---+--------+------+-------------------+---------+\n", 80 | "| UK| 25| 1| Ads| 1| 0|\n", 81 | "| US| 23| 1| Seo| 5| 0|\n", 82 | "| US| 28| 1| Seo| 4| 0|\n", 83 | "| China| 39| 1| Seo| 5| 0|\n", 84 | "| US| 30| 1| Seo| 6| 0|\n", 85 | "+-------+---+--------+------+-------------------+---------+\n", 86 | "only showing top 5 rows\n", 87 | "\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "df.show(5)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "code", 97 | "execution_count": 125, 98 | "metadata": {}, 99 | "outputs": [ 100 | { 101 | "name": "stdout", 102 | "output_type": "stream", 103 | "text": [ 104 | "root\n", 105 | " |-- country: string (nullable = true)\n", 106 | " |-- age: string (nullable = true)\n", 107 | " |-- new_user: string (nullable = true)\n", 108 | " |-- source: string (nullable = true)\n", 109 | " |-- total_pages_visited: string (nullable = true)\n", 110 | " |-- converted: string (nullable = true)\n", 111 | "\n" 112 | ] 113 | } 114 | ], 115 | "source": [ 116 | "df.printSchema()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": 126, 122 | "metadata": {}, 123 | "outputs": [ 124 | { 125 | "name": "stdout", 126 | "output_type": "stream", 127 | "text": [ 128 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n", 129 | "|summary|country| age| new_user|source|total_pages_visited| converted|\n", 130 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n", 131 | "| count| 316200| 316200| 316200|316200| 316200| 316200|\n", 132 | "| mean| null|30.569857685009488| 0.6854648956356736| null| 4.872966476913346|0.03225806451612903|\n", 133 | "| stddev| null| 8.271801801807728|0.46433119036384723| null| 3.341103757948214|0.17668497535763514|\n", 134 | "| min| China| 111| 0| Ads| 1| 0|\n", 135 | "| max| US| 79| 1| Seo| 9| 1|\n", 136 | "+-------+-------+------------------+-------------------+------+-------------------+-------------------+\n", 137 | "\n" 138 | ] 139 | } 140 | ], 141 | "source": [ 142 | "#statistical summary for data numerical columns\n", 143 | "df.describe().show()" 144 | ] 145 | }, 
146 | { 147 | "cell_type": "markdown", 148 | "metadata": {}, 149 | "source": [ 150 | "## Datatypes " 151 | ] 152 | }, 153 | { 154 | "cell_type": "code", 155 | "execution_count": 130, 156 | "metadata": {}, 157 | "outputs": [], 158 | "source": [ 159 | "from pyspark.sql.functions import col , column\n", 160 | "df = df.withColumn(\"age\", col(\"age\").cast(\"Int\"))\\\n", 161 | " .withColumn(\"new_user\", col(\"new_user\").cast(\"Int\"))\\\n", 162 | " .withColumn(\"total_pages_visited\", col(\"total_pages_visited\").cast(\"Int\"))\\\n", 163 | " .withColumn(\"converted\", col(\"converted\").cast(\"Int\"))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 131, 169 | "metadata": {}, 170 | "outputs": [ 171 | { 172 | "name": "stdout", 173 | "output_type": "stream", 174 | "text": [ 175 | "root\n", 176 | " |-- country: string (nullable = true)\n", 177 | " |-- age: integer (nullable = true)\n", 178 | " |-- new_user: integer (nullable = true)\n", 179 | " |-- source: string (nullable = true)\n", 180 | " |-- total_pages_visited: integer (nullable = true)\n", 181 | " |-- converted: integer (nullable = true)\n", 182 | "\n" 183 | ] 184 | } 185 | ], 186 | "source": [ 187 | "df.printSchema()" 188 | ] 189 | }, 190 | { 191 | "cell_type": "code", 192 | "execution_count": 132, 193 | "metadata": {}, 194 | "outputs": [ 195 | { 196 | "name": "stdout", 197 | "output_type": "stream", 198 | "text": [ 199 | "+-------+---+--------+------+-------------------+---------+\n", 200 | "|country|age|new_user|source|total_pages_visited|converted|\n", 201 | "+-------+---+--------+------+-------------------+---------+\n", 202 | "| UK| 25| 1| Ads| 1| 0|\n", 203 | "| US| 23| 1| Seo| 5| 0|\n", 204 | "| US| 28| 1| Seo| 4| 0|\n", 205 | "| China| 39| 1| Seo| 5| 0|\n", 206 | "| US| 30| 1| Seo| 6| 0|\n", 207 | "+-------+---+--------+------+-------------------+---------+\n", 208 | "only showing top 5 rows\n", 209 | "\n" 210 | ] 211 | } 212 | ], 213 | "source": [ 214 | "df.show(5)" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 133, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "data": { 224 | "text/plain": [ 225 | "Column" 226 | ] 227 | }, 228 | "execution_count": 133, 229 | "metadata": {}, 230 | "output_type": "execute_result" 231 | } 232 | ], 233 | "source": [ 234 | "#acess dataframe column , we get column object \n", 235 | "df['country']" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 134, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "data": { 245 | "text/plain": [ 246 | "pyspark.sql.column.Column" 247 | ] 248 | }, 249 | "execution_count": 134, 250 | "metadata": {}, 251 | "output_type": "execute_result" 252 | } 253 | ], 254 | "source": [ 255 | "type(df['country'])" 256 | ] 257 | }, 258 | { 259 | "cell_type": "code", 260 | "execution_count": 135, 261 | "metadata": {}, 262 | "outputs": [ 263 | { 264 | "name": "stdout", 265 | "output_type": "stream", 266 | "text": [ 267 | "+-------+\n", 268 | "|country|\n", 269 | "+-------+\n", 270 | "| UK|\n", 271 | "| US|\n", 272 | "| US|\n", 273 | "| China|\n", 274 | "| US|\n", 275 | "+-------+\n", 276 | "only showing top 5 rows\n", 277 | "\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "#access content of colum\n", 283 | "df.select('country').show(5)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 136, 289 | "metadata": {}, 290 | "outputs": [ 291 | { 292 | "name": "stdout", 293 | "output_type": "stream", 294 | "text": [ 295 | "+-------+------+\n", 296 | 
"|country|source|\n", 297 | "+-------+------+\n", 298 | "| UK| Ads|\n", 299 | "| US| Seo|\n", 300 | "| US| Seo|\n", 301 | "| China| Seo|\n", 302 | "| US| Seo|\n", 303 | "+-------+------+\n", 304 | "only showing top 5 rows\n", 305 | "\n" 306 | ] 307 | } 308 | ], 309 | "source": [ 310 | "#acess multiple columns\n", 311 | "df.select(['country','source']).show(5)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "markdown", 316 | "metadata": {}, 317 | "source": [ 318 | "### Add or Remove column " 319 | ] 320 | }, 321 | { 322 | "cell_type": "markdown", 323 | "metadata": {}, 324 | "source": [ 325 | "#### using udf (user defined functions)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 137, 331 | "metadata": {}, 332 | "outputs": [], 333 | "source": [ 334 | "from pyspark.sql.types import StringType\n", 335 | "from pyspark.sql.functions import udf\n", 336 | "\n", 337 | "def country_udf(country):\n", 338 | " if country =='UK':\n", 339 | " return 'Britain'\n", 340 | " elif country =='US':\n", 341 | " return 'USA'\n", 342 | " elif country =='China':\n", 343 | " return 'Asia'\n", 344 | " elif country =='Germany':\n", 345 | " return 'Deustche'\n", 346 | " else:\n", 347 | " return country\n", 348 | " \n", 349 | "spark_udf = udf(country_udf, StringType())\n", 350 | "\n", 351 | "df=df.withColumn(\"country_new\", spark_udf(df.country))" 352 | ] 353 | }, 354 | { 355 | "cell_type": "code", 356 | "execution_count": 138, 357 | "metadata": {}, 358 | "outputs": [ 359 | { 360 | "name": "stdout", 361 | "output_type": "stream", 362 | "text": [ 363 | "+-------+---+--------+------+-------------------+---------+-----------+\n", 364 | "|country|age|new_user|source|total_pages_visited|converted|country_new|\n", 365 | "+-------+---+--------+------+-------------------+---------+-----------+\n", 366 | "| UK| 25| 1| Ads| 1| 0| Britain|\n", 367 | "| US| 23| 1| Seo| 5| 0| USA|\n", 368 | "| US| 28| 1| Seo| 4| 0| USA|\n", 369 | "| China| 39| 1| Seo| 5| 0| Asia|\n", 370 | "| US| 30| 1| Seo| 6| 0| USA|\n", 371 | "| US| 31| 0| Seo| 1| 0| USA|\n", 372 | "| China| 27| 1| Seo| 4| 0| Asia|\n", 373 | "| US| 23| 0| Ads| 4| 0| USA|\n", 374 | "| UK| 29| 0|Direct| 4| 0| Britain|\n", 375 | "| US| 25| 0| Ads| 2| 0| USA|\n", 376 | "+-------+---+--------+------+-------------------+---------+-----------+\n", 377 | "only showing top 10 rows\n", 378 | "\n" 379 | ] 380 | } 381 | ], 382 | "source": [ 383 | "df.show(10)" 384 | ] 385 | }, 386 | { 387 | "cell_type": "markdown", 388 | "metadata": {}, 389 | "source": [ 390 | "#### without using udf " 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 139, 396 | "metadata": {}, 397 | "outputs": [], 398 | "source": [ 399 | "#create new column with age +2 value\n", 400 | "df=df.withColumn('new_age',df['age'] +2)" 401 | ] 402 | }, 403 | { 404 | "cell_type": "code", 405 | "execution_count": 140, 406 | "metadata": {}, 407 | "outputs": [ 408 | { 409 | "name": "stdout", 410 | "output_type": "stream", 411 | "text": [ 412 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n", 413 | "|country|age|new_user|source|total_pages_visited|converted|country_new|new_age|\n", 414 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n", 415 | "| UK| 25| 1| Ads| 1| 0| Britain| 27|\n", 416 | "| US| 23| 1| Seo| 5| 0| USA| 25|\n", 417 | "| US| 28| 1| Seo| 4| 0| USA| 30|\n", 418 | "| China| 39| 1| Seo| 5| 0| Asia| 41|\n", 419 | "| US| 30| 1| Seo| 6| 0| USA| 32|\n", 420 | "| US| 31| 0| Seo| 1| 0| USA| 33|\n", 421 | "| 
China| 27| 1| Seo| 4| 0| Asia| 29|\n", 422 | "| US| 23| 0| Ads| 4| 0| USA| 25|\n", 423 | "| UK| 29| 0|Direct| 4| 0| Britain| 31|\n", 424 | "| US| 25| 0| Ads| 2| 0| USA| 27|\n", 425 | "+-------+---+--------+------+-------------------+---------+-----------+-------+\n", 426 | "only showing top 10 rows\n", 427 | "\n" 428 | ] 429 | } 430 | ], 431 | "source": [ 432 | "df.show(10)" 433 | ] 434 | }, 435 | { 436 | "cell_type": "markdown", 437 | "metadata": {}, 438 | "source": [ 439 | "### Drop /Delete columns " 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 141, 445 | "metadata": {}, 446 | "outputs": [], 447 | "source": [ 448 | "#delete the new_age column\n", 449 | "df=df.drop('new_age')" 450 | ] 451 | }, 452 | { 453 | "cell_type": "code", 454 | "execution_count": 142, 455 | "metadata": {}, 456 | "outputs": [], 457 | "source": [ 458 | "#delete the country_new column\n", 459 | "df=df.drop('country_new')" 460 | ] 461 | }, 462 | { 463 | "cell_type": "code", 464 | "execution_count": 143, 465 | "metadata": {}, 466 | "outputs": [ 467 | { 468 | "name": "stdout", 469 | "output_type": "stream", 470 | "text": [ 471 | "+-------+---+--------+------+-------------------+---------+\n", 472 | "|country|age|new_user|source|total_pages_visited|converted|\n", 473 | "+-------+---+--------+------+-------------------+---------+\n", 474 | "| UK| 25| 1| Ads| 1| 0|\n", 475 | "| US| 23| 1| Seo| 5| 0|\n", 476 | "| US| 28| 1| Seo| 4| 0|\n", 477 | "| China| 39| 1| Seo| 5| 0|\n", 478 | "| US| 30| 1| Seo| 6| 0|\n", 479 | "+-------+---+--------+------+-------------------+---------+\n", 480 | "only showing top 5 rows\n", 481 | "\n" 482 | ] 483 | } 484 | ], 485 | "source": [ 486 | "df.show(5)" 487 | ] 488 | }, 489 | { 490 | "cell_type": "markdown", 491 | "metadata": {}, 492 | "source": [ 493 | "# Acess row objects of dataframe" 494 | ] 495 | }, 496 | { 497 | "cell_type": "code", 498 | "execution_count": 144, 499 | "metadata": {}, 500 | "outputs": [ 501 | { 502 | "data": { 503 | "text/plain": [ 504 | "[Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0),\n", 505 | " Row(country='US', age=23, new_user=1, source='Seo', total_pages_visited=5, converted=0),\n", 506 | " Row(country='US', age=28, new_user=1, source='Seo', total_pages_visited=4, converted=0)]" 507 | ] 508 | }, 509 | "execution_count": 144, 510 | "metadata": {}, 511 | "output_type": "execute_result" 512 | } 513 | ], 514 | "source": [ 515 | "#access first 3 rows\n", 516 | "df.head(3)" 517 | ] 518 | }, 519 | { 520 | "cell_type": "code", 521 | "execution_count": 145, 522 | "metadata": {}, 523 | "outputs": [ 524 | { 525 | "data": { 526 | "text/plain": [ 527 | "Row(country='UK', age=25, new_user=1, source='Ads', total_pages_visited=1, converted=0)" 528 | ] 529 | }, 530 | "execution_count": 145, 531 | "metadata": {}, 532 | "output_type": "execute_result" 533 | } 534 | ], 535 | "source": [ 536 | "#access first row object \n", 537 | "df.head(3)[0]" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 146, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "data": { 547 | "text/plain": [ 548 | "'UK'" 549 | ] 550 | }, 551 | "execution_count": 146, 552 | "metadata": {}, 553 | "output_type": "execute_result" 554 | } 555 | ], 556 | "source": [ 557 | "#access first row object\n", 558 | "df.head(3)[0][0]" 559 | ] 560 | }, 561 | { 562 | "cell_type": "markdown", 563 | "metadata": {}, 564 | "source": [ 565 | "## Filtering " 566 | ] 567 | }, 568 | { 569 | "cell_type": "code", 570 | "execution_count": 
147, 571 | "metadata": {}, 572 | "outputs": [ 573 | { 574 | "name": "stdout", 575 | "output_type": "stream", 576 | "text": [ 577 | "+-------+---+--------+------+-------------------+---------+\n", 578 | "|country|age|new_user|source|total_pages_visited|converted|\n", 579 | "+-------+---+--------+------+-------------------+---------+\n", 580 | "|Germany|123| 0| Seo| 15| 1|\n", 581 | "| US| 77| 0|Direct| 4| 0|\n", 582 | "| US| 79| 1|Direct| 1| 0|\n", 583 | "| UK|111| 0| Ads| 10| 1|\n", 584 | "+-------+---+--------+------+-------------------+---------+\n", 585 | "\n" 586 | ] 587 | } 588 | ], 589 | "source": [ 590 | "#filter records where age of user is more than 75 years\n", 591 | "df.filter(df['age'] >75).show(5)" 592 | ] 593 | }, 594 | { 595 | "cell_type": "code", 596 | "execution_count": 148, 597 | "metadata": {}, 598 | "outputs": [ 599 | { 600 | "name": "stdout", 601 | "output_type": "stream", 602 | "text": [ 603 | "+-------+---------+---+\n", 604 | "|country|converted|age|\n", 605 | "+-------+---------+---+\n", 606 | "|Germany| 1|123|\n", 607 | "| US| 0| 77|\n", 608 | "| US| 0| 79|\n", 609 | "| UK| 1|111|\n", 610 | "+-------+---------+---+\n", 611 | "\n" 612 | ] 613 | } 614 | ], 615 | "source": [ 616 | "#filter records and show only country and converted status of that user\n", 617 | "df.filter(df['age'] > 75).select(['country','converted','age']).show(5)" 618 | ] 619 | }, 620 | { 621 | "cell_type": "markdown", 622 | "metadata": {}, 623 | "source": [ 624 | "### Multiple filter conditions" 625 | ] 626 | }, 627 | { 628 | "cell_type": "code", 629 | "execution_count": 149, 630 | "metadata": {}, 631 | "outputs": [ 632 | { 633 | "name": "stdout", 634 | "output_type": "stream", 635 | "text": [ 636 | "+-------+---+--------+------+-------------------+---------+\n", 637 | "|country|age|new_user|source|total_pages_visited|converted|\n", 638 | "+-------+---+--------+------+-------------------+---------+\n", 639 | "| US| 77| 0|Direct| 4| 0|\n", 640 | "| US| 79| 1|Direct| 1| 0|\n", 641 | "+-------+---+--------+------+-------------------+---------+\n", 642 | "\n" 643 | ] 644 | } 645 | ], 646 | "source": [ 647 | "#select people over 75 years only from US\n", 648 | "df.filter(df['age'] > 75).filter(df['country'] =='US').show(5)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 150, 654 | "metadata": {}, 655 | "outputs": [ 656 | { 657 | "name": "stdout", 658 | "output_type": "stream", 659 | "text": [ 660 | "+-------+---+--------+------+-------------------+---------+\n", 661 | "|country|age|new_user|source|total_pages_visited|converted|\n", 662 | "+-------+---+--------+------+-------------------+---------+\n", 663 | "|Germany| 31| 0|Direct| 2| 1|\n", 664 | "+-------+---+--------+------+-------------------+---------+\n", 665 | "\n" 666 | ] 667 | } 668 | ], 669 | "source": [ 670 | "#select users from Germany who visited fewer than 3 pages but still converted \n", 671 | "df.filter(df['total_pages_visited'] < 3).filter(df['converted']==1).filter(df['country'] =='Germany').show(5)" 672 | ] 673 | }, 674 | { 675 | "cell_type": "markdown", 676 | "metadata": {}, 677 | "source": [ 678 | "## Count Records " 679 | ] 680 | }, 681 | { 682 | "cell_type": "code", 683 | "execution_count": 151, 684 | "metadata": {}, 685 | "outputs": [ 686 | { 687 | "data": { 688 | "text/plain": [ 689 | "316200" 690 | ] 691 | }, 692 | "execution_count": 151, 693 | "metadata": {}, 694 | "output_type": "execute_result" 695 | } 696 | ], 697 | "source": [ 698 | "#total records in df \n", 699 | "df.count()"
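# (sketch) count() returns the number of rows; the number of distinct values
# in a column comes from distinct() or the countDistinct aggregate:
from pyspark.sql.functions import countDistinct
df.select(countDistinct('country'), countDistinct('source')).show()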
700 | ] 701 | }, 702 | { 703 | "cell_type": "code", 704 | "execution_count": 152, 705 | "metadata": {}, 706 | "outputs": [ 707 | { 708 | "name": "stdout", 709 | "output_type": "stream", 710 | "text": [ 711 | "+-------+------+\n", 712 | "|country| count|\n", 713 | "+-------+------+\n", 714 | "|Germany| 13056|\n", 715 | "| China| 76602|\n", 716 | "| US|178092|\n", 717 | "| UK| 48450|\n", 718 | "+-------+------+\n", 719 | "\n" 720 | ] 721 | } 722 | ], 723 | "source": [ 724 | "# frequency count of users per country\n", 725 | "df.groupBy('country').count().show(5)" 726 | ] 727 | }, 728 | { 729 | "cell_type": "code", 730 | "execution_count": 153, 731 | "metadata": {}, 732 | "outputs": [ 733 | { 734 | "name": "stdout", 735 | "output_type": "stream", 736 | "text": [ 737 | "+-------+------+\n", 738 | "|country| count|\n", 739 | "+-------+------+\n", 740 | "| US|178092|\n", 741 | "| China| 76602|\n", 742 | "| UK| 48450|\n", 743 | "|Germany| 13056|\n", 744 | "+-------+------+\n", 745 | "\n" 746 | ] 747 | } 748 | ], 749 | "source": [ 750 | "# frequency count per country, ordered by count in descending order\n", 751 | "df.groupBy('country').count().orderBy('count',ascending=False).show(5)" 752 | ] 753 | }, 754 | { 755 | "cell_type": "code", 756 | "execution_count": 154, 757 | "metadata": {}, 758 | "outputs": [ 759 | { 760 | "name": "stdout", 761 | "output_type": "stream", 762 | "text": [ 763 | "+---------+------+\n", 764 | "|converted| count|\n", 765 | "+---------+------+\n", 766 | "| 1| 10200|\n", 767 | "| 0|306000|\n", 768 | "+---------+------+\n", 769 | "\n" 770 | ] 771 | } 772 | ], 773 | "source": [ 774 | "# total converted vs. non-converted user counts\n", 775 | "df.groupBy('converted').count().show(2)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 156, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "+---------+------------------+------------------+------------------------+--------------+\n", 788 | "|converted| avg(age)| avg(new_user)|avg(total_pages_visited)|avg(converted)|\n", 789 | "+---------+------------------+------------------+------------------------+--------------+\n", 790 | "| 1|26.546764705882353|0.2979411764705882| 14.553529411764705| 1.0|\n", 791 | "| 0|30.703960784313725|0.6983823529411765| 4.550281045751634| 0.0|\n", 792 | "+---------+------------------+------------------+------------------------+--------------+\n", 793 | "\n" 794 | ] 795 | } 796 | ], 797 | "source": [ 798 | "# mean value of each numeric column, grouped by converted status\n", 799 | "df.groupBy('converted').mean().show()" 800 | ] 801 | }, 802 | { 803 | "cell_type": "markdown", 804 | "metadata": {}, 805 | "source": [ 806 | "## Collect " 807 | ] 808 | }, 809 | { 810 | "cell_type": "markdown", 811 | "metadata": {}, 812 | "source": [ 813 | "Save the results as a list of Row objects\n" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": 57, 819 | "metadata": {}, 820 | "outputs": [], 821 | "source": [ 822 | "# create a list containing only converted users from China\n", 823 | "china_data = df.filter((df['country'] == 'China') & (df['converted'] == 1)).collect()" 824 | ] 825 | }, 826 | { 827 | "cell_type": "code", 828 | "execution_count": 60, 829 | "metadata": {}, 830 | "outputs": [ 831 | { 832 | "data": { 833 | "text/plain": [ 834 | "[Row(country='China', age='24', new_user='0', source='Seo', total_pages_visited='18', converted='1'),\n", 835 | " Row(country='China', age='26', new_user='1', source='Ads', total_pages_visited='18', converted='1'),\n", 836 | " 
Row(country='China', age='30', new_user='0', source='Ads', total_pages_visited='17', converted='1'),\n", 837 | " Row(country='China', age='26', new_user='0', source='Seo', total_pages_visited='8', converted='1'),\n", 838 | " Row(country='China', age='33', new_user='1', source='Direct', total_pages_visited='13', converted='1')]" 839 | ] 840 | }, 841 | "execution_count": 60, 842 | "metadata": {}, 843 | "output_type": "execute_result" 844 | } 845 | ], 846 | "source": [ 847 | "# view the first five entries of the new list\n", 848 | "china_data[:5]" 849 | ] 850 | }, 851 | { 852 | "cell_type": "code", 853 | "execution_count": 67, 854 | "metadata": {}, 855 | "outputs": [], 856 | "source": [ 857 | "# convert the first Row object into a dictionary\n", 858 | "china_dict = china_data[0].asDict()" 859 | ] 860 | }, 861 | { 862 | "cell_type": "code", 863 | "execution_count": 64, 864 | "metadata": {}, 865 | "outputs": [ 866 | { 867 | "data": { 868 | "text/plain": [ 869 | "'24'" 870 | ] 871 | }, 872 | "execution_count": 64, 873 | "metadata": {}, 874 | "output_type": "execute_result" 875 | } 876 | ], 877 | "source": [ 878 | "# access individual values by column name\n", "china_dict['age']" 879 | ] 880 | }, 881 | { 882 | "cell_type": "code", 883 | "execution_count": 65, 884 | "metadata": {}, 885 | "outputs": [ 886 | { 887 | "data": { 888 | "text/plain": [ 889 | "'18'" 890 | ] 891 | }, 892 | "execution_count": 65, 893 | "metadata": {}, 894 | "output_type": "execute_result" 895 | } 896 | ], 897 | "source": [ 898 | "china_dict['total_pages_visited']" 899 | ] 900 | }, 901 | { 902 | "cell_type": "markdown", 903 | "metadata": {}, 904 | "source": [ 905 | "## Aggregate Functions" 906 | ] 907 | }, 908 | { 909 | "cell_type": "code", 910 | "execution_count": 158, 911 | "metadata": {}, 912 | "outputs": [ 913 | { 914 | "name": "stdout", 915 | "output_type": "stream", 916 | "text": [ 917 | "+------------------+\n", 918 | "| avg(age)|\n", 919 | "+------------------+\n", 920 | "|30.569857685009488|\n", 921 | "+------------------+\n", 922 | "\n" 923 | ] 924 | } 925 | ], 926 | "source": [ 927 | "# mean age across all records\n", "df.agg({'age':'mean'}).show()" 928 | ] 929 | }, 930 | { 931 | "cell_type": "code", 932 | "execution_count": 159, 933 | "metadata": {}, 934 | "outputs": [ 935 | { 936 | "name": "stdout", 937 | "output_type": "stream", 938 | "text": [ 939 | "+-------------------+\n", 940 | "| avg(converted)|\n", 941 | "+-------------------+\n", 942 | "|0.03225806451612903|\n", 943 | "+-------------------+\n", 944 | "\n" 945 | ] 946 | } 947 | ], 948 | "source": [ 949 | "# overall conversion rate\n", "df.agg({'converted':'mean'}).show()" 950 | ] 951 | }, 952 | { 953 | "cell_type": "code", 954 | "execution_count": 160, 955 | "metadata": {}, 956 | "outputs": [ 957 | { 958 | "name": "stdout", 959 | "output_type": "stream", 960 | "text": [ 961 | "+--------+\n", 962 | "|max(age)|\n", 963 | "+--------+\n", 964 | "| 123|\n", 965 | "+--------+\n", 966 | "\n" 967 | ] 968 | } 969 | ], 970 | "source": [ 971 | "# maximum age in the data\n", "df.agg({'age':'max'}).show()" 972 | ] 973 | }, 974 | { 975 | "cell_type": "code", 976 | "execution_count": 161, 977 | "metadata": {}, 978 | "outputs": [ 979 | { 980 | "name": "stdout", 981 | "output_type": "stream", 982 | "text": [ 983 | "+--------------+\n", 984 | "|count(country)|\n", 985 | "+--------------+\n", 986 | "| 316200|\n", 987 | "+--------------+\n", 988 | "\n" 989 | ] 990 | } 991 | ], 992 | "source": [ 993 | "# count of non-null country values\n", "df.agg({'country':'count'}).show()" 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": 162, 999 | "metadata": {}, 1000 | "outputs": [ 1001 | { 1002 | "name": "stdout", 1003 | "output_type": "stream", 1004 | "text": [ 1005 | 
"+--------+\n", 1006 | "|min(age)|\n", 1007 | "+--------+\n", 1008 | "| 17|\n", 1009 | "+--------+\n", 1010 | "\n" 1011 | ] 1012 | } 1013 | ], 1014 | "source": [ 1015 | "df.agg({'age':'min'}).show()" 1016 | ] 1017 | }, 1018 | { 1019 | "cell_type": "code", 1020 | "execution_count": 163, 1021 | "metadata": {}, 1022 | "outputs": [ 1023 | { 1024 | "name": "stdout", 1025 | "output_type": "stream", 1026 | "text": [ 1027 | "+-------+--------+\n", 1028 | "|country|max(age)|\n", 1029 | "+-------+--------+\n", 1030 | "|Germany| 123|\n", 1031 | "| China| 69|\n", 1032 | "| US| 79|\n", 1033 | "| UK| 111|\n", 1034 | "+-------+--------+\n", 1035 | "\n" 1036 | ] 1037 | } 1038 | ], 1039 | "source": [ 1040 | "## aggregation on grouped data by country\n", 1041 | "df.groupBy('country').agg({'age':'max'}).show()" 1042 | ] 1043 | }, 1044 | { 1045 | "cell_type": "code", 1046 | "execution_count": 164, 1047 | "metadata": {}, 1048 | "outputs": [ 1049 | { 1050 | "name": "stdout", 1051 | "output_type": "stream", 1052 | "text": [ 1053 | "+-------+------+--------+\n", 1054 | "|country|source|max(age)|\n", 1055 | "+-------+------+--------+\n", 1056 | "|Germany|Direct| 61|\n", 1057 | "| China|Direct| 65|\n", 1058 | "| UK| Ads| 111|\n", 1059 | "| US| Seo| 73|\n", 1060 | "| UK| Seo| 66|\n", 1061 | "|Germany| Seo| 123|\n", 1062 | "|Germany| Ads| 64|\n", 1063 | "| China| Seo| 68|\n", 1064 | "| UK|Direct| 69|\n", 1065 | "| China| Ads| 69|\n", 1066 | "| US| Ads| 70|\n", 1067 | "| US|Direct| 79|\n", 1068 | "+-------+------+--------+\n", 1069 | "\n" 1070 | ] 1071 | } 1072 | ], 1073 | "source": [ 1074 | "## aggregation on grouped data by country,source\n", 1075 | "df.groupBy(['country','source']).agg({'age':'max'}).show()" 1076 | ] 1077 | }, 1078 | { 1079 | "cell_type": "code", 1080 | "execution_count": 170, 1081 | "metadata": {}, 1082 | "outputs": [ 1083 | { 1084 | "name": "stdout", 1085 | "output_type": "stream", 1086 | "text": [ 1087 | "+-------+---------+------------------------+\n", 1088 | "|country|converted|avg(total_pages_visited)|\n", 1089 | "+-------+---------+------------------------+\n", 1090 | "|Germany| 0| 4.565277777777778|\n", 1091 | "| China| 1| 14.352941176470589|\n", 1092 | "| China| 0| 4.5404575163398695|\n", 1093 | "| US| 0| 4.551785714285714|\n", 1094 | "| UK| 0| 4.557037037037037|\n", 1095 | "|Germany| 1| 14.572303921568627|\n", 1096 | "| UK| 1| 14.53450980392157|\n", 1097 | "| US| 1| 14.561497326203208|\n", 1098 | "+-------+---------+------------------------+\n", 1099 | "\n" 1100 | ] 1101 | } 1102 | ], 1103 | "source": [ 1104 | "## aggregation on grouped data by country,converted\n", 1105 | "df.groupBy(['country','converted']).agg({'total_pages_visited':'mean'}).show()" 1106 | ] 1107 | }, 1108 | { 1109 | "cell_type": "code", 1110 | "execution_count": null, 1111 | "metadata": {}, 1112 | "outputs": [], 1113 | "source": [] 1114 | } 1115 | ], 1116 | "metadata": { 1117 | "kernelspec": { 1118 | "display_name": "Python 3", 1119 | "language": "python", 1120 | "name": "python3" 1121 | }, 1122 | "language_info": { 1123 | "codemirror_mode": { 1124 | "name": "ipython", 1125 | "version": 3 1126 | }, 1127 | "file_extension": ".py", 1128 | "mimetype": "text/x-python", 1129 | "name": "python", 1130 | "nbconvert_exporter": "python", 1131 | "pygments_lexer": "ipython3", 1132 | "version": "3.6.3" 1133 | } 1134 | }, 1135 | "nbformat": 4, 1136 | "nbformat_minor": 2 1137 | } 1138 | --------------------------------------------------------------------------------