├── Build Pipeline
    ├── DataPipeline.ipynb
    ├── ML-Pipeline.py
    ├── build-pipline.py
    ├── data_ETL.py
    ├── database.csv
    ├── query.csv
    ├── server.py
    └── template
    │   ├── dashboard.html
    │   └── file
├── Clustering_ML.ipynb
├── DataFrame_Basic_Operations.ipynb
├── Data_Transformations.ipynb
├── Dates_and_Timestamps.ipynb
├── GroupBy_and_Aggregate_Functions.ipynb
├── Linear_Regression_Consulting.ipynb
├── Logistic_Regression_Consulting.ipynb
├── Missing_Data.ipynb
├── MymasterNote.ipynb
├── README.md
├── Read_Write_and_Validate_Data.py
├── Recommender_System.ipynb
├── SQL_notebook.pdf
├── Spark_Streaming.ipynb
├── TweetRead.py
├── big_data
    ├── SQL-in-Spark.py
    ├── partition_parquet_file.py
    ├── readme.md
    ├── search_filter_dataframe.py
    └── split-column.py
├── books
    ├── LearningSpark2.0.pdf
    ├── pyspark.pdf
    └── spark-hadoop.pdf
├── data
    ├── ContainsNull.csv
    ├── appl_stock.csv
    ├── cruise_ship_info.csv
    ├── customer_churn.csv
    ├── people.json
    ├── sales_info.csv
    ├── users1.parquet
    ├── users2.parquet
    └── users3.parquet
├── scripts
    ├── MymasterNote.ipynb
    ├── PySpark_Dataframe_all.ipynb
    ├── aggrigating-data-in-DataFrame.py
    ├── join-append-DataFrame.py
    ├── join_tabales.ipynb
    ├── manipulating-data-in-DataFrame.py
    ├── multiple_csv_to_dataframe.py
    ├── pivote-table.py
    ├── pyspark-dataframe.py
    ├── read_write_DataFrame.py
    ├── sample_data
    │   ├── data
    │   └── data.txt
    ├── search-filter-DataFrame.py
    ├── update-column-DataFrame.py
    └── user-defined-function.py
├── search_filter_dataframe.py
└── spark-env.yml


/Build Pipeline/ML-Pipeline.py:
--------------------------------------------------------------------------------
 1 | import pyspark 
 2 | from pyspark.sql import SparkSession
 3 | from pyspark.sql.types import *
 4 | from pyspark.sql.functions import *
 5 | from pyspark.ml import Pipeline
 6 | from pyspark.ml.regression import RandomForestRegressor
 7 | from pyspark.ml.feature import VectorAssembler
 8 | from pyspark.ml.evaluation import RegressionEvaluator
 9 | import numpy as np 
10 | 
11 | # Create spark session
12 | spark = SparkSession\
13 |     .builder\
14 |     .master('local[2]')\
15 |     .appName('quakes_ml')\
16 |     .config('spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.1')\
17 |     .getOrCreate()
18 | 
19 | """
20 | Data Pre-processing
21 | """
22 | # Load test data file
23 | df_test = spark.read.csv(r"query.csv", header=True)
24 | 
25 | # Load quakes data from mongodb 
26 | df_train = spark.read.format('mongo')\
27 |     .option('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/Quake.quakes').load()
28 | 
29 | # Select fields we are going to use from df_test
30 | df_test_clean = df_test['time', 'latitude', 'longitude', 'mag', 'depth']
31 | 
32 | # Rename fields in df_test_clean
33 | df_test_clean = df_test_clean.withColumnRenamed('time', 'Date')\
34 |     .withColumnRenamed('latitude', 'Latitude')\
35 |     .withColumnRenamed('longitude', 'Longitude')\
36 |     .withColumnRenamed('mag', 'Magnitude')\
37 |     .withColumnRenamed('depth', 'Depth')
38 | 
39 | # Cast string fields to double
40 | df_test_clean = df_test_clean.withColumn('Latitude', df_test_clean['Latitude'].cast(DoubleType()))\
41 |     .withColumn('Longitude', df_test_clean['Longitude'].cast(DoubleType()))\
42 |     .withColumn('Depth', df_test_clean['Depth'].cast(DoubleType()))\
43 |     .withColumn('Magnitude', df_test_clean['Magnitude'].cast(DoubleType()))
44 | 
45 | # Create training and testing dataframes
46 | df_testing = df_test_clean['Latitude', 'Longitude', 'Magnitude', 'Depth']
47 | df_training = df_train['Latitude', 'Longitude', 'Magnitude', 'Depth']
48 | 
49 | # Remove nulls from our datasets
50 | df_training = df_training.dropna()
51 | df_testing = df_testing.dropna()
52 | 
53 | """
54 | Building the machine learning model
55 | """
56 | # Create feature vector
57 | assembler = VectorAssembler(inputCols=['Latitude', 'Longitude', 'Depth'], outputCol='features')
58 | 
59 | # Create the model
60 | model_reg = RandomForestRegressor(featuresCol='features', labelCol='Magnitude')
61 | 
62 | # Chain assembler and model into a pipleine
63 | pipeline = Pipeline(stages=[assembler, model_reg])
64 | 
65 | # Train the Model
66 | model = pipeline.fit(df_training)
67 | 
68 | # Make the prediction
69 | pred_results = model.transform(df_testing)
70 | 
71 | # Evaluate model
72 | evaluator = RegressionEvaluator(labelCol='Magnitude', predictionCol='prediction', metricName='rmse')
73 | rmse = evaluator.evaluate(pred_results)
74 | 
75 | """
76 | Create the prediction dataset
77 | """
78 | df_pred_results = pred_results['Latitude', 'Longitude', 'prediction']
79 | 
80 | # Rename the prediction field
81 | df_pred_results = df_pred_results.withColumnRenamed('prediction', 'Pred_Magnitude')
82 | 
83 | # Add more columns
84 | df_pred_results = df_pred_results.withColumn('Year', lit(2017))\
85 |     .withColumn('RMSE', lit(rmse))
86 | 
87 | # Load the pred dataset to MongoDB
88 | df_pred_results.write.format('mongo')\
89 |     .mode('overwrite')\
90 |     .option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Quake.pred_results').save()
91 | 
92 | print(df_pred_results.show(5))
93 | 
94 | print('INFO: Job ran successfully')
95 | print('')
96 | 
97 | 


--------------------------------------------------------------------------------
/Build Pipeline/build-pipline.py:
--------------------------------------------------------------------------------
 1 | import findspark
 2 | #findspark.init()
 3 | import pyspark
 4 | from pyspark.sql import SparkSession
 5 | from pyspark.sql.types import *      # convert datatype from one type to another
 6 | from pyspark.sql.functions import *  # manipulation of data
 7 | 
 8 | spark = SparkSession.builder.getOrCreate()
 9 | df = spark.sql("select 'name' as colomn")   # create a dataframe
10 | df.show()
11 | 
12 | # Configure spark session with 2 cores for this job
13 | spark = SparkSession\
14 |     .builder\
15 |     .master('local[2]')\
16 |     .appName('quake_etl')\
17 |     .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.1')\
18 |     .getOrCreate()
19 | 
20 | spark
21 | # Load the dataset from https://github.com/EBISYS/WaterWatch
22 | df_load = spark.read.csv(r"Datasets/database.csv", header=True)
23 | # Preview df_load
24 | df_load.take(1)
25 | df_load.columns
26 | df_load.printSchema()
27 | 
28 | # Drop fields we don't need from df_load
29 | lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations',
30 |                        'Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap',
31 |                        'Horizontal Distance','Horizontal Error',
32 |                        'Root Mean Square','Source','Location Source','Magnitude Source','Status']
33 | 
34 | df_load = df_load.drop(*lst_dropped_columns)
35 | # Preview df_load
36 | df_load.show(5)
37 | # Create a "year" field and add it to the dataframe
38 | df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy')))
39 | # Preview df_load
40 | df_load.show(5)
41 | # Build the quakes frequency dataframe using the year field and counts for each year
42 | df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts')
43 | # Preview df_quake_freq
44 | df_quake_freq.show(5)
45 | 
46 | # Preview df_load schema
47 | df_load.printSchema()
48 | 
49 | # Cast some fields from string into numeric types
50 | df_load = df_load.withColumn('Latitude', df_load['Latitude'].cast(DoubleType()))\
51 |     .withColumn('Longitude', df_load['Longitude'].cast(DoubleType()))\
52 |     .withColumn('Depth', df_load['Depth'].cast(DoubleType()))\
53 |     .withColumn('Magnitude', df_load['Magnitude'].cast(DoubleType()))
54 | 
55 | # Preview df_load
56 | df_load.show(5)
57 | 
58 | # Preview df_load schema
59 | df_load.printSchema()
60 | 
61 | # Create avg magnitude and max magnitude fields and add to df_quake_freq
62 | df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'max_magnitude')
63 | df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'avg_magnitude')
64 | 
65 | df_avg.show(5)
66 | 
67 | # Join df_max, and df_avg to df_quake_freq
68 | df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year'])
69 | # Preview df_quake_freq
70 | df_quake_freq.printSchema()
71 | 
72 | # Remove nulls
73 | df_load.dropna()
74 | df_quake_freq.dropna()


--------------------------------------------------------------------------------
/Build Pipeline/data_ETL.py:
--------------------------------------------------------------------------------
 1 | import pyspark 
 2 | from pyspark.sql import SparkSession
 3 | from pyspark.sql.types import *
 4 | from pyspark.sql.functions import *
 5 | 
 6 | # Create the spark session
 7 | spark = SparkSession\
 8 |     .builder\
 9 |     .master('local[2]')\
10 |     .appName('quakes_etl')\
11 |     .config('spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.1')\
12 |     .getOrCreate()
13 | 
14 | # Load the dataset
15 | df_load = spark.read.csv(r"database.csv", header=True)
16 | 
17 | # Remove all fields we don't need
18 | lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations','Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap', 'Horizontal Distance','Horizontal Error',
19 |     'Root Mean Square','Source','Location Source','Magnitude Source','Status']
20 | 
21 | df_load = df_load.drop(*lst_dropped_columns)
22 | 
23 | # Create a year field and add it to the df_load dataframe
24 | df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy')))
25 | 
26 | # Create the quakes freq dataframe form the year and count values
27 | df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts')
28 | 
29 | # Cast string fields to double types
30 | df_load = df_load.withColumn('Latitude', df_load['Latitude'].cast(DoubleType()))\
31 |     .withColumn('Longitude', df_load['Longitude'].cast(DoubleType()))\
32 |     .withColumn('Depth', df_load['Depth'].cast(DoubleType()))\
33 |     .withColumn('Magnitude', df_load['Magnitude'].cast(DoubleType()))
34 | 
35 | # Create avg and max magnitude fields and add to df_quake_freq
36 | df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'Max_Magnitude')
37 | df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'Avg_Magnitude')
38 | 
39 | # Join the max and avg dfs to df_quake_freq
40 | df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year'])
41 | 
42 | # Remove records with null values
43 | df_load.dropna()
44 | df_quake_freq.dropna()
45 | 
46 | # Load df_load into mongodb
47 | df_load.write.format('mongo')\
48 |     .mode('overwrite')\
49 |     .option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Quake.quakes').save()
50 | 
51 | # Load df_quake_freq into mongodb
52 | df_quake_freq.write.format('mongo')\
53 |     .mode('overwrite')\
54 |     .option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Quake.quake_freq').save()
55 | 
56 | # Print dataframe heads
57 | print(df_quake_freq.show(5))
58 | print(df_load.show(5))
59 | 
60 | print('INFO: Job ran successfully')
61 | print('')
62 | 
63 | 
64 | # submit job: spark-submit --packages org.mongodb.spark:mongo-spark-connector_2.12:2.4.1 data_ETL.py
65 | 
66 | 
67 | 


--------------------------------------------------------------------------------
/Build Pipeline/server.py:
--------------------------------------------------------------------------------
 1 | from flask import Flask, render_template, request
 2 | 
 3 | app = Flask(__name__)
 4 | 
 5 | # Index page, no args
 6 | @app.route('/')
 7 | def index():
 8 |     return render_template('dashboard.html')
 9 | 
10 | if __name__ == '__main__':
11 |     app.run(port=5000, debug=True)
12 | 


--------------------------------------------------------------------------------
/Build Pipeline/template/file:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/Clustering_ML.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Clustering Code Along\n",
  8 |     "\n",
  9 |     "data can be found at UCI repository: https://archive.ics.uci.edu/ml/datasets/seeds."
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "markdown",
 14 |    "metadata": {},
 15 |    "source": [
 16 |     "The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for \n",
 17 |     "the experiment. High quality visualization of the internal kernel structure was detected using a soft X-ray technique. It is non-destructive and considerably cheaper than other more sophisticated imaging techniques like scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin. \n",
 18 |     "\n",
 19 |     "The data set can be used for the tasks of classification and cluster analysis.\n",
 20 |     "\n",
 21 |     "\n",
 22 |     "Attribute Information:\n",
 23 |     "\n",
 24 |     "To construct the data, seven geometric parameters of wheat kernels were measured: \n",
 25 |     "1. area A, \n",
 26 |     "2. perimeter P, \n",
 27 |     "3. compactness C = 4*pi*A/P^2, \n",
 28 |     "4. length of kernel, \n",
 29 |     "5. width of kernel, \n",
 30 |     "6. asymmetry coefficient \n",
 31 |     "7. length of kernel groove. \n",
 32 |     "All of these parameters were real-valued continuous.\n",
 33 |     "\n",
 34 |     "Let's see if we can cluster them in to 3 groups with K-means!"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 53,
 40 |    "metadata": {
 41 |     "collapsed": true
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "from pyspark.sql import SparkSession\n",
 46 |     "spark = SparkSession.builder.appName('cluster').getOrCreate()"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 54,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "from pyspark.ml.clustering import KMeans\n",
 56 |     "\n",
 57 |     "# Loads data.\n",
 58 |     "dataset = spark.read.csv(\"seeds_dataset.csv\",header=True,inferSchema=True)"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 55,
 64 |    "metadata": {},
 65 |    "outputs": [
 66 |     {
 67 |      "data": {
 68 |       "text/plain": [
 69 |        "Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)"
 70 |       ]
 71 |      },
 72 |      "execution_count": 55,
 73 |      "metadata": {},
 74 |      "output_type": "execute_result"
 75 |     }
 76 |    ],
 77 |    "source": [
 78 |     "dataset.head()"
 79 |    ]
 80 |   },
 81 |   {
 82 |    "cell_type": "code",
 83 |    "execution_count": 56,
 84 |    "metadata": {},
 85 |    "outputs": [
 86 |     {
 87 |      "name": "stdout",
 88 |      "output_type": "stream",
 89 |      "text": [
 90 |       "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n",
 91 |       "|summary|              area|         perimeter|         compactness|   length_of_kernel|   width_of_kernel|asymmetry_coefficient|   length_of_groove|\n",
 92 |       "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n",
 93 |       "|  count|               210|               210|                 210|                210|               210|                  210|                210|\n",
 94 |       "|   mean|14.847523809523816|14.559285714285718|  0.8709985714285714|  5.628533333333335| 3.258604761904762|   3.7001999999999997|  5.408071428571429|\n",
 95 |       "| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867|   1.5035589702547392|0.49148049910240543|\n",
 96 |       "|    min|             10.59|             12.41|              0.8081|              4.899|              2.63|                0.765|              4.519|\n",
 97 |       "|    max|             21.18|             17.25|              0.9183|              6.675|             4.033|                8.456|               6.55|\n",
 98 |       "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n",
 99 |       "\n"
100 |      ]
101 |     }
102 |    ],
103 |    "source": [
104 |     "dataset.describe().show()"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "## Format the Data"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 57,
117 |    "metadata": {
118 |     "collapsed": true
119 |    },
120 |    "outputs": [],
121 |    "source": [
122 |     "from pyspark.ml.linalg import Vectors\n",
123 |     "from pyspark.ml.feature import VectorAssembler"
124 |    ]
125 |   },
126 |   {
127 |    "cell_type": "code",
128 |    "execution_count": 58,
129 |    "metadata": {},
130 |    "outputs": [
131 |     {
132 |      "data": {
133 |       "text/plain": [
134 |        "['area',\n",
135 |        " 'perimeter',\n",
136 |        " 'compactness',\n",
137 |        " 'length_of_kernel',\n",
138 |        " 'width_of_kernel',\n",
139 |        " 'asymmetry_coefficient',\n",
140 |        " 'length_of_groove']"
141 |       ]
142 |      },
143 |      "execution_count": 58,
144 |      "metadata": {},
145 |      "output_type": "execute_result"
146 |     }
147 |    ],
148 |    "source": [
149 |     "dataset.columns"
150 |    ]
151 |   },
152 |   {
153 |    "cell_type": "code",
154 |    "execution_count": 59,
155 |    "metadata": {
156 |     "collapsed": true
157 |    },
158 |    "outputs": [],
159 |    "source": [
160 |     "vec_assembler = VectorAssembler(inputCols = dataset.columns, outputCol='features')"
161 |    ]
162 |   },
163 |   {
164 |    "cell_type": "code",
165 |    "execution_count": 60,
166 |    "metadata": {
167 |     "collapsed": true
168 |    },
169 |    "outputs": [],
170 |    "source": [
171 |     "final_data = vec_assembler.transform(dataset)"
172 |    ]
173 |   },
174 |   {
175 |    "cell_type": "markdown",
176 |    "metadata": {},
177 |    "source": [
178 |     "## Scale the Data\n",
179 |     "It is a good idea to scale our data to deal with the curse of dimensionality: https://en.wikipedia.org/wiki/Curse_of_dimensionality"
180 |    ]
181 |   },
182 |   {
183 |    "cell_type": "code",
184 |    "execution_count": 61,
185 |    "metadata": {
186 |     "collapsed": true
187 |    },
188 |    "outputs": [],
189 |    "source": [
190 |     "from pyspark.ml.feature import StandardScaler"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": 62,
196 |    "metadata": {},
197 |    "outputs": [],
198 |    "source": [
199 |     "scaler = StandardScaler(inputCol=\"features\", outputCol=\"scaledFeatures\", withStd=True, withMean=False)"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 63,
205 |    "metadata": {
206 |     "collapsed": true
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "# Compute summary statistics by fitting the StandardScaler\n",
211 |     "scalerModel = scaler.fit(final_data)"
212 |    ]
213 |   },
214 |   {
215 |    "cell_type": "code",
216 |    "execution_count": 64,
217 |    "metadata": {},
218 |    "outputs": [],
219 |    "source": [
220 |     "# Normalize each feature to have unit standard deviation.\n",
221 |     "final_data = scalerModel.transform(final_data)"
222 |    ]
223 |   },
224 |   {
225 |    "cell_type": "markdown",
226 |    "metadata": {},
227 |    "source": [
228 |     "## Train the Model and Evaluate"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "code",
233 |    "execution_count": 76,
234 |    "metadata": {
235 |     "collapsed": true
236 |    },
237 |    "outputs": [],
238 |    "source": [
239 |     "# Trains a k-means model.\n",
240 |     "kmeans = KMeans(featuresCol='scaledFeatures',k=3)\n",
241 |     "model = kmeans.fit(final_data)"
242 |    ]
243 |   },
244 |   {
245 |    "cell_type": "code",
246 |    "execution_count": 77,
247 |    "metadata": {},
248 |    "outputs": [
249 |     {
250 |      "name": "stdout",
251 |      "output_type": "stream",
252 |      "text": [
253 |       "Within Set Sum of Squared Errors = 429.07559671506715\n"
254 |      ]
255 |     }
256 |    ],
257 |    "source": [
258 |     "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n",
259 |     "wssse = model.computeCost(final_data)\n",
260 |     "print(\"Within Set Sum of Squared Errors = \" + str(wssse))"
261 |    ]
262 |   },
263 |   {
264 |    "cell_type": "code",
265 |    "execution_count": 79,
266 |    "metadata": {},
267 |    "outputs": [
268 |     {
269 |      "name": "stdout",
270 |      "output_type": "stream",
271 |      "text": [
272 |       "Cluster Centers: \n",
273 |       "[  6.31670546  12.37109759  37.39491396  13.91155062   9.748067\n",
274 |       "   2.39849968  12.2661748 ]\n",
275 |       "[  4.87257659  10.88120146  37.27692543  12.3410157    8.55443412\n",
276 |       "   1.81649011  10.32998598]\n",
277 |       "[  4.06105916  10.13979506  35.80536984  11.82133095   7.50395937\n",
278 |       "   3.27184732  10.42126018]\n"
279 |      ]
280 |     }
281 |    ],
282 |    "source": [
283 |     "# Shows the result.\n",
284 |     "centers = model.clusterCenters()\n",
285 |     "print(\"Cluster Centers: \")\n",
286 |     "for center in centers:\n",
287 |     "    print(center)"
288 |    ]
289 |   },
290 |   {
291 |    "cell_type": "code",
292 |    "execution_count": 80,
293 |    "metadata": {},
294 |    "outputs": [
295 |     {
296 |      "name": "stdout",
297 |      "output_type": "stream",
298 |      "text": [
299 |       "+----------+\n",
300 |       "|prediction|\n",
301 |       "+----------+\n",
302 |       "|         1|\n",
303 |       "|         1|\n",
304 |       "|         1|\n",
305 |       "|         1|\n",
306 |       "|         1|\n",
307 |       "|         1|\n",
308 |       "|         1|\n",
309 |       "|         1|\n",
310 |       "|         0|\n",
311 |       "|         0|\n",
312 |       "|         1|\n",
313 |       "|         1|\n",
314 |       "|         1|\n",
315 |       "|         1|\n",
316 |       "|         1|\n",
317 |       "|         1|\n",
318 |       "|         1|\n",
319 |       "|         1|\n",
320 |       "|         1|\n",
321 |       "|         2|\n",
322 |       "+----------+\n",
323 |       "only showing top 20 rows\n",
324 |       "\n"
325 |      ]
326 |     }
327 |    ],
328 |    "source": [
329 |     "model.transform(final_data).select('prediction').show()"
330 |    ]
331 |   }
332 |  ],
333 |  "metadata": {
334 |   "anaconda-cloud": {},
335 |   "kernelspec": {
336 |    "display_name": "Python 3",
337 |    "language": "python",
338 |    "name": "python3"
339 |   },
340 |   "language_info": {
341 |    "codemirror_mode": {
342 |     "name": "ipython",
343 |     "version": 3
344 |    },
345 |    "file_extension": ".py",
346 |    "mimetype": "text/x-python",
347 |    "name": "python",
348 |    "nbconvert_exporter": "python",
349 |    "pygments_lexer": "ipython3",
350 |    "version": "3.8.3"
351 |   }
352 |  },
353 |  "nbformat": 4,
354 |  "nbformat_minor": 1
355 | }
356 | 


--------------------------------------------------------------------------------
/Data_Transformations.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Data Transformations\n"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 2,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from pyspark.sql import SparkSession"
 19 |    ]
 20 |   },
 21 |   {
 22 |    "cell_type": "code",
 23 |    "execution_count": 3,
 24 |    "metadata": {
 25 |     "collapsed": true
 26 |    },
 27 |    "outputs": [],
 28 |    "source": [
 29 |     "spark = SparkSession.builder.appName('data').getOrCreate()"
 30 |    ]
 31 |   },
 32 |   {
 33 |    "cell_type": "code",
 34 |    "execution_count": 4,
 35 |    "metadata": {
 36 |     "collapsed": true
 37 |    },
 38 |    "outputs": [],
 39 |    "source": [
 40 |     "df = spark.read.csv('fake_customers.csv',inferSchema=True,header=True)"
 41 |    ]
 42 |   },
 43 |   {
 44 |    "cell_type": "code",
 45 |    "execution_count": 5,
 46 |    "metadata": {},
 47 |    "outputs": [
 48 |     {
 49 |      "name": "stdout",
 50 |      "output_type": "stream",
 51 |      "text": [
 52 |       "+-------+----------+-----+\n",
 53 |       "|   Name|     Phone|Group|\n",
 54 |       "+-------+----------+-----+\n",
 55 |       "|   John|4085552424|    A|\n",
 56 |       "|   Mike|3105552738|    B|\n",
 57 |       "| Cassie|4085552424|    B|\n",
 58 |       "|  Laura|3105552438|    B|\n",
 59 |       "|  Sarah|4085551234|    A|\n",
 60 |       "|  David|3105557463|    C|\n",
 61 |       "|   Zach|4085553987|    C|\n",
 62 |       "|  Kiera|3105552938|    A|\n",
 63 |       "|  Alexa|4085559467|    C|\n",
 64 |       "|Karissa|3105553475|    A|\n",
 65 |       "+-------+----------+-----+\n",
 66 |       "\n"
 67 |      ]
 68 |     }
 69 |    ],
 70 |    "source": [
 71 |     "df.show()"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "markdown",
 76 |    "metadata": {},
 77 |    "source": [
 78 |     "## Data Features\n",
 79 |     "\n",
 80 |     "### StringIndexer\n",
 81 |     "\n"
 82 |    ]
 83 |   },
 84 |   {
 85 |    "cell_type": "code",
 86 |    "execution_count": 6,
 87 |    "metadata": {},
 88 |    "outputs": [
 89 |     {
 90 |      "name": "stdout",
 91 |      "output_type": "stream",
 92 |      "text": [
 93 |       "+-------+--------+-------------+\n",
 94 |       "|user_id|category|categoryIndex|\n",
 95 |       "+-------+--------+-------------+\n",
 96 |       "|      0|       a|          0.0|\n",
 97 |       "|      1|       b|          2.0|\n",
 98 |       "|      2|       c|          1.0|\n",
 99 |       "|      3|       a|          0.0|\n",
100 |       "|      4|       a|          0.0|\n",
101 |       "|      5|       c|          1.0|\n",
102 |       "+-------+--------+-------------+\n",
103 |       "\n"
104 |      ]
105 |     }
106 |    ],
107 |    "source": [
108 |     "from pyspark.ml.feature import StringIndexer\n",
109 |     "\n",
110 |     "df = spark.createDataFrame(\n",
111 |     "    [(0, \"a\"), (1, \"b\"), (2, \"c\"), (3, \"a\"), (4, \"a\"), (5, \"c\")],\n",
112 |     "    [\"user_id\", \"category\"])\n",
113 |     "\n",
114 |     "indexer = StringIndexer(inputCol=\"category\", outputCol=\"categoryIndex\")\n",
115 |     "indexed = indexer.fit(df).transform(df)\n",
116 |     "indexed.show()"
117 |    ]
118 |   },
119 |   {
120 |    "cell_type": "code",
121 |    "execution_count": null,
122 |    "metadata": {
123 |     "collapsed": true
124 |    },
125 |    "outputs": [],
126 |    "source": []
127 |   },
128 |   {
129 |    "cell_type": "markdown",
130 |    "metadata": {},
131 |    "source": [
132 |     "### VectorIndexer\n",
133 |     "\n"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 14,
139 |    "metadata": {},
140 |    "outputs": [
141 |     {
142 |      "name": "stdout",
143 |      "output_type": "stream",
144 |      "text": [
145 |       "+---+----+------+--------------+-------+\n",
146 |       "| id|hour|mobile|  userFeatures|clicked|\n",
147 |       "+---+----+------+--------------+-------+\n",
148 |       "|  0|  18|   1.0|[0.0,10.0,0.5]|    1.0|\n",
149 |       "+---+----+------+--------------+-------+\n",
150 |       "\n"
151 |      ]
152 |     }
153 |    ],
154 |    "source": [
155 |     "from pyspark.ml.linalg import Vectors\n",
156 |     "from pyspark.ml.feature import VectorAssembler\n",
157 |     "\n",
158 |     "dataset = spark.createDataFrame(\n",
159 |     "    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],\n",
160 |     "    [\"id\", \"hour\", \"mobile\", \"userFeatures\", \"clicked\"])\n",
161 |     "dataset.show()"
162 |    ]
163 |   },
164 |   {
165 |    "cell_type": "code",
166 |    "execution_count": 15,
167 |    "metadata": {},
168 |    "outputs": [
169 |     {
170 |      "name": "stdout",
171 |      "output_type": "stream",
172 |      "text": [
173 |       "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\n",
174 |       "+--------------------+-------+\n",
175 |       "|            features|clicked|\n",
176 |       "+--------------------+-------+\n",
177 |       "|[18.0,1.0,0.0,10....|    1.0|\n",
178 |       "+--------------------+-------+\n",
179 |       "\n"
180 |      ]
181 |     }
182 |    ],
183 |    "source": [
184 |     "assembler = VectorAssembler(\n",
185 |     "    inputCols=[\"hour\", \"mobile\", \"userFeatures\"],\n",
186 |     "    outputCol=\"features\")\n",
187 |     "\n",
188 |     "output = assembler.transform(dataset)\n",
189 |     "print(\"Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\")\n",
190 |     "output.select(\"features\", \"clicked\").show()"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": null,
196 |    "metadata": {
197 |     "collapsed": true
198 |    },
199 |    "outputs": [],
200 |    "source": []
201 |   }
202 |  ],
203 |  "metadata": {
204 |   "anaconda-cloud": {},
205 |   "kernelspec": {
206 |    "display_name": "Python 3",
207 |    "language": "python",
208 |    "name": "python3"
209 |   },
210 |   "language_info": {
211 |    "codemirror_mode": {
212 |     "name": "ipython",
213 |     "version": 3
214 |    },
215 |    "file_extension": ".py",
216 |    "mimetype": "text/x-python",
217 |    "name": "python",
218 |    "nbconvert_exporter": "python",
219 |    "pygments_lexer": "ipython3",
220 |    "version": "3.7.9"
221 |   }
222 |  },
223 |  "nbformat": 4,
224 |  "nbformat_minor": 1
225 | }
226 | 


--------------------------------------------------------------------------------
/Dates_and_Timestamps.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Dates and Timestamps\n"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from pyspark.sql import SparkSession\n",
 19 |     "# May take a little while on a local computer\n",
 20 |     "spark = SparkSession.builder.appName(\"dates\").getOrCreate()"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 3,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "df = spark.read.csv(\"appl_stock.csv\",header=True,inferSchema=True)"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 4,
 37 |    "metadata": {},
 38 |    "outputs": [
 39 |     {
 40 |      "name": "stdout",
 41 |      "output_type": "stream",
 42 |      "text": [
 43 |       "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n",
 44 |       "|                Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|\n",
 45 |       "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n",
 46 |       "|2010-01-04 00:00:...|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|\n",
 47 |       "|2010-01-05 00:00:...|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|\n",
 48 |       "|2010-01-06 00:00:...|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|\n",
 49 |       "|2010-01-07 00:00:...|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|\n",
 50 |       "|2010-01-08 00:00:...|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|\n",
 51 |       "|2010-01-11 00:00:...|212.79999700000002|        213.000002|        208.450005|210.11000299999998|115557400|         27.221758|\n",
 52 |       "|2010-01-12 00:00:...|209.18999499999998|209.76999500000002|        206.419998|        207.720001|148614900|          26.91211|\n",
 53 |       "|2010-01-13 00:00:...|        207.870005|210.92999500000002|        204.099998|        210.650002|151473000|          27.29172|\n",
 54 |       "|2010-01-14 00:00:...|210.11000299999998|210.45999700000002|        209.020004|            209.43|108223500|         27.133657|\n",
 55 |       "|2010-01-15 00:00:...|210.92999500000002|211.59999700000003|        205.869999|            205.93|148516900|26.680197999999997|\n",
 56 |       "|2010-01-19 00:00:...|        208.330002|215.18999900000003|        207.240004|        215.039995|182501900|27.860484999999997|\n",
 57 |       "|2010-01-20 00:00:...|        214.910006|        215.549994|        209.500002|            211.73|153038200|         27.431644|\n",
 58 |       "|2010-01-21 00:00:...|        212.079994|213.30999599999998|        207.210003|        208.069996|152038600|         26.957455|\n",
 59 |       "|2010-01-22 00:00:...|206.78000600000001|        207.499996|            197.16|            197.75|220441900|         25.620401|\n",
 60 |       "|2010-01-25 00:00:...|202.51000200000001|        204.699999|        200.190002|        203.070002|266424900|26.309658000000002|\n",
 61 |       "|2010-01-26 00:00:...|205.95000100000001|        213.710005|        202.580004|        205.940001|466777500|         26.681494|\n",
 62 |       "|2010-01-27 00:00:...|        206.849995|            210.58|        199.530001|        207.880005|430642100|26.932840000000002|\n",
 63 |       "|2010-01-28 00:00:...|        204.930004|        205.500004|        198.699995|        199.289995|293375600|25.819922000000002|\n",
 64 |       "|2010-01-29 00:00:...|        201.079996|        202.199995|        190.250002|        192.060003|311488100|         24.883208|\n",
 65 |       "|2010-02-01 00:00:...|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|\n",
 66 |       "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n",
 67 |       "only showing top 20 rows\n",
 68 |       "\n"
 69 |      ]
 70 |     }
 71 |    ],
 72 |    "source": [
 73 |     "df.show()"
 74 |    ]
 75 |   },
 76 |   {
 77 |    "cell_type": "markdown",
 78 |    "metadata": {},
 79 |    "source": [
 80 |     "Let's walk through how to grab parts of the timestamp data"
 81 |    ]
 82 |   },
 83 |   {
 84 |    "cell_type": "code",
 85 |    "execution_count": 44,
 86 |    "metadata": {
 87 |     "collapsed": true
 88 |    },
 89 |    "outputs": [],
 90 |    "source": [
 91 |     "from pyspark.sql.functions import format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format"
 92 |    ]
 93 |   },
 94 |   {
 95 |    "cell_type": "code",
 96 |    "execution_count": 45,
 97 |    "metadata": {},
 98 |    "outputs": [
 99 |     {
100 |      "name": "stdout",
101 |      "output_type": "stream",
102 |      "text": [
103 |       "+----------------+\n",
104 |       "|dayofmonth(Date)|\n",
105 |       "+----------------+\n",
106 |       "|               4|\n",
107 |       "|               5|\n",
108 |       "|               6|\n",
109 |       "|               7|\n",
110 |       "|               8|\n",
111 |       "|              11|\n",
112 |       "|              12|\n",
113 |       "|              13|\n",
114 |       "|              14|\n",
115 |       "|              15|\n",
116 |       "|              19|\n",
117 |       "|              20|\n",
118 |       "|              21|\n",
119 |       "|              22|\n",
120 |       "|              25|\n",
121 |       "|              26|\n",
122 |       "|              27|\n",
123 |       "|              28|\n",
124 |       "|              29|\n",
125 |       "|               1|\n",
126 |       "+----------------+\n",
127 |       "only showing top 20 rows\n",
128 |       "\n"
129 |      ]
130 |     }
131 |    ],
132 |    "source": [
133 |     "df.select(dayofmonth(df['Date'])).show()"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "code",
138 |    "execution_count": 46,
139 |    "metadata": {},
140 |    "outputs": [
141 |     {
142 |      "name": "stdout",
143 |      "output_type": "stream",
144 |      "text": [
145 |       "+----------+\n",
146 |       "|hour(Date)|\n",
147 |       "+----------+\n",
148 |       "|         0|\n",
149 |       "|         0|\n",
150 |       "|         0|\n",
151 |       "|         0|\n",
152 |       "|         0|\n",
153 |       "|         0|\n",
154 |       "|         0|\n",
155 |       "|         0|\n",
156 |       "|         0|\n",
157 |       "|         0|\n",
158 |       "|         0|\n",
159 |       "|         0|\n",
160 |       "|         0|\n",
161 |       "|         0|\n",
162 |       "|         0|\n",
163 |       "|         0|\n",
164 |       "|         0|\n",
165 |       "|         0|\n",
166 |       "|         0|\n",
167 |       "|         0|\n",
168 |       "+----------+\n",
169 |       "only showing top 20 rows\n",
170 |       "\n"
171 |      ]
172 |     }
173 |    ],
174 |    "source": [
175 |     "df.select(hour(df['Date'])).show()"
176 |    ]
177 |   },
178 |   {
179 |    "cell_type": "code",
180 |    "execution_count": 8,
181 |    "metadata": {},
182 |    "outputs": [
183 |     {
184 |      "name": "stdout",
185 |      "output_type": "stream",
186 |      "text": [
187 |       "+---------------+\n",
188 |       "|dayofyear(Date)|\n",
189 |       "+---------------+\n",
190 |       "|              4|\n",
191 |       "|              5|\n",
192 |       "|              6|\n",
193 |       "|              7|\n",
194 |       "|              8|\n",
195 |       "|             11|\n",
196 |       "|             12|\n",
197 |       "|             13|\n",
198 |       "|             14|\n",
199 |       "|             15|\n",
200 |       "|             19|\n",
201 |       "|             20|\n",
202 |       "|             21|\n",
203 |       "|             22|\n",
204 |       "|             25|\n",
205 |       "|             26|\n",
206 |       "|             27|\n",
207 |       "|             28|\n",
208 |       "|             29|\n",
209 |       "|             32|\n",
210 |       "+---------------+\n",
211 |       "only showing top 20 rows\n",
212 |       "\n"
213 |      ]
214 |     }
215 |    ],
216 |    "source": [
217 |     "df.select(dayofyear(df['Date'])).show()"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": 11,
223 |    "metadata": {},
224 |    "outputs": [
225 |     {
226 |      "name": "stdout",
227 |      "output_type": "stream",
228 |      "text": [
229 |       "+-----------+\n",
230 |       "|month(Date)|\n",
231 |       "+-----------+\n",
232 |       "|          1|\n",
233 |       "|          1|\n",
234 |       "|          1|\n",
235 |       "|          1|\n",
236 |       "|          1|\n",
237 |       "|          1|\n",
238 |       "|          1|\n",
239 |       "|          1|\n",
240 |       "|          1|\n",
241 |       "|          1|\n",
242 |       "|          1|\n",
243 |       "|          1|\n",
244 |       "|          1|\n",
245 |       "|          1|\n",
246 |       "|          1|\n",
247 |       "|          1|\n",
248 |       "|          1|\n",
249 |       "|          1|\n",
250 |       "|          1|\n",
251 |       "|          2|\n",
252 |       "+-----------+\n",
253 |       "only showing top 20 rows\n",
254 |       "\n"
255 |      ]
256 |     }
257 |    ],
258 |    "source": [
259 |     "df.select(month(df['Date'])).show()"
260 |    ]
261 |   },
262 |   {
263 |    "cell_type": "markdown",
264 |    "metadata": {},
265 |    "source": [
266 |     "So for example, let's say we wanted to know the average closing price per year. Easy! With a groupby and the year() function call:"
267 |    ]
268 |   },
269 |   {
270 |    "cell_type": "code",
271 |    "execution_count": 15,
272 |    "metadata": {},
273 |    "outputs": [
274 |     {
275 |      "name": "stdout",
276 |      "output_type": "stream",
277 |      "text": [
278 |       "+----------+\n",
279 |       "|year(Date)|\n",
280 |       "+----------+\n",
281 |       "|      2010|\n",
282 |       "|      2010|\n",
283 |       "|      2010|\n",
284 |       "|      2010|\n",
285 |       "|      2010|\n",
286 |       "|      2010|\n",
287 |       "|      2010|\n",
288 |       "|      2010|\n",
289 |       "|      2010|\n",
290 |       "|      2010|\n",
291 |       "|      2010|\n",
292 |       "|      2010|\n",
293 |       "|      2010|\n",
294 |       "|      2010|\n",
295 |       "|      2010|\n",
296 |       "|      2010|\n",
297 |       "|      2010|\n",
298 |       "|      2010|\n",
299 |       "|      2010|\n",
300 |       "|      2010|\n",
301 |       "+----------+\n",
302 |       "only showing top 20 rows\n",
303 |       "\n"
304 |      ]
305 |     }
306 |    ],
307 |    "source": [
308 |     "df.select(year(df['Date'])).show()"
309 |    ]
310 |   },
311 |   {
312 |    "cell_type": "code",
313 |    "execution_count": 19,
314 |    "metadata": {},
315 |    "outputs": [
316 |     {
317 |      "name": "stdout",
318 |      "output_type": "stream",
319 |      "text": [
320 |       "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n",
321 |       "|                Date|              Open|              High|               Low|             Close|   Volume|         Adj Close|Year|\n",
322 |       "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n",
323 |       "|2010-01-04 00:00:...|        213.429998|        214.499996|212.38000099999996|        214.009998|123432400|         27.727039|2010|\n",
324 |       "|2010-01-05 00:00:...|        214.599998|        215.589994|        213.249994|        214.379993|150476200|27.774976000000002|2010|\n",
325 |       "|2010-01-06 00:00:...|        214.379993|            215.23|        210.750004|        210.969995|138040000|27.333178000000004|2010|\n",
326 |       "|2010-01-07 00:00:...|            211.75|        212.000006|        209.050005|            210.58|119282800|          27.28265|2010|\n",
327 |       "|2010-01-08 00:00:...|        210.299994|        212.000006|209.06000500000002|211.98000499999998|111902700|         27.464034|2010|\n",
328 |       "|2010-01-11 00:00:...|212.79999700000002|        213.000002|        208.450005|210.11000299999998|115557400|         27.221758|2010|\n",
329 |       "|2010-01-12 00:00:...|209.18999499999998|209.76999500000002|        206.419998|        207.720001|148614900|          26.91211|2010|\n",
330 |       "|2010-01-13 00:00:...|        207.870005|210.92999500000002|        204.099998|        210.650002|151473000|          27.29172|2010|\n",
331 |       "|2010-01-14 00:00:...|210.11000299999998|210.45999700000002|        209.020004|            209.43|108223500|         27.133657|2010|\n",
332 |       "|2010-01-15 00:00:...|210.92999500000002|211.59999700000003|        205.869999|            205.93|148516900|26.680197999999997|2010|\n",
333 |       "|2010-01-19 00:00:...|        208.330002|215.18999900000003|        207.240004|        215.039995|182501900|27.860484999999997|2010|\n",
334 |       "|2010-01-20 00:00:...|        214.910006|        215.549994|        209.500002|            211.73|153038200|         27.431644|2010|\n",
335 |       "|2010-01-21 00:00:...|        212.079994|213.30999599999998|        207.210003|        208.069996|152038600|         26.957455|2010|\n",
336 |       "|2010-01-22 00:00:...|206.78000600000001|        207.499996|            197.16|            197.75|220441900|         25.620401|2010|\n",
337 |       "|2010-01-25 00:00:...|202.51000200000001|        204.699999|        200.190002|        203.070002|266424900|26.309658000000002|2010|\n",
338 |       "|2010-01-26 00:00:...|205.95000100000001|        213.710005|        202.580004|        205.940001|466777500|         26.681494|2010|\n",
339 |       "|2010-01-27 00:00:...|        206.849995|            210.58|        199.530001|        207.880005|430642100|26.932840000000002|2010|\n",
340 |       "|2010-01-28 00:00:...|        204.930004|        205.500004|        198.699995|        199.289995|293375600|25.819922000000002|2010|\n",
341 |       "|2010-01-29 00:00:...|        201.079996|        202.199995|        190.250002|        192.060003|311488100|         24.883208|2010|\n",
342 |       "|2010-02-01 00:00:...|192.36999699999998|             196.0|191.29999899999999|        194.729998|187469100|         25.229131|2010|\n",
343 |       "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n",
344 |       "only showing top 20 rows\n",
345 |       "\n"
346 |      ]
347 |     }
348 |    ],
349 |    "source": [
350 |     "df.withColumn(\"Year\",year(df['Date'])).show()"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "code",
355 |    "execution_count": 29,
356 |    "metadata": {
357 |     "scrolled": true
358 |    },
359 |    "outputs": [
360 |     {
361 |      "name": "stdout",
362 |      "output_type": "stream",
363 |      "text": [
364 |       "+---------+------------------+\n",
365 |       "|avg(Year)|        avg(Close)|\n",
366 |       "+---------+------------------+\n",
367 |       "|   2015.0|120.03999980555547|\n",
368 |       "|   2013.0| 472.6348802857143|\n",
369 |       "|   2014.0| 295.4023416507935|\n",
370 |       "|   2012.0| 576.0497195640002|\n",
371 |       "|   2016.0|104.60400786904763|\n",
372 |       "|   2010.0| 259.8424600000002|\n",
373 |       "|   2011.0|364.00432532142867|\n",
374 |       "+---------+------------------+\n",
375 |       "\n"
376 |      ]
377 |     }
378 |    ],
379 |    "source": [
380 |     "newdf = df.withColumn(\"Year\",year(df['Date']))\n",
381 |     "newdf.groupBy(\"Year\").mean()[['avg(Year)','avg(Close)']].show()"
382 |    ]
383 |   },
384 |   {
385 |    "cell_type": "markdown",
386 |    "metadata": {},
387 |    "source": [
388 |     "Still not quite presentable! Let's use the .alias method as well as round() to clean this up!"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "code",
393 |    "execution_count": 43,
394 |    "metadata": {},
395 |    "outputs": [
396 |     {
397 |      "name": "stdout",
398 |      "output_type": "stream",
399 |      "text": [
400 |       "+------+----------+\n",
401 |       "|  Year|Mean Close|\n",
402 |       "+------+----------+\n",
403 |       "|2015.0|    120.04|\n",
404 |       "|2013.0|    472.63|\n",
405 |       "|2014.0|    295.40|\n",
406 |       "|2012.0|    576.05|\n",
407 |       "|2016.0|    104.60|\n",
408 |       "|2010.0|    259.84|\n",
409 |       "|2011.0|    364.00|\n",
410 |       "+------+----------+\n",
411 |       "\n"
412 |      ]
413 |     }
414 |    ],
415 |    "source": [
416 |     "result = newdf.groupBy(\"Year\").mean()[['avg(Year)','avg(Close)']]\n",
417 |     "result = result.withColumnRenamed(\"avg(Year)\",\"Year\")\n",
418 |     "result = result.select('Year',format_number('avg(Close)',2).alias(\"Mean Close\")).show()"
419 |    ]
420 |   },
421 |   {
422 |    "cell_type": "markdown",
423 |    "metadata": {},
424 |    "source": [
425 |     "Perfect! Now you know how to work with Date and Timestamp information!"
426 |    ]
427 |   }
428 |  ],
429 |  "metadata": {
430 |   "anaconda-cloud": {},
431 |   "kernelspec": {
432 |    "display_name": "Python 3",
433 |    "language": "python",
434 |    "name": "python3"
435 |   },
436 |   "language_info": {
437 |    "codemirror_mode": {
438 |     "name": "ipython",
439 |     "version": 3
440 |    },
441 |    "file_extension": ".py",
442 |    "mimetype": "text/x-python",
443 |    "name": "python",
444 |    "nbconvert_exporter": "python",
445 |    "pygments_lexer": "ipython3",
446 |    "version": "3.7.9"
447 |   }
448 |  },
449 |  "nbformat": 4,
450 |  "nbformat_minor": 1
451 | }
452 | 


--------------------------------------------------------------------------------
/GroupBy_and_Aggregate_Functions.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# GroupBy and Aggregate Functions\n",
  8 |     "\n",
  9 |     "GroupBy allows you to group rows together based off some column value, for example, you could group together sales data by the day the sale occured, or group repeast customer data based off the name of the customer. Once you've performed the GroupBy operation you can use an aggregate function off that data. An aggregate function aggregates multiple rows of data into a single output, such as taking the sum of inputs, or counting the number of inputs.\n"
 10 |    ]
 11 |   },
 12 |   {
 13 |    "cell_type": "code",
 14 |    "execution_count": 1,
 15 |    "metadata": {
 16 |     "collapsed": true
 17 |    },
 18 |    "outputs": [],
 19 |    "source": [
 20 |     "from pyspark.sql import SparkSession"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "# May take a little while on a local computer\n",
 32 |     "spark = SparkSession.builder.appName(\"groupbyagg\").getOrCreate()"
 33 |    ]
 34 |   },
 35 |   {
 36 |    "cell_type": "markdown",
 37 |    "metadata": {},
 38 |    "source": [
 39 |     "Read in the customer sales data"
 40 |    ]
 41 |   },
 42 |   {
 43 |    "cell_type": "code",
 44 |    "execution_count": 3,
 45 |    "metadata": {
 46 |     "collapsed": true
 47 |    },
 48 |    "outputs": [],
 49 |    "source": [
 50 |     "df = spark.read.csv('sales_info.csv',inferSchema=True,header=True)"
 51 |    ]
 52 |   },
 53 |   {
 54 |    "cell_type": "code",
 55 |    "execution_count": 4,
 56 |    "metadata": {},
 57 |    "outputs": [
 58 |     {
 59 |      "name": "stdout",
 60 |      "output_type": "stream",
 61 |      "text": [
 62 |       "root\n",
 63 |       " |-- Company: string (nullable = true)\n",
 64 |       " |-- Person: string (nullable = true)\n",
 65 |       " |-- Sales: double (nullable = true)\n",
 66 |       "\n"
 67 |      ]
 68 |     }
 69 |    ],
 70 |    "source": [
 71 |     "df.printSchema()"
 72 |    ]
 73 |   },
 74 |   {
 75 |    "cell_type": "code",
 76 |    "execution_count": 8,
 77 |    "metadata": {},
 78 |    "outputs": [
 79 |     {
 80 |      "name": "stdout",
 81 |      "output_type": "stream",
 82 |      "text": [
 83 |       "+-------+-------+-----+\n",
 84 |       "|Company| Person|Sales|\n",
 85 |       "+-------+-------+-----+\n",
 86 |       "|   GOOG|    Sam|200.0|\n",
 87 |       "|   GOOG|Charlie|120.0|\n",
 88 |       "|   GOOG|  Frank|340.0|\n",
 89 |       "|   MSFT|   Tina|600.0|\n",
 90 |       "|   MSFT|    Amy|124.0|\n",
 91 |       "|   MSFT|Vanessa|243.0|\n",
 92 |       "|     FB|   Carl|870.0|\n",
 93 |       "|     FB|  Sarah|350.0|\n",
 94 |       "|   APPL|   John|250.0|\n",
 95 |       "|   APPL|  Linda|130.0|\n",
 96 |       "|   APPL|   Mike|750.0|\n",
 97 |       "|   APPL|  Chris|350.0|\n",
 98 |       "+-------+-------+-----+\n",
 99 |       "\n"
100 |      ]
101 |     }
102 |    ],
103 |    "source": [
104 |     "df.show()"
105 |    ]
106 |   },
107 |   {
108 |    "cell_type": "markdown",
109 |    "metadata": {},
110 |    "source": [
111 |     "Let's group together by company!"
112 |    ]
113 |   },
114 |   {
115 |    "cell_type": "code",
116 |    "execution_count": 9,
117 |    "metadata": {},
118 |    "outputs": [
119 |     {
120 |      "data": {
121 |       "text/plain": [
122 |        "<pyspark.sql.group.GroupedData at 0x109915f28>"
123 |       ]
124 |      },
125 |      "execution_count": 9,
126 |      "metadata": {},
127 |      "output_type": "execute_result"
128 |     }
129 |    ],
130 |    "source": [
131 |     "df.groupBy(\"Company\")"
132 |    ]
133 |   },
134 |   {
135 |    "cell_type": "markdown",
136 |    "metadata": {},
137 |    "source": [
138 |     "This returns a GroupedData object, off of which you can all various methods"
139 |    ]
140 |   },
141 |   {
142 |    "cell_type": "code",
143 |    "execution_count": 10,
144 |    "metadata": {},
145 |    "outputs": [
146 |     {
147 |      "name": "stdout",
148 |      "output_type": "stream",
149 |      "text": [
150 |       "+-------+-----------------+\n",
151 |       "|Company|       avg(Sales)|\n",
152 |       "+-------+-----------------+\n",
153 |       "|   APPL|            370.0|\n",
154 |       "|   GOOG|            220.0|\n",
155 |       "|     FB|            610.0|\n",
156 |       "|   MSFT|322.3333333333333|\n",
157 |       "+-------+-----------------+\n",
158 |       "\n"
159 |      ]
160 |     }
161 |    ],
162 |    "source": [
163 |     "# Mean\n",
164 |     "df.groupBy(\"Company\").mean().show()"
165 |    ]
166 |   },
167 |   {
168 |    "cell_type": "code",
169 |    "execution_count": 11,
170 |    "metadata": {},
171 |    "outputs": [
172 |     {
173 |      "name": "stdout",
174 |      "output_type": "stream",
175 |      "text": [
176 |       "+-------+-----+\n",
177 |       "|Company|count|\n",
178 |       "+-------+-----+\n",
179 |       "|   APPL|    4|\n",
180 |       "|   GOOG|    3|\n",
181 |       "|     FB|    2|\n",
182 |       "|   MSFT|    3|\n",
183 |       "+-------+-----+\n",
184 |       "\n"
185 |      ]
186 |     }
187 |    ],
188 |    "source": [
189 |     "# Count\n",
190 |     "df.groupBy(\"Company\").count().show()"
191 |    ]
192 |   },
193 |   {
194 |    "cell_type": "code",
195 |    "execution_count": 12,
196 |    "metadata": {},
197 |    "outputs": [
198 |     {
199 |      "name": "stdout",
200 |      "output_type": "stream",
201 |      "text": [
202 |       "+-------+----------+\n",
203 |       "|Company|max(Sales)|\n",
204 |       "+-------+----------+\n",
205 |       "|   APPL|     750.0|\n",
206 |       "|   GOOG|     340.0|\n",
207 |       "|     FB|     870.0|\n",
208 |       "|   MSFT|     600.0|\n",
209 |       "+-------+----------+\n",
210 |       "\n"
211 |      ]
212 |     }
213 |    ],
214 |    "source": [
215 |     "# Max\n",
216 |     "df.groupBy(\"Company\").max().show()"
217 |    ]
218 |   },
219 |   {
220 |    "cell_type": "code",
221 |    "execution_count": 13,
222 |    "metadata": {},
223 |    "outputs": [
224 |     {
225 |      "name": "stdout",
226 |      "output_type": "stream",
227 |      "text": [
228 |       "+-------+----------+\n",
229 |       "|Company|min(Sales)|\n",
230 |       "+-------+----------+\n",
231 |       "|   APPL|     130.0|\n",
232 |       "|   GOOG|     120.0|\n",
233 |       "|     FB|     350.0|\n",
234 |       "|   MSFT|     124.0|\n",
235 |       "+-------+----------+\n",
236 |       "\n"
237 |      ]
238 |     }
239 |    ],
240 |    "source": [
241 |     "# Min\n",
242 |     "df.groupBy(\"Company\").min().show()"
243 |    ]
244 |   },
245 |   {
246 |    "cell_type": "code",
247 |    "execution_count": 15,
248 |    "metadata": {},
249 |    "outputs": [
250 |     {
251 |      "name": "stdout",
252 |      "output_type": "stream",
253 |      "text": [
254 |       "+-------+----------+\n",
255 |       "|Company|sum(Sales)|\n",
256 |       "+-------+----------+\n",
257 |       "|   APPL|    1480.0|\n",
258 |       "|   GOOG|     660.0|\n",
259 |       "|     FB|    1220.0|\n",
260 |       "|   MSFT|     967.0|\n",
261 |       "+-------+----------+\n",
262 |       "\n"
263 |      ]
264 |     }
265 |    ],
266 |    "source": [
267 |     "# Sum\n",
268 |     "df.groupBy(\"Company\").sum().show()"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "markdown",
273 |    "metadata": {},
274 |    "source": [
275 |     "Check out this link for more info on other methods:\n",
276 |     "http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark-sql-module\n",
277 |     "\n",
278 |     "Not all methods need a groupby call, instead one can just call the generalized **.agg()** method, that will call the aggregate across all rows in the dataframe column specified. It can take in arguments as a single column, or create multiple aggregate calls all at once using dictionary notation.\n",
279 |     "\n",
280 |     "For example:"
281 |    ]
282 |   },
283 |   {
284 |    "cell_type": "code",
285 |    "execution_count": 18,
286 |    "metadata": {},
287 |    "outputs": [
288 |     {
289 |      "name": "stdout",
290 |      "output_type": "stream",
291 |      "text": [
292 |       "+----------+\n",
293 |       "|max(Sales)|\n",
294 |       "+----------+\n",
295 |       "|     870.0|\n",
296 |       "+----------+\n",
297 |       "\n"
298 |      ]
299 |     }
300 |    ],
301 |    "source": [
302 |     "# Max sales across everything\n",
303 |     "df.agg({'Sales':'max'}).show()"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 22,
309 |    "metadata": {
310 |     "collapsed": true
311 |    },
312 |    "outputs": [],
313 |    "source": [
314 |     "# Could have done this on the group by object as well:"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 23,
320 |    "metadata": {
321 |     "collapsed": true
322 |    },
323 |    "outputs": [],
324 |    "source": [
325 |     "grouped = df.groupBy(\"Company\")"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 25,
331 |    "metadata": {},
332 |    "outputs": [
333 |     {
334 |      "name": "stdout",
335 |      "output_type": "stream",
336 |      "text": [
337 |       "+-------+----------+\n",
338 |       "|Company|max(Sales)|\n",
339 |       "+-------+----------+\n",
340 |       "|   APPL|     750.0|\n",
341 |       "|   GOOG|     340.0|\n",
342 |       "|     FB|     870.0|\n",
343 |       "|   MSFT|     600.0|\n",
344 |       "+-------+----------+\n",
345 |       "\n"
346 |      ]
347 |     }
348 |    ],
349 |    "source": [
350 |     "grouped.agg({\"Sales\":'max'}).show()"
351 |    ]
352 |   },
353 |   {
354 |    "cell_type": "markdown",
355 |    "metadata": {},
356 |    "source": [
357 |     "## Functions\n"
358 |    ]
359 |   },
360 |   {
361 |    "cell_type": "code",
362 |    "execution_count": 36,
363 |    "metadata": {},
364 |    "outputs": [],
365 |    "source": [
366 |     "from pyspark.sql.functions import countDistinct, avg,stddev"
367 |    ]
368 |   },
369 |   {
370 |    "cell_type": "code",
371 |    "execution_count": 29,
372 |    "metadata": {},
373 |    "outputs": [
374 |     {
375 |      "name": "stdout",
376 |      "output_type": "stream",
377 |      "text": [
378 |       "+---------------------+\n",
379 |       "|count(DISTINCT Sales)|\n",
380 |       "+---------------------+\n",
381 |       "|                   11|\n",
382 |       "+---------------------+\n",
383 |       "\n"
384 |      ]
385 |     }
386 |    ],
387 |    "source": [
388 |     "df.select(countDistinct(\"Sales\")).show()"
389 |    ]
390 |   },
391 |   {
392 |    "cell_type": "markdown",
393 |    "metadata": {},
394 |    "source": [
395 |     "Often you will want to change the name, use the .alias() method for this:"
396 |    ]
397 |   },
398 |   {
399 |    "cell_type": "code",
400 |    "execution_count": 31,
401 |    "metadata": {},
402 |    "outputs": [
403 |     {
404 |      "name": "stdout",
405 |      "output_type": "stream",
406 |      "text": [
407 |       "+--------------+\n",
408 |       "|Distinct Sales|\n",
409 |       "+--------------+\n",
410 |       "|            11|\n",
411 |       "+--------------+\n",
412 |       "\n"
413 |      ]
414 |     }
415 |    ],
416 |    "source": [
417 |     "df.select(countDistinct(\"Sales\").alias(\"Distinct Sales\")).show()"
418 |    ]
419 |   },
420 |   {
421 |    "cell_type": "code",
422 |    "execution_count": 35,
423 |    "metadata": {},
424 |    "outputs": [
425 |     {
426 |      "name": "stdout",
427 |      "output_type": "stream",
428 |      "text": [
429 |       "+-----------------+\n",
430 |       "|       avg(Sales)|\n",
431 |       "+-----------------+\n",
432 |       "|360.5833333333333|\n",
433 |       "+-----------------+\n",
434 |       "\n"
435 |      ]
436 |     }
437 |    ],
438 |    "source": [
439 |     "df.select(avg('Sales')).show()"
440 |    ]
441 |   },
442 |   {
443 |    "cell_type": "code",
444 |    "execution_count": 38,
445 |    "metadata": {},
446 |    "outputs": [
447 |     {
448 |      "name": "stdout",
449 |      "output_type": "stream",
450 |      "text": [
451 |       "+------------------+\n",
452 |       "|stddev_samp(Sales)|\n",
453 |       "+------------------+\n",
454 |       "|250.08742410799007|\n",
455 |       "+------------------+\n",
456 |       "\n"
457 |      ]
458 |     }
459 |    ],
460 |    "source": [
461 |     "df.select(stddev(\"Sales\")).show()"
462 |    ]
463 |   },
464 |   {
465 |    "cell_type": "code",
466 |    "execution_count": 39,
467 |    "metadata": {
468 |     "collapsed": true
469 |    },
470 |    "outputs": [],
471 |    "source": [
472 |     "from pyspark.sql.functions import format_number"
473 |    ]
474 |   },
475 |   {
476 |    "cell_type": "code",
477 |    "execution_count": 40,
478 |    "metadata": {
479 |     "collapsed": true
480 |    },
481 |    "outputs": [],
482 |    "source": [
483 |     "sales_std = df.select(stddev(\"Sales\").alias('std'))"
484 |    ]
485 |   },
486 |   {
487 |    "cell_type": "code",
488 |    "execution_count": 41,
489 |    "metadata": {},
490 |    "outputs": [
491 |     {
492 |      "name": "stdout",
493 |      "output_type": "stream",
494 |      "text": [
495 |       "+------------------+\n",
496 |       "|               std|\n",
497 |       "+------------------+\n",
498 |       "|250.08742410799007|\n",
499 |       "+------------------+\n",
500 |       "\n"
501 |      ]
502 |     }
503 |    ],
504 |    "source": [
505 |     "sales_std.show()"
506 |    ]
507 |   },
508 |   {
509 |    "cell_type": "code",
510 |    "execution_count": 42,
511 |    "metadata": {},
512 |    "outputs": [
513 |     {
514 |      "name": "stdout",
515 |      "output_type": "stream",
516 |      "text": [
517 |       "+---------------------+\n",
518 |       "|format_number(std, 2)|\n",
519 |       "+---------------------+\n",
520 |       "|               250.09|\n",
521 |       "+---------------------+\n",
522 |       "\n"
523 |      ]
524 |     }
525 |    ],
526 |    "source": [
527 |     "# format_number(\"col_name\",decimal places)\n",
528 |     "sales_std.select(format_number('std',2)).show()"
529 |    ]
530 |   },
531 |   {
532 |    "cell_type": "markdown",
533 |    "metadata": {},
534 |    "source": [
535 |     "## Order By\n",
536 |     "\n",
537 |     "You can easily sort with the orderBy method:"
538 |    ]
539 |   },
540 |   {
541 |    "cell_type": "code",
542 |    "execution_count": 43,
543 |    "metadata": {},
544 |    "outputs": [
545 |     {
546 |      "name": "stdout",
547 |      "output_type": "stream",
548 |      "text": [
549 |       "+-------+-------+-----+\n",
550 |       "|Company| Person|Sales|\n",
551 |       "+-------+-------+-----+\n",
552 |       "|   GOOG|Charlie|120.0|\n",
553 |       "|   MSFT|    Amy|124.0|\n",
554 |       "|   APPL|  Linda|130.0|\n",
555 |       "|   GOOG|    Sam|200.0|\n",
556 |       "|   MSFT|Vanessa|243.0|\n",
557 |       "|   APPL|   John|250.0|\n",
558 |       "|   GOOG|  Frank|340.0|\n",
559 |       "|     FB|  Sarah|350.0|\n",
560 |       "|   APPL|  Chris|350.0|\n",
561 |       "|   MSFT|   Tina|600.0|\n",
562 |       "|   APPL|   Mike|750.0|\n",
563 |       "|     FB|   Carl|870.0|\n",
564 |       "+-------+-------+-----+\n",
565 |       "\n"
566 |      ]
567 |     }
568 |    ],
569 |    "source": [
570 |     "# OrderBy\n",
571 |     "# Ascending\n",
572 |     "df.orderBy(\"Sales\").show()"
573 |    ]
574 |   },
575 |   {
576 |    "cell_type": "code",
577 |    "execution_count": 47,
578 |    "metadata": {},
579 |    "outputs": [
580 |     {
581 |      "name": "stdout",
582 |      "output_type": "stream",
583 |      "text": [
584 |       "+-------+-------+-----+\n",
585 |       "|Company| Person|Sales|\n",
586 |       "+-------+-------+-----+\n",
587 |       "|     FB|   Carl|870.0|\n",
588 |       "|   APPL|   Mike|750.0|\n",
589 |       "|   MSFT|   Tina|600.0|\n",
590 |       "|     FB|  Sarah|350.0|\n",
591 |       "|   APPL|  Chris|350.0|\n",
592 |       "|   GOOG|  Frank|340.0|\n",
593 |       "|   APPL|   John|250.0|\n",
594 |       "|   MSFT|Vanessa|243.0|\n",
595 |       "|   GOOG|    Sam|200.0|\n",
596 |       "|   APPL|  Linda|130.0|\n",
597 |       "|   MSFT|    Amy|124.0|\n",
598 |       "|   GOOG|Charlie|120.0|\n",
599 |       "+-------+-------+-----+\n",
600 |       "\n"
601 |      ]
602 |     }
603 |    ],
604 |    "source": [
605 |     "# Descending call off the column itself.\n",
606 |     "df.orderBy(df[\"Sales\"].desc()).show()"
607 |    ]
608 |   },
609 |   {
610 |    "cell_type": "markdown",
611 |    "metadata": {},
612 |    "source": [
613 |     "Most basic functions you would expect to be available are, so make sure to check out the documentation!"
614 |    ]
615 |   }
616 |  ],
617 |  "metadata": {
618 |   "anaconda-cloud": {},
619 |   "kernelspec": {
620 |    "display_name": "Python 3",
621 |    "language": "python",
622 |    "name": "python3"
623 |   },
624 |   "language_info": {
625 |    "codemirror_mode": {
626 |     "name": "ipython",
627 |     "version": 3
628 |    },
629 |    "file_extension": ".py",
630 |    "mimetype": "text/x-python",
631 |    "name": "python",
632 |    "nbconvert_exporter": "python",
633 |    "pygments_lexer": "ipython3",
634 |    "version": "3.7.9"
635 |   }
636 |  },
637 |  "nbformat": 4,
638 |  "nbformat_minor": 1
639 | }
640 | 


--------------------------------------------------------------------------------
/Linear_Regression_Consulting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Linear Regression Consulting"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {
 13 |     "collapsed": true
 14 |    },
 15 |    "source": [
 16 |     "\n",
 17 |     "\n",
 18 |     "Here is data:\n",
 19 |     "\n",
 20 |     "    Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n",
 21 |     "    ships.\n",
 22 |     "\n",
 23 |     "\n",
 24 |     "    Variables/Columns\n",
 25 |     "    Ship Name     1-20\n",
 26 |     "    Cruise Line   21-40\n",
 27 |     "    Age (as of 2013)   46-48\n",
 28 |     "    Tonnage (1000s of tons)   50-56\n",
 29 |     "    passengers (100s)   58-64\n",
 30 |     "    Length (100s of feet)  66-72\n",
 31 |     "    Cabins  (100s)   74-80\n",
 32 |     "    Passenger Density   82-88\n",
 33 |     "    Crew  (100s)   90-96\n",
 34 |     "    \n"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 1,
 40 |    "metadata": {
 41 |     "collapsed": true
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "from pyspark.sql import SparkSession"
 46 |    ]
 47 |   },
 48 |   {
 49 |    "cell_type": "code",
 50 |    "execution_count": 2,
 51 |    "metadata": {},
 52 |    "outputs": [],
 53 |    "source": [
 54 |     "spark = SparkSession.builder.appName('cruise').getOrCreate()"
 55 |    ]
 56 |   },
 57 |   {
 58 |    "cell_type": "code",
 59 |    "execution_count": 3,
 60 |    "metadata": {},
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "df = spark.read.csv('cruise_ship_info.csv',inferSchema=True,header=True)"
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "code",
 68 |    "execution_count": 4,
 69 |    "metadata": {},
 70 |    "outputs": [
 71 |     {
 72 |      "name": "stdout",
 73 |      "output_type": "stream",
 74 |      "text": [
 75 |       "root\n",
 76 |       " |-- Ship_name: string (nullable = true)\n",
 77 |       " |-- Cruise_line: string (nullable = true)\n",
 78 |       " |-- Age: integer (nullable = true)\n",
 79 |       " |-- Tonnage: double (nullable = true)\n",
 80 |       " |-- passengers: double (nullable = true)\n",
 81 |       " |-- length: double (nullable = true)\n",
 82 |       " |-- cabins: double (nullable = true)\n",
 83 |       " |-- passenger_density: double (nullable = true)\n",
 84 |       " |-- crew: double (nullable = true)\n",
 85 |       "\n"
 86 |      ]
 87 |     }
 88 |    ],
 89 |    "source": [
 90 |     "df.printSchema()"
 91 |    ]
 92 |   },
 93 |   {
 94 |    "cell_type": "code",
 95 |    "execution_count": 5,
 96 |    "metadata": {},
 97 |    "outputs": [
 98 |     {
 99 |      "name": "stdout",
100 |      "output_type": "stream",
101 |      "text": [
102 |       "+-----------+-----------+---+------------------+----------+------+------+-----------------+----+\n",
103 |       "|  Ship_name|Cruise_line|Age|           Tonnage|passengers|length|cabins|passenger_density|crew|\n",
104 |       "+-----------+-----------+---+------------------+----------+------+------+-----------------+----+\n",
105 |       "|    Journey|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|\n",
106 |       "|      Quest|    Azamara|  6|30.276999999999997|      6.94|  5.94|  3.55|            42.64|3.55|\n",
107 |       "|Celebration|   Carnival| 26|            47.262|     14.86|  7.22|  7.43|             31.8| 6.7|\n",
108 |       "|   Conquest|   Carnival| 11|             110.0|     29.74|  9.53| 14.88|            36.99|19.1|\n",
109 |       "|    Destiny|   Carnival| 17|           101.353|     26.42|  8.92| 13.21|            38.36|10.0|\n",
110 |       "|    Ecstasy|   Carnival| 22|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|\n",
111 |       "|    Elation|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|\n",
112 |       "|    Fantasy|   Carnival| 23|            70.367|     20.56|  8.55| 10.22|            34.23| 9.2|\n",
113 |       "|Fascination|   Carnival| 19|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|\n",
114 |       "|    Freedom|   Carnival|  6|110.23899999999999|      37.0|  9.51| 14.87|            29.79|11.5|\n",
115 |       "|      Glory|   Carnival| 10|             110.0|     29.74|  9.51| 14.87|            36.99|11.6|\n",
116 |       "|    Holiday|   Carnival| 28|            46.052|     14.52|  7.27|  7.26|            31.72| 6.6|\n",
117 |       "|Imagination|   Carnival| 18|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|\n",
118 |       "|Inspiration|   Carnival| 17|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|\n",
119 |       "|     Legend|   Carnival| 11|              86.0|     21.24|  9.63| 10.62|            40.49| 9.3|\n",
120 |       "|   Liberty*|   Carnival|  8|             110.0|     29.74|  9.51| 14.87|            36.99|11.6|\n",
121 |       "|    Miracle|   Carnival|  9|              88.5|     21.24|  9.63| 10.62|            41.67|10.3|\n",
122 |       "|   Paradise|   Carnival| 15|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|\n",
123 |       "|      Pride|   Carnival| 12|              88.5|     21.24|  9.63| 11.62|            41.67| 9.3|\n",
124 |       "|  Sensation|   Carnival| 20|            70.367|     20.52|  8.55|  10.2|            34.29| 9.2|\n",
125 |       "+-----------+-----------+---+------------------+----------+------+------+-----------------+----+\n",
126 |       "only showing top 20 rows\n",
127 |       "\n"
128 |      ]
129 |     }
130 |    ],
131 |    "source": [
132 |     "df.show()"
133 |    ]
134 |   },
135 |   {
136 |    "cell_type": "code",
137 |    "execution_count": 6,
138 |    "metadata": {},
139 |    "outputs": [
140 |     {
141 |      "name": "stdout",
142 |      "output_type": "stream",
143 |      "text": [
144 |       "+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+\n",
145 |       "|summary|               Age|           Tonnage|       passengers|           length|            cabins|passenger_density|             crew|\n",
146 |       "+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+\n",
147 |       "|  count|               158|               158|              158|              158|               158|              158|              158|\n",
148 |       "|   mean|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|\n",
149 |       "| stddev| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|\n",
150 |       "|    min|                 4|             2.329|             0.66|             2.79|              0.33|             17.7|             0.59|\n",
151 |       "|    max|                48|             220.0|             54.0|            11.82|              27.0|            71.43|             21.0|\n",
152 |       "+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+\n",
153 |       "\n"
154 |      ]
155 |     }
156 |    ],
157 |    "source": [
158 |     "df.describe().show()"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "markdown",
163 |    "metadata": {},
164 |    "source": [
165 |     "## Dealing with the Cruise_line categorical variable\n",
166 |     "Ship Name is a useless arbitrary string, but the cruise_line itself may be useful. Let's make it into a categorical variable!"
167 |    ]
168 |   },
169 |   {
170 |    "cell_type": "code",
171 |    "execution_count": 7,
172 |    "metadata": {},
173 |    "outputs": [
174 |     {
175 |      "name": "stdout",
176 |      "output_type": "stream",
177 |      "text": [
178 |       "+-----------------+-----+\n",
179 |       "|      Cruise_line|count|\n",
180 |       "+-----------------+-----+\n",
181 |       "|            Costa|   11|\n",
182 |       "|              P&O|    6|\n",
183 |       "|           Cunard|    3|\n",
184 |       "|Regent_Seven_Seas|    5|\n",
185 |       "|              MSC|    8|\n",
186 |       "|         Carnival|   22|\n",
187 |       "|          Crystal|    2|\n",
188 |       "|           Orient|    1|\n",
189 |       "|         Princess|   17|\n",
190 |       "|        Silversea|    4|\n",
191 |       "|         Seabourn|    3|\n",
192 |       "| Holland_American|   14|\n",
193 |       "|         Windstar|    3|\n",
194 |       "|           Disney|    2|\n",
195 |       "|        Norwegian|   13|\n",
196 |       "|          Oceania|    3|\n",
197 |       "|          Azamara|    2|\n",
198 |       "|        Celebrity|   10|\n",
199 |       "|             Star|    6|\n",
200 |       "|  Royal_Caribbean|   23|\n",
201 |       "+-----------------+-----+\n",
202 |       "\n"
203 |      ]
204 |     }
205 |    ],
206 |    "source": [
207 |     "df.groupBy('Cruise_line').count().show()"
208 |    ]
209 |   },
210 |   {
211 |    "cell_type": "code",
212 |    "execution_count": 8,
213 |    "metadata": {},
214 |    "outputs": [
215 |     {
216 |      "data": {
217 |       "text/plain": [
218 |        "[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),\n",
219 |        " Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),\n",
220 |        " Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, cruise_cat=1.0),\n",
221 |        " Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1, cruise_cat=1.0),\n",
222 |        " Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0, cruise_cat=1.0)]"
223 |       ]
224 |      },
225 |      "execution_count": 8,
226 |      "metadata": {},
227 |      "output_type": "execute_result"
228 |     }
229 |    ],
230 |    "source": [
231 |     "from pyspark.ml.feature import StringIndexer\n",
232 |     "indexer = StringIndexer(inputCol=\"Cruise_line\", outputCol=\"cruise_cat\")\n",
233 |     "indexed = indexer.fit(df).transform(df)\n",
234 |     "indexed.head(5)"
235 |    ]
236 |   },
237 |   {
238 |    "cell_type": "code",
239 |    "execution_count": 9,
240 |    "metadata": {
241 |     "collapsed": true
242 |    },
243 |    "outputs": [],
244 |    "source": [
245 |     "from pyspark.ml.linalg import Vectors\n",
246 |     "from pyspark.ml.feature import VectorAssembler"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 10,
252 |    "metadata": {},
253 |    "outputs": [
254 |     {
255 |      "data": {
256 |       "text/plain": [
257 |        "['Ship_name',\n",
258 |        " 'Cruise_line',\n",
259 |        " 'Age',\n",
260 |        " 'Tonnage',\n",
261 |        " 'passengers',\n",
262 |        " 'length',\n",
263 |        " 'cabins',\n",
264 |        " 'passenger_density',\n",
265 |        " 'crew',\n",
266 |        " 'cruise_cat']"
267 |       ]
268 |      },
269 |      "execution_count": 10,
270 |      "metadata": {},
271 |      "output_type": "execute_result"
272 |     }
273 |    ],
274 |    "source": [
275 |     "indexed.columns"
276 |    ]
277 |   },
278 |   {
279 |    "cell_type": "code",
280 |    "execution_count": 11,
281 |    "metadata": {
282 |     "collapsed": true
283 |    },
284 |    "outputs": [],
285 |    "source": [
286 |     "assembler = VectorAssembler(\n",
287 |     "  inputCols=['Age',\n",
288 |     "             'Tonnage',\n",
289 |     "             'passengers',\n",
290 |     "             'length',\n",
291 |     "             'cabins',\n",
292 |     "             'passenger_density',\n",
293 |     "             'cruise_cat'],\n",
294 |     "    outputCol=\"features\")"
295 |    ]
296 |   },
297 |   {
298 |    "cell_type": "code",
299 |    "execution_count": 12,
300 |    "metadata": {},
301 |    "outputs": [],
302 |    "source": [
303 |     "output = assembler.transform(indexed)"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 13,
309 |    "metadata": {},
310 |    "outputs": [
311 |     {
312 |      "name": "stdout",
313 |      "output_type": "stream",
314 |      "text": [
315 |       "+--------------------+----+\n",
316 |       "|            features|crew|\n",
317 |       "+--------------------+----+\n",
318 |       "|[6.0,30.276999999...|3.55|\n",
319 |       "|[6.0,30.276999999...|3.55|\n",
320 |       "|[26.0,47.262,14.8...| 6.7|\n",
321 |       "|[11.0,110.0,29.74...|19.1|\n",
322 |       "|[17.0,101.353,26....|10.0|\n",
323 |       "|[22.0,70.367,20.5...| 9.2|\n",
324 |       "|[15.0,70.367,20.5...| 9.2|\n",
325 |       "|[23.0,70.367,20.5...| 9.2|\n",
326 |       "|[19.0,70.367,20.5...| 9.2|\n",
327 |       "|[6.0,110.23899999...|11.5|\n",
328 |       "|[10.0,110.0,29.74...|11.6|\n",
329 |       "|[28.0,46.052,14.5...| 6.6|\n",
330 |       "|[18.0,70.367,20.5...| 9.2|\n",
331 |       "|[17.0,70.367,20.5...| 9.2|\n",
332 |       "|[11.0,86.0,21.24,...| 9.3|\n",
333 |       "|[8.0,110.0,29.74,...|11.6|\n",
334 |       "|[9.0,88.5,21.24,9...|10.3|\n",
335 |       "|[15.0,70.367,20.5...| 9.2|\n",
336 |       "|[12.0,88.5,21.24,...| 9.3|\n",
337 |       "|[20.0,70.367,20.5...| 9.2|\n",
338 |       "+--------------------+----+\n",
339 |       "only showing top 20 rows\n",
340 |       "\n"
341 |      ]
342 |     }
343 |    ],
344 |    "source": [
345 |     "output.select(\"features\", \"crew\").show()"
346 |    ]
347 |   },
348 |   {
349 |    "cell_type": "code",
350 |    "execution_count": 14,
351 |    "metadata": {
352 |     "collapsed": true
353 |    },
354 |    "outputs": [],
355 |    "source": [
356 |     "final_data = output.select(\"features\", \"crew\")"
357 |    ]
358 |   },
359 |   {
360 |    "cell_type": "code",
361 |    "execution_count": 15,
362 |    "metadata": {
363 |     "collapsed": true
364 |    },
365 |    "outputs": [],
366 |    "source": [
367 |     "train_data,test_data = final_data.randomSplit([0.7,0.3])"
368 |    ]
369 |   },
370 |   {
371 |    "cell_type": "code",
372 |    "execution_count": 16,
373 |    "metadata": {},
374 |    "outputs": [],
375 |    "source": [
376 |     "from pyspark.ml.regression import LinearRegression\n",
377 |     "# Create a Linear Regression Model object\n",
378 |     "lr = LinearRegression(labelCol='crew')"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "code",
383 |    "execution_count": 17,
384 |    "metadata": {
385 |     "collapsed": true
386 |    },
387 |    "outputs": [],
388 |    "source": [
389 |     "# Fit the model to the data and call this model lrModel\n",
390 |     "lrModel = lr.fit(train_data)"
391 |    ]
392 |   },
393 |   {
394 |    "cell_type": "code",
395 |    "execution_count": 18,
396 |    "metadata": {},
397 |    "outputs": [
398 |     {
399 |      "name": "stdout",
400 |      "output_type": "stream",
401 |      "text": [
402 |       "Coefficients: [-0.0145423814068,0.0137445818936,-0.111000735058,0.422234330769,0.705574105078,-0.00631202648669,0.0306212943631] Intercept: -0.5598623529951635\n"
403 |      ]
404 |     }
405 |    ],
406 |    "source": [
407 |     "# Print the coefficients and intercept for linear regression\n",
408 |     "print(\"Coefficients: {} Intercept: {}\".format(lrModel.coefficients,lrModel.intercept))"
409 |    ]
410 |   },
411 |   {
412 |    "cell_type": "code",
413 |    "execution_count": 19,
414 |    "metadata": {
415 |     "collapsed": true
416 |    },
417 |    "outputs": [],
418 |    "source": [
419 |     "test_results = lrModel.evaluate(test_data)"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": 20,
425 |    "metadata": {},
426 |    "outputs": [
427 |     {
428 |      "name": "stdout",
429 |      "output_type": "stream",
430 |      "text": [
431 |       "RMSE: 1.3174339720092743\n",
432 |       "MSE: 1.7356322706041332\n",
433 |       "R2: 0.8671622449217978\n"
434 |      ]
435 |     }
436 |    ],
437 |    "source": [
438 |     "print(\"RMSE: {}\".format(test_results.rootMeanSquaredError))\n",
439 |     "print(\"MSE: {}\".format(test_results.meanSquaredError))\n",
440 |     "print(\"R2: {}\".format(test_results.r2))"
441 |    ]
442 |   },
443 |   {
444 |    "cell_type": "code",
445 |    "execution_count": 21,
446 |    "metadata": {
447 |     "collapsed": true
448 |    },
449 |    "outputs": [],
450 |    "source": [
451 |     "# R2 of 0.86 is pretty good, let's check the data a little closer\n",
452 |     "from pyspark.sql.functions import corr"
453 |    ]
454 |   },
455 |   {
456 |    "cell_type": "code",
457 |    "execution_count": 22,
458 |    "metadata": {},
459 |    "outputs": [
460 |     {
461 |      "name": "stdout",
462 |      "output_type": "stream",
463 |      "text": [
464 |       "+----------------------+\n",
465 |       "|corr(crew, passengers)|\n",
466 |       "+----------------------+\n",
467 |       "|    0.9152341306065384|\n",
468 |       "+----------------------+\n",
469 |       "\n"
470 |      ]
471 |     }
472 |    ],
473 |    "source": [
474 |     "df.select(corr('crew','passengers')).show()"
475 |    ]
476 |   },
477 |   {
478 |    "cell_type": "code",
479 |    "execution_count": 23,
480 |    "metadata": {},
481 |    "outputs": [
482 |     {
483 |      "name": "stdout",
484 |      "output_type": "stream",
485 |      "text": [
486 |       "+------------------+\n",
487 |       "|corr(crew, cabins)|\n",
488 |       "+------------------+\n",
489 |       "|0.9508226063578497|\n",
490 |       "+------------------+\n",
491 |       "\n"
492 |      ]
493 |     }
494 |    ],
495 |    "source": [
496 |     "df.select(corr('crew','cabins')).show()"
497 |    ]
498 |   }
499 |  ],
500 |  "metadata": {
501 |   "anaconda-cloud": {},
502 |   "kernelspec": {
503 |    "display_name": "Python 3",
504 |    "language": "python",
505 |    "name": "python3"
506 |   },
507 |   "language_info": {
508 |    "codemirror_mode": {
509 |     "name": "ipython",
510 |     "version": 3
511 |    },
512 |    "file_extension": ".py",
513 |    "mimetype": "text/x-python",
514 |    "name": "python",
515 |    "nbconvert_exporter": "python",
516 |    "pygments_lexer": "ipython3",
517 |    "version": "3.7.9"
518 |   }
519 |  },
520 |  "nbformat": 4,
521 |  "nbformat_minor": 1
522 | }
523 | 


--------------------------------------------------------------------------------
/Logistic_Regression_Consulting.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Logistic Regression Consulting \n"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "markdown",
 12 |    "metadata": {
 13 |     "collapsed": true
 14 |    },
 15 |    "source": [
 16 |     "\n",
 17 |     "\n",
 18 |     "\n",
 19 |     "The data is :\n",
 20 |     "\n",
 21 |     "    Name : Name of the latest contact at Company\n",
 22 |     "    Age: Customer Age\n",
 23 |     "    Total_Purchase: Total Ads Purchased\n",
 24 |     "    Account_Manager: Binary 0=No manager, 1= Account manager assigned\n",
 25 |     "    Years: Totaly Years as a customer\n",
 26 |     "    Num_sites: Number of websites that use the service.\n",
 27 |     "    Onboard_date: Date that the name of the latest contact was onboarded\n",
 28 |     "    Location: Client HQ Address\n",
 29 |     "    Company: Name of Client Company\n",
 30 |     "    "
 31 |    ]
 32 |   },
 33 |   {
 34 |    "cell_type": "code",
 35 |    "execution_count": 1,
 36 |    "metadata": {
 37 |     "collapsed": true
 38 |    },
 39 |    "outputs": [],
 40 |    "source": [
 41 |     "from pyspark.sql import SparkSession"
 42 |    ]
 43 |   },
 44 |   {
 45 |    "cell_type": "code",
 46 |    "execution_count": 2,
 47 |    "metadata": {
 48 |     "collapsed": true
 49 |    },
 50 |    "outputs": [],
 51 |    "source": [
 52 |     "spark = SparkSession.builder.appName('logregconsult').getOrCreate()"
 53 |    ]
 54 |   },
 55 |   {
 56 |    "cell_type": "code",
 57 |    "execution_count": 3,
 58 |    "metadata": {
 59 |     "collapsed": true
 60 |    },
 61 |    "outputs": [],
 62 |    "source": [
 63 |     "data = spark.read.csv('customer_churn.csv',inferSchema=True,\n",
 64 |     "                     header=True)"
 65 |    ]
 66 |   },
 67 |   {
 68 |    "cell_type": "code",
 69 |    "execution_count": 37,
 70 |    "metadata": {},
 71 |    "outputs": [
 72 |     {
 73 |      "name": "stdout",
 74 |      "output_type": "stream",
 75 |      "text": [
 76 |       "root\n",
 77 |       " |-- Names: string (nullable = true)\n",
 78 |       " |-- Age: double (nullable = true)\n",
 79 |       " |-- Total_Purchase: double (nullable = true)\n",
 80 |       " |-- Account_Manager: integer (nullable = true)\n",
 81 |       " |-- Years: double (nullable = true)\n",
 82 |       " |-- Num_Sites: double (nullable = true)\n",
 83 |       " |-- Onboard_date: timestamp (nullable = true)\n",
 84 |       " |-- Location: string (nullable = true)\n",
 85 |       " |-- Company: string (nullable = true)\n",
 86 |       " |-- Churn: integer (nullable = true)\n",
 87 |       "\n"
 88 |      ]
 89 |     }
 90 |    ],
 91 |    "source": [
 92 |     "data.printSchema()"
 93 |    ]
 94 |   },
 95 |   {
 96 |    "cell_type": "markdown",
 97 |    "metadata": {},
 98 |    "source": [
 99 |     "### Check out the data"
100 |    ]
101 |   },
102 |   {
103 |    "cell_type": "code",
104 |    "execution_count": 5,
105 |    "metadata": {},
106 |    "outputs": [
107 |     {
108 |      "name": "stdout",
109 |      "output_type": "stream",
110 |      "text": [
111 |       "+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+\n",
112 |       "|summary|              Age|   Total_Purchase|   Account_Manager|            Years|         Num_Sites|              Churn|\n",
113 |       "+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+\n",
114 |       "|  count|              900|              900|               900|              900|               900|                900|\n",
115 |       "|   mean|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|0.16666666666666666|\n",
116 |       "| stddev|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969| 0.3728852122772358|\n",
117 |       "|    min|             22.0|            100.0|                 0|              1.0|               3.0|                  0|\n",
118 |       "|    max|             65.0|         18026.01|                 1|             9.15|              14.0|                  1|\n",
119 |       "+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+\n",
120 |       "\n"
121 |      ]
122 |     }
123 |    ],
124 |    "source": [
125 |     "data.describe().show()"
126 |    ]
127 |   },
128 |   {
129 |    "cell_type": "code",
130 |    "execution_count": 38,
131 |    "metadata": {},
132 |    "outputs": [
133 |     {
134 |      "data": {
135 |       "text/plain": [
136 |        "['Names',\n",
137 |        " 'Age',\n",
138 |        " 'Total_Purchase',\n",
139 |        " 'Account_Manager',\n",
140 |        " 'Years',\n",
141 |        " 'Num_Sites',\n",
142 |        " 'Onboard_date',\n",
143 |        " 'Location',\n",
144 |        " 'Company',\n",
145 |        " 'Churn']"
146 |       ]
147 |      },
148 |      "execution_count": 38,
149 |      "metadata": {},
150 |      "output_type": "execute_result"
151 |     }
152 |    ],
153 |    "source": [
154 |     "data.columns"
155 |    ]
156 |   },
157 |   {
158 |    "cell_type": "markdown",
159 |    "metadata": {},
160 |    "source": [
161 |     "### Format for MLlib\n",
162 |     "\n"
163 |    ]
164 |   },
165 |   {
166 |    "cell_type": "code",
167 |    "execution_count": 7,
168 |    "metadata": {
169 |     "collapsed": true
170 |    },
171 |    "outputs": [],
172 |    "source": [
173 |     "from pyspark.ml.feature import VectorAssembler"
174 |    ]
175 |   },
176 |   {
177 |    "cell_type": "code",
178 |    "execution_count": 8,
179 |    "metadata": {
180 |     "collapsed": true
181 |    },
182 |    "outputs": [],
183 |    "source": [
184 |     "assembler = VectorAssembler(inputCols=['Age',\n",
185 |     " 'Total_Purchase',\n",
186 |     " 'Account_Manager',\n",
187 |     " 'Years',\n",
188 |     " 'Num_Sites'],outputCol='features')"
189 |    ]
190 |   },
191 |   {
192 |    "cell_type": "code",
193 |    "execution_count": 9,
194 |    "metadata": {
195 |     "collapsed": true
196 |    },
197 |    "outputs": [],
198 |    "source": [
199 |     "output = assembler.transform(data)"
200 |    ]
201 |   },
202 |   {
203 |    "cell_type": "code",
204 |    "execution_count": 39,
205 |    "metadata": {
206 |     "collapsed": true
207 |    },
208 |    "outputs": [],
209 |    "source": [
210 |     "final_data = output.select('features','churn')"
211 |    ]
212 |   },
213 |   {
214 |    "cell_type": "markdown",
215 |    "metadata": {},
216 |    "source": [
217 |     "### Test Train Split"
218 |    ]
219 |   },
220 |   {
221 |    "cell_type": "code",
222 |    "execution_count": 40,
223 |    "metadata": {
224 |     "collapsed": true
225 |    },
226 |    "outputs": [],
227 |    "source": [
228 |     "train_churn,test_churn = final_data.randomSplit([0.7,0.3])"
229 |    ]
230 |   },
231 |   {
232 |    "cell_type": "markdown",
233 |    "metadata": {},
234 |    "source": [
235 |     "### Fit the model"
236 |    ]
237 |   },
238 |   {
239 |    "cell_type": "code",
240 |    "execution_count": 12,
241 |    "metadata": {
242 |     "collapsed": true
243 |    },
244 |    "outputs": [],
245 |    "source": [
246 |     "from pyspark.ml.classification import LogisticRegression"
247 |    ]
248 |   },
249 |   {
250 |    "cell_type": "code",
251 |    "execution_count": 13,
252 |    "metadata": {
253 |     "collapsed": true
254 |    },
255 |    "outputs": [],
256 |    "source": [
257 |     "lr_churn = LogisticRegression(labelCol='churn')"
258 |    ]
259 |   },
260 |   {
261 |    "cell_type": "code",
262 |    "execution_count": 14,
263 |    "metadata": {
264 |     "collapsed": true
265 |    },
266 |    "outputs": [],
267 |    "source": [
268 |     "fitted_churn_model = lr_churn.fit(train_churn)"
269 |    ]
270 |   },
271 |   {
272 |    "cell_type": "code",
273 |    "execution_count": 15,
274 |    "metadata": {
275 |     "collapsed": true
276 |    },
277 |    "outputs": [],
278 |    "source": [
279 |     "training_sum = fitted_churn_model.summary"
280 |    ]
281 |   },
282 |   {
283 |    "cell_type": "code",
284 |    "execution_count": 41,
285 |    "metadata": {},
286 |    "outputs": [
287 |     {
288 |      "name": "stdout",
289 |      "output_type": "stream",
290 |      "text": [
291 |       "+-------+-------------------+-------------------+\n",
292 |       "|summary|              churn|         prediction|\n",
293 |       "+-------+-------------------+-------------------+\n",
294 |       "|  count|                632|                632|\n",
295 |       "|   mean|0.16772151898734178|0.13924050632911392|\n",
296 |       "| stddev|0.37391474020622584| 0.3464715405857694|\n",
297 |       "|    min|                  0|                0.0|\n",
298 |       "|    max|                  1|                1.0|\n",
299 |       "+-------+-------------------+-------------------+\n",
300 |       "\n"
301 |      ]
302 |     }
303 |    ],
304 |    "source": [
305 |     "training_sum.predictions.describe().show()"
306 |    ]
307 |   },
308 |   {
309 |    "cell_type": "markdown",
310 |    "metadata": {},
311 |    "source": [
312 |     "### Evaluate results\n",
313 |     "\n",
314 |     "Let's evaluate the results on the data set we were given (using the test data)"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 17,
320 |    "metadata": {
321 |     "collapsed": true
322 |    },
323 |    "outputs": [],
324 |    "source": [
325 |     "from pyspark.ml.evaluation import BinaryClassificationEvaluator"
326 |    ]
327 |   },
328 |   {
329 |    "cell_type": "code",
330 |    "execution_count": 18,
331 |    "metadata": {
332 |     "collapsed": true
333 |    },
334 |    "outputs": [],
335 |    "source": [
336 |     "pred_and_labels = fitted_churn_model.evaluate(test_churn)"
337 |    ]
338 |   },
339 |   {
340 |    "cell_type": "code",
341 |    "execution_count": 42,
342 |    "metadata": {},
343 |    "outputs": [
344 |     {
345 |      "name": "stdout",
346 |      "output_type": "stream",
347 |      "text": [
348 |       "+--------------------+-----+--------------------+--------------------+----------+\n",
349 |       "|            features|churn|       rawPrediction|         probability|prediction|\n",
350 |       "+--------------------+-----+--------------------+--------------------+----------+\n",
351 |       "|[29.0,11274.46,1....|    0|[4.87277048314045...|[0.99240597473215...|       0.0|\n",
352 |       "|[30.0,8403.78,1.0...|    0|[6.62706699787450...|[0.99867770995491...|       0.0|\n",
353 |       "|[30.0,8874.83,0.0...|    0|[3.83233030863620...|[0.97880008629612...|       0.0|\n",
354 |       "|[31.0,5387.75,0.0...|    0|[3.24742811458119...|[0.96258058552664...|       0.0|\n",
355 |       "|[31.0,7073.61,0.0...|    0|[3.79911450433881...|[0.97809976923405...|       0.0|\n",
356 |       "|[31.0,11297.57,1....|    1|[0.79751152640735...|[0.68944192100551...|       0.0|\n",
357 |       "|[31.0,11743.24,0....|    0|[7.95951793845681...|[0.99965080051155...|       0.0|\n",
358 |       "|[31.0,12264.68,1....|    0|[3.77281170068563...|[0.97752920495855...|       0.0|\n",
359 |       "|[32.0,6367.22,1.0...|    0|[3.20017220414578...|[0.96084075703562...|       0.0|\n",
360 |       "|[32.0,8575.71,0.0...|    0|[4.52857300143358...|[0.98931923918898...|       0.0|\n",
361 |       "|[32.0,13630.93,0....|    0|[2.65527248795398...|[0.93433521477806...|       0.0|\n",
362 |       "|[33.0,4711.89,0.0...|    0|[7.15048703176813...|[0.99921613300884...|       0.0|\n",
363 |       "|[33.0,5738.82,0.0...|    0|[5.41122451678732...|[0.99555369000330...|       0.0|\n",
364 |       "|[33.0,7750.54,1.0...|    0|[4.79456321095382...|[0.99179329500352...|       0.0|\n",
365 |       "|[33.0,12638.51,1....|    0|[4.15248449384766...|[0.98451815808214...|       0.0|\n",
366 |       "|[33.0,13314.19,0....|    0|[3.36990907218523...|[0.96675076852634...|       0.0|\n",
367 |       "|[34.0,5447.16,1.0...|    0|[3.75995719191832...|[0.97724510462861...|       0.0|\n",
368 |       "|[34.0,6461.86,1.0...|    0|[4.80281076454080...|[0.99186015320798...|       0.0|\n",
369 |       "|[34.0,7818.13,0.0...|    0|[4.73016790727597...|[0.99125221001613...|       0.0|\n",
370 |       "|[34.0,9265.59,0.0...|    0|[4.83050636756087...|[0.99208073716831...|       0.0|\n",
371 |       "+--------------------+-----+--------------------+--------------------+----------+\n",
372 |       "only showing top 20 rows\n",
373 |       "\n"
374 |      ]
375 |     }
376 |    ],
377 |    "source": [
378 |     "pred_and_labels.predictions.show()"
379 |    ]
380 |   },
381 |   {
382 |    "cell_type": "markdown",
383 |    "metadata": {},
384 |    "source": [
385 |     "### Using AUC"
386 |    ]
387 |   },
388 |   {
389 |    "cell_type": "code",
390 |    "execution_count": 24,
391 |    "metadata": {
392 |     "collapsed": true
393 |    },
394 |    "outputs": [],
395 |    "source": [
396 |     "churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n",
397 |     "                                           labelCol='churn')"
398 |    ]
399 |   },
400 |   {
401 |    "cell_type": "code",
402 |    "execution_count": 26,
403 |    "metadata": {},
404 |    "outputs": [],
405 |    "source": [
406 |     "auc = churn_eval.evaluate(pred_and_labels.predictions)"
407 |    ]
408 |   },
409 |   {
410 |    "cell_type": "code",
411 |    "execution_count": 43,
412 |    "metadata": {},
413 |    "outputs": [
414 |     {
415 |      "data": {
416 |       "text/plain": [
417 |        "0.6866883116883117"
418 |       ]
419 |      },
420 |      "execution_count": 43,
421 |      "metadata": {},
422 |      "output_type": "execute_result"
423 |     }
424 |    ],
425 |    "source": [
426 |     "auc"
427 |    ]
428 |   },
429 |   {
430 |    "cell_type": "markdown",
431 |    "metadata": {},
432 |    "source": [
433 |     "[Common question - what is a good AUC value?](https://stats.stackexchange.com/questions/113326/what-is-a-good-auc-for-a-precision-recall-curve)"
434 |    ]
435 |   },
436 |   {
437 |    "cell_type": "markdown",
438 |    "metadata": {},
439 |    "source": [
440 |     "### Predict on brand new unlabeled data\n",
441 |     "\n",
442 |     "We still need to evaluate the new_customers.csv file!"
443 |    ]
444 |   },
445 |   {
446 |    "cell_type": "code",
447 |    "execution_count": 28,
448 |    "metadata": {
449 |     "collapsed": true
450 |    },
451 |    "outputs": [],
452 |    "source": [
453 |     "final_lr_model = lr_churn.fit(final_data)"
454 |    ]
455 |   },
456 |   {
457 |    "cell_type": "code",
458 |    "execution_count": 29,
459 |    "metadata": {
460 |     "collapsed": true
461 |    },
462 |    "outputs": [],
463 |    "source": [
464 |     "new_customers = spark.read.csv('new_customers.csv',inferSchema=True,\n",
465 |     "                              header=True)"
466 |    ]
467 |   },
468 |   {
469 |    "cell_type": "code",
470 |    "execution_count": 30,
471 |    "metadata": {},
472 |    "outputs": [
473 |     {
474 |      "name": "stdout",
475 |      "output_type": "stream",
476 |      "text": [
477 |       "root\n",
478 |       " |-- Names: string (nullable = true)\n",
479 |       " |-- Age: double (nullable = true)\n",
480 |       " |-- Total_Purchase: double (nullable = true)\n",
481 |       " |-- Account_Manager: integer (nullable = true)\n",
482 |       " |-- Years: double (nullable = true)\n",
483 |       " |-- Num_Sites: double (nullable = true)\n",
484 |       " |-- Onboard_date: timestamp (nullable = true)\n",
485 |       " |-- Location: string (nullable = true)\n",
486 |       " |-- Company: string (nullable = true)\n",
487 |       "\n"
488 |      ]
489 |     }
490 |    ],
491 |    "source": [
492 |     "new_customers.printSchema()"
493 |    ]
494 |   },
495 |   {
496 |    "cell_type": "code",
497 |    "execution_count": 31,
498 |    "metadata": {
499 |     "collapsed": true
500 |    },
501 |    "outputs": [],
502 |    "source": [
503 |     "test_new_customers = assembler.transform(new_customers)"
504 |    ]
505 |   },
506 |   {
507 |    "cell_type": "code",
508 |    "execution_count": 32,
509 |    "metadata": {},
510 |    "outputs": [
511 |     {
512 |      "name": "stdout",
513 |      "output_type": "stream",
514 |      "text": [
515 |       "root\n",
516 |       " |-- Names: string (nullable = true)\n",
517 |       " |-- Age: double (nullable = true)\n",
518 |       " |-- Total_Purchase: double (nullable = true)\n",
519 |       " |-- Account_Manager: integer (nullable = true)\n",
520 |       " |-- Years: double (nullable = true)\n",
521 |       " |-- Num_Sites: double (nullable = true)\n",
522 |       " |-- Onboard_date: timestamp (nullable = true)\n",
523 |       " |-- Location: string (nullable = true)\n",
524 |       " |-- Company: string (nullable = true)\n",
525 |       " |-- features: vector (nullable = true)\n",
526 |       "\n"
527 |      ]
528 |     }
529 |    ],
530 |    "source": [
531 |     "test_new_customers.printSchema()"
532 |    ]
533 |   },
534 |   {
535 |    "cell_type": "code",
536 |    "execution_count": 33,
537 |    "metadata": {
538 |     "collapsed": true
539 |    },
540 |    "outputs": [],
541 |    "source": [
542 |     "final_results = final_lr_model.transform(test_new_customers)"
543 |    ]
544 |   },
545 |   {
546 |    "cell_type": "code",
547 |    "execution_count": 35,
548 |    "metadata": {},
549 |    "outputs": [
550 |     {
551 |      "name": "stdout",
552 |      "output_type": "stream",
553 |      "text": [
554 |       "+----------------+----------+\n",
555 |       "|         Company|prediction|\n",
556 |       "+----------------+----------+\n",
557 |       "|        King Ltd|       0.0|\n",
558 |       "|   Cannon-Benson|       1.0|\n",
559 |       "|Barron-Robertson|       1.0|\n",
560 |       "|   Sexton-Golden|       1.0|\n",
561 |       "|        Wood LLC|       0.0|\n",
562 |       "|   Parks-Robbins|       1.0|\n",
563 |       "+----------------+----------+\n",
564 |       "\n"
565 |      ]
566 |     }
567 |    ],
568 |    "source": [
569 |     "final_results.select('Company','prediction').show()"
570 |    ]
571 |   }
572 |  ],
573 |  "metadata": {
574 |   "anaconda-cloud": {},
575 |   "kernelspec": {
576 |    "display_name": "Python 3",
577 |    "language": "python",
578 |    "name": "python3"
579 |   },
580 |   "language_info": {
581 |    "codemirror_mode": {
582 |     "name": "ipython",
583 |     "version": 3
584 |    },
585 |    "file_extension": ".py",
586 |    "mimetype": "text/x-python",
587 |    "name": "python",
588 |    "nbconvert_exporter": "python",
589 |    "pygments_lexer": "ipython3",
590 |    "version": "3.7.9"
591 |   }
592 |  },
593 |  "nbformat": 4,
594 |  "nbformat_minor": 1
595 | }
596 | 


--------------------------------------------------------------------------------
/Missing_Data.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Missing Data\n"
  8 |    ]
  9 |   },
 10 |   {
 11 |    "cell_type": "code",
 12 |    "execution_count": 1,
 13 |    "metadata": {
 14 |     "collapsed": true
 15 |    },
 16 |    "outputs": [],
 17 |    "source": [
 18 |     "from pyspark.sql import SparkSession\n",
 19 |     "# May take a little while on a local computer\n",
 20 |     "spark = SparkSession.builder.appName(\"missingdata\").getOrCreate()"
 21 |    ]
 22 |   },
 23 |   {
 24 |    "cell_type": "code",
 25 |    "execution_count": 2,
 26 |    "metadata": {
 27 |     "collapsed": true
 28 |    },
 29 |    "outputs": [],
 30 |    "source": [
 31 |     "df = spark.read.csv(\"ContainsNull.csv\",header=True,inferSchema=True)"
 32 |    ]
 33 |   },
 34 |   {
 35 |    "cell_type": "code",
 36 |    "execution_count": 3,
 37 |    "metadata": {},
 38 |    "outputs": [
 39 |     {
 40 |      "name": "stdout",
 41 |      "output_type": "stream",
 42 |      "text": [
 43 |       "+----+-----+-----+\n",
 44 |       "|  Id| Name|Sales|\n",
 45 |       "+----+-----+-----+\n",
 46 |       "|emp1| John| null|\n",
 47 |       "|emp2| null| null|\n",
 48 |       "|emp3| null|345.0|\n",
 49 |       "|emp4|Cindy|456.0|\n",
 50 |       "+----+-----+-----+\n",
 51 |       "\n"
 52 |      ]
 53 |     }
 54 |    ],
 55 |    "source": [
 56 |     "df.show()"
 57 |    ]
 58 |   },
 59 |   {
 60 |    "cell_type": "markdown",
 61 |    "metadata": {},
 62 |    "source": [
 63 |     "Notice how the data remains as a null."
 64 |    ]
 65 |   },
 66 |   {
 67 |    "cell_type": "markdown",
 68 |    "metadata": {},
 69 |    "source": [
 70 |     "## Drop the missing data\n",
 71 |     "\n",
 72 |     "One can use the .na functions for missing data. The drop command has the following parameters:\n",
 73 |     "\n",
 74 |     "    df.na.drop(how='any', thresh=None, subset=None)\n",
 75 |     "    \n",
 76 |     "    * param how: 'any' or 'all'.\n",
 77 |     "    \n",
 78 |     "        If 'any', drop a row if it contains any nulls.\n",
 79 |     "        If 'all', drop a row only if all its values are null.\n",
 80 |     "    \n",
 81 |     "    * param thresh: int, default None\n",
 82 |     "    \n",
 83 |     "        If specified, drop rows that have less than `thresh` non-null values.\n",
 84 |     "        This overwrites the `how` parameter.\n",
 85 |     "        \n",
 86 |     "    * param subset: \n",
 87 |     "        optional list of column names to consider."
 88 |    ]
 89 |   },
 90 |   {
 91 |    "cell_type": "code",
 92 |    "execution_count": 6,
 93 |    "metadata": {},
 94 |    "outputs": [
 95 |     {
 96 |      "name": "stdout",
 97 |      "output_type": "stream",
 98 |      "text": [
 99 |       "+----+-----+-----+\n",
100 |       "|  Id| Name|Sales|\n",
101 |       "+----+-----+-----+\n",
102 |       "|emp4|Cindy|456.0|\n",
103 |       "+----+-----+-----+\n",
104 |       "\n"
105 |      ]
106 |     }
107 |    ],
108 |    "source": [
109 |     "# Drop any row that contains missing data\n",
110 |     "df.na.drop().show()"
111 |    ]
112 |   },
113 |   {
114 |    "cell_type": "code",
115 |    "execution_count": 8,
116 |    "metadata": {},
117 |    "outputs": [
118 |     {
119 |      "name": "stdout",
120 |      "output_type": "stream",
121 |      "text": [
122 |       "+----+-----+-----+\n",
123 |       "|  Id| Name|Sales|\n",
124 |       "+----+-----+-----+\n",
125 |       "|emp1| John| null|\n",
126 |       "|emp3| null|345.0|\n",
127 |       "|emp4|Cindy|456.0|\n",
128 |       "+----+-----+-----+\n",
129 |       "\n"
130 |      ]
131 |     }
132 |    ],
133 |    "source": [
134 |     "# Has to have at least 2 NON-null values\n",
135 |     "df.na.drop(thresh=2).show()"
136 |    ]
137 |   },
138 |   {
139 |    "cell_type": "code",
140 |    "execution_count": 9,
141 |    "metadata": {},
142 |    "outputs": [
143 |     {
144 |      "name": "stdout",
145 |      "output_type": "stream",
146 |      "text": [
147 |       "+----+-----+-----+\n",
148 |       "|  Id| Name|Sales|\n",
149 |       "+----+-----+-----+\n",
150 |       "|emp3| null|345.0|\n",
151 |       "|emp4|Cindy|456.0|\n",
152 |       "+----+-----+-----+\n",
153 |       "\n"
154 |      ]
155 |     }
156 |    ],
157 |    "source": [
158 |     "df.na.drop(subset=[\"Sales\"]).show()"
159 |    ]
160 |   },
161 |   {
162 |    "cell_type": "code",
163 |    "execution_count": 10,
164 |    "metadata": {},
165 |    "outputs": [
166 |     {
167 |      "name": "stdout",
168 |      "output_type": "stream",
169 |      "text": [
170 |       "+----+-----+-----+\n",
171 |       "|  Id| Name|Sales|\n",
172 |       "+----+-----+-----+\n",
173 |       "|emp4|Cindy|456.0|\n",
174 |       "+----+-----+-----+\n",
175 |       "\n"
176 |      ]
177 |     }
178 |    ],
179 |    "source": [
180 |     "df.na.drop(how='any').show()"
181 |    ]
182 |   },
183 |   {
184 |    "cell_type": "code",
185 |    "execution_count": 11,
186 |    "metadata": {},
187 |    "outputs": [
188 |     {
189 |      "name": "stdout",
190 |      "output_type": "stream",
191 |      "text": [
192 |       "+----+-----+-----+\n",
193 |       "|  Id| Name|Sales|\n",
194 |       "+----+-----+-----+\n",
195 |       "|emp1| John| null|\n",
196 |       "|emp2| null| null|\n",
197 |       "|emp3| null|345.0|\n",
198 |       "|emp4|Cindy|456.0|\n",
199 |       "+----+-----+-----+\n",
200 |       "\n"
201 |      ]
202 |     }
203 |    ],
204 |    "source": [
205 |     "df.na.drop(how='all').show()"
206 |    ]
207 |   },
208 |   {
209 |    "cell_type": "markdown",
210 |    "metadata": {},
211 |    "source": [
212 |     "## Fill the missing values\n",
213 |     "\n",
214 |     "We can also fill the missing values with new values. If you have multiple nulls across multiple data types, Spark is actually smart enough to match up the data types. For example:"
215 |    ]
216 |   },
217 |   {
218 |    "cell_type": "code",
219 |    "execution_count": 15,
220 |    "metadata": {},
221 |    "outputs": [
222 |     {
223 |      "name": "stdout",
224 |      "output_type": "stream",
225 |      "text": [
226 |       "+----+---------+-----+\n",
227 |       "|  Id|     Name|Sales|\n",
228 |       "+----+---------+-----+\n",
229 |       "|emp1|     John| null|\n",
230 |       "|emp2|NEW VALUE| null|\n",
231 |       "|emp3|NEW VALUE|345.0|\n",
232 |       "|emp4|    Cindy|456.0|\n",
233 |       "+----+---------+-----+\n",
234 |       "\n"
235 |      ]
236 |     }
237 |    ],
238 |    "source": [
239 |     "df.na.fill('NEW VALUE').show()"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 16,
245 |    "metadata": {},
246 |    "outputs": [
247 |     {
248 |      "name": "stdout",
249 |      "output_type": "stream",
250 |      "text": [
251 |       "+----+-----+-----+\n",
252 |       "|  Id| Name|Sales|\n",
253 |       "+----+-----+-----+\n",
254 |       "|emp1| John|  0.0|\n",
255 |       "|emp2| null|  0.0|\n",
256 |       "|emp3| null|345.0|\n",
257 |       "|emp4|Cindy|456.0|\n",
258 |       "+----+-----+-----+\n",
259 |       "\n"
260 |      ]
261 |     }
262 |    ],
263 |    "source": [
264 |     "df.na.fill(0).show()"
265 |    ]
266 |   },
267 |   {
268 |    "cell_type": "markdown",
269 |    "metadata": {},
270 |    "source": [
271 |     "Usually you should specify what columns you want to fill with the subset parameter"
272 |    ]
273 |   },
274 |   {
275 |    "cell_type": "code",
276 |    "execution_count": 17,
277 |    "metadata": {},
278 |    "outputs": [
279 |     {
280 |      "name": "stdout",
281 |      "output_type": "stream",
282 |      "text": [
283 |       "+----+-------+-----+\n",
284 |       "|  Id|   Name|Sales|\n",
285 |       "+----+-------+-----+\n",
286 |       "|emp1|   John| null|\n",
287 |       "|emp2|No Name| null|\n",
288 |       "|emp3|No Name|345.0|\n",
289 |       "|emp4|  Cindy|456.0|\n",
290 |       "+----+-------+-----+\n",
291 |       "\n"
292 |      ]
293 |     }
294 |    ],
295 |    "source": [
296 |     "df.na.fill('No Name',subset=['Name']).show()"
297 |    ]
298 |   },
299 |   {
300 |    "cell_type": "markdown",
301 |    "metadata": {},
302 |    "source": [
303 |     "A very common practice is to fill values with the mean value for the column, for example:"
304 |    ]
305 |   },
306 |   {
307 |    "cell_type": "code",
308 |    "execution_count": 23,
309 |    "metadata": {},
310 |    "outputs": [
311 |     {
312 |      "data": {
313 |       "text/plain": [
314 |        "400.5"
315 |       ]
316 |      },
317 |      "execution_count": 23,
318 |      "metadata": {},
319 |      "output_type": "execute_result"
320 |     }
321 |    ],
322 |    "source": [
323 |     "from pyspark.sql.functions import mean\n",
324 |     "mean_val = df.select(mean(df['Sales'])).collect()\n",
325 |     "\n",
326 |     "# Weird nested formatting of Row object!\n",
327 |     "mean_val[0][0]"
328 |    ]
329 |   },
330 |   {
331 |    "cell_type": "code",
332 |    "execution_count": 24,
333 |    "metadata": {
334 |     "collapsed": true
335 |    },
336 |    "outputs": [],
337 |    "source": [
338 |     "mean_sales = mean_val[0][0]"
339 |    ]
340 |   },
341 |   {
342 |    "cell_type": "code",
343 |    "execution_count": 26,
344 |    "metadata": {},
345 |    "outputs": [
346 |     {
347 |      "name": "stdout",
348 |      "output_type": "stream",
349 |      "text": [
350 |       "+----+-----+-----+\n",
351 |       "|  Id| Name|Sales|\n",
352 |       "+----+-----+-----+\n",
353 |       "|emp1| John|400.5|\n",
354 |       "|emp2| null|400.5|\n",
355 |       "|emp3| null|345.0|\n",
356 |       "|emp4|Cindy|456.0|\n",
357 |       "+----+-----+-----+\n",
358 |       "\n"
359 |      ]
360 |     }
361 |    ],
362 |    "source": [
363 |     "df.na.fill(mean_sales,[\"Sales\"]).show()"
364 |    ]
365 |   },
366 |   {
367 |    "cell_type": "code",
368 |    "execution_count": 28,
369 |    "metadata": {},
370 |    "outputs": [
371 |     {
372 |      "name": "stdout",
373 |      "output_type": "stream",
374 |      "text": [
375 |       "+----+-----+-----+\n",
376 |       "|  Id| Name|Sales|\n",
377 |       "+----+-----+-----+\n",
378 |       "|emp1| John|400.5|\n",
379 |       "|emp2| null|400.5|\n",
380 |       "|emp3| null|345.0|\n",
381 |       "|emp4|Cindy|456.0|\n",
382 |       "+----+-----+-----+\n",
383 |       "\n"
384 |      ]
385 |     }
386 |    ],
387 |    "source": [
388 |     "# One (very ugly) one-liner\n",
389 |     "df.na.fill(df.select(mean(df['Sales'])).collect()[0][0],['Sales']).show()"
390 |    ]
391 |   }
392 |  ],
393 |  "metadata": {
394 |   "anaconda-cloud": {},
395 |   "kernelspec": {
396 |    "display_name": "Python 3",
397 |    "language": "python",
398 |    "name": "python3"
399 |   },
400 |   "language_info": {
401 |    "codemirror_mode": {
402 |     "name": "ipython",
403 |     "version": 3
404 |    },
405 |    "file_extension": ".py",
406 |    "mimetype": "text/x-python",
407 |    "name": "python",
408 |    "nbconvert_exporter": "python",
409 |    "pygments_lexer": "ipython3",
410 |    "version": "3.7.9"
411 |   }
412 |  },
413 |  "nbformat": 4,
414 |  "nbformat_minor": 1
415 | }
416 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
  1 | # Apache Spark
  2 | 
  3 | [![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark)
  4 | [![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site)
  5 | [![Python 3.5](https://img.shields.io/badge/python-3.5-blue.svg)](https://www.python.org/downloads/release/python-360/)
  6 | [![](https://img.shields.io/badge/Spark-v2.4.0-brigh)](https://spark.apache.org/)
  7 | 
  8 | Spark is a unified analytics engine for large-scale data processing. It provides high-level APIs in Scala, Java, Python, and R, and an optimized engine that supports general computation graphs for data analysis. It also supports a rich set of higher-level tools including Spark SQL for SQL and DataFrames, MLlib for machine learning, GraphX for graph processing, and Structured Streaming for stream processing.
  9 | 
 10 | **Spark MLlib** library for Machine Learning provides a Collaborative Filtering implementation by using Alternating Least Squares. The implementation in MLlib has these parameters:
 11 | 
 12 | * numBlocks is the number of blocks used to parallelize computation (set to -1 to auto-configure).
 13 | * rank is the number of latent factors in the model.
 14 | * iterations is the number of iterations to run.
 15 | * lambda specifies the regularization parameter in ALS.
 16 | * implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data.
 17 | * alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations.
 18 | 
 19 | ## Data preprocessoing 
 20 | ### 1. Missing data
 21 | Often data sources are incomplete, which means you will have missing data, you have 3 basic options for filling in missing data (you will personally have to make the decision for what is the right approach:
 22 | 
 23 | * Just keep the missing data points.
 24 | * Drop them missing data points (including the entire row)
 25 | * Fill them in with some other value.
 26 | 
 27 | 
 28 | # Spark for Machine Learning 
 29 | 
 30 | - Linear regression
 31 | - Logistic regression
 32 | - Natural Language Processing (NLP)
 33 | - Tree methods
 34 | - Recomender System
 35 | 
 36 | ## Data Transformations
 37 | 
 38 | You won't always get data in a convienent format, often you will have to deal with data that is non-numerical, such as customer names, or zipcodes, country names, etc...
 39 | 
 40 | A big part of working with data is using your own domain knowledge to build an intuition of how to deal with the data, sometimes the best course of action is to drop the data, other times feature-engineering is a good way to go, or you could try to transform the data into something the Machine Learning Algorithms will understand.
 41 | 
 42 | Spark has several built in methods of dealing with thse transformations, check them all out here: http://spark.apache.org/docs/latest/ml-features.html
 43 | 
 44 | 
 45 | ### VectorIndexer
 46 | 
 47 | VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees. VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. In each row, the values of the input columns will be concatenated into a vector in the specified order. 
 48 | 
 49 | Assume that we have a DataFrame with the columns id, hour, mobile, userFeatures, and clicked:
 50 | 
 51 |      id | hour | mobile | userFeatures     | clicked
 52 |     ----|------|--------|------------------|---------
 53 |      0  | 18   | 1.0    | [0.0, 10.0, 0.5] | 1.0
 54 |      
 55 | userFeatures is a vector column that contains three user features. We want to combine hour, mobile, and userFeatures into a single feature vector called features and use it to predict clicked or not. If we set VectorAssembler’s input columns to hour, mobile, and userFeatures and output column to features, after transformation we should get the following DataFrame:
 56 | 
 57 |      id | hour | mobile | userFeatures     | clicked | features
 58 |     ----|------|--------|------------------|---------|-----------------------------
 59 |      0  | 18   | 1.0    | [0.0, 10.0, 0.5] | 1.0     | [18.0, 1.0, 0.0, 10.0, 0.5]
 60 | 
 61 | 
 62 | ## Recommender System
 63 | 
 64 | The image below (from Wikipedia) shows an example of collaborative filtering. At first, people rate different items (like videos, images, games). Then, the system makes predictions about a user's rating for an item not rated yet. The new predictions are built upon the existing ratings of other users with similar ratings with the active user. In the image, the system predicts that the user will not like the video.
 65 | 
 66 | <img src=https://upload.wikimedia.org/wikipedia/commons/5/52/Collaborative_filtering.gif />
 67 | 
 68 | 
 69 | With Collaborative filtering we make predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating). The underlying assumption is that if a user A has the same opinion as a user B on an issue, A is more likely to have B's opinion on a different issue x than to have the opinion on x of a user chosen randomly.
 70 | 
 71 | 
 72 | ## Spark Streaming
 73 | 
 74 | Streaming is something that is rapidly advancing and changin fast, there are multipl enew libraries every year, new and different services always popping up, and what is in this notebook may or may not apply to you. Maybe your looking for something specific on Kafka, or maybe you are looking for streaming about twitter, in which case Spark might be overkill for what you really want. Realistically speaking each situation is going to require a customized solution and this course is never going to be able to supply a one size fits all solution.
 75 | 
 76 | 
 77 | Spark has pretty well known Streaming Capabilities, if streaming is something you've found yourself needing at work then you are probably familiar with some of these concepts already, in which case you may find it more useful to jump straight to the official documentation here:
 78 | 
 79 | http://spark.apache.org/docs/latest/streaming-programming-guide.html#spark-streaming-programming-guide
 80 | 
 81 | It is really a great guide, but keep in mind some of the features are restricted to Scala at this time (Spark 2.1), hopefully they will be expanded to the Python API in the future!
 82 | 
 83 | *Twitter* is a great source for streaming because its something most people already have an intuitive understanding of, you can visit the site yourself, and a lot of streaming technology has come out of Twitter as a company. You don't access to the entire "firehose" of twitter without paying for it, but that would be too much for us to handle anyway, so we'll be more than fine with the freely available API access.
 84 | 
 85 | **Spark Streaming** is an extension of the core Spark API that enables scalable, high-throughput, fault-tolerant stream processing of live data streams. Data can be ingested from many sources like Kafka, Flume, Kinesis, or TCP sockets, and can be processed using complex algorithms expressed with high-level functions like map, reduce, join and window. Finally, processed data can be pushed out to filesystems, databases, and live dashboards. In fact, you can apply Spark’s machine learning and graph processing algorithms on data streams.
 86 | 
 87 | <img src='http://spark.apache.org/docs/latest/img/streaming-arch.png'/>
 88 | 
 89 | Keep in mind that a few of these Streamiing Capabilities are limited when it comes to Python, you'll need to reference the documentation for the most up to date information. Also the streaming contexts tend to follow more along with the older RDD syntax, so a few things might seem different than what we are used to seeing, keep that in mind, you'll definitely want to have a good understanding of lambda expressions before continuing with this!
 90 | 
 91 | There are SparkSQL modules for streaming: 
 92 | 
 93 | http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=streaming#module-pyspark.sql.streaming
 94 | 
 95 | But they are all still listed as experimental, so instead of showing you somethign that might break in the future, we'll stick to the RDD methods (which is what the documentation also currently shows for streaming).
 96 | 
 97 | Internally, it works as follows. Spark Streaming receives live input data streams and divides the data into batches, which are then processed by the Spark engine to generate the final stream of results in batches.
 98 | 
 99 | <img src='http://spark.apache.org/docs/latest/img/streaming-flow.png'/>
100 | 
101 | 
102 | ## Database
103 | 
104 | https://github.com/EBISYS/WaterWatch
105 | 
106 | https://github.com/EBISYS/WaterWatch/blob/master/query.csv
107 | 
108 | 
109 | [Configure Pycharm](https://www.youtube.com/watch?v=RsALKtZvqFo)
110 | 
111 | 


--------------------------------------------------------------------------------
/Read_Write_and_Validate_Data.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python
  2 | # coding: utf-8
  3 | 
  4 | # # Reading Writing and Validating Data in PySpark
  5 | # 
  6 | # Welcome to PySpark!
  7 | # 
  8 | # In this first lecture, we will be covering:
  9 | # 
 10 | #  - Reading in Data
 11 | #  - Partioned Files
 12 | #  - Validating Data
 13 | #  - Specifying Data Types
 14 | #  - Writing Data
 15 | # 
 16 | # Below you will see the script to begin your first PySpark instance. If you're ever curious
 17 | # about how your PySpark instance is performing, Spark offers a neat Web UI with tons of information.
 18 | # Just navigate to http://[driver]:4040 in your browswer where "drive" is you driver name.
 19 | # If you are running PySpark locally, it would be http://localhost:4040 or you can use the hyperlink
 20 | # automatically produced from the script below.
 21 | 
 22 | # First let's create our PySpark instance!
 23 | 
 24 | # PC users can use the next two lines of code but mac users don't need it
 25 | # import findspark
 26 | # findspark.init()
 27 | 
 28 | import pyspark       # only run after findspark.init()
 29 | from pyspark.sql import SparkSession
 30 | # May take awhile locally
 31 | spark = SparkSession.builder.appName("ReadWriteValidate").getOrCreate()
 32 | spark
 33 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
 34 | print("You are working with", cores, "core(s)")
 35 | 
 36 | # ## Reading data
 37 | # 
 38 | # A DataFrame is equivalent to a relational table in Spark SQL, and can be created using various
 39 | # functions in SparkSession.
 40 | # 
 41 | # First let's try reading in a csv file containing a list of students and their grades.
 42 | # **Source:** https://www.kaggle.com/spscientist/students-performance-in-exams
 43 | 
 44 | # Start by reading a basic csv dataset
 45 | # Let Spark know about the header and infer the Schema types!
 46 | 
 47 | path ="Datasets/"
 48 | # Some csv data
 49 | students = spark.read.csv(path+'students.csv',inferSchema=True,header=True)
 50 | students.limit(4).toPandas()
 51 | 
 52 | # **Parquet Files**
 53 | # Now try reading in a parquet file. This is most common data type in the big data world.
 54 | # Why? because it is the most compact file storage method (even better than zipped files!)
 55 | 
 56 | parquet = spark.read.parquet(path+'users1.parquet')
 57 | parquet.show(2)
 58 | parquet.count()
 59 | 
 60 | # **Partitioned Parquet Files**
 61 | # 
 62 | # Actually most big datasets will be partitioned. Here is how you can collect all
 63 | # the pieces (parts) of the dataset in one simple command.
 64 | 
 65 | partitioned = spark.read.parquet(path+'users*')
 66 | partitioned.show(2)
 67 | 
 68 | # You can also opt to read in only a specific set of paritioned parquet files.
 69 | # Say for example that you only wanted users1 and users2 and not users3
 70 | 
 71 | # Note that the .option("basePath", path) option is used to override the automatic function
 72 | # that will exclude the partitioned variable in resulting dataframe. 
 73 | # I prefer to have the partitioning info in my new dataframe personally. 
 74 | users1_2 = spark.read.option("basePath", path).parquet(path+'users1.parquet',
 75 |                                                        path+'users2.parquet')
 76 | users1_2.show(4)
 77 | users1_2.count()
 78 | #-----------------------------------------------------------
 79 | #in **AWS** cloud storing data in s3 buckets your code will be more like this...
 80 | bucket = "my_bucket"
 81 | key1 = "partition_test/Table1/CREATED_YEAR=2015/*"
 82 | key2 = "partition_test/Table1/CREATED_YEAR=2017/*"
 83 | key3 = "partition_test/Table1/CREATED_YEAR=2018/*"
 84 | 
 85 | test_df = spark.read.parquet('s3://'+bucket+'/'+key1,\
 86 |                              's3://'+bucket+'/'+key2,\
 87 |                              's3://'+bucket+'/'+key3)
 88 | 
 89 | test_df.show(1)
 90 | #---------------------------------------------------------
 91 | 
 92 | # ## Validating Data
 93 | # 
 94 | # If you want to validate that you dataframe was read in correct. We will get
 95 | # into more detailed data evaluation later on but first we need to ensure that all the
 96 | # variable types were infered correctly and that the values actually made it in... sometimes
 97 | # they don't :)
 98 | students.printSchema()      #Prints out the schema in the tree format.
 99 | students.columns
100 | students.describe
101 | # Get an inital view of your dataframe
102 | students.show(3)
103 | 
104 | # Note the types here:
105 | print(type(students))
106 | studentsPdf = students.toPandas()
107 | print(type(studentsPdf))
108 | 
109 | 
110 | # A Solid Summary of your data:
111 | #show the data (like df.head())
112 | print(students.printSchema())
113 | print("")
114 | print(students.columns)
115 | print("")
116 | print(students.describe()) # Not so fond of this one but to each their own
117 | 
118 | # If you need to get the type of just ONE column by name you can use this function:
119 | students.schema['math score'].dataType
120 | 
121 | # Neat "describe" function
122 | students.describe(['math score']).show()
123 | 
124 | 
125 | # Summary function
126 | students.select("math score", "reading score","writing score").summary("count", "min", "25%", "75%", "max").show()
127 | 
128 | #  How to specify data types as you read in datasets.
129 | # Some data types make it easier to infer schema (like tabular formats such as csv which we will show later). 
130 | # 
131 | # However you often have to set the schema yourself if you aren't dealing with a .read method that
132 | # doesn't have inferSchema() built-in.
133 | # Spark has all the tools you need for this, it just requires a very specific structure:
134 | 
135 | from pyspark.sql.types import StructField,StringType,IntegerType,StructType,DateType
136 | 
137 | # Next we need to create the list of Structure fields
138 | #     * :param name: string, name of the field.
139 | #     * :param dataType: :class:`DataType` of the field.
140 | #     * :param nullable: boolean, whether the field can be null (None) or not.
141 | 
142 | data_schema = [StructField("name", StringType(), True),
143 |                StructField("email", StringType(), True),
144 |                StructField("city", StringType(), True),
145 |                StructField("mac", StringType(), True),
146 |                StructField("timestamp", DateType(), True),
147 |                StructField("creditcard", StringType(), True)]
148 | 
149 | final_struc = StructType(fields=data_schema)
150 | 
151 | 
152 | # a .json file  
153 | # 
154 | # **Source:** https://gist.github.com/raine/da15845f332a2fb8937b344504abfbe0
155 | 
156 | people = spark.read.json(path+'people.json', schema=final_struc)
157 | 
158 | people.printSchema()
159 | 
160 | 
161 | # ## Writing Data
162 | # First let's just try writing a simple csv file.
163 | 
164 | # Note the funky naming convention of the file in your output folder. There is no way to directly change this. 
165 | students.write.mode("overwrite").csv('write_test.csv')
166 | 
167 | # students.write.csv('write_test.csv')
168 | students.toPandas().to_csv('write_test2.csv')
169 | # Note the strange naming convention of the output file in the path that you specified.
170 | # Spark uses Hadoop File Format, which requires data to be partitioned - that's why you have part- files.
171 | # If you want to rename your written files to a more user friendly format, you can do that using the
172 | # method below:
173 | from py4j.java_gateway import java_import
174 | java_import(spark._jvm, 'org.apache.hadoop.fs.Path')
175 | 
176 | fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration())
177 | file = fs.globStatus(spark._jvm.Path('write_test.csv/part*'))[0].getPath().getName()
178 | fs.rename(spark._jvm.Path('write_test.csv/' + file), spark._jvm.Path('write_test2.csv'))
179 | #these two need to be different
180 | fs.delete(spark._jvm.Path('write_test.csv'), True)
181 | 
182 | 
183 | #  Writting Parquet files
184 | # 
185 | # Now let's try writing a parquet file. This is best practice for big data as it is the most compact
186 | # storage method.
187 | 
188 | users1_2.write.mode("overwrite").parquet('parquet/')
189 | 
190 | # Try this solution: https://stackoverflow.com/questions/59220832/unable-to-write-spark-dataframe-to-a-parquet-file-format-to-c-drive-in-pyspark
191 | # 
192 | #  Writting Partitioned Parquet Files
193 | # 
194 | # Now try to write a partioned parquet file... super fun!
195 | 
196 | users1_2.write.mode("overwrite").partitionBy("gender").parquet('part_parquet/')
197 | 
198 | 
199 | # #### Writting your own dataframes here! 
200 | # You can also create your own dataframes directly here in your Juypter Notebook too if you want. 
201 | 
202 | 
203 | values = [('Pear',10),('Orange',36),('Banana',123),('Kiwi',48),('Peach',16),('Strawberry',1)]
204 | df = spark.createDataFrame(values,['fruit','quantity'])
205 | df.show()
206 | 
207 | 


--------------------------------------------------------------------------------
/Recommender_System.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "markdown",
  5 |    "metadata": {},
  6 |    "source": [
  7 |     "# Recommender Code Along\n",
  8 |     "\n",
  9 |     "[movielens data set](https://grouplens.org/datasets/movielens/). \n",
 10 |     "\n",
 11 |     "\n",
 12 |     "Looking for more datasets? https://gist.github.com/entaroadun/1653794"
 13 |    ]
 14 |   },
 15 |   {
 16 |    "cell_type": "code",
 17 |    "execution_count": 1,
 18 |    "metadata": {
 19 |     "collapsed": true
 20 |    },
 21 |    "outputs": [],
 22 |    "source": [
 23 |     "from pyspark.sql import SparkSession"
 24 |    ]
 25 |   },
 26 |   {
 27 |    "cell_type": "code",
 28 |    "execution_count": 12,
 29 |    "metadata": {
 30 |     "collapsed": true
 31 |    },
 32 |    "outputs": [],
 33 |    "source": [
 34 |     "spark = SparkSession.builder.appName('rec').getOrCreate()"
 35 |    ]
 36 |   },
 37 |   {
 38 |    "cell_type": "code",
 39 |    "execution_count": 5,
 40 |    "metadata": {
 41 |     "collapsed": true
 42 |    },
 43 |    "outputs": [],
 44 |    "source": [
 45 |     "from pyspark.ml.evaluation import RegressionEvaluator\n",
 46 |     "from pyspark.ml.recommendation import ALS"
 47 |    ]
 48 |   },
 49 |   {
 50 |    "cell_type": "code",
 51 |    "execution_count": 19,
 52 |    "metadata": {},
 53 |    "outputs": [],
 54 |    "source": [
 55 |     "data = spark.read.csv('movielens_ratings.csv',inferSchema=True,header=True)"
 56 |    ]
 57 |   },
 58 |   {
 59 |    "cell_type": "code",
 60 |    "execution_count": 21,
 61 |    "metadata": {},
 62 |    "outputs": [
 63 |     {
 64 |      "data": {
 65 |       "text/plain": [
 66 |        "Row(movieId=2, rating=3.0, userId=0)"
 67 |       ]
 68 |      },
 69 |      "execution_count": 21,
 70 |      "metadata": {},
 71 |      "output_type": "execute_result"
 72 |     }
 73 |    ],
 74 |    "source": [
 75 |     "data.head()"
 76 |    ]
 77 |   },
 78 |   {
 79 |    "cell_type": "code",
 80 |    "execution_count": 24,
 81 |    "metadata": {},
 82 |    "outputs": [
 83 |     {
 84 |      "name": "stdout",
 85 |      "output_type": "stream",
 86 |      "text": [
 87 |       "+-------+------------------+------------------+------------------+\n",
 88 |       "|summary|           movieId|            rating|            userId|\n",
 89 |       "+-------+------------------+------------------+------------------+\n",
 90 |       "|  count|              1501|              1501|              1501|\n",
 91 |       "|   mean| 49.40572951365756|1.7741505662891406|14.383744170552964|\n",
 92 |       "| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|\n",
 93 |       "|    min|                 0|               1.0|                 0|\n",
 94 |       "|    max|                99|               5.0|                29|\n",
 95 |       "+-------+------------------+------------------+------------------+\n",
 96 |       "\n"
 97 |      ]
 98 |     }
 99 |    ],
100 |    "source": [
101 |     "data.describe().show()"
102 |    ]
103 |   },
104 |   {
105 |    "cell_type": "markdown",
106 |    "metadata": {},
107 |    "source": [
108 |     "We can do a split to evaluate how well our model performed, but keep in mind that it is very hard to know conclusively how well a recommender system is truly working for some topics. Especially if subjectivity is involved, for example not everyone that loves star wars is going to love star trek, even though a recommendation system may suggest otherwise."
109 |    ]
110 |   },
111 |   {
112 |    "cell_type": "code",
113 |    "execution_count": 27,
114 |    "metadata": {
115 |     "collapsed": true
116 |    },
117 |    "outputs": [],
118 |    "source": [
119 |     "# Smaller dataset so we will use 0.8 / 0.2\n",
120 |     "(training, test) = data.randomSplit([0.8, 0.2])"
121 |    ]
122 |   },
123 |   {
124 |    "cell_type": "code",
125 |    "execution_count": 28,
126 |    "metadata": {
127 |     "collapsed": true
128 |    },
129 |    "outputs": [],
130 |    "source": [
131 |     "# Build the recommendation model using ALS on the training data\n",
132 |     "als = ALS(maxIter=5, regParam=0.01, userCol=\"userId\", itemCol=\"movieId\", ratingCol=\"rating\")\n",
133 |     "model = als.fit(training)"
134 |    ]
135 |   },
136 |   {
137 |    "cell_type": "markdown",
138 |    "metadata": {},
139 |    "source": [
140 |     "Now let's see hwo the model performed!"
141 |    ]
142 |   },
143 |   {
144 |    "cell_type": "code",
145 |    "execution_count": 41,
146 |    "metadata": {
147 |     "collapsed": true
148 |    },
149 |    "outputs": [],
150 |    "source": [
151 |     "# Evaluate the model by computing the RMSE on the test data\n",
152 |     "predictions = model.transform(test)"
153 |    ]
154 |   },
155 |   {
156 |    "cell_type": "code",
157 |    "execution_count": 43,
158 |    "metadata": {},
159 |    "outputs": [
160 |     {
161 |      "name": "stdout",
162 |      "output_type": "stream",
163 |      "text": [
164 |       "+-------+------+------+----------+\n",
165 |       "|movieId|rating|userId|prediction|\n",
166 |       "+-------+------+------+----------+\n",
167 |       "|     31|   1.0|    27| 2.5976338|\n",
168 |       "|     31|   1.0|    13|  2.113986|\n",
169 |       "|     31|   1.0|     5| 1.3165921|\n",
170 |       "|     31|   2.0|    25|0.16370271|\n",
171 |       "|     85|   1.0|    28|-2.5285664|\n",
172 |       "|     85|   1.0|    26|0.37620115|\n",
173 |       "|     85|   1.0|    12| 0.8253538|\n",
174 |       "|     85|   3.0|     1| 1.6069186|\n",
175 |       "|     85|   1.0|    13| 2.2720711|\n",
176 |       "|     85|   5.0|    16|0.80576146|\n",
177 |       "|     85|   1.0|    15|0.54832166|\n",
178 |       "|     85|   1.0|     4|  3.144216|\n",
179 |       "|     65|   1.0|    28|  -2.03051|\n",
180 |       "|     65|   2.0|     3|  3.801642|\n",
181 |       "|     65|   1.0|     2| 1.7128268|\n",
182 |       "|     53|   3.0|    13| 3.4453833|\n",
183 |       "|     53|   1.0|     6| 1.8362958|\n",
184 |       "|     53|   1.0|     9| 1.8954519|\n",
185 |       "|     78|   1.0|    22| 0.5302301|\n",
186 |       "|     78|   1.0|    13| 0.5055496|\n",
187 |       "+-------+------+------+----------+\n",
188 |       "only showing top 20 rows\n",
189 |       "\n"
190 |      ]
191 |     }
192 |    ],
193 |    "source": [
194 |     "predictions.show()"
195 |    ]
196 |   },
197 |   {
198 |    "cell_type": "code",
199 |    "execution_count": 29,
200 |    "metadata": {},
201 |    "outputs": [
202 |     {
203 |      "name": "stdout",
204 |      "output_type": "stream",
205 |      "text": [
206 |       "Root-mean-square error = 1.751143638387403\n"
207 |      ]
208 |     }
209 |    ],
210 |    "source": [
211 |     "evaluator = RegressionEvaluator(metricName=\"rmse\", labelCol=\"rating\",predictionCol=\"prediction\")\n",
212 |     "rmse = evaluator.evaluate(predictions)\n",
213 |     "print(\"Root-mean-square error = \" + str(rmse))"
214 |    ]
215 |   },
216 |   {
217 |    "cell_type": "markdown",
218 |    "metadata": {},
219 |    "source": [
220 |     "The RMSE described our error in terms of the stars rating column."
221 |    ]
222 |   },
223 |   {
224 |    "cell_type": "markdown",
225 |    "metadata": {},
226 |    "source": [
227 |     "So now that we have the model, how would you actually supply a recommendation to a user?\n",
228 |     "\n",
229 |     "The same way we did with the test data! For example:"
230 |    ]
231 |   },
232 |   {
233 |    "cell_type": "code",
234 |    "execution_count": 47,
235 |    "metadata": {
236 |     "collapsed": true
237 |    },
238 |    "outputs": [],
239 |    "source": [
240 |     "single_user = test.filter(test['userId']==11).select(['movieId','userId'])"
241 |    ]
242 |   },
243 |   {
244 |    "cell_type": "code",
245 |    "execution_count": 48,
246 |    "metadata": {},
247 |    "outputs": [
248 |     {
249 |      "name": "stdout",
250 |      "output_type": "stream",
251 |      "text": [
252 |       "+-------+------+\n",
253 |       "|movieId|userId|\n",
254 |       "+-------+------+\n",
255 |       "|      0|    11|\n",
256 |       "|     13|    11|\n",
257 |       "|     18|    11|\n",
258 |       "|     30|    11|\n",
259 |       "|     66|    11|\n",
260 |       "|     70|    11|\n",
261 |       "|     75|    11|\n",
262 |       "|     78|    11|\n",
263 |       "|     79|    11|\n",
264 |       "|     99|    11|\n",
265 |       "+-------+------+\n",
266 |       "\n"
267 |      ]
268 |     }
269 |    ],
270 |    "source": [
271 |     "# User had 10 ratings in the test data set \n",
272 |     "# Realistically this should be some sort of hold out set!\n",
273 |     "single_user.show()"
274 |    ]
275 |   },
276 |   {
277 |    "cell_type": "code",
278 |    "execution_count": 49,
279 |    "metadata": {
280 |     "collapsed": true
281 |    },
282 |    "outputs": [],
283 |    "source": [
284 |     "reccomendations = model.transform(single_user)"
285 |    ]
286 |   },
287 |   {
288 |    "cell_type": "code",
289 |    "execution_count": 54,
290 |    "metadata": {},
291 |    "outputs": [
292 |     {
293 |      "name": "stdout",
294 |      "output_type": "stream",
295 |      "text": [
296 |       "+-------+------+----------+\n",
297 |       "|movieId|userId|prediction|\n",
298 |       "+-------+------+----------+\n",
299 |       "|     30|    11|  5.578189|\n",
300 |       "|     13|    11|  3.257565|\n",
301 |       "|     70|    11| 2.7580981|\n",
302 |       "|     99|    11| 1.7420897|\n",
303 |       "|     18|    11| 1.5150304|\n",
304 |       "|     75|    11|   1.34218|\n",
305 |       "|     79|    11| 0.9733073|\n",
306 |       "|     66|    11| 0.5732717|\n",
307 |       "|     78|    11| 0.4434041|\n",
308 |       "|      0|    11|  -1.85298|\n",
309 |       "+-------+------+----------+\n",
310 |       "\n"
311 |      ]
312 |     }
313 |    ],
314 |    "source": [
315 |     "reccomendations.orderBy('prediction',ascending=False).show()"
316 |    ]
317 |   }
318 |  ],
319 |  "metadata": {
320 |   "anaconda-cloud": {},
321 |   "kernelspec": {
322 |    "display_name": "Python 3",
323 |    "language": "python",
324 |    "name": "python3"
325 |   },
326 |   "language_info": {
327 |    "codemirror_mode": {
328 |     "name": "ipython",
329 |     "version": 3
330 |    },
331 |    "file_extension": ".py",
332 |    "mimetype": "text/x-python",
333 |    "name": "python",
334 |    "nbconvert_exporter": "python",
335 |    "pygments_lexer": "ipython3",
336 |    "version": "3.8.3"
337 |   }
338 |  },
339 |  "nbformat": 4,
340 |  "nbformat_minor": 1
341 | }
342 | 


--------------------------------------------------------------------------------
/SQL_notebook.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/SQL_notebook.pdf


--------------------------------------------------------------------------------
/TweetRead.py:
--------------------------------------------------------------------------------
 1 | # import libraries
 2 | import tweepy
 3 | from tweepy import OAuthHandler
 4 | from tweepy import Stream
 5 | from tweepy.streaming import StreamListener
 6 | import socket
 7 | import json
 8 | 
 9 | #%%
10 | # Set up your credentials
11 | consumer_key=''
12 | consumer_secret=''
13 | access_token =''
14 | access_secret=''
15 | 
16 | class TweetsListener(StreamListener):
17 | 
18 |   def __init__(self, csocket):
19 |       self.client_socket = csocket
20 | 
21 |   def on_data(self, data):
22 |       try:
23 |           msg = json.loads( data )
24 |           print( msg['text'].encode('utf-8') )
25 |           self.client_socket.send( msg['text'].encode('utf-8') )
26 |           return True
27 |       except BaseException as e:
28 |           print("Error on_data: %s" % str(e))
29 |       return True
30 | 
31 |   def on_error(self, status):
32 |       print(status)
33 |       return True
34 | 
35 | def sendData(c_socket):
36 |   auth = OAuthHandler(consumer_key, consumer_secret)
37 |   auth.set_access_token(access_token, access_secret)
38 | 
39 |   twitter_stream = Stream(auth, TweetsListener(c_socket))
40 |   twitter_stream.filter(track=['soccer'])
41 | 
42 | if __name__ == "__main__":
43 |   s = socket.socket()            # Create a socket object
44 |   host = "127.0.0.1"             # Get local machine name
45 |   port = 5555                   # Reserve a port for your service.
46 |   s.bind((host, port))          # Bind to the port
47 | 
48 |   print("Listening on port: %s" % str(port))
49 | 
50 |   s.listen(5)                 # Now wait for client connection.
51 |   c, addr = s.accept()        # Establish connection with client.
52 | 
53 |   print( "Received request from: " + str( addr ) )
54 | 
55 |   sendData( c )
56 | 


--------------------------------------------------------------------------------
/big_data/SQL-in-Spark.py:
--------------------------------------------------------------------------------
 1 | # PySpark provides two main options when it comes to using staight SQL. Spark SQL and SQL Transformer.
 2 | # ## 1. Spark SQL
 3 | # Spark TempView provides two functions that allow users to run **SQL** queries against a Spark DataFrame:
 4 | #
 5 | #  - **createOrReplaceTempView:** The lifetime of this temporary view is tied to the SparkSession that was
 6 | #  used to create the dataset. It creates (or replaces if that view name already exists) a lazily evaluated
 7 | #  "view" that you can then use like a hive table in Spark SQL. It does not persist to memory unless you cache
 8 | #  the dataset that underpins the view.
 9 | #  - **createGlobalTempView:** The lifetime of this temporary view is tied to this Spark application.
10 | #  This feature is useful when you want to share data among different sessions and keep alive until your
11 | #  application ends.
12 | #
13 | 
14 | import pyspark # only run after findspark.init()
15 | from pyspark.sql import SparkSession
16 | # May take awhile locally
17 | spark = SparkSession.builder.appName("SparkSQL").getOrCreate()
18 | 
19 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
20 | print("You are working with", cores, "core(s)")
21 | spark
22 | 
23 | ##**Source:** https://www.kaggle.com/r3w0p4/recorded-crime-data-at-police-force-area-level
24 | # Start by reading a basic csv dataset
25 | path = 'Datasets/'
26 | crime = spark.read.csv(path+"rec-crime-pfa.csv",header=True,inferSchema=True)
27 | 
28 | # So, in order for us to perform SQL calls off of this dataframe, we will need to rename any variables
29 | # that have spaces in them. We will not be using the first variable so we'll leave that one as is,
30 | # but we will be using the last variable, so I will go ahead and change that to Count so we can work with it.
31 | 
32 | df = crime.withColumnRenamed('Rolling year total number of offences','Count')
33 | #.withColumn("12 months ending", crime["12 months ending"].cast(DateType())).
34 | print(df.printSchema())
35 | 
36 | 
37 | # Create a temporary view of the dataframe, it is like a hive table in Spark SQL
38 | df.createOrReplaceTempView("newtable")
39 | spark.sql("SELECT * FROM newtable WHERE Count > 1000").limit(5).toPandas()
40 | spark.sql("SELECT sum(Count) as total FROM newtable where Count between 1000 and 2000").show(5)
41 | spark.sql("SELECT Region, sum(Count) as total FROM newtable GROUP BY Region").show(5)
42 | 
43 | 


--------------------------------------------------------------------------------
/big_data/partition_parquet_file.py:
--------------------------------------------------------------------------------
 1 | ##
 2 | from pyspark.sql import SparkSession
 3 | 
 4 | 
 5 | spark = SparkSession.builder.appName("session").master("local").getOrCreate()
 6 | 
 7 | path = "Datasets/"
 8 | data = spark.read.csv(path+"pga_tour_historical.csv",inferSchema=True, header=True)
 9 | data.show(10)
10 | data.limit(10).toPandas()
11 | data.count()
12 | data.printSchema()
13 | data.describe
14 | # Generate summary statistics for TWO variables
15 | 
16 | data.select('Season', 'Value').summary("count","min",'max').show()
17 | 
18 | # Write a partioned parquet file
19 | ## Now try writing a parquet file (not partitioned) from the pga dataset. But first create a new dataframe containing
20 | # ONLY the the "Season" and "Value" fields (using the "select command you used in the question above) and write a parquet file
21 | # partitioned by "Season". This is a bit of a challenge aimed at getting you ready for material that will be covered later on
22 | # in the course. Don't feel bad if you can't figure it out.
23 | df = data.select('Season','Value')
24 | # it will create a directory named partition_parquet
25 | df.write.mode("overwrite").parquet("partition_parquet/", partitionBy='Season')
26 | # then partition parquet-data in that directory
27 | #df.write.mode("overwrite").partitionBy("Season").parquet("partition_parquet/")
28 | df.show(20)
29 | 
30 | # Now try reading in the partitioned parquet file you just created above.
31 | path_prq = 'partition_parquet/'
32 | parquet = spark.read.parquet(path_prq)
33 | parquet.show(20)
34 | df.printSchema()
35 | 
36 | # Reading in a set of paritioned parquet files
37 | # # Now try only reading Seasons 2010, 2011 and 2012.
38 | 
39 | partitioned = spark.read.parquet(path_prq+'Season=2010/',path_prq+'Season=2011/',
40 |                                  path_prq+'Season=2012/')
41 | 
42 | partitioned.show(10)
43 | # we need to use a method to get both season and value
44 | 
45 | partitioned = spark.read.option("basePath", path_prq).parquet(path_prq+'Season=2010/',path_prq+'Season=2011/',
46 |                                  path_prq+'Season=2012/')
47 | 
48 | partitioned.show(10)
49 | 


--------------------------------------------------------------------------------
/big_data/readme.md:
--------------------------------------------------------------------------------
1 | Documentation [https://spark.apache.org/docs/latest/sql-programming-guide.html]
2 | 


--------------------------------------------------------------------------------
/big_data/search_filter_dataframe.py:
--------------------------------------------------------------------------------
 1 | import pyspark
 2 | from pyspark.sql import SparkSession
 3 | # May take awhile locally
 4 | spark = SparkSession.builder.appName("FunctionsHW").getOrCreate()
 5 | 
 6 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
 7 | print("You are working with", cores, "core(s)")
 8 | spark
 9 | 
10 | fifa = spark.read.csv('Datasets/fifa19.csv',inferSchema=True,header=True)
11 | 
12 | print(fifa.printSchema())
13 | 
14 | from pyspark.sql.functions import *
15 | fifa.select(['Name','Position','Release Clause']).show(5,False)
16 | # Display the same results from above sorted by the players names
17 | fifa.select(['Name','Position']).orderBy('Name').show(5)
18 | fifa.select(['Name','Position','Age']).orderBy(fifa['Age'].desc()).show(5)
19 | 
20 | # Select only the players who belong to a club begining with FC
21 | # One way
22 | fifa.select("Name","Club").where(fifa.Club.like("FC%")).show(5, False)
23 | 
24 | # Another way
25 | fifa.select("Name","Club").where(fifa.Club.startswith("FC")).limit(4).toPandas()
26 | 
27 | ## ======================================================
28 | # to create a new dataframe
29 | df = fifa.limit(100)
30 | df.count()
31 | 
32 | # if we slice the colomns
33 | df2_col = fifa.columns[0:5]
34 | df2 = fifa.select(df2_col)
35 | df2.count()
36 | df2.show(5,False)
37 | # count the colomn 
38 | len(df2.columns)
39 | # ========================================================
40 | # Filtering data with condition
41 | # ========================================================
42 | 
43 | fifa.filter("Age>40").select(['Name','Age']).limit(4).toPandas()
44 | 


--------------------------------------------------------------------------------
/big_data/split-column.py:
--------------------------------------------------------------------------------
 1 | # PySpark split() Column into Multiple Columns
 2 | from sqlite3 import Timestamp
 3 | 
 4 | import pyspark
 5 | from pyspark.sql import SparkSession
 6 | # from pyspark.sql.functions import split
 7 | from pyspark.sql.functions import *
 8 | from pyspark.sql.types import StructType,StructField, StringType, IntegerType
 9 | spark=SparkSession.builder.appName("sparkbyexamples").getOrCreate()
10 | ##
11 | #
12 | # create a data which is a list
13 | data = [('James','','Smith','1991-04-01'),
14 |   ('Michael','Rose','','2000-05-19'),
15 |   ('Robert','','Williams','1978-09-05'),
16 |   ('Maria','Anne','Jones','1967-12-01'),
17 |   ('Jen','Mary','Brown','1980-02-17')
18 | ]
19 | 
20 | df0 = spark.createDataFrame(data, ["name", "midname", "surname", "dob"])
21 | 
22 | df0.printSchema()
23 | df0.show(truncate=False)
24 | """
25 | +-------+-------+--------+----------+
26 | |name   |midname|surname |dob       |
27 | +-------+-------+--------+----------+
28 | |James  |       |Smith   |1991-04-01|
29 | |Michael|Rose   |        |2000-05-19|
30 | |Robert |       |Williams|1978-09-05|
31 | |Maria  |Anne   |Jones   |1967-12-01|
32 | |Jen    |Mary   |Brown   |1980-02-17|
33 | +-------+-------+--------+----------+
34 | """
35 | #----------------------------
36 | # Below example creates a new Dataframe with Columns year, month, and the day after performing a split()
37 | # function on dob Column of string type.
38 | 
39 | df1 = df0.withColumn('year', split(df0['dob'], '-').getItem(0)) \
40 |        .withColumn('month', split(df0['dob'], '-').getItem(1)) \
41 |        .withColumn('day', split(df0['dob'], '-').getItem(2))
42 | df1.show(truncate=False)
43 | #---------
44 | split_col = pyspark.sql.functions.split(df0['dob'], '-')
45 | df2 = df0.withColumn('year', split_col.getItem(0)) \
46 |        .withColumn('month', split_col.getItem(1)) \
47 |        .withColumn('day', split_col.getItem(2))
48 | df2.show(truncate=False)
49 | 
50 | """
51 | +-------+-------+--------+----------+----+-----+---+
52 | |name   |midname|surname |dob       |year|month|day|
53 | +-------+-------+--------+----------+----+-----+---+
54 | |James  |       |Smith   |1991-04-01|1991|04   |01 |
55 | |Michael|Rose   |        |2000-05-19|2000|05   |19 |
56 | |Robert |       |Williams|1978-09-05|1978|09   |05 |
57 | |Maria  |Anne   |Jones   |1967-12-01|1967|12   |01 |
58 | |Jen    |Mary   |Brown   |1980-02-17|1980|02   |17 |
59 | +-------+-------+--------+----------+----+-----+---+
60 | """
61 | 
62 | # Using split() function of Column class
63 | split_col = pyspark.sql.functions.split(df0['dob'], '-')
64 | df3 = df0.select("firstname","middlename","lastname","dob", split_col.getItem(0).alias('year'),split_col.getItem(1).alias('month'),split_col.getItem(2).alias('day'))
65 | df3.show(truncate=False)
66 | 
67 | 
68 | 
69 | 
70 | 
71 | 
72 | 
73 | 
74 | 
75 | 


--------------------------------------------------------------------------------
/books/LearningSpark2.0.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/books/LearningSpark2.0.pdf


--------------------------------------------------------------------------------
/books/pyspark.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/books/pyspark.pdf


--------------------------------------------------------------------------------
/books/spark-hadoop.pdf:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/books/spark-hadoop.pdf


--------------------------------------------------------------------------------
/data/ContainsNull.csv:
--------------------------------------------------------------------------------
1 | Id,Name,Sales
2 | emp1,John,
3 | emp2,,
4 | emp3,,345.0
5 | emp4,Cindy,456.0
6 | 


--------------------------------------------------------------------------------
/data/cruise_ship_info.csv:
--------------------------------------------------------------------------------
  1 | Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew
  2 | Journey,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55
  3 | Quest,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55
  4 | Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7
  5 | Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1
  6 | Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0
  7 | Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2
  8 | Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2
  9 | Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2
 10 | Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2
 11 | Freedom,Carnival,6,110.23899999999999,37.0,9.51,14.87,29.79,11.5
 12 | Glory,Carnival,10,110.0,29.74,9.51,14.87,36.99,11.6
 13 | Holiday,Carnival,28,46.052,14.52,7.27,7.26,31.72,6.6
 14 | Imagination,Carnival,18,70.367,20.52,8.55,10.2,34.29,9.2
 15 | Inspiration,Carnival,17,70.367,20.52,8.55,10.2,34.29,9.2
 16 | Legend,Carnival,11,86.0,21.24,9.63,10.62,40.49,9.3
 17 | Liberty*,Carnival,8,110.0,29.74,9.51,14.87,36.99,11.6
 18 | Miracle,Carnival,9,88.5,21.24,9.63,10.62,41.67,10.3
 19 | Paradise,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2
 20 | Pride,Carnival,12,88.5,21.24,9.63,11.62,41.67,9.3
 21 | Sensation,Carnival,20,70.367,20.52,8.55,10.2,34.29,9.2
 22 | Spirit,Carnival,12,88.5,21.24,9.63,10.56,41.67,10.29
 23 | Triumph,Carnival,14,101.509,27.58,8.93,13.21,36.81,10.0
 24 | Valor,Carnival,9,110.0,29.74,9.52,14.87,36.99,11.6
 25 | Victory,Carnival,13,101.509,27.58,8.93,13.79,36.81,11.5
 26 | Century,Celebrity,18,70.60600000000001,17.7,8.15,8.75,39.89,8.58
 27 | Constellation,Celebrity,11,91.0,20.32,9.65,9.75,44.78,9.99
 28 | Galaxy,Celebrity,17,77.71300000000001,18.9,8.66,9.35,41.12,9.09
 29 | Infinity,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99
 30 | Mercury,Celebrity,16,77.71300000000001,18.82,8.66,9.35,41.29,9.09
 31 | Millenium,Celebrity,13,91.0,20.32,9.65,9.75,44.78,9.99
 32 | Solstice,Celebrity,5,122.0,28.5,10.33,6.87,34.57,6.7
 33 | Summit,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99
 34 | Xpedition,Celebrity,12,2.329,0.94,2.96,0.45,24.78,0.6
 35 | Zenith,Celebrity,21,47.225,13.66,6.82,6.87,34.57,6.7
 36 | Allegra,Costa,21,28.43,8.08,6.16,4.1,35.19,4.0
 37 | Atlantica,Costa,13,85.619,21.14,9.57,10.56,40.5,9.2
 38 | Classica,Costa,22,52.926,13.02,7.18,6.54,40.65,6.17
 39 | Europa,Costa,27,53.872,14.94,7.98,7.67,36.06,6.36
 40 | Fortuna,Costa,10,105.0,27.2,8.9,13.56,38.6,10.68
 41 | Magica,Costa,9,105.0,27.2,8.9,13.56,38.6,10.68
 42 | Marina,Costa,23,25.0,7.76,6.22,3.86,32.22,3.85
 43 | Mediterranea,Costa,10,86.0,21.14,9.6,10.56,40.68,9.2
 44 | Romantica,Costa,20,53.049,13.44,7.22,6.78,39.47,6.0
 45 | Serena,Costa,6,112.0,38.0,9.51,15.0,29.47,10.9
 46 | Victoria,Costa,17,75.166,19.28,8.28,9.64,38.99,7.66
 47 | Serenity,Crystal,10,68.0,10.8,7.9,5.5,62.96,6.36
 48 | Symphony,Crystal,18,51.004,9.4,7.81,4.8,54.26,5.45
 49 | QueenElizabethII,Cunard,44,70.327,17.91,9.63,9.5,39.27,9.21
 50 | QueenMary2,Cunard,10,151.4,26.2,11.32,11.34,57.79,12.53
 51 | QueenVictoria,Cunard,6,90.0,20.0,9.64,10.29,45.0,9.0
 52 | Magic,Disney,15,83.338,17.5,9.64,8.75,47.62,9.45
 53 | Wonder,Disney,14,83.0,17.5,9.64,8.75,47.43,9.45
 54 | Amsterdam,Holland_American,13,61.0,13.8,7.8,6.88,44.2,6.0
 55 | Eurodam,Holland_American,5,86.0,21.04,9.36,10.22,40.87,8.0
 56 | Maasdam,Holland_American,20,55.451,12.64,7.19,6.32,43.87,5.57
 57 | Noordam,Holland_American,29,33.92,12.14,7.04,6.07,27.94,5.3
 58 | Oosterdam,Holland_American,10,81.76899999999999,18.48,9.59,9.24,44.25,8.42
 59 | Prinsendam,Holland_American,25,38.0,7.49,6.74,3.96,50.73,4.6
 60 | Rotterdam,Holland_American,16,59.652,13.2,7.77,6.6,45.19,6.44
 61 | Ryndam,Holland_American,19,55.451,12.66,7.19,6.33,43.8,5.88
 62 | Statendam,Holland_American,20,55.451,12.66,7.19,6.33,43.8,5.88
 63 | Veendam,Holland_American,17,55.451,12.66,7.19,6.33,43.8,5.88
 64 | Volendam,Holland_American,14,63.0,14.4,7.77,7.2,43.75,5.61
 65 | Westerdam,Holland_American,27,53.872,14.94,7.98,7.47,36.06,6.12
 66 | Zaandam,Holland_American,13,63.0,14.4,7.77,7.2,43.75,5.31
 67 | Zuiderdam,Holland_American,11,85.0,18.48,9.51,9.24,46.0,8.0
 68 | Armonia,MSC,12,58.6,15.66,8.24,7.83,37.42,7.0
 69 | Fantasia,MSC,5,133.5,39.59,10.93,16.37,33.72,13.13
 70 | Lirica,MSC,10,58.825,15.6,8.23,7.65,37.71,7.0
 71 | Melody,MSC,31,35.143,12.5,6.69,5.32,28.11,5.35
 72 | Musica,MSC,7,89.6,25.5,9.61,12.75,35.14,9.87
 73 | Opera,MSC,9,59.058,17.0,7.63,8.5,34.74,7.4
 74 | Rhapsody,MSC,36,16.852,9.52,5.41,3.83,17.7,2.97
 75 | Sinfonia,MSC,11,58.6,15.66,8.23,7.83,37.42,7.6
 76 | Crown,Norwegian,25,34.25,10.52,6.15,5.26,32.56,4.7
 77 | Dawn,Norwegian,11,90.0,22.4,9.65,11.2,40.18,11.0
 78 | Dream,Norwegian,21,50.76,17.48,7.54,8.74,29.04,6.14
 79 | Gem,Norwegian,6,93.0,23.94,9.65,11.97,38.85,11.09
 80 | Jewel,Norwegian,8,91.0,22.44,9.65,11.22,40.55,11.0
 81 | Majesty,Norwegian,21,38.0,10.56,5.67,5.28,35.98,4.38
 82 | PrideofAloha,Norwegian,14,77.104,20.02,8.53,10.01,38.51,8.0
 83 | PrideofAmerica,Norwegian,9,81.0,21.44,9.21,10.72,37.78,10.0
 84 | Sea,Norwegian,25,42.0,15.04,7.08,7.52,27.93,6.3
 85 | Spirit,Norwegian,15,75.33800000000001,19.56,8.79,9.83,38.52,13.0
 86 | Star,Norwegian,40,28.0,11.5,6.74,4.0,24.35,3.8
 87 | Sun,Norwegian,12,77.104,20.02,8.53,10.01,38.51,9.59
 88 | Wind,Norwegian,20,50.76,17.48,7.54,8.74,29.04,6.14
 89 | Insignia,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0
 90 | Nautica,Oceania,13,30.276999999999997,6.84,5.94,3.42,44.26,4.0
 91 | Regatta,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0
 92 | MarcoPolo,Orient,48,22.08,8.26,5.78,4.25,26.73,3.5
 93 | Arcadia,P&O,9,85.0,19.68,9.35,9.84,43.19,8.69
 94 | Artemis,P&O,29,45.0,11.78,7.54,5.3,38.2,5.2
 95 | Aurora,P&O,13,76.0,18.74,8.86,9.39,40.55,8.5
 96 | Oceana,P&O,10,77.0,20.16,8.56,9.75,38.19,9.0
 97 | Oriana,P&O,18,69.153,18.82,8.53,9.14,36.74,7.94
 98 | Ventura,P&O,5,115.0,35.74,9.0,15.32,32.18,12.2
 99 | Caribbean,Princess,9,116.0,26.0,9.51,13.0,44.62,11.0
100 | Coral,Princess,11,91.62700000000001,19.74,9.64,9.87,46.42,9.0
101 | Crown,Princess,7,116.0,31.0,9.51,15.57,37.42,12.0
102 | Dawn,Princess,16,77.499,19.5,8.56,10.5,39.74,9.0
103 | Diamond,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38
104 | Emerald,Princess,6,113.0,37.82,9.51,15.57,29.88,12.0
105 | Golden,Princess,12,108.865,27.58,9.51,13.0,39.47,11.0
106 | Grand,Princess,15,108.806,26.0,9.51,13.0,41.85,11.1
107 | Island,Princess,10,91.62700000000001,19.74,9.64,9.87,46.42,9.0
108 | Pacific,Princess,14,30.276999999999997,6.86,5.93,3.44,44.14,3.73
109 | Regal,Princess,22,69.845,15.9,8.03,7.95,43.93,6.96
110 | Royal,Princess,29,44.348,12.0,7.54,6.0,36.96,5.2
111 | Saphire,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38
112 | Sea,Princess,8,77.499,19.5,8.56,9.75,39.74,9.0
113 | Star,Princess,11,108.977,26.02,9.51,13.01,41.88,12.0
114 | Sun,Princess,18,77.499,19.5,8.56,9.75,39.74,9.0
115 | Tahitian,Princess,14,30.276999999999997,6.88,5.93,3.44,44.01,3.73
116 | ExplorerII,Regent_Seven_Seas,27,12.5,3.94,4.36,0.88,31.73,1.46
117 | Mariner,Regent_Seven_Seas,12,50.0,7.0,7.09,3.54,71.43,4.45
118 | Navigator,Regent_Seven_Seas,14,33.0,4.9,5.6,2.45,67.35,3.24
119 | PaulGauguin,Regent_Seven_Seas,16,19.2,3.2,5.13,1.6,60.0,2.11
120 | Voyager,Regent_Seven_Seas,10,46.0,7.0,6.7,1.82,65.71,4.47
121 | Adventure,Royal_Caribbean,12,138.0,31.14,10.2,15.57,44.32,11.85
122 | Brilliance,Royal_Caribbean,11,90.09,25.01,9.62,10.5,36.02,8.48
123 | Empress,Royal_Caribbean,23,48.563,20.2,6.92,8.0,24.04,6.71
124 | Enchantment,Royal_Caribbean,16,74.137,19.5,9.16,9.75,38.02,7.6
125 | Explorer,Royal_Caribbean,13,138.0,31.14,10.2,15.57,44.32,11.76
126 | Freedom,Royal_Caribbean,7,158.0,43.7,11.12,18.0,36.16,13.6
127 | Grandeur,Royal_Caribbean,17,74.137,19.5,9.16,9.75,38.02,7.6
128 | Independence,Royal_Caribbean,5,160.0,36.34,11.12,18.17,44.03,13.6
129 | Jewel,Royal_Caribbean,9,90.09,25.01,9.62,10.94,36.02,8.69
130 | Legend,Royal_Caribbean,18,70.0,18.0,8.67,9.0,38.89,7.2
131 | Liberty,Royal_Caribbean,6,158.0,43.7,11.25,18.0,36.16,13.6
132 | Majesty,Royal_Caribbean,21,73.941,27.44,8.8,11.75,26.95,8.22
133 | Mariner,Royal_Caribbean,10,138.0,31.14,10.2,15.57,44.32,11.85
134 | Monarch,Royal_Caribbean,22,73.941,27.44,8.8,11.77,30.94,8.22
135 | Navigator,Royal_Caribbean,11,138.0,31.14,10.2,15.57,44.32,11.85
136 | Oasis,Royal_Caribbean,4,220.0,54.0,11.82,27.0,40.74,21.0
137 | Radiance,Royal_Caribbean,12,90.09,25.01,9.62,10.5,36.02,8.68
138 | Rhapsody,Royal_Caribbean,16,78.491,24.35,9.15,10.0,32.23,7.65
139 | Serenade,Royal_Caribbean,10,90.09,25.01,9.62,10.5,36.02,8.58
140 | Sovreign,Royal_Caribbean,25,73.192,28.52,8.8,11.38,25.66,8.08
141 | Splendour,Royal_Caribbean,17,70.0,20.76,8.67,9.02,33.72,7.2
142 | Vision,Royal_Caribbean,15,78.491,24.35,9.15,10.0,32.23,6.6
143 | Voyager,Royal_Caribbean,14,138.0,31.14,10.2,15.57,44.32,11.76
144 | Legend,Seabourn,21,10.0,2.08,4.4,1.04,48.08,1.6
145 | Pride,Seabourn,27,10.0,2.08,4.4,1.04,48.08,1.6
146 | Spirit,Seabourn,24,10.0,2.08,4.4,1.04,48.08,1.6
147 | Cloud,Silversea,19,16.8,2.96,5.14,1.48,56.76,2.1
148 | Shadow,Silversea,13,25.0,3.82,5.97,1.94,65.45,2.95
149 | Whisper,Silversea,12,25.0,3.88,5.97,1.94,64.43,2.87
150 | Wind,Silversea,19,16.8,2.96,5.14,1.48,56.76,1.97
151 | Aries,Star,22,3.341,0.66,2.8,0.33,50.62,0.59
152 | Gemini,Star,21,19.093,8.0,5.37,4.0,23.87,4.7
153 | Libra,Star,12,42.0,14.8,7.13,7.4,28.38,6.8
154 | Pisces,Star,24,40.053000000000004,12.87,5.79,7.76,31.12,7.5
155 | Taurus,Star,22,3.341,0.66,2.79,0.33,50.62,0.59
156 | Virgo,Star,14,76.8,19.6,8.79,9.67,39.18,12.0
157 | Spirit,Windstar,25,5.35,1.58,4.4,0.74,33.86,0.88
158 | Star,Windstar,27,5.35,1.67,4.4,0.74,32.04,0.88
159 | Surf,Windstar,23,14.745,3.08,6.17,1.56,47.87,1.8
160 | 


--------------------------------------------------------------------------------
/data/sales_info.csv:
--------------------------------------------------------------------------------
 1 | Company,Person,Sales
 2 | GOOG,Sam,200
 3 | GOOG,Charlie,120
 4 | GOOG,Frank,340
 5 | MSFT,Tina,600
 6 | MSFT,Amy,124
 7 | MSFT,Vanessa,243
 8 | FB,Carl,870
 9 | FB,Sarah,350
10 | APPL,John,250
11 | APPL,Linda, 130
12 | APPL,Mike, 750
13 | APPL, Chris, 350


--------------------------------------------------------------------------------
/data/users1.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/data/users1.parquet


--------------------------------------------------------------------------------
/data/users2.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/data/users2.parquet


--------------------------------------------------------------------------------
/data/users3.parquet:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/data/users3.parquet


--------------------------------------------------------------------------------
/scripts/aggrigating-data-in-DataFrame.py:
--------------------------------------------------------------------------------
 1 | """
 2 | - Group by
 3 | - Pivot
 4 | - Aggregate method
 5 | - Combos of each
 6 | """
 7 | #from pyspark.sql.functions import mean
 8 | 
 9 | import pyspark
10 | from pyspark.sql import SparkSession
11 | 
12 | spark = SparkSession.builder.appName('aggrigation').getOrCreate()
13 | spark
14 | df0 = spark.read.csv("Datasets/nyc_air_bnb.csv", header=True, inferSchema=True)
15 | df0.printSchema()
16 | df0.show(4)
17 | 
18 | from pyspark.sql.types import *
19 | from pyspark.sql.functions import *
20 | 
21 | df = df0.withColumn("price", df0.price.cast(IntegerType()))
22 | df.printSchema()
23 | 
24 | df = df.withColumn("minimum_nights", df.minimum_nights.cast(IntegerType())) \
25 |     .withColumn("number_of_reviews", df.number_of_reviews.cast(IntegerType())) \
26 |     .withColumn("reviews_per_month", df.reviews_per_month.cast(IntegerType())) \
27 |     .withColumn("calculated_host_listings_count", df.calculated_host_listings_count.cast(IntegerType())) \
28 |     .withColumn("last_review", df.last_review.cast('date'))
29 | 
30 | df.printSchema()
31 | df.show(4, False)
32 | 
33 | # GROUP BY
34 | df.groupBy("neighbourhood_group").min().show(5)
35 | df.summary().show(5)
36 | df.summary("min","max","count").show(5)
37 | df.select('price', 'minimum_nights').summary('min','max','count', 'mean').show(5)
38 | ##
39 | df.select(countDistinct("neighbourhood_group"), mean('price'), max('price')).show(5)
40 | 
41 | df.groupBy('room_type').pivot("neighbourhood_group", ["Queens", "Brooklyn"]). count().show(5)
42 | 


--------------------------------------------------------------------------------
/scripts/join-append-DataFrame.py:
--------------------------------------------------------------------------------
 1 | """
 2 | - Appending Table
 3 | - Joining Tables
 4 | """
 5 | 
 6 | import pyspark
 7 | from pyspark.sql import SparkSession
 8 | 
 9 | spark = SparkSession.builder.appName('jointables').getOrCreate()
10 | spark
11 | ##
12 | valuesP = [('koala',1,'yes'),('caterpillar',2,'yes'),('deer',3,'yes'),('human',4,'yes')]
13 | plants = spark.createDataFrame(valuesP,['name','id','eats_plants'])
14 | 
15 | valuesM = [('shark',5,'yes'),('lion',6,'yes'),('tiger',7,'yes'),('human',4,'yes')]
16 | meat = spark.createDataFrame(valuesM,['name','id','eats_meat'])
17 | ##
18 | print("Plant eaters (herbivores)")
19 | print(plants.show())
20 | print("Meat eaters (carnivores)")
21 | print(meat.show())
22 | 
23 | # ---------------
24 | 
25 | innerjoinDF = plants.join(meat, on = ['name', 'id'], how='inner')
26 | innerjoinDF.show()
27 | 
28 | leftjoinDF = plants.join(meat, on = 'name', how='left')
29 | leftjoinDF.show()
30 | 
31 | rightjoinDF = plants.join(meat, on = 'name', how='right')
32 | rightjoinDF.show()
33 | 
34 | # to exclude a value from a join table
35 | rightjoinDF = plants.join(meat, on = 'name', how='right').filter(plants.name.isNotNull())
36 | rightjoinDF.show()
37 | 
38 | # FULL outer join
39 | fulljoinDF = plants.join(meat, on = 'name', how='full')
40 | fulljoinDF.show()
41 | 
42 | ##
43 | import os
44 | """
45 | #  - **course_offerings:** uuid, course_uuid, term_code, name
46 | #  - **instructors:** id, name
47 | #  - **sections:** uuid, course_offering_uuid,room_uuid, schedule_uuid
48 | #  - **teachings:** instructor_id, section_uuid
49 | #  
50 | #  **Source:** https://www.kaggle.com/Madgrades/uw-madison-course
51 | """
52 | path = "Datasets/uw-madison-courses/"
53 | 
54 | df_list = []
55 | for filename in os.listdir(path):
56 |     if filename.endswith(".csv"):
57 |         filename_list = filename.split(".")  # separate path from .csv
58 |         df_name = filename_list[0]
59 |         df = spark.read.csv(path + filename, inferSchema=True, header=True)
60 |         df.name = df_name
61 |         df_list.append(df_name)
62 |         exec(df_name + ' = df')
63 | ##
64 | #
65 | print("Full list of dfs:")
66 | print(df_list)
67 | 
68 | rooms.show()
69 | sections.show(4)
70 | 
71 | step1 = teachings.join(instructors, teachings.instructor_id == instructors.id, how='left').select(['instructor_id','name','section_uuid'])
72 | step1.limit(4).show(5)
73 | 
74 | step2 = step1.join(sections, step1.section_uuid == sections.uuid, how='left').select(['name','course_offering_uuid'])
75 | step2.limit(4).show()
76 | 
77 | step3 = step2.withColumnRenamed('name', 'instructor').join(course_offerings, step2.course_offering_uuid == course_offerings.uuid, how='inner').select(['instructor','name','course_offering_uuid'])
78 | step3.show(4)


--------------------------------------------------------------------------------
/scripts/join_tabales.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |  "cells": [
  3 |   {
  4 |    "cell_type": "code",
  5 |    "execution_count": 1,
  6 |    "id": "b039fa64",
  7 |    "metadata": {},
  8 |    "outputs": [],
  9 |    "source": [
 10 |     "from pyspark import SparkContext"
 11 |    ]
 12 |   },
 13 |   {
 14 |    "cell_type": "code",
 15 |    "execution_count": 4,
 16 |    "id": "379fcd52",
 17 |    "metadata": {},
 18 |    "outputs": [
 19 |     {
 20 |      "data": {
 21 |       "text/html": [
 22 |        "\n",
 23 |        "            <div>\n",
 24 |        "                <p><b>SparkSession - in-memory</b></p>\n",
 25 |        "                \n",
 26 |        "        <div>\n",
 27 |        "            <p><b>SparkContext</b></p>\n",
 28 |        "\n",
 29 |        "            <p><a href=\"http://ber-2420.fritz.box:4040\">Spark UI</a></p>\n",
 30 |        "\n",
 31 |        "            <dl>\n",
 32 |        "              <dt>Version</dt>\n",
 33 |        "                <dd><code>v3.0.3</code></dd>\n",
 34 |        "              <dt>Master</dt>\n",
 35 |        "                <dd><code>local[*]</code></dd>\n",
 36 |        "              <dt>AppName</dt>\n",
 37 |        "                <dd><code>jointables</code></dd>\n",
 38 |        "            </dl>\n",
 39 |        "        </div>\n",
 40 |        "        \n",
 41 |        "            </div>\n",
 42 |        "        "
 43 |       ],
 44 |       "text/plain": [
 45 |        "<pyspark.sql.session.SparkSession at 0x11f217a30>"
 46 |       ]
 47 |      },
 48 |      "execution_count": 4,
 49 |      "metadata": {},
 50 |      "output_type": "execute_result"
 51 |     }
 52 |    ],
 53 |    "source": [
 54 |     "import pyspark\n",
 55 |     "from pyspark.sql import SparkSession\n",
 56 |     "\n",
 57 |     "spark = SparkSession.builder.appName('jointables').getOrCreate()\n",
 58 |     "spark"
 59 |    ]
 60 |   },
 61 |   {
 62 |    "cell_type": "code",
 63 |    "execution_count": 5,
 64 |    "id": "4abf5bb7",
 65 |    "metadata": {},
 66 |    "outputs": [
 67 |     {
 68 |      "name": "stdout",
 69 |      "output_type": "stream",
 70 |      "text": [
 71 |       "root\n",
 72 |       " |-- emp_id: long (nullable = true)\n",
 73 |       " |-- name: string (nullable = true)\n",
 74 |       " |-- superior_emp_id: long (nullable = true)\n",
 75 |       " |-- year_joined: string (nullable = true)\n",
 76 |       " |-- emp_dept_id: string (nullable = true)\n",
 77 |       " |-- gender: string (nullable = true)\n",
 78 |       " |-- salary: long (nullable = true)\n",
 79 |       "\n",
 80 |       "+------+--------+---------------+-----------+-----------+------+------+\n",
 81 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|\n",
 82 |       "+------+--------+---------------+-----------+-----------+------+------+\n",
 83 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |\n",
 84 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |\n",
 85 |       "|3     |Williams|1              |2010       |10         |M     |1000  |\n",
 86 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |\n",
 87 |       "|5     |Brown   |2              |2010       |40         |      |-1    |\n",
 88 |       "|6     |Brown   |2              |2010       |50         |      |-1    |\n",
 89 |       "+------+--------+---------------+-----------+-----------+------+------+\n",
 90 |       "\n"
 91 |      ]
 92 |     }
 93 |    ],
 94 |    "source": [
 95 |     "emp = [(1,\"Smith\",-1,\"2018\",\"10\",\"M\",3000), \\\n",
 96 |     "    (2,\"Rose\",1,\"2010\",\"20\",\"M\",4000), \\\n",
 97 |     "    (3,\"Williams\",1,\"2010\",\"10\",\"M\",1000), \\\n",
 98 |     "    (4,\"Jones\",2,\"2005\",\"10\",\"F\",2000), \\\n",
 99 |     "    (5,\"Brown\",2,\"2010\",\"40\",\"\",-1), \\\n",
100 |     "      (6,\"Brown\",2,\"2010\",\"50\",\"\",-1) \\\n",
101 |     "  ]\n",
102 |     "empColumns = [\"emp_id\",\"name\",\"superior_emp_id\",\"year_joined\", \\\n",
103 |     "       \"emp_dept_id\",\"gender\",\"salary\"]\n",
104 |     "\n",
105 |     "empDF = spark.createDataFrame(data=emp, schema = empColumns)\n",
106 |     "empDF.printSchema()\n",
107 |     "empDF.show(truncate=False)"
108 |    ]
109 |   },
110 |   {
111 |    "cell_type": "code",
112 |    "execution_count": 6,
113 |    "id": "59f0d2ec",
114 |    "metadata": {},
115 |    "outputs": [],
116 |    "source": [
117 |     "dept = [(\"Finance\",10, \"Bill\"), \\\n",
118 |     "    (\"Marketing\",20, \"Joe\"), \\\n",
119 |     "    (\"Sales\",30, \"Smith\"), \\\n",
120 |     "    (\"IT\",40, \"Brown\") \\\n",
121 |     "  ]"
122 |    ]
123 |   },
124 |   {
125 |    "cell_type": "code",
126 |    "execution_count": 7,
127 |    "id": "9ed307c0",
128 |    "metadata": {},
129 |    "outputs": [
130 |     {
131 |      "name": "stdout",
132 |      "output_type": "stream",
133 |      "text": [
134 |       "root\n",
135 |       " |-- dept_name: string (nullable = true)\n",
136 |       " |-- dept_id: long (nullable = true)\n",
137 |       " |-- emp_name: string (nullable = true)\n",
138 |       "\n",
139 |       "+---------+-------+--------+\n",
140 |       "|dept_name|dept_id|emp_name|\n",
141 |       "+---------+-------+--------+\n",
142 |       "|Finance  |10     |Bill    |\n",
143 |       "|Marketing|20     |Joe     |\n",
144 |       "|Sales    |30     |Smith   |\n",
145 |       "|IT       |40     |Brown   |\n",
146 |       "+---------+-------+--------+\n",
147 |       "\n"
148 |      ]
149 |     }
150 |    ],
151 |    "source": [
152 |     "deptColumns = [\"dept_name\",\"dept_id\",\"emp_name\"]\n",
153 |     "deptDF = spark.createDataFrame(data=dept, schema = deptColumns)\n",
154 |     "deptDF.printSchema()\n",
155 |     "deptDF.show(truncate=False)"
156 |    ]
157 |   },
158 |   {
159 |    "cell_type": "code",
160 |    "execution_count": 8,
161 |    "id": "67e9358b",
162 |    "metadata": {},
163 |    "outputs": [
164 |     {
165 |      "name": "stdout",
166 |      "output_type": "stream",
167 |      "text": [
168 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
169 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
170 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
171 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
172 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
173 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
174 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
175 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
176 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
177 |       "\n"
178 |      ]
179 |     }
180 |    ],
181 |    "source": [
182 |     "empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,\"inner\").show(truncate=False)\n"
183 |    ]
184 |   },
185 |   {
186 |    "cell_type": "code",
187 |    "execution_count": 9,
188 |    "id": "7c98d2fe",
189 |    "metadata": {},
190 |    "outputs": [
191 |     {
192 |      "name": "stdout",
193 |      "output_type": "stream",
194 |      "text": [
195 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
196 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
197 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
198 |       "|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |\n",
199 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
200 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
201 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
202 |       "|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |Smith   |\n",
203 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
204 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
205 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
206 |       "\n",
207 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
208 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
209 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
210 |       "|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |\n",
211 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
212 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
213 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
214 |       "|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |Smith   |\n",
215 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
216 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
217 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
218 |       "\n",
219 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
220 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
221 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
222 |       "|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |\n",
223 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
224 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
225 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
226 |       "|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |Smith   |\n",
227 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
228 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
229 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
230 |       "\n"
231 |      ]
232 |     }
233 |    ],
234 |    "source": [
235 |     "# Full Outer Join\n",
236 |     "\n",
237 |     "empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,\"outer\").show(truncate=False)\n",
238 |     "empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,\"full\").show(truncate=False)\n",
239 |     "empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,\"fullouter\").show(truncate=False)\n"
240 |    ]
241 |   },
242 |   {
243 |    "cell_type": "code",
244 |    "execution_count": 10,
245 |    "id": "58fb5363",
246 |    "metadata": {},
247 |    "outputs": [
248 |     {
249 |      "name": "stdout",
250 |      "output_type": "stream",
251 |      "text": [
252 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
253 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
254 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
255 |       "|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |\n",
256 |       "|5     |Brown   |2              |2010       |40         |      |-1    |null     |null   |null    |\n",
257 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |null     |null   |null    |\n",
258 |       "|3     |Williams|1              |2010       |10         |M     |1000  |null     |null   |null    |\n",
259 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |null     |null   |null    |\n",
260 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |null     |null   |null    |\n",
261 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
262 |       "\n",
263 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
264 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
265 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
266 |       "|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |\n",
267 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
268 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
269 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
270 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
271 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
272 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
273 |       "\n"
274 |      ]
275 |     }
276 |    ],
277 |    "source": [
278 |     "#  Left Outer Join\n",
279 |     "# vaghti column moshtarak nadashte bashan (emp_id, dept_id)\n",
280 |     "empDF.join(deptDF, empDF.emp_id == deptDF.dept_id, \"left\") \\\n",
281 |     "    .show(truncate = False)\n",
282 |     "\n",
283 |     "# rooye column moshtarak \n",
284 |     "empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, \"left\") \\\n",
285 |     "    .show(truncate = False)\n"
286 |    ]
287 |   },
288 |   {
289 |    "cell_type": "code",
290 |    "execution_count": 11,
291 |    "id": "ff31efe2",
292 |    "metadata": {},
293 |    "outputs": [
294 |     {
295 |      "name": "stdout",
296 |      "output_type": "stream",
297 |      "text": [
298 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
299 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
300 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
301 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |null     |null   |null    |\n",
302 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |null     |null   |null    |\n",
303 |       "|3     |Williams|1              |2010       |10         |M     |1000  |null     |null   |null    |\n",
304 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
305 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |null     |null   |null    |\n",
306 |       "|6     |Brown   |2              |2010       |50         |      |-1    |null     |null   |null    |\n",
307 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
308 |       "\n"
309 |      ]
310 |     }
311 |    ],
312 |    "source": [
313 |     "leftjoin = empDF.join(deptDF, (empDF.emp_dept_id == deptDF.dept_id) & (empDF.name == deptDF.emp_name), \"left\") \n",
314 |     "leftjoin.show(truncate= False)"
315 |    ]
316 |   },
317 |   {
318 |    "cell_type": "code",
319 |    "execution_count": 12,
320 |    "id": "c7966499",
321 |    "metadata": {},
322 |    "outputs": [
323 |     {
324 |      "name": "stdout",
325 |      "output_type": "stream",
326 |      "text": [
327 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
328 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
329 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
330 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
331 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
332 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
333 |       "|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |Smith   |\n",
334 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
335 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
336 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
337 |       "\n",
338 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
339 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
340 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
341 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
342 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
343 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
344 |       "|null  |null    |null           |null       |null       |null  |null  |Sales    |30     |Smith   |\n",
345 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
346 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
347 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
348 |       "\n"
349 |      ]
350 |     }
351 |    ],
352 |    "source": [
353 |     "# Right Outer Join\n",
354 |     "\n",
355 |     "\n",
356 |     "empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,\"right\") \\\n",
357 |     "   .show(truncate=False)\n",
358 |     "empDF.join(deptDF,empDF.emp_dept_id ==  deptDF.dept_id,\"rightouter\") \\\n",
359 |     "   .show(truncate=False)\n"
360 |    ]
361 |   },
362 |   {
363 |    "cell_type": "markdown",
364 |    "id": "bda5eff2",
365 |    "metadata": {},
366 |    "source": [
367 |     "## Using SQL Expression\n",
368 |     "\n",
369 |     "Since PySpark SQL support native SQL syntax, we can also write join operations after creating temporary tables on DataFrames and use these tables on `spark.sql()`."
370 |    ]
371 |   },
372 |   {
373 |    "cell_type": "code",
374 |    "execution_count": 13,
375 |    "id": "a2f9c926",
376 |    "metadata": {},
377 |    "outputs": [],
378 |    "source": [
379 |     "empDF.createOrReplaceTempView(\"EMP\")\n",
380 |     "deptDF.createOrReplaceTempView(\"DEPT\")"
381 |    ]
382 |   },
383 |   {
384 |    "cell_type": "code",
385 |    "execution_count": 14,
386 |    "id": "7dfe1542",
387 |    "metadata": {},
388 |    "outputs": [],
389 |    "source": [
390 |     "joinDF = spark.sql(\"select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id\") \n",
391 |     "\n",
392 |     "joinDF2 = spark.sql(\"select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id\")"
393 |    ]
394 |   },
395 |   {
396 |    "cell_type": "code",
397 |    "execution_count": 15,
398 |    "id": "027fec39",
399 |    "metadata": {},
400 |    "outputs": [
401 |     {
402 |      "name": "stdout",
403 |      "output_type": "stream",
404 |      "text": [
405 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
406 |       "|emp_id|name    |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n",
407 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
408 |       "|1     |Smith   |-1             |2018       |10         |M     |3000  |Finance  |10     |Bill    |\n",
409 |       "|3     |Williams|1              |2010       |10         |M     |1000  |Finance  |10     |Bill    |\n",
410 |       "|4     |Jones   |2              |2005       |10         |F     |2000  |Finance  |10     |Bill    |\n",
411 |       "|2     |Rose    |1              |2010       |20         |M     |4000  |Marketing|20     |Joe     |\n",
412 |       "|5     |Brown   |2              |2010       |40         |      |-1    |IT       |40     |Brown   |\n",
413 |       "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n",
414 |       "\n"
415 |      ]
416 |     }
417 |    ],
418 |    "source": [
419 |     "joinDF.show(truncate=False)"
420 |    ]
421 |   },
422 |   {
423 |    "cell_type": "code",
424 |    "execution_count": null,
425 |    "id": "572fe1b8",
426 |    "metadata": {},
427 |    "outputs": [],
428 |    "source": []
429 |   }
430 |  ],
431 |  "metadata": {
432 |   "kernelspec": {
433 |    "display_name": "Python 3 (ipykernel)",
434 |    "language": "python",
435 |    "name": "python3"
436 |   },
437 |   "language_info": {
438 |    "codemirror_mode": {
439 |     "name": "ipython",
440 |     "version": 3
441 |    },
442 |    "file_extension": ".py",
443 |    "mimetype": "text/x-python",
444 |    "name": "python",
445 |    "nbconvert_exporter": "python",
446 |    "pygments_lexer": "ipython3",
447 |    "version": "3.9.7"
448 |   }
449 |  },
450 |  "nbformat": 4,
451 |  "nbformat_minor": 5
452 | }
453 | 


--------------------------------------------------------------------------------
/scripts/manipulating-data-in-DataFrame.py:
--------------------------------------------------------------------------------
  1 | """
  2 | - changing data types when they are incorrectly interpretted
  3 | - Clean your data
  4 | - create new columns
  5 | - rename columns
  6 | - extract or create new value
  7 | """
  8 | 
  9 | import os
 10 | import pyspark
 11 | from pyspark.sql import SparkSession
 12 | from pyspark.sql.types import *
 13 | from pyspark.sql.functions import *
 14 | from pyspark.sql.functions import col, lit, to_date, trim, lower
 15 | 
 16 | # create a session
 17 | spark = SparkSession.builder.appName("manipulatedata").getOrCreate()
 18 | spark
 19 | 
 20 | ##
 21 | # Trending YouTube Video Statistics, https://www.kaggle.com/datasnaek/youtube-new
 22 | path = "Datasets/"
 23 | videos = spark.read.csv(path+"youtubevideos.csv", header=True, inferSchema=True)
 24 | len(videos.columns)
 25 | videos.limit(10).show()
 26 | videos.printSchema()
 27 | videos.select("publish_time").show(5, False)
 28 | # publish_time, its schema needs to be modified. 2017-11-13T17:13:01.000Z
 29 | 
 30 | """PySpark withColumn() is a transformation function of DataFrame which is used to change the value,
 31 |  convert the datatype of an existing column, create a new column, and many more
 32 | 
 33 | - PySpark withColumn – To change column DataType
 34 | - Transform/change value of an existing column
 35 | - Derive new column from an existing column
 36 | - Add a column with the literal value
 37 | - Rename column name
 38 | - Drop DataFrame column
 39 | 
 40 | here are the subclasses of the DataType in PySpark and we can change or cast DataFrame columns to *only* these types.
 41 | 
 42 | ArrayType, BinaryType, BooleanType, CalendarIntervalType, DateType, HiveStringType, MapType, NullType, 
 43 | NumericType, ObjectType, StringType, StructType, TimestampType
 44 | 
 45 | Syntax: to_date(timestamp_column,format)
 46 | """
 47 | 
 48 | # create new data frame from videos DataFrame
 49 | 
 50 | # change type of views column
 51 | #df = videos.withColumn("views", videos["views"].cast(IntegerType()))
 52 | df = videos.withColumn("views", col("views").cast(IntegerType()))  \
 53 |      .withColumn("likes", videos.likes.cast(IntegerType()))  \
 54 |      .withColumn("dislikes",videos.dislikes.cast(IntegerType()))  \
 55 |      .withColumn("trending_date", to_date(videos.trending_date,'yy.dd.mm'))  \
 56 |      .withColumn("publish_time", to_timestamp(videos.publish_time, 'yyyy-MM-dd HH:mm:ss'))
 57 | 
 58 | df.describe()
 59 | df.limit(4).show()
 60 | df.printSchema()
 61 | ## NOW, we face some problems here:
 62 | #1) pyspark infer trendin_date incorrectly 2017-01-14, so how to fix it???
 63 | #2) publish time is null now !!!!!!!! and it is because of funky TZ in the original format 2017-11-13T17:13:01.000Z
 64 | 
 65 | df = videos.withColumn("views", col("views").cast(IntegerType()))  \
 66 |      .withColumn("likes", videos.likes.cast(IntegerType()))  \
 67 |      .withColumn("dislikes",videos.dislikes.cast(IntegerType()))  \
 68 |      .withColumn("trending_date", to_date(videos.trending_date,'yy.dd.mm'))  \
 69 | #     .withColumn("publish_time", to_timestamp(videos.publish_time, 'yyyy-MM-dd HH:mm:ss'))
 70 | # create a new column
 71 | df = df.withColumn('publish_time_2', regexp_replace(df.publish_time, "T", " "))
 72 | # same for Z and replace it
 73 | df = df.withColumn("publish_time_2", regexp_replace(df.publish_time_2, 'Z', ''))
 74 | df.select('publish_time', 'publish_time_2').show(4, False)
 75 | df.printSchema()
 76 | # So now we can transform it to timestamp
 77 | df = df.withColumn('publish_time_3', to_timestamp(df.publish_time_2, 'yyyy-MM-dd HH:mm:ss.SSS'))
 78 | df.printSchema()
 79 | df.show(4)
 80 | # rename the colomn name
 81 | #renamed_df = df.withColumnRenamed("newname", "publish_time_3")  # 2017-11-13 17:13:01 the 000 just took that off, thats OK
 82 | 
 83 | 
 84 | # TRANSLATE function, alternative way
 85 | # NOte, here i am not creating object i m just showing
 86 | df.select('publish_time', translate(col('publish_time'), "TZ", " ").alias('trans_col')).show(4, False)
 87 | 
 88 | # -------------------------------------------------------------
 89 | # TRIM()
 90 | df = df.withColumn('title', trim(df.title))
 91 | df.select('title').show(4, False)
 92 | df = df.withColumn("title", lower(df.title))
 93 | df.select('title').show(4, False)
 94 | #--------------------------------------------------------------
 95 | # case WHEN
 96 | # option 1, when-otherwise
 97 | # option 2 expr
 98 | def CASE(args):
 99 |      pass
100 | 
101 | 
102 | df.select("likes", "dislikes", expr("CASE WHEN likes > dislikes THEN 'Good mvie' "
103 |                                     "WHEN likes < dislikes THEN 'Bad movies'  "
104 |                                     "ELSE 'undetermined' END AS Favarability")).show(4)
105 | 
106 | 
107 | df.selectExpr("likes", "dislikes", "CASE WHEN likes > dislikes THEN 'Good mvie' "
108 |                                     "WHEN likes < dislikes THEN 'Bad movies'  "
109 |                                     "ELSE 'undetermined' END AS Favarability").show(4)
110 | 
111 | # ---------------------------------------------------------------
112 | # concatinate
113 | # Joining two columns for NLP, and added
114 | df = df.withColumn("title_channel", concat_ws(' ', df.title, df.channel_title))
115 | df.show(4)
116 | df.printSchema()
117 | 
118 | df.select("trending_date", year("trending_date"), month("trending_date")).show(4)
119 | 
120 | array = df.select('title', split(df.title, ' ').alias('new'))
121 | array.show(4, False) # it return an array [we, want, to, talk, about, our, marriage],
122 | array.select('title', array_contains(array.new, 'about')).show(4)
123 | array.printSchema()
124 | array.show(4, False)
125 | 
126 | # array_remove()
127 | ##
128 | """
129 | Pyspark, User Defined Functions
130 | """
131 | #PySpark UDF’s are similar to UDF on traditional databases. In PySpark, you create a function in a
132 | # Python syntax and wrap it with PySpark SQL udf() or register it as udf and use it on DataFrame and SQL respectively
133 | 
134 | from pyspark.sql.functions import udf
135 | from pyspark.sql.types import IntegerType
136 | 
137 | def squar(x):
138 |     return int(x)**2
139 | 
140 | square_udf = udf(lambda z: squar(z), IntegerType())
141 | df.select('dislikes',square_udf('dislikes')).where(df.dislikes.isNotNull())
142 | df.select('dislikes',square_udf('dislikes')).where(df.dislikes.isNotNull()).show(4)


--------------------------------------------------------------------------------
/scripts/multiple_csv_to_dataframe.py:
--------------------------------------------------------------------------------
 1 | # read multiple .csv file and create a dataframe
 2 | 
 3 | import os
 4 | """
 5 | #  - **course_offerings:** uuid, course_uuid, term_code, name
 6 | #  - **instructors:** id, name
 7 | #  - **sections:** uuid, course_offering_uuid,room_uuid, schedule_uuid
 8 | #  - **teachings:** instructor_id, section_uuid
 9 | #  
10 | #  **Source:** https://www.kaggle.com/Madgrades/uw-madison-course
11 | """
12 | path = "Datasets/uw-madison-courses/"
13 | 
14 | df_list = []
15 | for filename in os.listdir(path):
16 |     if filename.endswith(".csv"):
17 |         filename_list = filename.split(".")  # separate path from .csv
18 |         df_name = filename_list[0]
19 |         df = spark.read.csv(path + filename, inferSchema=True, header=True)
20 |         df.name = df_name
21 |         df_list.append(df_name)
22 |         exec(df_name + ' = df')
23 | 
24 | 


--------------------------------------------------------------------------------
/scripts/pivote-table.py:
--------------------------------------------------------------------------------
 1 | import pyspark
 2 | from pyspark.sql import SparkSession
 3 | from pyspark.sql.functions import expr
 4 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
 5 | 
 6 | data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \
 7 |       ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \
 8 |       ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \
 9 |       ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")]
10 | 
11 | columns= ["Product","Amount","Country"]
12 | spark = SparkSession.builder.appName("tabale").getOrCreate()
13 | df = spark.createDataFrame(data = data, schema = columns)
14 | df.printSchema()
15 | df.show(truncate=False)
16 | """
17 | Product|Amount|Country|
18 | +-------+------+-------+
19 | |Banana |1000  |USA    |
20 | |Carrots|1500  |USA    |
21 | |Beans  |1600  |USA    |
22 | |Orange |2000  |USA    |
23 | |Orange |2000  |USA    |
24 | |Banana |400   |China  |
25 | |Carrots|1200  |China  |
26 | |Beans  |1500  |China  |
27 | |Orange |4000  |China  |
28 | |Banana |2000  |Canada |
29 | |Carrots|2000  |Canada |
30 | |Beans  |2000  |Mexico |
31 | +-------+------+-------+
32 | """
33 | 
34 | # PySpark SQL provides pivot() function to rotate the data from one column into multiple columns.
35 | # It is an aggregation where one of the grouping columns values transposed into individual columns with distinct data.
36 | # To get the total amount exported to each country of each product, will do group by Product,
37 | # pivot by Country, and the sum of Amount.
38 | pivotDF = df.groupBy("Product").pivot("Country").sum("Amount")
39 | pivotDF.printSchema()
40 | pivotDF.show(truncate=False)
41 | """
42 | This will transpose the countries from DataFrame rows into columns and produces below output. where ever 
43 | data is not present, it represents as null by default.
44 | 
45 | +-------+------+-----+------+----+
46 | |Product|Canada|China|Mexico|USA |
47 | +-------+------+-----+------+----+
48 | |Orange |null  |4000 |null  |4000|
49 | |Beans  |null  |1500 |2000  |1600|
50 | |Banana |2000  |400  |null  |1000|
51 | |Carrots|2000  |1200 |null  |1500|
52 | +-------+------+-----+------+----+
53 | """


--------------------------------------------------------------------------------
/scripts/pyspark-dataframe.py:
--------------------------------------------------------------------------------
 1 | import pyspark
 2 | from pyspark.sql import SparkSession
 3 | from pyspark.sql.functions import col, lit
 4 | from pyspark.sql.types import StructType, StructField, StringType,IntegerType
 5 | 
 6 | # PySpark applications start with initializing SparkSession which is the entry point of PySpark as below. 
 7 | #In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users
 8 | 
 9 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
10 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
11 | print("You are working with", cores, "core(s)")
12 | 
13 | print(spark)
14 | 
15 | 


--------------------------------------------------------------------------------
/scripts/read_write_DataFrame.py:
--------------------------------------------------------------------------------
  1 | ## read csv file
  2 | from pyspark.sql import SparkSession
  3 | from pprint import pprint
  4 | from pyspark.sql import SparkSession
  5 | from pyspark.sql.types import StructType,StructField, StringType, IntegerType
  6 | from pyspark.sql.types import ArrayType, DoubleType, BooleanType
  7 | from pyspark.sql.functions import array_contains
  8 | 
  9 | 
 10 | spark = SparkSession.builder.appName('readdata').master('local').getOrCreate
 11 | # create a session and name it spark
 12 | pprint(spark)
 13 | 
 14 | ##  ------------------------------------
 15 | #       read csv format
 16 | # PySpark supports reading a CSV file with a pipe, comma, tab, space, or any other delimiter/separator files.
 17 | # Using csv("path") or format("csv").load("path") of DataFrameReader, you can read a CSV file into a PySpark DataFrame.
 18 | ## -------------------------------------
 19 | # file is located in a folder
 20 | data_path = "Datasets/"
 21 | students = spark.read.csv(data_path+'students.csv',header=True, inferSchema= True)
 22 | pprint(students)
 23 | students.printSchema()
 24 | """
 25 | root
 26 |  |-- gender: string (nullable = true)
 27 |  |-- race/ethnicity: string (nullable = true)
 28 |  |-- parental level of education: string (nullable = true)
 29 |  |-- lunch: string (nullable = true)
 30 |  |-- test preparation course: string (nullable = true)
 31 |  |-- math score: integer (nullable = true)
 32 |  |-- reading score: integer (nullable = true)
 33 |  |-- writing score: integer (nullable = true)
 34 | 
 35 | """
 36 | # OR
 37 | 
 38 | df = spark.read.format('csv').load(data_path+'students.csv')
 39 | 
 40 | pprint(df)
 41 | df.printSchema()
 42 | # this reads the data into DataFrame columns "_c0" for the first column and "_c1" for the second and so on
 43 | """
 44 | root
 45 |  |-- _c0: string (nullable = true)
 46 |  |-- _c1: string (nullable = true)
 47 |  |-- _c2: string (nullable = true)
 48 |  |-- _c3: string (nullable = true)
 49 |  |-- _c4: string (nullable = true)
 50 |  |-- _c5: string (nullable = true)
 51 |  |-- _c6: string (nullable = true)
 52 |  |-- _c7: string (nullable = true)
 53 | """
 54 | ## --------------------------------------
 55 | #          read parquet format
 56 | # _______________________________________
 57 | user1 = spark.read.parquet(data_path+'user*',header=True,inferSchema=True)
 58 | user1.show(4)
 59 | user1.count()
 60 | 
 61 | ## ---------------------------------------
 62 | #  Reading CSV files with a user-specified custom schema
 63 | # If you know the schema of the file ahead and do not want to use the inferSchema option for column names and types,
 64 | # use user-defined custom column names and type using schema option
 65 | # Refer dataset https://github.com/spark-examples/pyspark-examples/blob/master/resources/zipcodes.csv
 66 | # lets try some stuff here
 67 | df = spark.read.csv("resources/zipcodes.csv")
 68 | df.printSchema()
 69 | """
 70 | root
 71 |  |-- _c0: string (nullable = true)
 72 |  |-- _c1: string (nullable = true)
 73 |  |-- _c2: string (nullable = true) ....
 74 |   """
 75 | # there is one option here
 76 | df2 = spark.read.option("header", True).csv("resources/zipcodes.csv")
 77 | df2.printSchema()
 78 | """
 79 | root
 80 |  |-- RecordNumber: string (nullable = true)
 81 |  |-- Zipcode: string (nullable = true)
 82 |  |-- ZipCodeType: string (nullable = true) ...
 83 |  """
 84 | # lets add another option
 85 | df3 = spark.read.options(header = True, delimiter = ',').csv("resources/zipcodes.csv")
 86 | df3.printSchema()
 87 | df3.show(4)
 88 | 
 89 | # The schema does not look correct. lets change the data type
 90 | 
 91 | schema = StructType() \
 92 |       .add("RecordNumber",IntegerType(),True) \
 93 |       .add("Zipcode",IntegerType(),True) \
 94 |       .add("ZipCodeType",StringType(),True) \
 95 |       .add("City",StringType(),True) \
 96 |       .add("State",StringType(),True) \
 97 |       .add("LocationType",StringType(),True) \
 98 |       .add("Lat",DoubleType(),True) \
 99 |       .add("Long",DoubleType(),True) \
100 |       .add("Xaxis",IntegerType(),True) \
101 |       .add("Yaxis",DoubleType(),True) \
102 |       .add("Zaxis",DoubleType(),True) \
103 |       .add("WorldRegion",StringType(),True) \
104 |       .add("Country",StringType(),True) \
105 |       .add("LocationText",StringType(),True) \
106 |       .add("Location",StringType(),True) \
107 |       .add("Decommisioned",BooleanType(),True) \
108 |       .add("TaxReturnsFiled",StringType(),True) \
109 |       .add("EstimatedPopulation",IntegerType(),True) \
110 |       .add("TotalWages",IntegerType(),True) \
111 |       .add("Notes",StringType(),True)
112 | 
113 | df_with_schema = spark.read.option("header", True).format("csv").schema(schema).load("resources/zipcodes.csv")
114 | df_with_schema.printSchema()
115 | """
116 | root
117 |  |-- RecordNumber: integer (nullable = true)
118 |  |-- Zipcode: integer (nullable = true)
119 |  |-- ZipCodeType: string (nullable = true)
120 |  |-- City: string (nullable = true) ..."""
121 | 
122 | ## df3.write.mode('overwrite').csv('zip.csv')
123 | ## ===========================================
124 | # so
125 | path = "path_to_data"
126 | # CSV
127 | df = spark.read.csv(path+'students.csv',inferSchema=True,header=True)
128 | 
129 | # Json
130 | people = spark.read.json(path+'people.json')
131 | 
132 | # Parquet
133 | parquet = spark.read.parquet(path+'users.parquet')
134 | 
135 | # Partioned Parquet
136 | partitioned = spark.read.parquet(path+'users*')
137 | 
138 | # Parts of a partitioned Parquet
139 | users1_2 = spark.read.option("basePath", path).parquet(path+'users1.parquet', path+'users2.parquet')
140 | 
141 | #====================================================================================================================
142 | #                     Applying DataFrame transformations
143 | 
144 | # Once you have created DataFrame from the CSV file, you can apply all transformation and actions DataFrame support
145 | # --------------------------------------------------------
146 | #        Write PySpark DataFrame to CSV file
147 | # --------------------------------------------------------
148 | # Use the write() method of the PySpark DataFrameWriter object to write PySpark DataFrame to a CSV fil
149 | #  Options:  While writing a CSV file you can use several options. for example, header to output the DataFrame column
150 | #  names as header record and delimiter to specify the delimiter on the CSV output file.
151 | # Other options available quote,escape,nullValue,dateFormat,quoteMode
152 | 
153 | 
154 | df3.write.mode('overwrite').csv("spark_output/zipcodes")   # overwrite – mode is used to overwrite the existing file
155 | # one can also use this
156 | df2.write.format("csv").mode('overwrite').save("output/zipcodes")


--------------------------------------------------------------------------------
/scripts/sample_data/data:
--------------------------------------------------------------------------------
1 | 
2 | 


--------------------------------------------------------------------------------
/scripts/sample_data/data.txt:
--------------------------------------------------------------------------------
1 | Project Gutenberg’s
2 | Alice’s Adventures in Wonderland
3 | Project Gutenberg’s
4 | Adventures in Wonderland
5 | Project Gutenberg’s


--------------------------------------------------------------------------------
/scripts/search-filter-DataFrame.py:
--------------------------------------------------------------------------------
 1 | ##
 2 | import pyspark
 3 | from pyspark.sql import SparkSession
 4 | from pyspark.sql.functions import *
 5 | from pprint import pprint
 6 | # May take awhile locally
 7 | spark = SparkSession.builder.appName("FunctionsHW").getOrCreate()
 8 | 
 9 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
10 | print("You are working with", cores, "core(s)")
11 | spark
12 | ##
13 | df = spark.read.csv('Datasets/df19.csv',inferSchema=True,header=True)
14 | print(df.printSchema())
15 | df.limit(4).toPandas()
16 | df.show(4, truncate = True)
17 | len(df.columns) # 89
18 | df.describe
19 | 
20 | df.select(['Aggression', 'Stamina']).show(5)
21 | df.select(['Aggression', 'Stamina']).summary('count','min').show()
22 | df.select(['Name', 'Age']).orderBy(df['Age'].desc()).show(5)
23 | df.select(['*']).show(5)
24 | 
25 | ## filtering data horizontally WHERE condition
26 | df.select(["Name", "age", "Club"]).show(5)
27 | df.select(["Name", "age", "Club"]).where(df.Club.like("%celon%")).show(5)
28 | 
29 | # ------------------------------
30 | # SELECT SUBSTRING ('what a wonderful DAY' from 2 for 6); -- hat a
31 | df.select("Photo",df.Photo.substr(-4,5).alias('the last 4 charachter')).show(5)
32 | # ISIN(list)
33 | df['Name','club','Nationality'].filter("Club IN ('FC Barcelona')").limit(4).toPandas()
34 | 
35 | df.select('Name').where(df.Name.startswith("L")).show(5)
36 | df.select('Name', 'Club').where(df.Name.startswith("L")).where(df.Name.endswith('i')).where(df.Club.like('%Barcelona')).show(5)
37 | 
38 | ## SLICING DataFrame, take n number of rows
39 | df.count()
40 | df1 = df.limit(100)
41 | df1.show(5, True).toPandas()
42 | 
43 | # SLICING, take n number of colomns
44 | df2 = df.select('Name', 'Club', "Nationality")
45 | # OR
46 | df_sel_col = df.select(df.columns[0:5])
47 | #
48 | df2.limit(5).show()
49 | len(df2.columns)   #3
50 | df2.count()        #18207
51 | len(df_sel_col.columns)   #5
52 | df_sel_col.limit(5).show()
53 | 
54 | df.printSchema()
55 | df['Name', 'Weight'].filter("Overall>50").limit(4).show()
56 | df['Name', 'Weight'].limit(4).show()
57 | df.select(['Name','Position','Release Clause']).show(5,False)
58 | # Display the same results from above sorted by the players names
59 | df.select(['Name','Position']).orderBy('Name').show(5)
60 | df.select(['Name','Position','Age']).orderBy(df['Age'].desc()).show(5)
61 | 
62 | # Select only the players who belong to a club begining with FC
63 | # One way
64 | df.select("Name","Club").where(df.Club.like("FC%")).show(5, False)
65 | 
66 | # Another way
67 | df.select("Name","Club").where(df.Club.startswith("FC")).limit(4).toPandas()
68 | 
69 | ## ======================================================
70 | # to create a new dataframe
71 | df = df.limit(100)
72 | df.count()
73 | 
74 | # if we slice the colomns
75 | df2_col = df.columns[0:5]
76 | df2 = df.select(df2_col)
77 | df2.count()
78 | df2.show(5,False)
79 | # count the colomn 
80 | len(df2.columns)
81 | 
82 | df.filter("Age>40").select(['Name','Age']).limit(4).toPandas()
83 | #   COLLECTING RESULTS AS OBJECTS ----> .COLLECT() method, it will collect results as a python object
84 | df4 = df.select('Name', 'Club').where(df.Name.startswith("L")).collect()    #object is list
85 | df4.toPandas()
86 | type(df4[0])   # 'pyspark.sql.types.Row'
87 | print("Name start with L:",df4[1][1])


--------------------------------------------------------------------------------
/scripts/update-column-DataFrame.py:
--------------------------------------------------------------------------------
 1 | from pyspark.sql import SparkSession
 2 | 
 3 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
 4 | 
 5 | data = [('James','Smith','M',3000),('Anna','Rose','F',4100),
 6 |   ('Robert','Williams','NA',6200),(None,'Rob','F',6200)
 7 |   
 8 | ]
 9 | 
10 | columns = ["firstname","lastname","gender","salary"]
11 | df = spark.createDataFrame(data=data, schema = columns)
12 | df.show()
13 | 
14 | df2=df.withColumn("salary", df.salary*3)
15 | df2.show()
16 | #-----------------------------------------------------------
17 | from pyspark.sql.functions import when
18 | df3 = df.withColumn("gender", when(df.gender == "M","Male") \
19 |       .when(df.gender == "F","Female") \
20 |       .otherwise(df.gender))
21 | df3.show()
22 | #------------------------------------------------------------
23 | 
24 | df4=df.withColumn("salary",df.salary.cast("String"))
25 | df4.printSchema()
26 | 
27 | df.createOrReplaceTempView("PER")
28 | df5=spark.sql("select firstname,gender,salary*3 as salary from PER")
29 | df5.show()
30 | 


--------------------------------------------------------------------------------
/scripts/user-defined-function.py:
--------------------------------------------------------------------------------
  1 | import pyspark
  2 | from pyspark.sql import SparkSession
  3 | from pyspark.sql.functions import col, udf
  4 | from pyspark.sql.types import StringType
  5 | 
  6 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate()
  7 | 
  8 | columns = ["Seqno","Name"]
  9 | data = [("1", "john jones"),
 10 |     ("2", "tracey smith"),
 11 |     ("3", "amy sanders")]
 12 | 
 13 | """
 14 | +-----+------------+
 15 | |Seqno|Names       |
 16 | +-----+------------+
 17 | |1    |john jones  |
 18 | |2    |tracey smith|
 19 | |3    |amy sanders |
 20 | +-----+------------+
 21 | """
 22 | 
 23 | df = spark.createDataFrame(data=data,schema=columns)
 24 | 
 25 | df.show(truncate=False)
 26 | # Create a Python Function
 27 | #The first step in creating a UDF is creating a Python function. Below snippet creates a function convertCase() 
 28 | # which takes a string parameter and converts the first letter of every word to capital letter
 29 | def convertCase(str):
 30 |     resStr=""
 31 |     arr = str.split(" ")
 32 |     for x in arr:
 33 |        resStr= resStr + x[0:1].upper() + x[1:len(x)] + " "
 34 |     return resStr 
 35 | 
 36 | # Converting function to UDF
 37 | # Now convert this function convertCase() to UDF by passing the function to PySpark SQL udf(), this function is available at
 38 | #  org.apache.spark.sql.functions.udf package. Make sure you import this package before using it
 39 | convertUDF = udf(lambda z: convertCase(z), StringType)
 40 | # Note: The default type of the udf() is StringType hence, you can also write the above statement without return type
 41 | #  Now you can use convertUDF() on a DataFrame column as a regular build-in function.
 42 | df.select(col("Seqno"), \
 43 |     convertUDF(col("Name")).alias("Name") ) \
 44 | .show(truncate=False)
 45 | """
 46 | +-----+-------------+
 47 | |Seqno|Name         |
 48 | +-----+-------------+
 49 | |1    |John Jones   |
 50 | |2    |Tracey Smith |
 51 | |3    |Amy Sanders  |
 52 | +-----+-------------+
 53 | """
 54 | # Using UDF with PySpark DataFrame withColumn()
 55 | @udf(returnType=StringType()) 
 56 | def upperCase(str):
 57 |     return str.upper()
 58 | 
 59 | upperCaseUDF = udf(lambda z:upperCase(z),StringType())    
 60 | 
 61 | df.withColumn("Cureated Name", upperCase(col("Name"))) \
 62 | .show(truncate=False)
 63 | 
 64 | """ Using UDF on SQL """
 65 | # Registering PySpark UDF & use it on SQL
 66 | # In order to use convertCase() function on PySpark SQL, you need to register the function with PySpark by using spark.udf.register()
 67 | spark.udf.register("convertUDF", convertCase,StringType())
 68 | df.createOrReplaceTempView("NAME_TABLE")
 69 | spark.sql("select Seqno, convertUDF(Name) as Name from NAME_TABLE") \
 70 |      .show(truncate=False)
 71 |      
 72 | spark.sql("select Seqno, convertUDF(Name) as Name from NAME_TABLE " + \
 73 |           "where Name is not null and convertUDF(Name) like '%John%'") \
 74 |      .show(truncate=False)  
 75 |      
 76 | """ null check """
 77 | # UDF’s are error-prone when not designed carefully. for example, when you have a column that contains the value null on some records
 78 | columns = ["Seqno","Name"]
 79 | data = [("1", "john jones"),
 80 |     ("2", "tracey smith"),
 81 |     ("3", "amy sanders"),
 82 |     ('4',None)]
 83 | 
 84 | df2 = spark.createDataFrame(data=data,schema=columns)
 85 | df2.show(truncate=False)
 86 | df2.createOrReplaceTempView("NAME_TABLE2")
 87 |     
 88 | spark.udf.register("_nullsafeUDF", lambda str: convertCase(str) if not str is None else "" , StringType())
 89 | 
 90 | spark.sql("select _nullsafeUDF(Name) from NAME_TABLE2") \
 91 |      .show(truncate=False)
 92 | 
 93 | spark.sql("select Seqno, _nullsafeUDF(Name) as Name from NAME_TABLE2 " + \
 94 |           " where Name is not null and _nullsafeUDF(Name) like '%John%'") \
 95 |      .show(truncate=False)  
 96 | 
 97 | 
 98 | 
 99 |  
100 | 
101 | 


--------------------------------------------------------------------------------
/search_filter_dataframe.py:
--------------------------------------------------------------------------------
 1 | import pyspark
 2 | from pyspark.sql import SparkSession
 3 | from pyspark.sql.functions import *
 4 | # May take awhile locally
 5 | spark = SparkSession.builder.appName("FunctionsHW").getOrCreate()
 6 | 
 7 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size()
 8 | print("You are working with", cores, "core(s)")
 9 | spark
10 | 
11 | fifa = spark.read.csv('Datasets/fifa19.csv',inferSchema=True,header=True)
12 | print(fifa.printSchema())
13 | 
14 | 
15 | fifa.select(['Name','Position','Release Clause']).show(5,False)
16 | # Display the same results from above sorted by the players names
17 | fifa.select(['Name','Position']).orderBy('Name').show(5)
18 | fifa.select(['Name','Position','Age']).orderBy(fifa['Age'].desc()).show(5)
19 | 
20 | # Select only the players who belong to a club begining with FC
21 | # One way
22 | fifa.select("Name","Club").where(fifa.Club.like("FC%")).show(5, False)
23 | 
24 | # Another way
25 | fifa.select("Name","Club").where(fifa.Club.startswith("FC")).limit(4).toPandas()
26 | 
27 | ## ======================================================
28 | # to create a new dataframe
29 | df = fifa.limit(100)
30 | df.count()
31 | 
32 | # if we slice the colomns
33 | df2_col = fifa.columns[0:5]
34 | df2 = fifa.select(df2_col)
35 | df2.count()
36 | df2.show(5,False)
37 | # count the colomn 
38 | len(df2.columns)
39 | # =[=======================================================
40 | # Filtering data
41 | # ========================================================
42 | 
43 | fifa.filter("Age>40").select(['Name','Age']).limit(4).toPandas()
44 | 


--------------------------------------------------------------------------------
/spark-env.yml:
--------------------------------------------------------------------------------
  1 | name: spark
  2 | channels:
  3 |   - conda-forge
  4 |   - defaults
  5 | dependencies:
  6 |   - _libgcc_mutex=0.1=main
  7 |   - argon2-cffi=20.1.0=py37h7b6447c_1
  8 |   - async_generator=1.10=py37h28b3542_0
  9 |   - attrs=20.3.0=pyhd3eb1b0_0
 10 |   - autopep8=1.5.4=py_0
 11 |   - backcall=0.2.0=py_0
 12 |   - bleach=3.2.1=py_0
 13 |   - ca-certificates=2020.12.8=h06a4308_0
 14 |   - certifi=2020.12.5=py37h06a4308_0
 15 |   - cffi=1.14.4=py37h261ae71_0
 16 |   - decorator=4.4.2=py_0
 17 |   - defusedxml=0.6.0=py_0
 18 |   - entrypoints=0.3=py37_0
 19 |   - icu=58.2=hf484d3e_1000
 20 |   - importlib-metadata=2.0.0=py_1
 21 |   - importlib_metadata=2.0.0=1
 22 |   - ipykernel=5.3.4=py37h5ca1d4c_0
 23 |   - ipython=7.19.0=py37hb070fc8_0
 24 |   - ipython_genutils=0.2.0=pyhd3eb1b0_1
 25 |   - jedi=0.18.0=py37h06a4308_0
 26 |   - jinja2=2.11.2=py_0
 27 |   - jsonschema=3.2.0=py_2
 28 |   - jupyter_client=6.1.7=py_0
 29 |   - jupyter_contrib_core=0.3.3=py_2
 30 |   - jupyter_contrib_nbextensions=0.5.1=py37hc8dfbb8_1
 31 |   - jupyter_core=4.7.0=py37h06a4308_0
 32 |   - jupyter_highlight_selected_word=0.2.0=py37hc8dfbb8_1002
 33 |   - jupyter_latex_envs=1.4.6=py37hc8dfbb8_1001
 34 |   - jupyter_nbextensions_configurator=0.4.1=py37hc8dfbb8_2
 35 |   - jupyterlab_pygments=0.1.2=py_0
 36 |   - ld_impl_linux-64=2.33.1=h53a641e_7
 37 |   - libedit=3.1.20191231=h14c3975_1
 38 |   - libffi=3.3=he6710b0_2
 39 |   - libgcc-ng=9.1.0=hdf63c60_0
 40 |   - libsodium=1.0.18=h7b6447c_0
 41 |   - libstdcxx-ng=9.1.0=hdf63c60_0
 42 |   - libxml2=2.9.10=hb55368b_3
 43 |   - libxslt=1.1.34=hc22bd24_0
 44 |   - lxml=4.6.2=py37h9120a33_0
 45 |   - markupsafe=1.1.1=py37h14c3975_1
 46 |   - mistune=0.8.4=py37h14c3975_1001
 47 |   - nbclient=0.5.1=py_0
 48 |   - nbconvert=6.0.7=py37_0
 49 |   - nbformat=5.0.8=py_0
 50 |   - ncurses=6.2=he6710b0_1
 51 |   - nest-asyncio=1.4.3=pyhd3eb1b0_0
 52 |   - notebook=6.1.6=py37h06a4308_0
 53 |   - openssl=1.1.1i=h27cfd23_0
 54 |   - packaging=20.8=pyhd3eb1b0_0
 55 |   - pandoc=2.11=hb0f4dca_0
 56 |   - pandocfilters=1.4.3=py37h06a4308_1
 57 |   - pexpect=4.8.0=pyhd3eb1b0_3
 58 |   - pickleshare=0.7.5=pyhd3eb1b0_1003
 59 |   - pip=20.3.3=py37h06a4308_0
 60 |   - prometheus_client=0.9.0=pyhd3eb1b0_0
 61 |   - prompt-toolkit=3.0.8=py_0
 62 |   - ptyprocess=0.7.0=pyhd3eb1b0_2
 63 |   - pycodestyle=2.6.0=py_0
 64 |   - pycparser=2.20=py_2
 65 |   - pygments=2.7.3=pyhd3eb1b0_0
 66 |   - pyparsing=2.4.7=py_0
 67 |   - pyrsistent=0.17.3=py37h7b6447c_0
 68 |   - python=3.7.9=h7579374_0
 69 |   - python-dateutil=2.8.1=py_0
 70 |   - python_abi=3.7=1_cp37m
 71 |   - pyzmq=20.0.0=py37h2531618_1
 72 |   - readline=8.0=h7b6447c_0
 73 |   - send2trash=1.5.0=pyhd3eb1b0_1
 74 |   - setuptools=51.0.0=py37h06a4308_2
 75 |   - six=1.15.0=py37h06a4308_0
 76 |   - sqlite=3.33.0=h62c20be_0
 77 |   - terminado=0.9.2=py37h06a4308_0
 78 |   - testpath=0.4.4=py_0
 79 |   - tk=8.6.10=hbc83047_0
 80 |   - toml=0.10.1=py_0
 81 |   - tornado=6.1=py37h27cfd23_0
 82 |   - traitlets=5.0.5=py_0
 83 |   - wcwidth=0.2.5=py_0
 84 |   - webencodings=0.5.1=py37_1
 85 |   - wheel=0.36.2=pyhd3eb1b0_0
 86 |   - xz=5.2.5=h7b6447c_0
 87 |   - yaml=0.2.5=h516909a_0
 88 |   - zeromq=4.3.3=he6710b0_3
 89 |   - zipp=3.4.0=pyhd3eb1b0_0
 90 |   - zlib=1.2.11=h7b6447c_3
 91 |   - pip:
 92 |     - findspark==1.4.2
 93 |     - jupyter-contrib-core==0.3.3
 94 |     - numpy==1.19.5
 95 |     - pandas==1.2.0
 96 |     - parso==0.8.1
 97 |     - py4j==0.10.9
 98 |     - pyspark==3.0.1
 99 |     - pytz==2020.5
100 |     - pyyaml==5.3.1
101 |     - pymongo==3.11.2
102 | prefix: /home/najmeh/anaconda3/envs/spark
103 | 


--------------------------------------------------------------------------------