├── Build Pipeline
│   ├── DataPipeline.ipynb
│   ├── ML-Pipeline.py
│   ├── build-pipline.py
│   ├── data_ETL.py
│   ├── database.csv
│   ├── query.csv
│   ├── server.py
│   └── template
│       ├── dashboard.html
│       └── file
├── Clustering_ML.ipynb
├── DataFrame_Basic_Operations.ipynb
├── Data_Transformations.ipynb
├── Dates_and_Timestamps.ipynb
├── GroupBy_and_Aggregate_Functions.ipynb
├── Linear_Regression_Consulting.ipynb
├── Logistic_Regression_Consulting.ipynb
├── Missing_Data.ipynb
├── MymasterNote.ipynb
├── README.md
├── Read_Write_and_Validate_Data.py
├── Recommender_System.ipynb
├── SQL_notebook.pdf
├── Spark_Streaming.ipynb
├── TweetRead.py
├── big_data
│   ├── SQL-in-Spark.py
│   ├── partition_parquet_file.py
│   ├── readme.md
│   ├── search_filter_dataframe.py
│   └── split-column.py
├── books
│   ├── LearningSpark2.0.pdf
│   ├── pyspark.pdf
│   └── spark-hadoop.pdf
├── data
│   ├── ContainsNull.csv
│   ├── appl_stock.csv
│   ├── cruise_ship_info.csv
│   ├── customer_churn.csv
│   ├── people.json
│   ├── sales_info.csv
│   ├── users1.parquet
│   ├── users2.parquet
│   └── users3.parquet
├── scripts
│   ├── MymasterNote.ipynb
│   ├── PySpark_Dataframe_all.ipynb
│   ├── aggrigating-data-in-DataFrame.py
│   ├── join-append-DataFrame.py
│   ├── join_tabales.ipynb
│   ├── manipulating-data-in-DataFrame.py
│   ├── multiple_csv_to_dataframe.py
│   ├── pivote-table.py
│   ├── pyspark-dataframe.py
│   ├── read_write_DataFrame.py
│   ├── sample_data
│   │   ├── data
│   │   └── data.txt
│   ├── search-filter-DataFrame.py
│   ├── update-column-DataFrame.py
│   └── user-defined-function.py
├── search_filter_dataframe.py
└── spark-env.yml
/Build Pipeline/ML-Pipeline.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | from pyspark.ml import Pipeline 6 | from pyspark.ml.regression import RandomForestRegressor 7 | from pyspark.ml.feature import VectorAssembler 8 | from pyspark.ml.evaluation import RegressionEvaluator 9 | import numpy as np 10 | 11 | # Create spark session 12 | spark = SparkSession\ 13 | .builder\ 14 | .master('local[2]')\ 15 | .appName('quakes_ml')\ 16 | .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.1')\ 17 | .getOrCreate() 18 | 19 | """ 20 | Data Pre-processing 21 | """ 22 | # Load test data file 23 | df_test = spark.read.csv(r"query.csv", header=True) 24 | 25 | # Load quakes data from mongodb 26 | df_train = spark.read.format('mongo')\ 27 | .option('spark.mongodb.input.uri', 'mongodb://127.0.0.1:27017/Quake.quakes').load() 28 | 29 | # Select fields we are going to use from df_test 30 | df_test_clean = df_test['time', 'latitude', 'longitude', 'mag', 'depth'] 31 | 32 | # Rename fields in df_test_clean 33 | df_test_clean = df_test_clean.withColumnRenamed('time', 'Date')\ 34 | .withColumnRenamed('latitude', 'Latitude')\ 35 | .withColumnRenamed('longitude', 'Longitude')\ 36 | .withColumnRenamed('mag', 'Magnitude')\ 37 | .withColumnRenamed('depth', 'Depth') 38 | 39 | # Cast string fields to double 40 | df_test_clean = df_test_clean.withColumn('Latitude', df_test_clean['Latitude'].cast(DoubleType()))\ 41 | .withColumn('Longitude', df_test_clean['Longitude'].cast(DoubleType()))\ 42 | .withColumn('Depth', df_test_clean['Depth'].cast(DoubleType()))\ 43 | .withColumn('Magnitude', df_test_clean['Magnitude'].cast(DoubleType())) 44 | 45 | # Create training and testing dataframes 46 | df_testing = df_test_clean['Latitude', 'Longitude', 'Magnitude', 'Depth'] 47 | df_training = df_train['Latitude', 'Longitude', 'Magnitude', 'Depth'] 48 | 49 | # Remove nulls
from our datasets 50 | df_training = df_training.dropna() 51 | df_testing = df_testing.dropna() 52 | 53 | """ 54 | Building the machine learning model 55 | """ 56 | # Create feature vector 57 | assembler = VectorAssembler(inputCols=['Latitude', 'Longitude', 'Depth'], outputCol='features') 58 | 59 | # Create the model 60 | model_reg = RandomForestRegressor(featuresCol='features', labelCol='Magnitude') 61 | 62 | # Chain assembler and model into a pipleine 63 | pipeline = Pipeline(stages=[assembler, model_reg]) 64 | 65 | # Train the Model 66 | model = pipeline.fit(df_training) 67 | 68 | # Make the prediction 69 | pred_results = model.transform(df_testing) 70 | 71 | # Evaluate model 72 | evaluator = RegressionEvaluator(labelCol='Magnitude', predictionCol='prediction', metricName='rmse') 73 | rmse = evaluator.evaluate(pred_results) 74 | 75 | """ 76 | Create the prediction dataset 77 | """ 78 | df_pred_results = pred_results['Latitude', 'Longitude', 'prediction'] 79 | 80 | # Rename the prediction field 81 | df_pred_results = df_pred_results.withColumnRenamed('prediction', 'Pred_Magnitude') 82 | 83 | # Add more columns 84 | df_pred_results = df_pred_results.withColumn('Year', lit(2017))\ 85 | .withColumn('RMSE', lit(rmse)) 86 | 87 | # Load the pred dataset to MongoDB 88 | df_pred_results.write.format('mongo')\ 89 | .mode('overwrite')\ 90 | .option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Quake.pred_results').save() 91 | 92 | print(df_pred_results.show(5)) 93 | 94 | print('INFO: Job ran successfully') 95 | print('') 96 | 97 | -------------------------------------------------------------------------------- /Build Pipeline/build-pipline.py: -------------------------------------------------------------------------------- 1 | import findspark 2 | #findspark.init() 3 | import pyspark 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.types import * # convert datatype from one type to another 6 | from pyspark.sql.functions import * # manipulation of data 7 | 8 | spark = SparkSession.builder.getOrCreate() 9 | df = spark.sql("select 'name' as colomn") # create a dataframe 10 | df.show() 11 | 12 | # Configure spark session with 2 cores for this job 13 | spark = SparkSession\ 14 | .builder\ 15 | .master('local[2]')\ 16 | .appName('quake_etl')\ 17 | .config('spark.jars.packages', 'org.mongodb.spark:mongo-spark-connector_2.12:2.4.1')\ 18 | .getOrCreate() 19 | 20 | spark 21 | # Load the dataset from https://github.com/EBISYS/WaterWatch 22 | df_load = spark.read.csv(r"Datasets/database.csv", header=True) 23 | # Preview df_load 24 | df_load.take(1) 25 | df_load.columns 26 | df_load.printSchema() 27 | 28 | # Drop fields we don't need from df_load 29 | lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations', 30 | 'Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap', 31 | 'Horizontal Distance','Horizontal Error', 32 | 'Root Mean Square','Source','Location Source','Magnitude Source','Status'] 33 | 34 | df_load = df_load.drop(*lst_dropped_columns) 35 | # Preview df_load 36 | df_load.show(5) 37 | # Create a "year" field and add it to the dataframe 38 | df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy'))) 39 | # Preview df_load 40 | df_load.show(5) 41 | # Build the quakes frequency dataframe using the year field and counts for each year 42 | df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts') 43 | # Preview df_quake_freq 44 | df_quake_freq.show(5) 45 | 46 | # Preview df_load schema 47 | df_load.printSchema() 48 
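# Note: every column in df_load is a string at this point, because the CSV was read
# with header=True only. The casts below convert the numeric fields one by one; a
# sketch of an alternative, assuming the same Datasets/database.csv file, is to let
# Spark infer the types at read time (at the cost of an extra pass over the file):
#
#     df_load = spark.read.csv(r"Datasets/database.csv", header=True, inferSchema=True)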
| 49 | # Cast some fields from string into numeric types 50 | df_load = df_load.withColumn('Latitude', df_load['Latitude'].cast(DoubleType()))\ 51 | .withColumn('Longitude', df_load['Longitude'].cast(DoubleType()))\ 52 | .withColumn('Depth', df_load['Depth'].cast(DoubleType()))\ 53 | .withColumn('Magnitude', df_load['Magnitude'].cast(DoubleType())) 54 | 55 | # Preview df_load 56 | df_load.show(5) 57 | 58 | # Preview df_load schema 59 | df_load.printSchema() 60 | 61 | # Create avg magnitude and max magnitude fields and add to df_quake_freq 62 | df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'max_magnitude') 63 | df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'avg_magnitude') 64 | 65 | df_avg.show(5) 66 | 67 | # Join df_max, and df_avg to df_quake_freq 68 | df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year']) 69 | # Preview df_quake_freq 70 | df_quake_freq.printSchema() 71 | 72 | # Remove nulls 73 | df_load.dropna() 74 | df_quake_freq.dropna() -------------------------------------------------------------------------------- /Build Pipeline/data_ETL.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.types import * 4 | from pyspark.sql.functions import * 5 | 6 | # Create the spark session 7 | spark = SparkSession\ 8 | .builder\ 9 | .master('local[2]')\ 10 | .appName('quakes_etl')\ 11 | .config('spark.jars.packages", "org.mongodb.spark:mongo-spark-connector_2.12:2.4.1')\ 12 | .getOrCreate() 13 | 14 | # Load the dataset 15 | df_load = spark.read.csv(r"database.csv", header=True) 16 | 17 | # Remove all fields we don't need 18 | lst_dropped_columns = ['Depth Error', 'Time', 'Depth Seismic Stations','Magnitude Error','Magnitude Seismic Stations','Azimuthal Gap', 'Horizontal Distance','Horizontal Error', 19 | 'Root Mean Square','Source','Location Source','Magnitude Source','Status'] 20 | 21 | df_load = df_load.drop(*lst_dropped_columns) 22 | 23 | # Create a year field and add it to the df_load dataframe 24 | df_load = df_load.withColumn('Year', year(to_timestamp('Date', 'dd/MM/yyyy'))) 25 | 26 | # Create the quakes freq dataframe form the year and count values 27 | df_quake_freq = df_load.groupBy('Year').count().withColumnRenamed('count', 'Counts') 28 | 29 | # Cast string fields to double types 30 | df_load = df_load.withColumn('Latitude', df_load['Latitude'].cast(DoubleType()))\ 31 | .withColumn('Longitude', df_load['Longitude'].cast(DoubleType()))\ 32 | .withColumn('Depth', df_load['Depth'].cast(DoubleType()))\ 33 | .withColumn('Magnitude', df_load['Magnitude'].cast(DoubleType())) 34 | 35 | # Create avg and max magnitude fields and add to df_quake_freq 36 | df_max = df_load.groupBy('Year').max('Magnitude').withColumnRenamed('max(Magnitude)', 'Max_Magnitude') 37 | df_avg = df_load.groupBy('Year').avg('Magnitude').withColumnRenamed('avg(Magnitude)', 'Avg_Magnitude') 38 | 39 | # Join the max and avg dfs to df_quake_freq 40 | df_quake_freq = df_quake_freq.join(df_avg, ['Year']).join(df_max, ['Year']) 41 | 42 | # Remove records with null values 43 | df_load.dropna() 44 | df_quake_freq.dropna() 45 | 46 | # Load df_load into mongodb 47 | df_load.write.format('mongo')\ 48 | .mode('overwrite')\ 49 | .option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Quake.quakes').save() 50 | 51 | # Load df_quake_freq into mongodb 52 | df_quake_freq.write.format('mongo')\ 53 | .mode('overwrite')\ 54 
| .option('spark.mongodb.output.uri', 'mongodb://127.0.0.1:27017/Quake.quake_freq').save() 55 | 56 | # Print dataframe heads 57 | print(df_quake_freq.show(5)) 58 | print(df_load.show(5)) 59 | 60 | print('INFO: Job ran successfully') 61 | print('') 62 | 63 | 64 | # submit job: spark-submit --packages org.mongodb.spark:mongo-spark-connector_2.12:2.4.1 data_ETL.py 65 | 66 | 67 | -------------------------------------------------------------------------------- /Build Pipeline/server.py: -------------------------------------------------------------------------------- 1 | from flask import Flask, render_template, request 2 | 3 | app = Flask(__name__) 4 | 5 | # Index page, no args 6 | @app.route('/') 7 | def index(): 8 | return render_template('dashboard.html') 9 | 10 | if __name__ == '__main__': 11 | app.run(port=5000, debug=True) 12 | -------------------------------------------------------------------------------- /Build Pipeline/template/file: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /Clustering_ML.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Clustering Code Along\n", 8 | "\n", 9 | "data can be found at UCI repository: https://archive.ics.uci.edu/ml/datasets/seeds." 10 | ] 11 | }, 12 | { 13 | "cell_type": "markdown", 14 | "metadata": {}, 15 | "source": [ 16 | "The examined group comprised kernels belonging to three different varieties of wheat: Kama, Rosa and Canadian, 70 elements each, randomly selected for \n", 17 | "the experiment. High quality visualization of the internal kernel structure was detected using a soft X-ray technique. It is non-destructive and considerably cheaper than other more sophisticated imaging techniques like scanning microscopy or laser technology. The images were recorded on 13x18 cm X-ray KODAK plates. Studies were conducted using combine harvested wheat grain originating from experimental fields, explored at the Institute of Agrophysics of the Polish Academy of Sciences in Lublin. \n", 18 | "\n", 19 | "The data set can be used for the tasks of classification and cluster analysis.\n", 20 | "\n", 21 | "\n", 22 | "Attribute Information:\n", 23 | "\n", 24 | "To construct the data, seven geometric parameters of wheat kernels were measured: \n", 25 | "1. area A, \n", 26 | "2. perimeter P, \n", 27 | "3. compactness C = 4*pi*A/P^2, \n", 28 | "4. length of kernel, \n", 29 | "5. width of kernel, \n", 30 | "6. asymmetry coefficient \n", 31 | "7. length of kernel groove. \n", 32 | "All of these parameters were real-valued continuous.\n", 33 | "\n", 34 | "Let's see if we can cluster them in to 3 groups with K-means!" 
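The cells that follow assemble these seven measurements into a feature vector, scale them, and fit K-means with k=3. As an optional check, the choice of k can be compared with a silhouette score from ClusteringEvaluator, which newer Spark releases recommend over the deprecated computeCost used further down. This is only a minimal sketch: it assumes the final_data / scaledFeatures names created later in this notebook, and the seed is fixed here purely for repeatability.

from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator

# Compare a few candidate values of k on the scaled features
evaluator = ClusteringEvaluator(featuresCol='scaledFeatures', metricName='silhouette')
for k in range(2, 7):
    preds = KMeans(featuresCol='scaledFeatures', k=k, seed=1).fit(final_data).transform(final_data)
    print(k, evaluator.evaluate(preds))  # higher silhouette = better-separated clusters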
35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 53, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from pyspark.sql import SparkSession\n", 46 | "spark = SparkSession.builder.appName('cluster').getOrCreate()" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 54, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "from pyspark.ml.clustering import KMeans\n", 56 | "\n", 57 | "# Loads data.\n", 58 | "dataset = spark.read.csv(\"seeds_dataset.csv\",header=True,inferSchema=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 55, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "Row(area=15.26, perimeter=14.84, compactness=0.871, length_of_kernel=5.763, width_of_kernel=3.312, asymmetry_coefficient=2.221, length_of_groove=5.22)" 70 | ] 71 | }, 72 | "execution_count": 55, 73 | "metadata": {}, 74 | "output_type": "execute_result" 75 | } 76 | ], 77 | "source": [ 78 | "dataset.head()" 79 | ] 80 | }, 81 | { 82 | "cell_type": "code", 83 | "execution_count": 56, 84 | "metadata": {}, 85 | "outputs": [ 86 | { 87 | "name": "stdout", 88 | "output_type": "stream", 89 | "text": [ 90 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 91 | "|summary| area| perimeter| compactness| length_of_kernel| width_of_kernel|asymmetry_coefficient| length_of_groove|\n", 92 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 93 | "| count| 210| 210| 210| 210| 210| 210| 210|\n", 94 | "| mean|14.847523809523816|14.559285714285718| 0.8709985714285714| 5.628533333333335| 3.258604761904762| 3.7001999999999997| 5.408071428571429|\n", 95 | "| stddev|2.9096994306873647|1.3059587265640225|0.023629416583846364|0.44306347772644983|0.3777144449065867| 1.5035589702547392|0.49148049910240543|\n", 96 | "| min| 10.59| 12.41| 0.8081| 4.899| 2.63| 0.765| 4.519|\n", 97 | "| max| 21.18| 17.25| 0.9183| 6.675| 4.033| 8.456| 6.55|\n", 98 | "+-------+------------------+------------------+--------------------+-------------------+------------------+---------------------+-------------------+\n", 99 | "\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "dataset.describe().show()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "## Format the Data" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 57, 117 | "metadata": { 118 | "collapsed": true 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "from pyspark.ml.linalg import Vectors\n", 123 | "from pyspark.ml.feature import VectorAssembler" 124 | ] 125 | }, 126 | { 127 | "cell_type": "code", 128 | "execution_count": 58, 129 | "metadata": {}, 130 | "outputs": [ 131 | { 132 | "data": { 133 | "text/plain": [ 134 | "['area',\n", 135 | " 'perimeter',\n", 136 | " 'compactness',\n", 137 | " 'length_of_kernel',\n", 138 | " 'width_of_kernel',\n", 139 | " 'asymmetry_coefficient',\n", 140 | " 'length_of_groove']" 141 | ] 142 | }, 143 | "execution_count": 58, 144 | "metadata": {}, 145 | "output_type": "execute_result" 146 | } 147 | ], 148 | "source": [ 149 | "dataset.columns" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": 59, 155 | "metadata": { 156 | "collapsed": true 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | 
"vec_assembler = VectorAssembler(inputCols = dataset.columns, outputCol='features')" 161 | ] 162 | }, 163 | { 164 | "cell_type": "code", 165 | "execution_count": 60, 166 | "metadata": { 167 | "collapsed": true 168 | }, 169 | "outputs": [], 170 | "source": [ 171 | "final_data = vec_assembler.transform(dataset)" 172 | ] 173 | }, 174 | { 175 | "cell_type": "markdown", 176 | "metadata": {}, 177 | "source": [ 178 | "## Scale the Data\n", 179 | "It is a good idea to scale our data to deal with the curse of dimensionality: https://en.wikipedia.org/wiki/Curse_of_dimensionality" 180 | ] 181 | }, 182 | { 183 | "cell_type": "code", 184 | "execution_count": 61, 185 | "metadata": { 186 | "collapsed": true 187 | }, 188 | "outputs": [], 189 | "source": [ 190 | "from pyspark.ml.feature import StandardScaler" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 62, 196 | "metadata": {}, 197 | "outputs": [], 198 | "source": [ 199 | "scaler = StandardScaler(inputCol=\"features\", outputCol=\"scaledFeatures\", withStd=True, withMean=False)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 63, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "# Compute summary statistics by fitting the StandardScaler\n", 211 | "scalerModel = scaler.fit(final_data)" 212 | ] 213 | }, 214 | { 215 | "cell_type": "code", 216 | "execution_count": 64, 217 | "metadata": {}, 218 | "outputs": [], 219 | "source": [ 220 | "# Normalize each feature to have unit standard deviation.\n", 221 | "final_data = scalerModel.transform(final_data)" 222 | ] 223 | }, 224 | { 225 | "cell_type": "markdown", 226 | "metadata": {}, 227 | "source": [ 228 | "## Train the Model and Evaluate" 229 | ] 230 | }, 231 | { 232 | "cell_type": "code", 233 | "execution_count": 76, 234 | "metadata": { 235 | "collapsed": true 236 | }, 237 | "outputs": [], 238 | "source": [ 239 | "# Trains a k-means model.\n", 240 | "kmeans = KMeans(featuresCol='scaledFeatures',k=3)\n", 241 | "model = kmeans.fit(final_data)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 77, 247 | "metadata": {}, 248 | "outputs": [ 249 | { 250 | "name": "stdout", 251 | "output_type": "stream", 252 | "text": [ 253 | "Within Set Sum of Squared Errors = 429.07559671506715\n" 254 | ] 255 | } 256 | ], 257 | "source": [ 258 | "# Evaluate clustering by computing Within Set Sum of Squared Errors.\n", 259 | "wssse = model.computeCost(final_data)\n", 260 | "print(\"Within Set Sum of Squared Errors = \" + str(wssse))" 261 | ] 262 | }, 263 | { 264 | "cell_type": "code", 265 | "execution_count": 79, 266 | "metadata": {}, 267 | "outputs": [ 268 | { 269 | "name": "stdout", 270 | "output_type": "stream", 271 | "text": [ 272 | "Cluster Centers: \n", 273 | "[ 6.31670546 12.37109759 37.39491396 13.91155062 9.748067\n", 274 | " 2.39849968 12.2661748 ]\n", 275 | "[ 4.87257659 10.88120146 37.27692543 12.3410157 8.55443412\n", 276 | " 1.81649011 10.32998598]\n", 277 | "[ 4.06105916 10.13979506 35.80536984 11.82133095 7.50395937\n", 278 | " 3.27184732 10.42126018]\n" 279 | ] 280 | } 281 | ], 282 | "source": [ 283 | "# Shows the result.\n", 284 | "centers = model.clusterCenters()\n", 285 | "print(\"Cluster Centers: \")\n", 286 | "for center in centers:\n", 287 | " print(center)" 288 | ] 289 | }, 290 | { 291 | "cell_type": "code", 292 | "execution_count": 80, 293 | "metadata": {}, 294 | "outputs": [ 295 | { 296 | "name": "stdout", 297 | "output_type": "stream", 298 | "text": [ 299 | "+----------+\n", 
300 | "|prediction|\n", 301 | "+----------+\n", 302 | "| 1|\n", 303 | "| 1|\n", 304 | "| 1|\n", 305 | "| 1|\n", 306 | "| 1|\n", 307 | "| 1|\n", 308 | "| 1|\n", 309 | "| 1|\n", 310 | "| 0|\n", 311 | "| 0|\n", 312 | "| 1|\n", 313 | "| 1|\n", 314 | "| 1|\n", 315 | "| 1|\n", 316 | "| 1|\n", 317 | "| 1|\n", 318 | "| 1|\n", 319 | "| 1|\n", 320 | "| 1|\n", 321 | "| 2|\n", 322 | "+----------+\n", 323 | "only showing top 20 rows\n", 324 | "\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "model.transform(final_data).select('prediction').show()" 330 | ] 331 | } 332 | ], 333 | "metadata": { 334 | "anaconda-cloud": {}, 335 | "kernelspec": { 336 | "display_name": "Python 3", 337 | "language": "python", 338 | "name": "python3" 339 | }, 340 | "language_info": { 341 | "codemirror_mode": { 342 | "name": "ipython", 343 | "version": 3 344 | }, 345 | "file_extension": ".py", 346 | "mimetype": "text/x-python", 347 | "name": "python", 348 | "nbconvert_exporter": "python", 349 | "pygments_lexer": "ipython3", 350 | "version": "3.8.3" 351 | } 352 | }, 353 | "nbformat": 4, 354 | "nbformat_minor": 1 355 | } 356 | -------------------------------------------------------------------------------- /Data_Transformations.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Data Transformations\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 2, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from pyspark.sql import SparkSession" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 3, 24 | "metadata": { 25 | "collapsed": true 26 | }, 27 | "outputs": [], 28 | "source": [ 29 | "spark = SparkSession.builder.appName('data').getOrCreate()" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": 4, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "df = spark.read.csv('fake_customers.csv',inferSchema=True,header=True)" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 5, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "+-------+----------+-----+\n", 53 | "| Name| Phone|Group|\n", 54 | "+-------+----------+-----+\n", 55 | "| John|4085552424| A|\n", 56 | "| Mike|3105552738| B|\n", 57 | "| Cassie|4085552424| B|\n", 58 | "| Laura|3105552438| B|\n", 59 | "| Sarah|4085551234| A|\n", 60 | "| David|3105557463| C|\n", 61 | "| Zach|4085553987| C|\n", 62 | "| Kiera|3105552938| A|\n", 63 | "| Alexa|4085559467| C|\n", 64 | "|Karissa|3105553475| A|\n", 65 | "+-------+----------+-----+\n", 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "df.show()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "markdown", 76 | "metadata": {}, 77 | "source": [ 78 | "## Data Features\n", 79 | "\n", 80 | "### StringIndexer\n", 81 | "\n" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 6, 87 | "metadata": {}, 88 | "outputs": [ 89 | { 90 | "name": "stdout", 91 | "output_type": "stream", 92 | "text": [ 93 | "+-------+--------+-------------+\n", 94 | "|user_id|category|categoryIndex|\n", 95 | "+-------+--------+-------------+\n", 96 | "| 0| a| 0.0|\n", 97 | "| 1| b| 2.0|\n", 98 | "| 2| c| 1.0|\n", 99 | "| 3| a| 0.0|\n", 100 | "| 4| a| 0.0|\n", 101 | "| 5| c| 1.0|\n", 102 | "+-------+--------+-------------+\n", 103 | "\n" 104 | ] 105 | } 106 | ], 107 | "source": [ 108 | "from 
pyspark.ml.feature import StringIndexer\n", 109 | "\n", 110 | "df = spark.createDataFrame(\n", 111 | " [(0, \"a\"), (1, \"b\"), (2, \"c\"), (3, \"a\"), (4, \"a\"), (5, \"c\")],\n", 112 | " [\"user_id\", \"category\"])\n", 113 | "\n", 114 | "indexer = StringIndexer(inputCol=\"category\", outputCol=\"categoryIndex\")\n", 115 | "indexed = indexer.fit(df).transform(df)\n", 116 | "indexed.show()" 117 | ] 118 | }, 119 | { 120 | "cell_type": "code", 121 | "execution_count": null, 122 | "metadata": { 123 | "collapsed": true 124 | }, 125 | "outputs": [], 126 | "source": [] 127 | }, 128 | { 129 | "cell_type": "markdown", 130 | "metadata": {}, 131 | "source": [ 132 | "### VectorIndexer\n", 133 | "\n" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 14, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "+---+----+------+--------------+-------+\n", 146 | "| id|hour|mobile| userFeatures|clicked|\n", 147 | "+---+----+------+--------------+-------+\n", 148 | "| 0| 18| 1.0|[0.0,10.0,0.5]| 1.0|\n", 149 | "+---+----+------+--------------+-------+\n", 150 | "\n" 151 | ] 152 | } 153 | ], 154 | "source": [ 155 | "from pyspark.ml.linalg import Vectors\n", 156 | "from pyspark.ml.feature import VectorAssembler\n", 157 | "\n", 158 | "dataset = spark.createDataFrame(\n", 159 | " [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],\n", 160 | " [\"id\", \"hour\", \"mobile\", \"userFeatures\", \"clicked\"])\n", 161 | "dataset.show()" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 15, 167 | "metadata": {}, 168 | "outputs": [ 169 | { 170 | "name": "stdout", 171 | "output_type": "stream", 172 | "text": [ 173 | "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\n", 174 | "+--------------------+-------+\n", 175 | "| features|clicked|\n", 176 | "+--------------------+-------+\n", 177 | "|[18.0,1.0,0.0,10....| 1.0|\n", 178 | "+--------------------+-------+\n", 179 | "\n" 180 | ] 181 | } 182 | ], 183 | "source": [ 184 | "assembler = VectorAssembler(\n", 185 | " inputCols=[\"hour\", \"mobile\", \"userFeatures\"],\n", 186 | " outputCol=\"features\")\n", 187 | "\n", 188 | "output = assembler.transform(dataset)\n", 189 | "print(\"Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'\")\n", 190 | "output.select(\"features\", \"clicked\").show()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": null, 196 | "metadata": { 197 | "collapsed": true 198 | }, 199 | "outputs": [], 200 | "source": [] 201 | } 202 | ], 203 | "metadata": { 204 | "anaconda-cloud": {}, 205 | "kernelspec": { 206 | "display_name": "Python 3", 207 | "language": "python", 208 | "name": "python3" 209 | }, 210 | "language_info": { 211 | "codemirror_mode": { 212 | "name": "ipython", 213 | "version": 3 214 | }, 215 | "file_extension": ".py", 216 | "mimetype": "text/x-python", 217 | "name": "python", 218 | "nbconvert_exporter": "python", 219 | "pygments_lexer": "ipython3", 220 | "version": "3.7.9" 221 | } 222 | }, 223 | "nbformat": 4, 224 | "nbformat_minor": 1 225 | } 226 | -------------------------------------------------------------------------------- /Dates_and_Timestamps.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Dates and Timestamps\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | 
"metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from pyspark.sql import SparkSession\n", 19 | "# May take a little while on a local computer\n", 20 | "spark = SparkSession.builder.appName(\"dates\").getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 3, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "df = spark.read.csv(\"appl_stock.csv\",header=True,inferSchema=True)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 4, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n", 44 | "| Date| Open| High| Low| Close| Volume| Adj Close|\n", 45 | "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n", 46 | "|2010-01-04 00:00:...| 213.429998| 214.499996|212.38000099999996| 214.009998|123432400| 27.727039|\n", 47 | "|2010-01-05 00:00:...| 214.599998| 215.589994| 213.249994| 214.379993|150476200|27.774976000000002|\n", 48 | "|2010-01-06 00:00:...| 214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|\n", 49 | "|2010-01-07 00:00:...| 211.75| 212.000006| 209.050005| 210.58|119282800| 27.28265|\n", 50 | "|2010-01-08 00:00:...| 210.299994| 212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|\n", 51 | "|2010-01-11 00:00:...|212.79999700000002| 213.000002| 208.450005|210.11000299999998|115557400| 27.221758|\n", 52 | "|2010-01-12 00:00:...|209.18999499999998|209.76999500000002| 206.419998| 207.720001|148614900| 26.91211|\n", 53 | "|2010-01-13 00:00:...| 207.870005|210.92999500000002| 204.099998| 210.650002|151473000| 27.29172|\n", 54 | "|2010-01-14 00:00:...|210.11000299999998|210.45999700000002| 209.020004| 209.43|108223500| 27.133657|\n", 55 | "|2010-01-15 00:00:...|210.92999500000002|211.59999700000003| 205.869999| 205.93|148516900|26.680197999999997|\n", 56 | "|2010-01-19 00:00:...| 208.330002|215.18999900000003| 207.240004| 215.039995|182501900|27.860484999999997|\n", 57 | "|2010-01-20 00:00:...| 214.910006| 215.549994| 209.500002| 211.73|153038200| 27.431644|\n", 58 | "|2010-01-21 00:00:...| 212.079994|213.30999599999998| 207.210003| 208.069996|152038600| 26.957455|\n", 59 | "|2010-01-22 00:00:...|206.78000600000001| 207.499996| 197.16| 197.75|220441900| 25.620401|\n", 60 | "|2010-01-25 00:00:...|202.51000200000001| 204.699999| 200.190002| 203.070002|266424900|26.309658000000002|\n", 61 | "|2010-01-26 00:00:...|205.95000100000001| 213.710005| 202.580004| 205.940001|466777500| 26.681494|\n", 62 | "|2010-01-27 00:00:...| 206.849995| 210.58| 199.530001| 207.880005|430642100|26.932840000000002|\n", 63 | "|2010-01-28 00:00:...| 204.930004| 205.500004| 198.699995| 199.289995|293375600|25.819922000000002|\n", 64 | "|2010-01-29 00:00:...| 201.079996| 202.199995| 190.250002| 192.060003|311488100| 24.883208|\n", 65 | "|2010-02-01 00:00:...|192.36999699999998| 196.0|191.29999899999999| 194.729998|187469100| 25.229131|\n", 66 | "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+\n", 67 | "only showing top 20 rows\n", 68 | "\n" 69 | ] 70 | } 71 | ], 72 | "source": [ 73 | "df.show()" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": {}, 79 | "source": [ 80 | "Let's walk 
through how to grab parts of the timestamp data" 81 | ] 82 | }, 83 | { 84 | "cell_type": "code", 85 | "execution_count": 44, 86 | "metadata": { 87 | "collapsed": true 88 | }, 89 | "outputs": [], 90 | "source": [ 91 | "from pyspark.sql.functions import format_number,dayofmonth,hour,dayofyear,month,year,weekofyear,date_format" 92 | ] 93 | }, 94 | { 95 | "cell_type": "code", 96 | "execution_count": 45, 97 | "metadata": {}, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "+----------------+\n", 104 | "|dayofmonth(Date)|\n", 105 | "+----------------+\n", 106 | "| 4|\n", 107 | "| 5|\n", 108 | "| 6|\n", 109 | "| 7|\n", 110 | "| 8|\n", 111 | "| 11|\n", 112 | "| 12|\n", 113 | "| 13|\n", 114 | "| 14|\n", 115 | "| 15|\n", 116 | "| 19|\n", 117 | "| 20|\n", 118 | "| 21|\n", 119 | "| 22|\n", 120 | "| 25|\n", 121 | "| 26|\n", 122 | "| 27|\n", 123 | "| 28|\n", 124 | "| 29|\n", 125 | "| 1|\n", 126 | "+----------------+\n", 127 | "only showing top 20 rows\n", 128 | "\n" 129 | ] 130 | } 131 | ], 132 | "source": [ 133 | "df.select(dayofmonth(df['Date'])).show()" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": 46, 139 | "metadata": {}, 140 | "outputs": [ 141 | { 142 | "name": "stdout", 143 | "output_type": "stream", 144 | "text": [ 145 | "+----------+\n", 146 | "|hour(Date)|\n", 147 | "+----------+\n", 148 | "| 0|\n", 149 | "| 0|\n", 150 | "| 0|\n", 151 | "| 0|\n", 152 | "| 0|\n", 153 | "| 0|\n", 154 | "| 0|\n", 155 | "| 0|\n", 156 | "| 0|\n", 157 | "| 0|\n", 158 | "| 0|\n", 159 | "| 0|\n", 160 | "| 0|\n", 161 | "| 0|\n", 162 | "| 0|\n", 163 | "| 0|\n", 164 | "| 0|\n", 165 | "| 0|\n", 166 | "| 0|\n", 167 | "| 0|\n", 168 | "+----------+\n", 169 | "only showing top 20 rows\n", 170 | "\n" 171 | ] 172 | } 173 | ], 174 | "source": [ 175 | "df.select(hour(df['Date'])).show()" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 8, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "+---------------+\n", 188 | "|dayofyear(Date)|\n", 189 | "+---------------+\n", 190 | "| 4|\n", 191 | "| 5|\n", 192 | "| 6|\n", 193 | "| 7|\n", 194 | "| 8|\n", 195 | "| 11|\n", 196 | "| 12|\n", 197 | "| 13|\n", 198 | "| 14|\n", 199 | "| 15|\n", 200 | "| 19|\n", 201 | "| 20|\n", 202 | "| 21|\n", 203 | "| 22|\n", 204 | "| 25|\n", 205 | "| 26|\n", 206 | "| 27|\n", 207 | "| 28|\n", 208 | "| 29|\n", 209 | "| 32|\n", 210 | "+---------------+\n", 211 | "only showing top 20 rows\n", 212 | "\n" 213 | ] 214 | } 215 | ], 216 | "source": [ 217 | "df.select(dayofyear(df['Date'])).show()" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 11, 223 | "metadata": {}, 224 | "outputs": [ 225 | { 226 | "name": "stdout", 227 | "output_type": "stream", 228 | "text": [ 229 | "+-----------+\n", 230 | "|month(Date)|\n", 231 | "+-----------+\n", 232 | "| 1|\n", 233 | "| 1|\n", 234 | "| 1|\n", 235 | "| 1|\n", 236 | "| 1|\n", 237 | "| 1|\n", 238 | "| 1|\n", 239 | "| 1|\n", 240 | "| 1|\n", 241 | "| 1|\n", 242 | "| 1|\n", 243 | "| 1|\n", 244 | "| 1|\n", 245 | "| 1|\n", 246 | "| 1|\n", 247 | "| 1|\n", 248 | "| 1|\n", 249 | "| 1|\n", 250 | "| 1|\n", 251 | "| 2|\n", 252 | "+-----------+\n", 253 | "only showing top 20 rows\n", 254 | "\n" 255 | ] 256 | } 257 | ], 258 | "source": [ 259 | "df.select(month(df['Date'])).show()" 260 | ] 261 | }, 262 | { 263 | "cell_type": "markdown", 264 | "metadata": {}, 265 | "source": [ 266 | "So for example, let's say we wanted to know 
the average closing price per year. Easy! With a groupby and the year() function call:" 267 | ] 268 | }, 269 | { 270 | "cell_type": "code", 271 | "execution_count": 15, 272 | "metadata": {}, 273 | "outputs": [ 274 | { 275 | "name": "stdout", 276 | "output_type": "stream", 277 | "text": [ 278 | "+----------+\n", 279 | "|year(Date)|\n", 280 | "+----------+\n", 281 | "| 2010|\n", 282 | "| 2010|\n", 283 | "| 2010|\n", 284 | "| 2010|\n", 285 | "| 2010|\n", 286 | "| 2010|\n", 287 | "| 2010|\n", 288 | "| 2010|\n", 289 | "| 2010|\n", 290 | "| 2010|\n", 291 | "| 2010|\n", 292 | "| 2010|\n", 293 | "| 2010|\n", 294 | "| 2010|\n", 295 | "| 2010|\n", 296 | "| 2010|\n", 297 | "| 2010|\n", 298 | "| 2010|\n", 299 | "| 2010|\n", 300 | "| 2010|\n", 301 | "+----------+\n", 302 | "only showing top 20 rows\n", 303 | "\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "df.select(year(df['Date'])).show()" 309 | ] 310 | }, 311 | { 312 | "cell_type": "code", 313 | "execution_count": 19, 314 | "metadata": {}, 315 | "outputs": [ 316 | { 317 | "name": "stdout", 318 | "output_type": "stream", 319 | "text": [ 320 | "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n", 321 | "| Date| Open| High| Low| Close| Volume| Adj Close|Year|\n", 322 | "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n", 323 | "|2010-01-04 00:00:...| 213.429998| 214.499996|212.38000099999996| 214.009998|123432400| 27.727039|2010|\n", 324 | "|2010-01-05 00:00:...| 214.599998| 215.589994| 213.249994| 214.379993|150476200|27.774976000000002|2010|\n", 325 | "|2010-01-06 00:00:...| 214.379993| 215.23| 210.750004| 210.969995|138040000|27.333178000000004|2010|\n", 326 | "|2010-01-07 00:00:...| 211.75| 212.000006| 209.050005| 210.58|119282800| 27.28265|2010|\n", 327 | "|2010-01-08 00:00:...| 210.299994| 212.000006|209.06000500000002|211.98000499999998|111902700| 27.464034|2010|\n", 328 | "|2010-01-11 00:00:...|212.79999700000002| 213.000002| 208.450005|210.11000299999998|115557400| 27.221758|2010|\n", 329 | "|2010-01-12 00:00:...|209.18999499999998|209.76999500000002| 206.419998| 207.720001|148614900| 26.91211|2010|\n", 330 | "|2010-01-13 00:00:...| 207.870005|210.92999500000002| 204.099998| 210.650002|151473000| 27.29172|2010|\n", 331 | "|2010-01-14 00:00:...|210.11000299999998|210.45999700000002| 209.020004| 209.43|108223500| 27.133657|2010|\n", 332 | "|2010-01-15 00:00:...|210.92999500000002|211.59999700000003| 205.869999| 205.93|148516900|26.680197999999997|2010|\n", 333 | "|2010-01-19 00:00:...| 208.330002|215.18999900000003| 207.240004| 215.039995|182501900|27.860484999999997|2010|\n", 334 | "|2010-01-20 00:00:...| 214.910006| 215.549994| 209.500002| 211.73|153038200| 27.431644|2010|\n", 335 | "|2010-01-21 00:00:...| 212.079994|213.30999599999998| 207.210003| 208.069996|152038600| 26.957455|2010|\n", 336 | "|2010-01-22 00:00:...|206.78000600000001| 207.499996| 197.16| 197.75|220441900| 25.620401|2010|\n", 337 | "|2010-01-25 00:00:...|202.51000200000001| 204.699999| 200.190002| 203.070002|266424900|26.309658000000002|2010|\n", 338 | "|2010-01-26 00:00:...|205.95000100000001| 213.710005| 202.580004| 205.940001|466777500| 26.681494|2010|\n", 339 | "|2010-01-27 00:00:...| 206.849995| 210.58| 199.530001| 207.880005|430642100|26.932840000000002|2010|\n", 340 | "|2010-01-28 00:00:...| 204.930004| 205.500004| 198.699995| 199.289995|293375600|25.819922000000002|2010|\n", 341 | "|2010-01-29 
00:00:...| 201.079996| 202.199995| 190.250002| 192.060003|311488100| 24.883208|2010|\n", 342 | "|2010-02-01 00:00:...|192.36999699999998| 196.0|191.29999899999999| 194.729998|187469100| 25.229131|2010|\n", 343 | "+--------------------+------------------+------------------+------------------+------------------+---------+------------------+----+\n", 344 | "only showing top 20 rows\n", 345 | "\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "df.withColumn(\"Year\",year(df['Date'])).show()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "code", 355 | "execution_count": 29, 356 | "metadata": { 357 | "scrolled": true 358 | }, 359 | "outputs": [ 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "+---------+------------------+\n", 365 | "|avg(Year)| avg(Close)|\n", 366 | "+---------+------------------+\n", 367 | "| 2015.0|120.03999980555547|\n", 368 | "| 2013.0| 472.6348802857143|\n", 369 | "| 2014.0| 295.4023416507935|\n", 370 | "| 2012.0| 576.0497195640002|\n", 371 | "| 2016.0|104.60400786904763|\n", 372 | "| 2010.0| 259.8424600000002|\n", 373 | "| 2011.0|364.00432532142867|\n", 374 | "+---------+------------------+\n", 375 | "\n" 376 | ] 377 | } 378 | ], 379 | "source": [ 380 | "newdf = df.withColumn(\"Year\",year(df['Date']))\n", 381 | "newdf.groupBy(\"Year\").mean()[['avg(Year)','avg(Close)']].show()" 382 | ] 383 | }, 384 | { 385 | "cell_type": "markdown", 386 | "metadata": {}, 387 | "source": [ 388 | "Still not quite presentable! Let's use the .alias method as well as round() to clean this up!" 389 | ] 390 | }, 391 | { 392 | "cell_type": "code", 393 | "execution_count": 43, 394 | "metadata": {}, 395 | "outputs": [ 396 | { 397 | "name": "stdout", 398 | "output_type": "stream", 399 | "text": [ 400 | "+------+----------+\n", 401 | "| Year|Mean Close|\n", 402 | "+------+----------+\n", 403 | "|2015.0| 120.04|\n", 404 | "|2013.0| 472.63|\n", 405 | "|2014.0| 295.40|\n", 406 | "|2012.0| 576.05|\n", 407 | "|2016.0| 104.60|\n", 408 | "|2010.0| 259.84|\n", 409 | "|2011.0| 364.00|\n", 410 | "+------+----------+\n", 411 | "\n" 412 | ] 413 | } 414 | ], 415 | "source": [ 416 | "result = newdf.groupBy(\"Year\").mean()[['avg(Year)','avg(Close)']]\n", 417 | "result = result.withColumnRenamed(\"avg(Year)\",\"Year\")\n", 418 | "result = result.select('Year',format_number('avg(Close)',2).alias(\"Mean Close\")).show()" 419 | ] 420 | }, 421 | { 422 | "cell_type": "markdown", 423 | "metadata": {}, 424 | "source": [ 425 | "Perfect! Now you know how to work with Date and Timestamp information!" 
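For reference, the same yearly summary can also be written as one chained expression over the df loaded above, aliasing the aggregate directly instead of renaming avg(Year) afterwards — a small sketch:

from pyspark.sql.functions import year, avg, format_number

# Group by the extracted year and format the mean close in one pass
df.groupBy(year('Date').alias('Year'))\
  .agg(format_number(avg('Close'), 2).alias('Mean Close'))\
  .orderBy('Year')\
  .show()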
426 | ] 427 | } 428 | ], 429 | "metadata": { 430 | "anaconda-cloud": {}, 431 | "kernelspec": { 432 | "display_name": "Python 3", 433 | "language": "python", 434 | "name": "python3" 435 | }, 436 | "language_info": { 437 | "codemirror_mode": { 438 | "name": "ipython", 439 | "version": 3 440 | }, 441 | "file_extension": ".py", 442 | "mimetype": "text/x-python", 443 | "name": "python", 444 | "nbconvert_exporter": "python", 445 | "pygments_lexer": "ipython3", 446 | "version": "3.7.9" 447 | } 448 | }, 449 | "nbformat": 4, 450 | "nbformat_minor": 1 451 | } 452 | -------------------------------------------------------------------------------- /GroupBy_and_Aggregate_Functions.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# GroupBy and Aggregate Functions\n", 8 | "\n", 9 | "GroupBy allows you to group rows together based off some column value, for example, you could group together sales data by the day the sale occured, or group repeast customer data based off the name of the customer. Once you've performed the GroupBy operation you can use an aggregate function off that data. An aggregate function aggregates multiple rows of data into a single output, such as taking the sum of inputs, or counting the number of inputs.\n" 10 | ] 11 | }, 12 | { 13 | "cell_type": "code", 14 | "execution_count": 1, 15 | "metadata": { 16 | "collapsed": true 17 | }, 18 | "outputs": [], 19 | "source": [ 20 | "from pyspark.sql import SparkSession" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "# May take a little while on a local computer\n", 32 | "spark = SparkSession.builder.appName(\"groupbyagg\").getOrCreate()" 33 | ] 34 | }, 35 | { 36 | "cell_type": "markdown", 37 | "metadata": {}, 38 | "source": [ 39 | "Read in the customer sales data" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 3, 45 | "metadata": { 46 | "collapsed": true 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "df = spark.read.csv('sales_info.csv',inferSchema=True,header=True)" 51 | ] 52 | }, 53 | { 54 | "cell_type": "code", 55 | "execution_count": 4, 56 | "metadata": {}, 57 | "outputs": [ 58 | { 59 | "name": "stdout", 60 | "output_type": "stream", 61 | "text": [ 62 | "root\n", 63 | " |-- Company: string (nullable = true)\n", 64 | " |-- Person: string (nullable = true)\n", 65 | " |-- Sales: double (nullable = true)\n", 66 | "\n" 67 | ] 68 | } 69 | ], 70 | "source": [ 71 | "df.printSchema()" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 8, 77 | "metadata": {}, 78 | "outputs": [ 79 | { 80 | "name": "stdout", 81 | "output_type": "stream", 82 | "text": [ 83 | "+-------+-------+-----+\n", 84 | "|Company| Person|Sales|\n", 85 | "+-------+-------+-----+\n", 86 | "| GOOG| Sam|200.0|\n", 87 | "| GOOG|Charlie|120.0|\n", 88 | "| GOOG| Frank|340.0|\n", 89 | "| MSFT| Tina|600.0|\n", 90 | "| MSFT| Amy|124.0|\n", 91 | "| MSFT|Vanessa|243.0|\n", 92 | "| FB| Carl|870.0|\n", 93 | "| FB| Sarah|350.0|\n", 94 | "| APPL| John|250.0|\n", 95 | "| APPL| Linda|130.0|\n", 96 | "| APPL| Mike|750.0|\n", 97 | "| APPL| Chris|350.0|\n", 98 | "+-------+-------+-----+\n", 99 | "\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "df.show()" 105 | ] 106 | }, 107 | { 108 | "cell_type": "markdown", 109 | "metadata": {}, 110 | "source": [ 111 | "Let's group together by company!" 
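The cells below apply mean(), count(), max(), min() and sum() to the grouped data one method at a time. For reference, several named aggregates can also be computed in a single agg() call — a short sketch assuming the same df:

from pyspark.sql import functions as F

# One pass over the grouped data, with readable column names
df.groupBy('Company').agg(
    F.count('Sales').alias('Sales Count'),
    F.round(F.avg('Sales'), 2).alias('Avg Sales'),
    F.max('Sales').alias('Max Sales')
).show()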
112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 9, 117 | "metadata": {}, 118 | "outputs": [ 119 | { 120 | "data": { 121 | "text/plain": [ 122 | "" 123 | ] 124 | }, 125 | "execution_count": 9, 126 | "metadata": {}, 127 | "output_type": "execute_result" 128 | } 129 | ], 130 | "source": [ 131 | "df.groupBy(\"Company\")" 132 | ] 133 | }, 134 | { 135 | "cell_type": "markdown", 136 | "metadata": {}, 137 | "source": [ 138 | "This returns a GroupedData object, off of which you can all various methods" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 10, 144 | "metadata": {}, 145 | "outputs": [ 146 | { 147 | "name": "stdout", 148 | "output_type": "stream", 149 | "text": [ 150 | "+-------+-----------------+\n", 151 | "|Company| avg(Sales)|\n", 152 | "+-------+-----------------+\n", 153 | "| APPL| 370.0|\n", 154 | "| GOOG| 220.0|\n", 155 | "| FB| 610.0|\n", 156 | "| MSFT|322.3333333333333|\n", 157 | "+-------+-----------------+\n", 158 | "\n" 159 | ] 160 | } 161 | ], 162 | "source": [ 163 | "# Mean\n", 164 | "df.groupBy(\"Company\").mean().show()" 165 | ] 166 | }, 167 | { 168 | "cell_type": "code", 169 | "execution_count": 11, 170 | "metadata": {}, 171 | "outputs": [ 172 | { 173 | "name": "stdout", 174 | "output_type": "stream", 175 | "text": [ 176 | "+-------+-----+\n", 177 | "|Company|count|\n", 178 | "+-------+-----+\n", 179 | "| APPL| 4|\n", 180 | "| GOOG| 3|\n", 181 | "| FB| 2|\n", 182 | "| MSFT| 3|\n", 183 | "+-------+-----+\n", 184 | "\n" 185 | ] 186 | } 187 | ], 188 | "source": [ 189 | "# Count\n", 190 | "df.groupBy(\"Company\").count().show()" 191 | ] 192 | }, 193 | { 194 | "cell_type": "code", 195 | "execution_count": 12, 196 | "metadata": {}, 197 | "outputs": [ 198 | { 199 | "name": "stdout", 200 | "output_type": "stream", 201 | "text": [ 202 | "+-------+----------+\n", 203 | "|Company|max(Sales)|\n", 204 | "+-------+----------+\n", 205 | "| APPL| 750.0|\n", 206 | "| GOOG| 340.0|\n", 207 | "| FB| 870.0|\n", 208 | "| MSFT| 600.0|\n", 209 | "+-------+----------+\n", 210 | "\n" 211 | ] 212 | } 213 | ], 214 | "source": [ 215 | "# Max\n", 216 | "df.groupBy(\"Company\").max().show()" 217 | ] 218 | }, 219 | { 220 | "cell_type": "code", 221 | "execution_count": 13, 222 | "metadata": {}, 223 | "outputs": [ 224 | { 225 | "name": "stdout", 226 | "output_type": "stream", 227 | "text": [ 228 | "+-------+----------+\n", 229 | "|Company|min(Sales)|\n", 230 | "+-------+----------+\n", 231 | "| APPL| 130.0|\n", 232 | "| GOOG| 120.0|\n", 233 | "| FB| 350.0|\n", 234 | "| MSFT| 124.0|\n", 235 | "+-------+----------+\n", 236 | "\n" 237 | ] 238 | } 239 | ], 240 | "source": [ 241 | "# Min\n", 242 | "df.groupBy(\"Company\").min().show()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 15, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "name": "stdout", 252 | "output_type": "stream", 253 | "text": [ 254 | "+-------+----------+\n", 255 | "|Company|sum(Sales)|\n", 256 | "+-------+----------+\n", 257 | "| APPL| 1480.0|\n", 258 | "| GOOG| 660.0|\n", 259 | "| FB| 1220.0|\n", 260 | "| MSFT| 967.0|\n", 261 | "+-------+----------+\n", 262 | "\n" 263 | ] 264 | } 265 | ], 266 | "source": [ 267 | "# Sum\n", 268 | "df.groupBy(\"Company\").sum().show()" 269 | ] 270 | }, 271 | { 272 | "cell_type": "markdown", 273 | "metadata": {}, 274 | "source": [ 275 | "Check out this link for more info on other methods:\n", 276 | "http://spark.apache.org/docs/latest/api/python/pyspark.sql.html#pyspark-sql-module\n", 277 | "\n", 278 | "Not all 
methods need a groupby call, instead one can just call the generalized **.agg()** method, that will call the aggregate across all rows in the dataframe column specified. It can take in arguments as a single column, or create multiple aggregate calls all at once using dictionary notation.\n", 279 | "\n", 280 | "For example:" 281 | ] 282 | }, 283 | { 284 | "cell_type": "code", 285 | "execution_count": 18, 286 | "metadata": {}, 287 | "outputs": [ 288 | { 289 | "name": "stdout", 290 | "output_type": "stream", 291 | "text": [ 292 | "+----------+\n", 293 | "|max(Sales)|\n", 294 | "+----------+\n", 295 | "| 870.0|\n", 296 | "+----------+\n", 297 | "\n" 298 | ] 299 | } 300 | ], 301 | "source": [ 302 | "# Max sales across everything\n", 303 | "df.agg({'Sales':'max'}).show()" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 22, 309 | "metadata": { 310 | "collapsed": true 311 | }, 312 | "outputs": [], 313 | "source": [ 314 | "# Could have done this on the group by object as well:" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 23, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "grouped = df.groupBy(\"Company\")" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 25, 331 | "metadata": {}, 332 | "outputs": [ 333 | { 334 | "name": "stdout", 335 | "output_type": "stream", 336 | "text": [ 337 | "+-------+----------+\n", 338 | "|Company|max(Sales)|\n", 339 | "+-------+----------+\n", 340 | "| APPL| 750.0|\n", 341 | "| GOOG| 340.0|\n", 342 | "| FB| 870.0|\n", 343 | "| MSFT| 600.0|\n", 344 | "+-------+----------+\n", 345 | "\n" 346 | ] 347 | } 348 | ], 349 | "source": [ 350 | "grouped.agg({\"Sales\":'max'}).show()" 351 | ] 352 | }, 353 | { 354 | "cell_type": "markdown", 355 | "metadata": {}, 356 | "source": [ 357 | "## Functions\n" 358 | ] 359 | }, 360 | { 361 | "cell_type": "code", 362 | "execution_count": 36, 363 | "metadata": {}, 364 | "outputs": [], 365 | "source": [ 366 | "from pyspark.sql.functions import countDistinct, avg,stddev" 367 | ] 368 | }, 369 | { 370 | "cell_type": "code", 371 | "execution_count": 29, 372 | "metadata": {}, 373 | "outputs": [ 374 | { 375 | "name": "stdout", 376 | "output_type": "stream", 377 | "text": [ 378 | "+---------------------+\n", 379 | "|count(DISTINCT Sales)|\n", 380 | "+---------------------+\n", 381 | "| 11|\n", 382 | "+---------------------+\n", 383 | "\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "df.select(countDistinct(\"Sales\")).show()" 389 | ] 390 | }, 391 | { 392 | "cell_type": "markdown", 393 | "metadata": {}, 394 | "source": [ 395 | "Often you will want to change the name, use the .alias() method for this:" 396 | ] 397 | }, 398 | { 399 | "cell_type": "code", 400 | "execution_count": 31, 401 | "metadata": {}, 402 | "outputs": [ 403 | { 404 | "name": "stdout", 405 | "output_type": "stream", 406 | "text": [ 407 | "+--------------+\n", 408 | "|Distinct Sales|\n", 409 | "+--------------+\n", 410 | "| 11|\n", 411 | "+--------------+\n", 412 | "\n" 413 | ] 414 | } 415 | ], 416 | "source": [ 417 | "df.select(countDistinct(\"Sales\").alias(\"Distinct Sales\")).show()" 418 | ] 419 | }, 420 | { 421 | "cell_type": "code", 422 | "execution_count": 35, 423 | "metadata": {}, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "+-----------------+\n", 430 | "| avg(Sales)|\n", 431 | "+-----------------+\n", 432 | "|360.5833333333333|\n", 433 | "+-----------------+\n", 434 | "\n" 435 
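The next cells tidy these figures up with alias() and format_number() in separate steps; both can also be folded into a single select — a brief sketch over the same df:

from pyspark.sql.functions import avg, stddev, format_number

# Aggregate, round, and rename in one select
df.select(
    format_number(avg('Sales'), 2).alias('Avg Sales'),
    format_number(stddev('Sales'), 2).alias('Std Sales')
).show()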
| ] 436 | } 437 | ], 438 | "source": [ 439 | "df.select(avg('Sales')).show()" 440 | ] 441 | }, 442 | { 443 | "cell_type": "code", 444 | "execution_count": 38, 445 | "metadata": {}, 446 | "outputs": [ 447 | { 448 | "name": "stdout", 449 | "output_type": "stream", 450 | "text": [ 451 | "+------------------+\n", 452 | "|stddev_samp(Sales)|\n", 453 | "+------------------+\n", 454 | "|250.08742410799007|\n", 455 | "+------------------+\n", 456 | "\n" 457 | ] 458 | } 459 | ], 460 | "source": [ 461 | "df.select(stddev(\"Sales\")).show()" 462 | ] 463 | }, 464 | { 465 | "cell_type": "code", 466 | "execution_count": 39, 467 | "metadata": { 468 | "collapsed": true 469 | }, 470 | "outputs": [], 471 | "source": [ 472 | "from pyspark.sql.functions import format_number" 473 | ] 474 | }, 475 | { 476 | "cell_type": "code", 477 | "execution_count": 40, 478 | "metadata": { 479 | "collapsed": true 480 | }, 481 | "outputs": [], 482 | "source": [ 483 | "sales_std = df.select(stddev(\"Sales\").alias('std'))" 484 | ] 485 | }, 486 | { 487 | "cell_type": "code", 488 | "execution_count": 41, 489 | "metadata": {}, 490 | "outputs": [ 491 | { 492 | "name": "stdout", 493 | "output_type": "stream", 494 | "text": [ 495 | "+------------------+\n", 496 | "| std|\n", 497 | "+------------------+\n", 498 | "|250.08742410799007|\n", 499 | "+------------------+\n", 500 | "\n" 501 | ] 502 | } 503 | ], 504 | "source": [ 505 | "sales_std.show()" 506 | ] 507 | }, 508 | { 509 | "cell_type": "code", 510 | "execution_count": 42, 511 | "metadata": {}, 512 | "outputs": [ 513 | { 514 | "name": "stdout", 515 | "output_type": "stream", 516 | "text": [ 517 | "+---------------------+\n", 518 | "|format_number(std, 2)|\n", 519 | "+---------------------+\n", 520 | "| 250.09|\n", 521 | "+---------------------+\n", 522 | "\n" 523 | ] 524 | } 525 | ], 526 | "source": [ 527 | "# format_number(\"col_name\",decimal places)\n", 528 | "sales_std.select(format_number('std',2)).show()" 529 | ] 530 | }, 531 | { 532 | "cell_type": "markdown", 533 | "metadata": {}, 534 | "source": [ 535 | "## Order By\n", 536 | "\n", 537 | "You can easily sort with the orderBy method:" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": 43, 543 | "metadata": {}, 544 | "outputs": [ 545 | { 546 | "name": "stdout", 547 | "output_type": "stream", 548 | "text": [ 549 | "+-------+-------+-----+\n", 550 | "|Company| Person|Sales|\n", 551 | "+-------+-------+-----+\n", 552 | "| GOOG|Charlie|120.0|\n", 553 | "| MSFT| Amy|124.0|\n", 554 | "| APPL| Linda|130.0|\n", 555 | "| GOOG| Sam|200.0|\n", 556 | "| MSFT|Vanessa|243.0|\n", 557 | "| APPL| John|250.0|\n", 558 | "| GOOG| Frank|340.0|\n", 559 | "| FB| Sarah|350.0|\n", 560 | "| APPL| Chris|350.0|\n", 561 | "| MSFT| Tina|600.0|\n", 562 | "| APPL| Mike|750.0|\n", 563 | "| FB| Carl|870.0|\n", 564 | "+-------+-------+-----+\n", 565 | "\n" 566 | ] 567 | } 568 | ], 569 | "source": [ 570 | "# OrderBy\n", 571 | "# Ascending\n", 572 | "df.orderBy(\"Sales\").show()" 573 | ] 574 | }, 575 | { 576 | "cell_type": "code", 577 | "execution_count": 47, 578 | "metadata": {}, 579 | "outputs": [ 580 | { 581 | "name": "stdout", 582 | "output_type": "stream", 583 | "text": [ 584 | "+-------+-------+-----+\n", 585 | "|Company| Person|Sales|\n", 586 | "+-------+-------+-----+\n", 587 | "| FB| Carl|870.0|\n", 588 | "| APPL| Mike|750.0|\n", 589 | "| MSFT| Tina|600.0|\n", 590 | "| FB| Sarah|350.0|\n", 591 | "| APPL| Chris|350.0|\n", 592 | "| GOOG| Frank|340.0|\n", 593 | "| APPL| John|250.0|\n", 594 | "| MSFT|Vanessa|243.0|\n", 595 | "| 
GOOG| Sam|200.0|\n", 596 | "| APPL| Linda|130.0|\n", 597 | "| MSFT| Amy|124.0|\n", 598 | "| GOOG|Charlie|120.0|\n", 599 | "+-------+-------+-----+\n", 600 | "\n" 601 | ] 602 | } 603 | ], 604 | "source": [ 605 | "# Descending call off the column itself.\n", 606 | "df.orderBy(df[\"Sales\"].desc()).show()" 607 | ] 608 | }, 609 | { 610 | "cell_type": "markdown", 611 | "metadata": {}, 612 | "source": [ 613 | "Most basic functions you would expect to be available are, so make sure to check out the documentation!" 614 | ] 615 | } 616 | ], 617 | "metadata": { 618 | "anaconda-cloud": {}, 619 | "kernelspec": { 620 | "display_name": "Python 3", 621 | "language": "python", 622 | "name": "python3" 623 | }, 624 | "language_info": { 625 | "codemirror_mode": { 626 | "name": "ipython", 627 | "version": 3 628 | }, 629 | "file_extension": ".py", 630 | "mimetype": "text/x-python", 631 | "name": "python", 632 | "nbconvert_exporter": "python", 633 | "pygments_lexer": "ipython3", 634 | "version": "3.7.9" 635 | } 636 | }, 637 | "nbformat": 4, 638 | "nbformat_minor": 1 639 | } 640 | -------------------------------------------------------------------------------- /Linear_Regression_Consulting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Linear Regression Consulting" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "\n", 17 | "\n", 18 | "Here is data:\n", 19 | "\n", 20 | " Description: Measurements of ship size, capacity, crew, and age for 158 cruise\n", 21 | " ships.\n", 22 | "\n", 23 | "\n", 24 | " Variables/Columns\n", 25 | " Ship Name 1-20\n", 26 | " Cruise Line 21-40\n", 27 | " Age (as of 2013) 46-48\n", 28 | " Tonnage (1000s of tons) 50-56\n", 29 | " passengers (100s) 58-64\n", 30 | " Length (100s of feet) 66-72\n", 31 | " Cabins (100s) 74-80\n", 32 | " Passenger Density 82-88\n", 33 | " Crew (100s) 90-96\n", 34 | " \n" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 1, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from pyspark.sql import SparkSession" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 2, 51 | "metadata": {}, 52 | "outputs": [], 53 | "source": [ 54 | "spark = SparkSession.builder.appName('cruise').getOrCreate()" 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": 3, 60 | "metadata": {}, 61 | "outputs": [], 62 | "source": [ 63 | "df = spark.read.csv('cruise_ship_info.csv',inferSchema=True,header=True)" 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "root\n", 76 | " |-- Ship_name: string (nullable = true)\n", 77 | " |-- Cruise_line: string (nullable = true)\n", 78 | " |-- Age: integer (nullable = true)\n", 79 | " |-- Tonnage: double (nullable = true)\n", 80 | " |-- passengers: double (nullable = true)\n", 81 | " |-- length: double (nullable = true)\n", 82 | " |-- cabins: double (nullable = true)\n", 83 | " |-- passenger_density: double (nullable = true)\n", 84 | " |-- crew: double (nullable = true)\n", 85 | "\n" 86 | ] 87 | } 88 | ], 89 | "source": [ 90 | "df.printSchema()" 91 | ] 92 | }, 93 | { 94 | "cell_type": "code", 95 | "execution_count": 5, 96 | "metadata": {}, 97 | "outputs": [ 98 | { 99 | "name": "stdout", 100 | 
"output_type": "stream", 101 | "text": [ 102 | "+-----------+-----------+---+------------------+----------+------+------+-----------------+----+\n", 103 | "| Ship_name|Cruise_line|Age| Tonnage|passengers|length|cabins|passenger_density|crew|\n", 104 | "+-----------+-----------+---+------------------+----------+------+------+-----------------+----+\n", 105 | "| Journey| Azamara| 6|30.276999999999997| 6.94| 5.94| 3.55| 42.64|3.55|\n", 106 | "| Quest| Azamara| 6|30.276999999999997| 6.94| 5.94| 3.55| 42.64|3.55|\n", 107 | "|Celebration| Carnival| 26| 47.262| 14.86| 7.22| 7.43| 31.8| 6.7|\n", 108 | "| Conquest| Carnival| 11| 110.0| 29.74| 9.53| 14.88| 36.99|19.1|\n", 109 | "| Destiny| Carnival| 17| 101.353| 26.42| 8.92| 13.21| 38.36|10.0|\n", 110 | "| Ecstasy| Carnival| 22| 70.367| 20.52| 8.55| 10.2| 34.29| 9.2|\n", 111 | "| Elation| Carnival| 15| 70.367| 20.52| 8.55| 10.2| 34.29| 9.2|\n", 112 | "| Fantasy| Carnival| 23| 70.367| 20.56| 8.55| 10.22| 34.23| 9.2|\n", 113 | "|Fascination| Carnival| 19| 70.367| 20.52| 8.55| 10.2| 34.29| 9.2|\n", 114 | "| Freedom| Carnival| 6|110.23899999999999| 37.0| 9.51| 14.87| 29.79|11.5|\n", 115 | "| Glory| Carnival| 10| 110.0| 29.74| 9.51| 14.87| 36.99|11.6|\n", 116 | "| Holiday| Carnival| 28| 46.052| 14.52| 7.27| 7.26| 31.72| 6.6|\n", 117 | "|Imagination| Carnival| 18| 70.367| 20.52| 8.55| 10.2| 34.29| 9.2|\n", 118 | "|Inspiration| Carnival| 17| 70.367| 20.52| 8.55| 10.2| 34.29| 9.2|\n", 119 | "| Legend| Carnival| 11| 86.0| 21.24| 9.63| 10.62| 40.49| 9.3|\n", 120 | "| Liberty*| Carnival| 8| 110.0| 29.74| 9.51| 14.87| 36.99|11.6|\n", 121 | "| Miracle| Carnival| 9| 88.5| 21.24| 9.63| 10.62| 41.67|10.3|\n", 122 | "| Paradise| Carnival| 15| 70.367| 20.52| 8.55| 10.2| 34.29| 9.2|\n", 123 | "| Pride| Carnival| 12| 88.5| 21.24| 9.63| 11.62| 41.67| 9.3|\n", 124 | "| Sensation| Carnival| 20| 70.367| 20.52| 8.55| 10.2| 34.29| 9.2|\n", 125 | "+-----------+-----------+---+------------------+----------+------+------+-----------------+----+\n", 126 | "only showing top 20 rows\n", 127 | "\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "df.show()" 133 | ] 134 | }, 135 | { 136 | "cell_type": "code", 137 | "execution_count": 6, 138 | "metadata": {}, 139 | "outputs": [ 140 | { 141 | "name": "stdout", 142 | "output_type": "stream", 143 | "text": [ 144 | "+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+\n", 145 | "|summary| Age| Tonnage| passengers| length| cabins|passenger_density| crew|\n", 146 | "+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+\n", 147 | "| count| 158| 158| 158| 158| 158| 158| 158|\n", 148 | "| mean|15.689873417721518| 71.28467088607599|18.45740506329114|8.130632911392404| 8.830000000000005|39.90094936708861|7.794177215189873|\n", 149 | "| stddev| 7.615691058751413|37.229540025907866|9.677094775143416|1.793473548054825|4.4714172221480615| 8.63921711391542|3.503486564627034|\n", 150 | "| min| 4| 2.329| 0.66| 2.79| 0.33| 17.7| 0.59|\n", 151 | "| max| 48| 220.0| 54.0| 11.82| 27.0| 71.43| 21.0|\n", 152 | "+-------+------------------+------------------+-----------------+-----------------+------------------+-----------------+-----------------+\n", 153 | "\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "df.describe().show()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "markdown", 163 | "metadata": {}, 164 | "source": [ 165 | "## Dealing with the Cruise_line categorical variable\n", 
166 | "Ship Name is a useless arbitrary string, but the cruise_line itself may be useful. Let's make it into a categorical variable!" 167 | ] 168 | }, 169 | { 170 | "cell_type": "code", 171 | "execution_count": 7, 172 | "metadata": {}, 173 | "outputs": [ 174 | { 175 | "name": "stdout", 176 | "output_type": "stream", 177 | "text": [ 178 | "+-----------------+-----+\n", 179 | "| Cruise_line|count|\n", 180 | "+-----------------+-----+\n", 181 | "| Costa| 11|\n", 182 | "| P&O| 6|\n", 183 | "| Cunard| 3|\n", 184 | "|Regent_Seven_Seas| 5|\n", 185 | "| MSC| 8|\n", 186 | "| Carnival| 22|\n", 187 | "| Crystal| 2|\n", 188 | "| Orient| 1|\n", 189 | "| Princess| 17|\n", 190 | "| Silversea| 4|\n", 191 | "| Seabourn| 3|\n", 192 | "| Holland_American| 14|\n", 193 | "| Windstar| 3|\n", 194 | "| Disney| 2|\n", 195 | "| Norwegian| 13|\n", 196 | "| Oceania| 3|\n", 197 | "| Azamara| 2|\n", 198 | "| Celebrity| 10|\n", 199 | "| Star| 6|\n", 200 | "| Royal_Caribbean| 23|\n", 201 | "+-----------------+-----+\n", 202 | "\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "df.groupBy('Cruise_line').count().show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 8, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "data": { 217 | "text/plain": [ 218 | "[Row(Ship_name='Journey', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),\n", 219 | " Row(Ship_name='Quest', Cruise_line='Azamara', Age=6, Tonnage=30.276999999999997, passengers=6.94, length=5.94, cabins=3.55, passenger_density=42.64, crew=3.55, cruise_cat=16.0),\n", 220 | " Row(Ship_name='Celebration', Cruise_line='Carnival', Age=26, Tonnage=47.262, passengers=14.86, length=7.22, cabins=7.43, passenger_density=31.8, crew=6.7, cruise_cat=1.0),\n", 221 | " Row(Ship_name='Conquest', Cruise_line='Carnival', Age=11, Tonnage=110.0, passengers=29.74, length=9.53, cabins=14.88, passenger_density=36.99, crew=19.1, cruise_cat=1.0),\n", 222 | " Row(Ship_name='Destiny', Cruise_line='Carnival', Age=17, Tonnage=101.353, passengers=26.42, length=8.92, cabins=13.21, passenger_density=38.36, crew=10.0, cruise_cat=1.0)]" 223 | ] 224 | }, 225 | "execution_count": 8, 226 | "metadata": {}, 227 | "output_type": "execute_result" 228 | } 229 | ], 230 | "source": [ 231 | "from pyspark.ml.feature import StringIndexer\n", 232 | "indexer = StringIndexer(inputCol=\"Cruise_line\", outputCol=\"cruise_cat\")\n", 233 | "indexed = indexer.fit(df).transform(df)\n", 234 | "indexed.head(5)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 9, 240 | "metadata": { 241 | "collapsed": true 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "from pyspark.ml.linalg import Vectors\n", 246 | "from pyspark.ml.feature import VectorAssembler" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 10, 252 | "metadata": {}, 253 | "outputs": [ 254 | { 255 | "data": { 256 | "text/plain": [ 257 | "['Ship_name',\n", 258 | " 'Cruise_line',\n", 259 | " 'Age',\n", 260 | " 'Tonnage',\n", 261 | " 'passengers',\n", 262 | " 'length',\n", 263 | " 'cabins',\n", 264 | " 'passenger_density',\n", 265 | " 'crew',\n", 266 | " 'cruise_cat']" 267 | ] 268 | }, 269 | "execution_count": 10, 270 | "metadata": {}, 271 | "output_type": "execute_result" 272 | } 273 | ], 274 | "source": [ 275 | "indexed.columns" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 11, 281 | "metadata": { 282 | "collapsed": true 283 | }, 284 | 
"outputs": [], 285 | "source": [ 286 | "assembler = VectorAssembler(\n", 287 | " inputCols=['Age',\n", 288 | " 'Tonnage',\n", 289 | " 'passengers',\n", 290 | " 'length',\n", 291 | " 'cabins',\n", 292 | " 'passenger_density',\n", 293 | " 'cruise_cat'],\n", 294 | " outputCol=\"features\")" 295 | ] 296 | }, 297 | { 298 | "cell_type": "code", 299 | "execution_count": 12, 300 | "metadata": {}, 301 | "outputs": [], 302 | "source": [ 303 | "output = assembler.transform(indexed)" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 13, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "name": "stdout", 313 | "output_type": "stream", 314 | "text": [ 315 | "+--------------------+----+\n", 316 | "| features|crew|\n", 317 | "+--------------------+----+\n", 318 | "|[6.0,30.276999999...|3.55|\n", 319 | "|[6.0,30.276999999...|3.55|\n", 320 | "|[26.0,47.262,14.8...| 6.7|\n", 321 | "|[11.0,110.0,29.74...|19.1|\n", 322 | "|[17.0,101.353,26....|10.0|\n", 323 | "|[22.0,70.367,20.5...| 9.2|\n", 324 | "|[15.0,70.367,20.5...| 9.2|\n", 325 | "|[23.0,70.367,20.5...| 9.2|\n", 326 | "|[19.0,70.367,20.5...| 9.2|\n", 327 | "|[6.0,110.23899999...|11.5|\n", 328 | "|[10.0,110.0,29.74...|11.6|\n", 329 | "|[28.0,46.052,14.5...| 6.6|\n", 330 | "|[18.0,70.367,20.5...| 9.2|\n", 331 | "|[17.0,70.367,20.5...| 9.2|\n", 332 | "|[11.0,86.0,21.24,...| 9.3|\n", 333 | "|[8.0,110.0,29.74,...|11.6|\n", 334 | "|[9.0,88.5,21.24,9...|10.3|\n", 335 | "|[15.0,70.367,20.5...| 9.2|\n", 336 | "|[12.0,88.5,21.24,...| 9.3|\n", 337 | "|[20.0,70.367,20.5...| 9.2|\n", 338 | "+--------------------+----+\n", 339 | "only showing top 20 rows\n", 340 | "\n" 341 | ] 342 | } 343 | ], 344 | "source": [ 345 | "output.select(\"features\", \"crew\").show()" 346 | ] 347 | }, 348 | { 349 | "cell_type": "code", 350 | "execution_count": 14, 351 | "metadata": { 352 | "collapsed": true 353 | }, 354 | "outputs": [], 355 | "source": [ 356 | "final_data = output.select(\"features\", \"crew\")" 357 | ] 358 | }, 359 | { 360 | "cell_type": "code", 361 | "execution_count": 15, 362 | "metadata": { 363 | "collapsed": true 364 | }, 365 | "outputs": [], 366 | "source": [ 367 | "train_data,test_data = final_data.randomSplit([0.7,0.3])" 368 | ] 369 | }, 370 | { 371 | "cell_type": "code", 372 | "execution_count": 16, 373 | "metadata": {}, 374 | "outputs": [], 375 | "source": [ 376 | "from pyspark.ml.regression import LinearRegression\n", 377 | "# Create a Linear Regression Model object\n", 378 | "lr = LinearRegression(labelCol='crew')" 379 | ] 380 | }, 381 | { 382 | "cell_type": "code", 383 | "execution_count": 17, 384 | "metadata": { 385 | "collapsed": true 386 | }, 387 | "outputs": [], 388 | "source": [ 389 | "# Fit the model to the data and call this model lrModel\n", 390 | "lrModel = lr.fit(train_data)" 391 | ] 392 | }, 393 | { 394 | "cell_type": "code", 395 | "execution_count": 18, 396 | "metadata": {}, 397 | "outputs": [ 398 | { 399 | "name": "stdout", 400 | "output_type": "stream", 401 | "text": [ 402 | "Coefficients: [-0.0145423814068,0.0137445818936,-0.111000735058,0.422234330769,0.705574105078,-0.00631202648669,0.0306212943631] Intercept: -0.5598623529951635\n" 403 | ] 404 | } 405 | ], 406 | "source": [ 407 | "# Print the coefficients and intercept for linear regression\n", 408 | "print(\"Coefficients: {} Intercept: {}\".format(lrModel.coefficients,lrModel.intercept))" 409 | ] 410 | }, 411 | { 412 | "cell_type": "code", 413 | "execution_count": 19, 414 | "metadata": { 415 | "collapsed": true 416 | }, 417 | "outputs": [], 418 | "source": [ 
419 | "test_results = lrModel.evaluate(test_data)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": 20, 425 | "metadata": {}, 426 | "outputs": [ 427 | { 428 | "name": "stdout", 429 | "output_type": "stream", 430 | "text": [ 431 | "RMSE: 1.3174339720092743\n", 432 | "MSE: 1.7356322706041332\n", 433 | "R2: 0.8671622449217978\n" 434 | ] 435 | } 436 | ], 437 | "source": [ 438 | "print(\"RMSE: {}\".format(test_results.rootMeanSquaredError))\n", 439 | "print(\"MSE: {}\".format(test_results.meanSquaredError))\n", 440 | "print(\"R2: {}\".format(test_results.r2))" 441 | ] 442 | }, 443 | { 444 | "cell_type": "code", 445 | "execution_count": 21, 446 | "metadata": { 447 | "collapsed": true 448 | }, 449 | "outputs": [], 450 | "source": [ 451 | "# R2 of 0.86 is pretty good, let's check the data a little closer\n", 452 | "from pyspark.sql.functions import corr" 453 | ] 454 | }, 455 | { 456 | "cell_type": "code", 457 | "execution_count": 22, 458 | "metadata": {}, 459 | "outputs": [ 460 | { 461 | "name": "stdout", 462 | "output_type": "stream", 463 | "text": [ 464 | "+----------------------+\n", 465 | "|corr(crew, passengers)|\n", 466 | "+----------------------+\n", 467 | "| 0.9152341306065384|\n", 468 | "+----------------------+\n", 469 | "\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "df.select(corr('crew','passengers')).show()" 475 | ] 476 | }, 477 | { 478 | "cell_type": "code", 479 | "execution_count": 23, 480 | "metadata": {}, 481 | "outputs": [ 482 | { 483 | "name": "stdout", 484 | "output_type": "stream", 485 | "text": [ 486 | "+------------------+\n", 487 | "|corr(crew, cabins)|\n", 488 | "+------------------+\n", 489 | "|0.9508226063578497|\n", 490 | "+------------------+\n", 491 | "\n" 492 | ] 493 | } 494 | ], 495 | "source": [ 496 | "df.select(corr('crew','cabins')).show()" 497 | ] 498 | } 499 | ], 500 | "metadata": { 501 | "anaconda-cloud": {}, 502 | "kernelspec": { 503 | "display_name": "Python 3", 504 | "language": "python", 505 | "name": "python3" 506 | }, 507 | "language_info": { 508 | "codemirror_mode": { 509 | "name": "ipython", 510 | "version": 3 511 | }, 512 | "file_extension": ".py", 513 | "mimetype": "text/x-python", 514 | "name": "python", 515 | "nbconvert_exporter": "python", 516 | "pygments_lexer": "ipython3", 517 | "version": "3.7.9" 518 | } 519 | }, 520 | "nbformat": 4, 521 | "nbformat_minor": 1 522 | } 523 | -------------------------------------------------------------------------------- /Logistic_Regression_Consulting.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Logistic Regression Consulting \n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "markdown", 12 | "metadata": { 13 | "collapsed": true 14 | }, 15 | "source": [ 16 | "\n", 17 | "\n", 18 | "\n", 19 | "The data is :\n", 20 | "\n", 21 | " Name : Name of the latest contact at Company\n", 22 | " Age: Customer Age\n", 23 | " Total_Purchase: Total Ads Purchased\n", 24 | " Account_Manager: Binary 0=No manager, 1= Account manager assigned\n", 25 | " Years: Totaly Years as a customer\n", 26 | " Num_sites: Number of websites that use the service.\n", 27 | " Onboard_date: Date that the name of the latest contact was onboarded\n", 28 | " Location: Client HQ Address\n", 29 | " Company: Name of Client Company\n", 30 | " " 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 1, 36 | "metadata": { 37 | "collapsed": true 38 | }, 39 | "outputs": [], 40 | 
"source": [ 41 | "from pyspark.sql import SparkSession" 42 | ] 43 | }, 44 | { 45 | "cell_type": "code", 46 | "execution_count": 2, 47 | "metadata": { 48 | "collapsed": true 49 | }, 50 | "outputs": [], 51 | "source": [ 52 | "spark = SparkSession.builder.appName('logregconsult').getOrCreate()" 53 | ] 54 | }, 55 | { 56 | "cell_type": "code", 57 | "execution_count": 3, 58 | "metadata": { 59 | "collapsed": true 60 | }, 61 | "outputs": [], 62 | "source": [ 63 | "data = spark.read.csv('customer_churn.csv',inferSchema=True,\n", 64 | " header=True)" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": 37, 70 | "metadata": {}, 71 | "outputs": [ 72 | { 73 | "name": "stdout", 74 | "output_type": "stream", 75 | "text": [ 76 | "root\n", 77 | " |-- Names: string (nullable = true)\n", 78 | " |-- Age: double (nullable = true)\n", 79 | " |-- Total_Purchase: double (nullable = true)\n", 80 | " |-- Account_Manager: integer (nullable = true)\n", 81 | " |-- Years: double (nullable = true)\n", 82 | " |-- Num_Sites: double (nullable = true)\n", 83 | " |-- Onboard_date: timestamp (nullable = true)\n", 84 | " |-- Location: string (nullable = true)\n", 85 | " |-- Company: string (nullable = true)\n", 86 | " |-- Churn: integer (nullable = true)\n", 87 | "\n" 88 | ] 89 | } 90 | ], 91 | "source": [ 92 | "data.printSchema()" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "### Check out the data" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": 5, 105 | "metadata": {}, 106 | "outputs": [ 107 | { 108 | "name": "stdout", 109 | "output_type": "stream", 110 | "text": [ 111 | "+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+\n", 112 | "|summary| Age| Total_Purchase| Account_Manager| Years| Num_Sites| Churn|\n", 113 | "+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+\n", 114 | "| count| 900| 900| 900| 900| 900| 900|\n", 115 | "| mean|41.81666666666667|10062.82403333334|0.4811111111111111| 5.27315555555555| 8.587777777777777|0.16666666666666666|\n", 116 | "| stddev|6.127560416916251|2408.644531858096|0.4999208935073339|1.274449013194616|1.7648355920350969| 0.3728852122772358|\n", 117 | "| min| 22.0| 100.0| 0| 1.0| 3.0| 0|\n", 118 | "| max| 65.0| 18026.01| 1| 9.15| 14.0| 1|\n", 119 | "+-------+-----------------+-----------------+------------------+-----------------+------------------+-------------------+\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "data.describe().show()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 38, 131 | "metadata": {}, 132 | "outputs": [ 133 | { 134 | "data": { 135 | "text/plain": [ 136 | "['Names',\n", 137 | " 'Age',\n", 138 | " 'Total_Purchase',\n", 139 | " 'Account_Manager',\n", 140 | " 'Years',\n", 141 | " 'Num_Sites',\n", 142 | " 'Onboard_date',\n", 143 | " 'Location',\n", 144 | " 'Company',\n", 145 | " 'Churn']" 146 | ] 147 | }, 148 | "execution_count": 38, 149 | "metadata": {}, 150 | "output_type": "execute_result" 151 | } 152 | ], 153 | "source": [ 154 | "data.columns" 155 | ] 156 | }, 157 | { 158 | "cell_type": "markdown", 159 | "metadata": {}, 160 | "source": [ 161 | "### Format for MLlib\n", 162 | "\n" 163 | ] 164 | }, 165 | { 166 | "cell_type": "code", 167 | "execution_count": 7, 168 | "metadata": { 169 | "collapsed": true 170 | }, 171 | "outputs": [], 172 | "source": [ 173 | "from pyspark.ml.feature import 
VectorAssembler" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 8, 179 | "metadata": { 180 | "collapsed": true 181 | }, 182 | "outputs": [], 183 | "source": [ 184 | "assembler = VectorAssembler(inputCols=['Age',\n", 185 | " 'Total_Purchase',\n", 186 | " 'Account_Manager',\n", 187 | " 'Years',\n", 188 | " 'Num_Sites'],outputCol='features')" 189 | ] 190 | }, 191 | { 192 | "cell_type": "code", 193 | "execution_count": 9, 194 | "metadata": { 195 | "collapsed": true 196 | }, 197 | "outputs": [], 198 | "source": [ 199 | "output = assembler.transform(data)" 200 | ] 201 | }, 202 | { 203 | "cell_type": "code", 204 | "execution_count": 39, 205 | "metadata": { 206 | "collapsed": true 207 | }, 208 | "outputs": [], 209 | "source": [ 210 | "final_data = output.select('features','churn')" 211 | ] 212 | }, 213 | { 214 | "cell_type": "markdown", 215 | "metadata": {}, 216 | "source": [ 217 | "### Test Train Split" 218 | ] 219 | }, 220 | { 221 | "cell_type": "code", 222 | "execution_count": 40, 223 | "metadata": { 224 | "collapsed": true 225 | }, 226 | "outputs": [], 227 | "source": [ 228 | "train_churn,test_churn = final_data.randomSplit([0.7,0.3])" 229 | ] 230 | }, 231 | { 232 | "cell_type": "markdown", 233 | "metadata": {}, 234 | "source": [ 235 | "### Fit the model" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 12, 241 | "metadata": { 242 | "collapsed": true 243 | }, 244 | "outputs": [], 245 | "source": [ 246 | "from pyspark.ml.classification import LogisticRegression" 247 | ] 248 | }, 249 | { 250 | "cell_type": "code", 251 | "execution_count": 13, 252 | "metadata": { 253 | "collapsed": true 254 | }, 255 | "outputs": [], 256 | "source": [ 257 | "lr_churn = LogisticRegression(labelCol='churn')" 258 | ] 259 | }, 260 | { 261 | "cell_type": "code", 262 | "execution_count": 14, 263 | "metadata": { 264 | "collapsed": true 265 | }, 266 | "outputs": [], 267 | "source": [ 268 | "fitted_churn_model = lr_churn.fit(train_churn)" 269 | ] 270 | }, 271 | { 272 | "cell_type": "code", 273 | "execution_count": 15, 274 | "metadata": { 275 | "collapsed": true 276 | }, 277 | "outputs": [], 278 | "source": [ 279 | "training_sum = fitted_churn_model.summary" 280 | ] 281 | }, 282 | { 283 | "cell_type": "code", 284 | "execution_count": 41, 285 | "metadata": {}, 286 | "outputs": [ 287 | { 288 | "name": "stdout", 289 | "output_type": "stream", 290 | "text": [ 291 | "+-------+-------------------+-------------------+\n", 292 | "|summary| churn| prediction|\n", 293 | "+-------+-------------------+-------------------+\n", 294 | "| count| 632| 632|\n", 295 | "| mean|0.16772151898734178|0.13924050632911392|\n", 296 | "| stddev|0.37391474020622584| 0.3464715405857694|\n", 297 | "| min| 0| 0.0|\n", 298 | "| max| 1| 1.0|\n", 299 | "+-------+-------------------+-------------------+\n", 300 | "\n" 301 | ] 302 | } 303 | ], 304 | "source": [ 305 | "training_sum.predictions.describe().show()" 306 | ] 307 | }, 308 | { 309 | "cell_type": "markdown", 310 | "metadata": {}, 311 | "source": [ 312 | "### Evaluate results\n", 313 | "\n", 314 | "Let's evaluate the results on the data set we were given (using the test data)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 17, 320 | "metadata": { 321 | "collapsed": true 322 | }, 323 | "outputs": [], 324 | "source": [ 325 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 18, 331 | "metadata": { 332 | "collapsed": true 333 | 
}, 334 | "outputs": [], 335 | "source": [ 336 | "pred_and_labels = fitted_churn_model.evaluate(test_churn)" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 42, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "+--------------------+-----+--------------------+--------------------+----------+\n", 349 | "| features|churn| rawPrediction| probability|prediction|\n", 350 | "+--------------------+-----+--------------------+--------------------+----------+\n", 351 | "|[29.0,11274.46,1....| 0|[4.87277048314045...|[0.99240597473215...| 0.0|\n", 352 | "|[30.0,8403.78,1.0...| 0|[6.62706699787450...|[0.99867770995491...| 0.0|\n", 353 | "|[30.0,8874.83,0.0...| 0|[3.83233030863620...|[0.97880008629612...| 0.0|\n", 354 | "|[31.0,5387.75,0.0...| 0|[3.24742811458119...|[0.96258058552664...| 0.0|\n", 355 | "|[31.0,7073.61,0.0...| 0|[3.79911450433881...|[0.97809976923405...| 0.0|\n", 356 | "|[31.0,11297.57,1....| 1|[0.79751152640735...|[0.68944192100551...| 0.0|\n", 357 | "|[31.0,11743.24,0....| 0|[7.95951793845681...|[0.99965080051155...| 0.0|\n", 358 | "|[31.0,12264.68,1....| 0|[3.77281170068563...|[0.97752920495855...| 0.0|\n", 359 | "|[32.0,6367.22,1.0...| 0|[3.20017220414578...|[0.96084075703562...| 0.0|\n", 360 | "|[32.0,8575.71,0.0...| 0|[4.52857300143358...|[0.98931923918898...| 0.0|\n", 361 | "|[32.0,13630.93,0....| 0|[2.65527248795398...|[0.93433521477806...| 0.0|\n", 362 | "|[33.0,4711.89,0.0...| 0|[7.15048703176813...|[0.99921613300884...| 0.0|\n", 363 | "|[33.0,5738.82,0.0...| 0|[5.41122451678732...|[0.99555369000330...| 0.0|\n", 364 | "|[33.0,7750.54,1.0...| 0|[4.79456321095382...|[0.99179329500352...| 0.0|\n", 365 | "|[33.0,12638.51,1....| 0|[4.15248449384766...|[0.98451815808214...| 0.0|\n", 366 | "|[33.0,13314.19,0....| 0|[3.36990907218523...|[0.96675076852634...| 0.0|\n", 367 | "|[34.0,5447.16,1.0...| 0|[3.75995719191832...|[0.97724510462861...| 0.0|\n", 368 | "|[34.0,6461.86,1.0...| 0|[4.80281076454080...|[0.99186015320798...| 0.0|\n", 369 | "|[34.0,7818.13,0.0...| 0|[4.73016790727597...|[0.99125221001613...| 0.0|\n", 370 | "|[34.0,9265.59,0.0...| 0|[4.83050636756087...|[0.99208073716831...| 0.0|\n", 371 | "+--------------------+-----+--------------------+--------------------+----------+\n", 372 | "only showing top 20 rows\n", 373 | "\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "pred_and_labels.predictions.show()" 379 | ] 380 | }, 381 | { 382 | "cell_type": "markdown", 383 | "metadata": {}, 384 | "source": [ 385 | "### Using AUC" 386 | ] 387 | }, 388 | { 389 | "cell_type": "code", 390 | "execution_count": 24, 391 | "metadata": { 392 | "collapsed": true 393 | }, 394 | "outputs": [], 395 | "source": [ 396 | "churn_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction',\n", 397 | " labelCol='churn')" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 26, 403 | "metadata": {}, 404 | "outputs": [], 405 | "source": [ 406 | "auc = churn_eval.evaluate(pred_and_labels.predictions)" 407 | ] 408 | }, 409 | { 410 | "cell_type": "code", 411 | "execution_count": 43, 412 | "metadata": {}, 413 | "outputs": [ 414 | { 415 | "data": { 416 | "text/plain": [ 417 | "0.6866883116883117" 418 | ] 419 | }, 420 | "execution_count": 43, 421 | "metadata": {}, 422 | "output_type": "execute_result" 423 | } 424 | ], 425 | "source": [ 426 | "auc" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": {}, 432 | "source": [ 433 | "[Common question - what is 
a good AUC value?](https://stats.stackexchange.com/questions/113326/what-is-a-good-auc-for-a-precision-recall-curve)" 434 | ] 435 | }, 436 | { 437 | "cell_type": "markdown", 438 | "metadata": {}, 439 | "source": [ 440 | "### Predict on brand new unlabeled data\n", 441 | "\n", 442 | "We still need to evaluate the new_customers.csv file!" 443 | ] 444 | }, 445 | { 446 | "cell_type": "code", 447 | "execution_count": 28, 448 | "metadata": { 449 | "collapsed": true 450 | }, 451 | "outputs": [], 452 | "source": [ 453 | "final_lr_model = lr_churn.fit(final_data)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 29, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "new_customers = spark.read.csv('new_customers.csv',inferSchema=True,\n", 465 | " header=True)" 466 | ] 467 | }, 468 | { 469 | "cell_type": "code", 470 | "execution_count": 30, 471 | "metadata": {}, 472 | "outputs": [ 473 | { 474 | "name": "stdout", 475 | "output_type": "stream", 476 | "text": [ 477 | "root\n", 478 | " |-- Names: string (nullable = true)\n", 479 | " |-- Age: double (nullable = true)\n", 480 | " |-- Total_Purchase: double (nullable = true)\n", 481 | " |-- Account_Manager: integer (nullable = true)\n", 482 | " |-- Years: double (nullable = true)\n", 483 | " |-- Num_Sites: double (nullable = true)\n", 484 | " |-- Onboard_date: timestamp (nullable = true)\n", 485 | " |-- Location: string (nullable = true)\n", 486 | " |-- Company: string (nullable = true)\n", 487 | "\n" 488 | ] 489 | } 490 | ], 491 | "source": [ 492 | "new_customers.printSchema()" 493 | ] 494 | }, 495 | { 496 | "cell_type": "code", 497 | "execution_count": 31, 498 | "metadata": { 499 | "collapsed": true 500 | }, 501 | "outputs": [], 502 | "source": [ 503 | "test_new_customers = assembler.transform(new_customers)" 504 | ] 505 | }, 506 | { 507 | "cell_type": "code", 508 | "execution_count": 32, 509 | "metadata": {}, 510 | "outputs": [ 511 | { 512 | "name": "stdout", 513 | "output_type": "stream", 514 | "text": [ 515 | "root\n", 516 | " |-- Names: string (nullable = true)\n", 517 | " |-- Age: double (nullable = true)\n", 518 | " |-- Total_Purchase: double (nullable = true)\n", 519 | " |-- Account_Manager: integer (nullable = true)\n", 520 | " |-- Years: double (nullable = true)\n", 521 | " |-- Num_Sites: double (nullable = true)\n", 522 | " |-- Onboard_date: timestamp (nullable = true)\n", 523 | " |-- Location: string (nullable = true)\n", 524 | " |-- Company: string (nullable = true)\n", 525 | " |-- features: vector (nullable = true)\n", 526 | "\n" 527 | ] 528 | } 529 | ], 530 | "source": [ 531 | "test_new_customers.printSchema()" 532 | ] 533 | }, 534 | { 535 | "cell_type": "code", 536 | "execution_count": 33, 537 | "metadata": { 538 | "collapsed": true 539 | }, 540 | "outputs": [], 541 | "source": [ 542 | "final_results = final_lr_model.transform(test_new_customers)" 543 | ] 544 | }, 545 | { 546 | "cell_type": "code", 547 | "execution_count": 35, 548 | "metadata": {}, 549 | "outputs": [ 550 | { 551 | "name": "stdout", 552 | "output_type": "stream", 553 | "text": [ 554 | "+----------------+----------+\n", 555 | "| Company|prediction|\n", 556 | "+----------------+----------+\n", 557 | "| King Ltd| 0.0|\n", 558 | "| Cannon-Benson| 1.0|\n", 559 | "|Barron-Robertson| 1.0|\n", 560 | "| Sexton-Golden| 1.0|\n", 561 | "| Wood LLC| 0.0|\n", 562 | "| Parks-Robbins| 1.0|\n", 563 | "+----------------+----------+\n", 564 | "\n" 565 | ] 566 | } 567 | ], 568 | "source": [ 569 | 
"final_results.select('Company','prediction').show()" 570 | ] 571 | } 572 | ], 573 | "metadata": { 574 | "anaconda-cloud": {}, 575 | "kernelspec": { 576 | "display_name": "Python 3", 577 | "language": "python", 578 | "name": "python3" 579 | }, 580 | "language_info": { 581 | "codemirror_mode": { 582 | "name": "ipython", 583 | "version": 3 584 | }, 585 | "file_extension": ".py", 586 | "mimetype": "text/x-python", 587 | "name": "python", 588 | "nbconvert_exporter": "python", 589 | "pygments_lexer": "ipython3", 590 | "version": "3.7.9" 591 | } 592 | }, 593 | "nbformat": 4, 594 | "nbformat_minor": 1 595 | } 596 | -------------------------------------------------------------------------------- /Missing_Data.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Missing Data\n" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": { 14 | "collapsed": true 15 | }, 16 | "outputs": [], 17 | "source": [ 18 | "from pyspark.sql import SparkSession\n", 19 | "# May take a little while on a local computer\n", 20 | "spark = SparkSession.builder.appName(\"missingdata\").getOrCreate()" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "df = spark.read.csv(\"ContainsNull.csv\",header=True,inferSchema=True)" 32 | ] 33 | }, 34 | { 35 | "cell_type": "code", 36 | "execution_count": 3, 37 | "metadata": {}, 38 | "outputs": [ 39 | { 40 | "name": "stdout", 41 | "output_type": "stream", 42 | "text": [ 43 | "+----+-----+-----+\n", 44 | "| Id| Name|Sales|\n", 45 | "+----+-----+-----+\n", 46 | "|emp1| John| null|\n", 47 | "|emp2| null| null|\n", 48 | "|emp3| null|345.0|\n", 49 | "|emp4|Cindy|456.0|\n", 50 | "+----+-----+-----+\n", 51 | "\n" 52 | ] 53 | } 54 | ], 55 | "source": [ 56 | "df.show()" 57 | ] 58 | }, 59 | { 60 | "cell_type": "markdown", 61 | "metadata": {}, 62 | "source": [ 63 | "Notice how the data remains as a null." 64 | ] 65 | }, 66 | { 67 | "cell_type": "markdown", 68 | "metadata": {}, 69 | "source": [ 70 | "## Drop the missing data\n", 71 | "\n", 72 | "One can use the .na functions for missing data. The drop command has the following parameters:\n", 73 | "\n", 74 | " df.na.drop(how='any', thresh=None, subset=None)\n", 75 | " \n", 76 | " * param how: 'any' or 'all'.\n", 77 | " \n", 78 | " If 'any', drop a row if it contains any nulls.\n", 79 | " If 'all', drop a row only if all its values are null.\n", 80 | " \n", 81 | " * param thresh: int, default None\n", 82 | " \n", 83 | " If specified, drop rows that have less than `thresh` non-null values.\n", 84 | " This overwrites the `how` parameter.\n", 85 | " \n", 86 | " * param subset: \n", 87 | " optional list of column names to consider." 
88 | ] 89 | }, 90 | { 91 | "cell_type": "code", 92 | "execution_count": 6, 93 | "metadata": {}, 94 | "outputs": [ 95 | { 96 | "name": "stdout", 97 | "output_type": "stream", 98 | "text": [ 99 | "+----+-----+-----+\n", 100 | "| Id| Name|Sales|\n", 101 | "+----+-----+-----+\n", 102 | "|emp4|Cindy|456.0|\n", 103 | "+----+-----+-----+\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | "# Drop any row that contains missing data\n", 110 | "df.na.drop().show()" 111 | ] 112 | }, 113 | { 114 | "cell_type": "code", 115 | "execution_count": 8, 116 | "metadata": {}, 117 | "outputs": [ 118 | { 119 | "name": "stdout", 120 | "output_type": "stream", 121 | "text": [ 122 | "+----+-----+-----+\n", 123 | "| Id| Name|Sales|\n", 124 | "+----+-----+-----+\n", 125 | "|emp1| John| null|\n", 126 | "|emp3| null|345.0|\n", 127 | "|emp4|Cindy|456.0|\n", 128 | "+----+-----+-----+\n", 129 | "\n" 130 | ] 131 | } 132 | ], 133 | "source": [ 134 | "# Has to have at least 2 NON-null values\n", 135 | "df.na.drop(thresh=2).show()" 136 | ] 137 | }, 138 | { 139 | "cell_type": "code", 140 | "execution_count": 9, 141 | "metadata": {}, 142 | "outputs": [ 143 | { 144 | "name": "stdout", 145 | "output_type": "stream", 146 | "text": [ 147 | "+----+-----+-----+\n", 148 | "| Id| Name|Sales|\n", 149 | "+----+-----+-----+\n", 150 | "|emp3| null|345.0|\n", 151 | "|emp4|Cindy|456.0|\n", 152 | "+----+-----+-----+\n", 153 | "\n" 154 | ] 155 | } 156 | ], 157 | "source": [ 158 | "df.na.drop(subset=[\"Sales\"]).show()" 159 | ] 160 | }, 161 | { 162 | "cell_type": "code", 163 | "execution_count": 10, 164 | "metadata": {}, 165 | "outputs": [ 166 | { 167 | "name": "stdout", 168 | "output_type": "stream", 169 | "text": [ 170 | "+----+-----+-----+\n", 171 | "| Id| Name|Sales|\n", 172 | "+----+-----+-----+\n", 173 | "|emp4|Cindy|456.0|\n", 174 | "+----+-----+-----+\n", 175 | "\n" 176 | ] 177 | } 178 | ], 179 | "source": [ 180 | "df.na.drop(how='any').show()" 181 | ] 182 | }, 183 | { 184 | "cell_type": "code", 185 | "execution_count": 11, 186 | "metadata": {}, 187 | "outputs": [ 188 | { 189 | "name": "stdout", 190 | "output_type": "stream", 191 | "text": [ 192 | "+----+-----+-----+\n", 193 | "| Id| Name|Sales|\n", 194 | "+----+-----+-----+\n", 195 | "|emp1| John| null|\n", 196 | "|emp2| null| null|\n", 197 | "|emp3| null|345.0|\n", 198 | "|emp4|Cindy|456.0|\n", 199 | "+----+-----+-----+\n", 200 | "\n" 201 | ] 202 | } 203 | ], 204 | "source": [ 205 | "df.na.drop(how='all').show()" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Fill the missing values\n", 213 | "\n", 214 | "We can also fill the missing values with new values. If you have multiple nulls across multiple data types, Spark is actually smart enough to match up the data types. 
For example:" 215 | ] 216 | }, 217 | { 218 | "cell_type": "code", 219 | "execution_count": 15, 220 | "metadata": {}, 221 | "outputs": [ 222 | { 223 | "name": "stdout", 224 | "output_type": "stream", 225 | "text": [ 226 | "+----+---------+-----+\n", 227 | "| Id| Name|Sales|\n", 228 | "+----+---------+-----+\n", 229 | "|emp1| John| null|\n", 230 | "|emp2|NEW VALUE| null|\n", 231 | "|emp3|NEW VALUE|345.0|\n", 232 | "|emp4| Cindy|456.0|\n", 233 | "+----+---------+-----+\n", 234 | "\n" 235 | ] 236 | } 237 | ], 238 | "source": [ 239 | "df.na.fill('NEW VALUE').show()" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 16, 245 | "metadata": {}, 246 | "outputs": [ 247 | { 248 | "name": "stdout", 249 | "output_type": "stream", 250 | "text": [ 251 | "+----+-----+-----+\n", 252 | "| Id| Name|Sales|\n", 253 | "+----+-----+-----+\n", 254 | "|emp1| John| 0.0|\n", 255 | "|emp2| null| 0.0|\n", 256 | "|emp3| null|345.0|\n", 257 | "|emp4|Cindy|456.0|\n", 258 | "+----+-----+-----+\n", 259 | "\n" 260 | ] 261 | } 262 | ], 263 | "source": [ 264 | "df.na.fill(0).show()" 265 | ] 266 | }, 267 | { 268 | "cell_type": "markdown", 269 | "metadata": {}, 270 | "source": [ 271 | "Usually you should specify what columns you want to fill with the subset parameter" 272 | ] 273 | }, 274 | { 275 | "cell_type": "code", 276 | "execution_count": 17, 277 | "metadata": {}, 278 | "outputs": [ 279 | { 280 | "name": "stdout", 281 | "output_type": "stream", 282 | "text": [ 283 | "+----+-------+-----+\n", 284 | "| Id| Name|Sales|\n", 285 | "+----+-------+-----+\n", 286 | "|emp1| John| null|\n", 287 | "|emp2|No Name| null|\n", 288 | "|emp3|No Name|345.0|\n", 289 | "|emp4| Cindy|456.0|\n", 290 | "+----+-------+-----+\n", 291 | "\n" 292 | ] 293 | } 294 | ], 295 | "source": [ 296 | "df.na.fill('No Name',subset=['Name']).show()" 297 | ] 298 | }, 299 | { 300 | "cell_type": "markdown", 301 | "metadata": {}, 302 | "source": [ 303 | "A very common practice is to fill values with the mean value for the column, for example:" 304 | ] 305 | }, 306 | { 307 | "cell_type": "code", 308 | "execution_count": 23, 309 | "metadata": {}, 310 | "outputs": [ 311 | { 312 | "data": { 313 | "text/plain": [ 314 | "400.5" 315 | ] 316 | }, 317 | "execution_count": 23, 318 | "metadata": {}, 319 | "output_type": "execute_result" 320 | } 321 | ], 322 | "source": [ 323 | "from pyspark.sql.functions import mean\n", 324 | "mean_val = df.select(mean(df['Sales'])).collect()\n", 325 | "\n", 326 | "# Weird nested formatting of Row object!\n", 327 | "mean_val[0][0]" 328 | ] 329 | }, 330 | { 331 | "cell_type": "code", 332 | "execution_count": 24, 333 | "metadata": { 334 | "collapsed": true 335 | }, 336 | "outputs": [], 337 | "source": [ 338 | "mean_sales = mean_val[0][0]" 339 | ] 340 | }, 341 | { 342 | "cell_type": "code", 343 | "execution_count": 26, 344 | "metadata": {}, 345 | "outputs": [ 346 | { 347 | "name": "stdout", 348 | "output_type": "stream", 349 | "text": [ 350 | "+----+-----+-----+\n", 351 | "| Id| Name|Sales|\n", 352 | "+----+-----+-----+\n", 353 | "|emp1| John|400.5|\n", 354 | "|emp2| null|400.5|\n", 355 | "|emp3| null|345.0|\n", 356 | "|emp4|Cindy|456.0|\n", 357 | "+----+-----+-----+\n", 358 | "\n" 359 | ] 360 | } 361 | ], 362 | "source": [ 363 | "df.na.fill(mean_sales,[\"Sales\"]).show()" 364 | ] 365 | }, 366 | { 367 | "cell_type": "code", 368 | "execution_count": 28, 369 | "metadata": {}, 370 | "outputs": [ 371 | { 372 | "name": "stdout", 373 | "output_type": "stream", 374 | "text": [ 375 | "+----+-----+-----+\n", 376 | "| Id| 
Name|Sales|\n", 377 | "+----+-----+-----+\n", 378 | "|emp1| John|400.5|\n", 379 | "|emp2| null|400.5|\n", 380 | "|emp3| null|345.0|\n", 381 | "|emp4|Cindy|456.0|\n", 382 | "+----+-----+-----+\n", 383 | "\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "# One (very ugly) one-liner\n", 389 | "df.na.fill(df.select(mean(df['Sales'])).collect()[0][0],['Sales']).show()" 390 | ] 391 | } 392 | ], 393 | "metadata": { 394 | "anaconda-cloud": {}, 395 | "kernelspec": { 396 | "display_name": "Python 3", 397 | "language": "python", 398 | "name": "python3" 399 | }, 400 | "language_info": { 401 | "codemirror_mode": { 402 | "name": "ipython", 403 | "version": 3 404 | }, 405 | "file_extension": ".py", 406 | "mimetype": "text/x-python", 407 | "name": "python", 408 | "nbconvert_exporter": "python", 409 | "pygments_lexer": "ipython3", 410 | "version": "3.7.9" 411 | } 412 | }, 413 | "nbformat": 4, 414 | "nbformat_minor": 1 415 | } 416 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Apache Spark 2 | 3 | [![AppVeyor Build](https://img.shields.io/appveyor/ci/ApacheSoftwareFoundation/spark/master.svg?style=plastic&logo=appveyor)](https://ci.appveyor.com/project/ApacheSoftwareFoundation/spark) 4 | [![PySpark Coverage](https://img.shields.io/badge/dynamic/xml.svg?label=pyspark%20coverage&url=https%3A%2F%2Fspark-test.github.io%2Fpyspark-coverage-site&query=%2Fhtml%2Fbody%2Fdiv%5B1%5D%2Fdiv%2Fh1%2Fspan&colorB=brightgreen&style=plastic)](https://spark-test.github.io/pyspark-coverage-site) 5 | [![Python 3.5](https://img.shields.io/badge/python-3.5-blue.svg)](https://www.python.org/downloads/release/python-360/) 6 | [![](https://img.shields.io/badge/Spark-v2.4.0-brigh)](https://spark.apache.org/) 7 | 8 | Spark is a unified analytics engine for large-scale data processing. It provides high-level APIs in Scala, Java, Python, and R, and an optimized engine that supports general computation graphs for data analysis. It also supports a rich set of higher-level tools including Spark SQL for SQL and DataFrames, MLlib for machine learning, GraphX for graph processing, and Structured Streaming for stream processing. 9 | 10 | **Spark MLlib** library for Machine Learning provides a Collaborative Filtering implementation by using Alternating Least Squares. The implementation in MLlib has these parameters: 11 | 12 | * numBlocks is the number of blocks used to parallelize computation (set to -1 to auto-configure). 13 | * rank is the number of latent factors in the model. 14 | * iterations is the number of iterations to run. 15 | * lambda specifies the regularization parameter in ALS. 16 | * implicitPrefs specifies whether to use the explicit feedback ALS variant or one adapted for implicit feedback data. 17 | * alpha is a parameter applicable to the implicit feedback variant of ALS that governs the baseline confidence in preference observations. 18 | 19 | ## Data preprocessoing 20 | ### 1. Missing data 21 | Often data sources are incomplete, which means you will have missing data, you have 3 basic options for filling in missing data (you will personally have to make the decision for what is the right approach: 22 | 23 | * Just keep the missing data points. 24 | * Drop them missing data points (including the entire row) 25 | * Fill them in with some other value. 
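As a minimal sketch of the last two options in PySpark (assuming a DataFrame `df` loaded from the `ContainsNull.csv` file shipped in the `data` folder; the path and app name here are illustrative):

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import mean

spark = SparkSession.builder.appName("missing_data_example").getOrCreate()
df = spark.read.csv("data/ContainsNull.csv", header=True, inferSchema=True)

# Option 2: drop any row that contains a null value
df.na.drop().show()

# Option 3a: fill string columns with a placeholder and numeric columns with 0
df.na.fill("No Name", subset=["Name"]).na.fill(0, subset=["Sales"]).show()

# Option 3b: fill a numeric column with its mean value
mean_sales = df.select(mean(df["Sales"])).collect()[0][0]
df.na.fill(mean_sales, subset=["Sales"]).show()
```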
26 | 27 | 28 | # Spark for Machine Learning 29 | 30 | - Linear regression 31 | - Logistic regression 32 | - Natural Language Processing (NLP) 33 | - Tree methods 34 | - Recommender System 35 | 36 | ## Data Transformations 37 | 38 | You won't always get data in a convenient format; often you will have to deal with data that is non-numerical, such as customer names, zip codes, or country names. 39 | 40 | A big part of working with data is using your own domain knowledge to build an intuition of how to deal with it: sometimes the best course of action is to drop the data, other times feature engineering is the way to go, and sometimes you can transform the data into something the machine learning algorithms will understand. 41 | 42 | Spark has several built-in methods for dealing with these transformations; check them all out here: http://spark.apache.org/docs/latest/ml-features.html 43 | 44 | 45 | ### VectorAssembler 46 | 47 | VectorAssembler is a transformer that combines a given list of columns into a single vector column. It is useful for combining raw features and features generated by different feature transformers into a single feature vector, in order to train ML models like logistic regression and decision trees. VectorAssembler accepts the following input column types: all numeric types, boolean type, and vector type. In each row, the values of the input columns will be concatenated into a vector in the specified order. 48 | 49 | Assume that we have a DataFrame with the columns id, hour, mobile, userFeatures, and clicked: 50 | 51 | id | hour | mobile | userFeatures | clicked 52 | ----|------|--------|------------------|--------- 53 | 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 54 | 55 | userFeatures is a vector column that contains three user features. We want to combine hour, mobile, and userFeatures into a single feature vector called features and use it to predict clicked or not. If we set VectorAssembler’s input columns to hour, mobile, and userFeatures and output column to features, after transformation we should get the following DataFrame: 56 | 57 | id | hour | mobile | userFeatures | clicked | features 58 | ----|------|--------|------------------|---------|----------------------------- 59 | 0 | 18 | 1.0 | [0.0, 10.0, 0.5] | 1.0 | [18.0, 1.0, 0.0, 10.0, 0.5] 60 | 61 | 62 | ## Recommender System 63 | 64 | The image below (from Wikipedia) shows an example of collaborative filtering. At first, people rate different items (like videos, images, games). Then, the system makes predictions about a user's rating for an item not rated yet. The new predictions are built upon the existing ratings of other users whose ratings are similar to those of the active user. In the image, the system predicts that the user will not like the video. 65 | 66 | 67 | 68 | 69 | With collaborative filtering, we make predictions (filtering) about the interests of a user by collecting preferences or taste information from many users (collaborating). The underlying assumption is that if a user A has the same opinion as a user B on an issue, A is more likely to have B's opinion on a different issue x than to have the opinion on x of a user chosen randomly. 70 | 71 | 72 | ## Spark Streaming 73 | 74 | Streaming is something that is advancing and changing fast: there are multiple new libraries every year, new and different services are always popping up, and what is in this notebook may or may not apply to you.
Maybe you're looking for something specific on Kafka, or maybe you are looking for streaming from Twitter, in which case Spark might be overkill for what you really want. Realistically speaking, each situation is going to require a customized solution, and this course is never going to be able to supply a one-size-fits-all answer. 75 | 76 | 77 | Spark has pretty well-known streaming capabilities. If streaming is something you've found yourself needing at work, then you are probably familiar with some of these concepts already, in which case you may find it more useful to jump straight to the official documentation here: 78 | 79 | http://spark.apache.org/docs/latest/streaming-programming-guide.html#spark-streaming-programming-guide 80 | 81 | It is really a great guide, but keep in mind that some of the features are restricted to Scala at this time (Spark 2.1); hopefully they will be expanded to the Python API in the future! 82 | 83 | *Twitter* is a great source for streaming because it's something most people already have an intuitive understanding of, you can visit the site yourself, and a lot of streaming technology has come out of Twitter as a company. You don't get access to the entire "firehose" of Twitter without paying for it, but that would be too much for us to handle anyway, so we'll be more than fine with the freely available API access. 84 | 85 | **Spark Streaming** is an extension of the core Spark API that enables scalable, high-throughput, fault-tolerant stream processing of live data streams. Data can be ingested from many sources like Kafka, Flume, Kinesis, or TCP sockets, and can be processed using complex algorithms expressed with high-level functions like map, reduce, join and window. Finally, processed data can be pushed out to filesystems, databases, and live dashboards. In fact, you can apply Spark’s machine learning and graph processing algorithms on data streams. 86 | 87 | 88 | 89 | Keep in mind that a few of these streaming capabilities are limited when it comes to Python, so you'll need to reference the documentation for the most up-to-date information. Also, the streaming contexts tend to follow the older RDD syntax, so a few things might look different from what we are used to seeing; you'll definitely want a good understanding of lambda expressions before continuing with this! 90 | 91 | There are Spark SQL modules for streaming: 92 | 93 | http://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=streaming#module-pyspark.sql.streaming 94 | 95 | But they are all still listed as experimental, so instead of showing you something that might break in the future, we'll stick to the RDD methods (which is what the documentation also currently shows for streaming). 96 | 97 | Internally, it works as follows: Spark Streaming receives live input data streams and divides the data into batches, which are then processed by the Spark engine to generate the final stream of results in batches.
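To make the batch model concrete, here is a minimal DStream word-count sketch using the RDD-style API described above. It assumes a text source on a local TCP socket (port 9999, e.g. opened with `nc -lk 9999`) and a 1-second batch interval; the app name, host, and port are illustrative.

```python
from pyspark import SparkContext
from pyspark.streaming import StreamingContext

sc = SparkContext("local[2]", "NetworkWordCount")
ssc = StreamingContext(sc, 1)  # 1-second micro-batches

# Each batch of lines received on the socket becomes an RDD inside the DStream
lines = ssc.socketTextStream("localhost", 9999)
counts = (lines.flatMap(lambda line: line.split(" "))
               .map(lambda word: (word, 1))
               .reduceByKey(lambda a, b: a + b))

counts.pprint()          # print the first elements of each batch
ssc.start()              # start receiving and processing data
ssc.awaitTermination()   # block until the stream is stopped
```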
98 | 99 | 100 | 101 | 102 | ## Database 103 | 104 | https://github.com/EBISYS/WaterWatch 105 | 106 | https://github.com/EBISYS/WaterWatch/blob/master/query.csv 107 | 108 | 109 | [Configure Pycharm](https://www.youtube.com/watch?v=RsALKtZvqFo) 110 | 111 | -------------------------------------------------------------------------------- /Read_Write_and_Validate_Data.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # coding: utf-8 3 | 4 | # # Reading Writing and Validating Data in PySpark 5 | # 6 | # Welcome to PySpark! 7 | # 8 | # In this first lecture, we will be covering: 9 | # 10 | # - Reading in Data 11 | # - Partioned Files 12 | # - Validating Data 13 | # - Specifying Data Types 14 | # - Writing Data 15 | # 16 | # Below you will see the script to begin your first PySpark instance. If you're ever curious 17 | # about how your PySpark instance is performing, Spark offers a neat Web UI with tons of information. 18 | # Just navigate to http://[driver]:4040 in your browswer where "drive" is you driver name. 19 | # If you are running PySpark locally, it would be http://localhost:4040 or you can use the hyperlink 20 | # automatically produced from the script below. 21 | 22 | # First let's create our PySpark instance! 23 | 24 | # PC users can use the next two lines of code but mac users don't need it 25 | # import findspark 26 | # findspark.init() 27 | 28 | import pyspark # only run after findspark.init() 29 | from pyspark.sql import SparkSession 30 | # May take awhile locally 31 | spark = SparkSession.builder.appName("ReadWriteValidate").getOrCreate() 32 | spark 33 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size() 34 | print("You are working with", cores, "core(s)") 35 | 36 | # ## Reading data 37 | # 38 | # A DataFrame is equivalent to a relational table in Spark SQL, and can be created using various 39 | # functions in SparkSession. 40 | # 41 | # First let's try reading in a csv file containing a list of students and their grades. 42 | # **Source:** https://www.kaggle.com/spscientist/students-performance-in-exams 43 | 44 | # Start by reading a basic csv dataset 45 | # Let Spark know about the header and infer the Schema types! 46 | 47 | path ="Datasets/" 48 | # Some csv data 49 | students = spark.read.csv(path+'students.csv',inferSchema=True,header=True) 50 | students.limit(4).toPandas() 51 | 52 | # **Parquet Files** 53 | # Now try reading in a parquet file. This is most common data type in the big data world. 54 | # Why? because it is the most compact file storage method (even better than zipped files!) 55 | 56 | parquet = spark.read.parquet(path+'users1.parquet') 57 | parquet.show(2) 58 | parquet.count() 59 | 60 | # **Partitioned Parquet Files** 61 | # 62 | # Actually most big datasets will be partitioned. Here is how you can collect all 63 | # the pieces (parts) of the dataset in one simple command. 64 | 65 | partitioned = spark.read.parquet(path+'users*') 66 | partitioned.show(2) 67 | 68 | # You can also opt to read in only a specific set of paritioned parquet files. 69 | # Say for example that you only wanted users1 and users2 and not users3 70 | 71 | # Note that the .option("basePath", path) option is used to override the automatic function 72 | # that will exclude the partitioned variable in resulting dataframe. 73 | # I prefer to have the partitioning info in my new dataframe personally. 
74 | users1_2 = spark.read.option("basePath", path).parquet(path+'users1.parquet', 75 | path+'users2.parquet') 76 | users1_2.show(4) 77 | users1_2.count() 78 | #----------------------------------------------------------- 79 | #in **AWS** cloud storing data in s3 buckets your code will be more like this... 80 | bucket = "my_bucket" 81 | key1 = "partition_test/Table1/CREATED_YEAR=2015/*" 82 | key2 = "partition_test/Table1/CREATED_YEAR=2017/*" 83 | key3 = "partition_test/Table1/CREATED_YEAR=2018/*" 84 | 85 | test_df = spark.read.parquet('s3://'+bucket+'/'+key1,\ 86 | 's3://'+bucket+'/'+key2,\ 87 | 's3://'+bucket+'/'+key3) 88 | 89 | test_df.show(1) 90 | #--------------------------------------------------------- 91 | 92 | # ## Validating Data 93 | # 94 | # If you want to validate that you dataframe was read in correct. We will get 95 | # into more detailed data evaluation later on but first we need to ensure that all the 96 | # variable types were infered correctly and that the values actually made it in... sometimes 97 | # they don't :) 98 | students.printSchema() #Prints out the schema in the tree format. 99 | students.columns 100 | students.describe 101 | # Get an inital view of your dataframe 102 | students.show(3) 103 | 104 | # Note the types here: 105 | print(type(students)) 106 | studentsPdf = students.toPandas() 107 | print(type(studentsPdf)) 108 | 109 | 110 | # A Solid Summary of your data: 111 | #show the data (like df.head()) 112 | print(students.printSchema()) 113 | print("") 114 | print(students.columns) 115 | print("") 116 | print(students.describe()) # Not so fond of this one but to each their own 117 | 118 | # If you need to get the type of just ONE column by name you can use this function: 119 | students.schema['math score'].dataType 120 | 121 | # Neat "describe" function 122 | students.describe(['math score']).show() 123 | 124 | 125 | # Summary function 126 | students.select("math score", "reading score","writing score").summary("count", "min", "25%", "75%", "max").show() 127 | 128 | # How to specify data types as you read in datasets. 129 | # Some data types make it easier to infer schema (like tabular formats such as csv which we will show later). 130 | # 131 | # However you often have to set the schema yourself if you aren't dealing with a .read method that 132 | # doesn't have inferSchema() built-in. 133 | # Spark has all the tools you need for this, it just requires a very specific structure: 134 | 135 | from pyspark.sql.types import StructField,StringType,IntegerType,StructType,DateType 136 | 137 | # Next we need to create the list of Structure fields 138 | # * :param name: string, name of the field. 139 | # * :param dataType: :class:`DataType` of the field. 140 | # * :param nullable: boolean, whether the field can be null (None) or not. 141 | 142 | data_schema = [StructField("name", StringType(), True), 143 | StructField("email", StringType(), True), 144 | StructField("city", StringType(), True), 145 | StructField("mac", StringType(), True), 146 | StructField("timestamp", DateType(), True), 147 | StructField("creditcard", StringType(), True)] 148 | 149 | final_struc = StructType(fields=data_schema) 150 | 151 | 152 | # a .json file 153 | # 154 | # **Source:** https://gist.github.com/raine/da15845f332a2fb8937b344504abfbe0 155 | 156 | people = spark.read.json(path+'people.json', schema=final_struc) 157 | 158 | people.printSchema() 159 | 160 | 161 | # ## Writing Data 162 | # First let's just try writing a simple csv file. 
163 | 164 | # Note the funky naming convention of the file in your output folder. There is no way to directly change this. 165 | students.write.mode("overwrite").csv('write_test.csv') 166 | 167 | # students.write.csv('write_test.csv') 168 | students.toPandas().to_csv('write_test2.csv') 169 | # Note the strange naming convention of the output file in the path that you specified. 170 | # Spark uses Hadoop File Format, which requires data to be partitioned - that's why you have part- files. 171 | # If you want to rename your written files to a more user friendly format, you can do that using the 172 | # method below: 173 | from py4j.java_gateway import java_import 174 | java_import(spark._jvm, 'org.apache.hadoop.fs.Path') 175 | 176 | fs = spark._jvm.org.apache.hadoop.fs.FileSystem.get(spark._jsc.hadoopConfiguration()) 177 | file = fs.globStatus(spark._jvm.Path('write_test.csv/part*'))[0].getPath().getName() 178 | fs.rename(spark._jvm.Path('write_test.csv/' + file), spark._jvm.Path('write_test2.csv')) 179 | #these two need to be different 180 | fs.delete(spark._jvm.Path('write_test.csv'), True) 181 | 182 | 183 | # Writting Parquet files 184 | # 185 | # Now let's try writing a parquet file. This is best practice for big data as it is the most compact 186 | # storage method. 187 | 188 | users1_2.write.mode("overwrite").parquet('parquet/') 189 | 190 | # Try this solution: https://stackoverflow.com/questions/59220832/unable-to-write-spark-dataframe-to-a-parquet-file-format-to-c-drive-in-pyspark 191 | # 192 | # Writting Partitioned Parquet Files 193 | # 194 | # Now try to write a partioned parquet file... super fun! 195 | 196 | users1_2.write.mode("overwrite").partitionBy("gender").parquet('part_parquet/') 197 | 198 | 199 | # #### Writting your own dataframes here! 200 | # You can also create your own dataframes directly here in your Juypter Notebook too if you want. 201 | 202 | 203 | values = [('Pear',10),('Orange',36),('Banana',123),('Kiwi',48),('Peach',16),('Strawberry',1)] 204 | df = spark.createDataFrame(values,['fruit','quantity']) 205 | df.show() 206 | 207 | -------------------------------------------------------------------------------- /Recommender_System.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Recommender Code Along\n", 8 | "\n", 9 | "[movielens data set](https://grouplens.org/datasets/movielens/). \n", 10 | "\n", 11 | "\n", 12 | "Looking for more datasets? 
https://gist.github.com/entaroadun/1653794" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "collapsed": true 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "from pyspark.sql import SparkSession" 24 | ] 25 | }, 26 | { 27 | "cell_type": "code", 28 | "execution_count": 12, 29 | "metadata": { 30 | "collapsed": true 31 | }, 32 | "outputs": [], 33 | "source": [ 34 | "spark = SparkSession.builder.appName('rec').getOrCreate()" 35 | ] 36 | }, 37 | { 38 | "cell_type": "code", 39 | "execution_count": 5, 40 | "metadata": { 41 | "collapsed": true 42 | }, 43 | "outputs": [], 44 | "source": [ 45 | "from pyspark.ml.evaluation import RegressionEvaluator\n", 46 | "from pyspark.ml.recommendation import ALS" 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 19, 52 | "metadata": {}, 53 | "outputs": [], 54 | "source": [ 55 | "data = spark.read.csv('movielens_ratings.csv',inferSchema=True,header=True)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "code", 60 | "execution_count": 21, 61 | "metadata": {}, 62 | "outputs": [ 63 | { 64 | "data": { 65 | "text/plain": [ 66 | "Row(movieId=2, rating=3.0, userId=0)" 67 | ] 68 | }, 69 | "execution_count": 21, 70 | "metadata": {}, 71 | "output_type": "execute_result" 72 | } 73 | ], 74 | "source": [ 75 | "data.head()" 76 | ] 77 | }, 78 | { 79 | "cell_type": "code", 80 | "execution_count": 24, 81 | "metadata": {}, 82 | "outputs": [ 83 | { 84 | "name": "stdout", 85 | "output_type": "stream", 86 | "text": [ 87 | "+-------+------------------+------------------+------------------+\n", 88 | "|summary| movieId| rating| userId|\n", 89 | "+-------+------------------+------------------+------------------+\n", 90 | "| count| 1501| 1501| 1501|\n", 91 | "| mean| 49.40572951365756|1.7741505662891406|14.383744170552964|\n", 92 | "| stddev|28.937034065088994| 1.187276166124803| 8.591040424293272|\n", 93 | "| min| 0| 1.0| 0|\n", 94 | "| max| 99| 5.0| 29|\n", 95 | "+-------+------------------+------------------+------------------+\n", 96 | "\n" 97 | ] 98 | } 99 | ], 100 | "source": [ 101 | "data.describe().show()" 102 | ] 103 | }, 104 | { 105 | "cell_type": "markdown", 106 | "metadata": {}, 107 | "source": [ 108 | "We can do a split to evaluate how well our model performed, but keep in mind that it is very hard to know conclusively how well a recommender system is truly working for some topics. Especially if subjectivity is involved, for example not everyone that loves star wars is going to love star trek, even though a recommendation system may suggest otherwise." 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 27, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "# Smaller dataset so we will use 0.8 / 0.2\n", 120 | "(training, test) = data.randomSplit([0.8, 0.2])" 121 | ] 122 | }, 123 | { 124 | "cell_type": "code", 125 | "execution_count": 28, 126 | "metadata": { 127 | "collapsed": true 128 | }, 129 | "outputs": [], 130 | "source": [ 131 | "# Build the recommendation model using ALS on the training data\n", 132 | "als = ALS(maxIter=5, regParam=0.01, userCol=\"userId\", itemCol=\"movieId\", ratingCol=\"rating\")\n", 133 | "model = als.fit(training)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "Now let's see hwo the model performed!" 
141 | ] 142 | }, 143 | { 144 | "cell_type": "code", 145 | "execution_count": 41, 146 | "metadata": { 147 | "collapsed": true 148 | }, 149 | "outputs": [], 150 | "source": [ 151 | "# Evaluate the model by computing the RMSE on the test data\n", 152 | "predictions = model.transform(test)" 153 | ] 154 | }, 155 | { 156 | "cell_type": "code", 157 | "execution_count": 43, 158 | "metadata": {}, 159 | "outputs": [ 160 | { 161 | "name": "stdout", 162 | "output_type": "stream", 163 | "text": [ 164 | "+-------+------+------+----------+\n", 165 | "|movieId|rating|userId|prediction|\n", 166 | "+-------+------+------+----------+\n", 167 | "| 31| 1.0| 27| 2.5976338|\n", 168 | "| 31| 1.0| 13| 2.113986|\n", 169 | "| 31| 1.0| 5| 1.3165921|\n", 170 | "| 31| 2.0| 25|0.16370271|\n", 171 | "| 85| 1.0| 28|-2.5285664|\n", 172 | "| 85| 1.0| 26|0.37620115|\n", 173 | "| 85| 1.0| 12| 0.8253538|\n", 174 | "| 85| 3.0| 1| 1.6069186|\n", 175 | "| 85| 1.0| 13| 2.2720711|\n", 176 | "| 85| 5.0| 16|0.80576146|\n", 177 | "| 85| 1.0| 15|0.54832166|\n", 178 | "| 85| 1.0| 4| 3.144216|\n", 179 | "| 65| 1.0| 28| -2.03051|\n", 180 | "| 65| 2.0| 3| 3.801642|\n", 181 | "| 65| 1.0| 2| 1.7128268|\n", 182 | "| 53| 3.0| 13| 3.4453833|\n", 183 | "| 53| 1.0| 6| 1.8362958|\n", 184 | "| 53| 1.0| 9| 1.8954519|\n", 185 | "| 78| 1.0| 22| 0.5302301|\n", 186 | "| 78| 1.0| 13| 0.5055496|\n", 187 | "+-------+------+------+----------+\n", 188 | "only showing top 20 rows\n", 189 | "\n" 190 | ] 191 | } 192 | ], 193 | "source": [ 194 | "predictions.show()" 195 | ] 196 | }, 197 | { 198 | "cell_type": "code", 199 | "execution_count": 29, 200 | "metadata": {}, 201 | "outputs": [ 202 | { 203 | "name": "stdout", 204 | "output_type": "stream", 205 | "text": [ 206 | "Root-mean-square error = 1.751143638387403\n" 207 | ] 208 | } 209 | ], 210 | "source": [ 211 | "evaluator = RegressionEvaluator(metricName=\"rmse\", labelCol=\"rating\",predictionCol=\"prediction\")\n", 212 | "rmse = evaluator.evaluate(predictions)\n", 213 | "print(\"Root-mean-square error = \" + str(rmse))" 214 | ] 215 | }, 216 | { 217 | "cell_type": "markdown", 218 | "metadata": {}, 219 | "source": [ 220 | "The RMSE described our error in terms of the stars rating column." 221 | ] 222 | }, 223 | { 224 | "cell_type": "markdown", 225 | "metadata": {}, 226 | "source": [ 227 | "So now that we have the model, how would you actually supply a recommendation to a user?\n", 228 | "\n", 229 | "The same way we did with the test data! 
For example:" 230 | ] 231 | }, 232 | { 233 | "cell_type": "code", 234 | "execution_count": 47, 235 | "metadata": { 236 | "collapsed": true 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "single_user = test.filter(test['userId']==11).select(['movieId','userId'])" 241 | ] 242 | }, 243 | { 244 | "cell_type": "code", 245 | "execution_count": 48, 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "+-------+------+\n", 253 | "|movieId|userId|\n", 254 | "+-------+------+\n", 255 | "| 0| 11|\n", 256 | "| 13| 11|\n", 257 | "| 18| 11|\n", 258 | "| 30| 11|\n", 259 | "| 66| 11|\n", 260 | "| 70| 11|\n", 261 | "| 75| 11|\n", 262 | "| 78| 11|\n", 263 | "| 79| 11|\n", 264 | "| 99| 11|\n", 265 | "+-------+------+\n", 266 | "\n" 267 | ] 268 | } 269 | ], 270 | "source": [ 271 | "# User had 10 ratings in the test data set \n", 272 | "# Realistically this should be some sort of hold out set!\n", 273 | "single_user.show()" 274 | ] 275 | }, 276 | { 277 | "cell_type": "code", 278 | "execution_count": 49, 279 | "metadata": { 280 | "collapsed": true 281 | }, 282 | "outputs": [], 283 | "source": [ 284 | "reccomendations = model.transform(single_user)" 285 | ] 286 | }, 287 | { 288 | "cell_type": "code", 289 | "execution_count": 54, 290 | "metadata": {}, 291 | "outputs": [ 292 | { 293 | "name": "stdout", 294 | "output_type": "stream", 295 | "text": [ 296 | "+-------+------+----------+\n", 297 | "|movieId|userId|prediction|\n", 298 | "+-------+------+----------+\n", 299 | "| 30| 11| 5.578189|\n", 300 | "| 13| 11| 3.257565|\n", 301 | "| 70| 11| 2.7580981|\n", 302 | "| 99| 11| 1.7420897|\n", 303 | "| 18| 11| 1.5150304|\n", 304 | "| 75| 11| 1.34218|\n", 305 | "| 79| 11| 0.9733073|\n", 306 | "| 66| 11| 0.5732717|\n", 307 | "| 78| 11| 0.4434041|\n", 308 | "| 0| 11| -1.85298|\n", 309 | "+-------+------+----------+\n", 310 | "\n" 311 | ] 312 | } 313 | ], 314 | "source": [ 315 | "reccomendations.orderBy('prediction',ascending=False).show()" 316 | ] 317 | } 318 | ], 319 | "metadata": { 320 | "anaconda-cloud": {}, 321 | "kernelspec": { 322 | "display_name": "Python 3", 323 | "language": "python", 324 | "name": "python3" 325 | }, 326 | "language_info": { 327 | "codemirror_mode": { 328 | "name": "ipython", 329 | "version": 3 330 | }, 331 | "file_extension": ".py", 332 | "mimetype": "text/x-python", 333 | "name": "python", 334 | "nbconvert_exporter": "python", 335 | "pygments_lexer": "ipython3", 336 | "version": "3.8.3" 337 | } 338 | }, 339 | "nbformat": 4, 340 | "nbformat_minor": 1 341 | } 342 | -------------------------------------------------------------------------------- /SQL_notebook.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/SQL_notebook.pdf -------------------------------------------------------------------------------- /TweetRead.py: -------------------------------------------------------------------------------- 1 | # import libraries 2 | import tweepy 3 | from tweepy import OAuthHandler 4 | from tweepy import Stream 5 | from tweepy.streaming import StreamListener 6 | import socket 7 | import json 8 | 9 | #%% 10 | # Set up your credentials 11 | consumer_key='' 12 | consumer_secret='' 13 | access_token ='' 14 | access_secret='' 15 | 16 | class TweetsListener(StreamListener): 17 | 18 | def __init__(self, csocket): 19 | self.client_socket = csocket 20 | 21 | def on_data(self, data): 22 | try: 23 | msg = 
json.loads( data ) 24 | print( msg['text'].encode('utf-8') ) 25 | self.client_socket.send( msg['text'].encode('utf-8') ) 26 | return True 27 | except BaseException as e: 28 | print("Error on_data: %s" % str(e)) 29 | return True 30 | 31 | def on_error(self, status): 32 | print(status) 33 | return True 34 | 35 | def sendData(c_socket): 36 | auth = OAuthHandler(consumer_key, consumer_secret) 37 | auth.set_access_token(access_token, access_secret) 38 | 39 | twitter_stream = Stream(auth, TweetsListener(c_socket)) 40 | twitter_stream.filter(track=['soccer']) 41 | 42 | if __name__ == "__main__": 43 | s = socket.socket() # Create a socket object 44 | host = "127.0.0.1" # Get local machine name 45 | port = 5555 # Reserve a port for your service. 46 | s.bind((host, port)) # Bind to the port 47 | 48 | print("Listening on port: %s" % str(port)) 49 | 50 | s.listen(5) # Now wait for client connection. 51 | c, addr = s.accept() # Establish connection with client. 52 | 53 | print( "Received request from: " + str( addr ) ) 54 | 55 | sendData( c ) 56 | -------------------------------------------------------------------------------- /big_data/SQL-in-Spark.py: -------------------------------------------------------------------------------- 1 | # PySpark provides two main options when it comes to using staight SQL. Spark SQL and SQL Transformer. 2 | # ## 1. Spark SQL 3 | # Spark TempView provides two functions that allow users to run **SQL** queries against a Spark DataFrame: 4 | # 5 | # - **createOrReplaceTempView:** The lifetime of this temporary view is tied to the SparkSession that was 6 | # used to create the dataset. It creates (or replaces if that view name already exists) a lazily evaluated 7 | # "view" that you can then use like a hive table in Spark SQL. It does not persist to memory unless you cache 8 | # the dataset that underpins the view. 9 | # - **createGlobalTempView:** The lifetime of this temporary view is tied to this Spark application. 10 | # This feature is useful when you want to share data among different sessions and keep alive until your 11 | # application ends. 12 | # 13 | 14 | import pyspark # only run after findspark.init() 15 | from pyspark.sql import SparkSession 16 | # May take awhile locally 17 | spark = SparkSession.builder.appName("SparkSQL").getOrCreate() 18 | 19 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size() 20 | print("You are working with", cores, "core(s)") 21 | spark 22 | 23 | ##**Source:** https://www.kaggle.com/r3w0p4/recorded-crime-data-at-police-force-area-level 24 | # Start by reading a basic csv dataset 25 | path = 'Datasets/' 26 | crime = spark.read.csv(path+"rec-crime-pfa.csv",header=True,inferSchema=True) 27 | 28 | # So, in order for us to perform SQL calls off of this dataframe, we will need to rename any variables 29 | # that have spaces in them. We will not be using the first variable so we'll leave that one as is, 30 | # but we will be using the last variable, so I will go ahead and change that to Count so we can work with it. 31 | 32 | df = crime.withColumnRenamed('Rolling year total number of offences','Count') 33 | #.withColumn("12 months ending", crime["12 months ending"].cast(DateType())). 
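
# If you also want "12 months ending" as a proper date (a sketch -- the date pattern below
# is an assumption, check the raw csv first), to_date with an explicit pattern is usually
# safer than a bare cast to DateType:
#
# from pyspark.sql.functions import to_date
# df = df.withColumn("12 months ending", to_date(df["12 months ending"], "dd/MM/yyyy"))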
34 | print(df.printSchema()) 35 | 36 | 37 | # Create a temporary view of the dataframe, it is like a hive table in Spark SQL 38 | df.createOrReplaceTempView("newtable") 39 | spark.sql("SELECT * FROM newtable WHERE Count > 1000").limit(5).toPandas() 40 | spark.sql("SELECT sum(Count) as total FROM newtable where Count between 1000 and 2000").show(5) 41 | spark.sql("SELECT Region, sum(Count) as total FROM newtable GROUP BY Region").show(5) 42 | 43 | -------------------------------------------------------------------------------- /big_data/partition_parquet_file.py: -------------------------------------------------------------------------------- 1 | ## 2 | from pyspark.sql import SparkSession 3 | 4 | 5 | spark = SparkSession.builder.appName("session").master("local").getOrCreate() 6 | 7 | path = "Datasets/" 8 | data = spark.read.csv(path+"pga_tour_historical.csv",inferSchema=True, header=True) 9 | data.show(10) 10 | data.limit(10).toPandas() 11 | data.count() 12 | data.printSchema() 13 | data.describe 14 | # Generate summary statistics for TWO variables 15 | 16 | data.select('Season', 'Value').summary("count","min",'max').show() 17 | 18 | # Write a partioned parquet file 19 | ## Now try writing a parquet file (not partitioned) from the pga dataset. But first create a new dataframe containing 20 | # ONLY the the "Season" and "Value" fields (using the "select command you used in the question above) and write a parquet file 21 | # partitioned by "Season". This is a bit of a challenge aimed at getting you ready for material that will be covered later on 22 | # in the course. Don't feel bad if you can't figure it out. 23 | df = data.select('Season','Value') 24 | # it will create a directory named partition_parquet 25 | df.write.mode("overwrite").parquet("partition_parquet/", partitionBy='Season') 26 | # then partition parquet-data in that directory 27 | #df.write.mode("overwrite").partitionBy("Season").parquet("partition_parquet/") 28 | df.show(20) 29 | 30 | # Now try reading in the partitioned parquet file you just created above. 31 | path_prq = 'partition_parquet/' 32 | parquet = spark.read.parquet(path_prq) 33 | parquet.show(20) 34 | df.printSchema() 35 | 36 | # Reading in a set of paritioned parquet files 37 | # # Now try only reading Seasons 2010, 2011 and 2012. 
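
# An equivalent approach (sketch): read the whole partitioned directory and filter on the
# partition column -- Spark prunes the Season directories that aren't needed. The explicit
# per-directory reads below work as well.
#
# pruned = spark.read.parquet(path_prq).filter("Season IN (2010, 2011, 2012)")
# pruned.show(10)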
38 | 39 | partitioned = spark.read.parquet(path_prq+'Season=2010/',path_prq+'Season=2011/', 40 | path_prq+'Season=2012/') 41 | 42 | partitioned.show(10) 43 | # we need to use a method to get both season and value 44 | 45 | partitioned = spark.read.option("basePath", path_prq).parquet(path_prq+'Season=2010/',path_prq+'Season=2011/', 46 | path_prq+'Season=2012/') 47 | 48 | partitioned.show(10) 49 | -------------------------------------------------------------------------------- /big_data/readme.md: -------------------------------------------------------------------------------- 1 | Documentation [https://spark.apache.org/docs/latest/sql-programming-guide.html] 2 | -------------------------------------------------------------------------------- /big_data/search_filter_dataframe.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | # May take awhile locally 4 | spark = SparkSession.builder.appName("FunctionsHW").getOrCreate() 5 | 6 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size() 7 | print("You are working with", cores, "core(s)") 8 | spark 9 | 10 | fifa = spark.read.csv('Datasets/fifa19.csv',inferSchema=True,header=True) 11 | 12 | print(fifa.printSchema()) 13 | 14 | from pyspark.sql.functions import * 15 | fifa.select(['Name','Position','Release Clause']).show(5,False) 16 | # Display the same results from above sorted by the players names 17 | fifa.select(['Name','Position']).orderBy('Name').show(5) 18 | fifa.select(['Name','Position','Age']).orderBy(fifa['Age'].desc()).show(5) 19 | 20 | # Select only the players who belong to a club begining with FC 21 | # One way 22 | fifa.select("Name","Club").where(fifa.Club.like("FC%")).show(5, False) 23 | 24 | # Another way 25 | fifa.select("Name","Club").where(fifa.Club.startswith("FC")).limit(4).toPandas() 26 | 27 | ## ====================================================== 28 | # to create a new dataframe 29 | df = fifa.limit(100) 30 | df.count() 31 | 32 | # if we slice the colomns 33 | df2_col = fifa.columns[0:5] 34 | df2 = fifa.select(df2_col) 35 | df2.count() 36 | df2.show(5,False) 37 | # count the colomn 38 | len(df2.columns) 39 | # ======================================================== 40 | # Filtering data with condition 41 | # ======================================================== 42 | 43 | fifa.filter("Age>40").select(['Name','Age']).limit(4).toPandas() 44 | -------------------------------------------------------------------------------- /big_data/split-column.py: -------------------------------------------------------------------------------- 1 | # PySpark split() Column into Multiple Columns 2 | from sqlite3 import Timestamp 3 | 4 | import pyspark 5 | from pyspark.sql import SparkSession 6 | # from pyspark.sql.functions import split 7 | from pyspark.sql.functions import * 8 | from pyspark.sql.types import StructType,StructField, StringType, IntegerType 9 | spark=SparkSession.builder.appName("sparkbyexamples").getOrCreate() 10 | ## 11 | # 12 | # create a data which is a list 13 | data = [('James','','Smith','1991-04-01'), 14 | ('Michael','Rose','','2000-05-19'), 15 | ('Robert','','Williams','1978-09-05'), 16 | ('Maria','Anne','Jones','1967-12-01'), 17 | ('Jen','Mary','Brown','1980-02-17') 18 | ] 19 | 20 | df0 = spark.createDataFrame(data, ["name", "midname", "surname", "dob"]) 21 | 22 | df0.printSchema() 23 | df0.show(truncate=False) 24 | """ 25 | +-------+-------+--------+----------+ 26 | |name |midname|surname |dob | 27 | 
+-------+-------+--------+----------+ 28 | |James | |Smith |1991-04-01| 29 | |Michael|Rose | |2000-05-19| 30 | |Robert | |Williams|1978-09-05| 31 | |Maria |Anne |Jones |1967-12-01| 32 | |Jen |Mary |Brown |1980-02-17| 33 | +-------+-------+--------+----------+ 34 | """ 35 | #---------------------------- 36 | # Below example creates a new Dataframe with Columns year, month, and the day after performing a split() 37 | # function on dob Column of string type. 38 | 39 | df1 = df0.withColumn('year', split(df0['dob'], '-').getItem(0)) \ 40 | .withColumn('month', split(df0['dob'], '-').getItem(1)) \ 41 | .withColumn('day', split(df0['dob'], '-').getItem(2)) 42 | df1.show(truncate=False) 43 | #--------- 44 | split_col = pyspark.sql.functions.split(df0['dob'], '-') 45 | df2 = df0.withColumn('year', split_col.getItem(0)) \ 46 | .withColumn('month', split_col.getItem(1)) \ 47 | .withColumn('day', split_col.getItem(2)) 48 | df2.show(truncate=False) 49 | 50 | """ 51 | +-------+-------+--------+----------+----+-----+---+ 52 | |name |midname|surname |dob |year|month|day| 53 | +-------+-------+--------+----------+----+-----+---+ 54 | |James | |Smith |1991-04-01|1991|04 |01 | 55 | |Michael|Rose | |2000-05-19|2000|05 |19 | 56 | |Robert | |Williams|1978-09-05|1978|09 |05 | 57 | |Maria |Anne |Jones |1967-12-01|1967|12 |01 | 58 | |Jen |Mary |Brown |1980-02-17|1980|02 |17 | 59 | +-------+-------+--------+----------+----+-----+---+ 60 | """ 61 | 62 | # Using split() function of Column class 63 | split_col = pyspark.sql.functions.split(df0['dob'], '-') 64 | df3 = df0.select("firstname","middlename","lastname","dob", split_col.getItem(0).alias('year'),split_col.getItem(1).alias('month'),split_col.getItem(2).alias('day')) 65 | df3.show(truncate=False) 66 | 67 | 68 | 69 | 70 | 71 | 72 | 73 | 74 | 75 | -------------------------------------------------------------------------------- /books/LearningSpark2.0.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/books/LearningSpark2.0.pdf -------------------------------------------------------------------------------- /books/pyspark.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/books/pyspark.pdf -------------------------------------------------------------------------------- /books/spark-hadoop.pdf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/books/spark-hadoop.pdf -------------------------------------------------------------------------------- /data/ContainsNull.csv: -------------------------------------------------------------------------------- 1 | Id,Name,Sales 2 | emp1,John, 3 | emp2,, 4 | emp3,,345.0 5 | emp4,Cindy,456.0 6 | -------------------------------------------------------------------------------- /data/cruise_ship_info.csv: -------------------------------------------------------------------------------- 1 | Ship_name,Cruise_line,Age,Tonnage,passengers,length,cabins,passenger_density,crew 2 | Journey,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55 3 | Quest,Azamara,6,30.276999999999997,6.94,5.94,3.55,42.64,3.55 4 | Celebration,Carnival,26,47.262,14.86,7.22,7.43,31.8,6.7 5 | Conquest,Carnival,11,110.0,29.74,9.53,14.88,36.99,19.1 6 | 
Destiny,Carnival,17,101.353,26.42,8.92,13.21,38.36,10.0 7 | Ecstasy,Carnival,22,70.367,20.52,8.55,10.2,34.29,9.2 8 | Elation,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2 9 | Fantasy,Carnival,23,70.367,20.56,8.55,10.22,34.23,9.2 10 | Fascination,Carnival,19,70.367,20.52,8.55,10.2,34.29,9.2 11 | Freedom,Carnival,6,110.23899999999999,37.0,9.51,14.87,29.79,11.5 12 | Glory,Carnival,10,110.0,29.74,9.51,14.87,36.99,11.6 13 | Holiday,Carnival,28,46.052,14.52,7.27,7.26,31.72,6.6 14 | Imagination,Carnival,18,70.367,20.52,8.55,10.2,34.29,9.2 15 | Inspiration,Carnival,17,70.367,20.52,8.55,10.2,34.29,9.2 16 | Legend,Carnival,11,86.0,21.24,9.63,10.62,40.49,9.3 17 | Liberty*,Carnival,8,110.0,29.74,9.51,14.87,36.99,11.6 18 | Miracle,Carnival,9,88.5,21.24,9.63,10.62,41.67,10.3 19 | Paradise,Carnival,15,70.367,20.52,8.55,10.2,34.29,9.2 20 | Pride,Carnival,12,88.5,21.24,9.63,11.62,41.67,9.3 21 | Sensation,Carnival,20,70.367,20.52,8.55,10.2,34.29,9.2 22 | Spirit,Carnival,12,88.5,21.24,9.63,10.56,41.67,10.29 23 | Triumph,Carnival,14,101.509,27.58,8.93,13.21,36.81,10.0 24 | Valor,Carnival,9,110.0,29.74,9.52,14.87,36.99,11.6 25 | Victory,Carnival,13,101.509,27.58,8.93,13.79,36.81,11.5 26 | Century,Celebrity,18,70.60600000000001,17.7,8.15,8.75,39.89,8.58 27 | Constellation,Celebrity,11,91.0,20.32,9.65,9.75,44.78,9.99 28 | Galaxy,Celebrity,17,77.71300000000001,18.9,8.66,9.35,41.12,9.09 29 | Infinity,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99 30 | Mercury,Celebrity,16,77.71300000000001,18.82,8.66,9.35,41.29,9.09 31 | Millenium,Celebrity,13,91.0,20.32,9.65,9.75,44.78,9.99 32 | Solstice,Celebrity,5,122.0,28.5,10.33,6.87,34.57,6.7 33 | Summit,Celebrity,12,91.0,20.32,9.65,9.75,44.78,9.99 34 | Xpedition,Celebrity,12,2.329,0.94,2.96,0.45,24.78,0.6 35 | Zenith,Celebrity,21,47.225,13.66,6.82,6.87,34.57,6.7 36 | Allegra,Costa,21,28.43,8.08,6.16,4.1,35.19,4.0 37 | Atlantica,Costa,13,85.619,21.14,9.57,10.56,40.5,9.2 38 | Classica,Costa,22,52.926,13.02,7.18,6.54,40.65,6.17 39 | Europa,Costa,27,53.872,14.94,7.98,7.67,36.06,6.36 40 | Fortuna,Costa,10,105.0,27.2,8.9,13.56,38.6,10.68 41 | Magica,Costa,9,105.0,27.2,8.9,13.56,38.6,10.68 42 | Marina,Costa,23,25.0,7.76,6.22,3.86,32.22,3.85 43 | Mediterranea,Costa,10,86.0,21.14,9.6,10.56,40.68,9.2 44 | Romantica,Costa,20,53.049,13.44,7.22,6.78,39.47,6.0 45 | Serena,Costa,6,112.0,38.0,9.51,15.0,29.47,10.9 46 | Victoria,Costa,17,75.166,19.28,8.28,9.64,38.99,7.66 47 | Serenity,Crystal,10,68.0,10.8,7.9,5.5,62.96,6.36 48 | Symphony,Crystal,18,51.004,9.4,7.81,4.8,54.26,5.45 49 | QueenElizabethII,Cunard,44,70.327,17.91,9.63,9.5,39.27,9.21 50 | QueenMary2,Cunard,10,151.4,26.2,11.32,11.34,57.79,12.53 51 | QueenVictoria,Cunard,6,90.0,20.0,9.64,10.29,45.0,9.0 52 | Magic,Disney,15,83.338,17.5,9.64,8.75,47.62,9.45 53 | Wonder,Disney,14,83.0,17.5,9.64,8.75,47.43,9.45 54 | Amsterdam,Holland_American,13,61.0,13.8,7.8,6.88,44.2,6.0 55 | Eurodam,Holland_American,5,86.0,21.04,9.36,10.22,40.87,8.0 56 | Maasdam,Holland_American,20,55.451,12.64,7.19,6.32,43.87,5.57 57 | Noordam,Holland_American,29,33.92,12.14,7.04,6.07,27.94,5.3 58 | Oosterdam,Holland_American,10,81.76899999999999,18.48,9.59,9.24,44.25,8.42 59 | Prinsendam,Holland_American,25,38.0,7.49,6.74,3.96,50.73,4.6 60 | Rotterdam,Holland_American,16,59.652,13.2,7.77,6.6,45.19,6.44 61 | Ryndam,Holland_American,19,55.451,12.66,7.19,6.33,43.8,5.88 62 | Statendam,Holland_American,20,55.451,12.66,7.19,6.33,43.8,5.88 63 | Veendam,Holland_American,17,55.451,12.66,7.19,6.33,43.8,5.88 64 | Volendam,Holland_American,14,63.0,14.4,7.77,7.2,43.75,5.61 65 | 
Westerdam,Holland_American,27,53.872,14.94,7.98,7.47,36.06,6.12 66 | Zaandam,Holland_American,13,63.0,14.4,7.77,7.2,43.75,5.31 67 | Zuiderdam,Holland_American,11,85.0,18.48,9.51,9.24,46.0,8.0 68 | Armonia,MSC,12,58.6,15.66,8.24,7.83,37.42,7.0 69 | Fantasia,MSC,5,133.5,39.59,10.93,16.37,33.72,13.13 70 | Lirica,MSC,10,58.825,15.6,8.23,7.65,37.71,7.0 71 | Melody,MSC,31,35.143,12.5,6.69,5.32,28.11,5.35 72 | Musica,MSC,7,89.6,25.5,9.61,12.75,35.14,9.87 73 | Opera,MSC,9,59.058,17.0,7.63,8.5,34.74,7.4 74 | Rhapsody,MSC,36,16.852,9.52,5.41,3.83,17.7,2.97 75 | Sinfonia,MSC,11,58.6,15.66,8.23,7.83,37.42,7.6 76 | Crown,Norwegian,25,34.25,10.52,6.15,5.26,32.56,4.7 77 | Dawn,Norwegian,11,90.0,22.4,9.65,11.2,40.18,11.0 78 | Dream,Norwegian,21,50.76,17.48,7.54,8.74,29.04,6.14 79 | Gem,Norwegian,6,93.0,23.94,9.65,11.97,38.85,11.09 80 | Jewel,Norwegian,8,91.0,22.44,9.65,11.22,40.55,11.0 81 | Majesty,Norwegian,21,38.0,10.56,5.67,5.28,35.98,4.38 82 | PrideofAloha,Norwegian,14,77.104,20.02,8.53,10.01,38.51,8.0 83 | PrideofAmerica,Norwegian,9,81.0,21.44,9.21,10.72,37.78,10.0 84 | Sea,Norwegian,25,42.0,15.04,7.08,7.52,27.93,6.3 85 | Spirit,Norwegian,15,75.33800000000001,19.56,8.79,9.83,38.52,13.0 86 | Star,Norwegian,40,28.0,11.5,6.74,4.0,24.35,3.8 87 | Sun,Norwegian,12,77.104,20.02,8.53,10.01,38.51,9.59 88 | Wind,Norwegian,20,50.76,17.48,7.54,8.74,29.04,6.14 89 | Insignia,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0 90 | Nautica,Oceania,13,30.276999999999997,6.84,5.94,3.42,44.26,4.0 91 | Regatta,Oceania,15,30.276999999999997,6.84,5.94,3.42,44.26,4.0 92 | MarcoPolo,Orient,48,22.08,8.26,5.78,4.25,26.73,3.5 93 | Arcadia,P&O,9,85.0,19.68,9.35,9.84,43.19,8.69 94 | Artemis,P&O,29,45.0,11.78,7.54,5.3,38.2,5.2 95 | Aurora,P&O,13,76.0,18.74,8.86,9.39,40.55,8.5 96 | Oceana,P&O,10,77.0,20.16,8.56,9.75,38.19,9.0 97 | Oriana,P&O,18,69.153,18.82,8.53,9.14,36.74,7.94 98 | Ventura,P&O,5,115.0,35.74,9.0,15.32,32.18,12.2 99 | Caribbean,Princess,9,116.0,26.0,9.51,13.0,44.62,11.0 100 | Coral,Princess,11,91.62700000000001,19.74,9.64,9.87,46.42,9.0 101 | Crown,Princess,7,116.0,31.0,9.51,15.57,37.42,12.0 102 | Dawn,Princess,16,77.499,19.5,8.56,10.5,39.74,9.0 103 | Diamond,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38 104 | Emerald,Princess,6,113.0,37.82,9.51,15.57,29.88,12.0 105 | Golden,Princess,12,108.865,27.58,9.51,13.0,39.47,11.0 106 | Grand,Princess,15,108.806,26.0,9.51,13.0,41.85,11.1 107 | Island,Princess,10,91.62700000000001,19.74,9.64,9.87,46.42,9.0 108 | Pacific,Princess,14,30.276999999999997,6.86,5.93,3.44,44.14,3.73 109 | Regal,Princess,22,69.845,15.9,8.03,7.95,43.93,6.96 110 | Royal,Princess,29,44.348,12.0,7.54,6.0,36.96,5.2 111 | Saphire,Princess,9,113.0,26.74,9.51,13.37,42.26,12.38 112 | Sea,Princess,8,77.499,19.5,8.56,9.75,39.74,9.0 113 | Star,Princess,11,108.977,26.02,9.51,13.01,41.88,12.0 114 | Sun,Princess,18,77.499,19.5,8.56,9.75,39.74,9.0 115 | Tahitian,Princess,14,30.276999999999997,6.88,5.93,3.44,44.01,3.73 116 | ExplorerII,Regent_Seven_Seas,27,12.5,3.94,4.36,0.88,31.73,1.46 117 | Mariner,Regent_Seven_Seas,12,50.0,7.0,7.09,3.54,71.43,4.45 118 | Navigator,Regent_Seven_Seas,14,33.0,4.9,5.6,2.45,67.35,3.24 119 | PaulGauguin,Regent_Seven_Seas,16,19.2,3.2,5.13,1.6,60.0,2.11 120 | Voyager,Regent_Seven_Seas,10,46.0,7.0,6.7,1.82,65.71,4.47 121 | Adventure,Royal_Caribbean,12,138.0,31.14,10.2,15.57,44.32,11.85 122 | Brilliance,Royal_Caribbean,11,90.09,25.01,9.62,10.5,36.02,8.48 123 | Empress,Royal_Caribbean,23,48.563,20.2,6.92,8.0,24.04,6.71 124 | Enchantment,Royal_Caribbean,16,74.137,19.5,9.16,9.75,38.02,7.6 
125 | Explorer,Royal_Caribbean,13,138.0,31.14,10.2,15.57,44.32,11.76 126 | Freedom,Royal_Caribbean,7,158.0,43.7,11.12,18.0,36.16,13.6 127 | Grandeur,Royal_Caribbean,17,74.137,19.5,9.16,9.75,38.02,7.6 128 | Independence,Royal_Caribbean,5,160.0,36.34,11.12,18.17,44.03,13.6 129 | Jewel,Royal_Caribbean,9,90.09,25.01,9.62,10.94,36.02,8.69 130 | Legend,Royal_Caribbean,18,70.0,18.0,8.67,9.0,38.89,7.2 131 | Liberty,Royal_Caribbean,6,158.0,43.7,11.25,18.0,36.16,13.6 132 | Majesty,Royal_Caribbean,21,73.941,27.44,8.8,11.75,26.95,8.22 133 | Mariner,Royal_Caribbean,10,138.0,31.14,10.2,15.57,44.32,11.85 134 | Monarch,Royal_Caribbean,22,73.941,27.44,8.8,11.77,30.94,8.22 135 | Navigator,Royal_Caribbean,11,138.0,31.14,10.2,15.57,44.32,11.85 136 | Oasis,Royal_Caribbean,4,220.0,54.0,11.82,27.0,40.74,21.0 137 | Radiance,Royal_Caribbean,12,90.09,25.01,9.62,10.5,36.02,8.68 138 | Rhapsody,Royal_Caribbean,16,78.491,24.35,9.15,10.0,32.23,7.65 139 | Serenade,Royal_Caribbean,10,90.09,25.01,9.62,10.5,36.02,8.58 140 | Sovreign,Royal_Caribbean,25,73.192,28.52,8.8,11.38,25.66,8.08 141 | Splendour,Royal_Caribbean,17,70.0,20.76,8.67,9.02,33.72,7.2 142 | Vision,Royal_Caribbean,15,78.491,24.35,9.15,10.0,32.23,6.6 143 | Voyager,Royal_Caribbean,14,138.0,31.14,10.2,15.57,44.32,11.76 144 | Legend,Seabourn,21,10.0,2.08,4.4,1.04,48.08,1.6 145 | Pride,Seabourn,27,10.0,2.08,4.4,1.04,48.08,1.6 146 | Spirit,Seabourn,24,10.0,2.08,4.4,1.04,48.08,1.6 147 | Cloud,Silversea,19,16.8,2.96,5.14,1.48,56.76,2.1 148 | Shadow,Silversea,13,25.0,3.82,5.97,1.94,65.45,2.95 149 | Whisper,Silversea,12,25.0,3.88,5.97,1.94,64.43,2.87 150 | Wind,Silversea,19,16.8,2.96,5.14,1.48,56.76,1.97 151 | Aries,Star,22,3.341,0.66,2.8,0.33,50.62,0.59 152 | Gemini,Star,21,19.093,8.0,5.37,4.0,23.87,4.7 153 | Libra,Star,12,42.0,14.8,7.13,7.4,28.38,6.8 154 | Pisces,Star,24,40.053000000000004,12.87,5.79,7.76,31.12,7.5 155 | Taurus,Star,22,3.341,0.66,2.79,0.33,50.62,0.59 156 | Virgo,Star,14,76.8,19.6,8.79,9.67,39.18,12.0 157 | Spirit,Windstar,25,5.35,1.58,4.4,0.74,33.86,0.88 158 | Star,Windstar,27,5.35,1.67,4.4,0.74,32.04,0.88 159 | Surf,Windstar,23,14.745,3.08,6.17,1.56,47.87,1.8 160 | -------------------------------------------------------------------------------- /data/sales_info.csv: -------------------------------------------------------------------------------- 1 | Company,Person,Sales 2 | GOOG,Sam,200 3 | GOOG,Charlie,120 4 | GOOG,Frank,340 5 | MSFT,Tina,600 6 | MSFT,Amy,124 7 | MSFT,Vanessa,243 8 | FB,Carl,870 9 | FB,Sarah,350 10 | APPL,John,250 11 | APPL,Linda, 130 12 | APPL,Mike, 750 13 | APPL, Chris, 350 -------------------------------------------------------------------------------- /data/users1.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/data/users1.parquet -------------------------------------------------------------------------------- /data/users2.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/data/users2.parquet -------------------------------------------------------------------------------- /data/users3.parquet: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/Foroozani/BigData_PySpark/026f2b13f0d2eba80df186991589774ee72e5bc4/data/users3.parquet 
-------------------------------------------------------------------------------- /scripts/aggrigating-data-in-DataFrame.py: -------------------------------------------------------------------------------- 1 | """ 2 | - Group by 3 | - Pivot 4 | - Aggregate method 5 | - Combos of each 6 | """ 7 | #from pyspark.sql.functions import mean 8 | 9 | import pyspark 10 | from pyspark.sql import SparkSession 11 | 12 | spark = SparkSession.builder.appName('aggrigation').getOrCreate() 13 | spark 14 | df0 = spark.read.csv("Datasets/nyc_air_bnb.csv", header=True, inferSchema=True) 15 | df0.printSchema() 16 | df0.show(4) 17 | 18 | from pyspark.sql.types import * 19 | from pyspark.sql.functions import * 20 | 21 | df = df0.withColumn("price", df0.price.cast(IntegerType())) 22 | df.printSchema() 23 | 24 | df = df.withColumn("minimum_nights", df.minimum_nights.cast(IntegerType())) \ 25 | .withColumn("number_of_reviews", df.number_of_reviews.cast(IntegerType())) \ 26 | .withColumn("reviews_per_month", df.reviews_per_month.cast(IntegerType())) \ 27 | .withColumn("calculated_host_listings_count", df.calculated_host_listings_count.cast(IntegerType())) \ 28 | .withColumn("last_review", df.last_review.cast('date')) 29 | 30 | df.printSchema() 31 | df.show(4, False) 32 | 33 | # GROUP BY 34 | df.groupBy("neighbourhood_group").min().show(5) 35 | df.summary().show(5) 36 | df.summary("min","max","count").show(5) 37 | df.select('price', 'minimum_nights').summary('min','max','count', 'mean').show(5) 38 | ## 39 | df.select(countDistinct("neighbourhood_group"), mean('price'), max('price')).show(5) 40 | 41 | df.groupBy('room_type').pivot("neighbourhood_group", ["Queens", "Brooklyn"]). count().show(5) 42 | -------------------------------------------------------------------------------- /scripts/join-append-DataFrame.py: -------------------------------------------------------------------------------- 1 | """ 2 | - Appending Table 3 | - Joining Tables 4 | """ 5 | 6 | import pyspark 7 | from pyspark.sql import SparkSession 8 | 9 | spark = SparkSession.builder.appName('jointables').getOrCreate() 10 | spark 11 | ## 12 | valuesP = [('koala',1,'yes'),('caterpillar',2,'yes'),('deer',3,'yes'),('human',4,'yes')] 13 | plants = spark.createDataFrame(valuesP,['name','id','eats_plants']) 14 | 15 | valuesM = [('shark',5,'yes'),('lion',6,'yes'),('tiger',7,'yes'),('human',4,'yes')] 16 | meat = spark.createDataFrame(valuesM,['name','id','eats_meat']) 17 | ## 18 | print("Plant eaters (herbivores)") 19 | print(plants.show()) 20 | print("Meat eaters (carnivores)") 21 | print(meat.show()) 22 | 23 | # --------------- 24 | 25 | innerjoinDF = plants.join(meat, on = ['name', 'id'], how='inner') 26 | innerjoinDF.show() 27 | 28 | leftjoinDF = plants.join(meat, on = 'name', how='left') 29 | leftjoinDF.show() 30 | 31 | rightjoinDF = plants.join(meat, on = 'name', how='right') 32 | rightjoinDF.show() 33 | 34 | # to exclude a value from a join table 35 | rightjoinDF = plants.join(meat, on = 'name', how='right').filter(plants.name.isNotNull()) 36 | rightjoinDF.show() 37 | 38 | # FULL outer join 39 | fulljoinDF = plants.join(meat, on = 'name', how='full') 40 | fulljoinDF.show() 41 | 42 | ## 43 | import os 44 | """ 45 | # - **course_offerings:** uuid, course_uuid, term_code, name 46 | # - **instructors:** id, name 47 | # - **sections:** uuid, course_offering_uuid,room_uuid, schedule_uuid 48 | # - **teachings:** instructor_id, section_uuid 49 | # 50 | # **Source:** https://www.kaggle.com/Madgrades/uw-madison-course 51 | """ 52 | path = 
"Datasets/uw-madison-courses/" 53 | 54 | df_list = [] 55 | for filename in os.listdir(path): 56 | if filename.endswith(".csv"): 57 | filename_list = filename.split(".") # separate path from .csv 58 | df_name = filename_list[0] 59 | df = spark.read.csv(path + filename, inferSchema=True, header=True) 60 | df.name = df_name 61 | df_list.append(df_name) 62 | exec(df_name + ' = df') 63 | ## 64 | # 65 | print("Full list of dfs:") 66 | print(df_list) 67 | 68 | rooms.show() 69 | sections.show(4) 70 | 71 | step1 = teachings.join(instructors, teachings.instructor_id == instructors.id, how='left').select(['instructor_id','name','section_uuid']) 72 | step1.limit(4).show(5) 73 | 74 | step2 = step1.join(sections, step1.section_uuid == sections.uuid, how='left').select(['name','course_offering_uuid']) 75 | step2.limit(4).show() 76 | 77 | step3 = step2.withColumnRenamed('name', 'instructor').join(course_offerings, step2.course_offering_uuid == course_offerings.uuid, how='inner').select(['instructor','name','course_offering_uuid']) 78 | step3.show(4) -------------------------------------------------------------------------------- /scripts/join_tabales.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "id": "b039fa64", 7 | "metadata": {}, 8 | "outputs": [], 9 | "source": [ 10 | "from pyspark import SparkContext" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 4, 16 | "id": "379fcd52", 17 | "metadata": {}, 18 | "outputs": [ 19 | { 20 | "data": { 21 | "text/html": [ 22 | "\n", 23 | "
\n", 24 | "

SparkSession - in-memory

\n", 25 | " \n", 26 | "
\n", 27 | "

SparkContext

\n", 28 | "\n", 29 | "

Spark UI

\n", 30 | "\n", 31 | "
\n", 32 | "
Version
\n", 33 | "
v3.0.3
\n", 34 | "
Master
\n", 35 | "
local[*]
\n", 36 | "
AppName
\n", 37 | "
jointables
\n", 38 | "
\n", 39 | "
\n", 40 | " \n", 41 | "
\n", 42 | " " 43 | ], 44 | "text/plain": [ 45 | "" 46 | ] 47 | }, 48 | "execution_count": 4, 49 | "metadata": {}, 50 | "output_type": "execute_result" 51 | } 52 | ], 53 | "source": [ 54 | "import pyspark\n", 55 | "from pyspark.sql import SparkSession\n", 56 | "\n", 57 | "spark = SparkSession.builder.appName('jointables').getOrCreate()\n", 58 | "spark" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 5, 64 | "id": "4abf5bb7", 65 | "metadata": {}, 66 | "outputs": [ 67 | { 68 | "name": "stdout", 69 | "output_type": "stream", 70 | "text": [ 71 | "root\n", 72 | " |-- emp_id: long (nullable = true)\n", 73 | " |-- name: string (nullable = true)\n", 74 | " |-- superior_emp_id: long (nullable = true)\n", 75 | " |-- year_joined: string (nullable = true)\n", 76 | " |-- emp_dept_id: string (nullable = true)\n", 77 | " |-- gender: string (nullable = true)\n", 78 | " |-- salary: long (nullable = true)\n", 79 | "\n", 80 | "+------+--------+---------------+-----------+-----------+------+------+\n", 81 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|\n", 82 | "+------+--------+---------------+-----------+-----------+------+------+\n", 83 | "|1 |Smith |-1 |2018 |10 |M |3000 |\n", 84 | "|2 |Rose |1 |2010 |20 |M |4000 |\n", 85 | "|3 |Williams|1 |2010 |10 |M |1000 |\n", 86 | "|4 |Jones |2 |2005 |10 |F |2000 |\n", 87 | "|5 |Brown |2 |2010 |40 | |-1 |\n", 88 | "|6 |Brown |2 |2010 |50 | |-1 |\n", 89 | "+------+--------+---------------+-----------+-----------+------+------+\n", 90 | "\n" 91 | ] 92 | } 93 | ], 94 | "source": [ 95 | "emp = [(1,\"Smith\",-1,\"2018\",\"10\",\"M\",3000), \\\n", 96 | " (2,\"Rose\",1,\"2010\",\"20\",\"M\",4000), \\\n", 97 | " (3,\"Williams\",1,\"2010\",\"10\",\"M\",1000), \\\n", 98 | " (4,\"Jones\",2,\"2005\",\"10\",\"F\",2000), \\\n", 99 | " (5,\"Brown\",2,\"2010\",\"40\",\"\",-1), \\\n", 100 | " (6,\"Brown\",2,\"2010\",\"50\",\"\",-1) \\\n", 101 | " ]\n", 102 | "empColumns = [\"emp_id\",\"name\",\"superior_emp_id\",\"year_joined\", \\\n", 103 | " \"emp_dept_id\",\"gender\",\"salary\"]\n", 104 | "\n", 105 | "empDF = spark.createDataFrame(data=emp, schema = empColumns)\n", 106 | "empDF.printSchema()\n", 107 | "empDF.show(truncate=False)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "code", 112 | "execution_count": 6, 113 | "id": "59f0d2ec", 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [ 117 | "dept = [(\"Finance\",10, \"Bill\"), \\\n", 118 | " (\"Marketing\",20, \"Joe\"), \\\n", 119 | " (\"Sales\",30, \"Smith\"), \\\n", 120 | " (\"IT\",40, \"Brown\") \\\n", 121 | " ]" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "id": "9ed307c0", 128 | "metadata": {}, 129 | "outputs": [ 130 | { 131 | "name": "stdout", 132 | "output_type": "stream", 133 | "text": [ 134 | "root\n", 135 | " |-- dept_name: string (nullable = true)\n", 136 | " |-- dept_id: long (nullable = true)\n", 137 | " |-- emp_name: string (nullable = true)\n", 138 | "\n", 139 | "+---------+-------+--------+\n", 140 | "|dept_name|dept_id|emp_name|\n", 141 | "+---------+-------+--------+\n", 142 | "|Finance |10 |Bill |\n", 143 | "|Marketing|20 |Joe |\n", 144 | "|Sales |30 |Smith |\n", 145 | "|IT |40 |Brown |\n", 146 | "+---------+-------+--------+\n", 147 | "\n" 148 | ] 149 | } 150 | ], 151 | "source": [ 152 | "deptColumns = [\"dept_name\",\"dept_id\",\"emp_name\"]\n", 153 | "deptDF = spark.createDataFrame(data=dept, schema = deptColumns)\n", 154 | "deptDF.printSchema()\n", 155 | "deptDF.show(truncate=False)" 156 | ] 157 | }, 158 | { 159 | 
"cell_type": "code", 160 | "execution_count": 8, 161 | "id": "67e9358b", 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 169 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 170 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 171 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 |Bill |\n", 172 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 173 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 174 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 175 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 176 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 177 | "\n" 178 | ] 179 | } 180 | ], 181 | "source": [ 182 | "empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,\"inner\").show(truncate=False)\n" 183 | ] 184 | }, 185 | { 186 | "cell_type": "code", 187 | "execution_count": 9, 188 | "id": "7c98d2fe", 189 | "metadata": {}, 190 | "outputs": [ 191 | { 192 | "name": "stdout", 193 | "output_type": "stream", 194 | "text": [ 195 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 196 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 197 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 198 | "|6 |Brown |2 |2010 |50 | |-1 |null |null |null |\n", 199 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 |Bill |\n", 200 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 201 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 202 | "|null |null |null |null |null |null |null |Sales |30 |Smith |\n", 203 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 204 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 205 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 206 | "\n", 207 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 208 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 209 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 210 | "|6 |Brown |2 |2010 |50 | |-1 |null |null |null |\n", 211 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 |Bill |\n", 212 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 213 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 214 | "|null |null |null |null |null |null |null |Sales |30 |Smith |\n", 215 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 216 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 217 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 218 | "\n", 219 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 220 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 221 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 222 | "|6 |Brown |2 |2010 |50 | |-1 |null |null |null |\n", 223 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 
|Bill |\n", 224 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 225 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 226 | "|null |null |null |null |null |null |null |Sales |30 |Smith |\n", 227 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 228 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 229 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 230 | "\n" 231 | ] 232 | } 233 | ], 234 | "source": [ 235 | "# Full Outer Join\n", 236 | "\n", 237 | "empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,\"outer\").show(truncate=False)\n", 238 | "empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,\"full\").show(truncate=False)\n", 239 | "empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,\"fullouter\").show(truncate=False)\n" 240 | ] 241 | }, 242 | { 243 | "cell_type": "code", 244 | "execution_count": 10, 245 | "id": "58fb5363", 246 | "metadata": {}, 247 | "outputs": [ 248 | { 249 | "name": "stdout", 250 | "output_type": "stream", 251 | "text": [ 252 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 253 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 254 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 255 | "|6 |Brown |2 |2010 |50 | |-1 |null |null |null |\n", 256 | "|5 |Brown |2 |2010 |40 | |-1 |null |null |null |\n", 257 | "|1 |Smith |-1 |2018 |10 |M |3000 |null |null |null |\n", 258 | "|3 |Williams|1 |2010 |10 |M |1000 |null |null |null |\n", 259 | "|2 |Rose |1 |2010 |20 |M |4000 |null |null |null |\n", 260 | "|4 |Jones |2 |2005 |10 |F |2000 |null |null |null |\n", 261 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 262 | "\n", 263 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 264 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 265 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 266 | "|6 |Brown |2 |2010 |50 | |-1 |null |null |null |\n", 267 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 |Bill |\n", 268 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 269 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 270 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 271 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 272 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 273 | "\n" 274 | ] 275 | } 276 | ], 277 | "source": [ 278 | "# Left Outer Join\n", 279 | "# vaghti column moshtarak nadashte bashan (emp_id, dept_id)\n", 280 | "empDF.join(deptDF, empDF.emp_id == deptDF.dept_id, \"left\") \\\n", 281 | " .show(truncate = False)\n", 282 | "\n", 283 | "# rooye column moshtarak \n", 284 | "empDF.join(deptDF, empDF.emp_dept_id == deptDF.dept_id, \"left\") \\\n", 285 | " .show(truncate = False)\n" 286 | ] 287 | }, 288 | { 289 | "cell_type": "code", 290 | "execution_count": 11, 291 | "id": "ff31efe2", 292 | "metadata": {}, 293 | "outputs": [ 294 | { 295 | "name": "stdout", 296 | "output_type": "stream", 297 | "text": [ 298 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 299 | "|emp_id|name 
|superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 300 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 301 | "|4 |Jones |2 |2005 |10 |F |2000 |null |null |null |\n", 302 | "|2 |Rose |1 |2010 |20 |M |4000 |null |null |null |\n", 303 | "|3 |Williams|1 |2010 |10 |M |1000 |null |null |null |\n", 304 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 305 | "|1 |Smith |-1 |2018 |10 |M |3000 |null |null |null |\n", 306 | "|6 |Brown |2 |2010 |50 | |-1 |null |null |null |\n", 307 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 308 | "\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "leftjoin = empDF.join(deptDF, (empDF.emp_dept_id == deptDF.dept_id) & (empDF.name == deptDF.emp_name), \"left\") \n", 314 | "leftjoin.show(truncate= False)" 315 | ] 316 | }, 317 | { 318 | "cell_type": "code", 319 | "execution_count": 12, 320 | "id": "c7966499", 321 | "metadata": {}, 322 | "outputs": [ 323 | { 324 | "name": "stdout", 325 | "output_type": "stream", 326 | "text": [ 327 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 328 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 329 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 330 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 |Bill |\n", 331 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 332 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 333 | "|null |null |null |null |null |null |null |Sales |30 |Smith |\n", 334 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 335 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 336 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 337 | "\n", 338 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 339 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 340 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 341 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 |Bill |\n", 342 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 343 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 344 | "|null |null |null |null |null |null |null |Sales |30 |Smith |\n", 345 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 346 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 347 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 348 | "\n" 349 | ] 350 | } 351 | ], 352 | "source": [ 353 | "# Right Outer Join\n", 354 | "\n", 355 | "\n", 356 | "empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,\"right\") \\\n", 357 | " .show(truncate=False)\n", 358 | "empDF.join(deptDF,empDF.emp_dept_id == deptDF.dept_id,\"rightouter\") \\\n", 359 | " .show(truncate=False)\n" 360 | ] 361 | }, 362 | { 363 | "cell_type": "markdown", 364 | "id": "bda5eff2", 365 | "metadata": {}, 366 | "source": [ 367 | "## Using SQL Expression\n", 368 | "\n", 369 | "Since PySpark SQL support native SQL syntax, we can also write join operations after creating temporary tables on DataFrames and use these tables on `spark.sql()`." 
370 | ] 371 | }, 372 | { 373 | "cell_type": "code", 374 | "execution_count": 13, 375 | "id": "a2f9c926", 376 | "metadata": {}, 377 | "outputs": [], 378 | "source": [ 379 | "empDF.createOrReplaceTempView(\"EMP\")\n", 380 | "deptDF.createOrReplaceTempView(\"DEPT\")" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 14, 386 | "id": "7dfe1542", 387 | "metadata": {}, 388 | "outputs": [], 389 | "source": [ 390 | "joinDF = spark.sql(\"select * from EMP e, DEPT d where e.emp_dept_id == d.dept_id\") \n", 391 | "\n", 392 | "joinDF2 = spark.sql(\"select * from EMP e INNER JOIN DEPT d ON e.emp_dept_id == d.dept_id\")" 393 | ] 394 | }, 395 | { 396 | "cell_type": "code", 397 | "execution_count": 15, 398 | "id": "027fec39", 399 | "metadata": {}, 400 | "outputs": [ 401 | { 402 | "name": "stdout", 403 | "output_type": "stream", 404 | "text": [ 405 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 406 | "|emp_id|name |superior_emp_id|year_joined|emp_dept_id|gender|salary|dept_name|dept_id|emp_name|\n", 407 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 408 | "|1 |Smith |-1 |2018 |10 |M |3000 |Finance |10 |Bill |\n", 409 | "|3 |Williams|1 |2010 |10 |M |1000 |Finance |10 |Bill |\n", 410 | "|4 |Jones |2 |2005 |10 |F |2000 |Finance |10 |Bill |\n", 411 | "|2 |Rose |1 |2010 |20 |M |4000 |Marketing|20 |Joe |\n", 412 | "|5 |Brown |2 |2010 |40 | |-1 |IT |40 |Brown |\n", 413 | "+------+--------+---------------+-----------+-----------+------+------+---------+-------+--------+\n", 414 | "\n" 415 | ] 416 | } 417 | ], 418 | "source": [ 419 | "joinDF.show(truncate=False)" 420 | ] 421 | }, 422 | { 423 | "cell_type": "code", 424 | "execution_count": null, 425 | "id": "572fe1b8", 426 | "metadata": {}, 427 | "outputs": [], 428 | "source": [] 429 | } 430 | ], 431 | "metadata": { 432 | "kernelspec": { 433 | "display_name": "Python 3 (ipykernel)", 434 | "language": "python", 435 | "name": "python3" 436 | }, 437 | "language_info": { 438 | "codemirror_mode": { 439 | "name": "ipython", 440 | "version": 3 441 | }, 442 | "file_extension": ".py", 443 | "mimetype": "text/x-python", 444 | "name": "python", 445 | "nbconvert_exporter": "python", 446 | "pygments_lexer": "ipython3", 447 | "version": "3.9.7" 448 | } 449 | }, 450 | "nbformat": 4, 451 | "nbformat_minor": 5 452 | } 453 | -------------------------------------------------------------------------------- /scripts/manipulating-data-in-DataFrame.py: -------------------------------------------------------------------------------- 1 | """ 2 | - changing data types when they are incorrectly interpretted 3 | - Clean your data 4 | - create new columns 5 | - rename columns 6 | - extract or create new value 7 | """ 8 | 9 | import os 10 | import pyspark 11 | from pyspark.sql import SparkSession 12 | from pyspark.sql.types import * 13 | from pyspark.sql.functions import * 14 | from pyspark.sql.functions import col, lit, to_date, trim, lower 15 | 16 | # create a session 17 | spark = SparkSession.builder.appName("manipulatedata").getOrCreate() 18 | spark 19 | 20 | ## 21 | # Trending YouTube Video Statistics, https://www.kaggle.com/datasnaek/youtube-new 22 | path = "Datasets/" 23 | videos = spark.read.csv(path+"youtubevideos.csv", header=True, inferSchema=True) 24 | len(videos.columns) 25 | videos.limit(10).show() 26 | videos.printSchema() 27 | videos.select("publish_time").show(5, False) 28 | # publish_time, its schema needs to be modified. 
2017-11-13T17:13:01.000Z 29 | 30 | """PySpark withColumn() is a transformation function of DataFrame which is used to change the value, 31 | convert the datatype of an existing column, create a new column, and many more 32 | 33 | - PySpark withColumn – To change column DataType 34 | - Transform/change value of an existing column 35 | - Derive new column from an existing column 36 | - Add a column with the literal value 37 | - Rename column name 38 | - Drop DataFrame column 39 | 40 | here are the subclasses of the DataType in PySpark and we can change or cast DataFrame columns to *only* these types. 41 | 42 | ArrayType, BinaryType, BooleanType, CalendarIntervalType, DateType, HiveStringType, MapType, NullType, 43 | NumericType, ObjectType, StringType, StructType, TimestampType 44 | 45 | Syntax: to_date(timestamp_column,format) 46 | """ 47 | 48 | # create new data frame from videos DataFrame 49 | 50 | # change type of views column 51 | #df = videos.withColumn("views", videos["views"].cast(IntegerType())) 52 | df = videos.withColumn("views", col("views").cast(IntegerType())) \ 53 | .withColumn("likes", videos.likes.cast(IntegerType())) \ 54 | .withColumn("dislikes",videos.dislikes.cast(IntegerType())) \ 55 | .withColumn("trending_date", to_date(videos.trending_date,'yy.dd.mm')) \ 56 | .withColumn("publish_time", to_timestamp(videos.publish_time, 'yyyy-MM-dd HH:mm:ss')) 57 | 58 | df.describe() 59 | df.limit(4).show() 60 | df.printSchema() 61 | ## NOW, we face some problems here: 62 | #1) pyspark infer trendin_date incorrectly 2017-01-14, so how to fix it??? 63 | #2) publish time is null now !!!!!!!! and it is because of funky TZ in the original format 2017-11-13T17:13:01.000Z 64 | 65 | df = videos.withColumn("views", col("views").cast(IntegerType())) \ 66 | .withColumn("likes", videos.likes.cast(IntegerType())) \ 67 | .withColumn("dislikes",videos.dislikes.cast(IntegerType())) \ 68 | .withColumn("trending_date", to_date(videos.trending_date,'yy.dd.mm')) \ 69 | # .withColumn("publish_time", to_timestamp(videos.publish_time, 'yyyy-MM-dd HH:mm:ss')) 70 | # create a new column 71 | df = df.withColumn('publish_time_2', regexp_replace(df.publish_time, "T", " ")) 72 | # same for Z and replace it 73 | df = df.withColumn("publish_time_2", regexp_replace(df.publish_time_2, 'Z', '')) 74 | df.select('publish_time', 'publish_time_2').show(4, False) 75 | df.printSchema() 76 | # So now we can transform it to timestamp 77 | df = df.withColumn('publish_time_3', to_timestamp(df.publish_time_2, 'yyyy-MM-dd HH:mm:ss.SSS')) 78 | df.printSchema() 79 | df.show(4) 80 | # rename the colomn name 81 | #renamed_df = df.withColumnRenamed("newname", "publish_time_3") # 2017-11-13 17:13:01 the 000 just took that off, thats OK 82 | 83 | 84 | # TRANSLATE function, alternative way 85 | # NOte, here i am not creating object i m just showing 86 | df.select('publish_time', translate(col('publish_time'), "TZ", " ").alias('trans_col')).show(4, False) 87 | 88 | # ------------------------------------------------------------- 89 | # TRIM() 90 | df = df.withColumn('title', trim(df.title)) 91 | df.select('title').show(4, False) 92 | df = df.withColumn("title", lower(df.title)) 93 | df.select('title').show(4, False) 94 | #-------------------------------------------------------------- 95 | # case WHEN 96 | # option 1, when-otherwise 97 | # option 2 expr 98 | def CASE(args): 99 | pass 100 | 101 | 102 | df.select("likes", "dislikes", expr("CASE WHEN likes > dislikes THEN 'Good mvie' " 103 | "WHEN likes < dislikes THEN 'Bad movies' " 104 | 
"ELSE 'undetermined' END AS Favarability")).show(4) 105 | 106 | 107 | df.selectExpr("likes", "dislikes", "CASE WHEN likes > dislikes THEN 'Good mvie' " 108 | "WHEN likes < dislikes THEN 'Bad movies' " 109 | "ELSE 'undetermined' END AS Favarability").show(4) 110 | 111 | # --------------------------------------------------------------- 112 | # concatinate 113 | # Joining two columns for NLP, and added 114 | df = df.withColumn("title_channel", concat_ws(' ', df.title, df.channel_title)) 115 | df.show(4) 116 | df.printSchema() 117 | 118 | df.select("trending_date", year("trending_date"), month("trending_date")).show(4) 119 | 120 | array = df.select('title', split(df.title, ' ').alias('new')) 121 | array.show(4, False) # it return an array [we, want, to, talk, about, our, marriage], 122 | array.select('title', array_contains(array.new, 'about')).show(4) 123 | array.printSchema() 124 | array.show(4, False) 125 | 126 | # array_remove() 127 | ## 128 | """ 129 | Pyspark, User Defined Functions 130 | """ 131 | #PySpark UDF’s are similar to UDF on traditional databases. In PySpark, you create a function in a 132 | # Python syntax and wrap it with PySpark SQL udf() or register it as udf and use it on DataFrame and SQL respectively 133 | 134 | from pyspark.sql.functions import udf 135 | from pyspark.sql.types import IntegerType 136 | 137 | def squar(x): 138 | return int(x)**2 139 | 140 | square_udf = udf(lambda z: squar(z), IntegerType()) 141 | df.select('dislikes',square_udf('dislikes')).where(df.dislikes.isNotNull()) 142 | df.select('dislikes',square_udf('dislikes')).where(df.dislikes.isNotNull()).show(4) -------------------------------------------------------------------------------- /scripts/multiple_csv_to_dataframe.py: -------------------------------------------------------------------------------- 1 | # read multiple .csv file and create a dataframe 2 | 3 | import os 4 | """ 5 | # - **course_offerings:** uuid, course_uuid, term_code, name 6 | # - **instructors:** id, name 7 | # - **sections:** uuid, course_offering_uuid,room_uuid, schedule_uuid 8 | # - **teachings:** instructor_id, section_uuid 9 | # 10 | # **Source:** https://www.kaggle.com/Madgrades/uw-madison-course 11 | """ 12 | path = "Datasets/uw-madison-courses/" 13 | 14 | df_list = [] 15 | for filename in os.listdir(path): 16 | if filename.endswith(".csv"): 17 | filename_list = filename.split(".") # separate path from .csv 18 | df_name = filename_list[0] 19 | df = spark.read.csv(path + filename, inferSchema=True, header=True) 20 | df.name = df_name 21 | df_list.append(df_name) 22 | exec(df_name + ' = df') 23 | 24 | -------------------------------------------------------------------------------- /scripts/pivote-table.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import expr 4 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() 5 | 6 | data = [("Banana",1000,"USA"), ("Carrots",1500,"USA"), ("Beans",1600,"USA"), \ 7 | ("Orange",2000,"USA"),("Orange",2000,"USA"),("Banana",400,"China"), \ 8 | ("Carrots",1200,"China"),("Beans",1500,"China"),("Orange",4000,"China"), \ 9 | ("Banana",2000,"Canada"),("Carrots",2000,"Canada"),("Beans",2000,"Mexico")] 10 | 11 | columns= ["Product","Amount","Country"] 12 | spark = SparkSession.builder.appName("tabale").getOrCreate() 13 | df = spark.createDataFrame(data = data, schema = columns) 14 | df.printSchema() 15 | df.show(truncate=False) 16 | """ 17 | 
Product|Amount|Country| 18 | +-------+------+-------+ 19 | |Banana |1000 |USA | 20 | |Carrots|1500 |USA | 21 | |Beans |1600 |USA | 22 | |Orange |2000 |USA | 23 | |Orange |2000 |USA | 24 | |Banana |400 |China | 25 | |Carrots|1200 |China | 26 | |Beans |1500 |China | 27 | |Orange |4000 |China | 28 | |Banana |2000 |Canada | 29 | |Carrots|2000 |Canada | 30 | |Beans |2000 |Mexico | 31 | +-------+------+-------+ 32 | """ 33 | 34 | # PySpark SQL provides pivot() function to rotate the data from one column into multiple columns. 35 | # It is an aggregation where one of the grouping columns values transposed into individual columns with distinct data. 36 | # To get the total amount exported to each country of each product, will do group by Product, 37 | # pivot by Country, and the sum of Amount. 38 | pivotDF = df.groupBy("Product").pivot("Country").sum("Amount") 39 | pivotDF.printSchema() 40 | pivotDF.show(truncate=False) 41 | """ 42 | This will transpose the countries from DataFrame rows into columns and produces below output. where ever 43 | data is not present, it represents as null by default. 44 | 45 | +-------+------+-----+------+----+ 46 | |Product|Canada|China|Mexico|USA | 47 | +-------+------+-----+------+----+ 48 | |Orange |null |4000 |null |4000| 49 | |Beans |null |1500 |2000 |1600| 50 | |Banana |2000 |400 |null |1000| 51 | |Carrots|2000 |1200 |null |1500| 52 | +-------+------+-----+------+----+ 53 | """ -------------------------------------------------------------------------------- /scripts/pyspark-dataframe.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import col, lit 4 | from pyspark.sql.types import StructType, StructField, StringType,IntegerType 5 | 6 | # PySpark applications start with initializing SparkSession which is the entry point of PySpark as below. 7 | #In case of running it in PySpark shell via pyspark executable, the shell automatically creates the session in the variable spark for users 8 | 9 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() 10 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size() 11 | print("You are working with", cores, "core(s)") 12 | 13 | print(spark) 14 | 15 | -------------------------------------------------------------------------------- /scripts/read_write_DataFrame.py: -------------------------------------------------------------------------------- 1 | ## read csv file 2 | from pyspark.sql import SparkSession 3 | from pprint import pprint 4 | from pyspark.sql import SparkSession 5 | from pyspark.sql.types import StructType,StructField, StringType, IntegerType 6 | from pyspark.sql.types import ArrayType, DoubleType, BooleanType 7 | from pyspark.sql.functions import array_contains 8 | 9 | 10 | spark = SparkSession.builder.appName('readdata').master('local').getOrCreate 11 | # create a session and name it spark 12 | pprint(spark) 13 | 14 | ## ------------------------------------ 15 | # read csv format 16 | # PySpark supports reading a CSV file with a pipe, comma, tab, space, or any other delimiter/separator files. 17 | # Using csv("path") or format("csv").load("path") of DataFrameReader, you can read a CSV file into a PySpark DataFrame. 
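# As a quick sketch (the pipe-delimited file name below is hypothetical), the
# two reader forms are equivalent, and options such as sep/header/inferSchema
# can be passed to either:
#
#   pipe_df = spark.read.csv("Datasets/example_pipe.csv", sep="|", header=True, inferSchema=True)
#   pipe_df = spark.read.format("csv").option("sep", "|").option("header", True).load("Datasets/example_pipe.csv")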
18 | ## ------------------------------------- 19 | # file is located in a folder 20 | data_path = "Datasets/" 21 | students = spark.read.csv(data_path+'students.csv',header=True, inferSchema= True) 22 | pprint(students) 23 | students.printSchema() 24 | """ 25 | root 26 | |-- gender: string (nullable = true) 27 | |-- race/ethnicity: string (nullable = true) 28 | |-- parental level of education: string (nullable = true) 29 | |-- lunch: string (nullable = true) 30 | |-- test preparation course: string (nullable = true) 31 | |-- math score: integer (nullable = true) 32 | |-- reading score: integer (nullable = true) 33 | |-- writing score: integer (nullable = true) 34 | 35 | """ 36 | # OR 37 | 38 | df = spark.read.format('csv').load(data_path+'students.csv') 39 | 40 | pprint(df) 41 | df.printSchema() 42 | # this reads the data into DataFrame columns "_c0" for the first column and "_c1" for the second and so on 43 | """ 44 | root 45 | |-- _c0: string (nullable = true) 46 | |-- _c1: string (nullable = true) 47 | |-- _c2: string (nullable = true) 48 | |-- _c3: string (nullable = true) 49 | |-- _c4: string (nullable = true) 50 | |-- _c5: string (nullable = true) 51 | |-- _c6: string (nullable = true) 52 | |-- _c7: string (nullable = true) 53 | """ 54 | ## -------------------------------------- 55 | # read parquet format 56 | # _______________________________________ 57 | user1 = spark.read.parquet(data_path+'user*',header=True,inferSchema=True) 58 | user1.show(4) 59 | user1.count() 60 | 61 | ## --------------------------------------- 62 | # Reading CSV files with a user-specified custom schema 63 | # If you know the schema of the file ahead and do not want to use the inferSchema option for column names and types, 64 | # use user-defined custom column names and type using schema option 65 | # Refer dataset https://github.com/spark-examples/pyspark-examples/blob/master/resources/zipcodes.csv 66 | # lets try some stuff here 67 | df = spark.read.csv("resources/zipcodes.csv") 68 | df.printSchema() 69 | """ 70 | root 71 | |-- _c0: string (nullable = true) 72 | |-- _c1: string (nullable = true) 73 | |-- _c2: string (nullable = true) .... 74 | """ 75 | # there is one option here 76 | df2 = spark.read.option("header", True).csv("resources/zipcodes.csv") 77 | df2.printSchema() 78 | """ 79 | root 80 | |-- RecordNumber: string (nullable = true) 81 | |-- Zipcode: string (nullable = true) 82 | |-- ZipCodeType: string (nullable = true) ... 83 | """ 84 | # lets add another option 85 | df3 = spark.read.options(header = True, delimiter = ',').csv("resources/zipcodes.csv") 86 | df3.printSchema() 87 | df3.show(4) 88 | 89 | # The schema does not look correct. 
lets change the data type 90 | 91 | schema = StructType() \ 92 | .add("RecordNumber",IntegerType(),True) \ 93 | .add("Zipcode",IntegerType(),True) \ 94 | .add("ZipCodeType",StringType(),True) \ 95 | .add("City",StringType(),True) \ 96 | .add("State",StringType(),True) \ 97 | .add("LocationType",StringType(),True) \ 98 | .add("Lat",DoubleType(),True) \ 99 | .add("Long",DoubleType(),True) \ 100 | .add("Xaxis",IntegerType(),True) \ 101 | .add("Yaxis",DoubleType(),True) \ 102 | .add("Zaxis",DoubleType(),True) \ 103 | .add("WorldRegion",StringType(),True) \ 104 | .add("Country",StringType(),True) \ 105 | .add("LocationText",StringType(),True) \ 106 | .add("Location",StringType(),True) \ 107 | .add("Decommisioned",BooleanType(),True) \ 108 | .add("TaxReturnsFiled",StringType(),True) \ 109 | .add("EstimatedPopulation",IntegerType(),True) \ 110 | .add("TotalWages",IntegerType(),True) \ 111 | .add("Notes",StringType(),True) 112 | 113 | df_with_schema = spark.read.option("header", True).format("csv").schema(schema).load("resources/zipcodes.csv") 114 | df_with_schema.printSchema() 115 | """ 116 | root 117 | |-- RecordNumber: integer (nullable = true) 118 | |-- Zipcode: integer (nullable = true) 119 | |-- ZipCodeType: string (nullable = true) 120 | |-- City: string (nullable = true) ...""" 121 | 122 | ## df3.write.mode('overwrite').csv('zip.csv') 123 | ## =========================================== 124 | # so 125 | path = "path_to_data" 126 | # CSV 127 | df = spark.read.csv(path+'students.csv',inferSchema=True,header=True) 128 | 129 | # Json 130 | people = spark.read.json(path+'people.json') 131 | 132 | # Parquet 133 | parquet = spark.read.parquet(path+'users.parquet') 134 | 135 | # Partioned Parquet 136 | partitioned = spark.read.parquet(path+'users*') 137 | 138 | # Parts of a partitioned Parquet 139 | users1_2 = spark.read.option("basePath", path).parquet(path+'users1.parquet', path+'users2.parquet') 140 | 141 | #==================================================================================================================== 142 | # Applying DataFrame transformations 143 | 144 | # Once you have created DataFrame from the CSV file, you can apply all transformation and actions DataFrame support 145 | # -------------------------------------------------------- 146 | # Write PySpark DataFrame to CSV file 147 | # -------------------------------------------------------- 148 | # Use the write() method of the PySpark DataFrameWriter object to write PySpark DataFrame to a CSV fil 149 | # Options: While writing a CSV file you can use several options. for example, header to output the DataFrame column 150 | # names as header record and delimiter to specify the delimiter on the CSV output file. 
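# For example, a sketch of passing those two options explicitly (the output
# path here is illustrative only):
#
#   df3.write.csv("spark_output/zipcodes_pipe", header=True, sep="|", mode="overwrite")
#
# or, equivalently, with the option() style used for reading above:
#
#   df3.write.option("header", True).option("sep", "|").mode("overwrite").csv("spark_output/zipcodes_pipe")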
151 | # Other options available quote,escape,nullValue,dateFormat,quoteMode 152 | 153 | 154 | df3.write.mode('overwrite').csv("spark_output/zipcodes") # overwrite – mode is used to overwrite the existing file 155 | # one can also use this 156 | df2.write.format("csv").mode('overwrite').save("output/zipcodes") -------------------------------------------------------------------------------- /scripts/sample_data/data: -------------------------------------------------------------------------------- 1 | 2 | -------------------------------------------------------------------------------- /scripts/sample_data/data.txt: -------------------------------------------------------------------------------- 1 | Project Gutenberg’s 2 | Alice’s Adventures in Wonderland 3 | Project Gutenberg’s 4 | Adventures in Wonderland 5 | Project Gutenberg’s -------------------------------------------------------------------------------- /scripts/search-filter-DataFrame.py: -------------------------------------------------------------------------------- 1 | ## 2 | import pyspark 3 | from pyspark.sql import SparkSession 4 | from pyspark.sql.functions import * 5 | from pprint import pprint 6 | # May take awhile locally 7 | spark = SparkSession.builder.appName("FunctionsHW").getOrCreate() 8 | 9 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size() 10 | print("You are working with", cores, "core(s)") 11 | spark 12 | ## 13 | df = spark.read.csv('Datasets/df19.csv',inferSchema=True,header=True) 14 | print(df.printSchema()) 15 | df.limit(4).toPandas() 16 | df.show(4, truncate = True) 17 | len(df.columns) # 89 18 | df.describe 19 | 20 | df.select(['Aggression', 'Stamina']).show(5) 21 | df.select(['Aggression', 'Stamina']).summary('count','min').show() 22 | df.select(['Name', 'Age']).orderBy(df['Age'].desc()).show(5) 23 | df.select(['*']).show(5) 24 | 25 | ## filtering data horizontally WHERE condition 26 | df.select(["Name", "age", "Club"]).show(5) 27 | df.select(["Name", "age", "Club"]).where(df.Club.like("%celon%")).show(5) 28 | 29 | # ------------------------------ 30 | # SELECT SUBSTRING ('what a wonderful DAY' from 2 for 6); -- hat a 31 | df.select("Photo",df.Photo.substr(-4,5).alias('the last 4 charachter')).show(5) 32 | # ISIN(list) 33 | df['Name','club','Nationality'].filter("Club IN ('FC Barcelona')").limit(4).toPandas() 34 | 35 | df.select('Name').where(df.Name.startswith("L")).show(5) 36 | df.select('Name', 'Club').where(df.Name.startswith("L")).where(df.Name.endswith('i')).where(df.Club.like('%Barcelona')).show(5) 37 | 38 | ## SLICING DataFrame, take n number of rows 39 | df.count() 40 | df1 = df.limit(100) 41 | df1.show(5, True).toPandas() 42 | 43 | # SLICING, take n number of colomns 44 | df2 = df.select('Name', 'Club', "Nationality") 45 | # OR 46 | df_sel_col = df.select(df.columns[0:5]) 47 | # 48 | df2.limit(5).show() 49 | len(df2.columns) #3 50 | df2.count() #18207 51 | len(df_sel_col.columns) #5 52 | df_sel_col.limit(5).show() 53 | 54 | df.printSchema() 55 | df['Name', 'Weight'].filter("Overall>50").limit(4).show() 56 | df['Name', 'Weight'].limit(4).show() 57 | df.select(['Name','Position','Release Clause']).show(5,False) 58 | # Display the same results from above sorted by the players names 59 | df.select(['Name','Position']).orderBy('Name').show(5) 60 | df.select(['Name','Position','Age']).orderBy(df['Age'].desc()).show(5) 61 | 62 | # Select only the players who belong to a club begining with FC 63 | # One way 64 | df.select("Name","Club").where(df.Club.like("FC%")).show(5, False) 65 | 66 | # 
Another way 67 | df.select("Name","Club").where(df.Club.startswith("FC")).limit(4).toPandas() 68 | 69 | ## ====================================================== 70 | # to create a new dataframe 71 | df = df.limit(100) 72 | df.count() 73 | 74 | # if we slice the colomns 75 | df2_col = df.columns[0:5] 76 | df2 = df.select(df2_col) 77 | df2.count() 78 | df2.show(5,False) 79 | # count the colomn 80 | len(df2.columns) 81 | 82 | df.filter("Age>40").select(['Name','Age']).limit(4).toPandas() 83 | # COLLECTING RESULTS AS OBJECTS ----> .COLLECT() method, it will collect results as a python object 84 | df4 = df.select('Name', 'Club').where(df.Name.startswith("L")).collect() #object is list 85 | df4.toPandas() 86 | type(df4[0]) # 'pyspark.sql.types.Row' 87 | print("Name start with L:",df4[1][1]) -------------------------------------------------------------------------------- /scripts/update-column-DataFrame.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() 4 | 5 | data = [('James','Smith','M',3000),('Anna','Rose','F',4100), 6 | ('Robert','Williams','NA',6200),(None,'Rob','F',6200) 7 | 8 | ] 9 | 10 | columns = ["firstname","lastname","gender","salary"] 11 | df = spark.createDataFrame(data=data, schema = columns) 12 | df.show() 13 | 14 | df2=df.withColumn("salary", df.salary*3) 15 | df2.show() 16 | #----------------------------------------------------------- 17 | from pyspark.sql.functions import when 18 | df3 = df.withColumn("gender", when(df.gender == "M","Male") \ 19 | .when(df.gender == "F","Female") \ 20 | .otherwise(df.gender)) 21 | df3.show() 22 | #------------------------------------------------------------ 23 | 24 | df4=df.withColumn("salary",df.salary.cast("String")) 25 | df4.printSchema() 26 | 27 | df.createOrReplaceTempView("PER") 28 | df5=spark.sql("select firstname,gender,salary*3 as salary from PER") 29 | df5.show() 30 | -------------------------------------------------------------------------------- /scripts/user-defined-function.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import col, udf 4 | from pyspark.sql.types import StringType 5 | 6 | spark = SparkSession.builder.appName('SparkByExamples.com').getOrCreate() 7 | 8 | columns = ["Seqno","Name"] 9 | data = [("1", "john jones"), 10 | ("2", "tracey smith"), 11 | ("3", "amy sanders")] 12 | 13 | """ 14 | +-----+------------+ 15 | |Seqno|Names | 16 | +-----+------------+ 17 | |1 |john jones | 18 | |2 |tracey smith| 19 | |3 |amy sanders | 20 | +-----+------------+ 21 | """ 22 | 23 | df = spark.createDataFrame(data=data,schema=columns) 24 | 25 | df.show(truncate=False) 26 | # Create a Python Function 27 | #The first step in creating a UDF is creating a Python function. Below snippet creates a function convertCase() 28 | # which takes a string parameter and converts the first letter of every word to capital letter 29 | def convertCase(str): 30 | resStr="" 31 | arr = str.split(" ") 32 | for x in arr: 33 | resStr= resStr + x[0:1].upper() + x[1:len(x)] + " " 34 | return resStr 35 | 36 | # Converting function to UDF 37 | # Now convert this function convertCase() to UDF by passing the function to PySpark SQL udf(), this function is available at 38 | # org.apache.spark.sql.functions.udf package. 
Make sure you import this package before using it 39 | convertUDF = udf(lambda z: convertCase(z), StringType) 40 | # Note: The default type of the udf() is StringType hence, you can also write the above statement without return type 41 | # Now you can use convertUDF() on a DataFrame column as a regular build-in function. 42 | df.select(col("Seqno"), \ 43 | convertUDF(col("Name")).alias("Name") ) \ 44 | .show(truncate=False) 45 | """ 46 | +-----+-------------+ 47 | |Seqno|Name | 48 | +-----+-------------+ 49 | |1 |John Jones | 50 | |2 |Tracey Smith | 51 | |3 |Amy Sanders | 52 | +-----+-------------+ 53 | """ 54 | # Using UDF with PySpark DataFrame withColumn() 55 | @udf(returnType=StringType()) 56 | def upperCase(str): 57 | return str.upper() 58 | 59 | upperCaseUDF = udf(lambda z:upperCase(z),StringType()) 60 | 61 | df.withColumn("Cureated Name", upperCase(col("Name"))) \ 62 | .show(truncate=False) 63 | 64 | """ Using UDF on SQL """ 65 | # Registering PySpark UDF & use it on SQL 66 | # In order to use convertCase() function on PySpark SQL, you need to register the function with PySpark by using spark.udf.register() 67 | spark.udf.register("convertUDF", convertCase,StringType()) 68 | df.createOrReplaceTempView("NAME_TABLE") 69 | spark.sql("select Seqno, convertUDF(Name) as Name from NAME_TABLE") \ 70 | .show(truncate=False) 71 | 72 | spark.sql("select Seqno, convertUDF(Name) as Name from NAME_TABLE " + \ 73 | "where Name is not null and convertUDF(Name) like '%John%'") \ 74 | .show(truncate=False) 75 | 76 | """ null check """ 77 | # UDF’s are error-prone when not designed carefully. for example, when you have a column that contains the value null on some records 78 | columns = ["Seqno","Name"] 79 | data = [("1", "john jones"), 80 | ("2", "tracey smith"), 81 | ("3", "amy sanders"), 82 | ('4',None)] 83 | 84 | df2 = spark.createDataFrame(data=data,schema=columns) 85 | df2.show(truncate=False) 86 | df2.createOrReplaceTempView("NAME_TABLE2") 87 | 88 | spark.udf.register("_nullsafeUDF", lambda str: convertCase(str) if not str is None else "" , StringType()) 89 | 90 | spark.sql("select _nullsafeUDF(Name) from NAME_TABLE2") \ 91 | .show(truncate=False) 92 | 93 | spark.sql("select Seqno, _nullsafeUDF(Name) as Name from NAME_TABLE2 " + \ 94 | " where Name is not null and _nullsafeUDF(Name) like '%John%'") \ 95 | .show(truncate=False) 96 | 97 | 98 | 99 | 100 | 101 | -------------------------------------------------------------------------------- /search_filter_dataframe.py: -------------------------------------------------------------------------------- 1 | import pyspark 2 | from pyspark.sql import SparkSession 3 | from pyspark.sql.functions import * 4 | # May take awhile locally 5 | spark = SparkSession.builder.appName("FunctionsHW").getOrCreate() 6 | 7 | cores = spark._jsc.sc().getExecutorMemoryStatus().keySet().size() 8 | print("You are working with", cores, "core(s)") 9 | spark 10 | 11 | fifa = spark.read.csv('Datasets/fifa19.csv',inferSchema=True,header=True) 12 | print(fifa.printSchema()) 13 | 14 | 15 | fifa.select(['Name','Position','Release Clause']).show(5,False) 16 | # Display the same results from above sorted by the players names 17 | fifa.select(['Name','Position']).orderBy('Name').show(5) 18 | fifa.select(['Name','Position','Age']).orderBy(fifa['Age'].desc()).show(5) 19 | 20 | # Select only the players who belong to a club begining with FC 21 | # One way 22 | fifa.select("Name","Club").where(fifa.Club.like("FC%")).show(5, False) 23 | 24 | # Another way 25 | 
fifa.select("Name","Club").where(fifa.Club.startswith("FC")).limit(4).toPandas() 26 | 27 | ## ====================================================== 28 | # to create a new dataframe 29 | df = fifa.limit(100) 30 | df.count() 31 | 32 | # if we slice the colomns 33 | df2_col = fifa.columns[0:5] 34 | df2 = fifa.select(df2_col) 35 | df2.count() 36 | df2.show(5,False) 37 | # count the colomn 38 | len(df2.columns) 39 | # =[======================================================= 40 | # Filtering data 41 | # ======================================================== 42 | 43 | fifa.filter("Age>40").select(['Name','Age']).limit(4).toPandas() 44 | -------------------------------------------------------------------------------- /spark-env.yml: -------------------------------------------------------------------------------- 1 | name: spark 2 | channels: 3 | - conda-forge 4 | - defaults 5 | dependencies: 6 | - _libgcc_mutex=0.1=main 7 | - argon2-cffi=20.1.0=py37h7b6447c_1 8 | - async_generator=1.10=py37h28b3542_0 9 | - attrs=20.3.0=pyhd3eb1b0_0 10 | - autopep8=1.5.4=py_0 11 | - backcall=0.2.0=py_0 12 | - bleach=3.2.1=py_0 13 | - ca-certificates=2020.12.8=h06a4308_0 14 | - certifi=2020.12.5=py37h06a4308_0 15 | - cffi=1.14.4=py37h261ae71_0 16 | - decorator=4.4.2=py_0 17 | - defusedxml=0.6.0=py_0 18 | - entrypoints=0.3=py37_0 19 | - icu=58.2=hf484d3e_1000 20 | - importlib-metadata=2.0.0=py_1 21 | - importlib_metadata=2.0.0=1 22 | - ipykernel=5.3.4=py37h5ca1d4c_0 23 | - ipython=7.19.0=py37hb070fc8_0 24 | - ipython_genutils=0.2.0=pyhd3eb1b0_1 25 | - jedi=0.18.0=py37h06a4308_0 26 | - jinja2=2.11.2=py_0 27 | - jsonschema=3.2.0=py_2 28 | - jupyter_client=6.1.7=py_0 29 | - jupyter_contrib_core=0.3.3=py_2 30 | - jupyter_contrib_nbextensions=0.5.1=py37hc8dfbb8_1 31 | - jupyter_core=4.7.0=py37h06a4308_0 32 | - jupyter_highlight_selected_word=0.2.0=py37hc8dfbb8_1002 33 | - jupyter_latex_envs=1.4.6=py37hc8dfbb8_1001 34 | - jupyter_nbextensions_configurator=0.4.1=py37hc8dfbb8_2 35 | - jupyterlab_pygments=0.1.2=py_0 36 | - ld_impl_linux-64=2.33.1=h53a641e_7 37 | - libedit=3.1.20191231=h14c3975_1 38 | - libffi=3.3=he6710b0_2 39 | - libgcc-ng=9.1.0=hdf63c60_0 40 | - libsodium=1.0.18=h7b6447c_0 41 | - libstdcxx-ng=9.1.0=hdf63c60_0 42 | - libxml2=2.9.10=hb55368b_3 43 | - libxslt=1.1.34=hc22bd24_0 44 | - lxml=4.6.2=py37h9120a33_0 45 | - markupsafe=1.1.1=py37h14c3975_1 46 | - mistune=0.8.4=py37h14c3975_1001 47 | - nbclient=0.5.1=py_0 48 | - nbconvert=6.0.7=py37_0 49 | - nbformat=5.0.8=py_0 50 | - ncurses=6.2=he6710b0_1 51 | - nest-asyncio=1.4.3=pyhd3eb1b0_0 52 | - notebook=6.1.6=py37h06a4308_0 53 | - openssl=1.1.1i=h27cfd23_0 54 | - packaging=20.8=pyhd3eb1b0_0 55 | - pandoc=2.11=hb0f4dca_0 56 | - pandocfilters=1.4.3=py37h06a4308_1 57 | - pexpect=4.8.0=pyhd3eb1b0_3 58 | - pickleshare=0.7.5=pyhd3eb1b0_1003 59 | - pip=20.3.3=py37h06a4308_0 60 | - prometheus_client=0.9.0=pyhd3eb1b0_0 61 | - prompt-toolkit=3.0.8=py_0 62 | - ptyprocess=0.7.0=pyhd3eb1b0_2 63 | - pycodestyle=2.6.0=py_0 64 | - pycparser=2.20=py_2 65 | - pygments=2.7.3=pyhd3eb1b0_0 66 | - pyparsing=2.4.7=py_0 67 | - pyrsistent=0.17.3=py37h7b6447c_0 68 | - python=3.7.9=h7579374_0 69 | - python-dateutil=2.8.1=py_0 70 | - python_abi=3.7=1_cp37m 71 | - pyzmq=20.0.0=py37h2531618_1 72 | - readline=8.0=h7b6447c_0 73 | - send2trash=1.5.0=pyhd3eb1b0_1 74 | - setuptools=51.0.0=py37h06a4308_2 75 | - six=1.15.0=py37h06a4308_0 76 | - sqlite=3.33.0=h62c20be_0 77 | - terminado=0.9.2=py37h06a4308_0 78 | - testpath=0.4.4=py_0 79 | - tk=8.6.10=hbc83047_0 80 | - toml=0.10.1=py_0 81 | - 
tornado=6.1=py37h27cfd23_0 82 | - traitlets=5.0.5=py_0 83 | - wcwidth=0.2.5=py_0 84 | - webencodings=0.5.1=py37_1 85 | - wheel=0.36.2=pyhd3eb1b0_0 86 | - xz=5.2.5=h7b6447c_0 87 | - yaml=0.2.5=h516909a_0 88 | - zeromq=4.3.3=he6710b0_3 89 | - zipp=3.4.0=pyhd3eb1b0_0 90 | - zlib=1.2.11=h7b6447c_3 91 | - pip: 92 | - findspark==1.4.2 93 | - jupyter-contrib-core==0.3.3 94 | - numpy==1.19.5 95 | - pandas==1.2.0 96 | - parso==0.8.1 97 | - py4j==0.10.9 98 | - pyspark==3.0.1 99 | - pytz==2020.5 100 | - pyyaml==5.3.1 101 | - pymongo==3.11.2 102 | prefix: /home/najmeh/anaconda3/envs/spark 103 | --------------------------------------------------------------------------------
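With an environment created from spark-env.yml active, a minimal smoke test along these lines (the app name is illustrative, and SPARK_HOME is assumed to be set or otherwise discoverable by findspark) confirms that the pinned pyspark 3.0.1 build is importable and a local session can start:

import findspark
findspark.init()                      # locate SPARK_HOME before importing pyspark

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[*]").appName("env_smoke_test").getOrCreate()
print(spark.version)                  # expected to report 3.0.1 with the packages pinned above
spark.stop()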