├── Chapter04 ├── CallDetailRecord.java ├── ReadWriteParquet.scala ├── Example 4-4.r ├── Code-Snippets.scala ├── Example 4-1.scala ├── SparkSQLHiveIntegration.py ├── SparkSQLHiveIntegration.java ├── Example 4-3.py ├── Example 4-2.java ├── ReadWriteParquet.py ├── ReadWriteParquet.r ├── SparkSQLHiveIntegration.r ├── ReadWriteParquet.java └── RDDConversion.java ├── Chapter06 ├── BuildingPipeline.scala ├── Example01.scala └── BuildingPipeline.py ├── Chapter07 ├── BuildingGraph.scala ├── TerrorAnalytics-GraphFrames.scala ├── ConnectedComponents.scala └── BuildPageRank.scala ├── Chapter03 ├── Example 03-09.py ├── Example 03-07.scala ├── Example 03-08.scala ├── Example 03-11.scala ├── Example 03-12.py ├── Example 03-01.scala ├── Example 03-03.scala ├── Example 03-02.py ├── Example 03-05.py ├── Example 03-10.java ├── Example 03-03.java └── Example 03-06.java ├── Chapter02 ├── Example2-2.Py ├── Example2-1.scala ├── Example2-3.java ├── Example2-8.Py ├── Example2-11.Py ├── Example2-7.scala ├── Example2-10.scala ├── Example2-12.java ├── Example2-5.Py ├── Example2-9.java ├── Example2-18.scala ├── Example2-4.scala ├── Example2-14.Py ├── Example2-16.scala ├── Example2-13.scala ├── Example2-17.scala ├── Example2-15.java ├── Example2-6.java ├── Example2-20.scala └── Example2-19.scala ├── Chapter05 └── StreamingWordCount.scala ├── DataSets ├── products.json └── cdrs.json ├── LICENSE ├── Chapter09 └── BuildRecommendationEngine.scala ├── README.md └── Chapter10 └── ChurnPrediction.ipynb /Chapter04/CallDetailRecord.java: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter04/ReadWriteParquet.scala: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter06/BuildingPipeline.scala: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter07/BuildingGraph.scala: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /Chapter03/Example 03-09.py: -------------------------------------------------------------------------------- 1 | testDS = spark.read.csv("/home/spark/sampledata/test.tsv",sep="\t") -------------------------------------------------------------------------------- /Chapter03/Example 03-07.scala: -------------------------------------------------------------------------------- 1 | pricePaidDS = spark.write.format(“csv”).save("/home/spark/sampledata/price_paid_output”) -------------------------------------------------------------------------------- /Chapter04/Example 4-4.r: -------------------------------------------------------------------------------- 1 | sparkR.session(appName = "MyApp", sparkConfig = list(spark.some.config.option = "some-value")) -------------------------------------------------------------------------------- /Chapter02/Example2-2.Py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter02/Example2-2.Py -------------------------------------------------------------------------------- /Chapter02/Example2-1.scala: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter02/Example2-1.scala -------------------------------------------------------------------------------- /Chapter02/Example2-3.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter02/Example2-3.java -------------------------------------------------------------------------------- /Chapter03/Example 03-08.scala: -------------------------------------------------------------------------------- 1 | val testDS = spark.read.format("csv").option("delimiter","\t").load("/home/spark/sampledata/test.tsv") -------------------------------------------------------------------------------- /Chapter04/Code-Snippets.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/Code-Snippets.scala -------------------------------------------------------------------------------- /Chapter04/Example 4-1.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/Example 4-1.scala -------------------------------------------------------------------------------- /Chapter02/Example2-8.Py: -------------------------------------------------------------------------------- 1 | data = sc.parallelize( [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20]) 2 | data.sample(1,0.1,12345).collect() 3 | -------------------------------------------------------------------------------- /Chapter05/StreamingWordCount.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter05/StreamingWordCount.scala -------------------------------------------------------------------------------- /Chapter04/SparkSQLHiveIntegration.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/SparkSQLHiveIntegration.py -------------------------------------------------------------------------------- /Chapter02/Example2-11.Py: -------------------------------------------------------------------------------- 1 | movieList = sc.parallelize(["A Nous Liberte","Airplane","The Apartment","The Apartment"]) 2 | movieList.distinct().collect() 3 | -------------------------------------------------------------------------------- /Chapter04/SparkSQLHiveIntegration.java: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/SparkSQLHiveIntegration.java -------------------------------------------------------------------------------- /Chapter02/Example2-7.scala: -------------------------------------------------------------------------------- 1 | val data = sc.parallelize( List(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20)); 2 | data.sample(true,0.1,12345).collect() 3 | -------------------------------------------------------------------------------- /Chapter07/TerrorAnalytics-GraphFrames.scala: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter07/TerrorAnalytics-GraphFrames.scala 
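Note on the GraphFrames example linked above: the file itself is only referenced by URL here, not inlined. For orientation only — this is not the contents of that file, just a minimal sketch of the API it builds on — a GraphFrame is constructed from two DataFrames: vertices with an `id` column and edges with `src` and `dst` columns. GraphFrames is a separate Spark package that must be added to the session (for example with spark-shell's `--packages` option, choosing a graphframes version that matches your Spark and Scala build). The `spark` session and the sample data below are assumptions for illustration.

import org.graphframes.GraphFrame

// Vertices: one row per caller, keyed by "id".
val people = spark.createDataFrame(Seq(
  ("1", "Alice"), ("2", "Bob"), ("3", "Charlie")
)).toDF("id", "name")

// Edges: one row per call; "src" and "dst" reference vertex ids.
val calls = spark.createDataFrame(Seq(
  ("1", "2"), ("2", "3"), ("3", "1")
)).toDF("src", "dst")

val graph = GraphFrame(people, calls)
graph.inDegrees.show()   // incoming-call count per vertex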
-------------------------------------------------------------------------------- /Chapter02/Example2-10.scala: -------------------------------------------------------------------------------- 1 | val movieList = sc.parallelize(List("A Nous Liberte","Airplane","The Apartment","The Apartment")) 2 | moviesList.distinct().collect() 3 | -------------------------------------------------------------------------------- /Chapter02/Example2-12.java: -------------------------------------------------------------------------------- 1 | JavaRDD movieList = sc.parallelize(Arrays.asList("A Nous Liberte","Airplane","The Apartment","The Apartment")); 2 | movieList.distinct().collect(); -------------------------------------------------------------------------------- /Chapter02/Example2-5.Py: -------------------------------------------------------------------------------- 1 | movies = sc.parallelize(["Pulp Fiction","Requiem for a dream","A clockwork Orange"]) 2 | movies.flatMap(lambda movieTitle: movieTitle.split(" ")).collect() 3 | -------------------------------------------------------------------------------- /Chapter02/Example2-9.java: -------------------------------------------------------------------------------- 1 | JavaRDD nums = sc.parallelize(Arrays.asList( 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20)); 2 | nums.sample(true,0.1,12345).collect(); 3 | -------------------------------------------------------------------------------- /Chapter02/Example2-18.scala: -------------------------------------------------------------------------------- 1 | sc.parallelize(Seq(10, 4, 2, 12, 3)).takeOrdered(1) 2 | // returns Array(2) 3 | 4 | sc.parallelize(Seq(2, 3, 4, 5, 6)).takeOrdered(2) 5 | // returns Array(2, 3) 6 | -------------------------------------------------------------------------------- /Chapter02/Example2-4.scala: -------------------------------------------------------------------------------- 1 | val favMovies = sc.parallelize(List("Pulp Fiction","Requiem for a dream","A clockwork Orange")); 2 | movies.flatMap(movieTitle=>movieTitle.split(" ")).collect() 3 | -------------------------------------------------------------------------------- /Chapter03/Example 03-11.scala: -------------------------------------------------------------------------------- 1 | val data = sc.parallelize(List(("MyKey1","MyValue1"),("MyKey2","MyValue2"),("MyKey3","MyValue3"))) 2 | 3 | data.saveAsSequenceFile("/home/spark/sampledata/seq-example") -------------------------------------------------------------------------------- /DataSets/products.json: -------------------------------------------------------------------------------- 1 | {"prodname":"iPhone", "model":"4s", "price":490} 2 | {"prodname":"Samsung", "model":"Galaxy Note 7", "desc":"Catches fire while charging"} 3 | {"prodname":"iPhone", "model":"7s", "description":"nothing changed"} 4 | -------------------------------------------------------------------------------- /Chapter02/Example2-14.Py: -------------------------------------------------------------------------------- 1 | java_skills= sc.parallelize(["Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh"]) 2 | db_skills= sc.parallelize(["James Kent","Paul Jones","Tom Mahoney","Adam Waugh"]) 3 | java_skills.intersection(db_skills).collect() -------------------------------------------------------------------------------- /Chapter04/Example 4-3.py: -------------------------------------------------------------------------------- 1 | from pyspark.sql import SparkSession 2 | 3 | spark = SparkSession \ 4 | 
.builder \ 5 | .appName("Python Spark SQL basic example") \ 6 | .config("spark.some.config.option", "some-value") \ 7 | .getOrCreate() 8 | 9 | -------------------------------------------------------------------------------- /Chapter02/Example2-16.scala: -------------------------------------------------------------------------------- 1 | val java_skills=sc.parallelize(List("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh")) 2 | val db_skills= sc.parallelize(List("James Kent","Paul Jones","Tom Mahoney","Adam Waugh")) 3 | java_skills.union(db_skills).collect() 4 | -------------------------------------------------------------------------------- /Chapter02/Example2-13.scala: -------------------------------------------------------------------------------- 1 | val java_skills=sc.parallelize(List("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh")) 2 | val db_skills= sc.parallelize(List("James Kent","Paul Jones","Tom Mahoney","Adam Waugh")) 3 | java_skills.intersection(db_skills).collect() 4 | -------------------------------------------------------------------------------- /Chapter02/Example2-17.scala: -------------------------------------------------------------------------------- 1 | val java_skills=sc.parallelize(List("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh")) 2 | val db_skills= sc.parallelize(List("James Kent","Paul Jones","Tom Mahoney","Adam Waugh")) 3 | java_skills.subtract(db_skills).collect() 4 | -------------------------------------------------------------------------------- /Chapter03/Example 03-12.py: -------------------------------------------------------------------------------- 1 | data = sc.parallelize([("MyKey1","MyValue1"),("MyKey2","MyValue2"),("MyKey3","MyValue3")]) 2 | 3 | data.collect() 4 | [('MyKey1', 'MyValue1'), ('MyKey2', 'MyValue2'), ('MyKey3', 'MyValue3')] 5 | 6 | data.saveAsSequenceFile("/home/spark/sampledata/seq-py-example") 7 | -------------------------------------------------------------------------------- /Chapter03/Example 03-01.scala: -------------------------------------------------------------------------------- 1 | //To read all README.md file 2 | val dataFile = sc.textFile("README.md") 3 | 4 | //Split line to words, and flatten the result of each split 5 | val words = dataFile.flatMap(line => line.split(" ")) 6 | //Save to textFile 7 | words.saveAsTextFile("/tmp/scalawords/") 8 | -------------------------------------------------------------------------------- /Chapter03/Example 03-03.scala: -------------------------------------------------------------------------------- 1 | //To read all README.md file 2 | val dataFile = sc.textFile("README.md") 3 | 4 | //Split line to words, and flatten the result of each split 5 | val words = dataFile.flatMap(line => line.split(" ")) 6 | //Save to textFile 7 | words.saveAsTextFile("/tmp/scalawords/") 8 | -------------------------------------------------------------------------------- /Chapter02/Example2-15.java: -------------------------------------------------------------------------------- 1 | JavaRDD javaSkills= sc.parallelize(Arrays.asList("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh")); 2 | JavaRDD dbSkills= sc.parallelize(Arrays.asList("James Kent","Paul Jones","Tom Mahoney","Adam Waugh")); 3 | javaSkills.intersection(dbSkills).collect(); -------------------------------------------------------------------------------- /Chapter03/Example 03-02.py: -------------------------------------------------------------------------------- 1 | 2 | //To read all README.md file 3 | 
dataFile = sc.textFile("README.md") 4 | 5 | //Split line to words, and flatten the result of each split 6 | words = dataFile.flatMap(lambda line: line.split(" ")) 7 | 8 | //Save as TextFile 9 | words.saveAsTextFile("/tmp/pythonwords/") 10 | -------------------------------------------------------------------------------- /Chapter03/Example 03-05.py: -------------------------------------------------------------------------------- 1 | 2 | //To read all README.md file 3 | dataFile = sc.textFile("README.md") 4 | 5 | //Split line to words, and flatten the result of each split 6 | words = dataFile.flatMap(lambda line: line.split(" ")) 7 | 8 | //Save as TextFile 9 | words.saveAsTextFile("/tmp/pythonwords/") 10 | -------------------------------------------------------------------------------- /Chapter03/Example 03-10.java: -------------------------------------------------------------------------------- 1 | SparkSession spark = SparkSession.builder() 2 | .master("local") 3 | .appName("SparkCSVExample") 4 | .config("spark.some.config.option", "some-value") 5 | .getOrCreate(); 6 | 7 | Dataset pricePaidDS = spark.read().option("sep","\t").csv(fileName); 8 | -------------------------------------------------------------------------------- /Chapter04/Example 4-2.java: -------------------------------------------------------------------------------- 1 | val transCount = transactions.cartesian(products).filter{ 2 | case (TransProdId,ProdProdId) => TransProdId == ProdProdId 3 | } 4 | .filter{case(TransProdId, ProdProdId) => ProdProdId = 3500 5 | } 6 | .map{ 7 | case (TransProdId,ProdProdId) => TransProdId 8 | }.count 9 | 10 | Println(transCount) 11 | -------------------------------------------------------------------------------- /Chapter03/Example 03-03.java: -------------------------------------------------------------------------------- 1 | //To read all README.md file 2 | JavaRDD dataFile = sc.textFile(fileName); 3 | 4 | //Split line to words, and flatten the result of each split 5 | JavaRDD words = dataFile.flatMap(line -> Arrays.asList(line.split(" ")).iterator()); 6 | 7 | //Save as TextFile 8 | words.saveAsTextFile(outputFile); 9 | -------------------------------------------------------------------------------- /Chapter03/Example 03-06.java: -------------------------------------------------------------------------------- 1 | //To read all README.md file 2 | JavaRDD dataFile = sc.textFile(fileName); 3 | 4 | //Split line to words, and flatten the result of each split 5 | JavaRDD words = dataFile.flatMap(line -> Arrays.asList(line.split(" ")).iterator()); 6 | 7 | //Save as TextFile 8 | words.saveAsTextFile(outputFile); 9 | -------------------------------------------------------------------------------- /Chapter02/Example2-6.java: -------------------------------------------------------------------------------- 1 | JavaRDD movies = sc.parallelize 2 | (Arrays.asList("Pulp Fiction","Requiem for a dream" 3 | ,"A clockwork Orange") 4 | ); 5 | 6 | JavaRDD movieName = movies.flatMap( 7 | new FlatMapFunction(){ 8 | public Iterator call(String movie){ 9 | return Arrays.asList(movie.split(" ")) 10 | .iterator(); 11 | } 12 | } 13 | ); 14 | -------------------------------------------------------------------------------- /Chapter02/Example2-20.scala: -------------------------------------------------------------------------------- 1 | val sampleData = sc.parallelize(Array(("k1",10),("k2",5),("k1",6),("k3",4),("k2",1),("k3",4))) 2 | val sumCount = sampleData.combineByKey(value => (value,1), 3 | (valcntpair: (Int,Int), 
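// combineByKey takes three functions: createCombiner (the first argument above) turns the first value seen for a key into a (sum, count) pair;
// mergeValue (the second argument, whose parameter list wraps around this comment) folds each further value for that key into that pair;
// mergeCombiners (the last argument) merges the per-partition (sum, count) pairs.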
value) => (valcntpair._1 + value, valcntpair._2+1), 4 | (valcntpair: (Int,Int), valcntpairnxt: (Int,Int)) => ((valcntpair._1 + valcntpairnxt._1),(valcntpair._2 + valcntpairnxt._2))) 5 | 6 | sumCount.take(3) 7 | val avgByKey = sumCount.map{case (label,value) => (label, value._1/value._2)} 8 | avgByKey.take(3) 9 | 10 | -------------------------------------------------------------------------------- /Chapter02/Example2-19.scala: -------------------------------------------------------------------------------- 1 | #Input Data 2 | val storeSales = sc.parallelize(Array(("London", 23.4),("Manchester",19.8),("Leeds",14.7),("London",26.6))) 3 | 4 | 5 | #GroupByKey 6 | storeSales.groupByKey().map(location=>(location._1,location._2.sum)).collect() 7 | 8 | #SampleResult 9 | #res2: Array[(String, Double)] = Array((Manchester,19.8), (London,50.0), (Leeds,14.7)) 10 | 11 | #ReduceByKey 12 | storeSales.reduceByKey(_+_).collect() 13 | 14 | #Sample Result 15 | #res1: Array[(String, Double)] = Array((Manchester,19.8), (London,50.0), (Leeds,14.7)) 16 | -------------------------------------------------------------------------------- /Chapter04/ReadWriteParquet.py: -------------------------------------------------------------------------------- 1 | #Reading a JSON file as a DataFrame 2 | callDetailsDF = spark.read.json("/home/spark/sampledata/json/cdrs.json") 3 | # Write the DataFrame out as a Parquet File 4 | callDetailsDF.write.parquet("cdrs.parquet") 5 | # Loading the Parquet File as a DataFrame 6 | callDetailsParquetDF = spark.read.parquet("cdrs.parquet") 7 | # Standard DataFrame data manipulation 8 | callDetailsParquetDF.createOrReplaceTempView("calldetails") 9 | topCallLocsDF = spark.sql("select Origin,Dest, count(*) as cnt from calldetails group by Origin,Dest order by cnt desc") 10 | -------------------------------------------------------------------------------- /Chapter04/ReadWriteParquet.r: -------------------------------------------------------------------------------- 1 | #Loading a JSON file as a DataFrame 2 | callDetailsDF <- read.df("/home/spark/sampledata/json/cdrs.json","json") 3 | #Writing the DataFrame out as a Parquet 4 | write.parquet(callDetailsDF,"cdrs.parquet") 5 | #Reading Parquet as a DataFrame 6 | callDetailsParquetDF <- read.parquet("cdrs.parquet") 7 | #Data Manipulation of Parquet Data 8 | createOrReplaceTempView(callDetailsParquetDF,"parquetFile") 9 | topCallLocsDF <- sql("select Origin,Dest, count(*) as cnt from calldetails group by Origin,Dest order by cnt desc") 10 | head(topCallLocsDF) 11 | -------------------------------------------------------------------------------- /Chapter07/ConnectedComponents.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.graphx._ 2 | val cdrGraph = GraphLoader.edgeListFile(sc,"/home/spark/sampledata/graphx/cdrs.txt") 3 | val connectedVertices = cdrGraph.connectedComponents().vertices 4 | val usersList = sc.textFile("/home/spark/sampledata/graphx/usernames.csv").map{line => 5 | val fields = line.split(",") 6 | (fields(0).trim().toLong, fields(1)) 7 | } 8 | val connectedComponentsByUsers = usersList.join(connectedVertices).map { 9 | case (id, (username, cc)) => (username, cc) 10 | } 11 | println(connectedComponentsByUsers.collect().mkString("\n")) 12 | -------------------------------------------------------------------------------- /Chapter07/BuildPageRank.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.graphx.{Graph, 
VertexRDD, GraphLoader} 2 | val cdrGraph = GraphLoader.edgeListFile(sc,"/home/spark/sampledata/graphx/cdrs.txt") 3 | val influencers = cdrGraph.pageRank(0.0001).vertices 4 | val usersList = sc.textFile("/home/spark/sampledata/graphx/usernames.csv").map{line => 5 | val fields = line.split(",") 6 | (fields(0).trim().toLong, fields(1)) 7 | } 8 | 9 | val ranksByUsername = usersList.join(influencers).map { 10 | case (id, (username, userRank)) => (username, userRank) 11 | } 12 | println(ranksByUsername.collect().mkString("\n")) 13 | 14 | -------------------------------------------------------------------------------- /Chapter04/SparkSQLHiveIntegration.r: -------------------------------------------------------------------------------- 1 | # Creating Spark Session with hive Support 2 | sparkR.session(enableHiveSupport=TRUE) 3 | 4 | # Creating a table to hold CDRs 5 | sql("CREATE TABLE IF NOT EXISTS cdrs(callingNumber STRING, calledNumber String, origin String, Dest String,CallDtTm String, callCharge Int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','") 6 | 7 | # Loading data 8 | sql("LOAD DATA LOCAL INPATH '/home/spark/sampledata/cdrs.csv' INTO table cdrs") 9 | 10 | # Finding top paired origin/destinations 11 | sql(" SELECT origin, dest, count(*) as cnt from cdrs group by origin, dest order by cnt desc LIMIT 5") 12 | -------------------------------------------------------------------------------- /Chapter04/ReadWriteParquet.java: -------------------------------------------------------------------------------- 1 | #Loading a JSON file as a DataSet of Row objects 2 | Dataset callDetailsDF = mySparkSession.read().json(fileName); 3 | 4 | #Writing a Parquet File 5 | callDetailsDF.write().parquet(parquetFileName); 6 | 7 | #Reading a Parquet file of Dataset of Row objects 8 | Dataset callDetailsParquetDF = mySparkSession.read().parquet(parquetFileName); 9 | 10 | #Parquet file data manipulation 11 | callDetailsParquetDF.createOrReplaceTempView("callDetails"); 12 | Dataset topLocDF = mySparkSession.sql("select Origin,Dest, count(*) as cnt from calldetails group by Origin,Dest order by cnt desc"); 13 | topLocDF.show(5); 14 | -------------------------------------------------------------------------------- /Chapter06/Example01.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.ml.classification.LogisticRegression 2 | import org.apache.spark.ml.linalg.{Vector,Vectors} 3 | import org.apache.spark.ml.param.ParamMap 4 | import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer} 5 | 6 | val textTokenizer = new Tokenizer() 7 | .setInputCol("corpus") 8 | .setOutputCol("tokenizedWords") 9 | /* HashingTF and CountVectorized can be used to generate term frequencies. HashingTF utilizes that hashing trick and is a very fast and space-efficient way of turning arbitrary features into a vector or a matrix. 
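Because the hashing trick maps terms into a fixed number of buckets (set via setNumFeatures below), unrelated terms can occasionally collide into the same index; CountVectorizer instead builds an explicit vocabulary while fitting, avoiding collisions at the cost of an extra pass over the data.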
10 | */ 11 | 12 | val hashingTermFrequency = new HashingTF() 13 | .setNumFeatures(1000) 14 | .setInputCol(tokenizer.getOutputCol) 15 | .setOutputCol("features") 16 | val logisticRegression = new LogisticRegression() 17 | .setMaxIter(10) 18 | .setRegParam(0.01) 19 | val pipeline = new Pipeline() 20 | .setStages(Array(tokenizer, hashingTermFrequency, logisticRegression)) 21 | val model = pipeline.fit(trainingDataset) 22 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2017 Packt 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Chapter06/BuildingPipeline.py: -------------------------------------------------------------------------------- 1 | from pyspark.ml import Pipeline 2 | from pyspark.ml.classification import LogisticRegression 3 | from pyspark.ml.feature import HashingTF, Tokenizer 4 | 5 | 6 | # Create a dataframe using labelled data set 7 | trainingDataSet = spark.createDataFrame([ 8 | (0, "ronaldo zidane goals score ball studs", 1.0), 9 | (1, "obama trump clintons whitehouse policy inflation", 0.0), 10 | (2, "corner penalty worldcup eurocup barcelona messie", 1.0), 11 | (3, "hadoop mapreduce spark goal pig hive", 0.0)], ["documentId", "corpus", "label"]) 12 | 13 | # Configure an ML pipeline, which consists of three stages: 14 | # texttokenization, hashingTF, and logisticRegressionmodel. 15 | textTokenizer = Tokenizer(inputCol="corpus", outputCol="words") 16 | hashingTF = HashingTF(inputCol=textTokenizer.getOutputCol(), outputCol="features") 17 | logisticRegressionModel = LogisticRegression(maxIter=30, regParam=0.01) 18 | pipeline = Pipeline(stages=[textTokenizer, hashingTF, logisticRegressionModel]) 19 | 20 | # Fit the pipeline to training documents. 21 | #Returns a model which can then be used with other data sets for prediction. 22 | 23 | model = pipeline.fit(trainingDataSet) 24 | 25 | # Create a dataset which contains unlabelled documents of data 26 | testDataSet = spark.createDataFrame([ 27 | (4, "corner ball goal score" ), 28 | (5, "sort hive optimzer columnar"), 29 | (6, "ronaldo messie eurocup"), 30 | (7, "database parquet orc avro")], ["documentId", "corpus"]) 31 | 32 | # Make predictions on test documents and print columns of interest from the predictions. 
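# transform() runs the unlabelled rows through the same Tokenizer and HashingTF stages and then the
# fitted logistic regression, appending rawPrediction, probability and prediction columns to the output DataFrame.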
33 | prediction = model.transform(testDataSet) 34 | selectedColumns = prediction.select("documentId", "corpus", "prediction", "probability") 35 | for eachRow in selectedColumns.collect(): 36 | print(eachRow) 37 | -------------------------------------------------------------------------------- /Chapter09/BuildRecommendationEngine.scala: -------------------------------------------------------------------------------- 1 | import org.apache.spark.ml.evaluation.RegressionEvaluator 2 | import org.apache.spark.ml.recommendation.ALS 3 | import org.apache.spark.sql._ 4 | 5 | case class Ratings(userId: Int, movieId: Int, rating: Double, ratingTs: Long) 6 | val ratingsSchema = Encoders.product[Ratings].schema 7 | case class Movies(moveId: Int, title: String, genre: String) 8 | val moviesSchema = Encoders.product[Movies].schema 9 | 10 | val ratings = spark.read.option("header","true") 11 | .schema(ratingsSchema) 12 | .csv("hdfs://sparkmaster:8020/user/hdfs/sampledata/ratings.csv") 13 | 14 | val movies = spark.read.option("header","true") 15 | .schema(moviesSchema) 16 | .csv("hdfs://sparkmaster:8020/user/hdfs/sampledata/movies.csv") 17 | 18 | val Array(train, test) = ratings.randomSplit(Array(0.7, 0.3)) 19 | 20 | val als = new ALS() 21 | .setMaxIter(15) 22 | .setRegParam(0.01) 23 | .setUserCol("userId") 24 | .setItemCol("movieId") 25 | .setRatingCol("rating") 26 | 27 | val recommendationModel = als.fit(train) 28 | 29 | val predictions = recommendationModel.transform(test) 30 | val ranks = List(1,2,3,4,5,6,7,8,9,10) 31 | val lambdas = List(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.8,1,2,3,4,5,6,10.0) 32 | val regParams = List(0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.8,0.10,10) 33 | val numIters = List(5,10,15,20) 34 | var bestModel: Option[ALSModel] = None 35 | var optimalRMSE = Double.MaxValue 36 | var bestRank = 0 37 | var bestRegParam = -1.0 38 | var bestNumIter = -1 39 | 40 | /* 41 | * Iterative Computation - Find best Model 42 | */ 43 | for (rank <- ranks; regParam <- regParams; numIter <- numIters) { 44 | val als = new ALS().setMaxIter(numIter).setRank(rank).setRegParam(regParam).setUserCol("userId").setItemCol("movieId").setRatingCol("rating") 45 | val model = als.fit(train) 46 | val predictions = model.transform(valid) 47 | val currentRMSE = evaluator.evaluate(predictions.filter("prediction <> 'NaN'")) 48 | println("Metrics => RMSE (Validation) = " + currentRMSE + " : Model Metrics(rank = "+ rank + ", regParam = " + regParam + ", and numIter = " + numIter + ").") 49 | if (currentRMSE < optimalRMSE) { 50 | bestModel = Some(model) 51 | optimalRMSE = currentRMSE 52 | bestRank = rank 53 | bestRegParam = regParam 54 | bestNumIter = numIter 55 | } 56 | } 57 | 58 | 59 | val als = new ALS() 60 | .setMaxIter(15) 61 | .setRegParam(0.01) 62 | .setImplicitPrefs(true) 63 | .setUserCol("userId") 64 | .setItemCol("movieId") 65 | 66 | -------------------------------------------------------------------------------- /Chapter04/RDDConversion.java: -------------------------------------------------------------------------------- 1 | package org.packtpub; 2 | 3 | import java.io.Serializable; 4 | import java.util.ArrayList; 5 | import java.util.Arrays; 6 | import java.util.Iterator; 7 | import java.util.List; 8 | 9 | import org.apache.hadoop.io.Text; 10 | import org.apache.hadoop.mapred.SequenceFileOutputFormat; 11 | import org.apache.spark.SparkConf; 12 | import org.apache.spark.SparkContext; 13 | import org.apache.spark.api.java.JavaPairRDD; 14 | import org.apache.spark.api.java.JavaRDD; 15 | import 
org.apache.spark.api.java.JavaSparkContext; 16 | import org.apache.spark.api.java.function.FilterFunction; 17 | import org.apache.spark.api.java.function.FlatMapFunction; 18 | import org.apache.spark.api.java.function.Function; 19 | import org.apache.spark.api.java.function.Function2; 20 | import org.apache.spark.api.java.function.PairFunction; 21 | import org.apache.spark.sql.*; 22 | 23 | import static org.apache.spark.sql.functions.col; 24 | 25 | import org.apache.spark.sql.functions; 26 | 27 | import scala.Tuple2; 28 | import scala.collection.Iterable; 29 | 30 | import org.apache.spark.sql.Dataset; 31 | import org.apache.spark.sql.Row; 32 | import org.apache.spark.sql.SparkSession; 33 | 34 | 35 | public class RDDConversion 36 | { 37 | 38 | public static void main( String[] args ) 39 | { 40 | RDDConversion app = new RDDConversion(); 41 | System.setProperty("hadoop.home.dir", "C:/spark/spark-2.0.0/"); 42 | String sparkWarehouseDir = "/home/spark/spark-warehouse"; 43 | String fileName = args[0]; 44 | 45 | SparkConf conf = new SparkConf().setAppName("RDDConversion").setMaster("local[*]"); 46 | JavaSparkContext sc = new JavaSparkContext(conf); 47 | 48 | SparkSession mySparkSession = SparkSession.builder() 49 | .master("local") 50 | .appName("Java Spark-SQL Hive Integration ") 51 | .enableHiveSupport() 52 | .config("spark.sql.warehouse.dir", sparkWarehouseDir) 53 | .getOrCreate(); 54 | 55 | JavaRDD dataFile = sc.textFile(fileName); 56 | 57 | JavaRDD cdr = dataFile.map(new Function(){ 58 | public CallDetailRecord call(String line) throws Exception{ 59 | String[] parts = line.split(","); 60 | CallDetailRecord cdr = new CallDetailRecord(); 61 | cdr.setOriginNumber(parts[0]); 62 | cdr.setTermNumber(parts[1]); 63 | cdr.setOrigin(parts[2]); 64 | cdr.setTermDest(parts[3]); 65 | cdr.setDateTime(parts[4]); 66 | cdr.setCallCharges(Long.parseLong(parts[5])); 67 | return cdr; 68 | } 69 | }); 70 | 71 | 72 | Dataset cdrDataFrame = mySparkSession.createDataFrame(cdr, CallDetailRecord.class); 73 | cdrDataFrame.show(); 74 | 75 | 76 | } 77 | 78 | 79 | 80 | } 81 | 82 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | 4 | # Learning Apache Spark 2 5 | This is the code repository for [Learning Apache Spark 2](https://www.packtpub.com/big-data-and-business-intelligence/learning-apache-spark-2?utm_source=github&utm_medium=repository&utm_campaign=9781785885136), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish. 6 | 7 | ## About the Book 8 | Spark juggernaut keeps on rolling and getting more and more momentum each day. Spark provides key capabilities in the form of Spark SQL, Spark Streaming, Spark ML and Graph X all accessible via Java, Scala, Python and R. Deploying the key capabilities is crucial whether it is on a Standalone framework or as a part of existing Hadoop installation and configuring with Yarn and Mesos. 9 | 10 | The next part of the journey after installation is using key components, APIs, Clustering, machine learning APIs, data pipelines, parallel programming. It is important to understand why each framework component is key, how widely it is being used, its stability and pertinent use cases. 11 | 12 | 13 | ## Instructions and Navigation 14 | All of the code is organized into folders. 
Each folder starts with a number followed by the application name. For example, Chapter02. 15 | 16 | Code bundle contains a DataSet folder for sample data used. 17 | 18 | The code will look like the following: 19 | 20 | When we wish to draw your attention to a particular part of a code block, the relevant lines 21 | or items are set in bold: 22 | ``` 23 | [default] 24 | exten => s,1,Dial(Zap/1|30) 25 | exten => s,2,Voicemail(u100) 26 | exten => s,102,Voicemail(b100) 27 | exten => i,1,Voicemail(s0) 28 | ``` 29 | 30 | You will need Spark 2.0, which you can download from Apache Spark website. We have used few different configurations, but you can essentially run most of these examples inside a virtual machine with 4-8GB of RAM, and 10 GB of available disk space. 31 | 32 | ## Related Products 33 | * [Mastering Apache Spark 2.0 - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/mastering-apache-spark-20-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781786462749) 34 | 35 | * [Apache Spark 2 for Beginners](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-2-beginners?utm_source=github&utm_medium=repository&utm_campaign=9781785885006) 36 | 37 | * [Apache Spark Machine Learning Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-machine-learning-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781783551606) 38 | 39 | ### Suggestions and Feedback 40 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions. 41 | ### Download a free PDF 42 | 43 | If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.
Simply click on the link to claim your free PDF.
44 | https://packt.link/free-ebook/9781785885136
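### Running the examples
Most of the Scala and Python snippets in the chapter folders are written against an interactive `spark-shell` or `pyspark` session, where the `sc` (SparkContext) and `spark` (SparkSession) variables are already created for you. If you want to run a snippet as a standalone application instead, you create them yourself; a minimal sketch (the application name below is a placeholder to adjust for your setup):
```
import org.apache.spark.sql.SparkSession

// Create a local SparkSession; "local[*]" uses every core on the machine.
val spark = SparkSession.builder()
  .appName("LearningSpark2Examples")  // placeholder name
  .master("local[*]")
  .getOrCreate()

// The RDD-based examples (Chapter02, Chapter03) use the SparkContext directly.
val sc = spark.sparkContext
```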

-------------------------------------------------------------------------------- /DataSets/cdrs.json: -------------------------------------------------------------------------------- 1 | {"OriginatingNum": 797308107, "TerminatingNum": 797131221, "Origin": "London", "Dest": "Birmingham","DateTime": "02/11/2016 01:51:41", "CallCharge": 549} 2 | {"OriginatingNum": 777121117, "TerminatingNum": 777440392, "Origin": "Manchester","Dest": "London", "DateTime": "05/02/2016 01:26:54", "CallCharge": 2645} 3 | {"OriginatingNum": 797009202, "TerminatingNum": 784243404, "Origin": "Victoria", "Dest": "Manchester","DateTime": "01/12/2016 21:12:54","CallCharge": 1233} 4 | { "OriginatingNum": 777557705, "TerminatingNum": 798420467, "Origin": "Twickenham", "Dest": "Victoria", "DateTime": "07/11/2016 01:07:34", "CallCharge": 2651} 5 | {"OriginatingNum": 785434022, "TerminatingNum": 779086250, "Origin": "Leeds", "Dest": "Scotland", "DateTime": "02/11/2016 22:22:26", "CallCharge": 3162} 6 | {"OriginatingNum": 779716202, "TerminatingNum": 795137353, "Origin": "Bradford", "Dest": "Virginia Water", "DateTime": "05/01/2016 20:12:35", "CallCharge": 2246} 7 | {"OriginatingNum": 775490102, "TerminatingNum": 775019605, "Origin": "Yorkshire", "Dest": "Ascot", "DateTime": "04/12/2016 23:53:52", "CallCharge": 571} 8 | {"OriginatingNum": 787581376, "TerminatingNum": 797043387, "Origin": "Birmingham", "Dest": "Bracknell", "DateTime": "06/11/2016 20:31:49", "CallCharge": 3291} 9 | {"OriginatingNum": 789231956, "TerminatingNum": 787649491, "Origin": "Coventary", "Dest": "Bradford", "DateTime": "03/12/2016 12:15:17", "CallCharge": 2270} 10 | {"OriginatingNum": 785969980, "TerminatingNum": 789993090, "Origin": "Wales", "Dest": "Yorkshire", "DateTime": "06/02/2016 20:57:44", "CallCharge": 3420} 11 | {"OriginatingNum": 797662091, "TerminatingNum": 777765510, "Origin": "Scotland", "Dest": "Birmingham", "DateTime": "02/01/2016 02:44:27", "CallCharge": 3084} 12 | {"OriginatingNum": 784036802, "TerminatingNum": 798095485, "Origin": "Virginia Water", "Dest": "Marlow", "DateTime": "09/01/2016 00:48:43", "CallCharge": 3037} 13 | {"OriginatingNum": 785160169, "TerminatingNum": 797922170, "Origin": "Ascot", "Dest": "Sunningdale", "DateTime": "08/11/2016 20:19:19", "CallCharge": 3011} 14 | {"OriginatingNum": 789519210, "TerminatingNum": 774080821, "Origin": "Bracknell", "Dest": "Lords", "DateTime": "05/01/2016 11:24:28", "CallCharge": 1018} 15 | {"OriginatingNum": 775617249, "TerminatingNum": 786549418, "Origin": "Marlow", "Dest": "Oval", "DateTime": "02/12/2016 02:07:09", "CallCharge": 771} 16 | {"OriginatingNum": 797932062, "TerminatingNum": 788292522, "Origin": "Sunningdale", "Dest": "Coventary", "DateTime": "07/11/2016 03:43:23", "CallCharge": 3585} 17 | {"OriginatingNum": 777561966, "TerminatingNum": 788455450, "Origin": "Lords", "Dest": "Wales", "DateTime": "06/01/2016 23:08:06", "CallCharge": 908} 18 | {"OriginatingNum": 777508024, "TerminatingNum": 789954417, "Origin": "Oval", "Dest": "Scotland", "DateTime": "04/12/2016 24:17:54", "CallCharge": 95} 19 | {"OriginatingNum": 777087537, "TerminatingNum": 778710691, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "03/11/2016 00:45:24", "CallCharge": 2754} 20 | {"OriginatingNum": 774688108, "TerminatingNum": 797626213, "Origin": "London", "Dest": "Coventary", "DateTime": "03/01/2016 03:11:03", "CallCharge": 1327} 21 | {"OriginatingNum": 778449580, "TerminatingNum": 778385762, "Origin": "Manchester", "Dest": "Wales", "DateTime": "04/02/2016 14:59:06", "CallCharge": 3264} 22 | 
{"OriginatingNum": 788790859, "TerminatingNum": 776121867, "Origin": "Victoria", "Dest": "Scotland", "DateTime": "09/12/2016 11:05:23", "CallCharge": 1608} 23 | {"OriginatingNum": 785376620, "TerminatingNum": 798020898, "Origin": "Scotland", "Dest": "Virginia Water", "DateTime": "03/02/2016 04:31:45", "CallCharge": 77} 24 | {"OriginatingNum": 774388678, "TerminatingNum": 786552782, "Origin": "Virginia Water", "Dest": "Ascot", "DateTime": "02/01/2016 02:26:31", "CallCharge": 1757} 25 | {"OriginatingNum": 796640229, "TerminatingNum": 786558349, "Origin": "Ascot", "Dest": "Bracknell", "DateTime": "04/11/2016 01:01:39", "CallCharge": 3421} 26 | {"OriginatingNum": 776397451, "TerminatingNum": 777278274, "Origin": "Bracknell", "Dest": "Leeds", "DateTime": "05/02/2016 22:07:21", "CallCharge": 2922} 27 | {"OriginatingNum": 787426686, "TerminatingNum": 774001818, "Origin": "Bradford", "Dest": "Bradford", "DateTime": "08/02/2016 01:31:53", "CallCharge": 996} 28 | {"OriginatingNum": 774853589, "TerminatingNum": 778226530, "Origin": "Yorkshire", "Dest": "Yorkshire", "DateTime": "07/11/2016 01:15:44", "CallCharge": 2229} 29 | {"OriginatingNum": 798516272, "TerminatingNum": 798192751, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "06/01/2016 20:47:23", "CallCharge": 2314} 30 | {"OriginatingNum": 794956011, "TerminatingNum": 798595444, "Origin": "Marlow", "Dest": "Coventary", "DateTime": "09/01/2016 10:32:12", "CallCharge": 1137} 31 | {"OriginatingNum": 788499476, "TerminatingNum": 799514066, "Origin": "Sunningdale", "Dest": "Wales", "DateTime": "09/01/2016 14:13:41", "CallCharge": 1538} 32 | {"OriginatingNum": 778956877, "TerminatingNum": 787972481, "Origin": "Lords", "Dest": "Virginia Water", "DateTime": "02/01/2016 12:13:38", "CallCharge": 75} 33 | {"OriginatingNum": 784133953, "TerminatingNum": 777082964, "Origin": "Oval", "Dest": "Ascot", "DateTime": "02/01/2016 01:04:08", "CallCharge": 1415} 34 | {"OriginatingNum": 787077525, "TerminatingNum": 789876379, "Origin": "Coventary", "Dest": "Bracknell", "DateTime": "06/11/2016 22:57:41", "CallCharge": 1061} 35 | {"OriginatingNum": 784627303, "TerminatingNum": 776663366, "Origin": "Wales", "Dest": "Bradford", "DateTime": "04/01/2016 11:29:33", "CallCharge": 2291} 36 | {"OriginatingNum": 774188291, "TerminatingNum": 794732083, "Origin": "Scotland", "Dest": "Yorkshire", "DateTime": "08/12/2016 12:37:41", "CallCharge": 3391} 37 | {"OriginatingNum": 784126576, "TerminatingNum": 787520608, "Origin": "Birmingham", "Dest": "London", "DateTime": "05/01/2016 23:57:59", "CallCharge": 1740} 38 | {"OriginatingNum": 775584064, "TerminatingNum": 795017614, "Origin": "London", "Dest": "Manchester", "DateTime": "04/01/2016 01:19:28", "CallCharge": 2940} 39 | {"OriginatingNum": 774279853, "TerminatingNum": 787470510, "Origin": "Manchester", "Dest": "Victoria", "DateTime": "09/02/2016 02:47:23", "CallCharge": 283} 40 | {"OriginatingNum": 776629283, "TerminatingNum": 784050637, "Origin": "Victoria", "Dest": "Twickenham", "DateTime": "06/01/2016 23:09:52", "CallCharge": 426} 41 | {"OriginatingNum": 796071020, "TerminatingNum": 796928746, "Origin": "Scotland", "Dest": "Leeds", "DateTime": "02/01/2016 20:15:52", "CallCharge": 1300} 42 | {"OriginatingNum": 778529801, "TerminatingNum": 799123703, "Origin": "Virginia Water", "Dest": "Bradford", "DateTime": "03/01/2016 21:23:05", "CallCharge": 85} 43 | {"OriginatingNum": 779318091, "TerminatingNum": 777545543, "Origin": "Ascot", "Dest": "Yorkshire", "DateTime": "09/12/2016 20:38:39", "CallCharge": 2198} 44 | 
{"OriginatingNum": 779785134, "TerminatingNum": 796559835, "Origin": "Bracknell", "Dest": "Birmingham", "DateTime": "06/01/2016 20:55:09", "CallCharge": 2551} 45 | {"OriginatingNum": 777388057, "TerminatingNum": 796373853, "Origin": "Bradford", "Dest": "Coventary", "DateTime": "03/02/2016 03:27:54", "CallCharge": 2424} 46 | {"OriginatingNum": 784410639, "TerminatingNum": 785309669, "Origin": "Yorkshire", "Dest": "Wales", "DateTime": "07/02/2016 23:44:17", "CallCharge": 652} 47 | {"OriginatingNum": 779039353, "TerminatingNum": 788576202, "Origin": "Birmingham", "Dest": "Scotland", "DateTime": "07/02/2016 04:49:12", "CallCharge": 534} 48 | {"OriginatingNum": 774918134, "TerminatingNum": 784624246, "Origin": "Marlow", "Dest": "Virginia Water", "DateTime": "08/01/2016 03:49:48", "CallCharge": 3469} 49 | {"OriginatingNum": 785848242, "TerminatingNum": 795801932, "Origin": "Sunningdale", "Dest": "Ascot", "DateTime": "09/11/2016 12:53:14", "CallCharge": 2627} 50 | {"OriginatingNum": 785339539, "TerminatingNum": 776854945, "Origin": "Lords", "Dest": "Bracknell", "DateTime": "02/02/2016 00:35:53", "CallCharge": 3204} 51 | {"OriginatingNum": 777047486, "TerminatingNum": 786699071, "Origin": "Oval", "Dest": "Marlow", "DateTime": "03/12/2016 11:52:09", "CallCharge": 2367} 52 | {"OriginatingNum": 788971265, "TerminatingNum": 785113136, "Origin": "Coventary", "Dest": "Sunningdale", "DateTime": "03/02/2016 00:59:44", "CallCharge": 3040} 53 | {"OriginatingNum": 788706239, "TerminatingNum": 799121170, "Origin": "Wales", "Dest": "Lords", "DateTime": "09/01/2016 20:52:48", "CallCharge": 3052} 54 | {"OriginatingNum": 784930367, "TerminatingNum": 799779480, "Origin": "Scotland", "Dest": "Oval", "DateTime": "07/02/2016 14:24:19", "CallCharge": 433} 55 | {"OriginatingNum": 786736111, "TerminatingNum": 778668124, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "07/11/2016 23:49:07", "CallCharge": 2861} 56 | {"OriginatingNum": 774996036, "TerminatingNum": 776453220, "Origin": "Coventary", "Dest": "London", "DateTime": "07/11/2016 20:21:48", "CallCharge": 1896} 57 | {"OriginatingNum": 798738693, "TerminatingNum": 794306028, "Origin": "Wales", "Dest": "Manchester", "DateTime": "03/11/2016 23:27:33", "CallCharge": 52} 58 | {"OriginatingNum": 776797164, "TerminatingNum": 797585202, "Origin": "Scotland", "Dest": "Victoria", "DateTime": "04/01/2016 14:05:43", "CallCharge": 2302} 59 | {"OriginatingNum": 778899643, "TerminatingNum": 779991962, "Origin": "Virginia Water", "Dest": "Scotland", "DateTime": "07/02/2016 23:53:15", "CallCharge": 904} 60 | {"OriginatingNum": 795342792, "TerminatingNum": 789582658, "Origin": "Ascot", "Dest": "Virginia Water", "DateTime": "03/01/2016 10:25:34", "CallCharge": 125} 61 | {"OriginatingNum": 786463203, "TerminatingNum": 787079843, "Origin": "Bracknell", "Dest": "Ascot", "DateTime": "04/12/2016 11:26:52", "CallCharge": 881} 62 | {"OriginatingNum": 775479982, "TerminatingNum": 787185946, "Origin": "Leeds", "Dest": "Bracknell", "DateTime": "04/02/2016 24:13:49", "CallCharge": 1785} 63 | {"OriginatingNum": 795407096, "TerminatingNum": 794127828, "Origin": "Bradford", "Dest": "Bradford", "DateTime": "04/01/2016 01:02:41", "CallCharge": 2348} 64 | {"OriginatingNum": 799950372, "TerminatingNum": 779479868, "Origin": "Yorkshire", "Dest": "Yorkshire", "DateTime": "06/02/2016 03:06:32", "CallCharge": 2330} 65 | {"OriginatingNum": 779443671, "TerminatingNum": 789039212, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "04/11/2016 12:28:44", "CallCharge": 971} 66 | 
{"OriginatingNum": 776078153, "TerminatingNum": 777623079, "Origin": "Coventary", "Dest": "Marlow", "DateTime": "04/01/2016 11:05:49", "CallCharge": 222} 67 | {"OriginatingNum": 778439584, "TerminatingNum": 794809988, "Origin": "Wales", "Dest": "Sunningdale", "DateTime": "09/02/2016 02:06:07", "CallCharge": 732} 68 | {"OriginatingNum": 776239910, "TerminatingNum": 779831334, "Origin": "Virginia Water", "Dest": "Lords", "DateTime": "06/12/2016 03:12:04", "CallCharge": 2807} 69 | {"OriginatingNum": 788661014, "TerminatingNum": 787991820, "Origin": "Ascot", "Dest": "Oval", "DateTime": "01/02/2016 03:07:33", "CallCharge": 619} 70 | {"OriginatingNum": 774298657, "TerminatingNum": 786941620, "Origin": "Bracknell", "Dest": "Coventary", "DateTime": "06/01/2016 20:17:59", "CallCharge": 3596} 71 | {"OriginatingNum": 796628071, "TerminatingNum": 785604207, "Origin": "Bradford", "Dest": "Wales", "DateTime": "01/12/2016 01:58:37", "CallCharge": 972} 72 | {"OriginatingNum": 777994634, "TerminatingNum": 774850412, "Origin": "Yorkshire", "Dest": "Scotland", "DateTime": "05/12/2016 11:22:44", "CallCharge": 3231} 73 | {"OriginatingNum": 785903192, "TerminatingNum": 776226916, "Origin": "Birmingham", "Dest": "Ascot", "DateTime": "05/11/2016 04:38:07", "CallCharge": 2074} 74 | {"OriginatingNum": 778324460, "TerminatingNum": 785392423, "Origin": "Coventary", "Dest": "Bracknell", "DateTime": "09/12/2016 22:26:41", "CallCharge": 2159} 75 | {"OriginatingNum": 774524318, "TerminatingNum": 779608295, "Origin": "Wales", "Dest": "Bradford", "DateTime": "02/01/2016 23:42:22", "CallCharge": 2417} 76 | {"OriginatingNum": 794183103, "TerminatingNum": 786955937, "Origin": "Virginia Water", "Dest": "Yorkshire", "DateTime": "08/01/2016 12:48:19", "CallCharge": 1083} 77 | {"OriginatingNum": 787471976, "TerminatingNum": 787033256, "Origin": "Ascot", "Dest": "Birmingham", "DateTime": "08/02/2016 00:48:52", "CallCharge": 8} 78 | {"OriginatingNum": 777026835, "TerminatingNum": 788186797, "Origin": "Bracknell", "Dest": "Marlow", "DateTime": "06/12/2016 10:25:33", "CallCharge": 3461} 79 | {"OriginatingNum": 777189678, "TerminatingNum": 785852073, "Origin": "Bradford", "Dest": "Sunningdale", "DateTime": "08/02/2016 02:57:09", "CallCharge": 3189} 80 | {"OriginatingNum": 779246405, "TerminatingNum": 794067417, "Origin": "Yorkshire", "Dest": "Lords", "DateTime": "02/11/2016 02:13:49", "CallCharge": 8} 81 | {"OriginatingNum": 794018876, "TerminatingNum": 776154503, "Origin": "London", "Dest": "Oval", "DateTime": "03/11/2016 01:41:37", "CallCharge": 1780} 82 | {"OriginatingNum": 788662914, "TerminatingNum": 796324299, "Origin": "Manchester", "Dest": "Coventary", "DateTime": "05/11/2016 10:19:36", "CallCharge": 2828} 83 | {"OriginatingNum": 794395044, "TerminatingNum": 776172226, "Origin": "Victoria", "Dest": "Wales", "DateTime": "01/11/2016 12:33:57", "CallCharge": 2600} 84 | {"OriginatingNum": 794810223, "TerminatingNum": 776407350, "Origin": "Twickenham", "Dest": "Scotland", "DateTime": "03/11/2016 00:57:04", "CallCharge": 2633} 85 | {"OriginatingNum": 794746359, "TerminatingNum": 784890101, "Origin": "Leeds", "Dest": "Birmingham", "DateTime": "06/12/2016 12:32:58", "CallCharge": 2452} 86 | {"OriginatingNum": 799339230, "TerminatingNum": 798748393, "Origin": "Bradford", "Dest": "Coventary", "DateTime": "05/11/2016 00:34:24", "CallCharge": 1049} 87 | {"OriginatingNum": 789140678, "TerminatingNum": 796878575, "Origin": "Yorkshire", "Dest": "Wales", "DateTime": "06/02/2016 01:02:22", "CallCharge": 1466} 88 | {"OriginatingNum": 778688915, 
"TerminatingNum": 779779543, "Origin": "Birmingham", "Dest": "Scotland", "DateTime": "06/11/2016 22:55:07", "CallCharge": 2973} 89 | {"OriginatingNum": 784353531, "TerminatingNum": 788414958, "Origin": "Coventary", "Dest": "Virginia Water", "DateTime": "06/02/2016 03:28:04", "CallCharge": 2159} 90 | {"OriginatingNum": 778998492, "TerminatingNum": 798635474, "Origin": "Wales", "Dest": "Ascot", "DateTime": "01/11/2016 24:31:49", "CallCharge": 2385} 91 | {"OriginatingNum": 795058836, "TerminatingNum": 774990397, "Origin": "Scotland", "Dest": "Bracknell", "DateTime": "06/12/2016 04:59:11", "CallCharge": 62} 92 | {"OriginatingNum": 779353189, "TerminatingNum": 796636314, "Origin": "Virginia Water", "Dest": "Leeds", "DateTime": "08/12/2016 10:12:28", "CallCharge": 3405} 93 | {"OriginatingNum": 788495142, "TerminatingNum": 795064948, "Origin": "Ascot", "Dest": "Bradford", "DateTime": "05/12/2016 01:37:09", "CallCharge": 2686} 94 | {"OriginatingNum": 788569039, "TerminatingNum": 788719136, "Origin": "Bracknell", "Dest": "Yorkshire", "DateTime": "09/02/2016 21:01:12", "CallCharge": 2319} 95 | {"OriginatingNum": 775435510, "TerminatingNum": 788161474, "Origin": "Marlow", "Dest": "Birmingham", "DateTime": "09/01/2016 21:09:09", "CallCharge": 2180} 96 | {"OriginatingNum": 776019794, "TerminatingNum": 776909199, "Origin": "Sunningdale", "Dest": "Coventary", "DateTime": "09/12/2016 12:16:59", "CallCharge": 3240} 97 | {"OriginatingNum": 794751801, "TerminatingNum": 774122416, "Origin": "Lords", "Dest": "Wales", "DateTime": "07/01/2016 24:28:39", "CallCharge": 1011} 98 | {"OriginatingNum": 798526356, "TerminatingNum": 784989061, "Origin": "Oval", "Dest": "Virginia Water", "DateTime": "09/11/2016 10:43:55", "CallCharge": 1771} 99 | {"OriginatingNum": 778894206, "TerminatingNum": 775901576, "Origin": "Birmingham", "Dest": "Ascot", "DateTime": "03/01/2016 23:14:55", "CallCharge": 951} 100 | {"OriginatingNum": 779461846, "TerminatingNum": 776103392, "Origin": "London", "Dest": "Bracknell", "DateTime": "08/02/2016 14:47:38", "CallCharge": 420 } -------------------------------------------------------------------------------- /Chapter10/ChurnPrediction.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "collapsed": false 7 | }, 8 | "source": [ 9 | "CHURN PREDICTION - TELECOM DATA SET\n", 10 | "====================================" 11 | ] 12 | }, 13 | { 14 | "cell_type": "markdown", 15 | "metadata": { 16 | "collapsed": false 17 | }, 18 | "source": [ 19 | "We are going to Load the churners data. 
The data is available at the following location.\n", 20 | "https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv " 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 1, 26 | "metadata": { 27 | "collapsed": true 28 | }, 29 | "outputs": [], 30 | "source": [ 31 | "from pyspark.sql import SparkSession\n", 32 | "from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, RandomForestClassifier\n", 33 | "from pyspark.ml import Pipeline\n", 34 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n", 35 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 36 | "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n", 37 | "import time" 38 | ] 39 | }, 40 | { 41 | "cell_type": "markdown", 42 | "metadata": {}, 43 | "source": [ 44 | "If you look at the column names the are not the best names, as their is no consistency. \n", 45 | "The entire data set has been loaded as String and in addition to that the names have spaces too. We need to make sure\n", 46 | "we can define a proper schema for this dataset." 47 | ] 48 | }, 49 | { 50 | "cell_type": "code", 51 | "execution_count": 2, 52 | "metadata": { 53 | "collapsed": false 54 | }, 55 | "outputs": [], 56 | "source": [ 57 | "from pyspark.sql.types import *\n", 58 | "schemaString = \"STATE,ACCOUNTLENGTH,AREACODE,PHONE,INTLPLAN,VMAILPLAN,VMAILMESSAGE,DAYMINS,DAYCALLS,DAYCHARGE,EVEMINS,EVECALLS,EVECHARGE,NIGHTMINS,NIGHTCALLS,NIGHTCHARGE,INTLMINS,INTLCALLS,INTLCHARGE,CUSTSERVCALLS,CHURN\"\n", 59 | "fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(\",\")]\n", 60 | "churnSchema = StructType(fields)" 61 | ] 62 | }, 63 | { 64 | "cell_type": "code", 65 | "execution_count": 4, 66 | "metadata": { 67 | "collapsed": false 68 | }, 69 | "outputs": [ 70 | { 71 | "ename": "Py4JJavaError", 72 | "evalue": "An error occurred while calling o77.csv.\n: java.net.ConnectException: Call From sparkmaster.demo.com/10.37.101.3 to sparkmaster:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see: http://wiki.apache.org/hadoop/ConnectionRefused\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)\n\tat sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.lang.reflect.Constructor.newInstance(Constructor.java:526)\n\tat org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:783)\n\tat org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:730)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1351)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1300)\n\tat org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)\n\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat 
org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:651)\n\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1679)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1106)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1397)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:389)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:379)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.foreach(List.scala:381)\n\tat scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.flatMap(List.scala:344)\n\tat org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:379)\n\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)\n\tat org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:413)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:745)\nCaused by: java.net.ConnectException: Connection refused\n\tat sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)\n\tat sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)\n\tat org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:206)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:529)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:493)\n\tat org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:547)\n\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:642)\n\tat org.apache.hadoop.ipc.Client$Connection.access$2600(Client.java:314)\n\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1399)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1318)\n\t... 
38 more\n", 73 | "output_type": "error", 74 | "traceback": [ 75 | "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", 76 | "\u001b[1;31mPy4JJavaError\u001b[0m Traceback (most recent call last)", 77 | "\u001b[1;32m\u001b[0m in \u001b[0;36m\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mchurnDataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moption\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"header\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m\"true\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mschema\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchurnSchema\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"hdfs://sparkmaster:8020/user/hdfs/sampledata/churn.csv\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 2\u001b[0m \u001b[0mcols\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mchurnDataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 78 | "\u001b[1;32m/root/spark/spark-2.0.2/python/pyspark/sql/readwriter.py\u001b[0m in \u001b[0;36mcsv\u001b[1;34m(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode)\u001b[0m\n\u001b[0;32m 375\u001b[0m \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 376\u001b[0m \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 377\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_df\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jreader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_spark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jvm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPythonUtils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoSeq\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 378\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 379\u001b[0m \u001b[1;33m@\u001b[0m\u001b[0msince\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1.5\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 79 | "\u001b[1;32m/root/spark/spark-2.0.2/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m 1131\u001b[0m \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1132\u001b[0m return_value = get_return_value(\n\u001b[1;32m-> 1133\u001b[1;33m answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[0;32m 1134\u001b[0m 
\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 1135\u001b[0m \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 80 | "\u001b[1;32m/root/spark/spark-2.0.2/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m 61\u001b[0m \u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 62\u001b[0m \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m 64\u001b[0m \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 65\u001b[0m \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n", 81 | "\u001b[1;32m/root/spark/spark-2.0.2/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m 317\u001b[0m raise Py4JJavaError(\n\u001b[0;32m 318\u001b[0m \u001b[1;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 319\u001b[1;33m format(target_id, \".\", name), value)\n\u001b[0m\u001b[0;32m 320\u001b[0m \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m 321\u001b[0m raise Py4JError(\n", 82 | "\u001b[1;31mPy4JJavaError\u001b[0m: An error occurred while calling o77.csv.\n: java.net.ConnectException: Call From sparkmaster.demo.com/10.37.101.3 to sparkmaster:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see: http://wiki.apache.org/hadoop/ConnectionRefused\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)\n\tat sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.lang.reflect.Constructor.newInstance(Constructor.java:526)\n\tat org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:783)\n\tat org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:730)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1351)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1300)\n\tat org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat 
org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)\n\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:651)\n\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1679)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1106)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1397)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:389)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:379)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.foreach(List.scala:381)\n\tat scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.flatMap(List.scala:344)\n\tat org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:379)\n\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)\n\tat org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:413)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:745)\nCaused by: java.net.ConnectException: Connection refused\n\tat sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)\n\tat sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)\n\tat org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:206)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:529)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:493)\n\tat org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:547)\n\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:642)\n\tat org.apache.hadoop.ipc.Client$Connection.access$2600(Client.java:314)\n\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1399)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1318)\n\t... 
38 more\n" 83 | ] 84 | } 85 | ], 86 | "source": [ 87 | "churnDataset = spark.read.option(\"header\",\"true\").schema(churnSchema).csv(\"hdfs://sparkmaster:8020/user/hdfs/sampledata/churn.csv\")\n", 88 | "cols=churnDataset.columns" 89 | ] 90 | }, 91 | { 92 | "cell_type": "code", 93 | "execution_count": null, 94 | "metadata": { 95 | "collapsed": false 96 | }, 97 | "outputs": [], 98 | "source": [ 99 | "churnDataset.schema" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "churnDataset.createOrReplaceTempView(\"churn_tab\")\n", 111 | "spark.sql(\"select max(daymins), min(daymins) from churn_tab\").show()" 112 | ] 113 | }, 114 | { 115 | "cell_type": "code", 116 | "execution_count": 4, 117 | "metadata": { 118 | "collapsed": false 119 | }, 120 | "outputs": [], 121 | "source": [ 122 | "churnDataset = churnDataset.withColumn(\"ACCOUNTLENGTH\", churnDataset[\"ACCOUNTLENGTH\"].cast(\"double\"))\n", 123 | "churnDataset = churnDataset.withColumn(\"AREACODE\", churnDataset[\"AREACODE\"].cast(\"double\"))\n", 124 | "churnDataset = churnDataset.withColumn(\"VMAILMESSAGE\", churnDataset[\"VMAILMESSAGE\"].cast(\"double\"))\n", 125 | "churnDataset = churnDataset.withColumn(\"DAYMINS\", churnDataset[\"DAYMINS\"].cast(\"double\"))\n", 126 | "churnDataset = churnDataset.withColumn(\"DAYMINS\", churnDataset[\"DAYMINS\"].cast(\"double\"))\n", 127 | "churnDataset = churnDataset.withColumn(\"DAYCALLS\", churnDataset[\"DAYCALLS\"].cast(\"double\"))\n", 128 | "churnDataset = churnDataset.withColumn(\"DAYCHARGE\", churnDataset[\"DAYCHARGE\"].cast(\"double\"))\n", 129 | "churnDataset = churnDataset.withColumn(\"EVEMINS\", churnDataset[\"EVEMINS\"].cast(\"double\"))\n", 130 | "churnDataset = churnDataset.withColumn(\"EVECALLS\", churnDataset[\"EVECALLS\"].cast(\"double\"))\n", 131 | "churnDataset = churnDataset.withColumn(\"EVECHARGE\", churnDataset[\"EVECHARGE\"].cast(\"double\"))\n", 132 | "churnDataset = churnDataset.withColumn(\"NIGHTMINS\", churnDataset[\"NIGHTMINS\"].cast(\"double\"))\n", 133 | "churnDataset = churnDataset.withColumn(\"NIGHTCALLS\", churnDataset[\"NIGHTCALLS\"].cast(\"double\"))\n", 134 | "churnDataset = churnDataset.withColumn(\"NIGHTCHARGE\", churnDataset[\"NIGHTCHARGE\"].cast(\"double\"))\n", 135 | "churnDataset = churnDataset.withColumn(\"INTLMINS\", churnDataset[\"INTLMINS\"].cast(\"double\"))\n", 136 | "churnDataset = churnDataset.withColumn(\"INTLCALLS\", churnDataset[\"INTLCALLS\"].cast(\"double\"))\n", 137 | "churnDataset = churnDataset.withColumn(\"INTLCHARGE\", churnDataset[\"INTLCHARGE\"].cast(\"double\"))\n", 138 | "churnDataset = churnDataset.withColumn(\"CUSTSERVCALLS\", churnDataset[\"CUSTSERVCALLS\"].cast(\"double\"))\n" 139 | ] 140 | }, 141 | { 142 | "cell_type": "code", 143 | "execution_count": 5, 144 | "metadata": { 145 | "collapsed": true 146 | }, 147 | "outputs": [], 148 | "source": [ 149 | "stages = [] # Creating Stages array for our pipeline\n", 150 | "\n", 151 | "#Declaring Categorical columns\n", 152 | "categoricalColumns = [\"PHONE\",\"STATE\", \"INTLPLAN\", \"VMAILPLAN\"]\n", 153 | "\n", 154 | "#Looping through the categorical columns for feature transformation\n", 155 | "for categoricalCol in categoricalColumns:\n", 156 | " # Category Indexing with StringIndexer\n", 157 | " stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+\"Index\")\n", 158 | " # Use OneHotEncoder to convert categorical variables into binary 
SparseVectors\n", 159 | " encoder = OneHotEncoder(inputCol=categoricalCol+\"Index\", outputCol=categoricalCol+\"classVec\")\n", 160 | " # Add stages to the stages array. We'll pass these stages to the pipeline.\n", 161 | " stages += [stringIndexer, encoder]" 162 | ] 163 | }, 164 | { 165 | "cell_type": "code", 166 | "execution_count": 6, 167 | "metadata": { 168 | "collapsed": true 169 | }, 170 | "outputs": [], 171 | "source": [ 172 | "#Using String indexer to transform Chrun variable\n", 173 | "label_stringIdx = StringIndexer(inputCol = \"CHURN\", outputCol = \"label\")\n", 174 | "#Adding the Churn transformation to our pipeline stages\n", 175 | "stages += [label_stringIdx]" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 7, 181 | "metadata": { 182 | "collapsed": false 183 | }, 184 | "outputs": [], 185 | "source": [ 186 | "# Transform all features into a vector using VectorAssembler\n", 187 | "numericCols = [\"ACCOUNTLENGTH\",\"AREACODE\",\"VMAILMESSAGE\",\"DAYMINS\",\"DAYCALLS\",\"DAYCHARGE\",\"EVEMINS\",\"EVECALLS\",\"EVECHARGE\",\"NIGHTMINS\",\"NIGHTCALLS\",\"NIGHTCHARGE\",\"INTLMINS\",\"INTLCALLS\",\"INTLCHARGE\",\"CUSTSERVCALLS\"]\n", 188 | "#Pick up all the transformed categorical variables\n", 189 | "categoricalVectorColumns = [*map(lambda c: c + \"classVec\", categoricalColumns)]\n", 190 | "#Add transformed categorical variables and numberical columns to the assmebler input\n", 191 | "assemblerInputs = categoricalVectorColumns + numericCols\n", 192 | "#Use Vector assembler to combine raw numerical features with transformed categorical inputs \n", 193 | "assembler = VectorAssembler(inputCols=assemblerInputs, outputCol=\"features\")\n", 194 | "#Add the feature assembling part to the pipeline stages\n", 195 | "stages += [assembler]" 196 | ] 197 | }, 198 | { 199 | "cell_type": "code", 200 | "execution_count": 12, 201 | "metadata": { 202 | "collapsed": false 203 | }, 204 | "outputs": [], 205 | "source": [ 206 | "#Create the Pipeline\n", 207 | "pipeline = Pipeline(stages=stages)\n", 208 | "pipelineModel = pipeline.fit(churnDataset)\n", 209 | "churnDataset = pipelineModel.transform(churnDataset)" 210 | ] 211 | }, 212 | { 213 | "cell_type": "code", 214 | "execution_count": 22, 215 | "metadata": { 216 | "collapsed": false 217 | }, 218 | "outputs": [ 219 | { 220 | "name": "stdout", 221 | "output_type": "stream", 222 | "text": [ 223 | "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", 224 | "|features |\n", 225 | "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", 226 | "|(3400,[3056,3349,3382,3384,3385,3386,3387,3388,3389,3390,3391,3392,3393,3394,3395,3396,3397,3398,3399],[1.0,1.0,1.0,128.0,415.0,25.0,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.7,1.0])|\n", 227 | "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n", 228 | "only showing top 1 row\n", 229 | "\n" 230 | ] 231 | } 232 | ], 233 | "source": [ 234 | "churnDataset.select(\"features\").show(1,False)" 235 | ] 236 | }, 237 | { 238 | "cell_type": "code", 239 | "execution_count": 13, 240 | "metadata": { 241 | "collapsed": 
false 242 | }, 243 | "outputs": [], 244 | "source": [ 245 | "# Keep relevant columns\n", 246 | "selectedcols = [\"label\", \"features\"] + cols\n", 247 | "churnDataset = churnDataset.select(selectedcols)" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": 23, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "### Randomly split data into training and validation sets\n", 259 | "(trainingData, testData) = churnDataset.randomSplit([0.7, 0.3], seed = 78799)" 260 | ] 261 | }, 262 | { 263 | "cell_type": "code", 264 | "execution_count": 26, 265 | "metadata": { 266 | "collapsed": false 267 | }, 268 | "outputs": [ 269 | { 270 | "name": "stdout", 271 | "output_type": "stream", 272 | "text": [ 273 | "==================================================\n", 274 | "Training size: [2333] === Test Size: [1000]\n", 275 | "==================================================\n" 276 | ] 277 | } 278 | ], 279 | "source": [ 280 | "print(\"==================================================\")\n", 281 | "print(\"Training size: [\" + str(trainingData.count())+\"] === Test Size: [\"+str(testData.count())+\"]\")\n", 282 | "print(\"==================================================\")\n" 283 | ] 284 | }, 285 | { 286 | "cell_type": "markdown", 287 | "metadata": {}, 288 | "source": [ 289 | "Training of the model including timings" 290 | ] 291 | }, 292 | { 293 | "cell_type": "code", 294 | "execution_count": 16, 295 | "metadata": { 296 | "collapsed": false 297 | }, 298 | "outputs": [ 299 | { 300 | "name": "stdout", 301 | "output_type": "stream", 302 | "text": [ 303 | "Training time is 10.90792441368103s\n" 304 | ] 305 | } 306 | ], 307 | "source": [ 308 | "# Start timer\n", 309 | "start_time = time.time()\n", 310 | "\n", 311 | "# Create an initial RandomForest model.\n", 312 | "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\", maxDepth=5, maxBins=32, numTrees=20)\n", 313 | "\n", 314 | "# Train model with Training Data\n", 315 | "rfModel = rf.fit(trainingData)\n", 316 | "\n", 317 | "# Calculate total time\n", 318 | "train_time = time.time() - start_time\n", 319 | "print(\"Training time is \" + str(train_time) + \"s\")" 320 | ] 321 | }, 322 | { 323 | "cell_type": "code", 324 | "execution_count": 27, 325 | "metadata": { 326 | "collapsed": false 327 | }, 328 | "outputs": [ 329 | { 330 | "data": { 331 | "text/plain": [ 332 | "RandomForestClassificationModel (uid=rfc_2832c50151b2) with 20 trees" 333 | ] 334 | }, 335 | "execution_count": 27, 336 | "metadata": {}, 337 | "output_type": "execute_result" 338 | } 339 | ], 340 | "source": [ 341 | "rfModel" 342 | ] 343 | }, 344 | { 345 | "cell_type": "markdown", 346 | "metadata": {}, 347 | "source": [ 348 | "Validation of the model including timings" 349 | ] 350 | }, 351 | { 352 | "cell_type": "code", 353 | "execution_count": 28, 354 | "metadata": { 355 | "collapsed": false 356 | }, 357 | "outputs": [ 358 | { 359 | "name": "stdout", 360 | "output_type": "stream", 361 | "text": [ 362 | "Evaulation time is 1.014225721359253s\n", 363 | "Total time for training and evaulation is 11.922150135040283s\n" 364 | ] 365 | } 366 | ], 367 | "source": [ 368 | "# Start timer\n", 369 | "start_time = time.time()\n", 370 | "\n", 371 | "# Make predictions on test data using the Transformer.transform() method.\n", 372 | "predictions = rfModel.transform(testData)\n", 373 | "\n", 374 | "# Evaluate model. 
Default metric is areaUnderROC\n", 375 | "evaluator = BinaryClassificationEvaluator()\n", 376 | "auc = evaluator.evaluate(predictions)\n", 377 | "\n", 378 | "# Calculate total time\n", 379 | "eval_time = time.time() - start_time\n", 380 | "print(\"Evaulation time is \" + str(eval_time) + \"s\")\n", 381 | "\n", 382 | "\n", 383 | "# Print total time for training + evaulation\n", 384 | "print(\"Total time for training and evaulation is \" + str(train_time + eval_time) + \"s\")" 385 | ] 386 | }, 387 | { 388 | "cell_type": "code", 389 | "execution_count": 29, 390 | "metadata": { 391 | "collapsed": false 392 | }, 393 | "outputs": [ 394 | { 395 | "name": "stdout", 396 | "output_type": "stream", 397 | "text": [ 398 | "Area under the curve is : 0.8701236787621239s\n" 399 | ] 400 | } 401 | ], 402 | "source": [ 403 | "print(\"Area under the curve is : \"+str(auc)+\"s\")" 404 | ] 405 | }, 406 | { 407 | "cell_type": "code", 408 | "execution_count": 30, 409 | "metadata": { 410 | "collapsed": true 411 | }, 412 | "outputs": [], 413 | "source": [ 414 | "# View model's predictions and probabilities\n", 415 | "selected = predictions.select(\"label\", \"prediction\", \"probability\")\n" 416 | ] 417 | }, 418 | { 419 | "cell_type": "code", 420 | "execution_count": 32, 421 | "metadata": { 422 | "collapsed": false 423 | }, 424 | "outputs": [ 425 | { 426 | "name": "stdout", 427 | "output_type": "stream", 428 | "text": [ 429 | "+-----+----------+----------------------------------------+\n", 430 | "|label|prediction|probability |\n", 431 | "+-----+----------+----------------------------------------+\n", 432 | "|0.0 |0.0 |[0.8617317493137119,0.1382682506862881] |\n", 433 | "|0.0 |0.0 |[0.8658637362350545,0.13413626376494553]|\n", 434 | "|0.0 |0.0 |[0.8530759766580674,0.1469240233419326] |\n", 435 | "|0.0 |0.0 |[0.7845468788124232,0.2154531211875767] |\n", 436 | "|0.0 |0.0 |[0.8729465847681863,0.12705341523181374]|\n", 437 | "+-----+----------+----------------------------------------+\n", 438 | "only showing top 5 rows\n", 439 | "\n" 440 | ] 441 | } 442 | ], 443 | "source": [ 444 | "selected.show(5,False)" 445 | ] 446 | }, 447 | { 448 | "cell_type": "code", 449 | "execution_count": null, 450 | "metadata": { 451 | "collapsed": false 452 | }, 453 | "outputs": [], 454 | "source": [ 455 | "churnDataset.describe('Churn').show() " 456 | ] 457 | }, 458 | { 459 | "cell_type": "code", 460 | "execution_count": null, 461 | "metadata": { 462 | "collapsed": false 463 | }, 464 | "outputs": [], 465 | "source": [ 466 | "churnDataset.describe('Phone','IntlPlan','VMailPlan','VMailMessage','DayMins','DayCalls','DayCharge','Churn').show() " 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": null, 472 | "metadata": { 473 | "collapsed": false 474 | }, 475 | "outputs": [], 476 | "source": [ 477 | "churnDataset.describe('State','EveMins','EveCalls','EveCharge','NightMins','NightCalls','NightCharge').show()" 478 | ] 479 | }, 480 | { 481 | "cell_type": "code", 482 | "execution_count": null, 483 | "metadata": { 484 | "collapsed": false 485 | }, 486 | "outputs": [], 487 | "source": [ 488 | "churnDataset.describe('AccountLength','AreaCode','IntlMins','IntlCalls','IntlCharge','CustServCalls').show() " 489 | ] 490 | }, 491 | { 492 | "cell_type": "code", 493 | "execution_count": null, 494 | "metadata": { 495 | "collapsed": true 496 | }, 497 | "outputs": [], 498 | "source": [ 499 | "churn = spark.read.option(\"header\",\"true\").csv(\"hdfs://sparkmaster:8020/user/hdfs/sampledata/churn.csv\")\n", 500 | "\n", 501 | 
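ParamGridBuilder and CrossValidator are imported in the notebook's first code cell but never used; the following is a minimal sketch of how they could tune the random forest trained above, reusing the rf, trainingData, testData and evaluator objects already defined. The grid values and the three-fold setting are illustrative assumptions, not part of the original notebook.

# Hypothetical tuning sketch (not in the original notebook): assumes rf, trainingData,
# testData and the BinaryClassificationEvaluator named evaluator are defined above.
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator

# Illustrative grid; the notebook itself fixes maxDepth=5 and numTrees=20.
paramGrid = (ParamGridBuilder()
             .addGrid(rf.maxDepth, [3, 5, 7])
             .addGrid(rf.numTrees, [20, 50])
             .build())

cv = CrossValidator(estimator=rf,
                    estimatorParamMaps=paramGrid,
                    evaluator=evaluator,   # areaUnderROC by default
                    numFolds=3)            # assumed fold count

cvModel = cv.fit(trainingData)
print("Best average AUC across folds: " + str(max(cvModel.avgMetrics)))
cvPredictions = cvModel.transform(testData)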
"churn.createOrReplaceTempView(\"churn_tab\")\n", 502 | "spark.sql(\"select * from churn_tab limit 2\").show()" 503 | ] 504 | }, 505 | { 506 | "cell_type": "markdown", 507 | "metadata": {}, 508 | "source": [ 509 | "Convert a DataFrame from a Categorical values to Category vectors so that they can be used by logistic regression" 510 | ] 511 | }, 512 | { 513 | "cell_type": "code", 514 | "execution_count": null, 515 | "metadata": { 516 | "collapsed": false 517 | }, 518 | "outputs": [], 519 | "source": [ 520 | "from pyspark.ml.feature import OneHotEncoder, StringIndexer\n", 521 | "\n", 522 | "df = spark.createDataFrame([\n", 523 | " (\"AK\", \"Democrats\"),\n", 524 | " (\"AS\", \"Republicans\"),\n", 525 | " (\"AZ\", \"Democrats\"),\n", 526 | " (\"AR\", \"Republicans\"),\n", 527 | " (\"CT\", \"GreenParty\"),\n", 528 | " (\"DE\", \"Republicans\")\n", 529 | "], [\"State\", \"winparty\"])\n", 530 | "\n", 531 | "stringIndexer = StringIndexer(inputCol=\"winparty\", outputCol=\"winpartyIndex\")\n", 532 | "model = stringIndexer.fit(df)\n", 533 | "indexed = model.transform(df)\n", 534 | "\n", 535 | "encoder = OneHotEncoder(inputCol=\"winpartyIndex\", outputCol=\"winpartyVec\")\n", 536 | "encoded = encoder.transform(indexed)\n", 537 | "encoded.show()" 538 | ] 539 | }, 540 | { 541 | "cell_type": "code", 542 | "execution_count": null, 543 | "metadata": { 544 | "collapsed": false 545 | }, 546 | "outputs": [], 547 | "source": [ 548 | "import math\n", 549 | "from pyspark.sql.functions import mean, min, max, ceil, round\n", 550 | "churnDataset.select(round((mean('AccountLength')),3)).toDF(\"AccountLength\").show()\n", 551 | "churnDataset.describe('State','AreaCode','IntlMins','IntlCalls','IntlCharge','CustServCalls','Churn').show() " 552 | ] 553 | }, 554 | { 555 | "cell_type": "raw", 556 | "metadata": {}, 557 | "source": [ 558 | "This is how you would display a sample chart with Juptyer Notebook" 559 | ] 560 | }, 561 | { 562 | "cell_type": "code", 563 | "execution_count": null, 564 | "metadata": { 565 | "collapsed": false 566 | }, 567 | "outputs": [], 568 | "source": [ 569 | "%matplotlib inline\n", 570 | "import random\n", 571 | "import matplotlib.pyplot as plt\n", 572 | "import pandas as pd\n", 573 | "import numpy as np\n", 574 | "ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))\n", 575 | "ts = ts.cumsum()\n", 576 | "ts.plot()" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": null, 582 | "metadata": { 583 | "collapsed": false 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "import random\n", 588 | "\n", 589 | "# create an RDD of 100 random numbers\n", 590 | "x = [random.normalvariate(0,1) for i in range(100)]\n", 591 | "rdd = sc.parallelize(x)\n", 592 | "\n", 593 | "# plot data in RDD - use .collect() to bring data to local\n", 594 | "num_bins = 50\n", 595 | "np.array(['1','2','3']).astype(np.float)\n", 596 | "#n, bins, patches = plt.hist(rdd.collect(), num_bins, normed=1, facecolor='green', alpha=0.5)\n", 597 | "n, bins, patches = plt.hist(np.array(rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)" 598 | ] 599 | }, 600 | { 601 | "cell_type": "markdown", 602 | "metadata": {}, 603 | "source": [ 604 | "We are now going to plot the histograms for some of the data types to check their overall distribution.\n" 605 | ] 606 | }, 607 | { 608 | "cell_type": "code", 609 | "execution_count": null, 610 | "metadata": { 611 | "collapsed": false 612 | }, 613 | "outputs": [], 614 | "source": [ 615 | "import random\n", 616 
| "num_bins = 10\n", 617 | "#n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)\n", 618 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Mins\")\n", 619 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"Night Mins\")\n", 620 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Mins\")\n", 621 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='orange', alpha=0.5,label=\"Eve Mins\")\n", 622 | "plt.legend(loc='upper right')\n", 623 | "plt.show()\n", 624 | "\n", 625 | "\n" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": null, 631 | "metadata": { 632 | "collapsed": false 633 | }, 634 | "outputs": [], 635 | "source": [ 636 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Mins\")\n", 637 | "plt.legend(loc='upper right')\n", 638 | "plt.show()\n", 639 | "\n", 640 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"Night Mins\")\n", 641 | "plt.legend(loc='upper right')\n", 642 | "plt.show()\n", 643 | "\n", 644 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Mins\")\n", 645 | "plt.legend(loc='upper right')\n", 646 | "plt.show()\n", 647 | "\n", 648 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='orange', alpha=0.5,label=\"Eve Mins\")\n", 649 | "plt.legend(loc='upper right')\n", 650 | "plt.show()" 651 | ] 652 | }, 653 | { 654 | "cell_type": "code", 655 | "execution_count": null, 656 | "metadata": { 657 | "collapsed": false 658 | }, 659 | "outputs": [], 660 | "source": [ 661 | "churnDataset.createOrReplaceTempView(\"churn_tab\")\n", 662 | "churners = spark.sql(\"select * from churn_tab where churn = 'True.'\")\n", 663 | "churners.count()\n" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": null, 669 | "metadata": { 670 | "collapsed": false 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "nonChurners = spark.sql(\"select * from churn_tab where churn = 'False.'\")\n", 675 | "nonChurners.count()" 676 | ] 677 | }, 678 | { 679 | "cell_type": "code", 680 | "execution_count": null, 681 | "metadata": { 682 | "collapsed": false 683 | }, 684 | "outputs": [], 685 | "source": [ 686 | "n, bins, patches = plt.hist(np.array(churners.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Intl Mins\")\n", 687 | "plt.legend(loc='upper right')\n", 688 | "plt.show()\n", 689 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Intl Mins\")\n", 690 | "plt.legend(loc='upper right')\n", 691 | "plt.show()\n", 692 | "\n", 693 | "\n", 694 | 
"n, bins, patches = plt.hist(np.array(churners.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Night Mins\")\n", 695 | "plt.legend(loc='upper right')\n", 696 | "plt.show()\n", 697 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Night Mins\")\n", 698 | "plt.legend(loc='upper right')\n", 699 | "plt.show()\n", 700 | "\n", 701 | "n, bins, patches = plt.hist(np.array(churners.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Day Mins\")\n", 702 | "plt.legend(loc='upper right')\n", 703 | "plt.show()\n", 704 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Day Mins\")\n", 705 | "plt.legend(loc='upper right')\n", 706 | "plt.show()\n", 707 | "\n", 708 | "\n", 709 | "n, bins, patches = plt.hist(np.array(churners.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Eve Mins\")\n", 710 | "plt.legend(loc='upper right')\n", 711 | "plt.show()\n", 712 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Eve Mins\")\n", 713 | "plt.legend(loc='upper right')\n", 714 | "plt.show()\n", 715 | "\n" 716 | ] 717 | }, 718 | { 719 | "cell_type": "code", 720 | "execution_count": null, 721 | "metadata": { 722 | "collapsed": false 723 | }, 724 | "outputs": [], 725 | "source": [ 726 | "churners.createOrReplaceTempView(\"churner_tab\")\n", 727 | "spark.sql(\"select Min(IntlCharge), max(IntlCharge) from churner_tab\").show()\n" 728 | ] 729 | }, 730 | { 731 | "cell_type": "code", 732 | "execution_count": null, 733 | "metadata": { 734 | "collapsed": false 735 | }, 736 | "outputs": [], 737 | "source": [ 738 | "n, bins, patches = plt.hist(np.array(churners.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Intl Chrg\")\n", 739 | "plt.legend(loc='upper right')\n", 740 | "plt.show()\n", 741 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Intl Chrg\")\n", 742 | "plt.legend(loc='upper right')\n", 743 | "plt.show()\n", 744 | "\n", 745 | "n, bins, patches = plt.hist(np.array(churners.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Night Chrg\")\n", 746 | "plt.legend(loc='upper right')\n", 747 | "plt.show()\n", 748 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Night Chrg\")\n", 749 | "plt.legend(loc='upper right')\n", 750 | "plt.show()\n", 751 | "\n", 752 | "n, bins, patches = plt.hist(np.array(churners.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Day Chrg\")\n", 753 | "plt.legend(loc='upper right')\n", 754 | "plt.show()\n", 755 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Day Chrg\")\n", 756 | "plt.legend(loc='upper right')\n", 757 | "plt.show()\n", 758 | 
"\n", 759 | "\n", 760 | "n, bins, patches = plt.hist(np.array(churners.select(\"EveCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Eve Chrg\")\n", 761 | "plt.legend(loc='upper right')\n", 762 | "plt.show()\n", 763 | "n, bins, patches = plt.hist(np.array(nonChurners.select(\"EveCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Eve Chrg\")\n", 764 | "plt.legend(loc='upper right')\n", 765 | "plt.show()\n" 766 | ] 767 | }, 768 | { 769 | "cell_type": "code", 770 | "execution_count": null, 771 | "metadata": { 772 | "collapsed": false 773 | }, 774 | "outputs": [], 775 | "source": [ 776 | "#import random\n", 777 | "#num_bins = 10\n", 778 | "#data=np.vstack([np.array(churnDataset.select(\"DayMins\").rdd.takeSample(True,20,1)).astype(np.float),np.array(churnDataset.select(\"EveMins\").rdd.takeSample(True,20,1)).astype(np.float),np.array(churnDataset.select(\"NightMins\").rdd.takeSample(True,20,1)).astype(np.float)]).T\n", 779 | "#n, bins, patches = plt.hist(data, num_bins, normed=0, facecolor='green', alpha=0.5, label=[\"Inlt Charge\",\"Night Charge\",\"Day Charge\"])\n", 780 | "#plt.legend(loc='upper right')\n", 781 | "#plt.show()\n", 782 | "\n", 783 | "\n", 784 | "import random\n", 785 | "num_bins = 10\n", 786 | "\n", 787 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Charge\")\n", 788 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"Night Charge\")\n", 789 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Charge\")\n", 790 | "plt.legend(loc='upper right')\n", 791 | "plt.show()\n", 792 | "\n", 793 | "\n", 794 | "\n" 795 | ] 796 | }, 797 | { 798 | "cell_type": "code", 799 | "execution_count": null, 800 | "metadata": { 801 | "collapsed": false 802 | }, 803 | "outputs": [], 804 | "source": [ 805 | "import random\n", 806 | "num_bins = 10\n", 807 | "\n", 808 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='Red', alpha=0.5, label=\"Night Calls\")\n", 809 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Calls\")\n", 810 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Calls\")\n", 811 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"EveCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='Pink', alpha=0.5,label=\"Eve Calls\")\n", 812 | "plt.legend(loc='upper right')\n", 813 | "plt.show()" 814 | ] 815 | }, 816 | { 817 | "cell_type": "code", 818 | "execution_count": null, 819 | "metadata": { 820 | "collapsed": false 821 | }, 822 | "outputs": [], 823 | "source": [ 824 | "churnDataset.createOrReplaceTempView(\"churn_tab\")\n", 825 | "churners = spark.sql(\"select * from churn_tab where churn='True.'\")\n", 826 | "nonChurners = spark.sql(\"select * from churn_tab where churn='False.'\")\n", 827 | "churners.count()\n", 828 | "print(\"churners = \"+ str(churners.count()) +\" and non churneres = 
\"+str(nonChurners.count())+\" \")" 829 | ] 830 | }, 831 | { 832 | "cell_type": "code", 833 | "execution_count": null, 834 | "metadata": { 835 | "collapsed": true 836 | }, 837 | "outputs": [], 838 | "source": [] 839 | }, 840 | { 841 | "cell_type": "code", 842 | "execution_count": null, 843 | "metadata": { 844 | "collapsed": false 845 | }, 846 | "outputs": [], 847 | "source": [ 848 | "import numpy as np\n", 849 | "import matplotlib.pyplot as plt\n", 850 | "\n", 851 | "\n", 852 | "N = 50\n", 853 | "x = np.array(churners.select(\"IntlMins\").rdd.takeSample(False,250,1)).astype(np.float)\n", 854 | "y = np.array(nonChurners.select(\"IntlMins\").rdd.takeSample(False,250,1)).astype(np.float)\n", 855 | "colors = ['Red','Green']\n", 856 | "area = np.pi * (15 * np.random.rand(N))**2 # 0 to 15 point radii\n", 857 | "\n", 858 | "plt.scatter(x, y, s=area, c=colors, alpha=0.5)\n", 859 | "plt.show()" 860 | ] 861 | }, 862 | { 863 | "cell_type": "code", 864 | "execution_count": null, 865 | "metadata": { 866 | "collapsed": false 867 | }, 868 | "outputs": [], 869 | "source": [ 870 | "import numpy as np\n", 871 | "import matplotlib.pyplot as plt\n", 872 | "\n", 873 | "\n", 874 | "N = 50\n", 875 | "x = np.array(churners.select(\"CustServCalls\").rdd.takeSample(False,350,1)).astype(np.float)\n", 876 | "y = np.array(nonChurners.select(\"CustServCalls\").rdd.takeSample(False,350,1)).astype(np.float)\n", 877 | "colors = ['Red','Green']\n", 878 | "area = np.pi * (15 * np.random.rand(N))**2 # 0 to 15 point radii\n", 879 | "\n", 880 | "plt.scatter(x, y, s=area, c=colors, alpha=0.5)\n", 881 | "plt.show()" 882 | ] 883 | }, 884 | { 885 | "cell_type": "code", 886 | "execution_count": null, 887 | "metadata": { 888 | "collapsed": false 889 | }, 890 | "outputs": [], 891 | "source": [ 892 | "%%sh\n", 893 | "pip install plotly" 894 | ] 895 | }, 896 | { 897 | "cell_type": "code", 898 | "execution_count": null, 899 | "metadata": { 900 | "collapsed": false 901 | }, 902 | "outputs": [], 903 | "source": [ 904 | "import plotly.plotly as py\n", 905 | "from plotly.graph_objs import *\n", 906 | "import pandas as pd\n", 907 | "import requests\n", 908 | "requests.packages.urllib3.disable_warnings()\n", 909 | "\n", 910 | "import plotly.tools as tls\n", 911 | "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n", 912 | "\n", 913 | "\n", 914 | "eveMinsChurners = Data([Histogram(x=churners.select('EveMins').rdd.collect())])\n", 915 | "py.iplot(eveMinsChurners, filename=\"even_minchurners\")\n" 916 | ] 917 | }, 918 | { 919 | "cell_type": "markdown", 920 | "metadata": {}, 921 | "source": [ 922 | "-- Lets look at a scatter plot for Evening Mins for Churners vs. 
Non churners" 923 | ] 924 | }, 925 | { 926 | "cell_type": "code", 927 | "execution_count": null, 928 | "metadata": { 929 | "collapsed": false 930 | }, 931 | "outputs": [], 932 | "source": [ 933 | "import plotly.tools as tls\n", 934 | "import plotly.plotly as py\n", 935 | "import plotly.graph_objs as go\n", 936 | "\n", 937 | "# Create random data with numpy\n", 938 | "import numpy as np\n", 939 | "\n", 940 | "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n", 941 | "c = np.array(churners.select(\"CustServCalls\").sample(False,0.9,1).limit(200).rdd.collect()).astype(np.float)\n", 942 | "nc = np.array(nonChurners.select(\"CustServCalls\").sample(False,0.9,1).limit(200).rdd.collect()).astype(np.float)\n", 943 | "\n", 944 | "# N = 1000\n", 945 | "# random_x = np.random.randn(N)\n", 946 | "# random_y = np.random.randn(N)\n", 947 | "# for i,j in zip(c.ravel(),nc.ravel()):\n", 948 | "Churners = go.Scatter(\n", 949 | " y = c.ravel(),\n", 950 | " mode = 'markers',\n", 951 | " marker = dict(\n", 952 | " color='red'\n", 953 | " )\n", 954 | ")\n", 955 | "\n", 956 | "NonChurners = go.Scatter(\n", 957 | " y = nc.ravel(),\n", 958 | " mode = 'markers',\n", 959 | " marker = dict(\n", 960 | " color='green'\n", 961 | " )\n", 962 | ")\n", 963 | " \n", 964 | "layout = go.Layout(\n", 965 | " title='Customer Service Calls',\n", 966 | " xaxis=dict(\n", 967 | " title='Customers',\n", 968 | " titlefont=dict(\n", 969 | " family='Courier New, monospace',\n", 970 | " size=18,\n", 971 | " color='#7f7f7f'\n", 972 | " )\n", 973 | " ),\n", 974 | " yaxis=dict(\n", 975 | " title='Number of Calls to Cust Service',\n", 976 | " titlefont=dict(\n", 977 | " family='Courier New, monospace',\n", 978 | " size=18,\n", 979 | " color='#7f7f7f'\n", 980 | " )\n", 981 | " )\n", 982 | ")\n", 983 | "\n", 984 | "# Create a trace\n", 985 | "\n", 986 | "data = [Churners,NonChurners]\n", 987 | "\n", 988 | "fig = go.Figure(data=data, layout=layout)\n", 989 | "py.iplot(fig, filename='basic-scatter')\n", 990 | "\n", 991 | "# ./\n", 992 | "# # Plot and embed in ipython notebook!\n", 993 | "# py.iplot(data, layout=layout, filename='basic-scatter')" 994 | ] 995 | }, 996 | { 997 | "cell_type": "code", 998 | "execution_count": null, 999 | "metadata": { 1000 | "collapsed": false 1001 | }, 1002 | "outputs": [], 1003 | "source": [ 1004 | "import numpy as np\n", 1005 | "a = np.array([7,1,4,8,1,3,2,5])\n", 1006 | "a= np.sort(a)\n", 1007 | "print(\"Array = \"+str(a))" 1008 | ] 1009 | }, 1010 | { 1011 | "cell_type": "code", 1012 | "execution_count": null, 1013 | "metadata": { 1014 | "collapsed": false 1015 | }, 1016 | "outputs": [], 1017 | "source": [ 1018 | "import plotly.tools as tls\n", 1019 | "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n", 1020 | "c = np.array(churners.select(\"CustServCalls\").sample(False,0.2,1).limit(400).rdd.collect()).astype(np.float)\n", 1021 | "nc = np.array(nonChurners.select(\"CustServCalls\").sample(False,0.2,1).limit(400).rdd.collect()).astype(np.float)\n", 1022 | "\n", 1023 | "import plotly.plotly as py\n", 1024 | "import plotly.graph_objs as go\n", 1025 | "\n", 1026 | "# Create random data with numpy\n", 1027 | "import numpy as np\n", 1028 | "N = len(c)\n", 1029 | "random_x=random.sample(range(1, N+2), N)\n", 1030 | "\n", 1031 | "c= np.sort(c)\n", 1032 | "nc = np.sort(nc)\n", 1033 | "random_x = np.sort(random_x)\n", 1034 | "\n", 1035 | "Churners = go.Scatter(\n", 1036 | " x = random_x,\n", 1037 | " y = c.ravel(), \n", 1038 | " name ='Churners',\n", 1039 | 
" mode = 'markers',\n", 1040 | " marker = dict(\n", 1041 | " color='red'\n", 1042 | " ),\n", 1043 | " line = dict(\n", 1044 | " width = 2,\n", 1045 | " color = 'rgb(0, 0, 0)'\n", 1046 | " )\n", 1047 | ")\n", 1048 | "\n", 1049 | "NonChurners = go.Scatter(\n", 1050 | " x = random_x,\n", 1051 | " y = nc.ravel(),\n", 1052 | " name = 'Non-Churners',\n", 1053 | " mode = 'markers',\n", 1054 | " marker = dict(\n", 1055 | " color='green'\n", 1056 | " )\n", 1057 | ")\n", 1058 | " \n", 1059 | "\n", 1060 | "layout = go.Layout(\n", 1061 | " title='Customer Service Calls',\n", 1062 | " xaxis=dict(\n", 1063 | " title='Customers',\n", 1064 | " titlefont=dict(\n", 1065 | " family='Courier New, monospace',\n", 1066 | " size=18,\n", 1067 | " color='#7f7f7f'\n", 1068 | " )\n", 1069 | " ),\n", 1070 | " yaxis=dict(\n", 1071 | " title='Number of Calls to Cust Service',\n", 1072 | " titlefont=dict(\n", 1073 | " family='Courier New, monospace',\n", 1074 | " size=18,\n", 1075 | " color='#7f7f7f'\n", 1076 | " )\n", 1077 | " )\n", 1078 | ")\n", 1079 | "\n", 1080 | "# Create a trace\n", 1081 | "\n", 1082 | "data = [Churners,NonChurners]\n", 1083 | "\n", 1084 | "fig = go.Figure(data=data, layout=layout)\n", 1085 | "py.iplot(fig, filename='basic-scatter')\n" 1086 | ] 1087 | }, 1088 | { 1089 | "cell_type": "code", 1090 | "execution_count": null, 1091 | "metadata": { 1092 | "collapsed": false 1093 | }, 1094 | "outputs": [], 1095 | "source": [ 1096 | "#Imports\n", 1097 | "import plotly.plotly as py\n", 1098 | "import plotly.graph_objs as go\n", 1099 | "\n", 1100 | "#Separating the data into Churn and Non-Churn Data Set\n", 1101 | "churnDataset.createOrReplaceTempView(\"churn_tab\")\n", 1102 | "churners = spark.sql(\"select * from churn_tab where churn='True.'\")\n", 1103 | "nonChurners = spark.sql(\"select * from churn_tab where churn='False.'\")\n", 1104 | "\n", 1105 | "#Getting the Count for churners/Non-Churners\n", 1106 | "churnCnt = churners.count()\n", 1107 | "nonChurnCnt = nonChurners.count()\n", 1108 | "\n", 1109 | "data = [go.Bar(\n", 1110 | " x=['Churners', 'Non-Churners'],\n", 1111 | " y=[churnCnt, nonChurnCnt]\n", 1112 | " )]\n", 1113 | "\n", 1114 | "py.iplot(data, filename='Churn-NonChurn Plot')" 1115 | ] 1116 | }, 1117 | { 1118 | "cell_type": "code", 1119 | "execution_count": null, 1120 | "metadata": { 1121 | "collapsed": false 1122 | }, 1123 | "outputs": [], 1124 | "source": [ 1125 | "import plotly.tools as tls\n", 1126 | "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n", 1127 | "c = np.array(churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n", 1128 | "nc = np.array(non_churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n", 1129 | "\n", 1130 | "import plotly.plotly as py\n", 1131 | "import plotly.graph_objs as go\n", 1132 | "\n", 1133 | "# Create random data with numpy\n", 1134 | "import numpy as np\n", 1135 | "\n", 1136 | "# N = 1000\n", 1137 | "# random_x = np.random.randn(N)\n", 1138 | "# random_y = np.random.randn(N)\n", 1139 | "# for i,j in zip(c.ravel(),nc.ravel()):\n", 1140 | "trace = go.Scatter(\n", 1141 | " x = c.ravel(),\n", 1142 | " y = nc.ravel(),\n", 1143 | " mode = 'markers',\n", 1144 | " marker = dict(\n", 1145 | " color='FFBAD2'\n", 1146 | " )\n", 1147 | ")\n", 1148 | "\n", 1149 | "# Create a trace\n", 1150 | "\n", 1151 | "data = [trace]\n", 1152 | "# ./\n", 1153 | "# # Plot and embed in ipython notebook!\n", 1154 | "py.iplot(data, filename='basic-scatter')" 1155 | ] 1156 | 
}, 1157 | { 1158 | "cell_type": "code", 1159 | "execution_count": null, 1160 | "metadata": { 1161 | "collapsed": false 1162 | }, 1163 | "outputs": [], 1164 | "source": [ 1165 | "import plotly.tools as tls\n", 1166 | "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n", 1167 | "c = np.array(churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n", 1168 | "nc = np.array(non_churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n", 1169 | "\n", 1170 | "import plotly.plotly as py\n", 1171 | "import plotly.graph_objs as go\n", 1172 | "\n", 1173 | "# Create random data with numpy\n", 1174 | "import numpy as np\n", 1175 | "\n", 1176 | "# N = 1000\n", 1177 | "# random_x = np.random.randn(N)\n", 1178 | "# random_y = np.random.randn(N)\n", 1179 | "# for i,j in zip(c.ravel(),nc.ravel()):\n", 1180 | "Churners = go.Scatter(\n", 1181 | " x = c.ravel(),\n", 1182 | " mode = 'markers',\n", 1183 | " marker = dict(\n", 1184 | " color='red'\n", 1185 | " )\n", 1186 | ")\n", 1187 | "\n", 1188 | "NonChurners = go.Scatter(\n", 1189 | " x = nc.ravel(),\n", 1190 | " mode = 'markers',\n", 1191 | " marker = dict(\n", 1192 | " color='blue'\n", 1193 | " )\n", 1194 | ")\n", 1195 | " \n", 1196 | "\n", 1197 | "# Create a trace\n", 1198 | "\n", 1199 | "data = [Churners,NonChurners]\n", 1200 | "# ./\n", 1201 | "# # Plot and embed in ipython notebook!\n", 1202 | "py.iplot(data, filename='basic-scatter')" 1203 | ] 1204 | }, 1205 | { 1206 | "cell_type": "code", 1207 | "execution_count": null, 1208 | "metadata": { 1209 | "collapsed": false 1210 | }, 1211 | "outputs": [], 1212 | "source": [ 1213 | "n1, bins1, patches1 = plt.hist(np.array(churnDataset.select(\"IntlCalls\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='red', alpha=0.5)\n" 1214 | ] 1215 | }, 1216 | { 1217 | "cell_type": "code", 1218 | "execution_count": null, 1219 | "metadata": { 1220 | "collapsed": false 1221 | }, 1222 | "outputs": [], 1223 | "source": [ 1224 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='blue', alpha=0.5)\n" 1225 | ] 1226 | }, 1227 | { 1228 | "cell_type": "code", 1229 | "execution_count": null, 1230 | "metadata": { 1231 | "collapsed": false 1232 | }, 1233 | "outputs": [], 1234 | "source": [ 1235 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCalls\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)\n" 1236 | ] 1237 | }, 1238 | { 1239 | "cell_type": "code", 1240 | "execution_count": null, 1241 | "metadata": { 1242 | "collapsed": false 1243 | }, 1244 | "outputs": [], 1245 | "source": [ 1246 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)\n" 1247 | ] 1248 | }, 1249 | { 1250 | "cell_type": "code", 1251 | "execution_count": null, 1252 | "metadata": { 1253 | "collapsed": false 1254 | }, 1255 | "outputs": [], 1256 | "source": [ 1257 | "#'Phone','IntlPlan','VMailPlan','VMailMessage','DayMins','DayCalls','DayCharge','Churn\n", 1258 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)" 1259 | ] 1260 | }, 1261 | { 1262 | "cell_type": "code", 1263 | "execution_count": null, 1264 | "metadata": { 1265 | "collapsed": false 1266 | }, 1267 | "outputs": [], 1268 | "source": [ 1269 | "n, 
bins, patches = plt.hist(np.array(churnDataset.select(\"DayCalls\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)" 1270 | ] 1271 | }, 1272 | { 1273 | "cell_type": "code", 1274 | "execution_count": null, 1275 | "metadata": { 1276 | "collapsed": false 1277 | }, 1278 | "outputs": [], 1279 | "source": [ 1280 | "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)" 1281 | ] 1282 | }, 1283 | { 1284 | "cell_type": "code", 1285 | "execution_count": null, 1286 | "metadata": { 1287 | "collapsed": false 1288 | }, 1289 | "outputs": [], 1290 | "source": [ 1291 | "import matplotlib.pyplot as plt\n", 1292 | "import numpy as np\n", 1293 | "\n", 1294 | "data = [(' whitefield', 65299), (' bellandur', 57061), (' kundalahalli', 51769), (' marathahalli', 50639),\n", 1295 | "(' electronic city', 44041), (' sarjapur road junction', 34164), (' indiranagar 2nd stage', 32459),\n", 1296 | "(' malleswaram', 32171), (' yelahanka main road', 28901), (' domlur', 28869)]\n", 1297 | "\n", 1298 | "freequency = []\n", 1299 | "words = []\n", 1300 | "\n", 1301 | "for line in data:\n", 1302 | " freequency.append(line[1])\n", 1303 | " words.append(line[0])\n", 1304 | "\n", 1305 | "y_axis = np.arange(1, len(words) + 1, 1)\n", 1306 | "\n", 1307 | "plt.barh(y_axis, freequency, align='center')\n", 1308 | "plt.yticks(y_axis, words)\n", 1309 | "plt.show()" 1310 | ] 1311 | }, 1312 | { 1313 | "cell_type": "code", 1314 | "execution_count": null, 1315 | "metadata": { 1316 | "collapsed": false 1317 | }, 1318 | "outputs": [], 1319 | "source": [ 1320 | "churnDataset.createOrReplaceTempView(\"churn_tab\")\n", 1321 | "vmailplan = spark.sql(\"select VmailPlan, count(*) as cnt from churn_tab group by VmailPlan \")\n", 1322 | "\n", 1323 | "# plt.barh(y_axis, vmailplan.select(\"cnt\").rdd.collect(), align='center')\n", 1324 | "# plt.yticks(y_axis, vmailplan.select(\"VmailPlan\").rdd.collect())\n", 1325 | "# plt.show()" 1326 | ] 1327 | }, 1328 | { 1329 | "cell_type": "code", 1330 | "execution_count": null, 1331 | "metadata": { 1332 | "collapsed": false 1333 | }, 1334 | "outputs": [], 1335 | "source": [ 1336 | "churnDataset.createOrReplaceTempView(\"churn_tab\")\n", 1337 | "vmailplan = spark.sql(\"select VmailPlan, count(*) as cnt from churn_tab group by VmailPlan \")\n", 1338 | "vmailplan.show()\n", 1339 | "\n" 1340 | ] 1341 | }, 1342 | { 1343 | "cell_type": "code", 1344 | "execution_count": null, 1345 | "metadata": { 1346 | "collapsed": false 1347 | }, 1348 | "outputs": [], 1349 | "source": [ 1350 | "import matplotlib.pyplot as plt\n", 1351 | "%matplotlib inline\n", 1352 | "\n", 1353 | "x_labels= vmailplan['VMAILPLAN'].values\n", 1354 | "fig = vmailplan[['cnt']].plot(kind='bar', facecolor='lightblue')\n", 1355 | "fig.set_xticklabels(x_labels)\n", 1356 | "fig.set_title('Vmail Plans')\n", 1357 | "fig.set_xlabel('Voice Mail Plan ')\n", 1358 | "fig.set_ylabel('Number of People')\n", 1359 | "plt.show()\n" 1360 | ] 1361 | }, 1362 | { 1363 | "cell_type": "code", 1364 | "execution_count": null, 1365 | "metadata": { 1366 | "collapsed": false 1367 | }, 1368 | "outputs": [], 1369 | "source": [ 1370 | "import numpy as np\n", 1371 | "import matplotlib.mlab as mlab\n", 1372 | "import matplotlib.pyplot as plt\n", 1373 | "\n", 1374 | "np.random.seed(0)\n", 1375 | "\n", 1376 | "# example data\n", 1377 | "mu = 100\n", 1378 | "sigma = 15 # standard deviation of distribution\n", 1379 | "x = mu + sigma * np.random.randn(437)\n", 
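The Vmail Plans bar-chart cell above indexes vmailplan with pandas syntax (vmailplan['VMAILPLAN'].values and .plot), but at that point vmailplan is a Spark DataFrame, so those calls fail. One way to make that cell work, sketched below, is to convert the small grouped result to pandas first; vmailplan_pd is a name introduced here for illustration.

# Sketch: bring the tiny grouped result to the driver and reuse the pandas plotting
# calls from the cell above.
vmailplan_pd = vmailplan.toPandas()                  # two rows: the plan value and cnt

ax = vmailplan_pd[['cnt']].plot(kind='bar', facecolor='lightblue', legend=False)
ax.set_xticklabels(vmailplan_pd.iloc[:, 0].astype(str).values)   # first column holds the plan label
ax.set_title('Vmail Plans')
ax.set_xlabel('Voice Mail Plan')
ax.set_ylabel('Number of People')
plt.show()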
1380 | "\n", 1381 | "num_bins = 50\n", 1382 | "\n", 1383 | "fig, ax = plt.subplots()\n", 1384 | "\n", 1385 | "# the histogram of the data\n", 1386 | "n, bins, patches = ax.hist(x, num_bins, normed=1)\n", 1387 | "\n", 1388 | "# add a 'best fit' line\n", 1389 | "y = mlab.normpdf(bins, mu, sigma)\n", 1390 | "ax.plot(bins, y, '--')\n", 1391 | "ax.set_xlabel('Smarts')\n", 1392 | "ax.set_ylabel('Probability density')\n", 1393 | "ax.set_title(r'Histogram of IQ: $\\mu=100$, $\\sigma=15$')\n", 1394 | "\n", 1395 | "# Tweak spacing to prevent clipping of ylabel\n", 1396 | "fig.tight_layout()\n", 1397 | "plt.show()" 1398 | ] 1399 | }, 1400 | { 1401 | "cell_type": "code", 1402 | "execution_count": null, 1403 | "metadata": { 1404 | "collapsed": false 1405 | }, 1406 | "outputs": [], 1407 | "source": [ 1408 | "\"\"\"\n", 1409 | "hexbin is an axes method or pyplot function that is essentially\n", 1410 | "a pcolor of a 2-D histogram with hexagonal cells. It can be\n", 1411 | "much more informative than a scatter plot; in the first subplot\n", 1412 | "below, try substituting 'scatter' for 'hexbin'.\n", 1413 | "\"\"\"\n", 1414 | "\n", 1415 | "import numpy as np\n", 1416 | "import matplotlib.pyplot as plt\n", 1417 | "\n", 1418 | "np.random.seed(0)\n", 1419 | "n = 100000\n", 1420 | "x = np.random.standard_normal(n)\n", 1421 | "y = 2.0 + 3.0 * x + 4.0 * np.random.standard_normal(n)\n", 1422 | "xmin = x.min()\n", 1423 | "xmax = x.max()\n", 1424 | "ymin = y.min()\n", 1425 | "ymax = y.max()\n", 1426 | "\n", 1427 | "fig, axs = plt.subplots(ncols=2, sharey=True, figsize=(7, 4))\n", 1428 | "fig.subplots_adjust(hspace=0.5, left=0.07, right=0.93)\n", 1429 | "ax = axs[0]\n", 1430 | "hb = ax.hexbin(x, y, gridsize=50, cmap='inferno')\n", 1431 | "ax.axis([xmin, xmax, ymin, ymax])\n", 1432 | "ax.set_title(\"Hexagon binning\")\n", 1433 | "cb = fig.colorbar(hb, ax=ax)\n", 1434 | "cb.set_label('counts')\n", 1435 | "\n", 1436 | "ax = axs[1]\n", 1437 | "hb = ax.hexbin(x, y, gridsize=50, bins='log', cmap='inferno')\n", 1438 | "ax.axis([xmin, xmax, ymin, ymax])\n", 1439 | "ax.set_title(\"With a log color scale\")\n", 1440 | "cb = fig.colorbar(hb, ax=ax)\n", 1441 | "cb.set_label('log10(N)')\n", 1442 | "\n", 1443 | "plt.show()\n" 1444 | ] 1445 | }, 1446 | { 1447 | "cell_type": "code", 1448 | "execution_count": null, 1449 | "metadata": { 1450 | "collapsed": false, 1451 | "scrolled": false 1452 | }, 1453 | "outputs": [], 1454 | "source": [ 1455 | "\"\"\"\n", 1456 | "Simple demo of a scatter plot.\n", 1457 | "\"\"\"\n", 1458 | "import numpy as np\n", 1459 | "import matplotlib.pyplot as plt\n", 1460 | "\n", 1461 | "\n", 1462 | "N = 50\n", 1463 | "x = np.random.rand(N)\n", 1464 | "y = np.random.rand(N)\n", 1465 | "colors = np.random.rand(N)\n", 1466 | "area = np.pi * (15 * np.random.rand(N))**2 # 0 to 15 point radii\n", 1467 | "\n", 1468 | "plt.scatter(x, y, s=area, c=colors, alpha=0.5)\n", 1469 | "plt.show()\n" 1470 | ] 1471 | }, 1472 | { 1473 | "cell_type": "code", 1474 | "execution_count": null, 1475 | "metadata": { 1476 | "collapsed": false 1477 | }, 1478 | "outputs": [], 1479 | "source": [ 1480 | "import matplotlib.pyplot as plt\n", 1481 | "import numpy as np\n", 1482 | "\n", 1483 | "x = np.arange(0.0, 2, 0.01)\n", 1484 | "y1 = np.sin(2*np.pi*x)\n", 1485 | "y2 = 1.2*np.sin(4*np.pi*x)\n", 1486 | "\n", 1487 | "fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True)\n", 1488 | "\n", 1489 | "ax1.fill_between(x, 0, y1)\n", 1490 | "ax1.set_ylabel('between y1 and 0')\n", 1491 | "\n", 1492 | "ax2.fill_between(x, y1, 1)\n", 1493 | 
"ax2.set_ylabel('between y1 and 1')\n", 1494 | "\n", 1495 | "ax3.fill_between(x, y1, y2)\n", 1496 | "ax3.set_ylabel('between y1 and y2')\n", 1497 | "ax3.set_xlabel('x')\n", 1498 | "\n", 1499 | "# now fill between y1 and y2 where a logical condition is met. Note\n", 1500 | "# this is different than calling\n", 1501 | "# fill_between(x[where], y1[where],y2[where]\n", 1502 | "# because of edge effects over multiple contiguous regions.\n", 1503 | "fig, (ax, ax1) = plt.subplots(2, 1, sharex=True)\n", 1504 | "ax.plot(x, y1, x, y2, color='black')\n", 1505 | "ax.fill_between(x, y1, y2, where=y2 >= y1, facecolor='green', interpolate=True)\n", 1506 | "ax.fill_between(x, y1, y2, where=y2 <= y1, facecolor='red', interpolate=True)\n", 1507 | "ax.set_title('fill between where')\n", 1508 | "\n", 1509 | "# Test support for masked arrays.\n", 1510 | "y2 = np.ma.masked_greater(y2, 1.0)\n", 1511 | "ax1.plot(x, y1, x, y2, color='black')\n", 1512 | "ax1.fill_between(x, y1, y2, where=y2 >= y1, facecolor='green', interpolate=True)\n", 1513 | "ax1.fill_between(x, y1, y2, where=y2 <= y1, facecolor='red', interpolate=True)\n", 1514 | "ax1.set_title('Now regions with y2>1 are masked')\n", 1515 | "\n", 1516 | "# This example illustrates a problem; because of the data\n", 1517 | "# gridding, there are undesired unfilled triangles at the crossover\n", 1518 | "# points. A brute-force solution would be to interpolate all\n", 1519 | "# arrays to a very fine grid before plotting.\n", 1520 | "\n", 1521 | "# show how to use transforms to create axes spans where a certain condition is satisfied\n", 1522 | "fig, ax = plt.subplots()\n", 1523 | "y = np.sin(4*np.pi*x)\n", 1524 | "ax.plot(x, y, color='black')\n", 1525 | "\n", 1526 | "# use the data coordinates for the x-axis and the axes coordinates for the y-axis\n", 1527 | "import matplotlib.transforms as mtransforms\n", 1528 | "trans = mtransforms.blended_transform_factory(ax.transData, ax.transAxes)\n", 1529 | "theta = 0.9\n", 1530 | "ax.axhline(theta, color='green', lw=2, alpha=0.5)\n", 1531 | "ax.axhline(-theta, color='red', lw=2, alpha=0.5)\n", 1532 | "ax.fill_between(x, 0, 1, where=y > theta, facecolor='green', alpha=0.5, transform=trans)\n", 1533 | "ax.fill_between(x, 0, 1, where=y < -theta, facecolor='red', alpha=0.5, transform=trans)\n", 1534 | "\n", 1535 | "\n", 1536 | "plt.show()\n" 1537 | ] 1538 | }, 1539 | { 1540 | "cell_type": "code", 1541 | "execution_count": null, 1542 | "metadata": { 1543 | "collapsed": false 1544 | }, 1545 | "outputs": [], 1546 | "source": [ 1547 | "\"\"\"\n", 1548 | "Small demonstration of the hlines and vlines plots.\n", 1549 | "\"\"\"\n", 1550 | "\n", 1551 | "import matplotlib.pyplot as plt\n", 1552 | "import numpy as np\n", 1553 | "import numpy.random as rnd\n", 1554 | "\n", 1555 | "\n", 1556 | "def f(t):\n", 1557 | " s1 = np.sin(2 * np.pi * t --------------------------------------------------------------------------------