├── Chapter04
    ├── CallDetailRecord.java
    ├── ReadWriteParquet.scala
    ├── Example 4-4.r
    ├── Code-Snippets.scala
    ├── Example 4-1.scala
    ├── SparkSQLHiveIntegration.py
    ├── SparkSQLHiveIntegration.java
    ├── Example 4-3.py
    ├── Example 4-2.java
    ├── ReadWriteParquet.py
    ├── ReadWriteParquet.r
    ├── SparkSQLHiveIntegration.r
    ├── ReadWriteParquet.java
    └── RDDConversion.java
├── Chapter06
    ├── BuildingPipeline.scala
    ├── Example01.scala
    └── BuildingPipeline.py
├── Chapter07
    ├── BuildingGraph.scala
    ├── TerrorAnalytics-GraphFrames.scala
    ├── ConnectedComponents.scala
    └── BuildPageRank.scala
├── Chapter03
    ├── Example 03-09.py
    ├── Example 03-07.scala
    ├── Example 03-08.scala
    ├── Example 03-11.scala
    ├── Example 03-12.py
    ├── Example 03-01.scala
    ├── Example 03-03.scala
    ├── Example 03-02.py
    ├── Example 03-05.py
    ├── Example 03-10.java
    ├── Example 03-03.java
    └── Example 03-06.java
├── Chapter02
    ├── Example2-2.Py
    ├── Example2-1.scala
    ├── Example2-3.java
    ├── Example2-8.Py
    ├── Example2-11.Py
    ├── Example2-7.scala
    ├── Example2-10.scala
    ├── Example2-12.java
    ├── Example2-5.Py
    ├── Example2-9.java
    ├── Example2-18.scala
    ├── Example2-4.scala
    ├── Example2-14.Py
    ├── Example2-16.scala
    ├── Example2-13.scala
    ├── Example2-17.scala
    ├── Example2-15.java
    ├── Example2-6.java
    ├── Example2-20.scala
    └── Example2-19.scala
├── Chapter05
    └── StreamingWordCount.scala
├── DataSets
    ├── products.json
    └── cdrs.json
├── LICENSE
├── Chapter09
    └── BuildRecommendationEngine.scala
├── README.md
└── Chapter10
    └── ChurnPrediction.ipynb


/Chapter04/CallDetailRecord.java:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Chapter04/ReadWriteParquet.scala:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Chapter06/BuildingPipeline.scala:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Chapter07/BuildingGraph.scala:
--------------------------------------------------------------------------------
1 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-09.py:
--------------------------------------------------------------------------------
1 | testDS = spark.read.csv("/home/spark/sampledata/test.tsv",sep="\t")


--------------------------------------------------------------------------------
/Chapter03/Example 03-07.scala:
--------------------------------------------------------------------------------
1 | pricePaidDS = spark.write.format(“csv”).save("/home/spark/sampledata/price_paid_output”)


--------------------------------------------------------------------------------
/Chapter04/Example 4-4.r:
--------------------------------------------------------------------------------
1 | sparkR.session(appName = "MyApp", sparkConfig = list(spark.some.config.option = "some-value"))


--------------------------------------------------------------------------------
/Chapter02/Example2-2.Py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter02/Example2-2.Py


--------------------------------------------------------------------------------
/Chapter02/Example2-1.scala:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter02/Example2-1.scala


--------------------------------------------------------------------------------
/Chapter02/Example2-3.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter02/Example2-3.java


--------------------------------------------------------------------------------
/Chapter03/Example 03-08.scala:
--------------------------------------------------------------------------------
1 | val testDS = spark.read.format("csv").option("delimiter","\t").load("/home/spark/sampledata/test.tsv")


--------------------------------------------------------------------------------
/Chapter04/Code-Snippets.scala:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/Code-Snippets.scala


--------------------------------------------------------------------------------
/Chapter04/Example 4-1.scala:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/Example 4-1.scala


--------------------------------------------------------------------------------
/Chapter02/Example2-8.Py:
--------------------------------------------------------------------------------
1 | data = sc.parallelize( [1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20])
2 | data.sample(1,0.1,12345).collect()
3 | 


--------------------------------------------------------------------------------
/Chapter05/StreamingWordCount.scala:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter05/StreamingWordCount.scala


--------------------------------------------------------------------------------
/Chapter04/SparkSQLHiveIntegration.py:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/SparkSQLHiveIntegration.py


--------------------------------------------------------------------------------
/Chapter02/Example2-11.Py:
--------------------------------------------------------------------------------
1 | movieList = sc.parallelize(["A Nous Liberte","Airplane","The Apartment","The Apartment"])
2 | movieList.distinct().collect()
3 | 


--------------------------------------------------------------------------------
/Chapter04/SparkSQLHiveIntegration.java:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter04/SparkSQLHiveIntegration.java


--------------------------------------------------------------------------------
/Chapter02/Example2-7.scala:
--------------------------------------------------------------------------------
1 | val data = sc.parallelize( List(1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20));
2 | data.sample(true,0.1,12345).collect()
3 | 


--------------------------------------------------------------------------------
/Chapter07/TerrorAnalytics-GraphFrames.scala:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/PacktPublishing/Learning-Apache-Spark-2/HEAD/Chapter07/TerrorAnalytics-GraphFrames.scala


--------------------------------------------------------------------------------
/Chapter02/Example2-10.scala:
--------------------------------------------------------------------------------
1 | val movieList = sc.parallelize(List("A Nous Liberte","Airplane","The Apartment","The Apartment"))
2 | moviesList.distinct().collect()
3 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-12.java:
--------------------------------------------------------------------------------
1 | JavaRDD<String> movieList = sc.parallelize(Arrays.asList("A Nous Liberte","Airplane","The Apartment","The Apartment"));
2 | movieList.distinct().collect();


--------------------------------------------------------------------------------
/Chapter02/Example2-5.Py:
--------------------------------------------------------------------------------
1 | movies = sc.parallelize(["Pulp Fiction","Requiem for a dream","A clockwork Orange"])
2 | movies.flatMap(lambda movieTitle: movieTitle.split(" ")).collect()
3 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-9.java:
--------------------------------------------------------------------------------
1 | JavaRDD<Integer> nums = sc.parallelize(Arrays.asList( 1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20));
2 | nums.sample(true,0.1,12345).collect();
3 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-18.scala:
--------------------------------------------------------------------------------
1 | sc.parallelize(Seq(10, 4, 2, 12, 3)).takeOrdered(1)
2 | // returns Array(2)
3 | 
4 | sc.parallelize(Seq(2, 3, 4, 5, 6)).takeOrdered(2)
5 | // returns Array(2, 3)
6 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-4.scala:
--------------------------------------------------------------------------------
1 | val favMovies = sc.parallelize(List("Pulp Fiction","Requiem for a dream","A clockwork Orange"));
2 | movies.flatMap(movieTitle=>movieTitle.split(" ")).collect()
3 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-11.scala:
--------------------------------------------------------------------------------
1 | val data = sc.parallelize(List(("MyKey1","MyValue1"),("MyKey2","MyValue2"),("MyKey3","MyValue3")))
2 | 
3 | data.saveAsSequenceFile("/home/spark/sampledata/seq-example")


--------------------------------------------------------------------------------
/DataSets/products.json:
--------------------------------------------------------------------------------
1 | {"prodname":"iPhone", "model":"4s", "price":490}
2 | {"prodname":"Samsung", "model":"Galaxy Note 7", "desc":"Catches fire while charging"}
3 | {"prodname":"iPhone", "model":"7s", "description":"nothing changed"}
4 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-14.Py:
--------------------------------------------------------------------------------
1 | java_skills= sc.parallelize(["Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh"])
2 | db_skills= sc.parallelize(["James Kent","Paul Jones","Tom Mahoney","Adam Waugh"])
3 | java_skills.intersection(db_skills).collect()


--------------------------------------------------------------------------------
/Chapter04/Example 4-3.py:
--------------------------------------------------------------------------------
1 | from pyspark.sql import SparkSession
2 | 
3 | spark = SparkSession \
4 |     .builder \
5 |     .appName("Python Spark SQL basic example") \
6 |     .config("spark.some.config.option", "some-value") \
7 |     .getOrCreate()
8 | 
9 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-16.scala:
--------------------------------------------------------------------------------
1 | val java_skills=sc.parallelize(List("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh"))
2 | val db_skills= sc.parallelize(List("James Kent","Paul Jones","Tom Mahoney","Adam Waugh"))
3 | java_skills.union(db_skills).collect()
4 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-13.scala:
--------------------------------------------------------------------------------
1 | val java_skills=sc.parallelize(List("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh"))
2 | val db_skills= sc.parallelize(List("James Kent","Paul Jones","Tom Mahoney","Adam Waugh"))
3 | java_skills.intersection(db_skills).collect()
4 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-17.scala:
--------------------------------------------------------------------------------
1 | val java_skills=sc.parallelize(List("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh"))
2 | val db_skills= sc.parallelize(List("James Kent","Paul Jones","Tom Mahoney","Adam Waugh"))
3 | java_skills.subtract(db_skills).collect()
4 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-12.py:
--------------------------------------------------------------------------------
1 | data = sc.parallelize([("MyKey1","MyValue1"),("MyKey2","MyValue2"),("MyKey3","MyValue3")])
2 | 
3 | data.collect()
4 | [('MyKey1', 'MyValue1'), ('MyKey2', 'MyValue2'), ('MyKey3', 'MyValue3')]
5 | 
6 | data.saveAsSequenceFile("/home/spark/sampledata/seq-py-example")
7 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-01.scala:
--------------------------------------------------------------------------------
1 | //To read all README.md file
2 | val dataFile = sc.textFile("README.md")
3 | 
4 | //Split line to words, and flatten the result of each split
5 | val words = dataFile.flatMap(line => line.split(" "))
6 | //Save to textFile
7 | words.saveAsTextFile("/tmp/scalawords/")
8 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-03.scala:
--------------------------------------------------------------------------------
1 | //To read all README.md file
2 | val dataFile = sc.textFile("README.md")
3 | 
4 | //Split line to words, and flatten the result of each split
5 | val words = dataFile.flatMap(line => line.split(" "))
6 | //Save to textFile
7 | words.saveAsTextFile("/tmp/scalawords/")
8 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-15.java:
--------------------------------------------------------------------------------
1 | JavaRDD<String> javaSkills= sc.parallelize(Arrays.asList("Tom Mahoney","Alicia Whitekar","Paul Jones","Rodney Marsh"));
2 | JavaRDD<String> dbSkills= sc.parallelize(Arrays.asList("James Kent","Paul Jones","Tom Mahoney","Adam Waugh"));
3 | javaSkills.intersection(dbSkills).collect();


--------------------------------------------------------------------------------
/Chapter03/Example 03-02.py:
--------------------------------------------------------------------------------
 1 | 
 2 | //To read all README.md file
 3 | dataFile = sc.textFile("README.md")
 4 | 
 5 | //Split line to words, and flatten the result of each split
 6 | words = dataFile.flatMap(lambda line: line.split(" "))
 7 | 
 8 | //Save as TextFile
 9 | words.saveAsTextFile("/tmp/pythonwords/")
10 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-05.py:
--------------------------------------------------------------------------------
 1 | 
 2 | //To read all README.md file
 3 | dataFile = sc.textFile("README.md")
 4 | 
 5 | //Split line to words, and flatten the result of each split
 6 | words = dataFile.flatMap(lambda line: line.split(" "))
 7 | 
 8 | //Save as TextFile
 9 | words.saveAsTextFile("/tmp/pythonwords/")
10 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-10.java:
--------------------------------------------------------------------------------
1 | SparkSession spark = SparkSession.builder()
2 | 						.master("local")
3 | 						.appName("SparkCSVExample")
4 | 						.config("spark.some.config.option", "some-value")
5 | 						.getOrCreate();
6 |     	     
7 | Dataset<Row> pricePaidDS = spark.read().option("sep","\t").csv(fileName);
8 | 


--------------------------------------------------------------------------------
/Chapter04/Example 4-2.java:
--------------------------------------------------------------------------------
 1 | val transCount = transactions.cartesian(products).filter{
 2 |     case (TransProdId,ProdProdId) => TransProdId == ProdProdId
 3 | }
 4 |   .filter{case(TransProdId, ProdProdId) => ProdProdId = 3500
 5 | }
 6 | .map{
 7 |   case (TransProdId,ProdProdId) => TransProdId
 8 | }.count
 9 | 
10 | Println(transCount)
11 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-03.java:
--------------------------------------------------------------------------------
1 | //To read all README.md file
2 |   JavaRDD<String> dataFile = sc.textFile(fileName);
3 |   
4 | //Split line to words, and flatten the result of each split
5 | JavaRDD<String> words = dataFile.flatMap(line ->  Arrays.asList(line.split(" ")).iterator());
6 |   
7 | //Save as TextFile
8 | words.saveAsTextFile(outputFile);
9 | 


--------------------------------------------------------------------------------
/Chapter03/Example 03-06.java:
--------------------------------------------------------------------------------
1 | //To read all README.md file
2 |   JavaRDD<String> dataFile = sc.textFile(fileName);
3 |   
4 | //Split line to words, and flatten the result of each split
5 | JavaRDD<String> words = dataFile.flatMap(line ->  Arrays.asList(line.split(" ")).iterator());
6 |   
7 | //Save as TextFile
8 | words.saveAsTextFile(outputFile);
9 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-6.java:
--------------------------------------------------------------------------------
 1 | JavaRDD<String> movies = sc.parallelize
 2 | (Arrays.asList("Pulp Fiction","Requiem for a dream"
 3 | ,"A clockwork Orange")
 4 | );
 5 |     	
 6 | JavaRDD<String> movieName = movies.flatMap(
 7 |         new FlatMapFunction<String,String>(){
 8 |           public Iterator<String> call(String movie){
 9 |             return Arrays.asList(movie.split(" "))
10 |  .iterator();
11 |            }
12 |       }
13 | );
14 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-20.scala:
--------------------------------------------------------------------------------
 1 | val sampleData = sc.parallelize(Array(("k1",10),("k2",5),("k1",6),("k3",4),("k2",1),("k3",4)))
 2 | val sumCount = sampleData.combineByKey(value => (value,1), 
 3 | (valcntpair: (Int,Int), value) => (valcntpair._1 + value, valcntpair._2+1),
 4 | (valcntpair: (Int,Int), valcntpairnxt: (Int,Int)) => ((valcntpair._1 + valcntpairnxt._1),(valcntpair._2 + valcntpairnxt._2)))
 5 | 
 6 | sumCount.take(3)
 7 | val avgByKey = sumCount.map{case (label,value) => (label, value._1/value._2)}
 8 | avgByKey.take(3)
 9 | 
10 | 


--------------------------------------------------------------------------------
/Chapter02/Example2-19.scala:
--------------------------------------------------------------------------------
 1 | #Input Data
 2 | val storeSales = sc.parallelize(Array(("London", 23.4),("Manchester",19.8),("Leeds",14.7),("London",26.6)))
 3 | 
 4 | 
 5 | #GroupByKey
 6 | storeSales.groupByKey().map(location=>(location._1,location._2.sum)).collect()
 7 | 
 8 | #SampleResult
 9 | #res2: Array[(String, Double)] = Array((Manchester,19.8), (London,50.0), (Leeds,14.7))
10 | 
11 | #ReduceByKey
12 | storeSales.reduceByKey(_+_).collect()
13 | 
14 | #Sample Result
15 | #res1: Array[(String, Double)] = Array((Manchester,19.8), (London,50.0), (Leeds,14.7))
16 | 


--------------------------------------------------------------------------------
/Chapter04/ReadWriteParquet.py:
--------------------------------------------------------------------------------
 1 | #Reading a JSON file as a DataFrame
 2 | callDetailsDF = spark.read.json("/home/spark/sampledata/json/cdrs.json")
 3 | # Write the DataFrame out as a Parquet File
 4 | callDetailsDF.write.parquet("cdrs.parquet")
 5 | # Loading the Parquet File as a DataFrame
 6 | callDetailsParquetDF = spark.read.parquet("cdrs.parquet")
 7 | # Standard DataFrame data manipulation
 8 | callDetailsParquetDF.createOrReplaceTempView("calldetails")
 9 | topCallLocsDF = spark.sql("select Origin,Dest, count(*) as cnt from calldetails group by Origin,Dest  order by cnt desc")
10 | 


--------------------------------------------------------------------------------
/Chapter04/ReadWriteParquet.r:
--------------------------------------------------------------------------------
 1 | #Loading a JSON file as a DataFrame
 2 | callDetailsDF <- read.df("/home/spark/sampledata/json/cdrs.json","json")
 3 | #Writing the DataFrame out as a Parquet
 4 | write.parquet(callDetailsDF,"cdrs.parquet")
 5 | #Reading Parquet as a DataFrame
 6 | callDetailsParquetDF <- read.parquet("cdrs.parquet")
 7 | #Data Manipulation of Parquet Data
 8 | createOrReplaceTempView(callDetailsParquetDF,"parquetFile")
 9 | topCallLocsDF <- sql("select Origin,Dest, count(*) as cnt from calldetails group by Origin,Dest  order by cnt desc")
10 | head(topCallLocsDF)
11 | 


--------------------------------------------------------------------------------
/Chapter07/ConnectedComponents.scala:
--------------------------------------------------------------------------------
 1 | import org.apache.spark.graphx._
 2 | val cdrGraph = GraphLoader.edgeListFile(sc,"/home/spark/sampledata/graphx/cdrs.txt")
 3 | val connectedVertices = cdrGraph.connectedComponents().vertices
 4 | val usersList = sc.textFile("/home/spark/sampledata/graphx/usernames.csv").map{line =>
 5 | val fields = line.split(",")
 6 | (fields(0).trim().toLong, fields(1))
 7 | }
 8 | val connectedComponentsByUsers = usersList.join(connectedVertices).map {
 9 |   case (id, (username, cc)) => (username, cc)
10 | }
11 | println(connectedComponentsByUsers.collect().mkString("\n"))
12 | 


--------------------------------------------------------------------------------
/Chapter07/BuildPageRank.scala:
--------------------------------------------------------------------------------
 1 | import org.apache.spark.graphx.{Graph, VertexRDD, GraphLoader}
 2 | val cdrGraph = GraphLoader.edgeListFile(sc,"/home/spark/sampledata/graphx/cdrs.txt")
 3 | val influencers = cdrGraph.pageRank(0.0001).vertices
 4 | val usersList = sc.textFile("/home/spark/sampledata/graphx/usernames.csv").map{line =>
 5 | val fields = line.split(",")
 6 | (fields(0).trim().toLong, fields(1))
 7 | }
 8 | 
 9 | val ranksByUsername = usersList.join(influencers).map {
10 |   case (id, (username, userRank)) => (username, userRank)
11 | }
12 | println(ranksByUsername.collect().mkString("\n"))
13 | 
14 | 


--------------------------------------------------------------------------------
/Chapter04/SparkSQLHiveIntegration.r:
--------------------------------------------------------------------------------
 1 | # Creating Spark Session with hive Support
 2 | sparkR.session(enableHiveSupport=TRUE)
 3 | 
 4 | # Creating a table to hold CDRs
 5 | sql("CREATE TABLE IF NOT EXISTS cdrs(callingNumber STRING, calledNumber String, origin String, Dest String,CallDtTm String, callCharge Int) ROW FORMAT DELIMITED FIELDS TERMINATED BY ','")
 6 | 
 7 | # Loading data
 8 | sql("LOAD DATA LOCAL INPATH '/home/spark/sampledata/cdrs.csv' INTO table cdrs")
 9 | 
10 | # Finding top paired origin/destinations
11 | sql(" SELECT origin, dest, count(*) as cnt from cdrs group by origin, dest order by cnt desc LIMIT 5")
12 | 


--------------------------------------------------------------------------------
/Chapter04/ReadWriteParquet.java:
--------------------------------------------------------------------------------
 1 | #Loading a JSON file as a DataSet of Row objects
 2 | Dataset<Row> callDetailsDF = mySparkSession.read().json(fileName);
 3 | 
 4 | #Writing a Parquet File
 5 | callDetailsDF.write().parquet(parquetFileName);
 6 | 
 7 | #Reading a Parquet file of Dataset of Row objects
 8 | Dataset<Row> callDetailsParquetDF = mySparkSession.read().parquet(parquetFileName);
 9 | 
10 | #Parquet file data manipulation
11 | callDetailsParquetDF.createOrReplaceTempView("callDetails");
12 | Dataset<Row> topLocDF = mySparkSession.sql("select Origin,Dest, count(*) as cnt from calldetails group by Origin,Dest  order by cnt desc");
13 | topLocDF.show(5);
14 | 


--------------------------------------------------------------------------------
/Chapter06/Example01.scala:
--------------------------------------------------------------------------------
 1 | import org.apache.spark.ml.classification.LogisticRegression
 2 | import org.apache.spark.ml.linalg.{Vector,Vectors}
 3 | import org.apache.spark.ml.param.ParamMap
 4 | import org.apache.spark.ml.feature.{HashingTF, IDF, Tokenizer}
 5 | 
 6 | val textTokenizer = new Tokenizer()
 7 |   .setInputCol("corpus")
 8 |   .setOutputCol("tokenizedWords")
 9 | /* HashingTF and CountVectorized can be used to generate term frequencies. HashingTF utilizes that hashing trick and is a very fast and space-efficient way of turning arbitrary features into a vector or a matrix.
10 | */
11 |  
12 | val hashingTermFrequency = new HashingTF()
13 |   .setNumFeatures(1000)
14 |   .setInputCol(tokenizer.getOutputCol)
15 |   .setOutputCol("features")
16 | val logisticRegression = new LogisticRegression()
17 |   .setMaxIter(10)
18 |   .setRegParam(0.01)
19 | val pipeline = new Pipeline()
20 |   .setStages(Array(tokenizer, hashingTermFrequency, logisticRegression))
21 | val model = pipeline.fit(trainingDataset)
22 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2017 Packt
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/Chapter06/BuildingPipeline.py:
--------------------------------------------------------------------------------
 1 | from pyspark.ml import Pipeline
 2 | from pyspark.ml.classification import LogisticRegression
 3 | from pyspark.ml.feature import HashingTF, Tokenizer
 4 | 
 5 | 
 6 | # Create a dataframe using labelled data set
 7 | trainingDataSet = spark.createDataFrame([
 8 |     (0, "ronaldo zidane goals score ball studs", 1.0),
 9 |     (1, "obama trump clintons whitehouse policy inflation", 0.0),
10 |     (2, "corner penalty worldcup eurocup barcelona messie", 1.0),
11 |     (3, "hadoop mapreduce spark goal pig hive", 0.0)], ["documentId", "corpus", "label"])
12 | 
13 | # Configure an ML pipeline, which consists of three stages:
14 | # texttokenization, hashingTF, and logisticRegressionmodel.
15 | textTokenizer = Tokenizer(inputCol="corpus", outputCol="words")
16 | hashingTF = HashingTF(inputCol=textTokenizer.getOutputCol(), outputCol="features")
17 | logisticRegressionModel = LogisticRegression(maxIter=30, regParam=0.01)
18 | pipeline = Pipeline(stages=[textTokenizer, hashingTF, logisticRegressionModel])
19 | 
20 | # Fit the pipeline to training documents.
21 | #Returns a model which can then be used with other data sets for prediction.
22 | 
23 | model = pipeline.fit(trainingDataSet)
24 | 
25 | # Create a dataset which contains unlabelled documents of data
26 | testDataSet = spark.createDataFrame([
27 |     (4, "corner ball goal score" ),
28 |     (5, "sort hive optimzer columnar"),
29 |     (6, "ronaldo messie eurocup"),
30 |     (7, "database parquet orc avro")], ["documentId", "corpus"])
31 | 
32 | # Make predictions on test documents and print columns of interest from the predictions.
33 | prediction = model.transform(testDataSet)
34 | selectedColumns = prediction.select("documentId", "corpus", "prediction", "probability")
35 | for eachRow in selectedColumns.collect():
36 |     print(eachRow)
37 | 


--------------------------------------------------------------------------------
/Chapter09/BuildRecommendationEngine.scala:
--------------------------------------------------------------------------------
 1 | import org.apache.spark.ml.evaluation.RegressionEvaluator
 2 | import org.apache.spark.ml.recommendation.ALS
 3 | import org.apache.spark.sql._
 4 | 
 5 | case class Ratings(userId: Int, movieId: Int, rating: Double, ratingTs: Long)
 6 | val ratingsSchema = Encoders.product[Ratings].schema
 7 | case class Movies(moveId: Int, title: String, genre: String)
 8 | val moviesSchema = Encoders.product[Movies].schema
 9 | 
10 | val ratings = spark.read.option("header","true")
11 | .schema(ratingsSchema)
12 | .csv("hdfs://sparkmaster:8020/user/hdfs/sampledata/ratings.csv")
13 | 
14 | val movies = spark.read.option("header","true")
15 | .schema(moviesSchema)
16 |  .csv("hdfs://sparkmaster:8020/user/hdfs/sampledata/movies.csv")
17 | 
18 |  val Array(train, test) = ratings.randomSplit(Array(0.7, 0.3))
19 |  
20 |  val als = new ALS()
21 | .setMaxIter(15)
22 | .setRegParam(0.01)
23 | .setUserCol("userId")
24 | .setItemCol("movieId")
25 | .setRatingCol("rating")
26 | 
27 | val recommendationModel = als.fit(train)
28 | 
29 | val predictions = recommendationModel.transform(test)
30 | val ranks = List(1,2,3,4,5,6,7,8,9,10)
31 | val lambdas = List(0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.8,1,2,3,4,5,6,10.0)
32 | val regParams = List(0.01,0.05,0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.8,0.10,10)
33 | val numIters = List(5,10,15,20)
34 | var bestModel: Option[ALSModel] = None
35 | var optimalRMSE = Double.MaxValue
36 | var bestRank = 0
37 | var bestRegParam = -1.0
38 | var bestNumIter = -1
39 | 
40 | /*
41 |  * Iterative Computation - Find best Model
42 |  */
43 | for (rank <- ranks; regParam <- regParams; numIter <- numIters) {
44 | 	val als = new ALS().setMaxIter(numIter).setRank(rank).setRegParam(regParam).setUserCol("userId").setItemCol("movieId").setRatingCol("rating")
45 | 	val model = als.fit(train)
46 | 	val predictions = model.transform(valid)
47 | 	val currentRMSE = evaluator.evaluate(predictions.filter("prediction <> 'NaN'"))
48 | 	println("Metrics => RMSE (Validation) = " + currentRMSE + " : Model Metrics(rank = "+ rank + ", regParam = " + regParam + ", and numIter = " + numIter + ").")
49 | 	if (currentRMSE < optimalRMSE) {
50 | 	bestModel = Some(model)
51 | 	optimalRMSE = currentRMSE
52 | 	bestRank = rank
53 | 	bestRegParam = regParam
54 | 	bestNumIter = numIter
55 | 	}
56 | }
57 | 
58 | 
59 | val als = new ALS()
60 | .setMaxIter(15)
61 | .setRegParam(0.01)
62 | .setImplicitPrefs(true)
63 | .setUserCol("userId")
64 | .setItemCol("movieId")
65 | 
66 | 


--------------------------------------------------------------------------------
/Chapter04/RDDConversion.java:
--------------------------------------------------------------------------------
 1 | package org.packtpub;
 2 | 
 3 | import java.io.Serializable;
 4 | import java.util.ArrayList;
 5 | import java.util.Arrays;
 6 | import java.util.Iterator;
 7 | import java.util.List;
 8 | 
 9 | import org.apache.hadoop.io.Text;
10 | import org.apache.hadoop.mapred.SequenceFileOutputFormat;
11 | import org.apache.spark.SparkConf;
12 | import org.apache.spark.SparkContext;
13 | import org.apache.spark.api.java.JavaPairRDD;
14 | import org.apache.spark.api.java.JavaRDD;
15 | import org.apache.spark.api.java.JavaSparkContext;
16 | import org.apache.spark.api.java.function.FilterFunction;
17 | import org.apache.spark.api.java.function.FlatMapFunction;
18 | import org.apache.spark.api.java.function.Function;
19 | import org.apache.spark.api.java.function.Function2;
20 | import org.apache.spark.api.java.function.PairFunction;
21 | import org.apache.spark.sql.*;
22 | 
23 | import static org.apache.spark.sql.functions.col;
24 | 
25 | import org.apache.spark.sql.functions;
26 | 
27 | import scala.Tuple2;
28 | import scala.collection.Iterable;
29 | 
30 | import org.apache.spark.sql.Dataset;
31 | import org.apache.spark.sql.Row;
32 | import org.apache.spark.sql.SparkSession;
33 | 
34 | 
35 | public class RDDConversion 
36 | {
37 | 
38 |  public static void main( String[] args )
39 |     {
40 |     	RDDConversion app = new RDDConversion();
41 |     	System.setProperty("hadoop.home.dir", "C:/spark/spark-2.0.0/");
42 |     	String sparkWarehouseDir = "/home/spark/spark-warehouse";
43 |     	String fileName = args[0];
44 |     	
45 |     	SparkConf conf = new SparkConf().setAppName("RDDConversion").setMaster("local[*]");
46 |     	JavaSparkContext sc = new JavaSparkContext(conf);
47 |    
48 |     	SparkSession mySparkSession = SparkSession.builder()
49 | 				.master("local")
50 | 				.appName("Java Spark-SQL Hive Integration ")
51 | 				.enableHiveSupport()
52 | 				.config("spark.sql.warehouse.dir", sparkWarehouseDir)
53 | 				.getOrCreate();
54 | 
55 |     	JavaRDD<String> dataFile = sc.textFile(fileName);
56 |     	
57 |     	JavaRDD<CallDetailRecord> cdr = dataFile.map(new Function<String,CallDetailRecord>(){
58 |     		public CallDetailRecord call(String line) throws Exception{
59 |     			String[] parts = line.split(",");
60 |     			CallDetailRecord cdr = new CallDetailRecord();
61 |     			cdr.setOriginNumber(parts[0]);
62 |     			cdr.setTermNumber(parts[1]);
63 |     			cdr.setOrigin(parts[2]);
64 |     			cdr.setTermDest(parts[3]);
65 |     			cdr.setDateTime(parts[4]);
66 |     			cdr.setCallCharges(Long.parseLong(parts[5]));
67 |     			return cdr;
68 |     		}
69 |     	});
70 |     	
71 |     	
72 |     	Dataset<Row> cdrDataFrame = mySparkSession.createDataFrame(cdr, CallDetailRecord.class);
73 |     	cdrDataFrame.show();
74 |     	
75 |        	
76 |     }
77 |     
78 |     
79 |     
80 | }
81 | 
82 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | 
 2 | 
 3 | 
 4 | # Learning Apache Spark 2
 5 | This is the code repository for [Learning Apache Spark 2](https://www.packtpub.com/big-data-and-business-intelligence/learning-apache-spark-2?utm_source=github&utm_medium=repository&utm_campaign=9781785885136), published by [Packt](https://www.packtpub.com/?utm_source=github). It contains all the supporting project files necessary to work through the book from start to finish.
 6 | 
 7 | ## About the Book
 8 | Spark juggernaut keeps on rolling and getting more and more momentum each day. Spark provides key capabilities in the form of Spark SQL, Spark Streaming, Spark ML and Graph X all accessible via Java, Scala, Python and R. Deploying the key capabilities is crucial whether it is on a Standalone framework or as a part of existing Hadoop installation and configuring with Yarn and Mesos.
 9 | 
10 | The next part of the journey after installation is using key components, APIs, Clustering, machine learning APIs, data pipelines, parallel programming. It is important to understand why each framework component is key, how widely it is being used, its stability and pertinent use cases.
11 | 
12 | 
13 | ## Instructions and Navigation
14 | All of the code is organized into folders. Each folder starts with a number followed by the application name. For example, Chapter02.
15 | 
16 | Code bundle contains a DataSet folder for sample data used.
17 | 
18 | The code will look like the following:
19 | 
20 | When we wish to draw your attention to a particular part of a code block, the relevant lines
21 | or items are set in bold:
22 | ```
23 | [default]
24 | exten => s,1,Dial(Zap/1|30)
25 | exten => s,2,Voicemail(u100)
26 | exten => s,102,Voicemail(b100)
27 | exten => i,1,Voicemail(s0)
28 | ```
29 | 
30 | You will need Spark 2.0, which you can download from Apache Spark website. We have used few different configurations, but you can essentially run most of these examples inside a virtual machine with 4-8GB of RAM, and 10 GB of available disk space.
31 | 
32 | ## Related Products
33 | * [Mastering Apache Spark 2.0 - Second Edition](https://www.packtpub.com/big-data-and-business-intelligence/mastering-apache-spark-20-second-edition?utm_source=github&utm_medium=repository&utm_campaign=9781786462749)
34 | 
35 | * [Apache Spark 2 for Beginners](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-2-beginners?utm_source=github&utm_medium=repository&utm_campaign=9781785885006)
36 | 
37 | * [Apache Spark Machine Learning Cookbook](https://www.packtpub.com/big-data-and-business-intelligence/apache-spark-machine-learning-cookbook?utm_source=github&utm_medium=repository&utm_campaign=9781783551606)
38 | 
39 | ### Suggestions and Feedback
40 | [Click here](https://docs.google.com/forms/d/e/1FAIpQLSe5qwunkGf6PUvzPirPDtuy1Du5Rlzew23UBp2S-P3wB-GcwQ/viewform) if you have any feedback or suggestions.
41 | ### Download a free PDF
42 | 
43 |  <i>If you have already purchased a print or Kindle version of this book, you can get a DRM-free PDF version at no cost.<br>Simply click on the link to claim your free PDF.</i>
44 | <p align="center"> <a href="https://packt.link/free-ebook/9781785885136">https://packt.link/free-ebook/9781785885136 </a> </p>


--------------------------------------------------------------------------------
/DataSets/cdrs.json:
--------------------------------------------------------------------------------
  1 | {"OriginatingNum": 797308107, "TerminatingNum": 797131221, "Origin": "London", "Dest": "Birmingham","DateTime": "02/11/2016 01:51:41", "CallCharge": 549}
  2 | {"OriginatingNum": 777121117, "TerminatingNum": 777440392, "Origin": "Manchester","Dest": "London", "DateTime": "05/02/2016 01:26:54", "CallCharge": 2645}
  3 | {"OriginatingNum": 797009202, "TerminatingNum": 784243404, "Origin": "Victoria", "Dest": "Manchester","DateTime": "01/12/2016 21:12:54","CallCharge": 1233}
  4 | { "OriginatingNum": 777557705, "TerminatingNum": 798420467, "Origin": "Twickenham", "Dest": "Victoria", "DateTime": "07/11/2016 01:07:34", "CallCharge": 2651}
  5 | {"OriginatingNum": 785434022, "TerminatingNum": 779086250, "Origin": "Leeds", "Dest": "Scotland", "DateTime": "02/11/2016 22:22:26", "CallCharge": 3162}
  6 | {"OriginatingNum": 779716202, "TerminatingNum": 795137353, "Origin": "Bradford", "Dest": "Virginia Water", "DateTime": "05/01/2016 20:12:35", "CallCharge": 2246}
  7 | {"OriginatingNum": 775490102, "TerminatingNum": 775019605, "Origin": "Yorkshire", "Dest": "Ascot", "DateTime": "04/12/2016 23:53:52", "CallCharge": 571}
  8 | {"OriginatingNum": 787581376, "TerminatingNum": 797043387, "Origin": "Birmingham", "Dest": "Bracknell", "DateTime": "06/11/2016 20:31:49", "CallCharge": 3291}
  9 | {"OriginatingNum": 789231956, "TerminatingNum": 787649491, "Origin": "Coventary", "Dest": "Bradford", "DateTime": "03/12/2016 12:15:17", "CallCharge": 2270}
 10 | {"OriginatingNum": 785969980, "TerminatingNum": 789993090, "Origin": "Wales", "Dest": "Yorkshire", "DateTime": "06/02/2016 20:57:44", "CallCharge": 3420}
 11 | {"OriginatingNum": 797662091, "TerminatingNum": 777765510, "Origin": "Scotland", "Dest": "Birmingham", "DateTime": "02/01/2016 02:44:27", "CallCharge": 3084}
 12 | {"OriginatingNum": 784036802, "TerminatingNum": 798095485, "Origin": "Virginia Water", "Dest": "Marlow", "DateTime": "09/01/2016 00:48:43", "CallCharge": 3037}
 13 | {"OriginatingNum": 785160169, "TerminatingNum": 797922170, "Origin": "Ascot", "Dest": "Sunningdale", "DateTime": "08/11/2016 20:19:19", "CallCharge": 3011}
 14 | {"OriginatingNum": 789519210, "TerminatingNum": 774080821, "Origin": "Bracknell", "Dest": "Lords", "DateTime": "05/01/2016 11:24:28", "CallCharge": 1018}
 15 | {"OriginatingNum": 775617249, "TerminatingNum": 786549418, "Origin": "Marlow", "Dest": "Oval", "DateTime": "02/12/2016 02:07:09", "CallCharge": 771}
 16 | {"OriginatingNum": 797932062, "TerminatingNum": 788292522, "Origin": "Sunningdale", "Dest": "Coventary", "DateTime": "07/11/2016 03:43:23", "CallCharge": 3585}
 17 | {"OriginatingNum": 777561966, "TerminatingNum": 788455450, "Origin": "Lords", "Dest": "Wales", "DateTime": "06/01/2016 23:08:06", "CallCharge": 908}
 18 | {"OriginatingNum": 777508024, "TerminatingNum": 789954417, "Origin": "Oval", "Dest": "Scotland", "DateTime": "04/12/2016 24:17:54", "CallCharge": 95}
 19 | {"OriginatingNum": 777087537, "TerminatingNum": 778710691, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "03/11/2016 00:45:24", "CallCharge": 2754}
 20 | {"OriginatingNum": 774688108, "TerminatingNum": 797626213, "Origin": "London", "Dest": "Coventary", "DateTime": "03/01/2016 03:11:03", "CallCharge": 1327}
 21 | {"OriginatingNum": 778449580, "TerminatingNum": 778385762, "Origin": "Manchester", "Dest": "Wales", "DateTime": "04/02/2016 14:59:06", "CallCharge": 3264}
 22 | {"OriginatingNum": 788790859, "TerminatingNum": 776121867, "Origin": "Victoria", "Dest": "Scotland", "DateTime": "09/12/2016 11:05:23", "CallCharge": 1608}
 23 | {"OriginatingNum": 785376620, "TerminatingNum": 798020898, "Origin": "Scotland", "Dest": "Virginia Water", "DateTime": "03/02/2016 04:31:45", "CallCharge": 77}
 24 | {"OriginatingNum": 774388678, "TerminatingNum": 786552782, "Origin": "Virginia Water", "Dest": "Ascot", "DateTime": "02/01/2016 02:26:31", "CallCharge": 1757}
 25 | {"OriginatingNum": 796640229, "TerminatingNum": 786558349, "Origin": "Ascot", "Dest": "Bracknell", "DateTime": "04/11/2016 01:01:39", "CallCharge": 3421}
 26 | {"OriginatingNum": 776397451, "TerminatingNum": 777278274, "Origin": "Bracknell", "Dest": "Leeds", "DateTime": "05/02/2016 22:07:21", "CallCharge": 2922}
 27 | {"OriginatingNum": 787426686, "TerminatingNum": 774001818, "Origin": "Bradford", "Dest": "Bradford", "DateTime": "08/02/2016 01:31:53", "CallCharge": 996}
 28 | {"OriginatingNum": 774853589, "TerminatingNum": 778226530, "Origin": "Yorkshire", "Dest": "Yorkshire", "DateTime": "07/11/2016 01:15:44", "CallCharge": 2229}
 29 | {"OriginatingNum": 798516272, "TerminatingNum": 798192751, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "06/01/2016 20:47:23", "CallCharge": 2314}
 30 | {"OriginatingNum": 794956011, "TerminatingNum": 798595444, "Origin": "Marlow", "Dest": "Coventary", "DateTime": "09/01/2016 10:32:12", "CallCharge": 1137}
 31 | {"OriginatingNum": 788499476, "TerminatingNum": 799514066, "Origin": "Sunningdale", "Dest": "Wales", "DateTime": "09/01/2016 14:13:41", "CallCharge": 1538}
 32 | {"OriginatingNum": 778956877, "TerminatingNum": 787972481, "Origin": "Lords", "Dest": "Virginia Water", "DateTime": "02/01/2016 12:13:38", "CallCharge": 75}
 33 | {"OriginatingNum": 784133953, "TerminatingNum": 777082964, "Origin": "Oval", "Dest": "Ascot", "DateTime": "02/01/2016 01:04:08", "CallCharge": 1415}
 34 | {"OriginatingNum": 787077525, "TerminatingNum": 789876379, "Origin": "Coventary", "Dest": "Bracknell", "DateTime": "06/11/2016 22:57:41", "CallCharge": 1061}
 35 | {"OriginatingNum": 784627303, "TerminatingNum": 776663366, "Origin": "Wales", "Dest": "Bradford", "DateTime": "04/01/2016 11:29:33", "CallCharge": 2291}
 36 | {"OriginatingNum": 774188291, "TerminatingNum": 794732083, "Origin": "Scotland", "Dest": "Yorkshire", "DateTime": "08/12/2016 12:37:41", "CallCharge": 3391}
 37 | {"OriginatingNum": 784126576, "TerminatingNum": 787520608, "Origin": "Birmingham", "Dest": "London", "DateTime": "05/01/2016 23:57:59", "CallCharge": 1740}
 38 | {"OriginatingNum": 775584064, "TerminatingNum": 795017614, "Origin": "London", "Dest": "Manchester", "DateTime": "04/01/2016 01:19:28", "CallCharge": 2940}
 39 | {"OriginatingNum": 774279853, "TerminatingNum": 787470510, "Origin": "Manchester", "Dest": "Victoria", "DateTime": "09/02/2016 02:47:23", "CallCharge": 283}
 40 | {"OriginatingNum": 776629283, "TerminatingNum": 784050637, "Origin": "Victoria", "Dest": "Twickenham", "DateTime": "06/01/2016 23:09:52", "CallCharge": 426}
 41 | {"OriginatingNum": 796071020, "TerminatingNum": 796928746, "Origin": "Scotland", "Dest": "Leeds", "DateTime": "02/01/2016 20:15:52", "CallCharge": 1300}
 42 | {"OriginatingNum": 778529801, "TerminatingNum": 799123703, "Origin": "Virginia Water", "Dest": "Bradford", "DateTime": "03/01/2016 21:23:05", "CallCharge": 85}
 43 | {"OriginatingNum": 779318091, "TerminatingNum": 777545543, "Origin": "Ascot", "Dest": "Yorkshire", "DateTime": "09/12/2016 20:38:39", "CallCharge": 2198}
 44 | {"OriginatingNum": 779785134, "TerminatingNum": 796559835, "Origin": "Bracknell", "Dest": "Birmingham", "DateTime": "06/01/2016 20:55:09", "CallCharge": 2551}
 45 | {"OriginatingNum": 777388057, "TerminatingNum": 796373853, "Origin": "Bradford", "Dest": "Coventary", "DateTime": "03/02/2016 03:27:54", "CallCharge": 2424}
 46 | {"OriginatingNum": 784410639, "TerminatingNum": 785309669, "Origin": "Yorkshire", "Dest": "Wales", "DateTime": "07/02/2016 23:44:17", "CallCharge": 652}
 47 | {"OriginatingNum": 779039353, "TerminatingNum": 788576202, "Origin": "Birmingham", "Dest": "Scotland", "DateTime": "07/02/2016 04:49:12", "CallCharge": 534}
 48 | {"OriginatingNum": 774918134, "TerminatingNum": 784624246, "Origin": "Marlow", "Dest": "Virginia Water", "DateTime": "08/01/2016 03:49:48", "CallCharge": 3469}
 49 | {"OriginatingNum": 785848242, "TerminatingNum": 795801932, "Origin": "Sunningdale", "Dest": "Ascot", "DateTime": "09/11/2016 12:53:14", "CallCharge": 2627}
 50 | {"OriginatingNum": 785339539, "TerminatingNum": 776854945, "Origin": "Lords", "Dest": "Bracknell", "DateTime": "02/02/2016 00:35:53", "CallCharge": 3204}
 51 | {"OriginatingNum": 777047486, "TerminatingNum": 786699071, "Origin": "Oval", "Dest": "Marlow", "DateTime": "03/12/2016 11:52:09", "CallCharge": 2367}
 52 | {"OriginatingNum": 788971265, "TerminatingNum": 785113136, "Origin": "Coventary", "Dest": "Sunningdale", "DateTime": "03/02/2016 00:59:44", "CallCharge": 3040}
 53 | {"OriginatingNum": 788706239, "TerminatingNum": 799121170, "Origin": "Wales", "Dest": "Lords", "DateTime": "09/01/2016 20:52:48", "CallCharge": 3052}
 54 | {"OriginatingNum": 784930367, "TerminatingNum": 799779480, "Origin": "Scotland", "Dest": "Oval", "DateTime": "07/02/2016 14:24:19", "CallCharge": 433}
 55 | {"OriginatingNum": 786736111, "TerminatingNum": 778668124, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "07/11/2016 23:49:07", "CallCharge": 2861}
 56 | {"OriginatingNum": 774996036, "TerminatingNum": 776453220, "Origin": "Coventary", "Dest": "London", "DateTime": "07/11/2016 20:21:48", "CallCharge": 1896}
 57 | {"OriginatingNum": 798738693, "TerminatingNum": 794306028, "Origin": "Wales", "Dest": "Manchester", "DateTime": "03/11/2016 23:27:33", "CallCharge": 52}
 58 | {"OriginatingNum": 776797164, "TerminatingNum": 797585202, "Origin": "Scotland", "Dest": "Victoria", "DateTime": "04/01/2016 14:05:43", "CallCharge": 2302}
 59 | {"OriginatingNum": 778899643, "TerminatingNum": 779991962, "Origin": "Virginia Water", "Dest": "Scotland", "DateTime": "07/02/2016 23:53:15", "CallCharge": 904}
 60 | {"OriginatingNum": 795342792, "TerminatingNum": 789582658, "Origin": "Ascot", "Dest": "Virginia Water", "DateTime": "03/01/2016 10:25:34", "CallCharge": 125}
 61 | {"OriginatingNum": 786463203, "TerminatingNum": 787079843, "Origin": "Bracknell", "Dest": "Ascot", "DateTime": "04/12/2016 11:26:52", "CallCharge": 881}
 62 | {"OriginatingNum": 775479982, "TerminatingNum": 787185946, "Origin": "Leeds", "Dest": "Bracknell", "DateTime": "04/02/2016 24:13:49", "CallCharge": 1785}
 63 | {"OriginatingNum": 795407096, "TerminatingNum": 794127828, "Origin": "Bradford", "Dest": "Bradford", "DateTime": "04/01/2016 01:02:41", "CallCharge": 2348}
 64 | {"OriginatingNum": 799950372, "TerminatingNum": 779479868, "Origin": "Yorkshire", "Dest": "Yorkshire", "DateTime": "06/02/2016 03:06:32", "CallCharge": 2330}
 65 | {"OriginatingNum": 779443671, "TerminatingNum": 789039212, "Origin": "Birmingham", "Dest": "Birmingham", "DateTime": "04/11/2016 12:28:44", "CallCharge": 971}
 66 | {"OriginatingNum": 776078153, "TerminatingNum": 777623079, "Origin": "Coventary", "Dest": "Marlow", "DateTime": "04/01/2016 11:05:49", "CallCharge": 222}
 67 | {"OriginatingNum": 778439584, "TerminatingNum": 794809988, "Origin": "Wales", "Dest": "Sunningdale", "DateTime": "09/02/2016 02:06:07", "CallCharge": 732}
 68 | {"OriginatingNum": 776239910, "TerminatingNum": 779831334, "Origin": "Virginia Water", "Dest": "Lords", "DateTime": "06/12/2016 03:12:04", "CallCharge": 2807}
 69 | {"OriginatingNum": 788661014, "TerminatingNum": 787991820, "Origin": "Ascot", "Dest": "Oval", "DateTime": "01/02/2016 03:07:33", "CallCharge": 619}
 70 | {"OriginatingNum": 774298657, "TerminatingNum": 786941620, "Origin": "Bracknell", "Dest": "Coventary", "DateTime": "06/01/2016 20:17:59", "CallCharge": 3596}
 71 | {"OriginatingNum": 796628071, "TerminatingNum": 785604207, "Origin": "Bradford", "Dest": "Wales", "DateTime": "01/12/2016 01:58:37", "CallCharge": 972}
 72 | {"OriginatingNum": 777994634, "TerminatingNum": 774850412, "Origin": "Yorkshire", "Dest": "Scotland", "DateTime": "05/12/2016 11:22:44", "CallCharge": 3231}
 73 | {"OriginatingNum": 785903192, "TerminatingNum": 776226916, "Origin": "Birmingham", "Dest": "Ascot", "DateTime": "05/11/2016 04:38:07", "CallCharge": 2074}
 74 | {"OriginatingNum": 778324460, "TerminatingNum": 785392423, "Origin": "Coventary", "Dest": "Bracknell", "DateTime": "09/12/2016 22:26:41", "CallCharge": 2159}
 75 | {"OriginatingNum": 774524318, "TerminatingNum": 779608295, "Origin": "Wales", "Dest": "Bradford", "DateTime": "02/01/2016 23:42:22", "CallCharge": 2417}
 76 | {"OriginatingNum": 794183103, "TerminatingNum": 786955937, "Origin": "Virginia Water", "Dest": "Yorkshire", "DateTime": "08/01/2016 12:48:19", "CallCharge": 1083}
 77 | {"OriginatingNum": 787471976, "TerminatingNum": 787033256, "Origin": "Ascot", "Dest": "Birmingham", "DateTime": "08/02/2016 00:48:52", "CallCharge": 8}
 78 | {"OriginatingNum": 777026835, "TerminatingNum": 788186797, "Origin": "Bracknell", "Dest": "Marlow", "DateTime": "06/12/2016 10:25:33", "CallCharge": 3461}
 79 | {"OriginatingNum": 777189678, "TerminatingNum": 785852073, "Origin": "Bradford", "Dest": "Sunningdale", "DateTime": "08/02/2016 02:57:09", "CallCharge": 3189}
 80 | {"OriginatingNum": 779246405, "TerminatingNum": 794067417, "Origin": "Yorkshire", "Dest": "Lords", "DateTime": "02/11/2016 02:13:49", "CallCharge": 8}
 81 | {"OriginatingNum": 794018876, "TerminatingNum": 776154503, "Origin": "London", "Dest": "Oval", "DateTime": "03/11/2016 01:41:37", "CallCharge": 1780}
 82 | {"OriginatingNum": 788662914, "TerminatingNum": 796324299, "Origin": "Manchester", "Dest": "Coventary", "DateTime": "05/11/2016 10:19:36", "CallCharge": 2828}
 83 | {"OriginatingNum": 794395044, "TerminatingNum": 776172226, "Origin": "Victoria", "Dest": "Wales", "DateTime": "01/11/2016 12:33:57", "CallCharge": 2600}
 84 | {"OriginatingNum": 794810223, "TerminatingNum": 776407350, "Origin": "Twickenham", "Dest": "Scotland", "DateTime": "03/11/2016 00:57:04", "CallCharge": 2633}
 85 | {"OriginatingNum": 794746359, "TerminatingNum": 784890101, "Origin": "Leeds", "Dest": "Birmingham", "DateTime": "06/12/2016 12:32:58", "CallCharge": 2452}
 86 | {"OriginatingNum": 799339230, "TerminatingNum": 798748393, "Origin": "Bradford", "Dest": "Coventary", "DateTime": "05/11/2016 00:34:24", "CallCharge": 1049}
 87 | {"OriginatingNum": 789140678, "TerminatingNum": 796878575, "Origin": "Yorkshire", "Dest": "Wales", "DateTime": "06/02/2016 01:02:22", "CallCharge": 1466}
 88 | {"OriginatingNum": 778688915, "TerminatingNum": 779779543, "Origin": "Birmingham", "Dest": "Scotland", "DateTime": "06/11/2016 22:55:07", "CallCharge": 2973}
 89 | {"OriginatingNum": 784353531, "TerminatingNum": 788414958, "Origin": "Coventary", "Dest": "Virginia Water", "DateTime": "06/02/2016 03:28:04", "CallCharge": 2159}
 90 | {"OriginatingNum": 778998492, "TerminatingNum": 798635474, "Origin": "Wales", "Dest": "Ascot", "DateTime": "01/11/2016 24:31:49", "CallCharge": 2385}
 91 | {"OriginatingNum": 795058836, "TerminatingNum": 774990397, "Origin": "Scotland", "Dest": "Bracknell", "DateTime": "06/12/2016 04:59:11", "CallCharge": 62}
 92 | {"OriginatingNum": 779353189, "TerminatingNum": 796636314, "Origin": "Virginia Water", "Dest": "Leeds", "DateTime": "08/12/2016 10:12:28", "CallCharge": 3405}
 93 | {"OriginatingNum": 788495142, "TerminatingNum": 795064948, "Origin": "Ascot", "Dest": "Bradford", "DateTime": "05/12/2016 01:37:09", "CallCharge": 2686}
 94 | {"OriginatingNum": 788569039, "TerminatingNum": 788719136, "Origin": "Bracknell", "Dest": "Yorkshire", "DateTime": "09/02/2016 21:01:12", "CallCharge": 2319}
 95 | {"OriginatingNum": 775435510, "TerminatingNum": 788161474, "Origin": "Marlow", "Dest": "Birmingham", "DateTime": "09/01/2016 21:09:09", "CallCharge": 2180}
 96 | {"OriginatingNum": 776019794, "TerminatingNum": 776909199, "Origin": "Sunningdale", "Dest": "Coventary", "DateTime": "09/12/2016 12:16:59", "CallCharge": 3240}
 97 | {"OriginatingNum": 794751801, "TerminatingNum": 774122416, "Origin": "Lords", "Dest": "Wales", "DateTime": "07/01/2016 24:28:39", "CallCharge": 1011}
 98 | {"OriginatingNum": 798526356, "TerminatingNum": 784989061, "Origin": "Oval", "Dest": "Virginia Water", "DateTime": "09/11/2016 10:43:55", "CallCharge": 1771}
 99 | {"OriginatingNum": 778894206, "TerminatingNum": 775901576, "Origin": "Birmingham", "Dest": "Ascot", "DateTime": "03/01/2016 23:14:55", "CallCharge": 951}
100 | {"OriginatingNum": 779461846, "TerminatingNum": 776103392, "Origin": "London", "Dest": "Bracknell", "DateTime": "08/02/2016 14:47:38", "CallCharge": 420 }


--------------------------------------------------------------------------------
/Chapter10/ChurnPrediction.ipynb:
--------------------------------------------------------------------------------
   1 | {
   2 |  "cells": [
   3 |   {
   4 |    "cell_type": "markdown",
   5 |    "metadata": {
   6 |     "collapsed": false
   7 |    },
   8 |    "source": [
   9 |     "CHURN PREDICTION - TELECOM DATA SET\n",
  10 |     "===================================="
  11 |    ]
  12 |   },
  13 |   {
  14 |    "cell_type": "markdown",
  15 |    "metadata": {
  16 |     "collapsed": false
  17 |    },
  18 |    "source": [
  19 |     "We are going to Load the churners data. The data is available at the following location.\n",
  20 |     "https://raw.githubusercontent.com/EricChiang/churn/master/data/churn.csv    "
  21 |    ]
  22 |   },
  23 |   {
  24 |    "cell_type": "code",
  25 |    "execution_count": 1,
  26 |    "metadata": {
  27 |     "collapsed": true
  28 |    },
  29 |    "outputs": [],
  30 |    "source": [
  31 |     "from pyspark.sql import SparkSession\n",
  32 |     "from pyspark.ml.classification import DecisionTreeClassifier, LogisticRegression, RandomForestClassifier\n",
  33 |     "from pyspark.ml import Pipeline\n",
  34 |     "from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler\n",
  35 |     "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n",
  36 |     "from pyspark.ml.tuning import ParamGridBuilder, CrossValidator\n",
  37 |     "import time"
  38 |    ]
  39 |   },
  40 |   {
  41 |    "cell_type": "markdown",
  42 |    "metadata": {},
  43 |    "source": [
  44 |     "If you look at the column names the are not the best names, as their is no consistency. \n",
  45 |     "The entire data set has been loaded as String and in addition to that the names have spaces too. We need to make sure\n",
  46 |     "we can define a proper schema for this dataset."
  47 |    ]
  48 |   },
  49 |   {
  50 |    "cell_type": "code",
  51 |    "execution_count": 2,
  52 |    "metadata": {
  53 |     "collapsed": false
  54 |    },
  55 |    "outputs": [],
  56 |    "source": [
  57 |     "from pyspark.sql.types import *\n",
  58 |     "schemaString = \"STATE,ACCOUNTLENGTH,AREACODE,PHONE,INTLPLAN,VMAILPLAN,VMAILMESSAGE,DAYMINS,DAYCALLS,DAYCHARGE,EVEMINS,EVECALLS,EVECHARGE,NIGHTMINS,NIGHTCALLS,NIGHTCHARGE,INTLMINS,INTLCALLS,INTLCHARGE,CUSTSERVCALLS,CHURN\"\n",
  59 |     "fields = [StructField(field_name, StringType(), True) for field_name in schemaString.split(\",\")]\n",
  60 |     "churnSchema = StructType(fields)"
  61 |    ]
  62 |   },
  63 |   {
  64 |    "cell_type": "code",
  65 |    "execution_count": 4,
  66 |    "metadata": {
  67 |     "collapsed": false
  68 |    },
  69 |    "outputs": [
  70 |     {
  71 |      "ename": "Py4JJavaError",
  72 |      "evalue": "An error occurred while calling o77.csv.\n: java.net.ConnectException: Call From sparkmaster.demo.com/10.37.101.3 to sparkmaster:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)\n\tat sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.lang.reflect.Constructor.newInstance(Constructor.java:526)\n\tat org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:783)\n\tat org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:730)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1351)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1300)\n\tat org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)\n\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:651)\n\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1679)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1106)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1397)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:389)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:379)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.foreach(List.scala:381)\n\tat scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.flatMap(List.scala:344)\n\tat org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:379)\n\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)\n\tat org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:413)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:745)\nCaused by: java.net.ConnectException: Connection refused\n\tat sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)\n\tat sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)\n\tat org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:206)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:529)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:493)\n\tat org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:547)\n\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:642)\n\tat org.apache.hadoop.ipc.Client$Connection.access$2600(Client.java:314)\n\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1399)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1318)\n\t... 38 more\n",
  73 |      "output_type": "error",
  74 |      "traceback": [
  75 |       "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m",
  76 |       "\u001b[1;31mPy4JJavaError\u001b[0m                             Traceback (most recent call last)",
  77 |       "\u001b[1;32m<ipython-input-4-fc754f7878d8>\u001b[0m in \u001b[0;36m<module>\u001b[1;34m()\u001b[0m\n\u001b[1;32m----> 1\u001b[1;33m \u001b[0mchurnDataset\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mspark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mread\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0moption\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"header\"\u001b[0m\u001b[1;33m,\u001b[0m\u001b[1;34m\"true\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mschema\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mchurnSchema\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;34m\"hdfs://sparkmaster:8020/user/hdfs/sampledata/churn.csv\"\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m      2\u001b[0m \u001b[0mcols\u001b[0m\u001b[1;33m=\u001b[0m\u001b[0mchurnDataset\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcolumns\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  78 |       "\u001b[1;32m/root/spark/spark-2.0.2/python/pyspark/sql/readwriter.py\u001b[0m in \u001b[0;36mcsv\u001b[1;34m(self, path, schema, sep, encoding, quote, escape, comment, header, inferSchema, ignoreLeadingWhiteSpace, ignoreTrailingWhiteSpace, nullValue, nanValue, positiveInf, negativeInf, dateFormat, timestampFormat, maxColumns, maxCharsPerColumn, maxMalformedLogPerPartition, mode)\u001b[0m\n\u001b[0;32m    375\u001b[0m         \u001b[1;32mif\u001b[0m \u001b[0misinstance\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m,\u001b[0m \u001b[0mbasestring\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    376\u001b[0m             \u001b[0mpath\u001b[0m \u001b[1;33m=\u001b[0m \u001b[1;33m[\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m]\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 377\u001b[1;33m         \u001b[1;32mreturn\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_df\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jreader\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mcsv\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_spark\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_sc\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0m_jvm\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPythonUtils\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoSeq\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mpath\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m    378\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    379\u001b[0m     \u001b[1;33m@\u001b[0m\u001b[0msince\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;36m1.5\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  79 |       "\u001b[1;32m/root/spark/spark-2.0.2/python/lib/py4j-0.10.3-src.zip/py4j/java_gateway.py\u001b[0m in \u001b[0;36m__call__\u001b[1;34m(self, *args)\u001b[0m\n\u001b[0;32m   1131\u001b[0m         \u001b[0manswer\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0mself\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mgateway_client\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0msend_command\u001b[0m\u001b[1;33m(\u001b[0m\u001b[0mcommand\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1132\u001b[0m         return_value = get_return_value(\n\u001b[1;32m-> 1133\u001b[1;33m             answer, self.gateway_client, self.target_id, self.name)\n\u001b[0m\u001b[0;32m   1134\u001b[0m \u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m   1135\u001b[0m         \u001b[1;32mfor\u001b[0m \u001b[0mtemp_arg\u001b[0m \u001b[1;32min\u001b[0m \u001b[0mtemp_args\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  80 |       "\u001b[1;32m/root/spark/spark-2.0.2/python/pyspark/sql/utils.py\u001b[0m in \u001b[0;36mdeco\u001b[1;34m(*a, **kw)\u001b[0m\n\u001b[0;32m     61\u001b[0m     \u001b[1;32mdef\u001b[0m \u001b[0mdeco\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     62\u001b[0m         \u001b[1;32mtry\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m---> 63\u001b[1;33m             \u001b[1;32mreturn\u001b[0m \u001b[0mf\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m*\u001b[0m\u001b[0ma\u001b[0m\u001b[1;33m,\u001b[0m \u001b[1;33m**\u001b[0m\u001b[0mkw\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0m\u001b[0;32m     64\u001b[0m         \u001b[1;32mexcept\u001b[0m \u001b[0mpy4j\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mprotocol\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mPy4JJavaError\u001b[0m \u001b[1;32mas\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m     65\u001b[0m             \u001b[0ms\u001b[0m \u001b[1;33m=\u001b[0m \u001b[0me\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mjava_exception\u001b[0m\u001b[1;33m.\u001b[0m\u001b[0mtoString\u001b[0m\u001b[1;33m(\u001b[0m\u001b[1;33m)\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n",
  81 |       "\u001b[1;32m/root/spark/spark-2.0.2/python/lib/py4j-0.10.3-src.zip/py4j/protocol.py\u001b[0m in \u001b[0;36mget_return_value\u001b[1;34m(answer, gateway_client, target_id, name)\u001b[0m\n\u001b[0;32m    317\u001b[0m                 raise Py4JJavaError(\n\u001b[0;32m    318\u001b[0m                     \u001b[1;34m\"An error occurred while calling {0}{1}{2}.\\n\"\u001b[0m\u001b[1;33m.\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[1;32m--> 319\u001b[1;33m                     format(target_id, \".\", name), value)\n\u001b[0m\u001b[0;32m    320\u001b[0m             \u001b[1;32melse\u001b[0m\u001b[1;33m:\u001b[0m\u001b[1;33m\u001b[0m\u001b[0m\n\u001b[0;32m    321\u001b[0m                 raise Py4JError(\n",
  82 |       "\u001b[1;31mPy4JJavaError\u001b[0m: An error occurred while calling o77.csv.\n: java.net.ConnectException: Call From sparkmaster.demo.com/10.37.101.3 to sparkmaster:8020 failed on connection exception: java.net.ConnectException: Connection refused; For more details see:  http://wiki.apache.org/hadoop/ConnectionRefused\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)\n\tat sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:57)\n\tat sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)\n\tat java.lang.reflect.Constructor.newInstance(Constructor.java:526)\n\tat org.apache.hadoop.net.NetUtils.wrapWithMessage(NetUtils.java:783)\n\tat org.apache.hadoop.net.NetUtils.wrapException(NetUtils.java:730)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1351)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1300)\n\tat org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)\n\tat org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)\n\tat com.sun.proxy.$Proxy10.getFileInfo(Unknown Source)\n\tat org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.getFileInfo(ClientNamenodeProtocolTranslatorPB.java:651)\n\tat org.apache.hadoop.hdfs.DFSClient.getFileInfo(DFSClient.java:1679)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1106)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem$17.doCall(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystemLinkResolver.resolve(FileSystemLinkResolver.java:81)\n\tat org.apache.hadoop.hdfs.DistributedFileSystem.getFileStatus(DistributedFileSystem.java:1102)\n\tat org.apache.hadoop.fs.FileSystem.exists(FileSystem.java:1397)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:389)\n\tat org.apache.spark.sql.execution.datasources.DataSource$$anonfun$12.apply(DataSource.scala:379)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.TraversableLike$$anonfun$flatMap$1.apply(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.foreach(List.scala:381)\n\tat scala.collection.TraversableLike$class.flatMap(TraversableLike.scala:241)\n\tat scala.collection.immutable.List.flatMap(List.scala:344)\n\tat org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:379)\n\tat org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:149)\n\tat org.apache.spark.sql.DataFrameReader.csv(DataFrameReader.scala:413)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)\n\tat sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)\n\tat sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)\n\tat java.lang.reflect.Method.invoke(Method.java:606)\n\tat py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:237)\n\tat py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)\n\tat py4j.Gateway.invoke(Gateway.java:280)\n\tat py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)\n\tat py4j.commands.CallCommand.execute(CallCommand.java:79)\n\tat py4j.GatewayConnection.run(GatewayConnection.java:214)\n\tat java.lang.Thread.run(Thread.java:745)\nCaused by: java.net.ConnectException: Connection refused\n\tat sun.nio.ch.SocketChannelImpl.checkConnect(Native Method)\n\tat sun.nio.ch.SocketChannelImpl.finishConnect(SocketChannelImpl.java:744)\n\tat org.apache.hadoop.net.SocketIOWithTimeout.connect(SocketIOWithTimeout.java:206)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:529)\n\tat org.apache.hadoop.net.NetUtils.connect(NetUtils.java:493)\n\tat org.apache.hadoop.ipc.Client$Connection.setupConnection(Client.java:547)\n\tat org.apache.hadoop.ipc.Client$Connection.setupIOstreams(Client.java:642)\n\tat org.apache.hadoop.ipc.Client$Connection.access$2600(Client.java:314)\n\tat org.apache.hadoop.ipc.Client.getConnection(Client.java:1399)\n\tat org.apache.hadoop.ipc.Client.call(Client.java:1318)\n\t... 38 more\n"
  83 |      ]
  84 |     }
  85 |    ],
  86 |    "source": [
  87 |     "churnDataset = spark.read.option(\"header\",\"true\").schema(churnSchema).csv(\"hdfs://sparkmaster:8020/user/hdfs/sampledata/churn.csv\")\n",
  88 |     "cols=churnDataset.columns"
  89 |    ]
  90 |   },
  91 |   {
  92 |    "cell_type": "code",
  93 |    "execution_count": null,
  94 |    "metadata": {
  95 |     "collapsed": false
  96 |    },
  97 |    "outputs": [],
  98 |    "source": [
  99 |     "churnDataset.schema"
 100 |    ]
 101 |   },
 102 |   {
 103 |    "cell_type": "code",
 104 |    "execution_count": null,
 105 |    "metadata": {
 106 |     "collapsed": false
 107 |    },
 108 |    "outputs": [],
 109 |    "source": [
 110 |     "churnDataset.createOrReplaceTempView(\"churn_tab\")\n",
 111 |     "spark.sql(\"select max(daymins), min(daymins) from churn_tab\").show()"
 112 |    ]
 113 |   },
 114 |   {
 115 |    "cell_type": "code",
 116 |    "execution_count": 4,
 117 |    "metadata": {
 118 |     "collapsed": false
 119 |    },
 120 |    "outputs": [],
 121 |    "source": [
 122 |     "churnDataset = churnDataset.withColumn(\"ACCOUNTLENGTH\", churnDataset[\"ACCOUNTLENGTH\"].cast(\"double\"))\n",
 123 |     "churnDataset = churnDataset.withColumn(\"AREACODE\", churnDataset[\"AREACODE\"].cast(\"double\"))\n",
 124 |     "churnDataset = churnDataset.withColumn(\"VMAILMESSAGE\", churnDataset[\"VMAILMESSAGE\"].cast(\"double\"))\n",
 125 |     "churnDataset = churnDataset.withColumn(\"DAYMINS\", churnDataset[\"DAYMINS\"].cast(\"double\"))\n",
 126 |     "churnDataset = churnDataset.withColumn(\"DAYMINS\", churnDataset[\"DAYMINS\"].cast(\"double\"))\n",
 127 |     "churnDataset = churnDataset.withColumn(\"DAYCALLS\", churnDataset[\"DAYCALLS\"].cast(\"double\"))\n",
 128 |     "churnDataset = churnDataset.withColumn(\"DAYCHARGE\", churnDataset[\"DAYCHARGE\"].cast(\"double\"))\n",
 129 |     "churnDataset = churnDataset.withColumn(\"EVEMINS\", churnDataset[\"EVEMINS\"].cast(\"double\"))\n",
 130 |     "churnDataset = churnDataset.withColumn(\"EVECALLS\", churnDataset[\"EVECALLS\"].cast(\"double\"))\n",
 131 |     "churnDataset = churnDataset.withColumn(\"EVECHARGE\", churnDataset[\"EVECHARGE\"].cast(\"double\"))\n",
 132 |     "churnDataset = churnDataset.withColumn(\"NIGHTMINS\", churnDataset[\"NIGHTMINS\"].cast(\"double\"))\n",
 133 |     "churnDataset = churnDataset.withColumn(\"NIGHTCALLS\", churnDataset[\"NIGHTCALLS\"].cast(\"double\"))\n",
 134 |     "churnDataset = churnDataset.withColumn(\"NIGHTCHARGE\", churnDataset[\"NIGHTCHARGE\"].cast(\"double\"))\n",
 135 |     "churnDataset = churnDataset.withColumn(\"INTLMINS\", churnDataset[\"INTLMINS\"].cast(\"double\"))\n",
 136 |     "churnDataset = churnDataset.withColumn(\"INTLCALLS\", churnDataset[\"INTLCALLS\"].cast(\"double\"))\n",
 137 |     "churnDataset = churnDataset.withColumn(\"INTLCHARGE\", churnDataset[\"INTLCHARGE\"].cast(\"double\"))\n",
 138 |     "churnDataset = churnDataset.withColumn(\"CUSTSERVCALLS\", churnDataset[\"CUSTSERVCALLS\"].cast(\"double\"))\n"
 139 |    ]
 140 |   },
 141 |   {
 142 |    "cell_type": "code",
 143 |    "execution_count": 5,
 144 |    "metadata": {
 145 |     "collapsed": true
 146 |    },
 147 |    "outputs": [],
 148 |    "source": [
 149 |     "stages = [] # Creating Stages array for our pipeline\n",
 150 |     "\n",
 151 |     "#Declaring Categorical columns\n",
 152 |     "categoricalColumns = [\"PHONE\",\"STATE\", \"INTLPLAN\", \"VMAILPLAN\"]\n",
 153 |     "\n",
 154 |     "#Looping through the categorical columns for feature transformation\n",
 155 |     "for categoricalCol in categoricalColumns:\n",
 156 |     "  # Category Indexing with StringIndexer\n",
 157 |     "  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+\"Index\")\n",
 158 |     "  # Use OneHotEncoder to convert categorical variables into binary SparseVectors\n",
 159 |     "  encoder = OneHotEncoder(inputCol=categoricalCol+\"Index\", outputCol=categoricalCol+\"classVec\")\n",
 160 |     "  # Add stages to the stages array. We'll pass these stages to the pipeline.\n",
 161 |     "  stages += [stringIndexer, encoder]"
 162 |    ]
 163 |   },
 164 |   {
 165 |    "cell_type": "code",
 166 |    "execution_count": 6,
 167 |    "metadata": {
 168 |     "collapsed": true
 169 |    },
 170 |    "outputs": [],
 171 |    "source": [
 172 |     "#Using String indexer to transform Chrun variable\n",
 173 |     "label_stringIdx = StringIndexer(inputCol = \"CHURN\", outputCol = \"label\")\n",
 174 |     "#Adding the Churn transformation to our pipeline stages\n",
 175 |     "stages += [label_stringIdx]"
 176 |    ]
 177 |   },
 178 |   {
 179 |    "cell_type": "code",
 180 |    "execution_count": 7,
 181 |    "metadata": {
 182 |     "collapsed": false
 183 |    },
 184 |    "outputs": [],
 185 |    "source": [
 186 |     "# Transform all features into a vector using VectorAssembler\n",
 187 |     "numericCols = [\"ACCOUNTLENGTH\",\"AREACODE\",\"VMAILMESSAGE\",\"DAYMINS\",\"DAYCALLS\",\"DAYCHARGE\",\"EVEMINS\",\"EVECALLS\",\"EVECHARGE\",\"NIGHTMINS\",\"NIGHTCALLS\",\"NIGHTCHARGE\",\"INTLMINS\",\"INTLCALLS\",\"INTLCHARGE\",\"CUSTSERVCALLS\"]\n",
 188 |     "#Pick up all the transformed categorical variables\n",
 189 |     "categoricalVectorColumns = [*map(lambda c: c + \"classVec\", categoricalColumns)]\n",
 190 |     "#Add transformed categorical variables and numberical columns to the assmebler input\n",
 191 |     "assemblerInputs = categoricalVectorColumns + numericCols\n",
 192 |     "#Use Vector assembler to combine raw numerical features with transformed categorical inputs \n",
 193 |     "assembler = VectorAssembler(inputCols=assemblerInputs, outputCol=\"features\")\n",
 194 |     "#Add the feature assembling part to the pipeline stages\n",
 195 |     "stages += [assembler]"
 196 |    ]
 197 |   },
 198 |   {
 199 |    "cell_type": "code",
 200 |    "execution_count": 12,
 201 |    "metadata": {
 202 |     "collapsed": false
 203 |    },
 204 |    "outputs": [],
 205 |    "source": [
 206 |     "#Create the Pipeline\n",
 207 |     "pipeline = Pipeline(stages=stages)\n",
 208 |     "pipelineModel = pipeline.fit(churnDataset)\n",
 209 |     "churnDataset = pipelineModel.transform(churnDataset)"
 210 |    ]
 211 |   },
 212 |   {
 213 |    "cell_type": "code",
 214 |    "execution_count": 22,
 215 |    "metadata": {
 216 |     "collapsed": false
 217 |    },
 218 |    "outputs": [
 219 |     {
 220 |      "name": "stdout",
 221 |      "output_type": "stream",
 222 |      "text": [
 223 |       "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
 224 |       "|features                                                                                                                                                                                                   |\n",
 225 |       "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
 226 |       "|(3400,[3056,3349,3382,3384,3385,3386,3387,3388,3389,3390,3391,3392,3393,3394,3395,3396,3397,3398,3399],[1.0,1.0,1.0,128.0,415.0,25.0,265.1,110.0,45.07,197.4,99.0,16.78,244.7,91.0,11.01,10.0,3.0,2.7,1.0])|\n",
 227 |       "+-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+\n",
 228 |       "only showing top 1 row\n",
 229 |       "\n"
 230 |      ]
 231 |     }
 232 |    ],
 233 |    "source": [
 234 |     "churnDataset.select(\"features\").show(1,False)"
 235 |    ]
 236 |   },
 237 |   {
 238 |    "cell_type": "code",
 239 |    "execution_count": 13,
 240 |    "metadata": {
 241 |     "collapsed": false
 242 |    },
 243 |    "outputs": [],
 244 |    "source": [
 245 |     "# Keep relevant columns\n",
 246 |     "selectedcols = [\"label\", \"features\"] + cols\n",
 247 |     "churnDataset = churnDataset.select(selectedcols)"
 248 |    ]
 249 |   },
 250 |   {
 251 |    "cell_type": "code",
 252 |    "execution_count": 23,
 253 |    "metadata": {
 254 |     "collapsed": false
 255 |    },
 256 |    "outputs": [],
 257 |    "source": [
 258 |     "### Randomly split data into training and validation sets\n",
 259 |     "(trainingData, testData) = churnDataset.randomSplit([0.7, 0.3], seed = 78799)"
 260 |    ]
 261 |   },
 262 |   {
 263 |    "cell_type": "code",
 264 |    "execution_count": 26,
 265 |    "metadata": {
 266 |     "collapsed": false
 267 |    },
 268 |    "outputs": [
 269 |     {
 270 |      "name": "stdout",
 271 |      "output_type": "stream",
 272 |      "text": [
 273 |       "==================================================\n",
 274 |       "Training size: [2333] === Test Size: [1000]\n",
 275 |       "==================================================\n"
 276 |      ]
 277 |     }
 278 |    ],
 279 |    "source": [
 280 |     "print(\"==================================================\")\n",
 281 |     "print(\"Training size: [\" + str(trainingData.count())+\"] === Test Size: [\"+str(testData.count())+\"]\")\n",
 282 |     "print(\"==================================================\")\n"
 283 |    ]
 284 |   },
 285 |   {
 286 |    "cell_type": "markdown",
 287 |    "metadata": {},
 288 |    "source": [
 289 |     "Training of the model including timings"
 290 |    ]
 291 |   },
 292 |   {
 293 |    "cell_type": "code",
 294 |    "execution_count": 16,
 295 |    "metadata": {
 296 |     "collapsed": false
 297 |    },
 298 |    "outputs": [
 299 |     {
 300 |      "name": "stdout",
 301 |      "output_type": "stream",
 302 |      "text": [
 303 |       "Training time is 10.90792441368103s\n"
 304 |      ]
 305 |     }
 306 |    ],
 307 |    "source": [
 308 |     "# Start timer\n",
 309 |     "start_time = time.time()\n",
 310 |     "\n",
 311 |     "# Create an initial RandomForest model.\n",
 312 |     "rf = RandomForestClassifier(labelCol=\"label\", featuresCol=\"features\", maxDepth=5, maxBins=32, numTrees=20)\n",
 313 |     "\n",
 314 |     "# Train model with Training Data\n",
 315 |     "rfModel = rf.fit(trainingData)\n",
 316 |     "\n",
 317 |     "# Calculate total time\n",
 318 |     "train_time = time.time() - start_time\n",
 319 |     "print(\"Training time is \" + str(train_time) + \"s\")"
 320 |    ]
 321 |   },
 322 |   {
 323 |    "cell_type": "code",
 324 |    "execution_count": 27,
 325 |    "metadata": {
 326 |     "collapsed": false
 327 |    },
 328 |    "outputs": [
 329 |     {
 330 |      "data": {
 331 |       "text/plain": [
 332 |        "RandomForestClassificationModel (uid=rfc_2832c50151b2) with 20 trees"
 333 |       ]
 334 |      },
 335 |      "execution_count": 27,
 336 |      "metadata": {},
 337 |      "output_type": "execute_result"
 338 |     }
 339 |    ],
 340 |    "source": [
 341 |     "rfModel"
 342 |    ]
 343 |   },
 344 |   {
 345 |    "cell_type": "markdown",
 346 |    "metadata": {},
 347 |    "source": [
 348 |     "Validation of the model including timings"
 349 |    ]
 350 |   },
 351 |   {
 352 |    "cell_type": "code",
 353 |    "execution_count": 28,
 354 |    "metadata": {
 355 |     "collapsed": false
 356 |    },
 357 |    "outputs": [
 358 |     {
 359 |      "name": "stdout",
 360 |      "output_type": "stream",
 361 |      "text": [
 362 |       "Evaulation time is 1.014225721359253s\n",
 363 |       "Total time for training and evaulation is 11.922150135040283s\n"
 364 |      ]
 365 |     }
 366 |    ],
 367 |    "source": [
 368 |     "# Start timer\n",
 369 |     "start_time = time.time()\n",
 370 |     "\n",
 371 |     "# Make predictions on test data using the Transformer.transform() method.\n",
 372 |     "predictions = rfModel.transform(testData)\n",
 373 |     "\n",
 374 |     "# Evaluate model. Default metric is areaUnderROC\n",
 375 |     "evaluator = BinaryClassificationEvaluator()\n",
 376 |     "auc = evaluator.evaluate(predictions)\n",
 377 |     "\n",
 378 |     "# Calculate total time\n",
 379 |     "eval_time = time.time() - start_time\n",
 380 |     "print(\"Evaulation time is \" + str(eval_time) + \"s\")\n",
 381 |     "\n",
 382 |     "\n",
 383 |     "# Print total time for training + evaulation\n",
 384 |     "print(\"Total time for training and evaulation is \" + str(train_time + eval_time) + \"s\")"
 385 |    ]
 386 |   },
 387 |   {
 388 |    "cell_type": "code",
 389 |    "execution_count": 29,
 390 |    "metadata": {
 391 |     "collapsed": false
 392 |    },
 393 |    "outputs": [
 394 |     {
 395 |      "name": "stdout",
 396 |      "output_type": "stream",
 397 |      "text": [
 398 |       "Area under the curve is : 0.8701236787621239s\n"
 399 |      ]
 400 |     }
 401 |    ],
 402 |    "source": [
 403 |     "print(\"Area under the curve is : \"+str(auc)+\"s\")"
 404 |    ]
 405 |   },
 406 |   {
 407 |    "cell_type": "code",
 408 |    "execution_count": 30,
 409 |    "metadata": {
 410 |     "collapsed": true
 411 |    },
 412 |    "outputs": [],
 413 |    "source": [
 414 |     "# View model's predictions and probabilities\n",
 415 |     "selected = predictions.select(\"label\", \"prediction\", \"probability\")\n"
 416 |    ]
 417 |   },
 418 |   {
 419 |    "cell_type": "code",
 420 |    "execution_count": 32,
 421 |    "metadata": {
 422 |     "collapsed": false
 423 |    },
 424 |    "outputs": [
 425 |     {
 426 |      "name": "stdout",
 427 |      "output_type": "stream",
 428 |      "text": [
 429 |       "+-----+----------+----------------------------------------+\n",
 430 |       "|label|prediction|probability                             |\n",
 431 |       "+-----+----------+----------------------------------------+\n",
 432 |       "|0.0  |0.0       |[0.8617317493137119,0.1382682506862881] |\n",
 433 |       "|0.0  |0.0       |[0.8658637362350545,0.13413626376494553]|\n",
 434 |       "|0.0  |0.0       |[0.8530759766580674,0.1469240233419326] |\n",
 435 |       "|0.0  |0.0       |[0.7845468788124232,0.2154531211875767] |\n",
 436 |       "|0.0  |0.0       |[0.8729465847681863,0.12705341523181374]|\n",
 437 |       "+-----+----------+----------------------------------------+\n",
 438 |       "only showing top 5 rows\n",
 439 |       "\n"
 440 |      ]
 441 |     }
 442 |    ],
 443 |    "source": [
 444 |     "selected.show(5,False)"
 445 |    ]
 446 |   },
 447 |   {
 448 |    "cell_type": "code",
 449 |    "execution_count": null,
 450 |    "metadata": {
 451 |     "collapsed": false
 452 |    },
 453 |    "outputs": [],
 454 |    "source": [
 455 |     "churnDataset.describe('Churn').show()    "
 456 |    ]
 457 |   },
 458 |   {
 459 |    "cell_type": "code",
 460 |    "execution_count": null,
 461 |    "metadata": {
 462 |     "collapsed": false
 463 |    },
 464 |    "outputs": [],
 465 |    "source": [
 466 |     "churnDataset.describe('Phone','IntlPlan','VMailPlan','VMailMessage','DayMins','DayCalls','DayCharge','Churn').show()    "
 467 |    ]
 468 |   },
 469 |   {
 470 |    "cell_type": "code",
 471 |    "execution_count": null,
 472 |    "metadata": {
 473 |     "collapsed": false
 474 |    },
 475 |    "outputs": [],
 476 |    "source": [
 477 |     "churnDataset.describe('State','EveMins','EveCalls','EveCharge','NightMins','NightCalls','NightCharge').show()"
 478 |    ]
 479 |   },
 480 |   {
 481 |    "cell_type": "code",
 482 |    "execution_count": null,
 483 |    "metadata": {
 484 |     "collapsed": false
 485 |    },
 486 |    "outputs": [],
 487 |    "source": [
 488 |     "churnDataset.describe('AccountLength','AreaCode','IntlMins','IntlCalls','IntlCharge','CustServCalls').show()  "
 489 |    ]
 490 |   },
 491 |   {
 492 |    "cell_type": "code",
 493 |    "execution_count": null,
 494 |    "metadata": {
 495 |     "collapsed": true
 496 |    },
 497 |    "outputs": [],
 498 |    "source": [
 499 |     "churn = spark.read.option(\"header\",\"true\").csv(\"hdfs://sparkmaster:8020/user/hdfs/sampledata/churn.csv\")\n",
 500 |     "\n",
 501 |     "churn.createOrReplaceTempView(\"churn_tab\")\n",
 502 |     "spark.sql(\"select * from churn_tab limit 2\").show()"
 503 |    ]
 504 |   },
 505 |   {
 506 |    "cell_type": "markdown",
 507 |    "metadata": {},
 508 |    "source": [
 509 |     "Convert a DataFrame from a Categorical values to Category vectors so that they can be used by logistic regression"
 510 |    ]
 511 |   },
 512 |   {
 513 |    "cell_type": "code",
 514 |    "execution_count": null,
 515 |    "metadata": {
 516 |     "collapsed": false
 517 |    },
 518 |    "outputs": [],
 519 |    "source": [
 520 |     "from pyspark.ml.feature import OneHotEncoder, StringIndexer\n",
 521 |     "\n",
 522 |     "df = spark.createDataFrame([\n",
 523 |     "    (\"AK\", \"Democrats\"),\n",
 524 |     "    (\"AS\", \"Republicans\"),\n",
 525 |     "    (\"AZ\", \"Democrats\"),\n",
 526 |     "    (\"AR\", \"Republicans\"),\n",
 527 |     "    (\"CT\", \"GreenParty\"),\n",
 528 |     "    (\"DE\", \"Republicans\")\n",
 529 |     "], [\"State\", \"winparty\"])\n",
 530 |     "\n",
 531 |     "stringIndexer = StringIndexer(inputCol=\"winparty\", outputCol=\"winpartyIndex\")\n",
 532 |     "model = stringIndexer.fit(df)\n",
 533 |     "indexed = model.transform(df)\n",
 534 |     "\n",
 535 |     "encoder = OneHotEncoder(inputCol=\"winpartyIndex\", outputCol=\"winpartyVec\")\n",
 536 |     "encoded = encoder.transform(indexed)\n",
 537 |     "encoded.show()"
 538 |    ]
 539 |   },
 540 |   {
 541 |    "cell_type": "code",
 542 |    "execution_count": null,
 543 |    "metadata": {
 544 |     "collapsed": false
 545 |    },
 546 |    "outputs": [],
 547 |    "source": [
 548 |     "import math\n",
 549 |     "from pyspark.sql.functions import mean, min, max, ceil, round\n",
 550 |     "churnDataset.select(round((mean('AccountLength')),3)).toDF(\"AccountLength\").show()\n",
 551 |     "churnDataset.describe('State','AreaCode','IntlMins','IntlCalls','IntlCharge','CustServCalls','Churn').show()      "
 552 |    ]
 553 |   },
 554 |   {
 555 |    "cell_type": "raw",
 556 |    "metadata": {},
 557 |    "source": [
 558 |     "This is how you would display a sample chart with Juptyer Notebook"
 559 |    ]
 560 |   },
 561 |   {
 562 |    "cell_type": "code",
 563 |    "execution_count": null,
 564 |    "metadata": {
 565 |     "collapsed": false
 566 |    },
 567 |    "outputs": [],
 568 |    "source": [
 569 |     "%matplotlib inline\n",
 570 |     "import random\n",
 571 |     "import matplotlib.pyplot as plt\n",
 572 |     "import pandas as pd\n",
 573 |     "import numpy as np\n",
 574 |     "ts = pd.Series(np.random.randn(1000), index=pd.date_range('1/1/2000', periods=1000))\n",
 575 |     "ts = ts.cumsum()\n",
 576 |     "ts.plot()"
 577 |    ]
 578 |   },
 579 |   {
 580 |    "cell_type": "code",
 581 |    "execution_count": null,
 582 |    "metadata": {
 583 |     "collapsed": false
 584 |    },
 585 |    "outputs": [],
 586 |    "source": [
 587 |     "import random\n",
 588 |     "\n",
 589 |     "# create an RDD of 100 random numbers\n",
 590 |     "x = [random.normalvariate(0,1) for i in range(100)]\n",
 591 |     "rdd = sc.parallelize(x)\n",
 592 |     "\n",
 593 |     "# plot data in RDD - use .collect() to bring data to local\n",
 594 |     "num_bins = 50\n",
 595 |     "np.array(['1','2','3']).astype(np.float)\n",
 596 |     "#n, bins, patches = plt.hist(rdd.collect(), num_bins, normed=1, facecolor='green', alpha=0.5)\n",
 597 |     "n, bins, patches = plt.hist(np.array(rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)"
 598 |    ]
 599 |   },
 600 |   {
 601 |    "cell_type": "markdown",
 602 |    "metadata": {},
 603 |    "source": [
 604 |     "We are now going to plot the histograms for some of the data types to check their overall distribution.\n"
 605 |    ]
 606 |   },
 607 |   {
 608 |    "cell_type": "code",
 609 |    "execution_count": null,
 610 |    "metadata": {
 611 |     "collapsed": false
 612 |    },
 613 |    "outputs": [],
 614 |    "source": [
 615 |     "import random\n",
 616 |     "num_bins = 10\n",
 617 |     "#n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)\n",
 618 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Mins\")\n",
 619 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"Night Mins\")\n",
 620 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Mins\")\n",
 621 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='orange', alpha=0.5,label=\"Eve Mins\")\n",
 622 |     "plt.legend(loc='upper right')\n",
 623 |     "plt.show()\n",
 624 |     "\n",
 625 |     "\n"
 626 |    ]
 627 |   },
 628 |   {
 629 |    "cell_type": "code",
 630 |    "execution_count": null,
 631 |    "metadata": {
 632 |     "collapsed": false
 633 |    },
 634 |    "outputs": [],
 635 |    "source": [
 636 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Mins\")\n",
 637 |     "plt.legend(loc='upper right')\n",
 638 |     "plt.show()\n",
 639 |     "\n",
 640 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"Night Mins\")\n",
 641 |     "plt.legend(loc='upper right')\n",
 642 |     "plt.show()\n",
 643 |     "\n",
 644 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Mins\")\n",
 645 |     "plt.legend(loc='upper right')\n",
 646 |     "plt.show()\n",
 647 |     "\n",
 648 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='orange', alpha=0.5,label=\"Eve Mins\")\n",
 649 |     "plt.legend(loc='upper right')\n",
 650 |     "plt.show()"
 651 |    ]
 652 |   },
 653 |   {
 654 |    "cell_type": "code",
 655 |    "execution_count": null,
 656 |    "metadata": {
 657 |     "collapsed": false
 658 |    },
 659 |    "outputs": [],
 660 |    "source": [
 661 |     "churnDataset.createOrReplaceTempView(\"churn_tab\")\n",
 662 |     "churners = spark.sql(\"select * from churn_tab where churn = 'True.'\")\n",
 663 |     "churners.count()\n"
 664 |    ]
 665 |   },
 666 |   {
 667 |    "cell_type": "code",
 668 |    "execution_count": null,
 669 |    "metadata": {
 670 |     "collapsed": false
 671 |    },
 672 |    "outputs": [],
 673 |    "source": [
 674 |     "nonChurners = spark.sql(\"select * from churn_tab where churn = 'False.'\")\n",
 675 |     "nonChurners.count()"
 676 |    ]
 677 |   },
 678 |   {
 679 |    "cell_type": "code",
 680 |    "execution_count": null,
 681 |    "metadata": {
 682 |     "collapsed": false
 683 |    },
 684 |    "outputs": [],
 685 |    "source": [
 686 |     "n, bins, patches = plt.hist(np.array(churners.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Intl Mins\")\n",
 687 |     "plt.legend(loc='upper right')\n",
 688 |     "plt.show()\n",
 689 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"IntlMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Intl Mins\")\n",
 690 |     "plt.legend(loc='upper right')\n",
 691 |     "plt.show()\n",
 692 |     "\n",
 693 |     "\n",
 694 |     "n, bins, patches = plt.hist(np.array(churners.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Night Mins\")\n",
 695 |     "plt.legend(loc='upper right')\n",
 696 |     "plt.show()\n",
 697 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Night Mins\")\n",
 698 |     "plt.legend(loc='upper right')\n",
 699 |     "plt.show()\n",
 700 |     "\n",
 701 |     "n, bins, patches = plt.hist(np.array(churners.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Day Mins\")\n",
 702 |     "plt.legend(loc='upper right')\n",
 703 |     "plt.show()\n",
 704 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Day Mins\")\n",
 705 |     "plt.legend(loc='upper right')\n",
 706 |     "plt.show()\n",
 707 |     "\n",
 708 |     "\n",
 709 |     "n, bins, patches = plt.hist(np.array(churners.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Eve Mins\")\n",
 710 |     "plt.legend(loc='upper right')\n",
 711 |     "plt.show()\n",
 712 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"EveMins\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Eve Mins\")\n",
 713 |     "plt.legend(loc='upper right')\n",
 714 |     "plt.show()\n",
 715 |     "\n"
 716 |    ]
 717 |   },
 718 |   {
 719 |    "cell_type": "code",
 720 |    "execution_count": null,
 721 |    "metadata": {
 722 |     "collapsed": false
 723 |    },
 724 |    "outputs": [],
 725 |    "source": [
 726 |     "churners.createOrReplaceTempView(\"churner_tab\")\n",
 727 |     "spark.sql(\"select Min(IntlCharge), max(IntlCharge) from churner_tab\").show()\n"
 728 |    ]
 729 |   },
 730 |   {
 731 |    "cell_type": "code",
 732 |    "execution_count": null,
 733 |    "metadata": {
 734 |     "collapsed": false
 735 |    },
 736 |    "outputs": [],
 737 |    "source": [
 738 |     "n, bins, patches = plt.hist(np.array(churners.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Intl Chrg\")\n",
 739 |     "plt.legend(loc='upper right')\n",
 740 |     "plt.show()\n",
 741 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Intl Chrg\")\n",
 742 |     "plt.legend(loc='upper right')\n",
 743 |     "plt.show()\n",
 744 |     "\n",
 745 |     "n, bins, patches = plt.hist(np.array(churners.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"C Night Chrg\")\n",
 746 |     "plt.legend(loc='upper right')\n",
 747 |     "plt.show()\n",
 748 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"NC Night Chrg\")\n",
 749 |     "plt.legend(loc='upper right')\n",
 750 |     "plt.show()\n",
 751 |     "\n",
 752 |     "n, bins, patches = plt.hist(np.array(churners.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Day Chrg\")\n",
 753 |     "plt.legend(loc='upper right')\n",
 754 |     "plt.show()\n",
 755 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Day Chrg\")\n",
 756 |     "plt.legend(loc='upper right')\n",
 757 |     "plt.show()\n",
 758 |     "\n",
 759 |     "\n",
 760 |     "n, bins, patches = plt.hist(np.array(churners.select(\"EveCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5,label=\"C Eve Chrg\")\n",
 761 |     "plt.legend(loc='upper right')\n",
 762 |     "plt.show()\n",
 763 |     "n, bins, patches = plt.hist(np.array(nonChurners.select(\"EveCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5,label=\"NC Eve Chrg\")\n",
 764 |     "plt.legend(loc='upper right')\n",
 765 |     "plt.show()\n"
 766 |    ]
 767 |   },
 768 |   {
 769 |    "cell_type": "code",
 770 |    "execution_count": null,
 771 |    "metadata": {
 772 |     "collapsed": false
 773 |    },
 774 |    "outputs": [],
 775 |    "source": [
 776 |     "#import random\n",
 777 |     "#num_bins = 10\n",
 778 |     "#data=np.vstack([np.array(churnDataset.select(\"DayMins\").rdd.takeSample(True,20,1)).astype(np.float),np.array(churnDataset.select(\"EveMins\").rdd.takeSample(True,20,1)).astype(np.float),np.array(churnDataset.select(\"NightMins\").rdd.takeSample(True,20,1)).astype(np.float)]).T\n",
 779 |     "#n, bins, patches = plt.hist(data, num_bins, normed=0, facecolor='green', alpha=0.5, label=[\"Inlt Charge\",\"Night Charge\",\"Day Charge\"])\n",
 780 |     "#plt.legend(loc='upper right')\n",
 781 |     "#plt.show()\n",
 782 |     "\n",
 783 |     "\n",
 784 |     "import random\n",
 785 |     "num_bins = 10\n",
 786 |     "\n",
 787 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Charge\")\n",
 788 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='red', alpha=0.5, label=\"Night Charge\")\n",
 789 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Charge\")\n",
 790 |     "plt.legend(loc='upper right')\n",
 791 |     "plt.show()\n",
 792 |     "\n",
 793 |     "\n",
 794 |     "\n"
 795 |    ]
 796 |   },
 797 |   {
 798 |    "cell_type": "code",
 799 |    "execution_count": null,
 800 |    "metadata": {
 801 |     "collapsed": false
 802 |    },
 803 |    "outputs": [],
 804 |    "source": [
 805 |     "import random\n",
 806 |     "num_bins = 10\n",
 807 |     "\n",
 808 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='Red', alpha=0.5, label=\"Night Calls\")\n",
 809 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"IntlCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='green', alpha=0.5, label=\"Intl Calls\")\n",
 810 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='blue', alpha=0.5,label=\"Day Calls\")\n",
 811 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"EveCalls\").rdd.collect()).astype(np.float), num_bins, normed=0, facecolor='Pink', alpha=0.5,label=\"Eve Calls\")\n",
 812 |     "plt.legend(loc='upper right')\n",
 813 |     "plt.show()"
 814 |    ]
 815 |   },
 816 |   {
 817 |    "cell_type": "code",
 818 |    "execution_count": null,
 819 |    "metadata": {
 820 |     "collapsed": false
 821 |    },
 822 |    "outputs": [],
 823 |    "source": [
 824 |     "churnDataset.createOrReplaceTempView(\"churn_tab\")\n",
 825 |     "churners = spark.sql(\"select * from churn_tab where churn='True.'\")\n",
 826 |     "nonChurners = spark.sql(\"select * from churn_tab where churn='False.'\")\n",
 827 |     "churners.count()\n",
 828 |     "print(\"churners = \"+ str(churners.count()) +\" and non churneres = \"+str(nonChurners.count())+\" \")"
 829 |    ]
 830 |   },
 831 |   {
 832 |    "cell_type": "code",
 833 |    "execution_count": null,
 834 |    "metadata": {
 835 |     "collapsed": true
 836 |    },
 837 |    "outputs": [],
 838 |    "source": []
 839 |   },
 840 |   {
 841 |    "cell_type": "code",
 842 |    "execution_count": null,
 843 |    "metadata": {
 844 |     "collapsed": false
 845 |    },
 846 |    "outputs": [],
 847 |    "source": [
 848 |     "import numpy as np\n",
 849 |     "import matplotlib.pyplot as plt\n",
 850 |     "\n",
 851 |     "\n",
 852 |     "N = 50\n",
 853 |     "x = np.array(churners.select(\"IntlMins\").rdd.takeSample(False,250,1)).astype(np.float)\n",
 854 |     "y = np.array(nonChurners.select(\"IntlMins\").rdd.takeSample(False,250,1)).astype(np.float)\n",
 855 |     "colors = ['Red','Green']\n",
 856 |     "area = np.pi * (15 * np.random.rand(N))**2  # 0 to 15 point radii\n",
 857 |     "\n",
 858 |     "plt.scatter(x, y, s=area, c=colors, alpha=0.5)\n",
 859 |     "plt.show()"
 860 |    ]
 861 |   },
 862 |   {
 863 |    "cell_type": "code",
 864 |    "execution_count": null,
 865 |    "metadata": {
 866 |     "collapsed": false
 867 |    },
 868 |    "outputs": [],
 869 |    "source": [
 870 |     "import numpy as np\n",
 871 |     "import matplotlib.pyplot as plt\n",
 872 |     "\n",
 873 |     "\n",
 874 |     "N = 50\n",
 875 |     "x = np.array(churners.select(\"CustServCalls\").rdd.takeSample(False,350,1)).astype(np.float)\n",
 876 |     "y = np.array(nonChurners.select(\"CustServCalls\").rdd.takeSample(False,350,1)).astype(np.float)\n",
 877 |     "colors = ['Red','Green']\n",
 878 |     "area = np.pi * (15 * np.random.rand(N))**2  # 0 to 15 point radii\n",
 879 |     "\n",
 880 |     "plt.scatter(x, y, s=area, c=colors, alpha=0.5)\n",
 881 |     "plt.show()"
 882 |    ]
 883 |   },
 884 |   {
 885 |    "cell_type": "code",
 886 |    "execution_count": null,
 887 |    "metadata": {
 888 |     "collapsed": false
 889 |    },
 890 |    "outputs": [],
 891 |    "source": [
 892 |     "%%sh\n",
 893 |     "pip install plotly"
 894 |    ]
 895 |   },
 896 |   {
 897 |    "cell_type": "code",
 898 |    "execution_count": null,
 899 |    "metadata": {
 900 |     "collapsed": false
 901 |    },
 902 |    "outputs": [],
 903 |    "source": [
 904 |     "import plotly.plotly as py\n",
 905 |     "from plotly.graph_objs import *\n",
 906 |     "import pandas as pd\n",
 907 |     "import requests\n",
 908 |     "requests.packages.urllib3.disable_warnings()\n",
 909 |     "\n",
 910 |     "import plotly.tools as tls\n",
 911 |     "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n",
 912 |     "\n",
 913 |     "\n",
 914 |     "eveMinsChurners = Data([Histogram(x=churners.select('EveMins').rdd.collect())])\n",
 915 |     "py.iplot(eveMinsChurners, filename=\"even_minchurners\")\n"
 916 |    ]
 917 |   },
 918 |   {
 919 |    "cell_type": "markdown",
 920 |    "metadata": {},
 921 |    "source": [
 922 |     "-- Lets look at a scatter plot for Evening Mins for Churners vs. Non churners"
 923 |    ]
 924 |   },
 925 |   {
 926 |    "cell_type": "code",
 927 |    "execution_count": null,
 928 |    "metadata": {
 929 |     "collapsed": false
 930 |    },
 931 |    "outputs": [],
 932 |    "source": [
 933 |     "import plotly.tools as tls\n",
 934 |     "import plotly.plotly as py\n",
 935 |     "import plotly.graph_objs as go\n",
 936 |     "\n",
 937 |     "# Create random data with numpy\n",
 938 |     "import numpy as np\n",
 939 |     "\n",
 940 |     "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n",
 941 |     "c = np.array(churners.select(\"CustServCalls\").sample(False,0.9,1).limit(200).rdd.collect()).astype(np.float)\n",
 942 |     "nc = np.array(nonChurners.select(\"CustServCalls\").sample(False,0.9,1).limit(200).rdd.collect()).astype(np.float)\n",
 943 |     "\n",
 944 |     "# N = 1000\n",
 945 |     "# random_x = np.random.randn(N)\n",
 946 |     "# random_y = np.random.randn(N)\n",
 947 |     "# for i,j in zip(c.ravel(),nc.ravel()):\n",
 948 |     "Churners = go.Scatter(\n",
 949 |     "   y = c.ravel(),\n",
 950 |     "   mode = 'markers',\n",
 951 |     "   marker = dict(\n",
 952 |     "      color='red'\n",
 953 |     "    )\n",
 954 |     ")\n",
 955 |     "\n",
 956 |     "NonChurners = go.Scatter(\n",
 957 |     "   y = nc.ravel(),\n",
 958 |     "   mode = 'markers',\n",
 959 |     "   marker = dict(\n",
 960 |     "        color='green'\n",
 961 |     "    )\n",
 962 |     ")\n",
 963 |     "  \n",
 964 |     "layout = go.Layout(\n",
 965 |     "    title='Customer Service Calls',\n",
 966 |     "    xaxis=dict(\n",
 967 |     "        title='Customers',\n",
 968 |     "        titlefont=dict(\n",
 969 |     "            family='Courier New, monospace',\n",
 970 |     "            size=18,\n",
 971 |     "            color='#7f7f7f'\n",
 972 |     "        )\n",
 973 |     "    ),\n",
 974 |     "    yaxis=dict(\n",
 975 |     "        title='Number of Calls to Cust Service',\n",
 976 |     "        titlefont=dict(\n",
 977 |     "            family='Courier New, monospace',\n",
 978 |     "            size=18,\n",
 979 |     "            color='#7f7f7f'\n",
 980 |     "        )\n",
 981 |     "    )\n",
 982 |     ")\n",
 983 |     "\n",
 984 |     "# Create a trace\n",
 985 |     "\n",
 986 |     "data = [Churners,NonChurners]\n",
 987 |     "\n",
 988 |     "fig = go.Figure(data=data, layout=layout)\n",
 989 |     "py.iplot(fig, filename='basic-scatter')\n",
 990 |     "\n",
 991 |     "# ./\n",
 992 |     "# # Plot and embed in ipython notebook!\n",
 993 |     "# py.iplot(data, layout=layout, filename='basic-scatter')"
 994 |    ]
 995 |   },
 996 |   {
 997 |    "cell_type": "code",
 998 |    "execution_count": null,
 999 |    "metadata": {
1000 |     "collapsed": false
1001 |    },
1002 |    "outputs": [],
1003 |    "source": [
1004 |     "import numpy as np\n",
1005 |     "a = np.array([7,1,4,8,1,3,2,5])\n",
1006 |     "a= np.sort(a)\n",
1007 |     "print(\"Array = \"+str(a))"
1008 |    ]
1009 |   },
1010 |   {
1011 |    "cell_type": "code",
1012 |    "execution_count": null,
1013 |    "metadata": {
1014 |     "collapsed": false
1015 |    },
1016 |    "outputs": [],
1017 |    "source": [
1018 |     "import plotly.tools as tls\n",
1019 |     "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n",
1020 |     "c = np.array(churners.select(\"CustServCalls\").sample(False,0.2,1).limit(400).rdd.collect()).astype(np.float)\n",
1021 |     "nc = np.array(nonChurners.select(\"CustServCalls\").sample(False,0.2,1).limit(400).rdd.collect()).astype(np.float)\n",
1022 |     "\n",
1023 |     "import plotly.plotly as py\n",
1024 |     "import plotly.graph_objs as go\n",
1025 |     "\n",
1026 |     "# Create random data with numpy\n",
1027 |     "import numpy as np\n",
1028 |     "N = len(c)\n",
1029 |     "random_x=random.sample(range(1, N+2), N)\n",
1030 |     "\n",
1031 |     "c= np.sort(c)\n",
1032 |     "nc = np.sort(nc)\n",
1033 |     "random_x = np.sort(random_x)\n",
1034 |     "\n",
1035 |     "Churners = go.Scatter(\n",
1036 |     "   x = random_x,\n",
1037 |     "   y = c.ravel(), \n",
1038 |     "   name ='Churners',\n",
1039 |     "   mode = 'markers',\n",
1040 |     "   marker = dict(\n",
1041 |     "        color='red'\n",
1042 |     "    ),\n",
1043 |     "  line = dict(\n",
1044 |     "            width = 2,\n",
1045 |     "            color = 'rgb(0, 0, 0)'\n",
1046 |     "        )\n",
1047 |     ")\n",
1048 |     "\n",
1049 |     "NonChurners = go.Scatter(\n",
1050 |     "   x = random_x,\n",
1051 |     "   y = nc.ravel(),\n",
1052 |     "   name = 'Non-Churners',\n",
1053 |     "   mode = 'markers',\n",
1054 |     "   marker = dict(\n",
1055 |     "        color='green'\n",
1056 |     "    )\n",
1057 |     ")\n",
1058 |     "  \n",
1059 |     "\n",
1060 |     "layout = go.Layout(\n",
1061 |     "    title='Customer Service Calls',\n",
1062 |     "    xaxis=dict(\n",
1063 |     "        title='Customers',\n",
1064 |     "        titlefont=dict(\n",
1065 |     "            family='Courier New, monospace',\n",
1066 |     "            size=18,\n",
1067 |     "            color='#7f7f7f'\n",
1068 |     "        )\n",
1069 |     "    ),\n",
1070 |     "    yaxis=dict(\n",
1071 |     "        title='Number of Calls to Cust Service',\n",
1072 |     "        titlefont=dict(\n",
1073 |     "            family='Courier New, monospace',\n",
1074 |     "            size=18,\n",
1075 |     "            color='#7f7f7f'\n",
1076 |     "        )\n",
1077 |     "    )\n",
1078 |     ")\n",
1079 |     "\n",
1080 |     "# Create a trace\n",
1081 |     "\n",
1082 |     "data = [Churners,NonChurners]\n",
1083 |     "\n",
1084 |     "fig = go.Figure(data=data, layout=layout)\n",
1085 |     "py.iplot(fig, filename='basic-scatter')\n"
1086 |    ]
1087 |   },
1088 |   {
1089 |    "cell_type": "code",
1090 |    "execution_count": null,
1091 |    "metadata": {
1092 |     "collapsed": false
1093 |    },
1094 |    "outputs": [],
1095 |    "source": [
1096 |     "#Imports\n",
1097 |     "import plotly.plotly as py\n",
1098 |     "import plotly.graph_objs as go\n",
1099 |     "\n",
1100 |     "#Separating the data into Churn and Non-Churn Data Set\n",
1101 |     "churnDataset.createOrReplaceTempView(\"churn_tab\")\n",
1102 |     "churners = spark.sql(\"select * from churn_tab where churn='True.'\")\n",
1103 |     "nonChurners = spark.sql(\"select * from churn_tab where churn='False.'\")\n",
1104 |     "\n",
1105 |     "#Getting the Count for churners/Non-Churners\n",
1106 |     "churnCnt = churners.count()\n",
1107 |     "nonChurnCnt = nonChurners.count()\n",
1108 |     "\n",
1109 |     "data = [go.Bar(\n",
1110 |     "            x=['Churners', 'Non-Churners'],\n",
1111 |     "            y=[churnCnt, nonChurnCnt]\n",
1112 |     "    )]\n",
1113 |     "\n",
1114 |     "py.iplot(data, filename='Churn-NonChurn Plot')"
1115 |    ]
1116 |   },
1117 |   {
1118 |    "cell_type": "code",
1119 |    "execution_count": null,
1120 |    "metadata": {
1121 |     "collapsed": false
1122 |    },
1123 |    "outputs": [],
1124 |    "source": [
1125 |     "import plotly.tools as tls\n",
1126 |     "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n",
1127 |     "c = np.array(churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n",
1128 |     "nc = np.array(non_churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n",
1129 |     "\n",
1130 |     "import plotly.plotly as py\n",
1131 |     "import plotly.graph_objs as go\n",
1132 |     "\n",
1133 |     "# Create random data with numpy\n",
1134 |     "import numpy as np\n",
1135 |     "\n",
1136 |     "# N = 1000\n",
1137 |     "# random_x = np.random.randn(N)\n",
1138 |     "# random_y = np.random.randn(N)\n",
1139 |     "# for i,j in zip(c.ravel(),nc.ravel()):\n",
1140 |     "trace = go.Scatter(\n",
1141 |     "   x = c.ravel(),\n",
1142 |     "   y = nc.ravel(),\n",
1143 |     "   mode = 'markers',\n",
1144 |     "   marker = dict(\n",
1145 |     "        color='FFBAD2'\n",
1146 |     "    )\n",
1147 |     ")\n",
1148 |     "\n",
1149 |     "# Create a trace\n",
1150 |     "\n",
1151 |     "data = [trace]\n",
1152 |     "# ./\n",
1153 |     "# # Plot and embed in ipython notebook!\n",
1154 |     "py.iplot(data, filename='basic-scatter')"
1155 |    ]
1156 |   },
1157 |   {
1158 |    "cell_type": "code",
1159 |    "execution_count": null,
1160 |    "metadata": {
1161 |     "collapsed": false
1162 |    },
1163 |    "outputs": [],
1164 |    "source": [
1165 |     "import plotly.tools as tls\n",
1166 |     "tls.set_credentials_file(username='masifabbasi', api_key='qX37gH9e7nhdEcuV6zSJ')\n",
1167 |     "c = np.array(churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n",
1168 |     "nc = np.array(non_churners.select(\"EveMins\").sample(False,0.2,1).limit(200).rdd.collect()).astype(np.float)\n",
1169 |     "\n",
1170 |     "import plotly.plotly as py\n",
1171 |     "import plotly.graph_objs as go\n",
1172 |     "\n",
1173 |     "# Create random data with numpy\n",
1174 |     "import numpy as np\n",
1175 |     "\n",
1176 |     "# N = 1000\n",
1177 |     "# random_x = np.random.randn(N)\n",
1178 |     "# random_y = np.random.randn(N)\n",
1179 |     "# for i,j in zip(c.ravel(),nc.ravel()):\n",
1180 |     "Churners = go.Scatter(\n",
1181 |     "   x = c.ravel(),\n",
1182 |     "   mode = 'markers',\n",
1183 |     "   marker = dict(\n",
1184 |     "        color='red'\n",
1185 |     "    )\n",
1186 |     ")\n",
1187 |     "\n",
1188 |     "NonChurners = go.Scatter(\n",
1189 |     "   x = nc.ravel(),\n",
1190 |     "   mode = 'markers',\n",
1191 |     "   marker = dict(\n",
1192 |     "        color='blue'\n",
1193 |     "    )\n",
1194 |     ")\n",
1195 |     "  \n",
1196 |     "\n",
1197 |     "# Create a trace\n",
1198 |     "\n",
1199 |     "data = [Churners,NonChurners]\n",
1200 |     "# ./\n",
1201 |     "# # Plot and embed in ipython notebook!\n",
1202 |     "py.iplot(data, filename='basic-scatter')"
1203 |    ]
1204 |   },
1205 |   {
1206 |    "cell_type": "code",
1207 |    "execution_count": null,
1208 |    "metadata": {
1209 |     "collapsed": false
1210 |    },
1211 |    "outputs": [],
1212 |    "source": [
1213 |     "n1, bins1, patches1 = plt.hist(np.array(churnDataset.select(\"IntlCalls\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='red', alpha=0.5)\n"
1214 |    ]
1215 |   },
1216 |   {
1217 |    "cell_type": "code",
1218 |    "execution_count": null,
1219 |    "metadata": {
1220 |     "collapsed": false
1221 |    },
1222 |    "outputs": [],
1223 |    "source": [
1224 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightMins\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='blue', alpha=0.5)\n"
1225 |    ]
1226 |   },
1227 |   {
1228 |    "cell_type": "code",
1229 |    "execution_count": null,
1230 |    "metadata": {
1231 |     "collapsed": false
1232 |    },
1233 |    "outputs": [],
1234 |    "source": [
1235 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCalls\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)\n"
1236 |    ]
1237 |   },
1238 |   {
1239 |    "cell_type": "code",
1240 |    "execution_count": null,
1241 |    "metadata": {
1242 |     "collapsed": false
1243 |    },
1244 |    "outputs": [],
1245 |    "source": [
1246 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"NightCharge\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)\n"
1247 |    ]
1248 |   },
1249 |   {
1250 |    "cell_type": "code",
1251 |    "execution_count": null,
1252 |    "metadata": {
1253 |     "collapsed": false
1254 |    },
1255 |    "outputs": [],
1256 |    "source": [
1257 |     "#'Phone','IntlPlan','VMailPlan','VMailMessage','DayMins','DayCalls','DayCharge','Churn\n",
1258 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayMins\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)"
1259 |    ]
1260 |   },
1261 |   {
1262 |    "cell_type": "code",
1263 |    "execution_count": null,
1264 |    "metadata": {
1265 |     "collapsed": false
1266 |    },
1267 |    "outputs": [],
1268 |    "source": [
1269 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayCalls\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)"
1270 |    ]
1271 |   },
1272 |   {
1273 |    "cell_type": "code",
1274 |    "execution_count": null,
1275 |    "metadata": {
1276 |     "collapsed": false
1277 |    },
1278 |    "outputs": [],
1279 |    "source": [
1280 |     "n, bins, patches = plt.hist(np.array(churnDataset.select(\"DayCharge\").rdd.collect()).astype(np.float), num_bins, normed=1, facecolor='green', alpha=0.5)"
1281 |    ]
1282 |   },
1283 |   {
1284 |    "cell_type": "code",
1285 |    "execution_count": null,
1286 |    "metadata": {
1287 |     "collapsed": false
1288 |    },
1289 |    "outputs": [],
1290 |    "source": [
1291 |     "import matplotlib.pyplot as plt\n",
1292 |     "import numpy as np\n",
1293 |     "\n",
1294 |     "data = [(' whitefield', 65299), (' bellandur', 57061), (' kundalahalli', 51769), (' marathahalli', 50639),\n",
1295 |     "(' electronic city', 44041), (' sarjapur road junction', 34164), (' indiranagar 2nd stage', 32459),\n",
1296 |     "(' malleswaram', 32171), (' yelahanka main road', 28901), (' domlur', 28869)]\n",
1297 |     "\n",
1298 |     "freequency = []\n",
1299 |     "words = []\n",
1300 |     "\n",
1301 |     "for line in data:\n",
1302 |     "    freequency.append(line[1])\n",
1303 |     "    words.append(line[0])\n",
1304 |     "\n",
1305 |     "y_axis = np.arange(1, len(words) + 1, 1)\n",
1306 |     "\n",
1307 |     "plt.barh(y_axis, freequency, align='center')\n",
1308 |     "plt.yticks(y_axis, words)\n",
1309 |     "plt.show()"
1310 |    ]
1311 |   },
1312 |   {
1313 |    "cell_type": "code",
1314 |    "execution_count": null,
1315 |    "metadata": {
1316 |     "collapsed": false
1317 |    },
1318 |    "outputs": [],
1319 |    "source": [
1320 |     "churnDataset.createOrReplaceTempView(\"churn_tab\")\n",
1321 |     "vmailplan = spark.sql(\"select VmailPlan, count(*) as cnt from churn_tab group by VmailPlan \")\n",
1322 |     "\n",
1323 |     "# plt.barh(y_axis, vmailplan.select(\"cnt\").rdd.collect(), align='center')\n",
1324 |     "# plt.yticks(y_axis, vmailplan.select(\"VmailPlan\").rdd.collect())\n",
1325 |     "# plt.show()"
1326 |    ]
1327 |   },
1328 |   {
1329 |    "cell_type": "code",
1330 |    "execution_count": null,
1331 |    "metadata": {
1332 |     "collapsed": false
1333 |    },
1334 |    "outputs": [],
1335 |    "source": [
1336 |     "churnDataset.createOrReplaceTempView(\"churn_tab\")\n",
1337 |     "vmailplan = spark.sql(\"select VmailPlan, count(*) as cnt from churn_tab group by VmailPlan \")\n",
1338 |     "vmailplan.show()\n",
1339 |     "\n"
1340 |    ]
1341 |   },
1342 |   {
1343 |    "cell_type": "code",
1344 |    "execution_count": null,
1345 |    "metadata": {
1346 |     "collapsed": false
1347 |    },
1348 |    "outputs": [],
1349 |    "source": [
1350 |     "import matplotlib.pyplot as plt\n",
1351 |     "%matplotlib inline\n",
1352 |     "\n",
1353 |     "x_labels= vmailplan['VMAILPLAN'].values\n",
1354 |     "fig = vmailplan[['cnt']].plot(kind='bar', facecolor='lightblue')\n",
1355 |     "fig.set_xticklabels(x_labels)\n",
1356 |     "fig.set_title('Vmail Plans')\n",
1357 |     "fig.set_xlabel('Voice Mail Plan ')\n",
1358 |     "fig.set_ylabel('Number of People')\n",
1359 |     "plt.show()\n"
1360 |    ]
1361 |   },
1362 |   {
1363 |    "cell_type": "code",
1364 |    "execution_count": null,
1365 |    "metadata": {
1366 |     "collapsed": false
1367 |    },
1368 |    "outputs": [],
1369 |    "source": [
1370 |     "import numpy as np\n",
1371 |     "import matplotlib.mlab as mlab\n",
1372 |     "import matplotlib.pyplot as plt\n",
1373 |     "\n",
1374 |     "np.random.seed(0)\n",
1375 |     "\n",
1376 |     "# example data\n",
1377 |     "mu = 100\n",
1378 |     "sigma = 15  # standard deviation of distribution\n",
1379 |     "x = mu + sigma * np.random.randn(437)\n",
1380 |     "\n",
1381 |     "num_bins = 50\n",
1382 |     "\n",
1383 |     "fig, ax = plt.subplots()\n",
1384 |     "\n",
1385 |     "# the histogram of the data\n",
1386 |     "n, bins, patches = ax.hist(x, num_bins, normed=1)\n",
1387 |     "\n",
1388 |     "# add a 'best fit' line\n",
1389 |     "y = mlab.normpdf(bins, mu, sigma)\n",
1390 |     "ax.plot(bins, y, '--')\n",
1391 |     "ax.set_xlabel('Smarts')\n",
1392 |     "ax.set_ylabel('Probability density')\n",
1393 |     "ax.set_title(r'Histogram of IQ: $\\mu=100$, $\\sigma=15$')\n",
1394 |     "\n",
1395 |     "# Tweak spacing to prevent clipping of ylabel\n",
1396 |     "fig.tight_layout()\n",
1397 |     "plt.show()"
1398 |    ]
1399 |   },
1400 |   {
1401 |    "cell_type": "code",
1402 |    "execution_count": null,
1403 |    "metadata": {
1404 |     "collapsed": false
1405 |    },
1406 |    "outputs": [],
1407 |    "source": [
1408 |     "\"\"\"\n",
1409 |     "hexbin is an axes method or pyplot function that is essentially\n",
1410 |     "a pcolor of a 2-D histogram with hexagonal cells.  It can be\n",
1411 |     "much more informative than a scatter plot; in the first subplot\n",
1412 |     "below, try substituting 'scatter' for 'hexbin'.\n",
1413 |     "\"\"\"\n",
1414 |     "\n",
1415 |     "import numpy as np\n",
1416 |     "import matplotlib.pyplot as plt\n",
1417 |     "\n",
1418 |     "np.random.seed(0)\n",
1419 |     "n = 100000\n",
1420 |     "x = np.random.standard_normal(n)\n",
1421 |     "y = 2.0 + 3.0 * x + 4.0 * np.random.standard_normal(n)\n",
1422 |     "xmin = x.min()\n",
1423 |     "xmax = x.max()\n",
1424 |     "ymin = y.min()\n",
1425 |     "ymax = y.max()\n",
1426 |     "\n",
1427 |     "fig, axs = plt.subplots(ncols=2, sharey=True, figsize=(7, 4))\n",
1428 |     "fig.subplots_adjust(hspace=0.5, left=0.07, right=0.93)\n",
1429 |     "ax = axs[0]\n",
1430 |     "hb = ax.hexbin(x, y, gridsize=50, cmap='inferno')\n",
1431 |     "ax.axis([xmin, xmax, ymin, ymax])\n",
1432 |     "ax.set_title(\"Hexagon binning\")\n",
1433 |     "cb = fig.colorbar(hb, ax=ax)\n",
1434 |     "cb.set_label('counts')\n",
1435 |     "\n",
1436 |     "ax = axs[1]\n",
1437 |     "hb = ax.hexbin(x, y, gridsize=50, bins='log', cmap='inferno')\n",
1438 |     "ax.axis([xmin, xmax, ymin, ymax])\n",
1439 |     "ax.set_title(\"With a log color scale\")\n",
1440 |     "cb = fig.colorbar(hb, ax=ax)\n",
1441 |     "cb.set_label('log10(N)')\n",
1442 |     "\n",
1443 |     "plt.show()\n"
1444 |    ]
1445 |   },
1446 |   {
1447 |    "cell_type": "code",
1448 |    "execution_count": null,
1449 |    "metadata": {
1450 |     "collapsed": false,
1451 |     "scrolled": false
1452 |    },
1453 |    "outputs": [],
1454 |    "source": [
1455 |     "\"\"\"\n",
1456 |     "Simple demo of a scatter plot.\n",
1457 |     "\"\"\"\n",
1458 |     "import numpy as np\n",
1459 |     "import matplotlib.pyplot as plt\n",
1460 |     "\n",
1461 |     "\n",
1462 |     "N = 50\n",
1463 |     "x = np.random.rand(N)\n",
1464 |     "y = np.random.rand(N)\n",
1465 |     "colors = np.random.rand(N)\n",
1466 |     "area = np.pi * (15 * np.random.rand(N))**2  # 0 to 15 point radii\n",
1467 |     "\n",
1468 |     "plt.scatter(x, y, s=area, c=colors, alpha=0.5)\n",
1469 |     "plt.show()\n"
1470 |    ]
1471 |   },
1472 |   {
1473 |    "cell_type": "code",
1474 |    "execution_count": null,
1475 |    "metadata": {
1476 |     "collapsed": false
1477 |    },
1478 |    "outputs": [],
1479 |    "source": [
1480 |     "import matplotlib.pyplot as plt\n",
1481 |     "import numpy as np\n",
1482 |     "\n",
1483 |     "x = np.arange(0.0, 2, 0.01)\n",
1484 |     "y1 = np.sin(2*np.pi*x)\n",
1485 |     "y2 = 1.2*np.sin(4*np.pi*x)\n",
1486 |     "\n",
1487 |     "fig, (ax1, ax2, ax3) = plt.subplots(3, 1, sharex=True)\n",
1488 |     "\n",
1489 |     "ax1.fill_between(x, 0, y1)\n",
1490 |     "ax1.set_ylabel('between y1 and 0')\n",
1491 |     "\n",
1492 |     "ax2.fill_between(x, y1, 1)\n",
1493 |     "ax2.set_ylabel('between y1 and 1')\n",
1494 |     "\n",
1495 |     "ax3.fill_between(x, y1, y2)\n",
1496 |     "ax3.set_ylabel('between y1 and y2')\n",
1497 |     "ax3.set_xlabel('x')\n",
1498 |     "\n",
1499 |     "# now fill between y1 and y2 where a logical condition is met.  Note\n",
1500 |     "# this is different than calling\n",
1501 |     "#   fill_between(x[where], y1[where],y2[where]\n",
1502 |     "# because of edge effects over multiple contiguous regions.\n",
1503 |     "fig, (ax, ax1) = plt.subplots(2, 1, sharex=True)\n",
1504 |     "ax.plot(x, y1, x, y2, color='black')\n",
1505 |     "ax.fill_between(x, y1, y2, where=y2 >= y1, facecolor='green', interpolate=True)\n",
1506 |     "ax.fill_between(x, y1, y2, where=y2 <= y1, facecolor='red', interpolate=True)\n",
1507 |     "ax.set_title('fill between where')\n",
1508 |     "\n",
1509 |     "# Test support for masked arrays.\n",
1510 |     "y2 = np.ma.masked_greater(y2, 1.0)\n",
1511 |     "ax1.plot(x, y1, x, y2, color='black')\n",
1512 |     "ax1.fill_between(x, y1, y2, where=y2 >= y1, facecolor='green', interpolate=True)\n",
1513 |     "ax1.fill_between(x, y1, y2, where=y2 <= y1, facecolor='red', interpolate=True)\n",
1514 |     "ax1.set_title('Now regions with y2>1 are masked')\n",
1515 |     "\n",
1516 |     "# This example illustrates a problem; because of the data\n",
1517 |     "# gridding, there are undesired unfilled triangles at the crossover\n",
1518 |     "# points.  A brute-force solution would be to interpolate all\n",
1519 |     "# arrays to a very fine grid before plotting.\n",
1520 |     "\n",
1521 |     "# show how to use transforms to create axes spans where a certain condition is satisfied\n",
1522 |     "fig, ax = plt.subplots()\n",
1523 |     "y = np.sin(4*np.pi*x)\n",
1524 |     "ax.plot(x, y, color='black')\n",
1525 |     "\n",
1526 |     "# use the data coordinates for the x-axis and the axes coordinates for the y-axis\n",
1527 |     "import matplotlib.transforms as mtransforms\n",
1528 |     "trans = mtransforms.blended_transform_factory(ax.transData, ax.transAxes)\n",
1529 |     "theta = 0.9\n",
1530 |     "ax.axhline(theta, color='green', lw=2, alpha=0.5)\n",
1531 |     "ax.axhline(-theta, color='red', lw=2, alpha=0.5)\n",
1532 |     "ax.fill_between(x, 0, 1, where=y > theta, facecolor='green', alpha=0.5, transform=trans)\n",
1533 |     "ax.fill_between(x, 0, 1, where=y < -theta, facecolor='red', alpha=0.5, transform=trans)\n",
1534 |     "\n",
1535 |     "\n",
1536 |     "plt.show()\n"
1537 |    ]
1538 |   },
1539 |   {
1540 |    "cell_type": "code",
1541 |    "execution_count": null,
1542 |    "metadata": {
1543 |     "collapsed": false
1544 |    },
1545 |    "outputs": [],
1546 |    "source": [
1547 |     "\"\"\"\n",
1548 |     "Small demonstration of the hlines and vlines plots.\n",
1549 |     "\"\"\"\n",
1550 |     "\n",
1551 |     "import matplotlib.pyplot as plt\n",
1552 |     "import numpy as np\n",
1553 |     "import numpy.random as rnd\n",
1554 |     "\n",
1555 |     "\n",
1556 |     "def f(t):\n",
1557 |     "    s1 = np.sin(2 * np.pi * t


--------------------------------------------------------------------------------