├── CH01 └── chapter1 placeholder.txt ├── CH04 └── code │ └── CH04.txt ├── CH11 ├── data │ ├── got1.txt │ ├── got2.txt │ ├── got3.txt │ ├── got4.txt │ └── got5.txt └── code │ └── CH11_Words+to+Vectors.py ├── CH07 ├── data │ └── TherapyBotSession.csv └── code │ ├── Natural+Language+Processing+-+ChatBot.py │ └── Natural+Language+Processing+-+ChatBot.ipynb ├── CH13 ├── football │ ├── messi │ │ ├── messi1.jpeg │ │ ├── messi10.jpeg │ │ ├── messi11.jpeg │ │ ├── messi12.jpeg │ │ ├── messi13.jpeg │ │ ├── messi14.jpeg │ │ ├── messi15.jpeg │ │ ├── messi16.jpeg │ │ ├── messi17.jpeg │ │ ├── messi18.jpeg │ │ ├── messi19.jpeg │ │ ├── messi2.jpeg │ │ ├── messi20.jpeg │ │ ├── messi21.jpeg │ │ ├── messi22.jpeg │ │ ├── messi23.jpeg │ │ ├── messi24.jpeg │ │ ├── messi25.jpeg │ │ ├── messi26.jpeg │ │ ├── messi27.jpeg │ │ ├── messi28.jpeg │ │ ├── messi29.jpeg │ │ ├── messi3.jpeg │ │ ├── messi30.jpeg │ │ ├── messi4.jpeg │ │ ├── messi5.jpeg │ │ ├── messi6.jpeg │ │ ├── messi7.jpeg │ │ ├── messi8.jpeg │ │ └── messi9.jpeg │ └── ronaldo │ │ ├── ronaldo1.jpeg │ │ ├── ronaldo2.jpeg │ │ ├── ronaldo22.jpg │ │ ├── ronaldo23.jpg │ │ ├── ronaldo24.jpg │ │ ├── ronaldo3.jpeg │ │ ├── ronaldo4.jpeg │ │ ├── ronaldo5.jpeg │ │ ├── ronaldo6.jpeg │ │ ├── ronaldo7.jpeg │ │ ├── ronaldo8.jpeg │ │ ├── ronaldo9.jpeg │ │ ├── ronaldo10.jpeg │ │ ├── ronaldo11.jpeg │ │ ├── ronaldo12.jpeg │ │ ├── ronaldo13.jpeg │ │ ├── ronaldo14.jpeg │ │ ├── ronaldo15.jpeg │ │ ├── ronaldo16.jpeg │ │ ├── ronaldo17.jpeg │ │ ├── ronaldo18.jpeg │ │ ├── ronaldo19.jpeg │ │ ├── ronaldo20.jpeg │ │ ├── ronaldo21.jpeg │ │ ├── ronaldo25.jpeg │ │ ├── ronaldo26.jpeg │ │ ├── ronaldo27.jpeg │ │ ├── ronaldo28.jpeg │ │ ├── ronaldo29.jpeg │ │ └── ronaldo30.jpeg └── code │ ├── Image+Classification+with+TensorFlow+on+Spark.py │ └── Image+Classification+with+TensorFlow+on+Spark.ipynb ├── CH02 ├── data │ └── HeightAndWeight.txt └── code │ └── NeuralNetworkfromScratch_with_python_and spark.py ├── README.md ├── CH03 └── code │ ├── MNIST+with+CNN.py │ └── MNIST+with+CNN.ipynb ├── CH10 └── code │ └── CH10_Face+recognition.py ├── CH08 └── code │ └── Real+Estate+Prediction.py ├── CH09 └── code │ └── Predicting+Apple+Stock+Market+Value.py ├── CH05 └── code │ └── Predicting+Fire+Dept+Calls+with+Spark+ML.py ├── CH06 └── code │ ├── CH06_LSTMs+word+level.py │ └── CH06_LSTMs+word+level.ipynb └── CH12 └── code └── Create+a+movie+recommendation+engine+with+Keras.py /CH01/chapter1 placeholder.txt: -------------------------------------------------------------------------------- 1 | chapter1 placeholder 2 | -------------------------------------------------------------------------------- /CH04/code/CH04.txt: -------------------------------------------------------------------------------- 1 | There is no code for this chapter 2 | -------------------------------------------------------------------------------- /CH11/data/got1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got1.txt -------------------------------------------------------------------------------- /CH11/data/got2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got2.txt -------------------------------------------------------------------------------- /CH11/data/got3.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got3.txt -------------------------------------------------------------------------------- /CH11/data/got4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got4.txt -------------------------------------------------------------------------------- /CH11/data/got5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got5.txt -------------------------------------------------------------------------------- /CH07/data/TherapyBotSession.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH07/data/TherapyBotSession.csv -------------------------------------------------------------------------------- /CH13/football/messi/messi1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi1.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi10.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi11.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi11.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi12.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi12.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi13.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi13.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi14.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi14.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi15.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi15.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi16.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi16.jpeg -------------------------------------------------------------------------------- 
/CH13/football/messi/messi17.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi17.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi18.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi18.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi19.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi19.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi2.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi20.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi20.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi21.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi21.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi22.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi22.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi23.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi23.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi24.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi24.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi25.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi25.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi26.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi26.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi27.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi27.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi28.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi28.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi29.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi29.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi3.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi30.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi30.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi4.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi5.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi6.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi6.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi7.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi7.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi8.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi8.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi9.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi9.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo1.jpeg 
-------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo2.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo22.jpg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo23.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo23.jpg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo24.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo24.jpg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo3.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo4.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo5.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo6.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo6.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo7.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo7.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo8.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo8.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo9.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo9.jpeg 
-------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo10.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo11.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo11.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo12.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo12.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo13.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo13.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo14.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo14.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo15.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo15.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo16.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo16.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo17.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo17.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo18.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo18.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo19.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo19.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo20.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo20.jpeg 
-------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo21.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo21.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo25.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo25.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo26.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo26.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo27.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo27.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo28.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo28.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo29.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo29.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo30.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo30.jpeg -------------------------------------------------------------------------------- /CH02/data/HeightAndWeight.txt: -------------------------------------------------------------------------------- 1 | Gender Height (inches) Weight (lbs) 2 | Female 67 150 3 | Female 65 135 4 | Female 68 130 5 | Male 70 160 6 | Female 70 130 7 | Male 69 174 8 | Male 65 126 9 | Male 74 188 10 | Female 60 110 11 | Female 63 125 12 | Male 70 173 13 | Female 70 145 14 | Male 68 175 15 | Female 65 123 16 | Male 71 145 17 | Male 74 160 18 | Female 64 135 19 | Male 71 175 20 | Male 67 145 21 | Male 67 130 22 | Male 70 162 23 | Female 64 107 24 | Male 70 175 25 | Male 64 130 26 | Male 66 163 27 | Female 63 137 28 | Male 65 165 29 | Female 65 130 30 | Female 64 109 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ApacheSparkDeepLearningCookbook 2 | Title: Apache Spark Deep Learning Cookbook 3 | 4 | Subtitle: Over 80 recipes that streamline deep learning in a distributed environment with Apache Spark 5 | Long Description: 6 | With deep learning gaining rapid mainstream adoption in modern-day industries, organizations are looking for ways to unite popular big 
data tools with highly efficient deep learning libraries. This combination helps deep learning models train with greater efficiency and speed. 7 | 8 | With the help of the Apache Spark Deep Learning Cookbook, you'll work through specific recipes to generate outcomes for deep learning algorithms, without getting bogged down in theory. From setting up Apache Spark for deep learning to implementing different types of neural networks, this book tackles both common and not-so-common problems of performing deep learning in a distributed environment. In addition, you'll get access to deep learning code within Spark that can be reused to answer similar problems or tweaked to answer slightly different ones. You will also learn how to stream and cluster your data with Spark. Once you have got to grips with the basics, you'll explore how to implement and deploy deep learning models such as Convolutional Neural Networks (CNNs) and Recurrent Neural Networks (RNNs) in Spark, using popular libraries such as TensorFlow and Keras. 9 | 10 | By the end of the book, you'll have the expertise to train and deploy efficient deep learning models on Apache Spark. 11 | Short description: 12 | This book shows you how to train and deploy deep learning models on Apache Spark. You will leverage powerful deep learning libraries such as TensorFlow to develop your models and ensure their optimum performance. By the end of this book, you will be able to build efficient distributed applications using Spark, powered by deep learning. 13 | What you will learn: 14 | • Set up a fully functional Spark environment 15 | • Understand practical machine learning and deep learning concepts 16 | • Apply built-in machine learning libraries within Spark 17 | • Explore libraries that are compatible with TensorFlow and Keras 18 | • Explore NLP models such as word2vec and TF-IDF on Spark 19 | • Organize dataframes for deep learning evaluation 20 | • Apply training and testing workflows to ensure model accuracy 21 | • Access readily available, reusable code 22 | Metadescription: 23 | A solution-based guide to putting your deep learning models into production with the power of Apache Spark 24 | Key features: 25 | • Discover practical recipes for distributed deep learning with Apache Spark 26 | • Learn to use libraries such as Keras and TensorFlow 27 | • Explore NLP models such as word2vec and TF-IDF on Spark 28 | • Solve problems to train your deep learning models on Apache Spark 29 | Audience: 30 | If you're looking for a practical and highly useful resource for efficiently implementing distributed deep learning models with Apache Spark, then the Apache Spark Deep Learning Cookbook is for you. Knowledge of core machine learning concepts and a basic understanding of the Apache Spark framework are required to get the best out of this book. Additionally, some programming knowledge in Python is a plus. 31 | Approach: 32 | This book includes practical, easy-to-understand recipes on how to implement popular deep learning libraries such as TensorFlow and Keras to train your deep learning models on Apache Spark, without getting bogged down in theory.
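Most of the chapter scripts in this repository begin by creating a local SparkSession with a fixed executor memory setting. A minimal sketch of that setup is shown below; the application name is illustrative, and the 6 GB executor memory mirrors the value used in the chapter code and should be adjusted to your machine:

from pyspark.sql import SparkSession

# Local SparkSession used throughout the recipes; tune executor memory as needed.
spark = SparkSession.builder \
    .master("local") \
    .appName("ApacheSparkDeepLearningCookbook") \
    .config("spark.executor.memory", "6gb") \
    .getOrCreate()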
33 | 34 | 35 | -------------------------------------------------------------------------------- /CH13/code/Image+Classification+with+TensorFlow+on+Spark.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("ImageClassification") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | import pyspark.sql.functions as f 12 | import sparkdl as dl 13 | 14 | 15 | # In[3]: 16 | 17 | dfMessi = dl.readImages('football/messi/').withColumn('label', f.lit(0)) 18 | dfRonaldo = dl.readImages('football/ronaldo/').withColumn('label', f.lit(1)) 19 | 20 | 21 | # In[4]: 22 | 23 | dfMessi.show(n=10,truncate=False) 24 | 25 | 26 | # In[5]: 27 | 28 | dfRonaldo.show(n=10,truncate=False) 29 | 30 | 31 | # In[6]: 32 | 33 | trainDFmessi, testDFmessi = dfMessi.randomSplit([66.7, 33.3], seed =12) 34 | trainDFronaldo, testDFronaldo = dfRonaldo.randomSplit([66.7, 33.3], seed=12) 35 | 36 | 37 | # In[7]: 38 | 39 | print('The number of images in trainDFmessi is {}'.format(trainDFmessi.toPandas().shape[0])) 40 | print('The number of images in testDFmessi is {}'.format(testDFmessi.toPandas().shape[0])) 41 | print('The number of images in trainDFronaldo is {}'.format(trainDFronaldo.toPandas().shape[0])) 42 | print('The number of images in testDFronaldo is {}'.format(testDFronaldo.toPandas().shape[0])) 43 | 44 | 45 | # In[8]: 46 | 47 | trainDF = trainDFmessi.unionAll(trainDFronaldo) 48 | testDF = testDFmessi.unionAll(testDFronaldo) 49 | 50 | 51 | # In[9]: 52 | 53 | print('The number of images in the training data is {}' .format(trainDF.toPandas().shape[0])) 54 | print('The number of images in the testing data is {}' .format(testDF.toPandas().shape[0])) 55 | 56 | 57 | # In[10]: 58 | 59 | from pyspark.ml.classification import LogisticRegression 60 | from pyspark.ml import Pipeline 61 | 62 | vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3') 63 | logreg = LogisticRegression(maxIter=30,labelCol = "label", featuresCol="features") 64 | pipeline = Pipeline(stages=[vectorizer, logreg]) 65 | 66 | pipeline_model = pipeline.fit(trainDF) 67 | 68 | 69 | # In[11]: 70 | 71 | predictDF = pipeline_model.transform(testDF) 72 | predictDF.select('label', 'prediction').show(n = testDF.toPandas().shape[0], truncate=False) 73 | 74 | 75 | # In[12]: 76 | 77 | predictDF.crosstab('prediction', 'label').show() 78 | 79 | 80 | # In[13]: 81 | 82 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 83 | scoring = predictDF.select("prediction", "label") 84 | accuracy_score = MulticlassClassificationEvaluator(metricName="accuracy") 85 | rate = accuracy_score.evaluate(scoring)*100 86 | print("accuracy: {}%" .format(round(rate,2))) 87 | 88 | 89 | # In[14]: 90 | 91 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 92 | 93 | binaryevaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction") 94 | binary_rate = binaryevaluator.evaluate(predictDF)*100 95 | print("accuracy: {}%" .format(round(binary_rate,2))) 96 | 97 | 98 | # In[15]: 99 | 100 | logregFT = LogisticRegression( 101 | regParam=0.05, 102 | elasticNetParam=0.3, 103 | maxIter=15,labelCol = "label", featuresCol="features") 104 | pipelineFT = Pipeline(stages=[vectorizer, logregFT]) 105 | 106 | pipeline_model_FT = pipelineFT.fit(trainDF) 107 | 108 | 109 | # In[16]: 110 | 111 | predictDF_FT = pipeline_model_FT.transform(testDF) 112 | 
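# The crosstab below prints the confusion matrix for the regularized (elastic-net) pipeline,
# so it can be compared against the earlier, unregularized logistic regression run.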
predictDF_FT.crosstab('prediction', 'label').show() 113 | 114 | 115 | # In[17]: 116 | 117 | binary_rate_FT = binaryevaluator.evaluate(predictDF_FT)*100 118 | print("accuracy: {}%" .format(round(binary_rate_FT,2))) 119 | 120 | -------------------------------------------------------------------------------- /CH03/code/MNIST+with+CNN.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import tensorflow as tf 7 | 8 | 9 | # In[2]: 10 | 11 | print(tf.__version__) 12 | 13 | 14 | # In[3]: 15 | 16 | from tensorflow.examples.tutorials.mnist import input_data 17 | data = input_data.read_data_sets('MNIST/', one_hot=True) 18 | 19 | 20 | # In[4]: 21 | 22 | import os 23 | os.listdir('MNIST/') 24 | 25 | 26 | # In[5]: 27 | 28 | print('Image Inventory') 29 | print('----------') 30 | print('Training: {}'.format(len(data.train.labels))) 31 | print('Testing: {}'.format(len(data.test.labels))) 32 | print('----------') 33 | 34 | 35 | # In[6]: 36 | 37 | import numpy as np 38 | import matplotlib.pyplot as plt 39 | get_ipython().magic('matplotlib inline') 40 | 41 | 42 | # In[7]: 43 | 44 | for i in range(2): 45 | image = data.train.images[i] 46 | image = np.array(image, dtype='float') 47 | label = data.train.labels[i] 48 | pixels = image.reshape((28, 28)) 49 | plt.imshow(pixels, cmap='gray') 50 | print('-----------------') 51 | print(label) 52 | plt.show() 53 | 54 | 55 | 56 | # In[8]: 57 | 58 | if not os.path.exists('MNIST/images'): 59 | os.makedirs('MNIST/images/') 60 | os.chdir('MNIST/images/') 61 | 62 | 63 | # In[9]: 64 | 65 | from matplotlib import image 66 | for i in range(1,10): 67 | png = data.train.images[i] 68 | png = np.array(png, dtype='float') 69 | pixels = png.reshape((28, 28)) 70 | image.imsave('image_no_{}.png'.format(i), pixels, cmap = 'gray') 71 | 72 | 73 | # In[10]: 74 | 75 | print(os.listdir()) 76 | 77 | 78 | # In[11]: 79 | 80 | from Augmentor import Pipeline 81 | 82 | 83 | # In[12]: 84 | 85 | augmentor = Pipeline('/home/asherif844/sparkNotebooks/Ch03/MNIST/images') 86 | 87 | 88 | # In[13]: 89 | 90 | augmentor.rotate(probability=0.9, max_left_rotation=25, max_right_rotation=25) 91 | 92 | 93 | # In[14]: 94 | 95 | for i in range(1,3): 96 | augmentor.sample(10) 97 | 98 | 99 | # In[15]: 100 | 101 | xtrain = data.train.images 102 | ytrain = np.asarray(data.train.labels) 103 | xtest = data.test.images 104 | ytest = np.asarray(data.test.labels) 105 | 106 | 107 | # In[16]: 108 | 109 | xtrain = xtrain.reshape( xtrain.shape[0],28,28,1) 110 | xtest = xtest.reshape(xtest.shape[0],28,28,1) 111 | ytest= ytest.reshape(ytest.shape[0],10) 112 | ytrain = ytrain.reshape(ytrain.shape[0],10) 113 | 114 | 115 | # In[17]: 116 | 117 | print(xtrain.shape) 118 | print(ytrain.shape) 119 | print(xtest.shape) 120 | print(ytest.shape) 121 | 122 | 123 | # In[18]: 124 | 125 | import keras 126 | import keras.backend as K 127 | from keras.models import Sequential 128 | from keras.layers import Dense, Flatten, Conv2D 129 | 130 | K.set_image_dim_ordering('tf') 131 | 132 | model = Sequential() 133 | 134 | model.add(Conv2D(32, kernel_size=(5, 5),activation='relu', input_shape=(28,28,1))) 135 | model.add(Flatten()) 136 | model.add(Dense(128, activation='relu')) 137 | model.add(Dense(10, activation='sigmoid')) 138 | 139 | 140 | # In[19]: 141 | 142 | model.compile(optimizer='adam',loss='categorical_crossentropy', 143 | metrics=['accuracy']) 144 | 145 | 146 | # In[20]: 147 | 148 | model.fit(xtrain,ytrain,batch_size=512, 149 | epochs=5, 150 | 
validation_data=(xtest, ytest)) 151 | 152 | 153 | # In[21]: 154 | 155 | stats = model.evaluate(xtest, ytest) 156 | print('The accuracy rate is {}%'.format(round(stats[1],3)*100)) 157 | print('The loss rate is {}%'.format(round(stats[0],2)*100)) 158 | 159 | 160 | # In[22]: 161 | 162 | model.summary() 163 | 164 | 165 | # In[ ]: 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /CH10/code/CH10_Face+recognition.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | get_ipython().magic('matplotlib inline') 8 | from os import listdir 9 | from os.path import isfile, join 10 | import matplotlib.pyplot as plt 11 | import matplotlib.image as mpimg 12 | import numpy as np 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Activation, Flatten 15 | from keras.optimizers import Adam 16 | from keras.layers.normalization import BatchNormalization 17 | from keras.utils import np_utils 18 | from keras.layers import Conv2D, MaxPooling2D 19 | from keras.preprocessing.image import ImageDataGenerator 20 | 21 | 22 | # In[2]: 23 | 24 | 25 | pwd 26 | 27 | 28 | # In[3]: 29 | 30 | 31 | cd desktop 32 | 33 | 34 | # In[4]: 35 | 36 | 37 | #reading images from the local drive 38 | mypath='MIT-CBCL-facerec-database//training-synthetic' 39 | onlyfiles= [ f for f in listdir(mypath) if isfile(join(mypath,f)) ] 40 | images =np.empty([3240,200,200],dtype=int) 41 | for n in range(0, len(onlyfiles)): 42 | images[n] = mpimg.imread( join(mypath,onlyfiles[n]) ).astype(np.float32) 43 | 44 | 45 | # In[5]: 46 | 47 | 48 | plt.imshow (images[0]) 49 | 50 | 51 | # In[6]: 52 | 53 | 54 | plt.imshow (images[1]) 55 | 56 | 57 | # In[7]: 58 | 59 | 60 | plt. 
imshow (images[2]) 61 | 62 | 63 | # In[8]: 64 | 65 | 66 | plt.imshow(images[3119]) 67 | 68 | 69 | # In[9]: 70 | 71 | 72 | y =np.empty([3240,1],dtype=int) 73 | for x in range(0, len(onlyfiles)): 74 | if onlyfiles[x][3]=='0': y[x]=0 75 | elif onlyfiles[x][3]=='1': y[x]=1 76 | elif onlyfiles[x][3]=='2': y[x]=2 77 | elif onlyfiles[x][3]=='3': y[x]=3 78 | elif onlyfiles[x][3]=='4': y[x]=4 79 | elif onlyfiles[x][3]=='5': y[x]=5 80 | elif onlyfiles[x][3]=='6': y[x]=6 81 | elif onlyfiles[x][3]=='7': y[x]=7 82 | elif onlyfiles[x][3]=='8': y[x]=8 83 | elif onlyfiles[x][3]=='9': y[x]=9 84 | 85 | 86 | # In[10]: 87 | 88 | 89 | #funtion for cropping images to obtain only the significant part 90 | def crop(img): 91 | a=28*np.ones(len(img)) #background has pixel intensity of 28 92 | b=np.where((img== a).all(axis=1)) #check image background 93 | img=np.delete(img,(b),0) #deleting the unwanted part from the Y axis 94 | plt.imshow(img) 95 | img=img.transpose() 96 | d=28*np.ones(len(img[0])) 97 | e=np.where((img== d).all(axis=1)) 98 | img=np.delete(img,e,0) #deleting the unwanted part from the X axis 99 | img=img.transpose() 100 | print (img.shape) #printing image shape to ensure it is actually being cropped 101 | super_threshold_indices = img < 29 #padding zeros instead of background data 102 | img[super_threshold_indices] = 0 103 | plt.imshow (img) 104 | return img[0:150, 0:128] 105 | 106 | 107 | # In[11]: 108 | 109 | 110 | #cropping all the images 111 | image = np.empty([3240,150,128],dtype=int) 112 | for n in range(0, len(images)): 113 | image[n]=crop(images[n]) 114 | 115 | 116 | # In[12]: 117 | 118 | 119 | print (image[22]) 120 | 121 | 122 | # In[13]: 123 | 124 | 125 | print (image[22].shape) 126 | 127 | 128 | # In[14]: 129 | 130 | 131 | # randomly splitting data into training(80%) and test(20%) sets 132 | test_ind=np.random.choice(range(3240), 648, replace=False) 133 | train_ind=np.delete(range(0,len(onlyfiles)),test_ind) 134 | 135 | 136 | # In[15]: 137 | 138 | 139 | # segregating the training and test images 140 | x_train=image[train_ind] 141 | y1_train=y[train_ind] 142 | x_test=image[test_ind] 143 | y1_test=y[test_ind] 144 | 145 | 146 | # In[16]: 147 | 148 | 149 | #reshaping the input images 150 | x_train = x_train.reshape(x_train.shape[0], 128, 150, 1) 151 | x_test = x_test.reshape(x_test.shape[0], 128, 150, 1) 152 | 153 | 154 | # In[17]: 155 | 156 | 157 | #converting data to float32 158 | x_train = x_train.astype('float32') 159 | x_test = x_test.astype('float32') 160 | 161 | 162 | # In[18]: 163 | 164 | 165 | #normalizing data 166 | x_train/=255 167 | x_test/=255 168 | #10 digits represent the 10 classes 169 | number_of_persons = 10 170 | 171 | 172 | # In[19]: 173 | 174 | 175 | #convert data to vectors 176 | y_train = np_utils.to_categorical(y1_train, number_of_persons) 177 | y_test = np_utils.to_categorical(y1_test, number_of_persons) 178 | 179 | 180 | # In[25]: 181 | 182 | 183 | # model building 184 | model = Sequential() 185 | model.add(Conv2D(16, (3, 3), input_shape=(128,150,1))) #Input layer 186 | model.add(Activation('relu')) # 'relu' as activation function 187 | model.add(Conv2D(16, (3, 3))) #first hidden layer 188 | model.add(Activation('relu')) 189 | model.add(MaxPooling2D(pool_size=(2,2))) # Maxpooling from (2,2) 190 | model.add(Conv2D(16,(3, 3))) # second hidden layer 191 | model.add(Activation('relu')) 192 | model.add(MaxPooling2D(pool_size=(2,2))) # Maxpooling from (2,2) 193 | model.add(Flatten()) #flatten the maxpooled data 194 | # Fully connected layer 195 | model.add(Dense(512)) 196 | 
model.add(Activation('relu')) 197 | model.add(Dropout(0.25)) #Dropout is applied to overcome overfitting 198 | model.add(Dense(10)) 199 | #output layer 200 | model.add(Activation('softmax')) # 'softmax' is used for SGD 201 | 202 | 203 | # In[26]: 204 | 205 | 206 | model.summary() 207 | 208 | 209 | # In[27]: 210 | 211 | 212 | #model compliation 213 | model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy']) 214 | 215 | 216 | # In[28]: 217 | 218 | 219 | # data augmentation to reduce overfitting problem 220 | gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, 221 | height_shift_range=0.08,zoom_range=0.08) 222 | test_gen = ImageDataGenerator() 223 | train_generator = gen.flow(x_train, y_train, batch_size=16) 224 | test_generator = test_gen.flow(x_test, y_test, batch_size=16) 225 | 226 | 227 | # In[29]: 228 | 229 | 230 | #model fitting 231 | model.fit_generator(train_generator, epochs=5, validation_data=test_generator) 232 | # Final evaluation of the model 233 | scores = model.evaluate(x_test, y_test, verbose=0) 234 | print("Recognition Error: %.2f%%" % (100-scores[1]*100)) 235 | 236 | -------------------------------------------------------------------------------- /CH08/code/Real+Estate+Prediction.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import mpl_toolkits 12 | from sklearn import preprocessing 13 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 14 | from sklearn.feature_selection import RFE 15 | from sklearn import linear_model 16 | from sklearn.cross_validation import train_test_split 17 | get_ipython().magic('matplotlib inline') 18 | 19 | 20 | # In[2]: 21 | 22 | 23 | pwd 24 | 25 | 26 | # In[3]: 27 | 28 | 29 | cd '/Users/Chanti/Desktop/Cookbook/Chapter 10' 30 | 31 | 32 | # In[4]: 33 | 34 | 35 | pwd 36 | 37 | 38 | # In[5]: 39 | 40 | 41 | dataframe = pd.read_csv("kc_house_data.csv", header='infer') 42 | 43 | 44 | # In[6]: 45 | 46 | 47 | list(dataframe) 48 | 49 | 50 | # In[7]: 51 | 52 | 53 | dataframe.head() 54 | 55 | 56 | # In[8]: 57 | 58 | 59 | dataframe.tail() 60 | 61 | 62 | # In[9]: 63 | 64 | 65 | dataframe.describe() 66 | 67 | 68 | # In[10]: 69 | 70 | 71 | dataframe['bedrooms'].value_counts().plot(kind='bar') 72 | plt.title('No. of bedrooms') 73 | plt.xlabel('Bedrooms') 74 | plt.ylabel('Count') 75 | sns.despine 76 | 77 | 78 | # In[11]: 79 | 80 | 81 | dataframe['bedrooms'].value_counts().plot(kind='pie') 82 | plt.title('No. of bedrooms') 83 | 84 | 85 | # In[12]: 86 | 87 | 88 | dataframe['floors'].value_counts().plot(kind='bar') 89 | plt.title('Number of floors') 90 | plt.xlabel('No. 
of floors') 91 | plt.ylabel('Count') 92 | sns.despine 93 | 94 | 95 | # In[13]: 96 | 97 | 98 | plt.figure(figsize=(20,20)) 99 | sns.jointplot(x=dataframe.lat.values, y=dataframe.long.values, size=9) 100 | plt.xlabel('Longitude', fontsize=10) 101 | plt.ylabel('Latitude', fontsize=10) 102 | plt.show() 103 | sns.despine() 104 | 105 | 106 | # In[14]: 107 | 108 | 109 | plt.figure(figsize=(20,20)) 110 | sns.jointplot(x=dataframe.lat.values, y=dataframe.long.values, size=9) 111 | plt.xlabel('Longitude', fontsize=10) 112 | plt.ylabel('Latitude', fontsize=10) 113 | plt.show() 114 | sns.despine() 115 | 116 | 117 | # In[15]: 118 | 119 | 120 | plt.figure(figsize=(8,8)) 121 | plt.scatter(dataframe.price, dataframe.sqft_living) 122 | plt.xlabel('Price') 123 | plt.ylabel('Square feet') 124 | plt.show() 125 | 126 | 127 | # In[16]: 128 | 129 | 130 | plt.figure(figsize=(5,5)) 131 | plt.bar(dataframe.condition, dataframe.price) 132 | plt.xlabel('Condition') 133 | plt.ylabel('Price') 134 | plt.show() 135 | 136 | 137 | # In[17]: 138 | 139 | 140 | plt.figure(figsize=(8,8)) 141 | plt.scatter(dataframe.zipcode, dataframe.price) 142 | plt.xlabel('Zipcode') 143 | plt.ylabel('Price') 144 | plt.show() 145 | 146 | 147 | # In[18]: 148 | 149 | 150 | plt.figure(figsize=(10,10)) 151 | plt.scatter(dataframe.grade, dataframe.price) 152 | plt.xlabel('Grade') 153 | plt.ylabel('Price') 154 | plt.show() 155 | 156 | 157 | # In[19]: 158 | 159 | 160 | x_df = dataframe.drop(['id','date',], axis = 1) 161 | x_df 162 | 163 | 164 | # In[20]: 165 | 166 | 167 | y = dataframe[['price']].copy() 168 | y_df = pd.DataFrame(y) 169 | y_df 170 | 171 | 172 | # In[21]: 173 | 174 | 175 | print('Price Vs Bedrooms: %s' % x_df['price'].corr(x_df['bedrooms'])) 176 | print('Price Vs Bathrooms: %s' % x_df['price'].corr(x_df['bathrooms'])) 177 | print('Price Vs Living Area: %s' % x_df['price'].corr(x_df['sqft_living'])) 178 | print('Price Vs Plot Area: %s' % x_df['price'].corr(x_df['sqft_lot'])) 179 | print('Price Vs No. 
of floors: %s' % x_df['price'].corr(x_df['floors'])) 180 | print('Price Vs Waterfront property: %s' % x_df['price'].corr(x_df['waterfront'])) 181 | print('Price Vs View: %s' % x_df['price'].corr(x_df['view'])) 182 | print('Price Vs Grade: %s' % x_df['price'].corr(x_df['grade'])) 183 | print('Price Vs Condition: %s' % x_df['price'].corr(x_df['condition'])) 184 | print('Price Vs Sqft Above: %s' % x_df['price'].corr(x_df['sqft_above'])) 185 | print('Price Vs Basement Area: %s' % x_df['price'].corr(x_df['sqft_basement'])) 186 | print('Price Vs Year Built: %s' % x_df['price'].corr(x_df['yr_built'])) 187 | print('Price Vs Year Renovated: %s' % x_df['price'].corr(x_df['yr_renovated'])) 188 | print('Price Vs Zipcode: %s' % x_df['price'].corr(x_df['zipcode'])) 189 | print('Price Vs Latitude: %s' % x_df['price'].corr(x_df['lat'])) 190 | print('Price Vs Longitude: %s' % x_df['price'].corr(x_df['long'])) 191 | 192 | 193 | # In[22]: 194 | 195 | 196 | x_df.corr().iloc[:,-19] 197 | 198 | 199 | # In[23]: 200 | 201 | 202 | sns.pairplot(data=x_df, 203 | x_vars=['price'], 204 | y_vars=['bedrooms', 'bathrooms', 'sqft_living', 205 | 'sqft_lot', 'floors', 'waterfront','view', 206 | 'grade','condition','sqft_above','sqft_basement', 207 | 'yr_built','yr_renovated','zipcode','lat','long'], 208 | size = 5) 209 | 210 | 211 | # In[24]: 212 | 213 | 214 | x_df2 = x_df.drop(['price'], axis = 1) 215 | 216 | 217 | # In[25]: 218 | 219 | 220 | reg=linear_model.LinearRegression() 221 | 222 | 223 | # In[26]: 224 | 225 | 226 | x_train,x_test,y_train,y_test = train_test_split(x_df2,y_df,test_size=0.4,random_state=4) 227 | 228 | 229 | # In[27]: 230 | 231 | 232 | reg.fit(x_train,y_train) 233 | 234 | 235 | # In[28]: 236 | 237 | 238 | reg.coef_ 239 | 240 | 241 | # In[29]: 242 | 243 | 244 | predictions=reg.predict(x_test) 245 | predictions 246 | 247 | 248 | # In[30]: 249 | 250 | 251 | reg.score(x_test,y_test) 252 | 253 | 254 | # In[31]: 255 | 256 | 257 | import xgboost 258 | 259 | 260 | # In[91]: 261 | 262 | 263 | new_model = xgboost.XGBRegressor(n_estimators=750, learning_rate=0.01, gamma=0, subsample=0.55, colsample_bytree=1, max_depth=10) 264 | 265 | 266 | # In[92]: 267 | 268 | 269 | from sklearn.model_selection import train_test_split 270 | 271 | 272 | # In[93]: 273 | 274 | 275 | traindf, testdf = train_test_split(x_train, test_size = 0.2) 276 | new_model.fit(x_train,y_train) 277 | 278 | 279 | # In[94]: 280 | 281 | 282 | from sklearn.metrics import explained_variance_score 283 | predictions = new_model.predict(x_test) 284 | print(explained_variance_score(predictions,y_test)) 285 | 286 | -------------------------------------------------------------------------------- /CH07/code/Natural+Language+Processing+-+ChatBot.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("Natural Language Processing") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | df = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('TherapyBotSession.csv') 12 | 13 | 14 | # In[3]: 15 | 16 | df.show() 17 | 18 | 19 | # In[4]: 20 | 21 | df = df.select('id', 'label', 'chat') 22 | 23 | 24 | # In[5]: 25 | 26 | df.show() 27 | 28 | 29 | # In[6]: 30 | 31 | df.groupBy("label") .count() .orderBy("count", ascending = False) .show() 32 | 33 | 34 | # In[7]: 35 | 36 | import pyspark.sql.functions as F 37 | df = 
df.withColumn('word_count',F.size(F.split(F.col('chat'),' '))) 38 | 39 | 40 | # In[8]: 41 | 42 | df.show() 43 | 44 | 45 | # In[9]: 46 | 47 | df.groupBy('label') .agg(F.avg('word_count').alias('avg_word_count')) .orderBy('avg_word_count', ascending = False) .show() 48 | 49 | 50 | # In[10]: 51 | 52 | df_plot = df.select('id', 'word_count').toPandas() 53 | 54 | 55 | # In[11]: 56 | 57 | import matplotlib.pyplot as plt 58 | get_ipython().magic('matplotlib inline') 59 | 60 | df_plot.set_index('id', inplace=True) 61 | df_plot.plot(kind='bar', figsize=(16, 6)) 62 | plt.ylabel('Word Count') 63 | plt.title('Word Count distribution') 64 | plt.show() 65 | 66 | 67 | # In[12]: 68 | 69 | from textblob import TextBlob 70 | def sentiment_score(chat): 71 | return TextBlob(chat).sentiment.polarity 72 | 73 | 74 | # In[13]: 75 | 76 | from pyspark.sql.types import FloatType 77 | sentiment_score_udf = F.udf(lambda x: sentiment_score(x), FloatType()) 78 | 79 | 80 | # In[14]: 81 | 82 | df = df.select('id', 'label', 'chat','word_count', 83 | sentiment_score_udf('chat').alias('sentiment_score')) 84 | df.show() 85 | 86 | 87 | # In[15]: 88 | 89 | df.groupBy('label') .agg(F.avg('sentiment_score').alias('avg_sentiment_score')) .orderBy('avg_sentiment_score', ascending = False) .show() 90 | 91 | 92 | # In[16]: 93 | 94 | df = df.withColumn('words',F.split(F.col('chat'),' ')) 95 | df.show() 96 | 97 | 98 | # In[17]: 99 | 100 | stop_words = ['i','me','my','myself','we','our','ours','ourselves', 101 | 'you','your','yours','yourself','yourselves','he','him', 102 | 'his','himself','she','her','hers','herself','it','its', 103 | 'itself','they','them','their','theirs','themselves', 104 | 'what','which','who','whom','this','that','these','those', 105 | 'am','is','are','was','were','be','been','being','have', 106 | 'has','had','having','do','does','did','doing','a','an', 107 | 'the','and','but','if','or','because','as','until','while', 108 | 'of','at','by','for','with','about','against','between', 109 | 'into','through','during','before','after','above','below', 110 | 'to','from','up','down','in','out','on','off','over','under', 111 | 'again','further','then','once','here','there','when','where', 112 | 'why','how','all','any','both','each','few','more','most', 113 | 'other','some','such','no','nor','not','only','own','same', 114 | 'so','than','too','very','can','will','just','don','should','now'] 115 | 116 | 117 | # In[18]: 118 | 119 | from pyspark.ml.feature import StopWordsRemover 120 | 121 | 122 | # In[19]: 123 | 124 | stopwordsRemovalFeature = StopWordsRemover(inputCol="words", 125 | outputCol="words without stop").setStopWords(stop_words) 126 | 127 | 128 | # In[20]: 129 | 130 | from pyspark.ml import Pipeline 131 | stopWordRemovalPipeline = Pipeline(stages=[stopwordsRemovalFeature]) 132 | pipelineFitRemoveStopWords = stopWordRemovalPipeline.fit(df) 133 | 134 | 135 | # In[21]: 136 | 137 | df = pipelineFitRemoveStopWords.transform(df) 138 | df.select('words', 'words without stop').show(5) 139 | 140 | 141 | # In[22]: 142 | 143 | label = F.udf(lambda x: 1.0 if x == 'escalate' else 0.0, FloatType()) 144 | df = df.withColumn('label', label('label')) 145 | 146 | 147 | # In[23]: 148 | 149 | df.select('label').show() 150 | 151 | 152 | # In[24]: 153 | 154 | import pyspark.ml.feature as feat 155 | TF_ = feat.HashingTF(inputCol="words without stop", 156 | outputCol="rawFeatures", numFeatures=100000) 157 | IDF_ = feat.IDF(inputCol="rawFeatures", outputCol="features") 158 | 159 | 160 | # In[25]: 161 | 162 | pipelineTFIDF = 
Pipeline(stages=[TF_, IDF_]) 163 | 164 | 165 | # In[26]: 166 | 167 | pipelineFit = pipelineTFIDF.fit(df) 168 | df = pipelineFit.transform(df) 169 | 170 | 171 | # In[27]: 172 | 173 | df.select('label', 'rawFeatures','features').show() 174 | 175 | 176 | # In[28]: 177 | 178 | (trainingDF, testDF) = df.randomSplit([0.75, 0.25], seed = 1234) 179 | 180 | 181 | # In[29]: 182 | 183 | from pyspark.ml.classification import LogisticRegression 184 | logreg = LogisticRegression(regParam=0.025) 185 | 186 | 187 | # In[30]: 188 | 189 | logregModel = logreg.fit(trainingDF) 190 | 191 | 192 | # In[31]: 193 | 194 | predictionDF = logregModel.transform(testDF) 195 | 196 | 197 | # In[32]: 198 | 199 | predictionDF.select('label', 'probability', 'prediction').show() 200 | 201 | 202 | # In[33]: 203 | 204 | predictionDF.crosstab('label', 'prediction').show() 205 | 206 | 207 | # In[34]: 208 | 209 | from sklearn import metrics 210 | actual = predictionDF.select('label').toPandas() 211 | predicted = predictionDF.select('prediction').toPandas() 212 | 213 | 214 | # In[35]: 215 | 216 | print('accuracy score: {}%'.format(round(metrics.accuracy_score(actual, predicted),3)*100)) 217 | 218 | 219 | # In[36]: 220 | 221 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 222 | 223 | scores = predictionDF.select('label', 'rawPrediction') 224 | evaluator = BinaryClassificationEvaluator() 225 | print('The ROC score is {}%'.format(round(evaluator.evaluate(scores),3)*100)) 226 | 227 | 228 | # In[37]: 229 | 230 | predictionDF.describe('label').show() 231 | 232 | 233 | # In[ ]: 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /CH09/code/Predicting+Apple+Stock+Market+Value.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("StockMarket") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | df =spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('AAPL.csv') 12 | 13 | 14 | # In[3]: 15 | 16 | df.show() 17 | 18 | 19 | # In[4]: 20 | 21 | import pyspark.sql.functions as f 22 | df = df.withColumn('date', f.to_date('Date')) 23 | 24 | 25 | # In[5]: 26 | 27 | df.show(n=5) 28 | 29 | 30 | # In[6]: 31 | 32 | date_breakdown = ['year', 'month', 'day'] 33 | for i in enumerate(date_breakdown): 34 | index = i[0] 35 | name = i[1] 36 | df = df.withColumn(name, f.split('date', '-')[index]) 37 | 38 | 39 | # In[7]: 40 | 41 | df.show(n=10) 42 | 43 | 44 | # In[8]: 45 | 46 | df_plot = df.select('year', 'Adj Close').toPandas() 47 | 48 | 49 | # In[9]: 50 | 51 | from matplotlib import pyplot as plt 52 | get_ipython().magic('matplotlib inline') 53 | 54 | df_plot.set_index('year', inplace=True) 55 | df_plot.plot(figsize=(16, 6), grid=True) 56 | plt.title('Apple stock') 57 | plt.ylabel('Stock Quote ($)') 58 | plt.show() 59 | 60 | 61 | # In[10]: 62 | 63 | df.toPandas().shape 64 | 65 | 66 | # In[11]: 67 | 68 | df.dropna().count() 69 | 70 | 71 | # In[12]: 72 | 73 | df.select('Open', 'High', 'Low', 'Close', 'Adj Close').describe().show() 74 | 75 | 76 | # In[13]: 77 | 78 | df.groupBy(['year']).agg({'Adj Close':'count'}) .withColumnRenamed('count(Adj Close)', 'Row Count') .orderBy(["year"],ascending=False) .show() 79 | 80 | 81 | # In[14]: 82 | 83 | trainDF = df[df.year < 2017] 84 | testDF = df[df.year > 2016] 85 | 86 | 87 | # In[15]: 88 | 89 | trainDF.toPandas().shape 90 | 91 | 92 | # In[16]: 
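# trainDF covers trading days through 2016 and testDF covers 2017 onward (a time-based split),
# so the shape check below reports how many rows fall in the hold-out period.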
93 | 94 | testDF.toPandas().shape 95 | 96 | 97 | # In[17]: 98 | 99 | trainDF_plot = trainDF.select('year', 'Adj Close').toPandas() 100 | trainDF_plot.set_index('year', inplace=True) 101 | trainDF_plot.plot(figsize=(16, 6), grid=True) 102 | plt.title('Apple Stock 2000-2016') 103 | plt.ylabel('Stock Quote ($)') 104 | plt.show() 105 | 106 | 107 | # In[18]: 108 | 109 | testDF_plot = testDF.select('year', 'Adj Close').toPandas() 110 | testDF_plot.set_index('year', inplace=True) 111 | testDF_plot.plot(figsize=(16, 6), grid=True) 112 | plt.title('Apple Stock 2017-2018') 113 | plt.ylabel('Stock Quote ($)') 114 | plt.show() 115 | 116 | 117 | # In[19]: 118 | 119 | import numpy as np 120 | trainArray = np.array(trainDF.select('Open', 'High', 'Low', 'Close','Volume', 'Adj Close' ).collect()) 121 | testArray = np.array(testDF.select('Open', 'High', 'Low', 'Close','Volume', 'Adj Close' ).collect()) 122 | 123 | 124 | # In[20]: 125 | 126 | print(trainArray[0]) 127 | print('-------------') 128 | print(testArray[0]) 129 | 130 | 131 | # In[21]: 132 | 133 | from sklearn.preprocessing import MinMaxScaler 134 | minMaxScale = MinMaxScaler() 135 | 136 | 137 | # In[22]: 138 | 139 | minMaxScale.fit(trainArray) 140 | 141 | 142 | # In[23]: 143 | 144 | testingArray = minMaxScale.transform(testArray) 145 | trainingArray = minMaxScale.transform(trainArray) 146 | 147 | 148 | # In[24]: 149 | 150 | print(testingArray[0]) 151 | print('--------------') 152 | print(trainingArray[0]) 153 | 154 | 155 | # In[25]: 156 | 157 | xtrain = trainingArray[:, 0:-1] 158 | xtest = testingArray[:, 0:-1] 159 | # ytrain = trainingArray[:, 5] 160 | # ytest = testingArray[:, 5] 161 | ytrain = trainingArray[:, -1:] 162 | ytest = testingArray[:, -1:] 163 | 164 | 165 | # In[26]: 166 | 167 | trainingArray[0] 168 | 169 | 170 | # In[27]: 171 | 172 | xtrain[0] 173 | 174 | 175 | # In[28]: 176 | 177 | ytrain[0] 178 | 179 | 180 | # In[29]: 181 | 182 | print('xtrain shape = {}'.format(xtrain.shape)) 183 | print('xtest shape = {}'.format(xtest.shape)) 184 | print('ytrain shape = {}'.format(ytrain.shape)) 185 | print('ytest shape = {}'.format(ytest.shape)) 186 | 187 | 188 | # In[30]: 189 | 190 | plt.figure(figsize=(16,6)) 191 | plt.plot(xtrain[:,0],color='red', label='open') 192 | plt.plot(xtrain[:,1],color='blue', label='high') 193 | plt.plot(xtrain[:,2],color='green', label='low') 194 | plt.plot(xtrain[:,3],color='purple', label='close') 195 | plt.legend(loc = 'upper left') 196 | plt.title('Open, High, Low, and Close by Day') 197 | plt.xlabel('Days') 198 | plt.ylabel('Scaled Quotes') 199 | plt.show() 200 | 201 | 202 | # In[31]: 203 | 204 | plt.figure(figsize=(16,6)) 205 | plt.plot(xtrain[:,4],color='black', label='volume') 206 | plt.legend(loc = 'upper right') 207 | plt.title('Volume by Day') 208 | plt.xlabel('Days') 209 | plt.ylabel('Scaled Volume') 210 | plt.show() 211 | 212 | 213 | # In[32]: 214 | 215 | from keras import models, layers 216 | 217 | 218 | # In[33]: 219 | 220 | model = models.Sequential() 221 | model.add(layers.LSTM(1, input_shape=(1,5))) 222 | model.add(layers.Dense(1)) 223 | model.compile(loss='mean_squared_error', optimizer='adam') 224 | 225 | 226 | # In[34]: 227 | 228 | xtrain = xtrain.reshape((xtrain.shape[0], 1, xtrain.shape[1])) 229 | xtest = xtest.reshape((xtest.shape[0], 1, xtest.shape[1])) 230 | 231 | 232 | # In[35]: 233 | 234 | print('The shape of xtrain is {}: '.format(xtrain.shape)) 235 | print('The shape of xtest is {}: '.format(xtest.shape)) 236 | 237 | 238 | # In[36]: 239 | 240 | loss = model.fit(xtrain, ytrain, 
batch_size=10, epochs=100) 241 | 242 | 243 | # In[37]: 244 | 245 | plt.plot(loss.history['loss'], label = 'loss') 246 | plt.title('mean squared error by epoch') 247 | plt.legend() 248 | plt.show() 249 | 250 | 251 | # In[38]: 252 | 253 | predicted = model.predict(xtest) 254 | 255 | 256 | # In[39]: 257 | 258 | combined_array = np.concatenate((ytest, predicted), axis = 1) 259 | 260 | 261 | # In[40]: 262 | 263 | plt.figure(figsize=(16,6)) 264 | plt.plot(combined_array[:,0],color='red', label='actual') 265 | plt.plot(combined_array[:,1],color='blue', label='predicted') 266 | plt.legend(loc = 'lower right') 267 | plt.title('2017 Actual vs. Predicted APPL Stock') 268 | plt.xlabel('Days') 269 | plt.ylabel('Scaled Quotes') 270 | plt.show() 271 | 272 | 273 | # In[41]: 274 | 275 | import sklearn.metrics as metrics 276 | np.sqrt(metrics.mean_squared_error(ytest,predicted)) 277 | 278 | 279 | # In[ ]: 280 | 281 | 282 | 283 | -------------------------------------------------------------------------------- /CH05/code/Predicting+Fire+Dept+Calls+with+Spark+ML.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | # In[2]: 10 | 11 | spark = SparkSession.builder .master("local") .appName("Predicting Fire Dept Calls") .config("spark.executor.memory", "6gb") .getOrCreate() 12 | 13 | 14 | # In[3]: 15 | 16 | df = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('Fire_Department_Calls_for_Service.csv') 17 | 18 | 19 | # In[4]: 20 | 21 | df.show(2) 22 | 23 | 24 | # In[5]: 25 | 26 | df.select('Call Type Group').distinct().show() 27 | 28 | 29 | # In[6]: 30 | 31 | df.groupBy('Call Type Group').count().show() 32 | 33 | 34 | # In[7]: 35 | 36 | df2 = df.groupBy('Call Type Group').count() 37 | 38 | 39 | # In[8]: 40 | 41 | graphDF = df2.toPandas() 42 | graphDF = graphDF.sort_values('count', ascending=False) 43 | 44 | 45 | # In[9]: 46 | 47 | import matplotlib.pyplot as plt 48 | get_ipython().magic('matplotlib inline') 49 | 50 | 51 | # In[10]: 52 | 53 | graphDF.plot(x='Call Type Group', y = 'count', kind='bar') 54 | plt.title('Call Type Group by Count') 55 | plt.show() 56 | 57 | 58 | # In[11]: 59 | 60 | df.groupBy('Call Type').count().orderBy('count', ascending=False).show(100) 61 | 62 | 63 | # In[12]: 64 | 65 | from pyspark.sql import functions as F 66 | fireIndicator = df.select(df["Call Type"],F.when(df["Call Type"].like("%Fire%"),1) .otherwise(0).alias('Fire Indicator')) 67 | fireIndicator.show() 68 | 69 | 70 | # In[13]: 71 | 72 | fireIndicator.groupBy('Fire Indicator').count().show() 73 | 74 | 75 | # In[14]: 76 | 77 | df = df.withColumn("fireIndicator", F.when(df["Call Type"].like("%Fire%"),1).otherwise(0)) 78 | 79 | 80 | # In[15]: 81 | 82 | df.printSchema() 83 | 84 | 85 | # In[16]: 86 | 87 | df.select('Call Type', 'fireIndicator').show(20) 88 | 89 | 90 | # In[17]: 91 | 92 | df = df.select('fireIndicator', 93 | 'Zipcode of Incident', 94 | 'Battalion', 95 | 'Station Area', 96 | 'Box', 97 | 'Number of Alarms', 98 | 'Unit sequence in call dispatch', 99 | 'Neighborhooods - Analysis Boundaries', 100 | 'Fire Prevention District', 101 | 'Supervisor District') 102 | df.show(5) 103 | 104 | 105 | # In[18]: 106 | 107 | print('Total Rows') 108 | df.count() 109 | 110 | 111 | # In[19]: 112 | 113 | print('Rows without Null values') 114 | df.dropna().count() 115 | 116 | 117 | # In[20]: 118 | 119 | print('Row with Null Values') 120 | df.count()-df.dropna().count() 
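# A minimal sketch for locating the nulls per column rather than per row,
# reusing the functions module already imported above as F:
df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).show()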
121 | 122 | 123 | # In[21]: 124 | 125 | df = df.dropna() 126 | 127 | 128 | # In[22]: 129 | 130 | df.groupBy('fireIndicator').count().orderBy('count', ascending = False).show() 131 | 132 | 133 | # In[23]: 134 | 135 | from pyspark.ml.feature import StringIndexer 136 | 137 | 138 | # In[24]: 139 | 140 | column_names = df.columns[1:] 141 | column_names 142 | 143 | 144 | # In[25]: 145 | 146 | categoricalColumns = column_names 147 | indexers = [] 148 | for categoricalCol in categoricalColumns: 149 | stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"_Index") 150 | indexers += [stringIndexer] 151 | 152 | 153 | # In[26]: 154 | 155 | models = [] 156 | for model in indexers: 157 | indexer_model = model.fit(df) 158 | models+=[indexer_model] 159 | 160 | for i in models: 161 | df = i.transform(df) 162 | 163 | 164 | # In[27]: 165 | 166 | df.columns 167 | 168 | 169 | # In[28]: 170 | 171 | df.select('Neighborhooods - Analysis Boundaries', 'Neighborhooods - Analysis Boundaries_Index').show() 172 | 173 | 174 | # In[29]: 175 | 176 | df = df.select( 177 | 'fireIndicator', 178 | 'Zipcode of Incident_Index', 179 | 'Battalion_Index', 180 | 'Station Area_Index', 181 | 'Box_Index', 182 | 'Number of Alarms_Index', 183 | 'Unit sequence in call dispatch_Index', 184 | 'Neighborhooods - Analysis Boundaries_Index', 185 | 'Fire Prevention District_Index', 186 | 'Supervisor District_Index') 187 | 188 | 189 | # In[30]: 190 | 191 | df.printSchema() 192 | 193 | 194 | # In[31]: 195 | 196 | df.show(5) 197 | 198 | 199 | # In[32]: 200 | 201 | features = df.columns[1:] 202 | 203 | 204 | # In[33]: 205 | 206 | from pyspark.ml.feature import VectorAssembler 207 | 208 | feature_vectors = VectorAssembler( 209 | inputCols = features, 210 | outputCol = "features") 211 | 212 | 213 | # In[34]: 214 | 215 | df = feature_vectors.transform(df) 216 | 217 | 218 | # In[35]: 219 | 220 | df.columns 221 | 222 | 223 | # In[36]: 224 | 225 | df = df.drop( 'Zipcode of Incident_Index', 226 | 'Battalion_Index', 227 | 'Station Area_Index', 228 | 'Box_Index', 229 | 'Number of Alarms_Index', 230 | 'Unit sequence in call dispatch_Index', 231 | 'Neighborhooods - Analysis Boundaries_Index', 232 | 'Fire Prevention District_Index', 233 | 'Supervisor District_Index') 234 | 235 | 236 | # In[37]: 237 | 238 | df = df.withColumnRenamed('fireIndicator', 'label') 239 | 240 | 241 | # In[38]: 242 | 243 | df.show() 244 | 245 | 246 | # In[39]: 247 | 248 | (trainDF, testDF) = df.randomSplit([0.75, 0.25], seed = 12345) 249 | 250 | 251 | # In[40]: 252 | 253 | print(trainDF.count()) 254 | print(testDF.count()) 255 | 256 | 257 | # In[41]: 258 | 259 | from pyspark.ml.classification import LogisticRegression 260 | logreg = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10) 261 | LogisticRegressionModel = logreg.fit(trainDF) 262 | 263 | 264 | # In[42]: 265 | 266 | df_predicted = LogisticRegressionModel.transform(testDF) 267 | 268 | 269 | # In[43]: 270 | 271 | df_predicted.printSchema() 272 | 273 | 274 | # In[44]: 275 | 276 | df_predicted.show(5) 277 | 278 | 279 | # In[45]: 280 | 281 | df_predicted.crosstab('label', 'prediction').show() 282 | 283 | 284 | # In[46]: 285 | 286 | from sklearn import metrics 287 | 288 | 289 | # In[47]: 290 | 291 | actual = df_predicted.select('label').toPandas() 292 | 293 | 294 | # In[48]: 295 | 296 | predicted = df_predicted.select('prediction').toPandas() 297 | 298 | 299 | # In[49]: 300 | 301 | metrics.accuracy_score(actual, predicted) 302 | 303 | 304 | # In[50]: 305 | 306 | 
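# Accuracy alone can look optimistic here because the fire indicator label is
# imbalanced, which is what the class counts below illustrate. A minimal sketch
# of an AUC check on the same predictions, assuming the default rawPrediction
# column produced by LogisticRegression:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
print('Area under ROC: {}'.format(evaluator.evaluate(df_predicted)))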
df_predicted.groupBy('label').count().show() 307 | 308 | 309 | # In[51]: 310 | 311 | df_predicted.describe('label').show() 312 | 313 | -------------------------------------------------------------------------------- /CH11/code/CH11_Words+to+Vectors.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | pwd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | cd '/Users/Chanti/Desktop/USF' 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | from __future__ import absolute_import, division, print_function 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | import codecs 26 | import glob 27 | import logging 28 | import multiprocessing 29 | import os 30 | import pprint 31 | import re 32 | 33 | 34 | # In[5]: 35 | 36 | 37 | import nltk 38 | import gensim.models.word2vec as w2v 39 | import sklearn.manifold 40 | import numpy as np 41 | import matplotlib.pyplot as plt 42 | import pandas as pd 43 | import seaborn as sns 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | get_ipython().magic('pylab inline') 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | nltk.download("punkt") 62 | nltk.download("stopwords") 63 | 64 | 65 | # In[9]: 66 | 67 | 68 | book_names = sorted(glob.glob("./*.txt")) 69 | 70 | 71 | # In[10]: 72 | 73 | 74 | print("Found books:") 75 | book_names 76 | 77 | 78 | # In[11]: 79 | 80 | 81 | corpus = u'' 82 | for book_name in book_names: 83 | print("Reading '{0}'...".format(book_name)) 84 | with codecs.open(book_name,"r","Latin1") as book_file: 85 | corpus += book_file.read() 86 | print("Corpus is now {0} characters long".format(len(corpus))) 87 | print() 88 | 89 | 90 | # In[12]: 91 | 92 | 93 | #Load the English pickle tokenizer from punkt 94 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 95 | 96 | 97 | # In[13]: 98 | 99 | 100 | #Tokenize the corpus into sentences 101 | raw_sentences = tokenizer.tokenize(corpus) 102 | 103 | 104 | # In[14]: 105 | 106 | 107 | #Convert sentences into list of words 108 | #remove unecessary characters, split into words, remove hyphens and special characters 109 | def sentence_to_wordlist(raw): 110 | clean = re.sub("[^a-zA-Z]"," ", raw) 111 | words = clean.split() 112 | return words 113 | 114 | 115 | # In[15]: 116 | 117 | 118 | #for each sentence, sentences where each word is tokenized 119 | sentences = [] 120 | for raw_sentence in raw_sentences: 121 | if len(raw_sentence) > 0: 122 | sentences.append(sentence_to_wordlist(raw_sentence)) 123 | 124 | 125 | # In[16]: 126 | 127 | 128 | print(raw_sentences[50]) 129 | print(sentence_to_wordlist(raw_sentences[50])) 130 | 131 | 132 | # In[17]: 133 | 134 | 135 | #count tokens, each one being a sentence 136 | token_count = sum([len(sentence) for sentence in sentences]) 137 | print("The book corpus contains {0:,} tokens".format(token_count)) 138 | 139 | 140 | # In[18]: 141 | 142 | 143 | #Define hyperparameters 144 | 145 | # Dimensionality of the resulting word vectors. 146 | num_features = 300 147 | 148 | # Minimum word count threshold. 149 | min_word_count = 3 150 | 151 | # Number of threads to run in parallel. 152 | num_workers = multiprocessing.cpu_count() 153 | 154 | # Context window length. 155 | context_size = 7 156 | 157 | # Downsample setting for frequent words. 158 | downsampling = 1e-3 159 | 160 | # Seed for the RNG, to make the results reproducible. 
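# (with gensim, a seed by itself does not make training fully deterministic;
#  a reproducible run also needs workers=1 and a fixed PYTHONHASHSEED)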
161 | seed = 1 162 | 163 | 164 | # In[19]: 165 | 166 | 167 | got2vec = w2v.Word2Vec( 168 | sg=1, 169 | seed=seed, 170 | workers=num_workers, 171 | size=num_features, 172 | min_count=min_word_count, 173 | window=context_size, 174 | sample=downsampling 175 | ) 176 | 177 | 178 | # In[20]: 179 | 180 | 181 | got2vec.build_vocab(sentences,progress_per=10000, keep_raw_vocab=False, trim_rule=None) 182 | 183 | 184 | # In[21]: 185 | 186 | 187 | #train model on sentences 188 | got2vec.train(sentences, total_examples=got2vec.corpus_count, 189 | total_words=None, epochs=got2vec.iter, 190 | start_alpha=None, end_alpha=None, word_count=0, 191 | queue_factor=2, report_delay=1.0, compute_loss=False) 192 | 193 | 194 | # In[22]: 195 | 196 | 197 | #save model 198 | if not os.path.exists("trained"): 199 | os.makedirs("trained") 200 | 201 | 202 | # In[23]: 203 | 204 | 205 | got2vec.wv.save(os.path.join("trained", "got2vec.w2v"), ignore=[]) 206 | 207 | 208 | # In[24]: 209 | 210 | 211 | #load model 212 | got2vec = w2v.KeyedVectors.load(os.path.join("trained", "got2vec.w2v")) 213 | 214 | 215 | # In[25]: 216 | 217 | 218 | #Squash dimensionality to 2 219 | tsne = sklearn.manifold.TSNE(n_components=2, random_state=0) 220 | 221 | 222 | # In[26]: 223 | 224 | 225 | #Put all the word vectors into one big matrix 226 | all_word_vectors_matrix = got2vec.wv.syn0 227 | 228 | 229 | # In[27]: 230 | 231 | 232 | print (all_word_vectors_matrix) 233 | 234 | 235 | # In[28]: 236 | 237 | 238 | #train tsne 239 | all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix) 240 | 241 | 242 | # In[29]: 243 | 244 | 245 | #plot point in 2d space 246 | points = pd.DataFrame( 247 | [ 248 | (word, coords[0], coords[1]) 249 | for word, coords in [ 250 | (word, all_word_vectors_matrix_2d[got2vec.vocab[word].index]) 251 | for word in got2vec.vocab 252 | ] 253 | ], 254 | columns=["word", "x", "y"] 255 | ) 256 | 257 | 258 | # In[30]: 259 | 260 | 261 | points.head(20) 262 | 263 | 264 | # In[31]: 265 | 266 | 267 | # Plotting using the seaborn library 268 | sns.set_context("poster") 269 | 270 | 271 | # In[32]: 272 | 273 | 274 | points.plot.scatter("x", "y", s=10, figsize=(10, 10)) 275 | 276 | 277 | # In[33]: 278 | 279 | 280 | def plot_region(x_bounds, y_bounds): 281 | slice = points[ 282 | (x_bounds[0] <= points.x) & 283 | (points.x <= x_bounds[1]) & 284 | (y_bounds[0] <= points.y) & 285 | (points.y <= y_bounds[1]) 286 | ] 287 | 288 | ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8)) 289 | for i, point in slice.iterrows(): 290 | ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11) 291 | 292 | 293 | # In[34]: 294 | 295 | 296 | plot_region(x_bounds=(20.0, 25.0), y_bounds=(15.5, 20.0)) 297 | 298 | 299 | # In[35]: 300 | 301 | 302 | plot_region(x_bounds=(4, 41), y_bounds=(-0.5, -0.1)) 303 | 304 | 305 | # In[36]: 306 | 307 | 308 | plot_region(x_bounds=(10, 15), y_bounds=(5, 10)) 309 | 310 | 311 | # In[37]: 312 | 313 | 314 | got2vec.most_similar("Stark") 315 | 316 | 317 | # In[38]: 318 | 319 | 320 | got2vec.most_similar("Lannister") 321 | 322 | 323 | # In[39]: 324 | 325 | 326 | got2vec.most_similar("Jon") 327 | 328 | 329 | # In[40]: 330 | 331 | 332 | #distance, similarity, and ranking 333 | def nearest_similarity_cosmul(start1, end1, end2): 334 | similarities = got2vec.most_similar_cosmul( 335 | positive=[end2, start1], 336 | negative=[end1] 337 | ) 338 | start2 = similarities[0][0] 339 | print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals())) 340 | return start2 341 | 342 | 343 | # In[41]: 
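# The cosmul queries below answer analogies of the form
# "start1 is to end1 as ? is to end2": they look for the word whose vector best
# matches vec(start1) - vec(end1) + vec(end2), scored with the multiplicative
# 3CosMul objective rather than plain cosine addition. An additive version of
# the first query, assuming the same trained got2vec model, would be:
#
#     got2vec.most_similar(positive=['Stark', 'Riverrun'], negative=['Winterfell'], topn=1)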
344 | 345 | 346 | nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun") 347 | nearest_similarity_cosmul("Jaime", "sword", "wine") 348 | nearest_similarity_cosmul("Arya", "Nymeria", "dragons") 349 | 350 | -------------------------------------------------------------------------------- /CH06/code/CH06_LSTMs+word+level.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | from numpy import array 8 | from pickle import dump 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.utils import to_categorical 11 | from keras.models import Sequential 12 | from keras.layers import Dense 13 | from keras.layers import LSTM 14 | from keras.layers import Embedding 15 | 16 | 17 | # In[2]: 18 | 19 | 20 | pwd 21 | 22 | 23 | # In[3]: 24 | 25 | 26 | cd '/Users/Chanti/Desktop/Cookbook/Chapter 8' 27 | 28 | 29 | # In[4]: 30 | 31 | 32 | pwd 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | # load doc into memory 39 | def load_document(name): 40 | file = open(name, 'r') 41 | text = file.read() 42 | file.close() 43 | return text 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | # load document 50 | input_filename = 'junglebook.txt' 51 | doc = load_document(input_filename) 52 | print(doc[:2000]) 53 | 54 | 55 | # In[7]: 56 | 57 | 58 | import string 59 | 60 | # turn a document into clean tokens 61 | def clean_document(doc): 62 | doc = doc.replace('--', ' ') 63 | tokens = doc.split() 64 | table = str.maketrans('', '', string.punctuation) 65 | tokens = [w.translate(table) for w in tokens] 66 | tokens = [word for word in tokens if word.isalpha()] 67 | tokens = [word.lower() for word in tokens] 68 | return tokens 69 | 70 | 71 | # In[8]: 72 | 73 | 74 | # clean document 75 | tokens = clean_document(doc) 76 | print(tokens[:200]) 77 | print('Total Tokens: %d' % len(tokens)) 78 | print('Unique Tokens: %d' % len(set(tokens))) 79 | 80 | 81 | # In[9]: 82 | 83 | 84 | # organize into sequences (of length 50) of tokens 85 | length = 50 + 1 86 | sequences = list() 87 | for i in range(length, len(tokens)): 88 | # select sequence of tokens 89 | seq = tokens[i-length:i] 90 | # convert into a line 91 | line = ' '.join(seq) 92 | sequences.append(line) 93 | print('Total Sequences: %d' % len(sequences)) 94 | 95 | 96 | # In[10]: 97 | 98 | 99 | # save tokens to file, one dialog per line 100 | def save_document(lines, name): 101 | data = '\n'.join(lines) 102 | file = open(name, 'w') 103 | file.write(data) 104 | file.close() 105 | 106 | 107 | # In[11]: 108 | 109 | 110 | # save sequences to file 111 | output_filename = 'junglebook_sequences.txt' 112 | save_document(sequences, output_filename) 113 | 114 | 115 | # In[12]: 116 | 117 | 118 | # load document into memory 119 | def load_document(name): 120 | file = open(name, 'r') 121 | text = file.read() 122 | file.close() 123 | return text 124 | 125 | # load 126 | input_filename = 'junglebook_sequences.txt' 127 | doc = load_document(input_filename) 128 | lines = doc.split('\n') 129 | 130 | 131 | # In[13]: 132 | 133 | 134 | # integer encode sequences of words 135 | tokenizer = Tokenizer() 136 | tokenizer.fit_on_texts(lines) 137 | sequences = tokenizer.texts_to_sequences(lines) 138 | 139 | 140 | # In[14]: 141 | 142 | 143 | # vocabulary size 144 | vocab_size = len(tokenizer.word_index) + 1 145 | 146 | 147 | # In[15]: 148 | 149 | 150 | # separate into input and output 151 | sequences = array(sequences) 152 | Input, Output = sequences[:,:-1], sequences[:,-1] 153 | Output = to_categorical(Output, num_classes=vocab_size) 154 | 
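# At this point each row of Input holds the 50 preceding word indices and each
# row of Output is the one-hot encoding of the word that follows them, so their
# shapes are (number of sequences, 50) and (number of sequences, vocab_size).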
sequence_length = Input.shape[1] 155 | 156 | 157 | # In[16]: 158 | 159 | 160 | # define model 161 | from keras.layers import Dropout 162 | model = Sequential() 163 | model.add(Embedding(vocab_size, 100, input_length=sequence_length)) 164 | model.add(LSTM(200, return_sequences=True)) 165 | model.add(LSTM(200)) 166 | model.add(Dropout(0.3)) 167 | model.add(Dense(200, activation='relu')) 168 | model.add(Dense(vocab_size, activation='softmax')) 169 | print(model.summary()) 170 | 171 | 172 | # In[17]: 173 | 174 | 175 | # compile model 176 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 177 | # fit model 178 | model.fit(Input, Output, batch_size=250, epochs=75) 179 | 180 | 181 | # In[18]: 182 | 183 | 184 | # save the model to file 185 | model.save('junglebook_trained.h5') 186 | # save the tokenizer 187 | dump(tokenizer, open('tokenizer.pkl', 'wb')) 188 | 189 | 190 | # In[19]: 191 | 192 | 193 | # load doc into memory 194 | def load_document(name): 195 | file = open(name, 'r') 196 | text = file.read() 197 | file.close() 198 | return text 199 | 200 | # load cleaned text sequences 201 | input_filename = 'junglebook_sequences.txt' 202 | doc = load_document(input_filename) 203 | lines = doc.split('\n') 204 | 205 | 206 | # In[20]: 207 | 208 | 209 | sequence_length = len(lines[0].split()) - 1 210 | 211 | 212 | # In[21]: 213 | 214 | 215 | # load the model 216 | from keras.models import load_model 217 | model = load_model('junglebook_trained.h5') 218 | 219 | 220 | # In[22]: 221 | 222 | 223 | # select a seed text 224 | from random import randint 225 | seed_text = lines[randint(0,len(lines))] 226 | print(seed_text + '\n') 227 | 228 | 229 | # In[23]: 230 | 231 | 232 | encoded = tokenizer.texts_to_sequences([seed_text])[0] 233 | 234 | 235 | # In[24]: 236 | 237 | 238 | from random import randint 239 | from pickle import load 240 | from keras.models import load_model 241 | from keras.preprocessing.sequence import pad_sequences 242 | 243 | # load doc into memory 244 | def load_document(name): 245 | file = open(name, 'r') 246 | text = file.read() 247 | file.close() 248 | return text 249 | 250 | # generate a sequence from a language model 251 | def generate_sequence(model, tokenizer, sequence_length, seed_text, n_words): 252 | result = list() 253 | input_text = seed_text 254 | # generate a fixed number of words 255 | for _ in range(n_words): 256 | # encode the text as integer 257 | encoded = tokenizer.texts_to_sequences([input_text])[0] 258 | # truncate sequences to a fixed length 259 | encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre') 260 | # predict probabilities for each word 261 | prediction = model.predict_classes(encoded, verbose=0) 262 | # map predicted word index to word 263 | out_word = '' 264 | for word, index in tokenizer.word_index.items(): 265 | if index == prediction: 266 | out_word = word 267 | break 268 | # append to input 269 | input_text += ' ' + out_word 270 | result.append(out_word) 271 | return ' '.join(result) 272 | 273 | # load cleaned text sequences 274 | input_filename = 'junglebook_sequences.txt' 275 | doc = load_document(input_filename) 276 | lines = doc.split('\n') 277 | seq_length = len(lines[0].split()) - 1 278 | 279 | 280 | # In[25]: 281 | 282 | 283 | # load the model 284 | model = load_model('junglebook_trained.h5') 285 | 286 | # load the tokenizer 287 | tokenizer = load(open('tokenizer.pkl', 'rb')) 288 | 289 | # select a seed text 290 | seed_text = lines[randint(0,len(lines))] 291 | print(seed_text + '\n') 292 | 293 | # 
generate new text 294 | generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50) 295 | print(generated) 296 | 297 | 298 | # In[26]: 299 | 300 | 301 | # load the model 302 | model = load_model('junglebook_trained.h5') 303 | 304 | # load the tokenizer 305 | tokenizer = load(open('tokenizer.pkl', 'rb')) 306 | 307 | # select a seed text 308 | seed_text = lines[randint(0,len(lines))] 309 | print(seed_text + '\n') 310 | 311 | # generate new text 312 | generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50) 313 | print(generated) 314 | 315 | 316 | # In[29]: 317 | 318 | 319 | # load the model 320 | model = load_model('junglebook_trained.h5') 321 | 322 | # load the tokenizer 323 | tokenizer = load(open('tokenizer.pkl', 'rb')) 324 | 325 | # select a seed text 326 | seed_text = lines[randint(0,len(lines))] 327 | print(seed_text + '\n') 328 | 329 | # generate new text 330 | generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50) 331 | print(generated) 332 | 333 | -------------------------------------------------------------------------------- /CH12/code/Create+a+movie+recommendation+engine+with+Keras.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("RecommendationEngine") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | import os 12 | os.listdir('ml-latest-small/') 13 | 14 | 15 | # In[3]: 16 | 17 | movies = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/movies.csv') 18 | tags = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/tags.csv') 19 | links = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/links.csv') 20 | ratings = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/ratings.csv') 21 | 22 | 23 | # In[4]: 24 | 25 | ratings.columns 26 | 27 | 28 | # In[5]: 29 | 30 | ratings.show(truncate=False) 31 | 32 | 33 | # In[6]: 34 | 35 | tags.show(truncate = False) 36 | 37 | 38 | # In[7]: 39 | 40 | movies.select('genres').distinct().show(truncate = False) 41 | 42 | 43 | # In[8]: 44 | 45 | links.show() 46 | 47 | 48 | # In[9]: 49 | 50 | print('The number of rows in movies dataset is {}'.format(movies.toPandas().shape[0])) 51 | print('The number of rows in ratings dataset is {}'.format(ratings.toPandas().shape[0])) 52 | print('The number of rows in tags dataset is {}'.format(tags.toPandas().shape[0])) 53 | print('The number of rows in links dataset is {}'.format(links.toPandas().shape[0])) 54 | 55 | 56 | # In[10]: 57 | 58 | for i in ratings.columns: 59 | ratings = ratings.withColumnRenamed(i, i+'_1') 60 | 61 | 62 | # In[11]: 63 | 64 | ratings.show() 65 | 66 | 67 | # In[12]: 68 | 69 | temp1 = ratings.join(movies, ratings.movieId_1 == movies.movieId, how = 'inner') 70 | 71 | 72 | # In[13]: 73 | 74 | temp2 = temp1.join(links, temp1.movieId_1 == links.movieId, how = 'inner') 75 | 76 | 77 | # In[14]: 78 | 79 | mainDF = temp2.join(tags, (temp2.userId_1 == tags.userId) & 80 | (temp2.movieId_1 == tags.movieId), how = 'left') 81 | 82 | 83 | # In[15]: 84 | 85 | print(temp1.count()) 86 | print(temp2.count()) 87 | print(mainDF.count()) 88 | 89 | 90 | # In[16]: 91 | 92 | mainDF.groupBy(['tag']).agg({'rating_1':'count'}) 
.withColumnRenamed('count(rating_1)', 'Row Count').orderBy(["Row Count"],ascending=False) .show() 93 | 94 | 95 | # In[17]: 96 | 97 | mainDF.columns 98 | 99 | 100 | # In[18]: 101 | 102 | mainDF = mainDF.select('userId_1','movieId_1','rating_1','title','genres', 'imdbId','tmdbId', 'timestamp_1') .distinct() 103 | 104 | 105 | # In[19]: 106 | 107 | mainDF.count() 108 | 109 | 110 | # In[20]: 111 | 112 | movies.createOrReplaceTempView('movies_') 113 | links.createOrReplaceTempView('links_') 114 | ratings.createOrReplaceTempView('ratings_') 115 | 116 | 117 | # In[21]: 118 | 119 | mainDF_SQL = sqlContext.sql( 120 | """ 121 | select 122 | r.userId_1 123 | ,r.movieId_1 124 | ,r.rating_1 125 | ,m.title 126 | ,m.genres 127 | ,l.imdbId 128 | ,l.tmdbId 129 | ,r.timestamp_1 130 | from ratings_ r 131 | inner join movies_ m on 132 | r.movieId_1 = m.movieId 133 | inner join links_ l on 134 | r.movieId_1 = l.movieId 135 | """ 136 | ) 137 | 138 | 139 | # In[22]: 140 | 141 | mainDF_SQL.show(n = 5) 142 | 143 | 144 | # In[23]: 145 | 146 | mainDF_SQL.count() 147 | 148 | 149 | # In[24]: 150 | 151 | mainDF.describe('rating_1').show() 152 | 153 | 154 | # In[25]: 155 | 156 | import matplotlib.pyplot as plt 157 | get_ipython().magic('matplotlib inline') 158 | 159 | mainDF.select('rating_1').toPandas().hist(figsize=(16, 6), grid=True) 160 | plt.title('Histogram of Ratings') 161 | plt.show() 162 | 163 | 164 | # In[26]: 165 | 166 | mainDF.groupBy(['rating_1']).agg({'rating_1':'count'}) .withColumnRenamed('count(rating_1)', 'Row Count').orderBy(["Row Count"],ascending=False) .show() 167 | 168 | 169 | # In[27]: 170 | 171 | userId_frequency = mainDF.groupBy(['userId_1']).agg({'rating_1':'count'}) .withColumnRenamed('count(rating_1)', '# of Reviews') .orderBy(["# of Reviews"],ascending=False) 172 | 173 | 174 | # In[28]: 175 | 176 | userId_frequency.show() 177 | 178 | 179 | # In[29]: 180 | 181 | userId_frequency.select('# of Reviews').toPandas().hist(figsize=(16, 6), grid=True) 182 | plt.title('Histogram of User Ratings') 183 | plt.show() 184 | 185 | 186 | # In[30]: 187 | 188 | mainDF = mainDF.withColumnRenamed('userId_1', 'userid') 189 | mainDF = mainDF.withColumnRenamed('movieId_1', 'movieid') 190 | mainDF = mainDF.withColumnRenamed('rating_1', 'rating') 191 | mainDF = mainDF.withColumnRenamed('timestamp_1', 'timestamp') 192 | mainDF = mainDF.withColumnRenamed('imdbId', 'imdbid') 193 | mainDF = mainDF.withColumnRenamed('tmdbId', 'tmdbid') 194 | 195 | 196 | # In[31]: 197 | 198 | mainDF.columns 199 | 200 | 201 | # In[32]: 202 | 203 | import pyspark.sql.functions as F 204 | mainDF = mainDF.withColumn("rating", F.round(mainDF["rating"], 0)) 205 | 206 | 207 | # In[33]: 208 | 209 | from pyspark.ml.feature import StringIndexer 210 | string_indexer = StringIndexer(inputCol="genres", outputCol="genreCount") 211 | mainDF = string_indexer.fit(mainDF).transform(mainDF) 212 | mainDF.show() 213 | 214 | 215 | # In[34]: 216 | 217 | mainDF = mainDF.select('rating', 'userid', 'movieid', 'imdbid', 'tmdbid', 'timestamp', 'genreCount') 218 | 219 | 220 | # In[35]: 221 | 222 | mainDF.show() 223 | 224 | 225 | # In[36]: 226 | 227 | trainDF, testDF = mainDF.randomSplit([0.8, 0.2], seed=1234) 228 | 229 | 230 | # In[37]: 231 | 232 | print('The number of rows in mainDF is {}'.format(mainDF.count())) 233 | print('The number of rows in trainDF is {}'.format(trainDF.count())) 234 | print('The number of rows in testDF is {}'.format(testDF.count())) 235 | 236 | 237 | # In[38]: 238 | 239 | import numpy as np 240 | xtrain_array = 
np.array(trainDF.select('userid','movieid', 'genreCount').collect()) 241 | xtest_array = np.array(testDF.select('userid','movieid', 'genreCount').collect()) 242 | 243 | 244 | # In[39]: 245 | 246 | ytrain_array = np.array(trainDF.select('rating').collect()) 247 | ytest_array = np.array(testDF.select('rating').collect()) 248 | 249 | 250 | # In[40]: 251 | 252 | print(xtest_array.shape) 253 | print(ytest_array.shape) 254 | print(xtrain_array.shape) 255 | print(ytrain_array.shape) 256 | 257 | 258 | # In[41]: 259 | 260 | import keras.utils as u 261 | ytrain_OHE = u.to_categorical(ytrain_array) 262 | ytest_OHE = u.to_categorical(ytest_array) 263 | 264 | 265 | # In[42]: 266 | 267 | print(ytrain_OHE.shape) 268 | print(ytest_OHE.shape) 269 | 270 | 271 | # In[43]: 272 | 273 | from keras.models import Sequential 274 | from keras.layers import Dense, Activation 275 | 276 | 277 | # In[44]: 278 | 279 | model = Sequential() 280 | model.add(Dense(32, activation='relu', input_dim=xtrain_array.shape[1])) 281 | model.add(Dense(10, activation='relu')) 282 | model.add(Dense(ytrain_OHE.shape[1], activation='softmax')) 283 | model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) 284 | 285 | 286 | # In[45]: 287 | 288 | accuracy_history = model.fit(xtrain_array, ytrain_OHE, epochs=20, batch_size=32) 289 | 290 | 291 | # In[46]: 292 | 293 | plt.plot(accuracy_history.history['acc']) 294 | plt.title('Accuracy vs. Epoch') 295 | plt.xlabel('Epoch') 296 | plt.ylabel('Accuracy') 297 | plt.show() 298 | plt.plot(accuracy_history.history['loss']) 299 | plt.title('Loss vs. Epoch') 300 | plt.xlabel('Epoch') 301 | plt.ylabel('Loss') 302 | plt.show() 303 | 304 | 305 | # In[47]: 306 | 307 | score = model.evaluate(xtest_array, ytest_OHE, batch_size=128) 308 | accuracy_rate = score[1]*100 309 | print('accuracy is {}%'.format(round(accuracy_rate,2))) 310 | 311 | -------------------------------------------------------------------------------- /CH02/code/NeuralNetworkfromScratch_with_python_and spark.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | # In[2]: 10 | 11 | spark = SparkSession.builder .master("local") .appName("Neural Network Model") .config("spark.executor.memory", "6gb") .getOrCreate() 12 | 13 | sc = spark.sparkContext 14 | 15 | 16 | # In[3]: 17 | 18 | df = spark.createDataFrame([('Male', 67, 150), # insert column values 19 | ('Female', 65, 135), 20 | ('Female', 68, 130), 21 | ('Male', 70, 160), 22 | ('Female', 70, 130), 23 | ('Male', 69, 174), 24 | ('Female', 65, 126), 25 | ('Male', 74, 188), 26 | ('Female', 60, 110), 27 | ('Female', 63, 125), 28 | ('Male', 70, 173), 29 | ('Male', 70, 145), 30 | ('Male', 68, 175), 31 | ('Female', 65, 123), 32 | ('Male', 71, 145), 33 | ('Male', 74, 160), 34 | ('Female', 64, 135), 35 | ('Male', 71, 175), 36 | ('Male', 67, 145), 37 | ('Female', 67, 130), 38 | ('Male', 70, 162), 39 | ('Female', 64, 107), 40 | ('Male', 70, 175), 41 | ('Female', 64, 130), 42 | ('Male', 66, 163), 43 | ('Female', 63, 137), 44 | ('Male', 65, 165), 45 | ('Female', 65, 130), 46 | ('Female', 64, 109)], 47 | ['gender', 'height','weight']) # insert header values 48 | 49 | 50 | 51 | # In[4]: 52 | 53 | df.show(5) 54 | 55 | 56 | # In[5]: 57 | 58 | from pyspark.sql import functions 59 | 60 | 61 | # In[6]: 62 | 63 | df = df.withColumn('gender',functions.when(df['gender']=='Female',0).otherwise(1)) 64 | 65 | 66 | # In[7]: 67 | 68 | df = df.select('height', 
'weight', 'gender') 69 | 70 | 71 | # In[8]: 72 | 73 | df.show() 74 | 75 | 76 | # In[9]: 77 | 78 | import numpy as np 79 | 80 | 81 | # In[10]: 82 | 83 | df.select("height", "weight", "gender").collect() 84 | 85 | 86 | # In[11]: 87 | 88 | data_array = np.array(df.select("height", "weight", "gender").collect()) 89 | data_array #view the array 90 | 91 | 92 | # In[12]: 93 | 94 | data_array.shape 95 | 96 | 97 | # In[13]: 98 | 99 | data_array[0] 100 | 101 | 102 | # In[14]: 103 | 104 | data_array[28] 105 | 106 | 107 | # In[15]: 108 | 109 | print(data_array.max(axis=0)) 110 | print(data_array.min(axis=0)) 111 | 112 | 113 | # In[16]: 114 | 115 | import matplotlib.pyplot as plt 116 | get_ipython().magic('matplotlib inline') 117 | 118 | 119 | # In[17]: 120 | 121 | min_x = data_array.min(axis=0)[0]-10 122 | max_x = data_array.max(axis=0)[0]+10 123 | min_y = data_array.min(axis=0)[1]-10 124 | max_y = data_array.max(axis=0)[1]+10 125 | 126 | print(min_x, max_x, min_y, max_y) 127 | 128 | 129 | # In[18]: 130 | 131 | # formatting the plot grid, scales, and figure size 132 | plt.figure(figsize=(9, 4), dpi= 75) 133 | plt.axis([min_x,max_x,min_y,max_y]) 134 | plt.grid() 135 | for i in range(len(data_array)): 136 | value = data_array[i] 137 | # assign labels values to specific matrix elements 138 | gender = value[2] 139 | height = value[0] 140 | weight = value[1] 141 | 142 | # filter data points by gender 143 | a = plt.scatter(height[gender==0],weight[gender==0], marker = 'x', c= 'b', label = 'Female') 144 | b = plt.scatter(height[gender==1],weight[gender==1], marker = 'o', c= 'b', label = 'Male') 145 | 146 | # plot values, title, legend, x and y axis 147 | plt.title('Weight vs Height by Gender') 148 | plt.xlabel('Height (in)') 149 | plt.ylabel('Weight (lbs)') 150 | plt.legend(handles=[a,b]) 151 | 152 | 153 | 154 | # In[19]: 155 | 156 | np.random.seed(12345) 157 | 158 | 159 | # In[20]: 160 | 161 | w1 = np.random.randn() 162 | w2 = np.random.randn() 163 | b= np.random.randn() 164 | 165 | 166 | # In[21]: 167 | 168 | print(w1, w2, b) 169 | 170 | 171 | # In[22]: 172 | 173 | X = data_array[:,:2] 174 | y = data_array[:,2] 175 | print(X,y) 176 | 177 | 178 | # In[23]: 179 | 180 | x_mean = X.mean(axis=0) 181 | x_std = X.std(axis=0) 182 | print(x_mean, x_std) 183 | 184 | 185 | # In[24]: 186 | 187 | def normalize(X): 188 | x_mean = X.mean(axis=0) 189 | x_std = X.std(axis=0) 190 | X = (X - X.mean(axis=0))/X.std(axis=0) 191 | return X 192 | 193 | 194 | # In[25]: 195 | 196 | X = normalize(X) 197 | print(X) 198 | 199 | 200 | # In[26]: 201 | 202 | print('standard deviation') 203 | print(round(X[:,0].std(axis=0),0)) 204 | print('mean') 205 | print(round(X[:,0].mean(axis=0),0)) 206 | 207 | 208 | # In[27]: 209 | 210 | data_array = np.column_stack((X[:,0], X[:,1],y)) 211 | print(data_array) 212 | 213 | 214 | # In[28]: 215 | 216 | # formatting the plot grid, scales, and figure size 217 | plt.figure(figsize=(9, 4), dpi= 75) 218 | # plt.axis([min_x,max_x,min_y,max_y]) 219 | plt.grid() 220 | for i in range(len(data_array)): 221 | value_n = data_array[i] 222 | # assign labels values to specific matrix elements 223 | gender_n = value_n[2] 224 | height_n = value_n[0] 225 | weight_n = value_n[1] 226 | an = plt.scatter(height_n[gender_n==0.0],weight_n[gender_n==0.0], marker = 'x', c= 'b', label = 'Female') 227 | bn = plt.scatter(height_n[gender_n==1.0],weight_n[gender_n==1.0], marker = 'o', c= 'b', label = 'Male') 228 | # plot values, title, legend, x and y axis 229 | plt.title('Weight vs Height by Gender (normalized)') 230 | 
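# (after normalization the axes are in standard-deviation units rather than
#  inches and pounds, so the labels below are kept only for consistency with
#  the earlier unnormalized plot)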
plt.xlabel('Height (in)') 231 | plt.ylabel('Weight (lbs)') 232 | plt.legend(handles=[an,bn]) 233 | 234 | 235 | # In[29]: 236 | 237 | def sigmoid(input): 238 | return 1/(1+np.exp(-input)) 239 | 240 | 241 | # In[30]: 242 | 243 | X = np.arange(-10,10,1) 244 | Y = sigmoid(X) 245 | 246 | 247 | # In[31]: 248 | 249 | plt.figure(figsize=(6, 4), dpi= 75) 250 | plt.axis([-10,10,-0.25,1.2]) 251 | plt.grid() 252 | plt.plot(X,Y) 253 | plt.title('Sigmoid Function') 254 | plt.show() 255 | 256 | 257 | # In[32]: 258 | 259 | def sigmoid_derivative(x): 260 | return sigmoid(x) * (1-sigmoid(x)) 261 | 262 | 263 | # In[33]: 264 | 265 | plt.figure(figsize=(6, 4), dpi= 75) 266 | plt.axis([-10,10,-0.25,1.2]) 267 | plt.grid() 268 | X = np.arange(-10,10,1) 269 | Y = sigmoid(X) 270 | Y_Prime = sigmoid_derivative(X) 271 | plt.plot(X, Y, label="Sigmoid",c='b') 272 | plt.plot(X, Y_Prime, marker=".", label="Sigmoid Derivative", c='b') 273 | plt.title('Sigmoid vs Sigmoid Derivative') 274 | plt.xlabel('X') 275 | plt.ylabel('Y') 276 | plt.legend() 277 | plt.show() 278 | 279 | 280 | # In[34]: 281 | 282 | data_array.shape 283 | 284 | 285 | # In[35]: 286 | 287 | for i in range(100): 288 | random_index = np.random.randint(len(data_array)) 289 | point = data_array[random_index] 290 | print(i, point) 291 | 292 | 293 | # In[36]: 294 | 295 | learning_rate = 0.1 296 | 297 | all_costs = [] 298 | 299 | for i in range(100000): 300 | # set the random data points that will be used to calculate the summation 301 | random_number = np.random.randint(len(data_array)) 302 | random_person = data_array[random_number] 303 | 304 | # the height and weight from the random individual are selected 305 | height = random_person[0] 306 | weight = random_person[1] 307 | 308 | z = w1*height+w2*weight+b 309 | predictedGender = sigmoid(z) 310 | 311 | actualGender = random_person[2] 312 | 313 | cost = (predictedGender-actualGender)**2 314 | 315 | # the cost value is appended to the list 316 | all_costs.append(cost) 317 | 318 | # partial derivatives of the cost function and summation are calculated 319 | dcost_predictedGender = 2 * (predictedGender-actualGender) 320 | dpredictedGenger_dz = sigmoid_derivative(z) 321 | dz_dw1 = height 322 | dz_dw2 = weight 323 | dz_db = 1 324 | 325 | dcost_dw1 = dcost_predictedGender * dpredictedGenger_dz * dz_dw1 326 | dcost_dw2 = dcost_predictedGender * dpredictedGenger_dz * dz_dw2 327 | dcost_db = dcost_predictedGender * dpredictedGenger_dz * dz_db 328 | 329 | # gradient descent calculation 330 | w1 = w1 - learning_rate * dcost_dw1 331 | w2 = w2 - learning_rate * dcost_dw2 332 | b = b - learning_rate * dcost_db 333 | 334 | 335 | # In[37]: 336 | 337 | plt.plot(all_costs) 338 | plt.title('Cost Value over 100,000 iterations') 339 | plt.xlabel('Iteration') 340 | plt.ylabel('Cost Value') 341 | plt.show() 342 | 343 | 344 | # In[38]: 345 | 346 | print('The final values of w1, w2, and b') 347 | print('---------------------------------') 348 | print('w1 = {}'.format(w1)) 349 | print('w2 = {}'.format(w2)) 350 | print('b = {}'.format(b)) 351 | 352 | 353 | # In[39]: 354 | 355 | for i in range(len(data_array)): 356 | random_individual = data_array[i] 357 | height = random_individual[0] 358 | weight = random_individual[1] 359 | z = height*w1 + weight*w2 + b 360 | predictedGender=sigmoid(z) 361 | print("Individual #{} actual score: {} predicted score: {}" 362 | .format(i+1,random_individual[2],predictedGender)) 363 | 364 | 365 | # In[40]: 366 | 367 | def input_normalize(height, weight): 368 | inputHeight = (height - x_mean[0])/x_std[0] 369 
| inputWeight = (weight - x_mean[1])/x_std[1] 370 | return inputHeight, inputWeight 371 | 372 | 373 | # In[41]: 374 | 375 | score = input_normalize(70, 180) 376 | 377 | 378 | # In[42]: 379 | 380 | def predict_gender(raw_score): 381 | gender_summation = raw_score[0]*w1 + raw_score[1]*w2 + b 382 | gender_score = sigmoid(gender_summation) 383 | if gender_score <= 0.5: 384 | gender = 'Female' 385 | else: 386 | gender = 'Male' 387 | return gender, gender_score 388 | 389 | 390 | # In[43]: 391 | 392 | predict_gender(score) 393 | 394 | 395 | # In[44]: 396 | 397 | score = input_normalize(50,120) 398 | 399 | 400 | # In[45]: 401 | 402 | predict_gender(score) 403 | 404 | 405 | # In[46]: 406 | 407 | x_min = min(data_array[:,0])-0.1 408 | x_max = max(data_array[:,0])+0.1 409 | y_min = min(data_array[:,1])-0.1 410 | y_max = max(data_array[:,1])+0.1 411 | increment= 0.05 412 | print(x_min, x_max, y_min, y_max) 413 | 414 | 415 | # In[47]: 416 | 417 | x_data= np.arange(x_min, x_max, increment) 418 | 419 | 420 | # In[48]: 421 | 422 | y_data= np.arange(y_min, y_max, increment) 423 | 424 | 425 | # In[49]: 426 | 427 | xy_data = [[x_all, y_all] for x_all in x_data for y_all in y_data] 428 | 429 | 430 | # In[50]: 431 | 432 | for i in range(len(xy_data)): 433 | data = (xy_data[i]) 434 | height = data[0] 435 | weight = data[1] 436 | z_new = height*w1 + weight*w2 + b 437 | predictedGender_new=sigmoid(z_new) 438 | # print(height, weight, predictedGender_new) 439 | ax = plt.scatter(height[predictedGender_new<=0.5], 440 | weight[predictedGender_new<=0.5], 441 | marker = 'o', c= 'r', label = 'Female') 442 | bx = plt.scatter(height[predictedGender_new > 0.5], 443 | weight[predictedGender_new>0.5], 444 | marker = 'o', c= 'b', label = 'Male') 445 | # plot values, title, legend, x and y axis 446 | plt.title('Weight vs Height by Gender') 447 | plt.xlabel('Height (in)') 448 | plt.ylabel('Weight (lbs)') 449 | plt.legend(handles=[ax,bx]) 450 | 451 | -------------------------------------------------------------------------------- /CH13/code/Image+Classification+with+TensorFlow+on+Spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "spark = SparkSession.builder \\\n", 10 | " .master(\"local\") \\\n", 11 | " .appName(\"ImageClassification\") \\\n", 12 | " .config(\"spark.executor.memory\", \"6gb\") \\\n", 13 | " .getOrCreate()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "Using TensorFlow backend.\n", 26 | "/home/asherif844/anaconda3/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 27 | " return f(*args, **kwds)\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import pyspark.sql.functions as f\n", 33 | "import sparkdl as dl" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "dfMessi = dl.readImages('football/messi/').withColumn('label', f.lit(0))\n", 45 | "dfRonaldo = dl.readImages('football/ronaldo/').withColumn('label', f.lit(1))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": 
"stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "+---------------------------------------------------------------------+---------------------------+-----+\n", 58 | "|filePath |image |label|\n", 59 | "+---------------------------------------------------------------------+---------------------------+-----+\n", 60 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi3.jpeg |[RGB,173,292,3,[B@43647d0f]|0 |\n", 61 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi14.jpeg|[RGB,187,270,3,[B@28fe803] |0 |\n", 62 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi29.jpeg|[RGB,194,259,3,[B@669635ee]|0 |\n", 63 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi18.jpeg|[RGB,194,259,3,[B@6e004f55]|0 |\n", 64 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi8.jpeg |[RGB,168,300,3,[B@eecdb9f] |0 |\n", 65 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi22.jpeg|[RGB,194,259,3,[B@73def5b1]|0 |\n", 66 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi2.jpeg |[RGB,275,183,3,[B@24308761]|0 |\n", 67 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi5.jpeg |[RGB,183,275,3,[B@48a60e55]|0 |\n", 68 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi13.jpeg|[RGB,183,275,3,[B@207e14fd]|0 |\n", 69 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi11.jpeg|[RGB,175,288,3,[B@24890e3e]|0 |\n", 70 | "+---------------------------------------------------------------------+---------------------------+-----+\n", 71 | "only showing top 10 rows\n", 72 | "\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "dfMessi.show(n=10,truncate=False)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "+-------------------------------------------------------------------------+---------------------------+-----+\n", 90 | "|filePath |image |label|\n", 91 | "+-------------------------------------------------------------------------+---------------------------+-----+\n", 92 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo24.jpg |[RGB,350,590,3,[B@7b3b3c6] |1 |\n", 93 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo2.jpeg |[RGB,225,225,3,[B@61826869]|1 |\n", 94 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo21.jpeg|[RGB,193,261,3,[B@1d739c7f]|1 |\n", 95 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo17.jpeg|[RGB,183,275,3,[B@59b36a5b]|1 |\n", 96 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo30.jpeg|[RGB,184,273,3,[B@4304cf28]|1 |\n", 97 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo14.jpeg|[RGB,154,328,3,[B@31b73601]|1 |\n", 98 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo5.jpeg |[RGB,168,300,3,[B@30a6d42c]|1 |\n", 99 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo18.jpeg|[RGB,261,193,3,[B@728581d1]|1 |\n", 100 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo1.jpeg |[RGB,168,300,3,[B@171d6d26]|1 |\n", 101 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo19.jpeg|[RGB,258,195,3,[B@1f1256fa]|1 |\n", 102 | "+-------------------------------------------------------------------------+---------------------------+-----+\n", 103 | "only showing top 10 rows\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | 
"dfRonaldo.show(n=10,truncate=False)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "trainDFmessi, testDFmessi = dfMessi.randomSplit([66.7, 33.3], seed =12)\n", 121 | "trainDFronaldo, testDFronaldo = dfRonaldo.randomSplit([66.7, 33.3], seed=12)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "The number of images in trainDFmessi is 18\n", 134 | "The number of images in testDFmessi is 12\n", 135 | "The number of images in trainDFronaldo is 18\n", 136 | "The number of images in testDFronaldo is 12\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "print('The number of images in trainDFmessi is {}'.format(trainDFmessi.toPandas().shape[0]))\n", 142 | "print('The number of images in testDFmessi is {}'.format(testDFmessi.toPandas().shape[0]))\n", 143 | "print('The number of images in trainDFronaldo is {}'.format(trainDFronaldo.toPandas().shape[0]))\n", 144 | "print('The number of images in testDFronaldo is {}'.format(testDFronaldo.toPandas().shape[0]))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 8, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "trainDF = trainDFmessi.unionAll(trainDFronaldo)\n", 156 | "testDF = testDFmessi.unionAll(testDFronaldo)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 9, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "The number of images in the training data is 36\n", 169 | "The number of images in the testing data is 24\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "print('The number of images in the training data is {}' .format(trainDF.toPandas().shape[0]))\n", 175 | "print('The number of images in the testing data is {}' .format(testDF.toPandas().shape[0]))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 10, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "INFO:tensorflow:Froze 376 variables.\n", 188 | "Converted 376 variables to const ops.\n", 189 | "INFO:tensorflow:Froze 0 variables.\n", 190 | "Converted 0 variables to const ops.\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "from pyspark.ml.classification import LogisticRegression\n", 196 | "from pyspark.ml import Pipeline\n", 197 | "\n", 198 | "vectorizer = dl.DeepImageFeaturizer(inputCol=\"image\", outputCol=\"features\", modelName='InceptionV3')\n", 199 | "logreg = LogisticRegression(maxIter=30,labelCol = \"label\", featuresCol=\"features\")\n", 200 | "pipeline = Pipeline(stages=[vectorizer, logreg])\n", 201 | "\n", 202 | "pipeline_model = pipeline.fit(trainDF)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "INFO:tensorflow:Froze 376 variables.\n", 215 | "Converted 376 variables to const ops.\n", 216 | "INFO:tensorflow:Froze 0 variables.\n", 217 | "Converted 0 variables to const ops.\n", 218 | "+-----+----------+\n", 219 | "|label|prediction|\n", 220 | "+-----+----------+\n", 221 | "|0 |1.0 |\n", 222 | "|0 |0.0 |\n", 223 | "|0 |0.0 |\n", 224 | "|0 |0.0 
|\n", 225 | "|0 |0.0 |\n", 226 | "|0 |0.0 |\n", 227 | "|0 |0.0 |\n", 228 | "|0 |1.0 |\n", 229 | "|0 |0.0 |\n", 230 | "|0 |0.0 |\n", 231 | "|0 |0.0 |\n", 232 | "|0 |0.0 |\n", 233 | "|1 |1.0 |\n", 234 | "|1 |1.0 |\n", 235 | "|1 |1.0 |\n", 236 | "|1 |1.0 |\n", 237 | "|1 |1.0 |\n", 238 | "|1 |0.0 |\n", 239 | "|1 |1.0 |\n", 240 | "|1 |1.0 |\n", 241 | "|1 |1.0 |\n", 242 | "|1 |1.0 |\n", 243 | "|1 |1.0 |\n", 244 | "|1 |1.0 |\n", 245 | "+-----+----------+\n", 246 | "\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "predictDF = pipeline_model.transform(testDF)\n", 252 | "predictDF.select('label', 'prediction').show(n = testDF.toPandas().shape[0], truncate=False)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 12, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | "+----------------+---+---+\n", 265 | "|prediction_label| 0| 1|\n", 266 | "+----------------+---+---+\n", 267 | "| 1.0| 2| 11|\n", 268 | "| 0.0| 10| 1|\n", 269 | "+----------------+---+---+\n", 270 | "\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "predictDF.crosstab('prediction', 'label').show()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 13, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "accuracy: 87.5%\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 293 | "scoring = predictDF.select(\"prediction\", \"label\")\n", 294 | "accuracy_score = MulticlassClassificationEvaluator(metricName=\"accuracy\")\n", 295 | "rate = accuracy_score.evaluate(scoring)*100\n", 296 | "print(\"accuracy: {}%\" .format(round(rate,2)))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 14, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "accuracy: 87.5%\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 314 | "\n", 315 | "binaryevaluator = BinaryClassificationEvaluator(rawPredictionCol=\"prediction\")\n", 316 | "binary_rate = binaryevaluator.evaluate(predictDF)*100\n", 317 | "print(\"accuracy: {}%\" .format(round(binary_rate,2)))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 15, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "INFO:tensorflow:Froze 376 variables.\n", 330 | "Converted 376 variables to const ops.\n", 331 | "INFO:tensorflow:Froze 0 variables.\n", 332 | "Converted 0 variables to const ops.\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "logregFT = LogisticRegression(\n", 338 | " regParam=0.05, \n", 339 | " elasticNetParam=0.3,\n", 340 | " maxIter=15,labelCol = \"label\", featuresCol=\"features\")\n", 341 | "pipelineFT = Pipeline(stages=[vectorizer, logregFT])\n", 342 | "\n", 343 | "pipeline_model_FT = pipelineFT.fit(trainDF)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 16, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "INFO:tensorflow:Froze 376 variables.\n", 356 | "Converted 376 variables to const ops.\n", 357 | "INFO:tensorflow:Froze 0 variables.\n", 358 | "Converted 0 variables to const ops.\n", 359 | "+----------------+---+---+\n", 360 | 
"|prediction_label| 0| 1|\n", 361 | "+----------------+---+---+\n", 362 | "| 1.0| 0| 11|\n", 363 | "| 0.0| 12| 1|\n", 364 | "+----------------+---+---+\n", 365 | "\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "predictDF_FT = pipeline_model_FT.transform(testDF)\n", 371 | "predictDF_FT.crosstab('prediction', 'label').show()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 17, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "accuracy: 95.83%\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "binary_rate_FT = binaryevaluator.evaluate(predictDF_FT)*100\n", 389 | "print(\"accuracy: {}%\" .format(round(binary_rate_FT,2)))" 390 | ] 391 | } 392 | ], 393 | "metadata": { 394 | "kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.1" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 2 414 | } 415 | -------------------------------------------------------------------------------- /CH03/code/MNIST+with+CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/asherif844/anaconda3/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "1.4.1\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "print(tf.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "scrolled": true 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Extracting MNIST/train-images-idx3-ubyte.gz\n", 50 | "Extracting MNIST/train-labels-idx1-ubyte.gz\n", 51 | "Extracting MNIST/t10k-images-idx3-ubyte.gz\n", 52 | "Extracting MNIST/t10k-labels-idx1-ubyte.gz\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "from tensorflow.examples.tutorials.mnist import input_data\n", 58 | "data = input_data.read_data_sets('MNIST/', one_hot=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "['t10k-images-idx3-ubyte.gz',\n", 70 | " 'images',\n", 71 | " 't10k-labels-idx1-ubyte.gz',\n", 72 | " 'train-labels-idx1-ubyte.gz',\n", 73 | " 'train-images-idx3-ubyte.gz']" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "import os\n", 83 | "os.listdir('MNIST/')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 
95 | "Image Inventory\n", 96 | "----------\n", 97 | "Training: 55000\n", 98 | "Testing: 10000\n", 99 | "----------\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "print('Image Inventory')\n", 105 | "print('----------')\n", 106 | "print('Training: {}'.format(len(data.train.labels)))\n", 107 | "print('Testing: {}'.format(len(data.test.labels)))\n", 108 | "print('----------')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "import numpy as np\n", 120 | "import matplotlib.pyplot as plt\n", 121 | "%matplotlib inline" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": { 128 | "scrolled": false 129 | }, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "-----------------\n", 136 | "[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n" 137 | ] 138 | }, 139 | { 140 | "data": { 141 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADZNJREFUeJzt3X+IXfWZx/HP05ig2OKPrTsMJut0/JnqH1MdpVIpXWuK\nSiEWJHbANaulUyVbjERYcYXNH/5RStJYECpTDI2lpq3UapTSNROEbMhaTSQ7469WtyQkMeaH0WSC\nYmt89o85tqPO+d7rPefcc2ae9wuGufc858fDZT5zzr3n3PM1dxeAeD5TdwMA6kH4gaAIPxAU4QeC\nIvxAUIQfCIrwA0ERfiAowg8EdUI3N2ZmXE4IVMzdrZ35Cu35zexqM/ujmb1mZncVWReA7rJOr+03\nszmS/iRpkaQ9kp6TNOTuLyWWYc8PVKwbe/7LJL3m7n92979I+qWkxQXWB6CLioT/TEm7pzzfk037\nCDMbNrNtZratwLYAlKzyD/zcfUTSiMRhP9AkRfb8eyUtmPJ8fjYNwAxQJPzPSTrXzL5gZvMkfVvS\nhnLaAlC1jg/73f19M/s3Sf8laY6kte7+YmmdAahUx6f6OtoY7/mBynXlIh8AMxfhB4Ii/EBQhB8I\nivADQRF+ICjCDwRF+IGgCD8QFOEHgiL8QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCIvxAUIQf\nCIrwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IivADQXU8RLckmdlOSROSjkt6390Hy2gKQPUK\nhT/zz+5+qIT1AOgiDvuBoIqG3yWNmtl2MxsuoyEA3VH0sP8Kd99rZv8oaaOZveLum6fOkP1T4B8D\n0DDm7uWsyGylpGPuvioxTzkbA5DL3a2d+To+7Dezk83scx8+lvQNSS90uj4A3VXksL9H0m/N7MP1\nPOzuvy+lKwCVK+2wv62NcdgPVK7yw34AMxvhB4Ii/EBQhB8IivADQRF+IKgyvtWHmt188825tVan\nct98881kfeHChcn61q1bk/UtW7Yk66gPe34gKMIPBEX4gaAIPxAU4QeCIvxAUIQfCGrWnOcfGhpK\n1i+++OJkPXWuvOlOPfXUjpc9fvx4sj5v3rxk/d13303W33nnndza+Ph4ctklS5Yk6wcPHkzWkcae\nHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCmlG37l69enVu7fbbb08uO2fOnCKbRg2efvrpZL3VtR37\n9+8vs50Zg1t3A0gi/EBQhB8IivADQRF+ICjCDwRF+IGgWp7nN7O1kr4p6YC7X5RNO13SryT1Sdop\naYm7v9VyYwXP8+/evTu3Nn/+/OSyY2NjyXqr76VXqdW97R977LEudfLpLVq0KFm/6aabcmt9fX2F\ntt3qOoAbbrghtzab7wVQ5nn+n0m6+mPT7pK0yd3PlbQpew5gBmkZfnffLOnwxyYvlrQue7xO0nUl\n9wWgYp2+5+9x933Z4zck9ZTUD4AuKXwPP3f31Ht5MxuWNFx0OwDK1emef7+Z9UpS9vtA3ozuPuLu\ng+4+2OG2AFSg0/BvkLQ0e7xU0uPltAOgW1qG38zWS/ofSeeb2R4z+46kH0haZGavSroqew5gBplR\n3+c/77zzcmsXXnhhctnR0dFkfWJioqOekNbf359be/LJJ5PLLly4sNC277zzztxa6t4QMx3f5weQ\nRPiBoAg/EBThB4Ii/EBQhB8Iakad6sPscv311yfrjzzySKH1Hzp0KLd2xhlnFFp3k3GqD0AS4QeC\nIvxAUIQfCIrwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IivADQRF+ICjCDwRVeLguIOW2227L\nrV166aWVbvvEE0/MrV1yySXJZbdv3152O43Dnh8IivADQRF+ICjCDwRF+IGgCD8QFOEHgmp5334z\nWyvpm5IOuPtF2bSVkr4r6WA2293u/ruWG+O+/ZXo7e3Nrd14443JZZcvX152Ox+R6s2srdvLV+Lo\n0aPJ+imnnNKlTspX5n37fybp6mmmr3H3geynZfABNEvL8Lv7ZkmHu9ALgC4q8p7/+2Y2ZmZrzey0\n0joC0BWdhv8nkvolDUjaJ2l13oxmNmxm28xsW4fbAlCBjsLv7vvd/bi7fyDpp5IuS8w74u6D7j7Y\naZMAytdR+M1s6ke435L0QjntAOiWll/pNbP1kr4m6fNmtkfSf0r6mpkNSHJJOyV9r8IeAVSgZfjd\nfWiayQ9W0EtYV111VbLe6rvnw8PDubX+/v6Oeprt1q5dW3cLteMKPyAowg8ERfiBoAg/EBThB4Ii\n/EBQ3Lq7BOecc06y/sADDyTrV155ZbJe5Vdfd+3alay/9dZbhdZ/zz335Nbee++95LL3339/sn7+\n+ed31JMkvf766x0vO1uw5weCIvxAUIQfCIrwA0ERfiAowg8ERfiBoDjP36Y77rgjt7Zs2bLksmef\nfXayfuzYsWT97bffTtbvu+++3Fqr89lbt25N1ltdB1
ClI0eOFFp+YmIit/bEE08UWvdswJ4fCIrw\nA0ERfiAowg8ERfiBoAg/EBThB4LiPH+bLr/88txaq/P4GzZsSNZXr84d7UyStHnz5mR9phoYGEjW\nzzrrrELrT90v4JVXXim07tmAPT8QFOEHgiL8QFCEHwiK8ANBEX4gKMIPBNXyPL+ZLZD0kKQeSS5p\nxN1/bGanS/qVpD5JOyUtcfdiN3lvsFtvvTW3NjY2llz23nvvLbudWaHVeAc9PT2F1j86Olpo+dmu\nnT3/+5JWuPsXJX1Z0jIz+6KkuyRtcvdzJW3KngOYIVqG3933ufvz2eMJSS9LOlPSYknrstnWSbqu\nqiYBlO9Tvec3sz5JX5L0B0k97r4vK72hybcFAGaItq/tN7PPSvqNpOXufnTq+HHu7mbmOcsNSxou\n2iiAcrW15zezuZoM/i/c/dFs8n4z683qvZIOTLesu4+4+6C7D5bRMIBytAy/Te7iH5T0srv/aEpp\ng6Sl2eOlkh4vvz0AVTH3aY/W/z6D2RWS/lvSuKQPssl3a/J9/68l/ZOkXZo81Xe4xbrSG0Moq1at\nStZXrFiRrLe6pfk111yTW3vmmWeSy85k7t7WmO4t3/O7+xZJeSv7+qdpCkBzcIUfEBThB4Ii/EBQ\nhB8IivADQRF+IChu3Y1KjY+P59YuuOCCQut+6qmnkvXZfC6/DOz5gaAIPxAU4QeCIvxAUIQfCIrw\nA0ERfiAozvOjUn19fbm1E05I//kdOXIkWV+zZk0nLSHDnh8IivADQRF+ICjCDwRF+IGgCD8QFOEH\nguI8PwoZGhpK1k866aTc2sTERHLZ4eH0KG98X78Y9vxAUIQfCIrwA0ERfiAowg8ERfiBoAg/EJS5\ne3oGswWSHpLUI8kljbj7j81spaTvSjqYzXq3u/+uxbrSG0PjzJ07N1l/9tlnk/XUvfnXr1+fXPaW\nW25J1jE9d7d25mvnIp/3Ja1w9+fN7HOStpvZxqy2xt1XddokgPq0DL+775O0L3s8YWYvSzqz6sYA\nVOtTvec3sz5JX5L0h2zS981szMzWmtlpOcsMm9k2M9tWqFMApWo7/Gb2WUm/kbTc3Y9K+omkfkkD\nmjwyWD3dcu4+4u6D7j5YQr8AStJW+M1sriaD/wt3f1SS3H2/ux939w8k/VTSZdW1CaBsLcNvZibp\nQUkvu/uPpkzvnTLbtyS9UH57AKrSzqf9X5H0L5LGzWxHNu1uSUNmNqDJ0387JX2vkg5Rq1angh9+\n+OFkfceOHbm1jRs35tZQvXY+7d8iabrzhslz+gCajSv8gKAIPxAU4QeCIvxAUIQfCIrwA0G1/Epv\nqRvjK71A5dr9Si97fiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IqttDdB+StGvK889n05qoqb01tS+J\n3jpVZm9ntTtjVy/y+cTGzbY19d5+Te2tqX1J9NapunrjsB8IivADQdUd/pGat5/S1N6a2pdEb52q\npbda3/MDqE/de34ANakl/GZ2tZn90cxeM7O76ughj5ntNLNxM9tR9xBj2TBoB8zshSnTTjezjWb2\navZ72mHSauptpZntzV67HWZ2bU29LTCzp83sJTN70cxuz6bX+tol+qrldev6Yb+ZzZH0J0mLJO2R\n9JykIXd/qauN5DCznZIG3b32c8Jm9lVJxyQ95O4XZdN+KOmwu/8g+8d5mrv/e0N6WynpWN0jN2cD\nyvROHVla0nWS/lU1vnaJvpaohtetjj3/ZZJec/c/u/tfJP1S0uIa+mg8d98s6fDHJi+WtC57vE6T\nfzxdl9NbI7j7Pnd/Pns8IenDkaVrfe0SfdWijvCfKWn3lOd71Kwhv13SqJltN7PhupuZRk82bLok\nvSGpp85mptFy5OZu+tjI0o157ToZ8bpsfOD3SVe4+4CkayQtyw5vG8kn37M16XRNWyM3d8s0I0v/\nTZ2vXacjXpetjvDvlbRgyvP52bRGcPe92e8Dkn6r5o0+vP/DQVKz3wdq7udvmjRy83QjS6sBr12T\nRryuI/zPSTrXzL5gZvMkfVvShhr6+AQzOzn7IEZmdrKkb6h5ow9vkLQ0e7xU0uM19vIRTRm5OW9k\nadX82jVuxGt37/qPpGs1+Yn//0n6jzp6yOmrX9L/Zj8v1t2bpPWaPAz8qyY/G/mOpH+QtEnSq5JG\nJZ3eoN5+Lmlc0pgmg9ZbU29XaPKQfkzSjuzn2rpfu0RftbxuXOEHBMUHfkBQhB8IivADQRF+ICjC\nDwRF+IGgCD8QFOEHgvp/zdVX5KPezC0AAAAASUVORK5CYII=\n", 142 | "text/plain": [ 143 | "" 144 | ] 145 | }, 146 | "metadata": {}, 147 | "output_type": "display_data" 148 | }, 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "-----------------\n", 154 | "[0. 0. 0. 1. 0. 0. 0. 0. 0. 
0.]\n" 155 | ] 156 | }, 157 | { 158 | "data": { 159 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADhBJREFUeJzt3V2MVPUZx/HfU7E36IWydCWKiyYGo16gWUkvkGisKMYE\nuDG+xNBUWWOsKdqL4kusCYqmqVa4QddIxMa3BthIDNYoaZAmDeHNKu6CWoMCQRbERI0XVvfpxRya\nVff8zzBzZs4sz/eTbHbmPHNmHo/748yZ/5zzN3cXgHh+VnUDAKpB+IGgCD8QFOEHgiL8QFCEHwiK\n8ANBEX4gKMIPBDWhnS9mZnydEGgxd7d6HtfUnt/MrjGzPWb2kZktaea5ALSXNfrdfjM7SdIHkq6S\ntF/SVkk3uvtgYh32/ECLtWPPP1PSR+7+sbt/K+llSfOaeD4AbdRM+M+UtG/U/f3Zsh8wsz4z22Zm\n25p4LQAla/kHfu7eL6lf4m0/0Ema2fMfkDR11P2zsmUAxoFmwr9V0nlmdo6Z/VzSDZLWl9MWgFZr\n+G2/u39nZr+V9IakkyStcvf3S+sMQEs1PNTX0ItxzA+0XFu+5ANg/CL8QFCEHwiK8ANBEX4gKMIP\nBEX4gaAIPxAU4QeCIvxAUIQfCIrwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IivADQRF+ICjC\nDwRF+IGgCD8QFOEHgiL8QFCEHwiK8ANBEX4gqIan6JYkM9sr6StJ30v6zt17y2gK7dPT05Os33bb\nbcn6/fffn6ynZoE2S08mOzQ0lKw/8MADyfrAwECyHl1T4c9c4e5HSngeAG3E234gqGbD75LeMrPt\nZtZXRkMA2qPZt/2z3P2Amf1C0ptmttvd3x79gOwfBf5hADpMU3t+dz+Q/R6WNCBp5hiP6Xf3Xj4M\nBDpLw+E3s4lmduqx25LmSNpVVmMAWquZt/3dkgay4ZoJkl5097+X0hWAlrPUOGzpL2bWvhcLZPLk\nybm1e++9N7nuzTffnKxPmjQpWS8aq29mnL/ob3Pfvn3J+qWXXppbO3LkxB2ddvf0hs0w1AcERfiB\noAg/EBThB4Ii/EBQhB8IiqG+caDotNmlS5fm1or+/7Z6uO3w4cPJekpXV1eyPm3atGR9cHAwt3bh\nhRc20tK4wFAfgCTCDwRF+IGgCD8QFOEHgiL8QFCEHwiKcf5xYOvWrcn6JZdckltrdpw/NVYuSVdc\ncUWy3syps7NmzUrWN23alKyn/tsnTCjjwtWdiXF+AEmEHwiK8ANBEX4gKMIPBEX4gaAIPxAU4/wd\n4Pzzz0/Wi8b5P//889xa0fn0RePwd999d7K+ePHiZH3ZsmW5tU8//TS5bpGiv92RkZHc2h133JFc\nt7+/v6GeOgHj/ACSCD8QFOEHgiL8QFCEHwiK8ANBEX4gqMJxfjNbJek6ScPuflG27HRJr0iaJmmv\npOvd/YvCF2OcvyFF3wNIjdU3OxV1X19fsr5y5cpkPTVN9o4dO5LrLliwIFlfs2ZNsp762z7jjDOS\n647nKbzLHOd/TtI1P1q2RNJGdz9P0sbsPoBxpDD87v62pKM/WjxP0urs9mpJ80vuC0CLNXrM3+3u\nB7Pbn0nqLqkfAG3S9IXM3N1Tx/Jm1icpfeAIoO0a3fMfMrMpkpT9Hs57oLv3u3uvu/c2+FoAWqDR\n8K+XtDC7vVDSq+W0A6BdCsNvZi9J+pek6Wa238xulfSYpKvM7ENJv8ruAxhHCo/53f3GnNKVJfeC\nHLt3767stYuuB7Bnz55kPXWtgaJrBSxZkh5BLppzoJXffzgR8A0/ICjCDwRF+IGgCD8QFOEHgiL8\nQFAn7jzFgcyePTu3VnQ6cNFQ3tDQULI+ffr0ZH3Lli25tcmTJyfXLTrdvKj3uXPnJuvRsecHgiL8\nQFCEHwiK8ANBEX4gKMIPBEX4gaAY5z8B3HTTTbm1RYsWJdctOi22jku7J+upsfxmTsmVpBUrViTr\nRZcGj449PxAU4QeCIvxAUIQfCIrwA0ERfiAowg8ExTj/Ca5onL7K9Tdv3pxc95577knWGcdvDnt+\nICjCDwRF+IGgCD8QFOEHgiL8QFCEHwiqcJzfzFZJuk7SsLtflC17SNIiSccunH6fu29oVZNIe/HF\nF3NrPT09yXW7urqS9aLr/k+cODFZT3nwwQeTdcbxW6uePf9zkq4ZY/lf3H1G9kPwgXGmMPzu/rak\no23oBUAbNXPMf5eZvWtmq8zstNI6AtAWjYZ/paRzJc2QdFDS43kPNLM+M9tmZtsafC0ALdBQ+N39\nkLt/7+4jkp6RNDPx2H5373X33kabBFC+hsJvZlNG3V0gaVc57QBol3qG+l6SdLmkLjPbL+mPki43\nsxmSXNJeSbe3sEcALWDNnq99XC9m1r4XQymKxvkffvjhZH3+/Pm5tZ07dybXnTt3brJedF3/qNw9\nPSFChm/4AUERfiAowg8ERfiBoAg/EBThB4JiqK9OqammDx8+nFuL7vXXX8+tXX311cl1iy7d/eST\nTzbU04mOoT4ASYQfCIrwA0ERfiAowg8ERfiBoAg/EBRTdGdmz56drD/+eO6VyrR79+7kurfccktD\nPZ0IHnnkkdzanDlzkutOnz697HYwCnt+ICjCDwRF+IGgCD8QFOEHgiL8QFCEHwgqzDh/6nx8SXrq\nqaeS9eHh4dxa5HH8oim6n3766dyaWV2nnaNF2PMDQRF+ICjCDwRF+IGgCD8QFOEHgiL8QFCF4/xm\nNlXS85K6JbmkfndfbmanS3pF0jRJeyVd7+5ftK7V5ixYsCBZLzp3fNOmTWW2M24UTdG9du3aZD21\nXYvmjCi6TgKaU8+e/ztJv3f3CyT9UtKdZnaBpCWSNrr7eZI2ZvcBjBOF4Xf3g+6+I7v9laQhSWdK\nmidpdfaw1ZLmt6pJAOU7rmN+M5sm6WJJWyR1u/vBrPSZaocFAMaJur/bb2anSForabG7fzn6e9nu\n7nnz8JlZn6S+ZhsFUK669vxmdrJqwX/B3ddliw+Z2ZSsPkXSmGe+uHu/u/e6e28ZDQMoR2H4rbaL\nf1bSkLs/Maq0XtLC7PZCSa+W3x6AVimcotvMZknaLOk9SSPZ4vtUO+7/m6SzJX2i2lDf0YLnqmyK\n7qIhq6GhoWR9cHAwt/boo4829dzbt29P1ov09PTk1i677LLkukVDoPPnpz/HLTotN/X3tXz58uS6\nRVN0Y2z1TtFdeMzv7v+UlPdkVx5PUwA6B9/wA4Ii/EBQhB8IivADQRF+ICjCDwRVOM5f6otVOM5f\nZM2aNcl6ary7mbFuSdq5c2eyXuTss8/OrU2aNCm5brO9F62fmqJ7xYoVyXWPHDmSrGNs9Y7zs+cH\ngiL8QFCEHwiK8ANBEX4gKMIPBEX4gaAY588UTeG9YcOG3Fpvb/oiRSM
jI8l6K8fai9b95ptvkvWi\ny2cvW7YsWR8YGEjWUT7G+QEkEX4gKMIPBEX4gaAIPxAU4QeCIvxAUIzz16mrqyu3tnTp0qaeu68v\nPZvZunXrkvVmznsvunY+02SPP4zzA0gi/EBQhB8IivADQRF+ICjCDwRF+IGgCsf5zWyqpOcldUty\nSf3uvtzMHpK0SNLh7KH3uXv+Se8a3+P8wHhR7zh/PeGfImmKu+8ws1MlbZc0X9L1kr529z/X2xTh\nB1qv3vBPqOOJDko6mN3+ysyGJJ3ZXHsAqnZcx/xmNk3SxZK2ZIvuMrN3zWyVmZ2Ws06fmW0zs21N\ndQqgVHV/t9/MTpG0SdIj7r7OzLolHVHtc4Clqh0a/KbgOXjbD7RYacf8kmRmJ0t6TdIb7v7EGPVp\nkl5z94sKnofwAy1W2ok9Vrs07LOShkYHP/sg8JgFknYdb5MAqlPPp/2zJG2W9J6kY9egvk/SjZJm\nqPa2f6+k27MPB1PPxZ4faLFS3/aXhfADrcf5/ACSCD8QFOEHgiL8QFCEHwiK8ANBEX4gKMIPBEX4\ngaAIPxAU4QeCIvxAUIQfCIrwA0EVXsCzZEckfTLqfle2rBN1am+d2pdEb40qs7eeeh/Y1vP5f/Li\nZtvcvbeyBhI6tbdO7Uuit0ZV1Rtv+4GgCD8QVNXh76/49VM6tbdO7Uuit0ZV0lulx/wAqlP1nh9A\nRSoJv5ldY2Z7zOwjM1tSRQ95zGyvmb1nZu9UPcVYNg3asJntGrXsdDN708w+zH6POU1aRb09ZGYH\nsm33jpldW1FvU83sH2Y2aGbvm9nvsuWVbrtEX5Vst7a/7TezkyR9IOkqSfslbZV0o7sPtrWRHGa2\nV1Kvu1c+JmxmsyV9Len5Y7MhmdmfJB1198eyfzhPc/c/dEhvD+k4Z25uUW95M0v/WhVuuzJnvC5D\nFXv+mZI+cveP3f1bSS9LmldBHx3P3d+WdPRHi+dJWp3dXq3aH0/b5fTWEdz9oLvvyG5/JenYzNKV\nbrtEX5WoIvxnSto36v5+ddaU3y7pLTPbbmZ9VTczhu5RMyN9Jqm7ymbGUDhzczv9aGbpjtl2jcx4\nXTY+8PupWe4+Q9JcSXdmb287kteO2TppuGalpHNVm8btoKTHq2wmm1l6raTF7v7l6FqV226MvirZ\nblWE/4CkqaPun5Ut6wjufiD7PSxpQLXDlE5y6Ngkqdnv4Yr7+T93P+Tu37v7iKRnVOG2y2aWXivp\nBXdfly2ufNuN1VdV262K8G+VdJ6ZnWNmP5d0g6T1FfTxE2Y2MfsgRmY2UdIcdd7sw+slLcxuL5T0\naoW9/ECnzNycN7O0Kt52HTfjtbu3/UfStap94v8fSfdX0UNOX+dK+nf2837VvUl6SbW3gf9V7bOR\nWyVNkrRR0oeS3pJ0egf19lfVZnN+V7WgTamot1mqvaV/V9I72c+1VW+7RF+VbDe+4QcExQd+QFCE\nHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeC+h9PPuXddgFbfgAAAABJRU5ErkJggg==\n", 160 | "text/plain": [ 161 | "" 162 | ] 163 | }, 164 | "metadata": {}, 165 | "output_type": "display_data" 166 | } 167 | ], 168 | "source": [ 169 | "for i in range(2):\n", 170 | " image = data.train.images[i]\n", 171 | " image = np.array(image, dtype='float')\n", 172 | " label = data.train.labels[i]\n", 173 | " pixels = image.reshape((28, 28))\n", 174 | " plt.imshow(pixels, cmap='gray')\n", 175 | " print('-----------------')\n", 176 | " print(label)\n", 177 | " plt.show()\n", 178 | " " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 8, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "if not os.path.exists('MNIST/images'):\n", 190 | " os.makedirs('MNIST/images/')\n", 191 | "os.chdir('MNIST/images/')" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 9, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "from matplotlib import image\n", 203 | "for i in range(1,10):\n", 204 | " png = data.train.images[i]\n", 205 | " png = np.array(png, dtype='float')\n", 206 | " pixels = png.reshape((28, 28))\n", 207 | " image.imsave('image_no_{}.png'.format(i), pixels, cmap = 'gray')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "['image_no_9.png', 'image_no_3.png', 'image_no_4.png', 'image_no_7.png', 'output', 'image_no_2.png', 'image_no_5.png', 'image_no_8.png', 'image_no_1.png', 'image_no_6.png']\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "print(os.listdir())" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 11, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "from Augmentor import Pipeline" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 
12, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Initialised with 9 image(s) found.\n", 248 | "Output directory set to /home/asherif844/sparkNotebooks/Ch03/MNIST/images/output." 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "augmentor = Pipeline('/home/asherif844/sparkNotebooks/Ch03/MNIST/images')" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 13, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "augmentor.rotate(probability=0.9, max_left_rotation=25, max_right_rotation=25)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 14, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stderr", 274 | "output_type": "stream", 275 | "text": [ 276 | "Processing : 100%|██████████| 10/10 [00:00<00:00, 160.13 Samples/s]\n", 277 | "Processing : 100%|██████████| 10/10 [00:00<00:00, 125.24 Samples/s]\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "for i in range(1,3):\n", 283 | " augmentor.sample(10)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 15, 289 | "metadata": { 290 | "collapsed": true 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "xtrain = data.train.images\n", 295 | "ytrain = np.asarray(data.train.labels)\n", 296 | "xtest = data.test.images \n", 297 | "ytest = np.asarray(data.test.labels)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 16, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "xtrain = xtrain.reshape( xtrain.shape[0],28,28,1)\n", 307 | "xtest = xtest.reshape(xtest.shape[0],28,28,1)\n", 308 | "ytest= ytest.reshape(ytest.shape[0],10)\n", 309 | "ytrain = ytrain.reshape(ytrain.shape[0],10)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 17, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "(55000, 28, 28, 1)\n", 322 | "(55000, 10)\n", 323 | "(10000, 28, 28, 1)\n", 324 | "(10000, 10)\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "print(xtrain.shape)\n", 330 | "print(ytrain.shape)\n", 331 | "print(xtest.shape)\n", 332 | "print(ytest.shape)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 18, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stderr", 342 | "output_type": "stream", 343 | "text": [ 344 | "Using TensorFlow backend.\n" 345 | ] 346 | } 347 | ], 348 | "source": [ 349 | "import keras\n", 350 | "import keras.backend as K\n", 351 | "from keras.models import Sequential\n", 352 | "from keras.layers import Dense, Flatten, Conv2D\n", 353 | "\n", 354 | "K.set_image_dim_ordering('tf')\n", 355 | "\n", 356 | "model = Sequential()\n", 357 | "\n", 358 | "model.add(Conv2D(32, kernel_size=(5, 5),activation='relu', input_shape=(28,28,1)))\n", 359 | "model.add(Flatten())\n", 360 | "model.add(Dense(128, activation='relu'))\n", 361 | "model.add(Dense(10, activation='sigmoid'))\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 19, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "model.compile(optimizer='adam',loss='categorical_crossentropy', \n", 371 | " metrics=['accuracy'])\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 20, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "Train on 55000 samples, 
validate on 10000 samples\n", 384 | "Epoch 1/5\n", 385 | "55000/55000 [==============================] - 46s 832us/step - loss: 0.3617 - acc: 0.9032 - val_loss: 0.1214 - val_acc: 0.9651\n", 386 | "Epoch 2/5\n", 387 | "55000/55000 [==============================] - 44s 797us/step - loss: 0.0928 - acc: 0.9731 - val_loss: 0.0809 - val_acc: 0.9770\n", 388 | "Epoch 3/5\n", 389 | "55000/55000 [==============================] - 44s 796us/step - loss: 0.0555 - acc: 0.9837 - val_loss: 0.0521 - val_acc: 0.9839\n", 390 | "Epoch 4/5\n", 391 | "55000/55000 [==============================] - 42s 756us/step - loss: 0.0410 - acc: 0.9881 - val_loss: 0.0521 - val_acc: 0.9823\n", 392 | "Epoch 5/5\n", 393 | "55000/55000 [==============================] - 43s 782us/step - loss: 0.0309 - acc: 0.9909 - val_loss: 0.0457 - val_acc: 0.9861\n" 394 | ] 395 | }, 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "" 400 | ] 401 | }, 402 | "execution_count": 20, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "model.fit(xtrain,ytrain,batch_size=512,\n", 409 | " epochs=5,\n", 410 | " validation_data=(xtest, ytest))" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 21, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "10000/10000 [==============================] - 3s 324us/step\n", 423 | "The accuracy rate is 98.6%\n", 424 | "The loss rate is 5.0%\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "stats = model.evaluate(xtest, ytest)\n", 430 | "print('The accuracy rate is {}%'.format(round(stats[1],3)*100))\n", 431 | "print('The loss rate is {}%'.format(round(stats[0],2)*100))" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 22, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "name": "stdout", 441 | "output_type": "stream", 442 | "text": [ 443 | "_________________________________________________________________\n", 444 | "Layer (type) Output Shape Param # \n", 445 | "=================================================================\n", 446 | "conv2d_1 (Conv2D) (None, 24, 24, 32) 832 \n", 447 | "_________________________________________________________________\n", 448 | "flatten_1 (Flatten) (None, 18432) 0 \n", 449 | "_________________________________________________________________\n", 450 | "dense_1 (Dense) (None, 128) 2359424 \n", 451 | "_________________________________________________________________\n", 452 | "dense_2 (Dense) (None, 10) 1290 \n", 453 | "=================================================================\n", 454 | "Total params: 2,361,546\n", 455 | "Trainable params: 2,361,546\n", 456 | "Non-trainable params: 0\n", 457 | "_________________________________________________________________\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "model.summary()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.6.1" 492 | } 493 | }, 494 | "nbformat": 4, 495 | 
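# --- Editor's aside (not part of the original CH03 notebook) -------------------------
# The network above ends in a sigmoid output layer while being compiled with
# categorical_crossentropy, and the evaluation cell prints the loss scaled by 100 even
# though cross-entropy is not a percentage. A minimal sketch of the more conventional
# setup for 10-class MNIST is shown below; the layer sizes and the model_alt name are
# illustrative assumptions, not the author's code.
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

model_alt = Sequential([
    Conv2D(32, kernel_size=(5, 5), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),   # shrinks the 24x24 feature maps to 12x12
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')   # probability distribution over the 10 digit classes
])
model_alt.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# -------------------------------------------------------------------------------------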
"nbformat_minor": 2 496 | } 497 | -------------------------------------------------------------------------------- /CH06/code/CH06_LSTMs+word+level.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "from numpy import array\n", 18 | "from pickle import dump\n", 19 | "from keras.preprocessing.text import Tokenizer\n", 20 | "from keras.utils import to_categorical\n", 21 | "from keras.models import Sequential\n", 22 | "from keras.layers import Dense\n", 23 | "from keras.layers import LSTM\n", 24 | "from keras.layers import Embedding" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "'/Users/Chanti'" 36 | ] 37 | }, 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "pwd" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "/Users/Chanti/Desktop/Cookbook/Chapter 8\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "cd '/Users/Chanti/Desktop/Cookbook/Chapter 8'" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "'/Users/Chanti/Desktop/Cookbook/Chapter 8'" 73 | ] 74 | }, 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "pwd" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# load doc into memory\n", 91 | "def load_document(name):\n", 92 | " file = open(name, 'r')\n", 93 | " text = file.read()\n", 94 | " file.close()\n", 95 | " return text" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "The Project Gutenberg EBook of The Jungle Book, by Rudyard Kipling\n", 108 | "\n", 109 | "This eBook is for the use of anyone anywhere at no cost and with\n", 110 | "almost no restrictions whatsoever. 
You may copy it, give it away or\n", 111 | "re-use it under the terms of the Project Gutenberg License included\n", 112 | "with this eBook or online at www.gutenberg.org\n", 113 | "\n", 114 | "\n", 115 | "Title: The Jungle Book\n", 116 | "\n", 117 | "Author: Rudyard Kipling\n", 118 | "\n", 119 | "Release Date: January 16, 2006 [EBook #236]\n", 120 | "Last Updated: October 6, 2016\n", 121 | "\n", 122 | "Language: English\n", 123 | "\n", 124 | "Character set encoding: UTF-8\n", 125 | "\n", 126 | "*** START OF THIS PROJECT GUTENBERG EBOOK THE JUNGLE BOOK ***\n", 127 | "\n", 128 | "\n", 129 | "\n", 130 | "\n", 131 | "Produced by An Anonymous Volunteer and David Widger\n", 132 | "\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "\n", 137 | "THE JUNGLE BOOK\n", 138 | "\n", 139 | "By Rudyard Kipling\n", 140 | "\n", 141 | "\n", 142 | "\n", 143 | "Contents\n", 144 | "\n", 145 | " Mowgli’s Brothers\n", 146 | " Hunting-Song of the Seeonee Pack\n", 147 | " Kaa’s Hunting\n", 148 | " Road-Song of the Bandar-Log\n", 149 | " “Tiger! Tiger!”\n", 150 | " Mowgli’s Song\n", 151 | " The White Seal\n", 152 | " Lukannon\n", 153 | " “Rikki-Tikki-Tavi”\n", 154 | " Darzee’s Chant\n", 155 | " Toomai of the Elephants\n", 156 | " Shiv and the Grasshopper\n", 157 | " Her Majesty’s Servants\n", 158 | " Parade Song of the Camp Animals\n", 159 | "\n", 160 | "\n", 161 | "\n", 162 | "\n", 163 | "Mowgli’s Brothers\n", 164 | "\n", 165 | " Now Rann the Kite brings home the night\n", 166 | " That Mang the Bat sets free--\n", 167 | " The herds are shut in byre and hut\n", 168 | " For loosed till dawn are we.\n", 169 | " This is the hour of pride and power,\n", 170 | " Talon and tush and claw.\n", 171 | " Oh, hear the call!--Good hunting all\n", 172 | " That keep the Jungle Law!\n", 173 | " Night-Song in the Jungle\n", 174 | "\n", 175 | "It was seven o’clock of a very warm evening in the Seeonee hills when\n", 176 | "Father Wolf woke up from his day’s rest, scratched himself, yawned, and\n", 177 | "spread out his paws one after the other to get rid of the sleepy feeling\n", 178 | "in their tips. Mother Wolf lay with her big gray nose dropped across her\n", 179 | "four tumbling, squealing cubs, and the moon shone into the mouth of the\n", 180 | "cave where they all lived. “Augrh!” said Father Wolf. “It is time to\n", 181 | "hunt again.” He was going to spring down hill when a little shadow with\n", 182 | "a bushy tail crossed the threshold and whined: “Good luck go with you, O\n", 183 | "Chief of the Wolves. 
And good luck and\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "# load document\n", 189 | "input_filename = 'junglebook.txt'\n", 190 | "doc = load_document(input_filename)\n", 191 | "print(doc[:2000])" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 7, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "import string\n", 203 | " \n", 204 | "# turn a document into clean tokens\n", 205 | "def clean_document(doc):\n", 206 | " doc = doc.replace('--', ' ')\n", 207 | " tokens = doc.split()\n", 208 | " table = str.maketrans('', '', string.punctuation)\n", 209 | " tokens = [w.translate(table) for w in tokens]\n", 210 | " tokens = [word for word in tokens if word.isalpha()]\n", 211 | " tokens = [word.lower() for word in tokens]\n", 212 | " return tokens" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 8, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "['project', 'gutenberg', 'ebook', 'of', 'the', 'jungle', 'book', 'by', 'rudyard', 'kipling', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'title', 'the', 'jungle', 'book', 'author', 'rudyard', 'kipling', 'release', 'date', 'january', 'ebook', 'last', 'updated', 'october', 'language', 'english', 'character', 'set', 'encoding', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'jungle', 'book', 'produced', 'by', 'an', 'anonymous', 'volunteer', 'and', 'david', 'widger', 'the', 'jungle', 'book', 'by', 'rudyard', 'kipling', 'contents', 'brothers', 'huntingsong', 'of', 'the', 'seeonee', 'pack', 'hunting', 'roadsong', 'of', 'the', 'bandarlog', 'song', 'the', 'white', 'seal', 'lukannon', 'chant', 'toomai', 'of', 'the', 'elephants', 'shiv', 'and', 'the', 'grasshopper', 'her', 'servants', 'parade', 'song', 'of', 'the', 'camp', 'animals', 'brothers', 'now', 'rann', 'the', 'kite', 'brings', 'home', 'the', 'night', 'that', 'mang', 'the', 'bat', 'sets', 'free', 'the', 'herds', 'are', 'shut', 'in', 'byre', 'and', 'hut', 'for', 'loosed', 'till', 'dawn', 'are', 'we', 'this', 'is', 'the', 'hour', 'of', 'pride', 'and', 'power', 'talon', 'and', 'tush', 'and', 'claw', 'oh', 'hear', 'the', 'call', 'good', 'hunting', 'all', 'that', 'keep', 'the', 'jungle', 'law', 'nightsong', 'in', 'the', 'jungle', 'it', 'was', 'seven', 'of', 'a', 'very', 'warm', 'evening', 'in', 'the', 'seeonee', 'hills']\n", 225 | "Total Tokens: 51473\n", 226 | "Unique Tokens: 5027\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "# clean document\n", 232 | "tokens = clean_document(doc)\n", 233 | "print(tokens[:200])\n", 234 | "print('Total Tokens: %d' % len(tokens))\n", 235 | "print('Unique Tokens: %d' % len(set(tokens)))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 9, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Total Sequences: 51422\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "# organize into sequences (of length 50) of tokens\n", 253 | "length = 50 + 1\n", 254 | "sequences = list()\n", 255 | "for i in range(length, len(tokens)):\n", 256 | " # select 
sequence of tokens\n", 257 | " seq = tokens[i-length:i]\n", 258 | " # convert into a line\n", 259 | " line = ' '.join(seq)\n", 260 | " sequences.append(line)\n", 261 | "print('Total Sequences: %d' % len(sequences))" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 10, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "# save tokens to file, one dialog per line\n", 273 | "def save_document(lines, name):\n", 274 | " data = '\\n'.join(lines)\n", 275 | " file = open(name, 'w')\n", 276 | " file.write(data)\n", 277 | " file.close()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 11, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# save sequences to file\n", 289 | "output_filename = 'junglebook_sequences.txt'\n", 290 | "save_document(sequences, output_filename)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 12, 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "# load document into memory\n", 302 | "def load_document(name):\n", 303 | " file = open(name, 'r')\n", 304 | " text = file.read()\n", 305 | " file.close()\n", 306 | " return text\n", 307 | " \n", 308 | "# load\n", 309 | "input_filename = 'junglebook_sequences.txt'\n", 310 | "doc = load_document(input_filename)\n", 311 | "lines = doc.split('\\n')" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 13, 317 | "metadata": { 318 | "collapsed": true 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "# integer encode sequences of words\n", 323 | "tokenizer = Tokenizer()\n", 324 | "tokenizer.fit_on_texts(lines)\n", 325 | "sequences = tokenizer.texts_to_sequences(lines)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 14, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "# vocabulary size\n", 337 | "vocab_size = len(tokenizer.word_index) + 1 " 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 15, 343 | "metadata": { 344 | "collapsed": true 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "# separate into input and output\n", 349 | "sequences = array(sequences)\n", 350 | "Input, Output = sequences[:,:-1], sequences[:,-1]\n", 351 | "Output = to_categorical(Output, num_classes=vocab_size)\n", 352 | "sequence_length = Input.shape[1]" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 16, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "_________________________________________________________________\n", 365 | "Layer (type) Output Shape Param # \n", 366 | "=================================================================\n", 367 | "embedding_1 (Embedding) (None, 50, 100) 502800 \n", 368 | "_________________________________________________________________\n", 369 | "lstm_1 (LSTM) (None, 50, 200) 240800 \n", 370 | "_________________________________________________________________\n", 371 | "lstm_2 (LSTM) (None, 200) 320800 \n", 372 | "_________________________________________________________________\n", 373 | "dropout_1 (Dropout) (None, 200) 0 \n", 374 | "_________________________________________________________________\n", 375 | "dense_1 (Dense) (None, 200) 40200 \n", 376 | "_________________________________________________________________\n", 377 | "dense_2 (Dense) (None, 5028) 
1010628 \n", 378 | "=================================================================\n", 379 | "Total params: 2,115,228\n", 380 | "Trainable params: 2,115,228\n", 381 | "Non-trainable params: 0\n", 382 | "_________________________________________________________________\n", 383 | "None\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "# define model\n", 389 | "from keras.layers import Dropout\n", 390 | "model = Sequential()\n", 391 | "model.add(Embedding(vocab_size, 100, input_length=sequence_length))\n", 392 | "model.add(LSTM(200, return_sequences=True))\n", 393 | "model.add(LSTM(200))\n", 394 | "model.add(Dropout(0.3))\n", 395 | "model.add(Dense(200, activation='relu'))\n", 396 | "model.add(Dense(vocab_size, activation='softmax'))\n", 397 | "print(model.summary())" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 17, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "Epoch 1/75\n", 410 | "51422/51422 [==============================] - 517s 10ms/step - loss: 6.6069 - acc: 0.0682\n", 411 | "Epoch 2/75\n", 412 | "51422/51422 [==============================] - 501s 10ms/step - loss: 6.2250 - acc: 0.0721\n", 413 | "Epoch 3/75\n", 414 | "51422/51422 [==============================] - 494s 10ms/step - loss: 6.0805 - acc: 0.0827\n", 415 | "Epoch 4/75\n", 416 | "51422/51422 [==============================] - 453s 9ms/step - loss: 5.9354 - acc: 0.0911\n", 417 | "Epoch 5/75\n", 418 | "51422/51422 [==============================] - 451s 9ms/step - loss: 5.8014 - acc: 0.1025\n", 419 | "Epoch 6/75\n", 420 | "51422/51422 [==============================] - 448s 9ms/step - loss: 5.6800 - acc: 0.1126\n", 421 | "Epoch 7/75\n", 422 | "51422/51422 [==============================] - 448s 9ms/step - loss: 5.5646 - acc: 0.1198\n", 423 | "Epoch 8/75\n", 424 | "51422/51422 [==============================] - 447s 9ms/step - loss: 5.4614 - acc: 0.1267\n", 425 | "Epoch 9/75\n", 426 | "51422/51422 [==============================] - 447s 9ms/step - loss: 5.3677 - acc: 0.1315\n", 427 | "Epoch 10/75\n", 428 | "51422/51422 [==============================] - 449s 9ms/step - loss: 5.2885 - acc: 0.1342\n", 429 | "Epoch 11/75\n", 430 | "51422/51422 [==============================] - 450s 9ms/step - loss: 5.2218 - acc: 0.1380\n", 431 | "Epoch 12/75\n", 432 | "51422/51422 [==============================] - 449s 9ms/step - loss: 5.1429 - acc: 0.1402\n", 433 | "Epoch 13/75\n", 434 | "51422/51422 [==============================] - 449s 9ms/step - loss: 5.0917 - acc: 0.1424\n", 435 | "Epoch 14/75\n", 436 | "51422/51422 [==============================] - 448s 9ms/step - loss: 5.0171 - acc: 0.1452\n", 437 | "Epoch 15/75\n", 438 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.9520 - acc: 0.1473\n", 439 | "Epoch 16/75\n", 440 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.8880 - acc: 0.1506\n", 441 | "Epoch 17/75\n", 442 | "51422/51422 [==============================] - 447s 9ms/step - loss: 4.8307 - acc: 0.1551\n", 443 | "Epoch 18/75\n", 444 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.8129 - acc: 0.1550\n", 445 | "Epoch 19/75\n", 446 | "51422/51422 [==============================] - 450s 9ms/step - loss: 4.7857 - acc: 0.1548\n", 447 | "Epoch 20/75\n", 448 | "51422/51422 [==============================] - 449s 9ms/step - loss: 4.7032 - acc: 0.1593\n", 449 | "Epoch 21/75\n", 450 | "51422/51422 [==============================] - 450s 9ms/step - 
loss: 4.6548 - acc: 0.1600\n", 451 | "Epoch 22/75\n", 452 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.5812 - acc: 0.1629\n", 453 | "Epoch 23/75\n", 454 | "51422/51422 [==============================] - 447s 9ms/step - loss: 4.5474 - acc: 0.1641\n", 455 | "Epoch 24/75\n", 456 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.4725 - acc: 0.1664\n", 457 | "Epoch 25/75\n", 458 | "51422/51422 [==============================] - 447s 9ms/step - loss: 4.5027 - acc: 0.1659\n", 459 | "Epoch 26/75\n", 460 | "51422/51422 [==============================] - 449s 9ms/step - loss: 4.4486 - acc: 0.1674\n", 461 | "Epoch 27/75\n", 462 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.3099 - acc: 0.1745\n", 463 | "Epoch 28/75\n", 464 | "51422/51422 [==============================] - 452s 9ms/step - loss: 4.2418 - acc: 0.1782\n", 465 | "Epoch 29/75\n", 466 | "51422/51422 [==============================] - 462s 9ms/step - loss: 4.2303 - acc: 0.1788\n", 467 | "Epoch 30/75\n", 468 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.1416 - acc: 0.1838\n", 469 | "Epoch 31/75\n", 470 | "51422/51422 [==============================] - 450s 9ms/step - loss: 4.0701 - acc: 0.1886\n", 471 | "Epoch 32/75\n", 472 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.0057 - acc: 0.1921\n", 473 | "Epoch 33/75\n", 474 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.9404 - acc: 0.1977\n", 475 | "Epoch 34/75\n", 476 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.8961 - acc: 0.2004\n", 477 | "Epoch 35/75\n", 478 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.8313 - acc: 0.2064\n", 479 | "Epoch 36/75\n", 480 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.7746 - acc: 0.2139\n", 481 | "Epoch 37/75\n", 482 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.7493 - acc: 0.2157\n", 483 | "Epoch 38/75\n", 484 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.6876 - acc: 0.2225\n", 485 | "Epoch 39/75\n", 486 | "51422/51422 [==============================] - 447s 9ms/step - loss: 3.6356 - acc: 0.2274\n", 487 | "Epoch 40/75\n", 488 | "51422/51422 [==============================] - 451s 9ms/step - loss: 3.5717 - acc: 0.2344\n", 489 | "Epoch 41/75\n", 490 | "51422/51422 [==============================] - 447s 9ms/step - loss: 3.5353 - acc: 0.2374\n", 491 | "Epoch 42/75\n", 492 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.4846 - acc: 0.2462\n", 493 | "Epoch 43/75\n", 494 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.4388 - acc: 0.2502\n", 495 | "Epoch 44/75\n", 496 | "51422/51422 [==============================] - 455s 9ms/step - loss: 3.3920 - acc: 0.2545\n", 497 | "Epoch 45/75\n", 498 | "51422/51422 [==============================] - 453s 9ms/step - loss: 3.3505 - acc: 0.2589\n", 499 | "Epoch 46/75\n", 500 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.3113 - acc: 0.2662\n", 501 | "Epoch 47/75\n", 502 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.3232 - acc: 0.2664\n", 503 | "Epoch 48/75\n", 504 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.2610 - acc: 0.2730\n", 505 | "Epoch 49/75\n", 506 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.2505 - acc: 0.2748\n", 507 | "Epoch 50/75\n", 508 | "51422/51422 
[==============================] - 450s 9ms/step - loss: 3.2856 - acc: 0.2771\n", 509 | "Epoch 51/75\n", 510 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.1842 - acc: 0.2860\n", 511 | "Epoch 52/75\n", 512 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.1172 - acc: 0.2948\n", 513 | "Epoch 53/75\n", 514 | "51422/51422 [==============================] - 455s 9ms/step - loss: 3.1662 - acc: 0.2918\n", 515 | "Epoch 54/75\n", 516 | "51422/51422 [==============================] - 454s 9ms/step - loss: 3.4129 - acc: 0.2656\n", 517 | "Epoch 55/75\n", 518 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.3144 - acc: 0.2733\n", 519 | "Epoch 56/75\n", 520 | "51422/51422 [==============================] - 551s 11ms/step - loss: 3.2530 - acc: 0.2807\n", 521 | "Epoch 57/75\n", 522 | "51422/51422 [==============================] - 539s 10ms/step - loss: 3.1926 - acc: 0.2868\n", 523 | "Epoch 58/75\n", 524 | "51422/51422 [==============================] - 532s 10ms/step - loss: 3.1441 - acc: 0.2928\n", 525 | "Epoch 59/75\n", 526 | "51422/51422 [==============================] - 529s 10ms/step - loss: 3.0970 - acc: 0.2979\n", 527 | "Epoch 60/75\n", 528 | "51422/51422 [==============================] - 541s 11ms/step - loss: 3.0582 - acc: 0.3036\n", 529 | "Epoch 61/75\n", 530 | "51422/51422 [==============================] - 524s 10ms/step - loss: 3.0121 - acc: 0.3111\n", 531 | "Epoch 62/75\n", 532 | "51422/51422 [==============================] - 530s 10ms/step - loss: 2.9672 - acc: 0.3175\n", 533 | "Epoch 63/75\n", 534 | "51422/51422 [==============================] - 532s 10ms/step - loss: 2.9369 - acc: 0.3231\n", 535 | "Epoch 64/75\n", 536 | "51422/51422 [==============================] - 544s 11ms/step - loss: 2.8845 - acc: 0.3300\n", 537 | "Epoch 65/75\n", 538 | "51422/51422 [==============================] - 579s 11ms/step - loss: 2.8595 - acc: 0.3357\n", 539 | "Epoch 66/75\n", 540 | "51422/51422 [==============================] - 525s 10ms/step - loss: 2.8161 - acc: 0.3400\n", 541 | "Epoch 67/75\n", 542 | "51422/51422 [==============================] - 458s 9ms/step - loss: 2.7810 - acc: 0.3441\n", 543 | "Epoch 68/75\n", 544 | "51422/51422 [==============================] - 516s 10ms/step - loss: 2.7346 - acc: 0.3547\n", 545 | "Epoch 69/75\n", 546 | "51422/51422 [==============================] - 522s 10ms/step - loss: 2.7065 - acc: 0.3570\n", 547 | "Epoch 70/75\n", 548 | "51422/51422 [==============================] - 458s 9ms/step - loss: 2.6710 - acc: 0.3642\n", 549 | "Epoch 71/75\n", 550 | "51422/51422 [==============================] - 449s 9ms/step - loss: 2.6264 - acc: 0.3716\n", 551 | "Epoch 72/75\n", 552 | "51422/51422 [==============================] - 450s 9ms/step - loss: 2.6027 - acc: 0.3766\n", 553 | "Epoch 73/75\n", 554 | "51422/51422 [==============================] - 461s 9ms/step - loss: 2.5761 - acc: 0.3784\n", 555 | "Epoch 74/75\n", 556 | "51422/51422 [==============================] - 454s 9ms/step - loss: 2.5370 - acc: 0.3874\n", 557 | "Epoch 75/75\n", 558 | "51422/51422 [==============================] - 450s 9ms/step - loss: 2.5038 - acc: 0.3938\n" 559 | ] 560 | }, 561 | { 562 | "data": { 563 | "text/plain": [ 564 | "" 565 | ] 566 | }, 567 | "execution_count": 17, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "# compile model\n", 574 | "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 575 | "# fit model\n", 
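# (Editor's aside) Input holds 51,422 integer-encoded sequences of 50 words each and
# Output is the one-hot next word over the 5,028-word vocabulary, so the call below
# trains on 51,422 samples in batches of 250 for 75 epochs; at the ~450-550 s per epoch
# logged above, that is roughly ten hours of training, ending near 39% next-word accuracy.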
576 | "model.fit(Input, Output, batch_size=250, epochs=75)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 18, 582 | "metadata": { 583 | "collapsed": true 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "# save the model to file\n", 588 | "model.save('junglebook_trained.h5')\n", 589 | "# save the tokenizer\n", 590 | "dump(tokenizer, open('tokenizer.pkl', 'wb'))" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 19, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "# load doc into memory\n", 600 | "def load_document(name):\n", 601 | " file = open(name, 'r')\n", 602 | " text = file.read()\n", 603 | " file.close()\n", 604 | " return text\n", 605 | " \n", 606 | "# load cleaned text sequences\n", 607 | "input_filename = 'junglebook_sequences.txt'\n", 608 | "doc = load_document(input_filename)\n", 609 | "lines = doc.split('\\n')" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 20, 615 | "metadata": { 616 | "collapsed": true 617 | }, 618 | "outputs": [], 619 | "source": [ 620 | "sequence_length = len(lines[0].split()) - 1" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 21, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "# load the model\n", 630 | "from keras.models import load_model\n", 631 | "model = load_model('junglebook_trained.h5')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 22, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "name": "stdout", 641 | "output_type": "stream", 642 | "text": [ 643 | "to me not long ago with some rude talk that i was a naked cub and not fit to dig pignuts but i caught tabaqui by the tail and swung him twice against a palmtree to teach him better was foolishness for though tabaqui is a mischiefmaker he would have told\n", 644 | "\n" 645 | ] 646 | } 647 | ], 648 | "source": [ 649 | "# select a seed text\n", 650 | "from random import randint\n", 651 | "seed_text = lines[randint(0,len(lines))]\n", 652 | "print(seed_text + '\\n')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 23, 658 | "metadata": { 659 | "collapsed": true 660 | }, 661 | "outputs": [], 662 | "source": [ 663 | "encoded = tokenizer.texts_to_sequences([seed_text])[0]" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 24, 669 | "metadata": { 670 | "collapsed": true 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "from random import randint\n", 675 | "from pickle import load\n", 676 | "from keras.models import load_model\n", 677 | "from keras.preprocessing.sequence import pad_sequences\n", 678 | " \n", 679 | "# load doc into memory\n", 680 | "def load_document(name):\n", 681 | " file = open(name, 'r')\n", 682 | " text = file.read()\n", 683 | " file.close()\n", 684 | " return text\n", 685 | " \n", 686 | "# generate a sequence from a language model\n", 687 | "def generate_sequence(model, tokenizer, sequence_length, seed_text, n_words):\n", 688 | "\tresult = list()\n", 689 | "\tinput_text = seed_text\n", 690 | "\t# generate a fixed number of words\n", 691 | "\tfor _ in range(n_words):\n", 692 | "\t\t# encode the text as integer\n", 693 | "\t\tencoded = tokenizer.texts_to_sequences([input_text])[0]\n", 694 | "\t\t# truncate sequences to a fixed length\n", 695 | "\t\tencoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')\n", 696 | "\t\t# predict probabilities for each word\n", 697 | "\t\tprediction = model.predict_classes(encoded, 
verbose=0)\n", 698 | "\t\t# map predicted word index to word\n", 699 | "\t\tout_word = ''\n", 700 | "\t\tfor word, index in tokenizer.word_index.items():\n", 701 | "\t\t\tif index == prediction:\n", 702 | "\t\t\t\tout_word = word\n", 703 | "\t\t\t\tbreak\n", 704 | "\t\t# append to input\n", 705 | "\t\tinput_text += ' ' + out_word\n", 706 | "\t\tresult.append(out_word)\n", 707 | "\treturn ' '.join(result)\n", 708 | " \n", 709 | "# load cleaned text sequences\n", 710 | "input_filename = 'junglebook_sequences.txt'\n", 711 | "doc = load_document(input_filename)\n", 712 | "lines = doc.split('\\n')\n", 713 | "seq_length = len(lines[0].split()) - 1" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 25, 719 | "metadata": {}, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "baskets of dried grass and put grasshoppers in them or catch two praying mantises and make them fight or string a necklace of red and black jungle nuts or watch a lizard basking on a rock or a snake hunting a frog near the wallows then they sing long long songs\n", 726 | "\n", 727 | "with odd native quavers at the end of the review and the hyaena whom he had seen the truth they feel twitched to the noises round him for a picture of the end of the ravine and snuffing bitten and best of the bulls at the dawn is a native\n" 728 | ] 729 | } 730 | ], 731 | "source": [ 732 | "# load the model\n", 733 | "model = load_model('junglebook_trained.h5')\n", 734 | " \n", 735 | "# load the tokenizer\n", 736 | "tokenizer = load(open('tokenizer.pkl', 'rb'))\n", 737 | " \n", 738 | "# select a seed text\n", 739 | "seed_text = lines[randint(0,len(lines))]\n", 740 | "print(seed_text + '\\n')\n", 741 | " \n", 742 | "# generate new text\n", 743 | "generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50)\n", 744 | "print(generated)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 26, 750 | "metadata": {}, 751 | "outputs": [ 752 | { 753 | "name": "stdout", 754 | "output_type": "stream", 755 | "text": [ 756 | "little toomai there was a splash and a trample and the rush of running water and kala nag strode through the bed of a river feeling his way at each step above the noise of the water as it swirled round the legs little toomai could hear more splashing and some\n", 757 | "\n", 758 | "trumpeting both upstream and down grass and knocked him up to the jealous moon he could see bruised of dust for the potter was rann caught him up to the plowed din of the melbourne lines where the two wolves would be forced to make themselves rifles and the sparks\n" 759 | ] 760 | } 761 | ], 762 | "source": [ 763 | "# load the model\n", 764 | "model = load_model('junglebook_trained.h5')\n", 765 | " \n", 766 | "# load the tokenizer\n", 767 | "tokenizer = load(open('tokenizer.pkl', 'rb'))\n", 768 | " \n", 769 | "# select a seed text\n", 770 | "seed_text = lines[randint(0,len(lines))]\n", 771 | "print(seed_text + '\\n')\n", 772 | " \n", 773 | "# generate new text\n", 774 | "generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50)\n", 775 | "print(generated)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 29, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "is in their legs and he remembered the good firm beaches of novastoshnah seven thousand miles away the games his companions played the smell of the seaweed the 
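# --- Editor's aside (not part of the original CH06 notebook) -------------------------
# In generate_sequence above, the padding call uses the module-level variable seq_length
# rather than the sequence_length parameter, so the function only works because
# seq_length happens to be defined before it is called. A self-contained variant that
# pads with the parameter itself is sketched below; the _fixed suffix is illustrative.
from keras.preprocessing.sequence import pad_sequences

def generate_sequence_fixed(model, tokenizer, sequence_length, seed_text, n_words):
    result = list()
    input_text = seed_text
    for _ in range(n_words):
        # encode the running text and keep only the last sequence_length words
        encoded = tokenizer.texts_to_sequences([input_text])[0]
        encoded = pad_sequences([encoded], maxlen=sequence_length, truncating='pre')
        # predict the index of the most likely next word
        prediction = model.predict_classes(encoded, verbose=0)
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == prediction:
                out_word = word
                break
        # append the word and slide the generation window forward
        input_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
# -------------------------------------------------------------------------------------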
seal roar and the fighting that very minute he turned north swimming steadily and as he went on he met scores of his\n", 788 | "\n", 789 | "mates and bound like the deck of the fighters and harness under his breath and he could not be able to stop a ship and ducked to nag wound up with scores of marble tracery showing all the regiments went twisting his head and shoulders and creepers very seldom shows\n" 790 | ] 791 | } 792 | ], 793 | "source": [ 794 | "# load the model\n", 795 | "model = load_model('junglebook_trained.h5')\n", 796 | " \n", 797 | "# load the tokenizer\n", 798 | "tokenizer = load(open('tokenizer.pkl', 'rb'))\n", 799 | " \n", 800 | "# select a seed text\n", 801 | "seed_text = lines[randint(0,len(lines))]\n", 802 | "print(seed_text + '\\n')\n", 803 | " \n", 804 | "# generate new text\n", 805 | "generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50)\n", 806 | "print(generated)" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": { 813 | "collapsed": true 814 | }, 815 | "outputs": [], 816 | "source": [] 817 | } 818 | ], 819 | "metadata": { 820 | "kernelspec": { 821 | "display_name": "Python 3", 822 | "language": "python", 823 | "name": "python3" 824 | }, 825 | "language_info": { 826 | "codemirror_mode": { 827 | "name": "ipython", 828 | "version": 3 829 | }, 830 | "file_extension": ".py", 831 | "mimetype": "text/x-python", 832 | "name": "python", 833 | "nbconvert_exporter": "python", 834 | "pygments_lexer": "ipython3", 835 | "version": "3.6.3" 836 | } 837 | }, 838 | "nbformat": 4, 839 | "nbformat_minor": 2 840 | } 841 | -------------------------------------------------------------------------------- /CH07/code/Natural+Language+Processing+-+ChatBot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "spark = SparkSession.builder \\\n", 12 | " .master(\"local\") \\\n", 13 | " .appName(\"Natural Language Processing\") \\\n", 14 | " .config(\"spark.executor.memory\", \"6gb\") \\\n", 15 | " .getOrCreate()" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "df = spark.read.format('com.databricks.spark.csv')\\\n", 27 | " .options(header='true', inferschema='true')\\\n", 28 | " .load('TherapyBotSession.csv')\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": { 35 | "scrolled": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "+---+---------------+--------------------+----+----+----+----+\n", 43 | "| id| label| chat| _c3| _c4| _c5| _c6|\n", 44 | "+---+---------------+--------------------+----+----+----+----+\n", 45 | "| 1| escalate|I had a friend th...|null|null|null|null|\n", 46 | "| 2| escalate|\"My friend dealt ...|null|null|null|null|\n", 47 | "| 3| escalate|Friend who had bi...|null|null|null|null|\n", 48 | "| 4|do_not_escalate|Over the internet...|null|null|null|null|\n", 49 | "| 5| escalate|Having gone throu...|null|null|null|null|\n", 50 | "| 6| escalate|My now girlfriend...|null|null|null|null|\n", 51 | "| 7|do_not_escalate|\"Only really one ...|null|null|null|null|\n", 52 | "| 8|do_not_escalate|Now that I've bee...|null|null|null|null|\n", 53 | "| 9|do_not_escalate|I've always been 
...|null|null|null|null|\n", 54 | "| 10| escalate|I feel completely...|null|null|null|null|\n", 55 | "| 11|do_not_escalate|Took a week off w...|null|null|null|null|\n", 56 | "| 12| escalate|One of my best fr...|null|null|null|null|\n", 57 | "| 13| escalate|I've had some fri...|null|null|null|null|\n", 58 | "| 14|do_not_escalate|Haha. In eight gr...|null|null|null|null|\n", 59 | "| 15|do_not_escalate|Some of my friend...|null|null|null|null|\n", 60 | "| 16| escalate|I feel like depre...|null|null|null|null|\n", 61 | "| 17| escalate|i've had a couple...|null|null|null|null|\n", 62 | "| 18| escalate|I will always lis...|null|null|null|null|\n", 63 | "| 19|do_not_escalate|A lot for my frie...|null|null|null|null|\n", 64 | "| 20|do_not_escalate|When my friend ne...|null|null|null|null|\n", 65 | "+---+---------------+--------------------+----+----+----+----+\n", 66 | "only showing top 20 rows\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "df.show()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "df = df.select('id', 'label', 'chat')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "+---+---------------+--------------------+\n", 96 | "| id| label| chat|\n", 97 | "+---+---------------+--------------------+\n", 98 | "| 1| escalate|I had a friend th...|\n", 99 | "| 2| escalate|\"My friend dealt ...|\n", 100 | "| 3| escalate|Friend who had bi...|\n", 101 | "| 4|do_not_escalate|Over the internet...|\n", 102 | "| 5| escalate|Having gone throu...|\n", 103 | "| 6| escalate|My now girlfriend...|\n", 104 | "| 7|do_not_escalate|\"Only really one ...|\n", 105 | "| 8|do_not_escalate|Now that I've bee...|\n", 106 | "| 9|do_not_escalate|I've always been ...|\n", 107 | "| 10| escalate|I feel completely...|\n", 108 | "| 11|do_not_escalate|Took a week off w...|\n", 109 | "| 12| escalate|One of my best fr...|\n", 110 | "| 13| escalate|I've had some fri...|\n", 111 | "| 14|do_not_escalate|Haha. 
In eight gr...|\n", 112 | "| 15|do_not_escalate|Some of my friend...|\n", 113 | "| 16| escalate|I feel like depre...|\n", 114 | "| 17| escalate|i've had a couple...|\n", 115 | "| 18| escalate|I will always lis...|\n", 116 | "| 19|do_not_escalate|A lot for my frie...|\n", 117 | "| 20|do_not_escalate|When my friend ne...|\n", 118 | "+---+---------------+--------------------+\n", 119 | "only showing top 20 rows\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "df.show()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 6, 131 | "metadata": { 132 | "scrolled": true 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "+---------------+-----+\n", 140 | "| label|count|\n", 141 | "+---------------+-----+\n", 142 | "|do_not_escalate| 65|\n", 143 | "| escalate| 35|\n", 144 | "+---------------+-----+\n", 145 | "\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "df.groupBy(\"label\") \\\n", 151 | " .count() \\\n", 152 | " .orderBy(\"count\", ascending = False) \\\n", 153 | " .show()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "import pyspark.sql.functions as F\n", 163 | "df = df.withColumn('word_count',F.size(F.split(F.col('chat'),' ')))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 8, 169 | "metadata": { 170 | "scrolled": true 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "+---+---------------+--------------------+----------+\n", 178 | "| id| label| chat|word_count|\n", 179 | "+---+---------------+--------------------+----------+\n", 180 | "| 1| escalate|I had a friend th...| 304|\n", 181 | "| 2| escalate|\"My friend dealt ...| 184|\n", 182 | "| 3| escalate|Friend who had bi...| 90|\n", 183 | "| 4|do_not_escalate|Over the internet...| 88|\n", 184 | "| 5| escalate|Having gone throu...| 71|\n", 185 | "| 6| escalate|My now girlfriend...| 73|\n", 186 | "| 7|do_not_escalate|\"Only really one ...| 74|\n", 187 | "| 8|do_not_escalate|Now that I've bee...| 62|\n", 188 | "| 9|do_not_escalate|I've always been ...| 60|\n", 189 | "| 10| escalate|I feel completely...| 56|\n", 190 | "| 11|do_not_escalate|Took a week off w...| 60|\n", 191 | "| 12| escalate|One of my best fr...| 59|\n", 192 | "| 13| escalate|I've had some fri...| 50|\n", 193 | "| 14|do_not_escalate|Haha. 
In eight gr...| 55|\n", 194 | "| 15|do_not_escalate|Some of my friend...| 49|\n", 195 | "| 16| escalate|I feel like depre...| 41|\n", 196 | "| 17| escalate|i've had a couple...| 38|\n", 197 | "| 18| escalate|I will always lis...| 41|\n", 198 | "| 19|do_not_escalate|A lot for my frie...| 44|\n", 199 | "| 20|do_not_escalate|When my friend ne...| 42|\n", 200 | "+---+---------------+--------------------+----------+\n", 201 | "only showing top 20 rows\n", 202 | "\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "df.show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 9, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+---------------+-----------------+\n", 220 | "| label| avg_word_count|\n", 221 | "+---------------+-----------------+\n", 222 | "| escalate| 44.0|\n", 223 | "|do_not_escalate|20.29230769230769|\n", 224 | "+---------------+-----------------+\n", 225 | "\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "df.groupBy('label')\\\n", 231 | " .agg(F.avg('word_count').alias('avg_word_count'))\\\n", 232 | " .orderBy('avg_word_count', ascending = False) \\\n", 233 | " .show()\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 10, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "df_plot = df.select('id', 'word_count').toPandas()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 11, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "image/png": "<base64 PNG data omitted: bar chart of word_count per chat id, y-axis 'Word Count', title 'Word Count distribution'>\n",
253 | "text/plain": [ 254 | "" 255 | ] 256 | }, 257 | "metadata": {}, 258 | "output_type": "display_data" 259 | } 260 | ], 261 | "source": [ 262 | "import matplotlib.pyplot as plt\n", 263 | "%matplotlib inline\n", 264 | "\n", 265 | "df_plot.set_index('id', inplace=True)\n", 266 | "df_plot.plot(kind='bar', figsize=(16, 6))\n", 267 | "plt.ylabel('Word Count')\n", 268 | "plt.title('Word Count distribution')\n", 269 | "plt.show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "from textblob import TextBlob\n", 281 | "def sentiment_score(chat):\n", 282 | " return TextBlob(chat).sentiment.polarity" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 13, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "from pyspark.sql.types import FloatType\n", 292 | "sentiment_score_udf = F.udf(lambda x: sentiment_score(x), FloatType())\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 14, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "+---+---------------+--------------------+----------+---------------+\n", 305 | "| id| label| chat|word_count|sentiment_score|\n", 306 | "+---+---------------+--------------------+----------+---------------+\n", 307 | "| 1| escalate|I had a friend th...| 304| 0.018961353|\n", 308 | "| 2| escalate|\"My friend dealt ...| 184| 0.20601852|\n", 309 | "| 3| escalate|Friend who had bi...| 90| 0.008333334|\n", 310 | "| 4|do_not_escalate|Over the internet...| 88| 0.045833334|\n", 311 | "| 5| escalate|Having gone throu...| 71| 0.0125|\n", 312 | "| 6| escalate|My now girlfriend...| 73| 0.06333333|\n", 313 | "| 7|do_not_escalate|\"Only really one ...| 74| 0.036363635|\n", 314 | "| 8|do_not_escalate|Now that I've bee...| 62| 0.125|\n", 315 | "| 9|do_not_escalate|I've always been ...| 60| 0.31|\n", 316 | "| 10| escalate|I feel completely...| 56| -0.078125|\n", 317 | "| 11|do_not_escalate|Took a week off w...| 60| 0.16666667|\n", 318 | "| 12| escalate|One of my best fr...| 59| 0.4|\n", 319 | "| 13| escalate|I've had some fri...| 50| 0.19|\n", 320 | "| 14|do_not_escalate|Haha. 
In eight gr...| 55| 0.29666665|\n", 321 | "| 15|do_not_escalate|Some of my friend...| 49| 0.4|\n", 322 | "| 16| escalate|I feel like depre...| 41| 0.05|\n", 323 | "| 17| escalate|i've had a couple...| 38| 0.16666667|\n", 324 | "| 18| escalate|I will always lis...| 41| -0.025|\n", 325 | "| 19|do_not_escalate|A lot for my frie...| 44| 0.035858586|\n", 326 | "| 20|do_not_escalate|When my friend ne...| 42| -0.094444446|\n", 327 | "+---+---------------+--------------------+----------+---------------+\n", 328 | "only showing top 20 rows\n", 329 | "\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "df = df.select('id', 'label', 'chat','word_count',\n", 335 | " sentiment_score_udf('chat').alias('sentiment_score'))\n", 336 | "df.show()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 15, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "+---------------+--------------------+\n", 349 | "| label| avg_sentiment_score|\n", 350 | "+---------------+--------------------+\n", 351 | "| escalate| 0.06338859780558519|\n", 352 | "|do_not_escalate|0.031975071089198955|\n", 353 | "+---------------+--------------------+\n", 354 | "\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "df.groupBy('label')\\\n", 360 | " .agg(F.avg('sentiment_score').alias('avg_sentiment_score'))\\\n", 361 | " .orderBy('avg_sentiment_score', ascending = False) \\\n", 362 | " .show()" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 16, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "+---+---------------+--------------------+----------+---------------+--------------------+\n", 375 | "| id| label| chat|word_count|sentiment_score| words|\n", 376 | "+---+---------------+--------------------+----------+---------------+--------------------+\n", 377 | "| 1| escalate|I had a friend th...| 304| 0.018961353|[I, had, a, frien...|\n", 378 | "| 2| escalate|\"My friend dealt ...| 184| 0.20601852|[\"My, friend, dea...|\n", 379 | "| 3| escalate|Friend who had bi...| 90| 0.008333334|[Friend, who, had...|\n", 380 | "| 4|do_not_escalate|Over the internet...| 88| 0.045833334|[Over, the, inter...|\n", 381 | "| 5| escalate|Having gone throu...| 71| 0.0125|[Having, gone, th...|\n", 382 | "| 6| escalate|My now girlfriend...| 73| 0.06333333|[My, now, girlfri...|\n", 383 | "| 7|do_not_escalate|\"Only really one ...| 74| 0.036363635|[\"Only, really, o...|\n", 384 | "| 8|do_not_escalate|Now that I've bee...| 62| 0.125|[Now, that, I've,...|\n", 385 | "| 9|do_not_escalate|I've always been ...| 60| 0.31|[I've, always, be...|\n", 386 | "| 10| escalate|I feel completely...| 56| -0.078125|[I, feel, complet...|\n", 387 | "| 11|do_not_escalate|Took a week off w...| 60| 0.16666667|[Took, a, week, o...|\n", 388 | "| 12| escalate|One of my best fr...| 59| 0.4|[One, of, my, bes...|\n", 389 | "| 13| escalate|I've had some fri...| 50| 0.19|[I've, had, some,...|\n", 390 | "| 14|do_not_escalate|Haha. 
In eight gr...| 55| 0.29666665|[Haha., In, eight...|\n", 391 | "| 15|do_not_escalate|Some of my friend...| 49| 0.4|[Some, of, my, fr...|\n", 392 | "| 16| escalate|I feel like depre...| 41| 0.05|[I, feel, like, d...|\n", 393 | "| 17| escalate|i've had a couple...| 38| 0.16666667|[i've, had, a, co...|\n", 394 | "| 18| escalate|I will always lis...| 41| -0.025|[I, will, always,...|\n", 395 | "| 19|do_not_escalate|A lot for my frie...| 44| 0.035858586|[A, lot, for, my,...|\n", 396 | "| 20|do_not_escalate|When my friend ne...| 42| -0.094444446|[When, my, friend...|\n", 397 | "+---+---------------+--------------------+----------+---------------+--------------------+\n", 398 | "only showing top 20 rows\n", 399 | "\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "df = df.withColumn('words',F.split(F.col('chat'),' '))\n", 405 | "df.show()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 17, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "stop_words = ['i','me','my','myself','we','our','ours','ourselves',\n", 417 | " 'you','your','yours','yourself','yourselves','he','him',\n", 418 | " 'his','himself','she','her','hers','herself','it','its',\n", 419 | " 'itself','they','them','their','theirs','themselves',\n", 420 | " 'what','which','who','whom','this','that','these','those',\n", 421 | " 'am','is','are','was','were','be','been','being','have',\n", 422 | " 'has','had','having','do','does','did','doing','a','an',\n", 423 | " 'the','and','but','if','or','because','as','until','while',\n", 424 | " 'of','at','by','for','with','about','against','between',\n", 425 | " 'into','through','during','before','after','above','below',\n", 426 | " 'to','from','up','down','in','out','on','off','over','under',\n", 427 | " 'again','further','then','once','here','there','when','where',\n", 428 | " 'why','how','all','any','both','each','few','more','most',\n", 429 | " 'other','some','such','no','nor','not','only','own','same',\n", 430 | " 'so','than','too','very','can','will','just','don','should','now']" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 18, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "from pyspark.ml.feature import StopWordsRemover " 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 19, 447 | "metadata": { 448 | "collapsed": true 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "stopwordsRemovalFeature = StopWordsRemover(inputCol=\"words\", \n", 453 | " outputCol=\"words without stop\").setStopWords(stop_words)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 20, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "from pyspark.ml import Pipeline\n", 465 | "stopWordRemovalPipeline = Pipeline(stages=[stopwordsRemovalFeature])\n", 466 | "pipelineFitRemoveStopWords = stopWordRemovalPipeline.fit(df)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 21, 472 | "metadata": { 473 | "scrolled": false 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "+--------------------+--------------------+\n", 481 | "| words| words without stop|\n", 482 | "+--------------------+--------------------+\n", 483 | "|[I, had, a, frien...|[friend, would, g...|\n", 484 | "|[\"My, friend, dea...|[\"My, friend, dea...|\n", 485 | "|[Friend, who, had...|[Friend, big, add...|\n", 
486 | "|[Over, the, inter...|[internet, LOT, p...|\n", 487 | "|[Having, gone, th...|[gone, depression...|\n", 488 | "+--------------------+--------------------+\n", 489 | "only showing top 5 rows\n", 490 | "\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "df = pipelineFitRemoveStopWords.transform(df)\n", 496 | "df.select('words', 'words without stop').show(5)" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 22, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "label = F.udf(lambda x: 1.0 if x == 'escalate' else 0.0, FloatType())\n", 506 | "df = df.withColumn('label', label('label'))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 23, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "+-----+\n", 519 | "|label|\n", 520 | "+-----+\n", 521 | "| 1.0|\n", 522 | "| 1.0|\n", 523 | "| 1.0|\n", 524 | "| 0.0|\n", 525 | "| 1.0|\n", 526 | "| 1.0|\n", 527 | "| 0.0|\n", 528 | "| 0.0|\n", 529 | "| 0.0|\n", 530 | "| 1.0|\n", 531 | "| 0.0|\n", 532 | "| 1.0|\n", 533 | "| 1.0|\n", 534 | "| 0.0|\n", 535 | "| 0.0|\n", 536 | "| 1.0|\n", 537 | "| 1.0|\n", 538 | "| 1.0|\n", 539 | "| 0.0|\n", 540 | "| 0.0|\n", 541 | "+-----+\n", 542 | "only showing top 20 rows\n", 543 | "\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "df.select('label').show()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 24, 554 | "metadata": { 555 | "collapsed": true 556 | }, 557 | "outputs": [], 558 | "source": [ 559 | "import pyspark.ml.feature as feat\n", 560 | "TF_ = feat.HashingTF(inputCol=\"words without stop\", \n", 561 | " outputCol=\"rawFeatures\", numFeatures=100000)\n", 562 | "IDF_ = feat.IDF(inputCol=\"rawFeatures\", outputCol=\"features\")" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 25, 568 | "metadata": { 569 | "collapsed": true 570 | }, 571 | "outputs": [], 572 | "source": [ 573 | "pipelineTFIDF = Pipeline(stages=[TF_, IDF_])" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 26, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "pipelineFit = pipelineTFIDF.fit(df)\n", 583 | "df = pipelineFit.transform(df)" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 27, 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "name": "stdout", 593 | "output_type": "stream", 594 | "text": [ 595 | "+-----+--------------------+--------------------+\n", 596 | "|label| rawFeatures| features|\n", 597 | "+-----+--------------------+--------------------+\n", 598 | "| 1.0|(100000,[76,1583,...|(100000,[76,1583,...|\n", 599 | "| 1.0|(100000,[5319,105...|(100000,[5319,105...|\n", 600 | "| 1.0|(100000,[618,7515...|(100000,[618,7515...|\n", 601 | "| 0.0|(100000,[3370,444...|(100000,[3370,444...|\n", 602 | "| 1.0|(100000,[4442,101...|(100000,[4442,101...|\n", 603 | "| 1.0|(100000,[7369,775...|(100000,[7369,775...|\n", 604 | "| 0.0|(100000,[232,6124...|(100000,[232,6124...|\n", 605 | "| 0.0|(100000,[2732,335...|(100000,[2732,335...|\n", 606 | "| 0.0|(100000,[4047,425...|(100000,[4047,425...|\n", 607 | "| 1.0|(100000,[6531,135...|(100000,[6531,135...|\n", 608 | "| 0.0|(100000,[5330,120...|(100000,[5330,120...|\n", 609 | "| 1.0|(100000,[1197,444...|(100000,[1197,444...|\n", 610 | "| 1.0|(100000,[4442,107...|(100000,[4442,107...|\n", 611 | "| 0.0|(100000,[232,4441...|(100000,[232,4441...|\n", 612 | "| 0.0|(100000,[781,3526...|(100000,[781,3526...|\n", 613 | "| 
1.0|(100000,[13806,14...|(100000,[13806,14...|\n", 614 | "| 1.0|(100000,[4442,108...|(100000,[4442,108...|\n", 615 | "| 1.0|(100000,[76,11034...|(100000,[76,11034...|\n", 616 | "| 0.0|(100000,[10001,27...|(100000,[10001,27...|\n", 617 | "| 0.0|(100000,[29385,39...|(100000,[29385,39...|\n", 618 | "+-----+--------------------+--------------------+\n", 619 | "only showing top 20 rows\n", 620 | "\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "df.select('label', 'rawFeatures','features').show()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 28, 631 | "metadata": { 632 | "collapsed": true 633 | }, 634 | "outputs": [], 635 | "source": [ 636 | "(trainingDF, testDF) = df.randomSplit([0.75, 0.25], seed = 1234)" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 29, 642 | "metadata": { 643 | "collapsed": true 644 | }, 645 | "outputs": [], 646 | "source": [ 647 | "from pyspark.ml.classification import LogisticRegression\n", 648 | "logreg = LogisticRegression(regParam=0.025)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 30, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "logregModel = logreg.fit(trainingDF)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 31, 663 | "metadata": { 664 | "collapsed": true 665 | }, 666 | "outputs": [], 667 | "source": [ 668 | "predictionDF = logregModel.transform(testDF)" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 32, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "name": "stdout", 678 | "output_type": "stream", 679 | "text": [ 680 | "+-----+--------------------+----------+\n", 681 | "|label| probability|prediction|\n", 682 | "+-----+--------------------+----------+\n", 683 | "| 1.0|[0.00339966489826...| 1.0|\n", 684 | "| 1.0|[0.55815635574642...| 0.0|\n", 685 | "| 1.0|[0.03557500295368...| 1.0|\n", 686 | "| 0.0|[0.52714451276392...| 0.0|\n", 687 | "| 0.0|[0.64630042307877...| 0.0|\n", 688 | "| 0.0|[0.69042286406135...| 0.0|\n", 689 | "| 1.0|[0.44672236248681...| 1.0|\n", 690 | "| 0.0|[0.67209249316671...| 0.0|\n", 691 | "| 0.0|[0.96010780703860...| 0.0|\n", 692 | "| 1.0|[0.75210799156076...| 0.0|\n", 693 | "| 0.0|[0.90904812079420...| 0.0|\n", 694 | "| 0.0|[0.97354469378068...| 0.0|\n", 695 | "| 0.0|[0.96576753489686...| 0.0|\n", 696 | "| 0.0|[0.89685928798301...| 0.0|\n", 697 | "| 0.0|[0.92552854921657...| 0.0|\n", 698 | "| 0.0|[0.94649994610325...| 0.0|\n", 699 | "| 0.0|[0.89486269398390...| 0.0|\n", 700 | "| 0.0|[0.65225541621797...| 0.0|\n", 701 | "| 0.0|[0.95636713428689...| 0.0|\n", 702 | "| 0.0|[0.95927102608436...| 0.0|\n", 703 | "+-----+--------------------+----------+\n", 704 | "only showing top 20 rows\n", 705 | "\n" 706 | ] 707 | } 708 | ], 709 | "source": [ 710 | "predictionDF.select('label', 'probability', 'prediction').show()" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 33, 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "+----------------+---+---+\n", 723 | "|label_prediction|0.0|1.0|\n", 724 | "+----------------+---+---+\n", 725 | "| 1.0| 2| 3|\n", 726 | "| 0.0| 19| 0|\n", 727 | "+----------------+---+---+\n", 728 | "\n" 729 | ] 730 | } 731 | ], 732 | "source": [ 733 | "predictionDF.crosstab('label', 'prediction').show()" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 34, 739 | "metadata": { 740 | "collapsed": true 741 | }, 742 | 
"outputs": [], 743 | "source": [ 744 | "from sklearn import metrics\n", 745 | "actual = predictionDF.select('label').toPandas()\n", 746 | "predicted = predictionDF.select('prediction').toPandas()" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 35, 752 | "metadata": {}, 753 | "outputs": [ 754 | { 755 | "name": "stdout", 756 | "output_type": "stream", 757 | "text": [ 758 | "accuracy score: 91.7%\n" 759 | ] 760 | } 761 | ], 762 | "source": [ 763 | "print('accuracy score: {}%'.format(round(metrics.accuracy_score(actual, predicted),3)*100))" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 36, 769 | "metadata": {}, 770 | "outputs": [ 771 | { 772 | "name": "stdout", 773 | "output_type": "stream", 774 | "text": [ 775 | "The ROC score is 93.7%\n" 776 | ] 777 | } 778 | ], 779 | "source": [ 780 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 781 | "\n", 782 | "scores = predictionDF.select('label', 'rawPrediction')\n", 783 | "evaluator = BinaryClassificationEvaluator()\n", 784 | "print('The ROC score is {}%'.format(round(evaluator.evaluate(scores),3)*100))" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 37, 790 | "metadata": { 791 | "scrolled": true 792 | }, 793 | "outputs": [ 794 | { 795 | "name": "stdout", 796 | "output_type": "stream", 797 | "text": [ 798 | "+-------+-------------------+\n", 799 | "|summary| label|\n", 800 | "+-------+-------------------+\n", 801 | "| count| 24|\n", 802 | "| mean|0.20833333333333334|\n", 803 | "| stddev|0.41485111699905336|\n", 804 | "| min| 0.0|\n", 805 | "| max| 1.0|\n", 806 | "+-------+-------------------+\n", 807 | "\n" 808 | ] 809 | } 810 | ], 811 | "source": [ 812 | "predictionDF.describe('label').show()" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [] 823 | } 824 | ], 825 | "metadata": { 826 | "kernelspec": { 827 | "display_name": "Python 3", 828 | "language": "python", 829 | "name": "python3" 830 | }, 831 | "language_info": { 832 | "codemirror_mode": { 833 | "name": "ipython", 834 | "version": 3 835 | }, 836 | "file_extension": ".py", 837 | "mimetype": "text/x-python", 838 | "name": "python", 839 | "nbconvert_exporter": "python", 840 | "pygments_lexer": "ipython3", 841 | "version": "3.6.1" 842 | } 843 | }, 844 | "nbformat": 4, 845 | "nbformat_minor": 2 846 | } 847 | --------------------------------------------------------------------------------