├── CH01 └── chapter1 placeholder.txt ├── CH04 └── code │ └── CH04.txt ├── CH11 ├── data │ ├── got1.txt │ ├── got2.txt │ ├── got3.txt │ ├── got4.txt │ └── got5.txt └── code │ └── CH11_Words+to+Vectors.py ├── CH07 ├── data │ └── TherapyBotSession.csv └── code │ ├── Natural+Language+Processing+-+ChatBot.py │ └── Natural+Language+Processing+-+ChatBot.ipynb ├── CH13 ├── football │ ├── messi │ │ ├── messi1.jpeg │ │ ├── messi10.jpeg │ │ ├── messi11.jpeg │ │ ├── messi12.jpeg │ │ ├── messi13.jpeg │ │ ├── messi14.jpeg │ │ ├── messi15.jpeg │ │ ├── messi16.jpeg │ │ ├── messi17.jpeg │ │ ├── messi18.jpeg │ │ ├── messi19.jpeg │ │ ├── messi2.jpeg │ │ ├── messi20.jpeg │ │ ├── messi21.jpeg │ │ ├── messi22.jpeg │ │ ├── messi23.jpeg │ │ ├── messi24.jpeg │ │ ├── messi25.jpeg │ │ ├── messi26.jpeg │ │ ├── messi27.jpeg │ │ ├── messi28.jpeg │ │ ├── messi29.jpeg │ │ ├── messi3.jpeg │ │ ├── messi30.jpeg │ │ ├── messi4.jpeg │ │ ├── messi5.jpeg │ │ ├── messi6.jpeg │ │ ├── messi7.jpeg │ │ ├── messi8.jpeg │ │ └── messi9.jpeg │ └── ronaldo │ │ ├── ronaldo1.jpeg │ │ ├── ronaldo2.jpeg │ │ ├── ronaldo22.jpg │ │ ├── ronaldo23.jpg │ │ ├── ronaldo24.jpg │ │ ├── ronaldo3.jpeg │ │ ├── ronaldo4.jpeg │ │ ├── ronaldo5.jpeg │ │ ├── ronaldo6.jpeg │ │ ├── ronaldo7.jpeg │ │ ├── ronaldo8.jpeg │ │ ├── ronaldo9.jpeg │ │ ├── ronaldo10.jpeg │ │ ├── ronaldo11.jpeg │ │ ├── ronaldo12.jpeg │ │ ├── ronaldo13.jpeg │ │ ├── ronaldo14.jpeg │ │ ├── ronaldo15.jpeg │ │ ├── ronaldo16.jpeg │ │ ├── ronaldo17.jpeg │ │ ├── ronaldo18.jpeg │ │ ├── ronaldo19.jpeg │ │ ├── ronaldo20.jpeg │ │ ├── ronaldo21.jpeg │ │ ├── ronaldo25.jpeg │ │ ├── ronaldo26.jpeg │ │ ├── ronaldo27.jpeg │ │ ├── ronaldo28.jpeg │ │ ├── ronaldo29.jpeg │ │ └── ronaldo30.jpeg └── code │ ├── Image+Classification+with+TensorFlow+on+Spark.py │ └── Image+Classification+with+TensorFlow+on+Spark.ipynb ├── CH02 ├── data │ └── HeightAndWeight.txt └── code │ └── NeuralNetworkfromScratch_with_python_and spark.py ├── README.md ├── CH03 └── code │ ├── MNIST+with+CNN.py │ └── MNIST+with+CNN.ipynb ├── CH10 └── code │ └── CH10_Face+recognition.py ├── CH08 └── code │ └── Real+Estate+Prediction.py ├── CH09 └── code │ └── Predicting+Apple+Stock+Market+Value.py ├── CH05 └── code │ └── Predicting+Fire+Dept+Calls+with+Spark+ML.py ├── CH06 └── code │ ├── CH06_LSTMs+word+level.py │ └── CH06_LSTMs+word+level.ipynb └── CH12 └── code └── Create+a+movie+recommendation+engine+with+Keras.py /CH01/chapter1 placeholder.txt: -------------------------------------------------------------------------------- 1 | chapter1 placeholder 2 | -------------------------------------------------------------------------------- /CH04/code/CH04.txt: -------------------------------------------------------------------------------- 1 | There is no code for this chapter 2 | -------------------------------------------------------------------------------- /CH11/data/got1.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got1.txt -------------------------------------------------------------------------------- /CH11/data/got2.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got2.txt -------------------------------------------------------------------------------- /CH11/data/got3.txt: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got3.txt -------------------------------------------------------------------------------- /CH11/data/got4.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got4.txt -------------------------------------------------------------------------------- /CH11/data/got5.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH11/data/got5.txt -------------------------------------------------------------------------------- /CH07/data/TherapyBotSession.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH07/data/TherapyBotSession.csv -------------------------------------------------------------------------------- /CH13/football/messi/messi1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi1.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi10.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi11.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi11.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi12.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi12.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi13.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi13.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi14.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi14.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi15.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi15.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi16.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi16.jpeg -------------------------------------------------------------------------------- 
/CH13/football/messi/messi17.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi17.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi18.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi18.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi19.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi19.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi2.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi20.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi20.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi21.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi21.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi22.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi22.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi23.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi23.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi24.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi24.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi25.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi25.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi26.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi26.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi27.jpeg: -------------------------------------------------------------------------------- 
https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi27.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi28.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi28.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi29.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi29.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi3.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi30.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi30.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi4.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi5.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi6.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi6.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi7.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi7.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi8.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi8.jpeg -------------------------------------------------------------------------------- /CH13/football/messi/messi9.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/messi/messi9.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo1.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo1.jpeg 
-------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo2.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo2.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo22.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo22.jpg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo23.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo23.jpg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo24.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo24.jpg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo3.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo3.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo4.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo4.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo5.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo5.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo6.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo6.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo7.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo7.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo8.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo8.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo9.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo9.jpeg 
-------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo10.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo10.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo11.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo11.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo12.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo12.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo13.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo13.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo14.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo14.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo15.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo15.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo16.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo16.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo17.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo17.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo18.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo18.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo19.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo19.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo20.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo20.jpeg 
-------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo21.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo21.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo25.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo25.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo26.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo26.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo27.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo27.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo28.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo28.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo29.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo29.jpeg -------------------------------------------------------------------------------- /CH13/football/ronaldo/ronaldo30.jpeg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/asherif844/ApacheSparkDeepLearningCookbook/HEAD/CH13/football/ronaldo/ronaldo30.jpeg -------------------------------------------------------------------------------- /CH02/data/HeightAndWeight.txt: -------------------------------------------------------------------------------- 1 | Gender Height (inches) Weight (lbs) 2 | Female 67 150 3 | Female 65 135 4 | Female 68 130 5 | Male 70 160 6 | Female 70 130 7 | Male 69 174 8 | Male 65 126 9 | Male 74 188 10 | Female 60 110 11 | Female 63 125 12 | Male 70 173 13 | Female 70 145 14 | Male 68 175 15 | Female 65 123 16 | Male 71 145 17 | Male 74 160 18 | Female 64 135 19 | Male 71 175 20 | Male 67 145 21 | Male 67 130 22 | Male 70 162 23 | Female 64 107 24 | Male 70 175 25 | Male 64 130 26 | Male 66 163 27 | Female 63 137 28 | Male 65 165 29 | Female 65 130 30 | Female 64 109 31 | 32 | 33 | 34 | 35 | 36 | 37 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # ApacheSparkDeepLearningCookbook 2 | Title: Apache Spark Deep Learning Cookbook 3 | 4 | Subtitle: Over 80 recipes that streamline deep learning in a distributed environment with Apache Spark 5 | Long Description: 6 | With deep learning gaining rapid mainstream adoption in modern-day industries, organizations are looking for ways to unite popular big 
data tools with highly efficient deep learning libraries. This combination helps deep learning models train with greater efficiency and speed. 7 | 8 | With the help of the Apache Spark Deep Learning Cookbook, you'll work through specific recipes to generate outcomes for deep learning algorithms, without getting bogged down in theory. From setting up Apache Spark for deep learning to implementing different types of neural networks, this book tackles both common and not-so-common problems of performing deep learning in a distributed environment. In addition, you'll get access to deep learning code within Spark that can be reused to answer similar problems or tweaked to answer slightly different ones. You will also learn how to stream and cluster your data with Spark. Once you have got to grips with the basics, you'll explore how to implement and deploy deep learning models such as Convolutional Neural Networks (CNNs) and Recurrent Neural Networks (RNNs) in Spark, using popular libraries such as TensorFlow and Keras. 9 | 10 | By the end of the book, you'll have the expertise to train and deploy efficient deep learning models on Apache Spark. 11 | Short description: 12 | This book shows you how to train and deploy deep learning models on Apache Spark. You will leverage powerful deep learning libraries such as TensorFlow to develop your models and ensure their optimum performance. By the end of this book, you will be able to build efficient distributed applications using Spark, powered by deep learning. 13 | What you will learn: 14 | • Set up a fully functional Spark environment 15 | • Understand practical machine learning and deep learning concepts 16 | • Apply built-in machine learning libraries within Spark 17 | • Explore libraries that are compatible with TensorFlow and Keras 18 | • Explore NLP models such as word2vec and TF-IDF on Spark 19 | • Organize dataframes for deep learning evaluation 20 | • Apply training and testing workflows to ensure model accuracy 21 | • Access readily available, reusable code 22 | Metadescription: 23 | A solution-based guide to putting your deep learning models into production with the power of Apache Spark 24 | Key features: 25 | • Discover practical recipes for distributed deep learning with Apache Spark 26 | • Learn to use libraries such as Keras and TensorFlow 27 | • Explore NLP models such as word2vec and TF-IDF on Spark 28 | • Solve problems to train your deep learning models on Apache Spark 29 | Audience: 30 | If you're looking for a practical and highly useful resource for efficiently implementing distributed deep learning models with Apache Spark, then the Apache Spark Deep Learning Cookbook is for you. Knowledge of core machine learning concepts and a basic understanding of the Apache Spark framework are required to get the best out of this book. Additionally, some programming knowledge in Python is a plus. 31 | Approach: 32 | This book includes practical, easy-to-understand recipes on how to implement popular deep learning libraries such as TensorFlow and Keras to train your deep learning models on Apache Spark, without getting bogged down in theory.
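Most of the chapter scripts in this repository begin by creating a local SparkSession with a fixed executor memory setting. A minimal sketch of that setup is shown below; the application name is illustrative, and the 6 GB executor memory mirrors the value used in the chapter code and should be adjusted to your machine:

from pyspark.sql import SparkSession

# Local SparkSession used throughout the recipes; tune executor memory as needed.
spark = SparkSession.builder \
    .master("local") \
    .appName("ApacheSparkDeepLearningCookbook") \
    .config("spark.executor.memory", "6gb") \
    .getOrCreate()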
33 | 34 | 35 | -------------------------------------------------------------------------------- /CH13/code/Image+Classification+with+TensorFlow+on+Spark.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("ImageClassification") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | import pyspark.sql.functions as f 12 | import sparkdl as dl 13 | 14 | 15 | # In[3]: 16 | 17 | dfMessi = dl.readImages('football/messi/').withColumn('label', f.lit(0)) 18 | dfRonaldo = dl.readImages('football/ronaldo/').withColumn('label', f.lit(1)) 19 | 20 | 21 | # In[4]: 22 | 23 | dfMessi.show(n=10,truncate=False) 24 | 25 | 26 | # In[5]: 27 | 28 | dfRonaldo.show(n=10,truncate=False) 29 | 30 | 31 | # In[6]: 32 | 33 | trainDFmessi, testDFmessi = dfMessi.randomSplit([66.7, 33.3], seed =12) 34 | trainDFronaldo, testDFronaldo = dfRonaldo.randomSplit([66.7, 33.3], seed=12) 35 | 36 | 37 | # In[7]: 38 | 39 | print('The number of images in trainDFmessi is {}'.format(trainDFmessi.toPandas().shape[0])) 40 | print('The number of images in testDFmessi is {}'.format(testDFmessi.toPandas().shape[0])) 41 | print('The number of images in trainDFronaldo is {}'.format(trainDFronaldo.toPandas().shape[0])) 42 | print('The number of images in testDFronaldo is {}'.format(testDFronaldo.toPandas().shape[0])) 43 | 44 | 45 | # In[8]: 46 | 47 | trainDF = trainDFmessi.unionAll(trainDFronaldo) 48 | testDF = testDFmessi.unionAll(testDFronaldo) 49 | 50 | 51 | # In[9]: 52 | 53 | print('The number of images in the training data is {}' .format(trainDF.toPandas().shape[0])) 54 | print('The number of images in the testing data is {}' .format(testDF.toPandas().shape[0])) 55 | 56 | 57 | # In[10]: 58 | 59 | from pyspark.ml.classification import LogisticRegression 60 | from pyspark.ml import Pipeline 61 | 62 | vectorizer = dl.DeepImageFeaturizer(inputCol="image", outputCol="features", modelName='InceptionV3') 63 | logreg = LogisticRegression(maxIter=30,labelCol = "label", featuresCol="features") 64 | pipeline = Pipeline(stages=[vectorizer, logreg]) 65 | 66 | pipeline_model = pipeline.fit(trainDF) 67 | 68 | 69 | # In[11]: 70 | 71 | predictDF = pipeline_model.transform(testDF) 72 | predictDF.select('label', 'prediction').show(n = testDF.toPandas().shape[0], truncate=False) 73 | 74 | 75 | # In[12]: 76 | 77 | predictDF.crosstab('prediction', 'label').show() 78 | 79 | 80 | # In[13]: 81 | 82 | from pyspark.ml.evaluation import MulticlassClassificationEvaluator 83 | scoring = predictDF.select("prediction", "label") 84 | accuracy_score = MulticlassClassificationEvaluator(metricName="accuracy") 85 | rate = accuracy_score.evaluate(scoring)*100 86 | print("accuracy: {}%" .format(round(rate,2))) 87 | 88 | 89 | # In[14]: 90 | 91 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 92 | 93 | binaryevaluator = BinaryClassificationEvaluator(rawPredictionCol="prediction") 94 | binary_rate = binaryevaluator.evaluate(predictDF)*100 95 | print("accuracy: {}%" .format(round(binary_rate,2))) 96 | 97 | 98 | # In[15]: 99 | 100 | logregFT = LogisticRegression( 101 | regParam=0.05, 102 | elasticNetParam=0.3, 103 | maxIter=15,labelCol = "label", featuresCol="features") 104 | pipelineFT = Pipeline(stages=[vectorizer, logregFT]) 105 | 106 | pipeline_model_FT = pipelineFT.fit(trainDF) 107 | 108 | 109 | # In[16]: 110 | 111 | predictDF_FT = pipeline_model_FT.transform(testDF) 112 | 
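# The crosstab below prints the confusion matrix for the regularized (elastic-net) pipeline,
# so it can be compared against the earlier, unregularized logistic regression run.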
predictDF_FT.crosstab('prediction', 'label').show() 113 | 114 | 115 | # In[17]: 116 | 117 | binary_rate_FT = binaryevaluator.evaluate(predictDF_FT)*100 118 | print("accuracy: {}%" .format(round(binary_rate_FT,2))) 119 | 120 | -------------------------------------------------------------------------------- /CH03/code/MNIST+with+CNN.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | import tensorflow as tf 7 | 8 | 9 | # In[2]: 10 | 11 | print(tf.__version__) 12 | 13 | 14 | # In[3]: 15 | 16 | from tensorflow.examples.tutorials.mnist import input_data 17 | data = input_data.read_data_sets('MNIST/', one_hot=True) 18 | 19 | 20 | # In[4]: 21 | 22 | import os 23 | os.listdir('MNIST/') 24 | 25 | 26 | # In[5]: 27 | 28 | print('Image Inventory') 29 | print('----------') 30 | print('Training: {}'.format(len(data.train.labels))) 31 | print('Testing: {}'.format(len(data.test.labels))) 32 | print('----------') 33 | 34 | 35 | # In[6]: 36 | 37 | import numpy as np 38 | import matplotlib.pyplot as plt 39 | get_ipython().magic('matplotlib inline') 40 | 41 | 42 | # In[7]: 43 | 44 | for i in range(2): 45 | image = data.train.images[i] 46 | image = np.array(image, dtype='float') 47 | label = data.train.labels[i] 48 | pixels = image.reshape((28, 28)) 49 | plt.imshow(pixels, cmap='gray') 50 | print('-----------------') 51 | print(label) 52 | plt.show() 53 | 54 | 55 | 56 | # In[8]: 57 | 58 | if not os.path.exists('MNIST/images'): 59 | os.makedirs('MNIST/images/') 60 | os.chdir('MNIST/images/') 61 | 62 | 63 | # In[9]: 64 | 65 | from matplotlib import image 66 | for i in range(1,10): 67 | png = data.train.images[i] 68 | png = np.array(png, dtype='float') 69 | pixels = png.reshape((28, 28)) 70 | image.imsave('image_no_{}.png'.format(i), pixels, cmap = 'gray') 71 | 72 | 73 | # In[10]: 74 | 75 | print(os.listdir()) 76 | 77 | 78 | # In[11]: 79 | 80 | from Augmentor import Pipeline 81 | 82 | 83 | # In[12]: 84 | 85 | augmentor = Pipeline('/home/asherif844/sparkNotebooks/Ch03/MNIST/images') 86 | 87 | 88 | # In[13]: 89 | 90 | augmentor.rotate(probability=0.9, max_left_rotation=25, max_right_rotation=25) 91 | 92 | 93 | # In[14]: 94 | 95 | for i in range(1,3): 96 | augmentor.sample(10) 97 | 98 | 99 | # In[15]: 100 | 101 | xtrain = data.train.images 102 | ytrain = np.asarray(data.train.labels) 103 | xtest = data.test.images 104 | ytest = np.asarray(data.test.labels) 105 | 106 | 107 | # In[16]: 108 | 109 | xtrain = xtrain.reshape( xtrain.shape[0],28,28,1) 110 | xtest = xtest.reshape(xtest.shape[0],28,28,1) 111 | ytest= ytest.reshape(ytest.shape[0],10) 112 | ytrain = ytrain.reshape(ytrain.shape[0],10) 113 | 114 | 115 | # In[17]: 116 | 117 | print(xtrain.shape) 118 | print(ytrain.shape) 119 | print(xtest.shape) 120 | print(ytest.shape) 121 | 122 | 123 | # In[18]: 124 | 125 | import keras 126 | import keras.backend as K 127 | from keras.models import Sequential 128 | from keras.layers import Dense, Flatten, Conv2D 129 | 130 | K.set_image_dim_ordering('tf') 131 | 132 | model = Sequential() 133 | 134 | model.add(Conv2D(32, kernel_size=(5, 5),activation='relu', input_shape=(28,28,1))) 135 | model.add(Flatten()) 136 | model.add(Dense(128, activation='relu')) 137 | model.add(Dense(10, activation='sigmoid')) 138 | 139 | 140 | # In[19]: 141 | 142 | model.compile(optimizer='adam',loss='categorical_crossentropy', 143 | metrics=['accuracy']) 144 | 145 | 146 | # In[20]: 147 | 148 | model.fit(xtrain,ytrain,batch_size=512, 149 | epochs=5, 150 | 
validation_data=(xtest, ytest)) 151 | 152 | 153 | # In[21]: 154 | 155 | stats = model.evaluate(xtest, ytest) 156 | print('The accuracy rate is {}%'.format(round(stats[1],3)*100)) 157 | print('The loss rate is {}%'.format(round(stats[0],2)*100)) 158 | 159 | 160 | # In[22]: 161 | 162 | model.summary() 163 | 164 | 165 | # In[ ]: 166 | 167 | 168 | 169 | -------------------------------------------------------------------------------- /CH10/code/CH10_Face+recognition.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | get_ipython().magic('matplotlib inline') 8 | from os import listdir 9 | from os.path import isfile, join 10 | import matplotlib.pyplot as plt 11 | import matplotlib.image as mpimg 12 | import numpy as np 13 | from keras.models import Sequential 14 | from keras.layers import Dense, Dropout, Activation, Flatten 15 | from keras.optimizers import Adam 16 | from keras.layers.normalization import BatchNormalization 17 | from keras.utils import np_utils 18 | from keras.layers import Conv2D, MaxPooling2D 19 | from keras.preprocessing.image import ImageDataGenerator 20 | 21 | 22 | # In[2]: 23 | 24 | 25 | pwd 26 | 27 | 28 | # In[3]: 29 | 30 | 31 | cd desktop 32 | 33 | 34 | # In[4]: 35 | 36 | 37 | #reading images from the local drive 38 | mypath='MIT-CBCL-facerec-database//training-synthetic' 39 | onlyfiles= [ f for f in listdir(mypath) if isfile(join(mypath,f)) ] 40 | images =np.empty([3240,200,200],dtype=int) 41 | for n in range(0, len(onlyfiles)): 42 | images[n] = mpimg.imread( join(mypath,onlyfiles[n]) ).astype(np.float32) 43 | 44 | 45 | # In[5]: 46 | 47 | 48 | plt.imshow (images[0]) 49 | 50 | 51 | # In[6]: 52 | 53 | 54 | plt.imshow (images[1]) 55 | 56 | 57 | # In[7]: 58 | 59 | 60 | plt. 
imshow (images[2]) 61 | 62 | 63 | # In[8]: 64 | 65 | 66 | plt.imshow(images[3119]) 67 | 68 | 69 | # In[9]: 70 | 71 | 72 | y =np.empty([3240,1],dtype=int) 73 | for x in range(0, len(onlyfiles)): 74 | if onlyfiles[x][3]=='0': y[x]=0 75 | elif onlyfiles[x][3]=='1': y[x]=1 76 | elif onlyfiles[x][3]=='2': y[x]=2 77 | elif onlyfiles[x][3]=='3': y[x]=3 78 | elif onlyfiles[x][3]=='4': y[x]=4 79 | elif onlyfiles[x][3]=='5': y[x]=5 80 | elif onlyfiles[x][3]=='6': y[x]=6 81 | elif onlyfiles[x][3]=='7': y[x]=7 82 | elif onlyfiles[x][3]=='8': y[x]=8 83 | elif onlyfiles[x][3]=='9': y[x]=9 84 | 85 | 86 | # In[10]: 87 | 88 | 89 | #funtion for cropping images to obtain only the significant part 90 | def crop(img): 91 | a=28*np.ones(len(img)) #background has pixel intensity of 28 92 | b=np.where((img== a).all(axis=1)) #check image background 93 | img=np.delete(img,(b),0) #deleting the unwanted part from the Y axis 94 | plt.imshow(img) 95 | img=img.transpose() 96 | d=28*np.ones(len(img[0])) 97 | e=np.where((img== d).all(axis=1)) 98 | img=np.delete(img,e,0) #deleting the unwanted part from the X axis 99 | img=img.transpose() 100 | print (img.shape) #printing image shape to ensure it is actually being cropped 101 | super_threshold_indices = img < 29 #padding zeros instead of background data 102 | img[super_threshold_indices] = 0 103 | plt.imshow (img) 104 | return img[0:150, 0:128] 105 | 106 | 107 | # In[11]: 108 | 109 | 110 | #cropping all the images 111 | image = np.empty([3240,150,128],dtype=int) 112 | for n in range(0, len(images)): 113 | image[n]=crop(images[n]) 114 | 115 | 116 | # In[12]: 117 | 118 | 119 | print (image[22]) 120 | 121 | 122 | # In[13]: 123 | 124 | 125 | print (image[22].shape) 126 | 127 | 128 | # In[14]: 129 | 130 | 131 | # randomly splitting data into training(80%) and test(20%) sets 132 | test_ind=np.random.choice(range(3240), 648, replace=False) 133 | train_ind=np.delete(range(0,len(onlyfiles)),test_ind) 134 | 135 | 136 | # In[15]: 137 | 138 | 139 | # segregating the training and test images 140 | x_train=image[train_ind] 141 | y1_train=y[train_ind] 142 | x_test=image[test_ind] 143 | y1_test=y[test_ind] 144 | 145 | 146 | # In[16]: 147 | 148 | 149 | #reshaping the input images 150 | x_train = x_train.reshape(x_train.shape[0], 128, 150, 1) 151 | x_test = x_test.reshape(x_test.shape[0], 128, 150, 1) 152 | 153 | 154 | # In[17]: 155 | 156 | 157 | #converting data to float32 158 | x_train = x_train.astype('float32') 159 | x_test = x_test.astype('float32') 160 | 161 | 162 | # In[18]: 163 | 164 | 165 | #normalizing data 166 | x_train/=255 167 | x_test/=255 168 | #10 digits represent the 10 classes 169 | number_of_persons = 10 170 | 171 | 172 | # In[19]: 173 | 174 | 175 | #convert data to vectors 176 | y_train = np_utils.to_categorical(y1_train, number_of_persons) 177 | y_test = np_utils.to_categorical(y1_test, number_of_persons) 178 | 179 | 180 | # In[25]: 181 | 182 | 183 | # model building 184 | model = Sequential() 185 | model.add(Conv2D(16, (3, 3), input_shape=(128,150,1))) #Input layer 186 | model.add(Activation('relu')) # 'relu' as activation function 187 | model.add(Conv2D(16, (3, 3))) #first hidden layer 188 | model.add(Activation('relu')) 189 | model.add(MaxPooling2D(pool_size=(2,2))) # Maxpooling from (2,2) 190 | model.add(Conv2D(16,(3, 3))) # second hidden layer 191 | model.add(Activation('relu')) 192 | model.add(MaxPooling2D(pool_size=(2,2))) # Maxpooling from (2,2) 193 | model.add(Flatten()) #flatten the maxpooled data 194 | # Fully connected layer 195 | model.add(Dense(512)) 196 | 
model.add(Activation('relu')) 197 | model.add(Dropout(0.25)) #Dropout is applied to overcome overfitting 198 | model.add(Dense(10)) 199 | #output layer 200 | model.add(Activation('softmax')) # 'softmax' is used for SGD 201 | 202 | 203 | # In[26]: 204 | 205 | 206 | model.summary() 207 | 208 | 209 | # In[27]: 210 | 211 | 212 | #model compliation 213 | model.compile(loss='categorical_crossentropy', optimizer=Adam(), metrics=['accuracy']) 214 | 215 | 216 | # In[28]: 217 | 218 | 219 | # data augmentation to reduce overfitting problem 220 | gen = ImageDataGenerator(rotation_range=8, width_shift_range=0.08, shear_range=0.3, 221 | height_shift_range=0.08,zoom_range=0.08) 222 | test_gen = ImageDataGenerator() 223 | train_generator = gen.flow(x_train, y_train, batch_size=16) 224 | test_generator = test_gen.flow(x_test, y_test, batch_size=16) 225 | 226 | 227 | # In[29]: 228 | 229 | 230 | #model fitting 231 | model.fit_generator(train_generator, epochs=5, validation_data=test_generator) 232 | # Final evaluation of the model 233 | scores = model.evaluate(x_test, y_test, verbose=0) 234 | print("Recognition Error: %.2f%%" % (100-scores[1]*100)) 235 | 236 | -------------------------------------------------------------------------------- /CH08/code/Real+Estate+Prediction.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | import numpy as np 8 | import pandas as pd 9 | import matplotlib.pyplot as plt 10 | import seaborn as sns 11 | import mpl_toolkits 12 | from sklearn import preprocessing 13 | from sklearn.preprocessing import LabelEncoder, OneHotEncoder 14 | from sklearn.feature_selection import RFE 15 | from sklearn import linear_model 16 | from sklearn.cross_validation import train_test_split 17 | get_ipython().magic('matplotlib inline') 18 | 19 | 20 | # In[2]: 21 | 22 | 23 | pwd 24 | 25 | 26 | # In[3]: 27 | 28 | 29 | cd '/Users/Chanti/Desktop/Cookbook/Chapter 10' 30 | 31 | 32 | # In[4]: 33 | 34 | 35 | pwd 36 | 37 | 38 | # In[5]: 39 | 40 | 41 | dataframe = pd.read_csv("kc_house_data.csv", header='infer') 42 | 43 | 44 | # In[6]: 45 | 46 | 47 | list(dataframe) 48 | 49 | 50 | # In[7]: 51 | 52 | 53 | dataframe.head() 54 | 55 | 56 | # In[8]: 57 | 58 | 59 | dataframe.tail() 60 | 61 | 62 | # In[9]: 63 | 64 | 65 | dataframe.describe() 66 | 67 | 68 | # In[10]: 69 | 70 | 71 | dataframe['bedrooms'].value_counts().plot(kind='bar') 72 | plt.title('No. of bedrooms') 73 | plt.xlabel('Bedrooms') 74 | plt.ylabel('Count') 75 | sns.despine 76 | 77 | 78 | # In[11]: 79 | 80 | 81 | dataframe['bedrooms'].value_counts().plot(kind='pie') 82 | plt.title('No. of bedrooms') 83 | 84 | 85 | # In[12]: 86 | 87 | 88 | dataframe['floors'].value_counts().plot(kind='bar') 89 | plt.title('Number of floors') 90 | plt.xlabel('No. 
of floors') 91 | plt.ylabel('Count') 92 | sns.despine 93 | 94 | 95 | # In[13]: 96 | 97 | 98 | plt.figure(figsize=(20,20)) 99 | sns.jointplot(x=dataframe.lat.values, y=dataframe.long.values, size=9) 100 | plt.xlabel('Longitude', fontsize=10) 101 | plt.ylabel('Latitude', fontsize=10) 102 | plt.show() 103 | sns.despine() 104 | 105 | 106 | # In[14]: 107 | 108 | 109 | plt.figure(figsize=(20,20)) 110 | sns.jointplot(x=dataframe.lat.values, y=dataframe.long.values, size=9) 111 | plt.xlabel('Longitude', fontsize=10) 112 | plt.ylabel('Latitude', fontsize=10) 113 | plt.show() 114 | sns.despine() 115 | 116 | 117 | # In[15]: 118 | 119 | 120 | plt.figure(figsize=(8,8)) 121 | plt.scatter(dataframe.price, dataframe.sqft_living) 122 | plt.xlabel('Price') 123 | plt.ylabel('Square feet') 124 | plt.show() 125 | 126 | 127 | # In[16]: 128 | 129 | 130 | plt.figure(figsize=(5,5)) 131 | plt.bar(dataframe.condition, dataframe.price) 132 | plt.xlabel('Condition') 133 | plt.ylabel('Price') 134 | plt.show() 135 | 136 | 137 | # In[17]: 138 | 139 | 140 | plt.figure(figsize=(8,8)) 141 | plt.scatter(dataframe.zipcode, dataframe.price) 142 | plt.xlabel('Zipcode') 143 | plt.ylabel('Price') 144 | plt.show() 145 | 146 | 147 | # In[18]: 148 | 149 | 150 | plt.figure(figsize=(10,10)) 151 | plt.scatter(dataframe.grade, dataframe.price) 152 | plt.xlabel('Grade') 153 | plt.ylabel('Price') 154 | plt.show() 155 | 156 | 157 | # In[19]: 158 | 159 | 160 | x_df = dataframe.drop(['id','date',], axis = 1) 161 | x_df 162 | 163 | 164 | # In[20]: 165 | 166 | 167 | y = dataframe[['price']].copy() 168 | y_df = pd.DataFrame(y) 169 | y_df 170 | 171 | 172 | # In[21]: 173 | 174 | 175 | print('Price Vs Bedrooms: %s' % x_df['price'].corr(x_df['bedrooms'])) 176 | print('Price Vs Bathrooms: %s' % x_df['price'].corr(x_df['bathrooms'])) 177 | print('Price Vs Living Area: %s' % x_df['price'].corr(x_df['sqft_living'])) 178 | print('Price Vs Plot Area: %s' % x_df['price'].corr(x_df['sqft_lot'])) 179 | print('Price Vs No. 
of floors: %s' % x_df['price'].corr(x_df['floors'])) 180 | print('Price Vs Waterfront property: %s' % x_df['price'].corr(x_df['waterfront'])) 181 | print('Price Vs View: %s' % x_df['price'].corr(x_df['view'])) 182 | print('Price Vs Grade: %s' % x_df['price'].corr(x_df['grade'])) 183 | print('Price Vs Condition: %s' % x_df['price'].corr(x_df['condition'])) 184 | print('Price Vs Sqft Above: %s' % x_df['price'].corr(x_df['sqft_above'])) 185 | print('Price Vs Basement Area: %s' % x_df['price'].corr(x_df['sqft_basement'])) 186 | print('Price Vs Year Built: %s' % x_df['price'].corr(x_df['yr_built'])) 187 | print('Price Vs Year Renovated: %s' % x_df['price'].corr(x_df['yr_renovated'])) 188 | print('Price Vs Zipcode: %s' % x_df['price'].corr(x_df['zipcode'])) 189 | print('Price Vs Latitude: %s' % x_df['price'].corr(x_df['lat'])) 190 | print('Price Vs Longitude: %s' % x_df['price'].corr(x_df['long'])) 191 | 192 | 193 | # In[22]: 194 | 195 | 196 | x_df.corr().iloc[:,-19] 197 | 198 | 199 | # In[23]: 200 | 201 | 202 | sns.pairplot(data=x_df, 203 | x_vars=['price'], 204 | y_vars=['bedrooms', 'bathrooms', 'sqft_living', 205 | 'sqft_lot', 'floors', 'waterfront','view', 206 | 'grade','condition','sqft_above','sqft_basement', 207 | 'yr_built','yr_renovated','zipcode','lat','long'], 208 | size = 5) 209 | 210 | 211 | # In[24]: 212 | 213 | 214 | x_df2 = x_df.drop(['price'], axis = 1) 215 | 216 | 217 | # In[25]: 218 | 219 | 220 | reg=linear_model.LinearRegression() 221 | 222 | 223 | # In[26]: 224 | 225 | 226 | x_train,x_test,y_train,y_test = train_test_split(x_df2,y_df,test_size=0.4,random_state=4) 227 | 228 | 229 | # In[27]: 230 | 231 | 232 | reg.fit(x_train,y_train) 233 | 234 | 235 | # In[28]: 236 | 237 | 238 | reg.coef_ 239 | 240 | 241 | # In[29]: 242 | 243 | 244 | predictions=reg.predict(x_test) 245 | predictions 246 | 247 | 248 | # In[30]: 249 | 250 | 251 | reg.score(x_test,y_test) 252 | 253 | 254 | # In[31]: 255 | 256 | 257 | import xgboost 258 | 259 | 260 | # In[91]: 261 | 262 | 263 | new_model = xgboost.XGBRegressor(n_estimators=750, learning_rate=0.01, gamma=0, subsample=0.55, colsample_bytree=1, max_depth=10) 264 | 265 | 266 | # In[92]: 267 | 268 | 269 | from sklearn.model_selection import train_test_split 270 | 271 | 272 | # In[93]: 273 | 274 | 275 | traindf, testdf = train_test_split(x_train, test_size = 0.2) 276 | new_model.fit(x_train,y_train) 277 | 278 | 279 | # In[94]: 280 | 281 | 282 | from sklearn.metrics import explained_variance_score 283 | predictions = new_model.predict(x_test) 284 | print(explained_variance_score(predictions,y_test)) 285 | 286 | -------------------------------------------------------------------------------- /CH07/code/Natural+Language+Processing+-+ChatBot.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("Natural Language Processing") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | df = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('TherapyBotSession.csv') 12 | 13 | 14 | # In[3]: 15 | 16 | df.show() 17 | 18 | 19 | # In[4]: 20 | 21 | df = df.select('id', 'label', 'chat') 22 | 23 | 24 | # In[5]: 25 | 26 | df.show() 27 | 28 | 29 | # In[6]: 30 | 31 | df.groupBy("label") .count() .orderBy("count", ascending = False) .show() 32 | 33 | 34 | # In[7]: 35 | 36 | import pyspark.sql.functions as F 37 | df = 
df.withColumn('word_count',F.size(F.split(F.col('chat'),' '))) 38 | 39 | 40 | # In[8]: 41 | 42 | df.show() 43 | 44 | 45 | # In[9]: 46 | 47 | df.groupBy('label') .agg(F.avg('word_count').alias('avg_word_count')) .orderBy('avg_word_count', ascending = False) .show() 48 | 49 | 50 | # In[10]: 51 | 52 | df_plot = df.select('id', 'word_count').toPandas() 53 | 54 | 55 | # In[11]: 56 | 57 | import matplotlib.pyplot as plt 58 | get_ipython().magic('matplotlib inline') 59 | 60 | df_plot.set_index('id', inplace=True) 61 | df_plot.plot(kind='bar', figsize=(16, 6)) 62 | plt.ylabel('Word Count') 63 | plt.title('Word Count distribution') 64 | plt.show() 65 | 66 | 67 | # In[12]: 68 | 69 | from textblob import TextBlob 70 | def sentiment_score(chat): 71 | return TextBlob(chat).sentiment.polarity 72 | 73 | 74 | # In[13]: 75 | 76 | from pyspark.sql.types import FloatType 77 | sentiment_score_udf = F.udf(lambda x: sentiment_score(x), FloatType()) 78 | 79 | 80 | # In[14]: 81 | 82 | df = df.select('id', 'label', 'chat','word_count', 83 | sentiment_score_udf('chat').alias('sentiment_score')) 84 | df.show() 85 | 86 | 87 | # In[15]: 88 | 89 | df.groupBy('label') .agg(F.avg('sentiment_score').alias('avg_sentiment_score')) .orderBy('avg_sentiment_score', ascending = False) .show() 90 | 91 | 92 | # In[16]: 93 | 94 | df = df.withColumn('words',F.split(F.col('chat'),' ')) 95 | df.show() 96 | 97 | 98 | # In[17]: 99 | 100 | stop_words = ['i','me','my','myself','we','our','ours','ourselves', 101 | 'you','your','yours','yourself','yourselves','he','him', 102 | 'his','himself','she','her','hers','herself','it','its', 103 | 'itself','they','them','their','theirs','themselves', 104 | 'what','which','who','whom','this','that','these','those', 105 | 'am','is','are','was','were','be','been','being','have', 106 | 'has','had','having','do','does','did','doing','a','an', 107 | 'the','and','but','if','or','because','as','until','while', 108 | 'of','at','by','for','with','about','against','between', 109 | 'into','through','during','before','after','above','below', 110 | 'to','from','up','down','in','out','on','off','over','under', 111 | 'again','further','then','once','here','there','when','where', 112 | 'why','how','all','any','both','each','few','more','most', 113 | 'other','some','such','no','nor','not','only','own','same', 114 | 'so','than','too','very','can','will','just','don','should','now'] 115 | 116 | 117 | # In[18]: 118 | 119 | from pyspark.ml.feature import StopWordsRemover 120 | 121 | 122 | # In[19]: 123 | 124 | stopwordsRemovalFeature = StopWordsRemover(inputCol="words", 125 | outputCol="words without stop").setStopWords(stop_words) 126 | 127 | 128 | # In[20]: 129 | 130 | from pyspark.ml import Pipeline 131 | stopWordRemovalPipeline = Pipeline(stages=[stopwordsRemovalFeature]) 132 | pipelineFitRemoveStopWords = stopWordRemovalPipeline.fit(df) 133 | 134 | 135 | # In[21]: 136 | 137 | df = pipelineFitRemoveStopWords.transform(df) 138 | df.select('words', 'words without stop').show(5) 139 | 140 | 141 | # In[22]: 142 | 143 | label = F.udf(lambda x: 1.0 if x == 'escalate' else 0.0, FloatType()) 144 | df = df.withColumn('label', label('label')) 145 | 146 | 147 | # In[23]: 148 | 149 | df.select('label').show() 150 | 151 | 152 | # In[24]: 153 | 154 | import pyspark.ml.feature as feat 155 | TF_ = feat.HashingTF(inputCol="words without stop", 156 | outputCol="rawFeatures", numFeatures=100000) 157 | IDF_ = feat.IDF(inputCol="rawFeatures", outputCol="features") 158 | 159 | 160 | # In[25]: 161 | 162 | pipelineTFIDF = 
Pipeline(stages=[TF_, IDF_]) 163 | 164 | 165 | # In[26]: 166 | 167 | pipelineFit = pipelineTFIDF.fit(df) 168 | df = pipelineFit.transform(df) 169 | 170 | 171 | # In[27]: 172 | 173 | df.select('label', 'rawFeatures','features').show() 174 | 175 | 176 | # In[28]: 177 | 178 | (trainingDF, testDF) = df.randomSplit([0.75, 0.25], seed = 1234) 179 | 180 | 181 | # In[29]: 182 | 183 | from pyspark.ml.classification import LogisticRegression 184 | logreg = LogisticRegression(regParam=0.025) 185 | 186 | 187 | # In[30]: 188 | 189 | logregModel = logreg.fit(trainingDF) 190 | 191 | 192 | # In[31]: 193 | 194 | predictionDF = logregModel.transform(testDF) 195 | 196 | 197 | # In[32]: 198 | 199 | predictionDF.select('label', 'probability', 'prediction').show() 200 | 201 | 202 | # In[33]: 203 | 204 | predictionDF.crosstab('label', 'prediction').show() 205 | 206 | 207 | # In[34]: 208 | 209 | from sklearn import metrics 210 | actual = predictionDF.select('label').toPandas() 211 | predicted = predictionDF.select('prediction').toPandas() 212 | 213 | 214 | # In[35]: 215 | 216 | print('accuracy score: {}%'.format(round(metrics.accuracy_score(actual, predicted),3)*100)) 217 | 218 | 219 | # In[36]: 220 | 221 | from pyspark.ml.evaluation import BinaryClassificationEvaluator 222 | 223 | scores = predictionDF.select('label', 'rawPrediction') 224 | evaluator = BinaryClassificationEvaluator() 225 | print('The ROC score is {}%'.format(round(evaluator.evaluate(scores),3)*100)) 226 | 227 | 228 | # In[37]: 229 | 230 | predictionDF.describe('label').show() 231 | 232 | 233 | # In[ ]: 234 | 235 | 236 | 237 | -------------------------------------------------------------------------------- /CH09/code/Predicting+Apple+Stock+Market+Value.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("StockMarket") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | df =spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('AAPL.csv') 12 | 13 | 14 | # In[3]: 15 | 16 | df.show() 17 | 18 | 19 | # In[4]: 20 | 21 | import pyspark.sql.functions as f 22 | df = df.withColumn('date', f.to_date('Date')) 23 | 24 | 25 | # In[5]: 26 | 27 | df.show(n=5) 28 | 29 | 30 | # In[6]: 31 | 32 | date_breakdown = ['year', 'month', 'day'] 33 | for i in enumerate(date_breakdown): 34 | index = i[0] 35 | name = i[1] 36 | df = df.withColumn(name, f.split('date', '-')[index]) 37 | 38 | 39 | # In[7]: 40 | 41 | df.show(n=10) 42 | 43 | 44 | # In[8]: 45 | 46 | df_plot = df.select('year', 'Adj Close').toPandas() 47 | 48 | 49 | # In[9]: 50 | 51 | from matplotlib import pyplot as plt 52 | get_ipython().magic('matplotlib inline') 53 | 54 | df_plot.set_index('year', inplace=True) 55 | df_plot.plot(figsize=(16, 6), grid=True) 56 | plt.title('Apple stock') 57 | plt.ylabel('Stock Quote ($)') 58 | plt.show() 59 | 60 | 61 | # In[10]: 62 | 63 | df.toPandas().shape 64 | 65 | 66 | # In[11]: 67 | 68 | df.dropna().count() 69 | 70 | 71 | # In[12]: 72 | 73 | df.select('Open', 'High', 'Low', 'Close', 'Adj Close').describe().show() 74 | 75 | 76 | # In[13]: 77 | 78 | df.groupBy(['year']).agg({'Adj Close':'count'}) .withColumnRenamed('count(Adj Close)', 'Row Count') .orderBy(["year"],ascending=False) .show() 79 | 80 | 81 | # In[14]: 82 | 83 | trainDF = df[df.year < 2017] 84 | testDF = df[df.year > 2016] 85 | 86 | 87 | # In[15]: 88 | 89 | trainDF.toPandas().shape 90 | 91 | 92 | # In[16]: 
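# trainDF covers trading days through 2016 and testDF covers 2017 onward (a time-based split),
# so the shape check below reports how many rows fall in the hold-out period.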
93 | 94 | testDF.toPandas().shape 95 | 96 | 97 | # In[17]: 98 | 99 | trainDF_plot = trainDF.select('year', 'Adj Close').toPandas() 100 | trainDF_plot.set_index('year', inplace=True) 101 | trainDF_plot.plot(figsize=(16, 6), grid=True) 102 | plt.title('Apple Stock 2000-2016') 103 | plt.ylabel('Stock Quote ($)') 104 | plt.show() 105 | 106 | 107 | # In[18]: 108 | 109 | testDF_plot = testDF.select('year', 'Adj Close').toPandas() 110 | testDF_plot.set_index('year', inplace=True) 111 | testDF_plot.plot(figsize=(16, 6), grid=True) 112 | plt.title('Apple Stock 2017-2018') 113 | plt.ylabel('Stock Quote ($)') 114 | plt.show() 115 | 116 | 117 | # In[19]: 118 | 119 | import numpy as np 120 | trainArray = np.array(trainDF.select('Open', 'High', 'Low', 'Close','Volume', 'Adj Close' ).collect()) 121 | testArray = np.array(testDF.select('Open', 'High', 'Low', 'Close','Volume', 'Adj Close' ).collect()) 122 | 123 | 124 | # In[20]: 125 | 126 | print(trainArray[0]) 127 | print('-------------') 128 | print(testArray[0]) 129 | 130 | 131 | # In[21]: 132 | 133 | from sklearn.preprocessing import MinMaxScaler 134 | minMaxScale = MinMaxScaler() 135 | 136 | 137 | # In[22]: 138 | 139 | minMaxScale.fit(trainArray) 140 | 141 | 142 | # In[23]: 143 | 144 | testingArray = minMaxScale.transform(testArray) 145 | trainingArray = minMaxScale.transform(trainArray) 146 | 147 | 148 | # In[24]: 149 | 150 | print(testingArray[0]) 151 | print('--------------') 152 | print(trainingArray[0]) 153 | 154 | 155 | # In[25]: 156 | 157 | xtrain = trainingArray[:, 0:-1] 158 | xtest = testingArray[:, 0:-1] 159 | # ytrain = trainingArray[:, 5] 160 | # ytest = testingArray[:, 5] 161 | ytrain = trainingArray[:, -1:] 162 | ytest = testingArray[:, -1:] 163 | 164 | 165 | # In[26]: 166 | 167 | trainingArray[0] 168 | 169 | 170 | # In[27]: 171 | 172 | xtrain[0] 173 | 174 | 175 | # In[28]: 176 | 177 | ytrain[0] 178 | 179 | 180 | # In[29]: 181 | 182 | print('xtrain shape = {}'.format(xtrain.shape)) 183 | print('xtest shape = {}'.format(xtest.shape)) 184 | print('ytrain shape = {}'.format(ytrain.shape)) 185 | print('ytest shape = {}'.format(ytest.shape)) 186 | 187 | 188 | # In[30]: 189 | 190 | plt.figure(figsize=(16,6)) 191 | plt.plot(xtrain[:,0],color='red', label='open') 192 | plt.plot(xtrain[:,1],color='blue', label='high') 193 | plt.plot(xtrain[:,2],color='green', label='low') 194 | plt.plot(xtrain[:,3],color='purple', label='close') 195 | plt.legend(loc = 'upper left') 196 | plt.title('Open, High, Low, and Close by Day') 197 | plt.xlabel('Days') 198 | plt.ylabel('Scaled Quotes') 199 | plt.show() 200 | 201 | 202 | # In[31]: 203 | 204 | plt.figure(figsize=(16,6)) 205 | plt.plot(xtrain[:,4],color='black', label='volume') 206 | plt.legend(loc = 'upper right') 207 | plt.title('Volume by Day') 208 | plt.xlabel('Days') 209 | plt.ylabel('Scaled Volume') 210 | plt.show() 211 | 212 | 213 | # In[32]: 214 | 215 | from keras import models, layers 216 | 217 | 218 | # In[33]: 219 | 220 | model = models.Sequential() 221 | model.add(layers.LSTM(1, input_shape=(1,5))) 222 | model.add(layers.Dense(1)) 223 | model.compile(loss='mean_squared_error', optimizer='adam') 224 | 225 | 226 | # In[34]: 227 | 228 | xtrain = xtrain.reshape((xtrain.shape[0], 1, xtrain.shape[1])) 229 | xtest = xtest.reshape((xtest.shape[0], 1, xtest.shape[1])) 230 | 231 | 232 | # In[35]: 233 | 234 | print('The shape of xtrain is {}: '.format(xtrain.shape)) 235 | print('The shape of xtest is {}: '.format(xtest.shape)) 236 | 237 | 238 | # In[36]: 239 | 240 | loss = model.fit(xtrain, ytrain, 
batch_size=10, epochs=100) 241 | 242 | 243 | # In[37]: 244 | 245 | plt.plot(loss.history['loss'], label = 'loss') 246 | plt.title('mean squared error by epoch') 247 | plt.legend() 248 | plt.show() 249 | 250 | 251 | # In[38]: 252 | 253 | predicted = model.predict(xtest) 254 | 255 | 256 | # In[39]: 257 | 258 | combined_array = np.concatenate((ytest, predicted), axis = 1) 259 | 260 | 261 | # In[40]: 262 | 263 | plt.figure(figsize=(16,6)) 264 | plt.plot(combined_array[:,0],color='red', label='actual') 265 | plt.plot(combined_array[:,1],color='blue', label='predicted') 266 | plt.legend(loc = 'lower right') 267 | plt.title('2017 Actual vs. Predicted APPL Stock') 268 | plt.xlabel('Days') 269 | plt.ylabel('Scaled Quotes') 270 | plt.show() 271 | 272 | 273 | # In[41]: 274 | 275 | import sklearn.metrics as metrics 276 | np.sqrt(metrics.mean_squared_error(ytest,predicted)) 277 | 278 | 279 | # In[ ]: 280 | 281 | 282 | 283 | -------------------------------------------------------------------------------- /CH05/code/Predicting+Fire+Dept+Calls+with+Spark+ML.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | # In[2]: 10 | 11 | spark = SparkSession.builder .master("local") .appName("Predicting Fire Dept Calls") .config("spark.executor.memory", "6gb") .getOrCreate() 12 | 13 | 14 | # In[3]: 15 | 16 | df = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('Fire_Department_Calls_for_Service.csv') 17 | 18 | 19 | # In[4]: 20 | 21 | df.show(2) 22 | 23 | 24 | # In[5]: 25 | 26 | df.select('Call Type Group').distinct().show() 27 | 28 | 29 | # In[6]: 30 | 31 | df.groupBy('Call Type Group').count().show() 32 | 33 | 34 | # In[7]: 35 | 36 | df2 = df.groupBy('Call Type Group').count() 37 | 38 | 39 | # In[8]: 40 | 41 | graphDF = df2.toPandas() 42 | graphDF = graphDF.sort_values('count', ascending=False) 43 | 44 | 45 | # In[9]: 46 | 47 | import matplotlib.pyplot as plt 48 | get_ipython().magic('matplotlib inline') 49 | 50 | 51 | # In[10]: 52 | 53 | graphDF.plot(x='Call Type Group', y = 'count', kind='bar') 54 | plt.title('Call Type Group by Count') 55 | plt.show() 56 | 57 | 58 | # In[11]: 59 | 60 | df.groupBy('Call Type').count().orderBy('count', ascending=False).show(100) 61 | 62 | 63 | # In[12]: 64 | 65 | from pyspark.sql import functions as F 66 | fireIndicator = df.select(df["Call Type"],F.when(df["Call Type"].like("%Fire%"),1) .otherwise(0).alias('Fire Indicator')) 67 | fireIndicator.show() 68 | 69 | 70 | # In[13]: 71 | 72 | fireIndicator.groupBy('Fire Indicator').count().show() 73 | 74 | 75 | # In[14]: 76 | 77 | df = df.withColumn("fireIndicator", F.when(df["Call Type"].like("%Fire%"),1).otherwise(0)) 78 | 79 | 80 | # In[15]: 81 | 82 | df.printSchema() 83 | 84 | 85 | # In[16]: 86 | 87 | df.select('Call Type', 'fireIndicator').show(20) 88 | 89 | 90 | # In[17]: 91 | 92 | df = df.select('fireIndicator', 93 | 'Zipcode of Incident', 94 | 'Battalion', 95 | 'Station Area', 96 | 'Box', 97 | 'Number of Alarms', 98 | 'Unit sequence in call dispatch', 99 | 'Neighborhooods - Analysis Boundaries', 100 | 'Fire Prevention District', 101 | 'Supervisor District') 102 | df.show(5) 103 | 104 | 105 | # In[18]: 106 | 107 | print('Total Rows') 108 | df.count() 109 | 110 | 111 | # In[19]: 112 | 113 | print('Rows without Null values') 114 | df.dropna().count() 115 | 116 | 117 | # In[20]: 118 | 119 | print('Row with Null Values') 120 | df.count()-df.dropna().count() 
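# A minimal sketch for locating the nulls per column rather than per row,
# reusing the functions module already imported above as F:
df.select([F.count(F.when(F.col(c).isNull(), c)).alias(c) for c in df.columns]).show()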
121 | 122 | 123 | # In[21]: 124 | 125 | df = df.dropna() 126 | 127 | 128 | # In[22]: 129 | 130 | df.groupBy('fireIndicator').count().orderBy('count', ascending = False).show() 131 | 132 | 133 | # In[23]: 134 | 135 | from pyspark.ml.feature import StringIndexer 136 | 137 | 138 | # In[24]: 139 | 140 | column_names = df.columns[1:] 141 | column_names 142 | 143 | 144 | # In[25]: 145 | 146 | categoricalColumns = column_names 147 | indexers = [] 148 | for categoricalCol in categoricalColumns: 149 | stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"_Index") 150 | indexers += [stringIndexer] 151 | 152 | 153 | # In[26]: 154 | 155 | models = [] 156 | for model in indexers: 157 | indexer_model = model.fit(df) 158 | models+=[indexer_model] 159 | 160 | for i in models: 161 | df = i.transform(df) 162 | 163 | 164 | # In[27]: 165 | 166 | df.columns 167 | 168 | 169 | # In[28]: 170 | 171 | df.select('Neighborhooods - Analysis Boundaries', 'Neighborhooods - Analysis Boundaries_Index').show() 172 | 173 | 174 | # In[29]: 175 | 176 | df = df.select( 177 | 'fireIndicator', 178 | 'Zipcode of Incident_Index', 179 | 'Battalion_Index', 180 | 'Station Area_Index', 181 | 'Box_Index', 182 | 'Number of Alarms_Index', 183 | 'Unit sequence in call dispatch_Index', 184 | 'Neighborhooods - Analysis Boundaries_Index', 185 | 'Fire Prevention District_Index', 186 | 'Supervisor District_Index') 187 | 188 | 189 | # In[30]: 190 | 191 | df.printSchema() 192 | 193 | 194 | # In[31]: 195 | 196 | df.show(5) 197 | 198 | 199 | # In[32]: 200 | 201 | features = df.columns[1:] 202 | 203 | 204 | # In[33]: 205 | 206 | from pyspark.ml.feature import VectorAssembler 207 | 208 | feature_vectors = VectorAssembler( 209 | inputCols = features, 210 | outputCol = "features") 211 | 212 | 213 | # In[34]: 214 | 215 | df = feature_vectors.transform(df) 216 | 217 | 218 | # In[35]: 219 | 220 | df.columns 221 | 222 | 223 | # In[36]: 224 | 225 | df = df.drop( 'Zipcode of Incident_Index', 226 | 'Battalion_Index', 227 | 'Station Area_Index', 228 | 'Box_Index', 229 | 'Number of Alarms_Index', 230 | 'Unit sequence in call dispatch_Index', 231 | 'Neighborhooods - Analysis Boundaries_Index', 232 | 'Fire Prevention District_Index', 233 | 'Supervisor District_Index') 234 | 235 | 236 | # In[37]: 237 | 238 | df = df.withColumnRenamed('fireIndicator', 'label') 239 | 240 | 241 | # In[38]: 242 | 243 | df.show() 244 | 245 | 246 | # In[39]: 247 | 248 | (trainDF, testDF) = df.randomSplit([0.75, 0.25], seed = 12345) 249 | 250 | 251 | # In[40]: 252 | 253 | print(trainDF.count()) 254 | print(testDF.count()) 255 | 256 | 257 | # In[41]: 258 | 259 | from pyspark.ml.classification import LogisticRegression 260 | logreg = LogisticRegression(labelCol="label", featuresCol="features", maxIter=10) 261 | LogisticRegressionModel = logreg.fit(trainDF) 262 | 263 | 264 | # In[42]: 265 | 266 | df_predicted = LogisticRegressionModel.transform(testDF) 267 | 268 | 269 | # In[43]: 270 | 271 | df_predicted.printSchema() 272 | 273 | 274 | # In[44]: 275 | 276 | df_predicted.show(5) 277 | 278 | 279 | # In[45]: 280 | 281 | df_predicted.crosstab('label', 'prediction').show() 282 | 283 | 284 | # In[46]: 285 | 286 | from sklearn import metrics 287 | 288 | 289 | # In[47]: 290 | 291 | actual = df_predicted.select('label').toPandas() 292 | 293 | 294 | # In[48]: 295 | 296 | predicted = df_predicted.select('prediction').toPandas() 297 | 298 | 299 | # In[49]: 300 | 301 | metrics.accuracy_score(actual, predicted) 302 | 303 | 304 | # In[50]: 305 | 306 | 
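# Accuracy alone can look optimistic here because the fire indicator label is
# imbalanced, which is what the class counts below illustrate. A minimal sketch
# of an AUC check on the same predictions, assuming the default rawPrediction
# column produced by LogisticRegression:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(rawPredictionCol='rawPrediction', labelCol='label')
print('Area under ROC: {}'.format(evaluator.evaluate(df_predicted)))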
df_predicted.groupBy('label').count().show() 307 | 308 | 309 | # In[51]: 310 | 311 | df_predicted.describe('label').show() 312 | 313 | -------------------------------------------------------------------------------- /CH11/code/CH11_Words+to+Vectors.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | pwd 8 | 9 | 10 | # In[2]: 11 | 12 | 13 | cd '/Users/Chanti/Desktop/USF' 14 | 15 | 16 | # In[3]: 17 | 18 | 19 | from __future__ import absolute_import, division, print_function 20 | 21 | 22 | # In[4]: 23 | 24 | 25 | import codecs 26 | import glob 27 | import logging 28 | import multiprocessing 29 | import os 30 | import pprint 31 | import re 32 | 33 | 34 | # In[5]: 35 | 36 | 37 | import nltk 38 | import gensim.models.word2vec as w2v 39 | import sklearn.manifold 40 | import numpy as np 41 | import matplotlib.pyplot as plt 42 | import pandas as pd 43 | import seaborn as sns 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | get_ipython().magic('pylab inline') 50 | 51 | 52 | # In[7]: 53 | 54 | 55 | logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) 56 | 57 | 58 | # In[8]: 59 | 60 | 61 | nltk.download("punkt") 62 | nltk.download("stopwords") 63 | 64 | 65 | # In[9]: 66 | 67 | 68 | book_names = sorted(glob.glob("./*.txt")) 69 | 70 | 71 | # In[10]: 72 | 73 | 74 | print("Found books:") 75 | book_names 76 | 77 | 78 | # In[11]: 79 | 80 | 81 | corpus = u'' 82 | for book_name in book_names: 83 | print("Reading '{0}'...".format(book_name)) 84 | with codecs.open(book_name,"r","Latin1") as book_file: 85 | corpus += book_file.read() 86 | print("Corpus is now {0} characters long".format(len(corpus))) 87 | print() 88 | 89 | 90 | # In[12]: 91 | 92 | 93 | #Load the English pickle tokenizer from punkt 94 | tokenizer = nltk.data.load('tokenizers/punkt/english.pickle') 95 | 96 | 97 | # In[13]: 98 | 99 | 100 | #Tokenize the corpus into sentences 101 | raw_sentences = tokenizer.tokenize(corpus) 102 | 103 | 104 | # In[14]: 105 | 106 | 107 | #Convert sentences into list of words 108 | #remove unecessary characters, split into words, remove hyphens and special characters 109 | def sentence_to_wordlist(raw): 110 | clean = re.sub("[^a-zA-Z]"," ", raw) 111 | words = clean.split() 112 | return words 113 | 114 | 115 | # In[15]: 116 | 117 | 118 | #for each sentence, sentences where each word is tokenized 119 | sentences = [] 120 | for raw_sentence in raw_sentences: 121 | if len(raw_sentence) > 0: 122 | sentences.append(sentence_to_wordlist(raw_sentence)) 123 | 124 | 125 | # In[16]: 126 | 127 | 128 | print(raw_sentences[50]) 129 | print(sentence_to_wordlist(raw_sentences[50])) 130 | 131 | 132 | # In[17]: 133 | 134 | 135 | #count tokens, each one being a sentence 136 | token_count = sum([len(sentence) for sentence in sentences]) 137 | print("The book corpus contains {0:,} tokens".format(token_count)) 138 | 139 | 140 | # In[18]: 141 | 142 | 143 | #Define hyperparameters 144 | 145 | # Dimensionality of the resulting word vectors. 146 | num_features = 300 147 | 148 | # Minimum word count threshold. 149 | min_word_count = 3 150 | 151 | # Number of threads to run in parallel. 152 | num_workers = multiprocessing.cpu_count() 153 | 154 | # Context window length. 155 | context_size = 7 156 | 157 | # Downsample setting for frequent words. 158 | downsampling = 1e-3 159 | 160 | # Seed for the RNG, to make the results reproducible. 
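# (with gensim, a seed by itself does not make training fully deterministic;
#  a reproducible run also needs workers=1 and a fixed PYTHONHASHSEED)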
161 | seed = 1 162 | 163 | 164 | # In[19]: 165 | 166 | 167 | got2vec = w2v.Word2Vec( 168 | sg=1, 169 | seed=seed, 170 | workers=num_workers, 171 | size=num_features, 172 | min_count=min_word_count, 173 | window=context_size, 174 | sample=downsampling 175 | ) 176 | 177 | 178 | # In[20]: 179 | 180 | 181 | got2vec.build_vocab(sentences,progress_per=10000, keep_raw_vocab=False, trim_rule=None) 182 | 183 | 184 | # In[21]: 185 | 186 | 187 | #train model on sentences 188 | got2vec.train(sentences, total_examples=got2vec.corpus_count, 189 | total_words=None, epochs=got2vec.iter, 190 | start_alpha=None, end_alpha=None, word_count=0, 191 | queue_factor=2, report_delay=1.0, compute_loss=False) 192 | 193 | 194 | # In[22]: 195 | 196 | 197 | #save model 198 | if not os.path.exists("trained"): 199 | os.makedirs("trained") 200 | 201 | 202 | # In[23]: 203 | 204 | 205 | got2vec.wv.save(os.path.join("trained", "got2vec.w2v"), ignore=[]) 206 | 207 | 208 | # In[24]: 209 | 210 | 211 | #load model 212 | got2vec = w2v.KeyedVectors.load(os.path.join("trained", "got2vec.w2v")) 213 | 214 | 215 | # In[25]: 216 | 217 | 218 | #Squash dimensionality to 2 219 | tsne = sklearn.manifold.TSNE(n_components=2, random_state=0) 220 | 221 | 222 | # In[26]: 223 | 224 | 225 | #Put all the word vectors into one big matrix 226 | all_word_vectors_matrix = got2vec.wv.syn0 227 | 228 | 229 | # In[27]: 230 | 231 | 232 | print (all_word_vectors_matrix) 233 | 234 | 235 | # In[28]: 236 | 237 | 238 | #train tsne 239 | all_word_vectors_matrix_2d = tsne.fit_transform(all_word_vectors_matrix) 240 | 241 | 242 | # In[29]: 243 | 244 | 245 | #plot point in 2d space 246 | points = pd.DataFrame( 247 | [ 248 | (word, coords[0], coords[1]) 249 | for word, coords in [ 250 | (word, all_word_vectors_matrix_2d[got2vec.vocab[word].index]) 251 | for word in got2vec.vocab 252 | ] 253 | ], 254 | columns=["word", "x", "y"] 255 | ) 256 | 257 | 258 | # In[30]: 259 | 260 | 261 | points.head(20) 262 | 263 | 264 | # In[31]: 265 | 266 | 267 | # Plotting using the seaborn library 268 | sns.set_context("poster") 269 | 270 | 271 | # In[32]: 272 | 273 | 274 | points.plot.scatter("x", "y", s=10, figsize=(10, 10)) 275 | 276 | 277 | # In[33]: 278 | 279 | 280 | def plot_region(x_bounds, y_bounds): 281 | slice = points[ 282 | (x_bounds[0] <= points.x) & 283 | (points.x <= x_bounds[1]) & 284 | (y_bounds[0] <= points.y) & 285 | (points.y <= y_bounds[1]) 286 | ] 287 | 288 | ax = slice.plot.scatter("x", "y", s=35, figsize=(10, 8)) 289 | for i, point in slice.iterrows(): 290 | ax.text(point.x + 0.005, point.y + 0.005, point.word, fontsize=11) 291 | 292 | 293 | # In[34]: 294 | 295 | 296 | plot_region(x_bounds=(20.0, 25.0), y_bounds=(15.5, 20.0)) 297 | 298 | 299 | # In[35]: 300 | 301 | 302 | plot_region(x_bounds=(4, 41), y_bounds=(-0.5, -0.1)) 303 | 304 | 305 | # In[36]: 306 | 307 | 308 | plot_region(x_bounds=(10, 15), y_bounds=(5, 10)) 309 | 310 | 311 | # In[37]: 312 | 313 | 314 | got2vec.most_similar("Stark") 315 | 316 | 317 | # In[38]: 318 | 319 | 320 | got2vec.most_similar("Lannister") 321 | 322 | 323 | # In[39]: 324 | 325 | 326 | got2vec.most_similar("Jon") 327 | 328 | 329 | # In[40]: 330 | 331 | 332 | #distance, similarity, and ranking 333 | def nearest_similarity_cosmul(start1, end1, end2): 334 | similarities = got2vec.most_similar_cosmul( 335 | positive=[end2, start1], 336 | negative=[end1] 337 | ) 338 | start2 = similarities[0][0] 339 | print("{start1} is related to {end1}, as {start2} is related to {end2}".format(**locals())) 340 | return start2 341 | 342 | 343 | # In[41]: 
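# The cosmul queries below answer analogies of the form
# "start1 is to end1 as ? is to end2": they look for the word whose vector best
# matches vec(start1) - vec(end1) + vec(end2), scored with the multiplicative
# 3CosMul objective rather than plain cosine addition. An additive version of
# the first query, assuming the same trained got2vec model, would be:
#
#     got2vec.most_similar(positive=['Stark', 'Riverrun'], negative=['Winterfell'], topn=1)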
344 | 345 | 346 | nearest_similarity_cosmul("Stark", "Winterfell", "Riverrun") 347 | nearest_similarity_cosmul("Jaime", "sword", "wine") 348 | nearest_similarity_cosmul("Arya", "Nymeria", "dragons") 349 | 350 | -------------------------------------------------------------------------------- /CH06/code/CH06_LSTMs+word+level.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | 7 | from numpy import array 8 | from pickle import dump 9 | from keras.preprocessing.text import Tokenizer 10 | from keras.utils import to_categorical 11 | from keras.models import Sequential 12 | from keras.layers import Dense 13 | from keras.layers import LSTM 14 | from keras.layers import Embedding 15 | 16 | 17 | # In[2]: 18 | 19 | 20 | pwd 21 | 22 | 23 | # In[3]: 24 | 25 | 26 | cd '/Users/Chanti/Desktop/Cookbook/Chapter 8' 27 | 28 | 29 | # In[4]: 30 | 31 | 32 | pwd 33 | 34 | 35 | # In[5]: 36 | 37 | 38 | # load doc into memory 39 | def load_document(name): 40 | file = open(name, 'r') 41 | text = file.read() 42 | file.close() 43 | return text 44 | 45 | 46 | # In[6]: 47 | 48 | 49 | # load document 50 | input_filename = 'junglebook.txt' 51 | doc = load_document(input_filename) 52 | print(doc[:2000]) 53 | 54 | 55 | # In[7]: 56 | 57 | 58 | import string 59 | 60 | # turn a document into clean tokens 61 | def clean_document(doc): 62 | doc = doc.replace('--', ' ') 63 | tokens = doc.split() 64 | table = str.maketrans('', '', string.punctuation) 65 | tokens = [w.translate(table) for w in tokens] 66 | tokens = [word for word in tokens if word.isalpha()] 67 | tokens = [word.lower() for word in tokens] 68 | return tokens 69 | 70 | 71 | # In[8]: 72 | 73 | 74 | # clean document 75 | tokens = clean_document(doc) 76 | print(tokens[:200]) 77 | print('Total Tokens: %d' % len(tokens)) 78 | print('Unique Tokens: %d' % len(set(tokens))) 79 | 80 | 81 | # In[9]: 82 | 83 | 84 | # organize into sequences (of length 50) of tokens 85 | length = 50 + 1 86 | sequences = list() 87 | for i in range(length, len(tokens)): 88 | # select sequence of tokens 89 | seq = tokens[i-length:i] 90 | # convert into a line 91 | line = ' '.join(seq) 92 | sequences.append(line) 93 | print('Total Sequences: %d' % len(sequences)) 94 | 95 | 96 | # In[10]: 97 | 98 | 99 | # save tokens to file, one dialog per line 100 | def save_document(lines, name): 101 | data = '\n'.join(lines) 102 | file = open(name, 'w') 103 | file.write(data) 104 | file.close() 105 | 106 | 107 | # In[11]: 108 | 109 | 110 | # save sequences to file 111 | output_filename = 'junglebook_sequences.txt' 112 | save_document(sequences, output_filename) 113 | 114 | 115 | # In[12]: 116 | 117 | 118 | # load document into memory 119 | def load_document(name): 120 | file = open(name, 'r') 121 | text = file.read() 122 | file.close() 123 | return text 124 | 125 | # load 126 | input_filename = 'junglebook_sequences.txt' 127 | doc = load_document(input_filename) 128 | lines = doc.split('\n') 129 | 130 | 131 | # In[13]: 132 | 133 | 134 | # integer encode sequences of words 135 | tokenizer = Tokenizer() 136 | tokenizer.fit_on_texts(lines) 137 | sequences = tokenizer.texts_to_sequences(lines) 138 | 139 | 140 | # In[14]: 141 | 142 | 143 | # vocabulary size 144 | vocab_size = len(tokenizer.word_index) + 1 145 | 146 | 147 | # In[15]: 148 | 149 | 150 | # separate into input and output 151 | sequences = array(sequences) 152 | Input, Output = sequences[:,:-1], sequences[:,-1] 153 | Output = to_categorical(Output, num_classes=vocab_size) 154 | 
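# At this point each row of Input holds the 50 preceding word indices and each
# row of Output is the one-hot encoding of the word that follows them, so their
# shapes are (number of sequences, 50) and (number of sequences, vocab_size).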
sequence_length = Input.shape[1] 155 | 156 | 157 | # In[16]: 158 | 159 | 160 | # define model 161 | from keras.layers import Dropout 162 | model = Sequential() 163 | model.add(Embedding(vocab_size, 100, input_length=sequence_length)) 164 | model.add(LSTM(200, return_sequences=True)) 165 | model.add(LSTM(200)) 166 | model.add(Dropout(0.3)) 167 | model.add(Dense(200, activation='relu')) 168 | model.add(Dense(vocab_size, activation='softmax')) 169 | print(model.summary()) 170 | 171 | 172 | # In[17]: 173 | 174 | 175 | # compile model 176 | model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy']) 177 | # fit model 178 | model.fit(Input, Output, batch_size=250, epochs=75) 179 | 180 | 181 | # In[18]: 182 | 183 | 184 | # save the model to file 185 | model.save('junglebook_trained.h5') 186 | # save the tokenizer 187 | dump(tokenizer, open('tokenizer.pkl', 'wb')) 188 | 189 | 190 | # In[19]: 191 | 192 | 193 | # load doc into memory 194 | def load_document(name): 195 | file = open(name, 'r') 196 | text = file.read() 197 | file.close() 198 | return text 199 | 200 | # load cleaned text sequences 201 | input_filename = 'junglebook_sequences.txt' 202 | doc = load_document(input_filename) 203 | lines = doc.split('\n') 204 | 205 | 206 | # In[20]: 207 | 208 | 209 | sequence_length = len(lines[0].split()) - 1 210 | 211 | 212 | # In[21]: 213 | 214 | 215 | # load the model 216 | from keras.models import load_model 217 | model = load_model('junglebook_trained.h5') 218 | 219 | 220 | # In[22]: 221 | 222 | 223 | # select a seed text 224 | from random import randint 225 | seed_text = lines[randint(0,len(lines))] 226 | print(seed_text + '\n') 227 | 228 | 229 | # In[23]: 230 | 231 | 232 | encoded = tokenizer.texts_to_sequences([seed_text])[0] 233 | 234 | 235 | # In[24]: 236 | 237 | 238 | from random import randint 239 | from pickle import load 240 | from keras.models import load_model 241 | from keras.preprocessing.sequence import pad_sequences 242 | 243 | # load doc into memory 244 | def load_document(name): 245 | file = open(name, 'r') 246 | text = file.read() 247 | file.close() 248 | return text 249 | 250 | # generate a sequence from a language model 251 | def generate_sequence(model, tokenizer, sequence_length, seed_text, n_words): 252 | result = list() 253 | input_text = seed_text 254 | # generate a fixed number of words 255 | for _ in range(n_words): 256 | # encode the text as integer 257 | encoded = tokenizer.texts_to_sequences([input_text])[0] 258 | # truncate sequences to a fixed length 259 | encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre') 260 | # predict probabilities for each word 261 | prediction = model.predict_classes(encoded, verbose=0) 262 | # map predicted word index to word 263 | out_word = '' 264 | for word, index in tokenizer.word_index.items(): 265 | if index == prediction: 266 | out_word = word 267 | break 268 | # append to input 269 | input_text += ' ' + out_word 270 | result.append(out_word) 271 | return ' '.join(result) 272 | 273 | # load cleaned text sequences 274 | input_filename = 'junglebook_sequences.txt' 275 | doc = load_document(input_filename) 276 | lines = doc.split('\n') 277 | seq_length = len(lines[0].split()) - 1 278 | 279 | 280 | # In[25]: 281 | 282 | 283 | # load the model 284 | model = load_model('junglebook_trained.h5') 285 | 286 | # load the tokenizer 287 | tokenizer = load(open('tokenizer.pkl', 'rb')) 288 | 289 | # select a seed text 290 | seed_text = lines[randint(0,len(lines))] 291 | print(seed_text + '\n') 292 | 293 | # 
generate new text 294 | generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50) 295 | print(generated) 296 | 297 | 298 | # In[26]: 299 | 300 | 301 | # load the model 302 | model = load_model('junglebook_trained.h5') 303 | 304 | # load the tokenizer 305 | tokenizer = load(open('tokenizer.pkl', 'rb')) 306 | 307 | # select a seed text 308 | seed_text = lines[randint(0,len(lines))] 309 | print(seed_text + '\n') 310 | 311 | # generate new text 312 | generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50) 313 | print(generated) 314 | 315 | 316 | # In[29]: 317 | 318 | 319 | # load the model 320 | model = load_model('junglebook_trained.h5') 321 | 322 | # load the tokenizer 323 | tokenizer = load(open('tokenizer.pkl', 'rb')) 324 | 325 | # select a seed text 326 | seed_text = lines[randint(0,len(lines))] 327 | print(seed_text + '\n') 328 | 329 | # generate new text 330 | generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50) 331 | print(generated) 332 | 333 | -------------------------------------------------------------------------------- /CH12/code/Create+a+movie+recommendation+engine+with+Keras.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | spark = SparkSession.builder .master("local") .appName("RecommendationEngine") .config("spark.executor.memory", "6gb") .getOrCreate() 7 | 8 | 9 | # In[2]: 10 | 11 | import os 12 | os.listdir('ml-latest-small/') 13 | 14 | 15 | # In[3]: 16 | 17 | movies = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/movies.csv') 18 | tags = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/tags.csv') 19 | links = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/links.csv') 20 | ratings = spark.read.format('com.databricks.spark.csv') .options(header='true', inferschema='true') .load('ml-latest-small/ratings.csv') 21 | 22 | 23 | # In[4]: 24 | 25 | ratings.columns 26 | 27 | 28 | # In[5]: 29 | 30 | ratings.show(truncate=False) 31 | 32 | 33 | # In[6]: 34 | 35 | tags.show(truncate = False) 36 | 37 | 38 | # In[7]: 39 | 40 | movies.select('genres').distinct().show(truncate = False) 41 | 42 | 43 | # In[8]: 44 | 45 | links.show() 46 | 47 | 48 | # In[9]: 49 | 50 | print('The number of rows in movies dataset is {}'.format(movies.toPandas().shape[0])) 51 | print('The number of rows in ratings dataset is {}'.format(ratings.toPandas().shape[0])) 52 | print('The number of rows in tags dataset is {}'.format(tags.toPandas().shape[0])) 53 | print('The number of rows in links dataset is {}'.format(links.toPandas().shape[0])) 54 | 55 | 56 | # In[10]: 57 | 58 | for i in ratings.columns: 59 | ratings = ratings.withColumnRenamed(i, i+'_1') 60 | 61 | 62 | # In[11]: 63 | 64 | ratings.show() 65 | 66 | 67 | # In[12]: 68 | 69 | temp1 = ratings.join(movies, ratings.movieId_1 == movies.movieId, how = 'inner') 70 | 71 | 72 | # In[13]: 73 | 74 | temp2 = temp1.join(links, temp1.movieId_1 == links.movieId, how = 'inner') 75 | 76 | 77 | # In[14]: 78 | 79 | mainDF = temp2.join(tags, (temp2.userId_1 == tags.userId) & 80 | (temp2.movieId_1 == tags.movieId), how = 'left') 81 | 82 | 83 | # In[15]: 84 | 85 | print(temp1.count()) 86 | print(temp2.count()) 87 | print(mainDF.count()) 88 | 89 | 90 | # In[16]: 91 | 92 | mainDF.groupBy(['tag']).agg({'rating_1':'count'}) 
.withColumnRenamed('count(rating_1)', 'Row Count').orderBy(["Row Count"],ascending=False) .show() 93 | 94 | 95 | # In[17]: 96 | 97 | mainDF.columns 98 | 99 | 100 | # In[18]: 101 | 102 | mainDF = mainDF.select('userId_1','movieId_1','rating_1','title','genres', 'imdbId','tmdbId', 'timestamp_1') .distinct() 103 | 104 | 105 | # In[19]: 106 | 107 | mainDF.count() 108 | 109 | 110 | # In[20]: 111 | 112 | movies.createOrReplaceTempView('movies_') 113 | links.createOrReplaceTempView('links_') 114 | ratings.createOrReplaceTempView('ratings_') 115 | 116 | 117 | # In[21]: 118 | 119 | mainDF_SQL = sqlContext.sql( 120 | """ 121 | select 122 | r.userId_1 123 | ,r.movieId_1 124 | ,r.rating_1 125 | ,m.title 126 | ,m.genres 127 | ,l.imdbId 128 | ,l.tmdbId 129 | ,r.timestamp_1 130 | from ratings_ r 131 | inner join movies_ m on 132 | r.movieId_1 = m.movieId 133 | inner join links_ l on 134 | r.movieId_1 = l.movieId 135 | """ 136 | ) 137 | 138 | 139 | # In[22]: 140 | 141 | mainDF_SQL.show(n = 5) 142 | 143 | 144 | # In[23]: 145 | 146 | mainDF_SQL.count() 147 | 148 | 149 | # In[24]: 150 | 151 | mainDF.describe('rating_1').show() 152 | 153 | 154 | # In[25]: 155 | 156 | import matplotlib.pyplot as plt 157 | get_ipython().magic('matplotlib inline') 158 | 159 | mainDF.select('rating_1').toPandas().hist(figsize=(16, 6), grid=True) 160 | plt.title('Histogram of Ratings') 161 | plt.show() 162 | 163 | 164 | # In[26]: 165 | 166 | mainDF.groupBy(['rating_1']).agg({'rating_1':'count'}) .withColumnRenamed('count(rating_1)', 'Row Count').orderBy(["Row Count"],ascending=False) .show() 167 | 168 | 169 | # In[27]: 170 | 171 | userId_frequency = mainDF.groupBy(['userId_1']).agg({'rating_1':'count'}) .withColumnRenamed('count(rating_1)', '# of Reviews') .orderBy(["# of Reviews"],ascending=False) 172 | 173 | 174 | # In[28]: 175 | 176 | userId_frequency.show() 177 | 178 | 179 | # In[29]: 180 | 181 | userId_frequency.select('# of Reviews').toPandas().hist(figsize=(16, 6), grid=True) 182 | plt.title('Histogram of User Ratings') 183 | plt.show() 184 | 185 | 186 | # In[30]: 187 | 188 | mainDF = mainDF.withColumnRenamed('userId_1', 'userid') 189 | mainDF = mainDF.withColumnRenamed('movieId_1', 'movieid') 190 | mainDF = mainDF.withColumnRenamed('rating_1', 'rating') 191 | mainDF = mainDF.withColumnRenamed('timestamp_1', 'timestamp') 192 | mainDF = mainDF.withColumnRenamed('imdbId', 'imdbid') 193 | mainDF = mainDF.withColumnRenamed('tmdbId', 'tmdbid') 194 | 195 | 196 | # In[31]: 197 | 198 | mainDF.columns 199 | 200 | 201 | # In[32]: 202 | 203 | import pyspark.sql.functions as F 204 | mainDF = mainDF.withColumn("rating", F.round(mainDF["rating"], 0)) 205 | 206 | 207 | # In[33]: 208 | 209 | from pyspark.ml.feature import StringIndexer 210 | string_indexer = StringIndexer(inputCol="genres", outputCol="genreCount") 211 | mainDF = string_indexer.fit(mainDF).transform(mainDF) 212 | mainDF.show() 213 | 214 | 215 | # In[34]: 216 | 217 | mainDF = mainDF.select('rating', 'userid', 'movieid', 'imdbid', 'tmdbid', 'timestamp', 'genreCount') 218 | 219 | 220 | # In[35]: 221 | 222 | mainDF.show() 223 | 224 | 225 | # In[36]: 226 | 227 | trainDF, testDF = mainDF.randomSplit([0.8, 0.2], seed=1234) 228 | 229 | 230 | # In[37]: 231 | 232 | print('The number of rows in mainDF is {}'.format(mainDF.count())) 233 | print('The number of rows in trainDF is {}'.format(trainDF.count())) 234 | print('The number of rows in testDF is {}'.format(testDF.count())) 235 | 236 | 237 | # In[38]: 238 | 239 | import numpy as np 240 | xtrain_array = 
np.array(trainDF.select('userid','movieid', 'genreCount').collect()) 241 | xtest_array = np.array(testDF.select('userid','movieid', 'genreCount').collect()) 242 | 243 | 244 | # In[39]: 245 | 246 | ytrain_array = np.array(trainDF.select('rating').collect()) 247 | ytest_array = np.array(testDF.select('rating').collect()) 248 | 249 | 250 | # In[40]: 251 | 252 | print(xtest_array.shape) 253 | print(ytest_array.shape) 254 | print(xtrain_array.shape) 255 | print(ytrain_array.shape) 256 | 257 | 258 | # In[41]: 259 | 260 | import keras.utils as u 261 | ytrain_OHE = u.to_categorical(ytrain_array) 262 | ytest_OHE = u.to_categorical(ytest_array) 263 | 264 | 265 | # In[42]: 266 | 267 | print(ytrain_OHE.shape) 268 | print(ytest_OHE.shape) 269 | 270 | 271 | # In[43]: 272 | 273 | from keras.models import Sequential 274 | from keras.layers import Dense, Activation 275 | 276 | 277 | # In[44]: 278 | 279 | model = Sequential() 280 | model.add(Dense(32, activation='relu', input_dim=xtrain_array.shape[1])) 281 | model.add(Dense(10, activation='relu')) 282 | model.add(Dense(ytrain_OHE.shape[1], activation='softmax')) 283 | model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy']) 284 | 285 | 286 | # In[45]: 287 | 288 | accuracy_history = model.fit(xtrain_array, ytrain_OHE, epochs=20, batch_size=32) 289 | 290 | 291 | # In[46]: 292 | 293 | plt.plot(accuracy_history.history['acc']) 294 | plt.title('Accuracy vs. Epoch') 295 | plt.xlabel('Epoch') 296 | plt.ylabel('Accuracy') 297 | plt.show() 298 | plt.plot(accuracy_history.history['loss']) 299 | plt.title('Loss vs. Epoch') 300 | plt.xlabel('Epoch') 301 | plt.ylabel('Loss') 302 | plt.show() 303 | 304 | 305 | # In[47]: 306 | 307 | score = model.evaluate(xtest_array, ytest_OHE, batch_size=128) 308 | accuracy_rate = score[1]*100 309 | print('accuracy is {}%'.format(round(accuracy_rate,2))) 310 | 311 | -------------------------------------------------------------------------------- /CH02/code/NeuralNetworkfromScratch_with_python_and spark.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # In[1]: 5 | 6 | from pyspark.sql import SparkSession 7 | 8 | 9 | # In[2]: 10 | 11 | spark = SparkSession.builder .master("local") .appName("Neural Network Model") .config("spark.executor.memory", "6gb") .getOrCreate() 12 | 13 | sc = spark.sparkContext 14 | 15 | 16 | # In[3]: 17 | 18 | df = spark.createDataFrame([('Male', 67, 150), # insert column values 19 | ('Female', 65, 135), 20 | ('Female', 68, 130), 21 | ('Male', 70, 160), 22 | ('Female', 70, 130), 23 | ('Male', 69, 174), 24 | ('Female', 65, 126), 25 | ('Male', 74, 188), 26 | ('Female', 60, 110), 27 | ('Female', 63, 125), 28 | ('Male', 70, 173), 29 | ('Male', 70, 145), 30 | ('Male', 68, 175), 31 | ('Female', 65, 123), 32 | ('Male', 71, 145), 33 | ('Male', 74, 160), 34 | ('Female', 64, 135), 35 | ('Male', 71, 175), 36 | ('Male', 67, 145), 37 | ('Female', 67, 130), 38 | ('Male', 70, 162), 39 | ('Female', 64, 107), 40 | ('Male', 70, 175), 41 | ('Female', 64, 130), 42 | ('Male', 66, 163), 43 | ('Female', 63, 137), 44 | ('Male', 65, 165), 45 | ('Female', 65, 130), 46 | ('Female', 64, 109)], 47 | ['gender', 'height','weight']) # insert header values 48 | 49 | 50 | 51 | # In[4]: 52 | 53 | df.show(5) 54 | 55 | 56 | # In[5]: 57 | 58 | from pyspark.sql import functions 59 | 60 | 61 | # In[6]: 62 | 63 | df = df.withColumn('gender',functions.when(df['gender']=='Female',0).otherwise(1)) 64 | 65 | 66 | # In[7]: 67 | 68 | df = df.select('height', 
'weight', 'gender') 69 | 70 | 71 | # In[8]: 72 | 73 | df.show() 74 | 75 | 76 | # In[9]: 77 | 78 | import numpy as np 79 | 80 | 81 | # In[10]: 82 | 83 | df.select("height", "weight", "gender").collect() 84 | 85 | 86 | # In[11]: 87 | 88 | data_array = np.array(df.select("height", "weight", "gender").collect()) 89 | data_array #view the array 90 | 91 | 92 | # In[12]: 93 | 94 | data_array.shape 95 | 96 | 97 | # In[13]: 98 | 99 | data_array[0] 100 | 101 | 102 | # In[14]: 103 | 104 | data_array[28] 105 | 106 | 107 | # In[15]: 108 | 109 | print(data_array.max(axis=0)) 110 | print(data_array.min(axis=0)) 111 | 112 | 113 | # In[16]: 114 | 115 | import matplotlib.pyplot as plt 116 | get_ipython().magic('matplotlib inline') 117 | 118 | 119 | # In[17]: 120 | 121 | min_x = data_array.min(axis=0)[0]-10 122 | max_x = data_array.max(axis=0)[0]+10 123 | min_y = data_array.min(axis=0)[1]-10 124 | max_y = data_array.max(axis=0)[1]+10 125 | 126 | print(min_x, max_x, min_y, max_y) 127 | 128 | 129 | # In[18]: 130 | 131 | # formatting the plot grid, scales, and figure size 132 | plt.figure(figsize=(9, 4), dpi= 75) 133 | plt.axis([min_x,max_x,min_y,max_y]) 134 | plt.grid() 135 | for i in range(len(data_array)): 136 | value = data_array[i] 137 | # assign labels values to specific matrix elements 138 | gender = value[2] 139 | height = value[0] 140 | weight = value[1] 141 | 142 | # filter data points by gender 143 | a = plt.scatter(height[gender==0],weight[gender==0], marker = 'x', c= 'b', label = 'Female') 144 | b = plt.scatter(height[gender==1],weight[gender==1], marker = 'o', c= 'b', label = 'Male') 145 | 146 | # plot values, title, legend, x and y axis 147 | plt.title('Weight vs Height by Gender') 148 | plt.xlabel('Height (in)') 149 | plt.ylabel('Weight (lbs)') 150 | plt.legend(handles=[a,b]) 151 | 152 | 153 | 154 | # In[19]: 155 | 156 | np.random.seed(12345) 157 | 158 | 159 | # In[20]: 160 | 161 | w1 = np.random.randn() 162 | w2 = np.random.randn() 163 | b= np.random.randn() 164 | 165 | 166 | # In[21]: 167 | 168 | print(w1, w2, b) 169 | 170 | 171 | # In[22]: 172 | 173 | X = data_array[:,:2] 174 | y = data_array[:,2] 175 | print(X,y) 176 | 177 | 178 | # In[23]: 179 | 180 | x_mean = X.mean(axis=0) 181 | x_std = X.std(axis=0) 182 | print(x_mean, x_std) 183 | 184 | 185 | # In[24]: 186 | 187 | def normalize(X): 188 | x_mean = X.mean(axis=0) 189 | x_std = X.std(axis=0) 190 | X = (X - X.mean(axis=0))/X.std(axis=0) 191 | return X 192 | 193 | 194 | # In[25]: 195 | 196 | X = normalize(X) 197 | print(X) 198 | 199 | 200 | # In[26]: 201 | 202 | print('standard deviation') 203 | print(round(X[:,0].std(axis=0),0)) 204 | print('mean') 205 | print(round(X[:,0].mean(axis=0),0)) 206 | 207 | 208 | # In[27]: 209 | 210 | data_array = np.column_stack((X[:,0], X[:,1],y)) 211 | print(data_array) 212 | 213 | 214 | # In[28]: 215 | 216 | # formatting the plot grid, scales, and figure size 217 | plt.figure(figsize=(9, 4), dpi= 75) 218 | # plt.axis([min_x,max_x,min_y,max_y]) 219 | plt.grid() 220 | for i in range(len(data_array)): 221 | value_n = data_array[i] 222 | # assign labels values to specific matrix elements 223 | gender_n = value_n[2] 224 | height_n = value_n[0] 225 | weight_n = value_n[1] 226 | an = plt.scatter(height_n[gender_n==0.0],weight_n[gender_n==0.0], marker = 'x', c= 'b', label = 'Female') 227 | bn = plt.scatter(height_n[gender_n==1.0],weight_n[gender_n==1.0], marker = 'o', c= 'b', label = 'Male') 228 | # plot values, title, legend, x and y axis 229 | plt.title('Weight vs Height by Gender (normalized)') 230 | 
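# (after normalization the axes are in standard-deviation units rather than
#  inches and pounds, so the labels below are kept only for consistency with
#  the earlier unnormalized plot)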
plt.xlabel('Height (in)') 231 | plt.ylabel('Weight (lbs)') 232 | plt.legend(handles=[an,bn]) 233 | 234 | 235 | # In[29]: 236 | 237 | def sigmoid(input): 238 | return 1/(1+np.exp(-input)) 239 | 240 | 241 | # In[30]: 242 | 243 | X = np.arange(-10,10,1) 244 | Y = sigmoid(X) 245 | 246 | 247 | # In[31]: 248 | 249 | plt.figure(figsize=(6, 4), dpi= 75) 250 | plt.axis([-10,10,-0.25,1.2]) 251 | plt.grid() 252 | plt.plot(X,Y) 253 | plt.title('Sigmoid Function') 254 | plt.show() 255 | 256 | 257 | # In[32]: 258 | 259 | def sigmoid_derivative(x): 260 | return sigmoid(x) * (1-sigmoid(x)) 261 | 262 | 263 | # In[33]: 264 | 265 | plt.figure(figsize=(6, 4), dpi= 75) 266 | plt.axis([-10,10,-0.25,1.2]) 267 | plt.grid() 268 | X = np.arange(-10,10,1) 269 | Y = sigmoid(X) 270 | Y_Prime = sigmoid_derivative(X) 271 | plt.plot(X, Y, label="Sigmoid",c='b') 272 | plt.plot(X, Y_Prime, marker=".", label="Sigmoid Derivative", c='b') 273 | plt.title('Sigmoid vs Sigmoid Derivative') 274 | plt.xlabel('X') 275 | plt.ylabel('Y') 276 | plt.legend() 277 | plt.show() 278 | 279 | 280 | # In[34]: 281 | 282 | data_array.shape 283 | 284 | 285 | # In[35]: 286 | 287 | for i in range(100): 288 | random_index = np.random.randint(len(data_array)) 289 | point = data_array[random_index] 290 | print(i, point) 291 | 292 | 293 | # In[36]: 294 | 295 | learning_rate = 0.1 296 | 297 | all_costs = [] 298 | 299 | for i in range(100000): 300 | # set the random data points that will be used to calculate the summation 301 | random_number = np.random.randint(len(data_array)) 302 | random_person = data_array[random_number] 303 | 304 | # the height and weight from the random individual are selected 305 | height = random_person[0] 306 | weight = random_person[1] 307 | 308 | z = w1*height+w2*weight+b 309 | predictedGender = sigmoid(z) 310 | 311 | actualGender = random_person[2] 312 | 313 | cost = (predictedGender-actualGender)**2 314 | 315 | # the cost value is appended to the list 316 | all_costs.append(cost) 317 | 318 | # partial derivatives of the cost function and summation are calculated 319 | dcost_predictedGender = 2 * (predictedGender-actualGender) 320 | dpredictedGenger_dz = sigmoid_derivative(z) 321 | dz_dw1 = height 322 | dz_dw2 = weight 323 | dz_db = 1 324 | 325 | dcost_dw1 = dcost_predictedGender * dpredictedGenger_dz * dz_dw1 326 | dcost_dw2 = dcost_predictedGender * dpredictedGenger_dz * dz_dw2 327 | dcost_db = dcost_predictedGender * dpredictedGenger_dz * dz_db 328 | 329 | # gradient descent calculation 330 | w1 = w1 - learning_rate * dcost_dw1 331 | w2 = w2 - learning_rate * dcost_dw2 332 | b = b - learning_rate * dcost_db 333 | 334 | 335 | # In[37]: 336 | 337 | plt.plot(all_costs) 338 | plt.title('Cost Value over 100,000 iterations') 339 | plt.xlabel('Iteration') 340 | plt.ylabel('Cost Value') 341 | plt.show() 342 | 343 | 344 | # In[38]: 345 | 346 | print('The final values of w1, w2, and b') 347 | print('---------------------------------') 348 | print('w1 = {}'.format(w1)) 349 | print('w2 = {}'.format(w2)) 350 | print('b = {}'.format(b)) 351 | 352 | 353 | # In[39]: 354 | 355 | for i in range(len(data_array)): 356 | random_individual = data_array[i] 357 | height = random_individual[0] 358 | weight = random_individual[1] 359 | z = height*w1 + weight*w2 + b 360 | predictedGender=sigmoid(z) 361 | print("Individual #{} actual score: {} predicted score: {}" 362 | .format(i+1,random_individual[2],predictedGender)) 363 | 364 | 365 | # In[40]: 366 | 367 | def input_normalize(height, weight): 368 | inputHeight = (height - x_mean[0])/x_std[0] 369 
| inputWeight = (weight - x_mean[1])/x_std[1] 370 | return inputHeight, inputWeight 371 | 372 | 373 | # In[41]: 374 | 375 | score = input_normalize(70, 180) 376 | 377 | 378 | # In[42]: 379 | 380 | def predict_gender(raw_score): 381 | gender_summation = raw_score[0]*w1 + raw_score[1]*w2 + b 382 | gender_score = sigmoid(gender_summation) 383 | if gender_score <= 0.5: 384 | gender = 'Female' 385 | else: 386 | gender = 'Male' 387 | return gender, gender_score 388 | 389 | 390 | # In[43]: 391 | 392 | predict_gender(score) 393 | 394 | 395 | # In[44]: 396 | 397 | score = input_normalize(50,120) 398 | 399 | 400 | # In[45]: 401 | 402 | predict_gender(score) 403 | 404 | 405 | # In[46]: 406 | 407 | x_min = min(data_array[:,0])-0.1 408 | x_max = max(data_array[:,0])+0.1 409 | y_min = min(data_array[:,1])-0.1 410 | y_max = max(data_array[:,1])+0.1 411 | increment= 0.05 412 | print(x_min, x_max, y_min, y_max) 413 | 414 | 415 | # In[47]: 416 | 417 | x_data= np.arange(x_min, x_max, increment) 418 | 419 | 420 | # In[48]: 421 | 422 | y_data= np.arange(y_min, y_max, increment) 423 | 424 | 425 | # In[49]: 426 | 427 | xy_data = [[x_all, y_all] for x_all in x_data for y_all in y_data] 428 | 429 | 430 | # In[50]: 431 | 432 | for i in range(len(xy_data)): 433 | data = (xy_data[i]) 434 | height = data[0] 435 | weight = data[1] 436 | z_new = height*w1 + weight*w2 + b 437 | predictedGender_new=sigmoid(z_new) 438 | # print(height, weight, predictedGender_new) 439 | ax = plt.scatter(height[predictedGender_new<=0.5], 440 | weight[predictedGender_new<=0.5], 441 | marker = 'o', c= 'r', label = 'Female') 442 | bx = plt.scatter(height[predictedGender_new > 0.5], 443 | weight[predictedGender_new>0.5], 444 | marker = 'o', c= 'b', label = 'Male') 445 | # plot values, title, legend, x and y axis 446 | plt.title('Weight vs Height by Gender') 447 | plt.xlabel('Height (in)') 448 | plt.ylabel('Weight (lbs)') 449 | plt.legend(handles=[ax,bx]) 450 | 451 | -------------------------------------------------------------------------------- /CH13/code/Image+Classification+with+TensorFlow+on+Spark.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "spark = SparkSession.builder \\\n", 10 | " .master(\"local\") \\\n", 11 | " .appName(\"ImageClassification\") \\\n", 12 | " .config(\"spark.executor.memory\", \"6gb\") \\\n", 13 | " .getOrCreate()" 14 | ] 15 | }, 16 | { 17 | "cell_type": "code", 18 | "execution_count": 2, 19 | "metadata": {}, 20 | "outputs": [ 21 | { 22 | "name": "stderr", 23 | "output_type": "stream", 24 | "text": [ 25 | "Using TensorFlow backend.\n", 26 | "/home/asherif844/anaconda3/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 27 | " return f(*args, **kwds)\n" 28 | ] 29 | } 30 | ], 31 | "source": [ 32 | "import pyspark.sql.functions as f\n", 33 | "import sparkdl as dl" 34 | ] 35 | }, 36 | { 37 | "cell_type": "code", 38 | "execution_count": 3, 39 | "metadata": { 40 | "collapsed": true 41 | }, 42 | "outputs": [], 43 | "source": [ 44 | "dfMessi = dl.readImages('football/messi/').withColumn('label', f.lit(0))\n", 45 | "dfRonaldo = dl.readImages('football/ronaldo/').withColumn('label', f.lit(1))" 46 | ] 47 | }, 48 | { 49 | "cell_type": "code", 50 | "execution_count": 4, 51 | "metadata": {}, 52 | "outputs": [ 53 | { 54 | "name": 
"stdout", 55 | "output_type": "stream", 56 | "text": [ 57 | "+---------------------------------------------------------------------+---------------------------+-----+\n", 58 | "|filePath |image |label|\n", 59 | "+---------------------------------------------------------------------+---------------------------+-----+\n", 60 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi3.jpeg |[RGB,173,292,3,[B@43647d0f]|0 |\n", 61 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi14.jpeg|[RGB,187,270,3,[B@28fe803] |0 |\n", 62 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi29.jpeg|[RGB,194,259,3,[B@669635ee]|0 |\n", 63 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi18.jpeg|[RGB,194,259,3,[B@6e004f55]|0 |\n", 64 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi8.jpeg |[RGB,168,300,3,[B@eecdb9f] |0 |\n", 65 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi22.jpeg|[RGB,194,259,3,[B@73def5b1]|0 |\n", 66 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi2.jpeg |[RGB,275,183,3,[B@24308761]|0 |\n", 67 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi5.jpeg |[RGB,183,275,3,[B@48a60e55]|0 |\n", 68 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi13.jpeg|[RGB,183,275,3,[B@207e14fd]|0 |\n", 69 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/messi/messi11.jpeg|[RGB,175,288,3,[B@24890e3e]|0 |\n", 70 | "+---------------------------------------------------------------------+---------------------------+-----+\n", 71 | "only showing top 10 rows\n", 72 | "\n" 73 | ] 74 | } 75 | ], 76 | "source": [ 77 | "dfMessi.show(n=10,truncate=False)" 78 | ] 79 | }, 80 | { 81 | "cell_type": "code", 82 | "execution_count": 5, 83 | "metadata": {}, 84 | "outputs": [ 85 | { 86 | "name": "stdout", 87 | "output_type": "stream", 88 | "text": [ 89 | "+-------------------------------------------------------------------------+---------------------------+-----+\n", 90 | "|filePath |image |label|\n", 91 | "+-------------------------------------------------------------------------+---------------------------+-----+\n", 92 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo24.jpg |[RGB,350,590,3,[B@7b3b3c6] |1 |\n", 93 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo2.jpeg |[RGB,225,225,3,[B@61826869]|1 |\n", 94 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo21.jpeg|[RGB,193,261,3,[B@1d739c7f]|1 |\n", 95 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo17.jpeg|[RGB,183,275,3,[B@59b36a5b]|1 |\n", 96 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo30.jpeg|[RGB,184,273,3,[B@4304cf28]|1 |\n", 97 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo14.jpeg|[RGB,154,328,3,[B@31b73601]|1 |\n", 98 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo5.jpeg |[RGB,168,300,3,[B@30a6d42c]|1 |\n", 99 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo18.jpeg|[RGB,261,193,3,[B@728581d1]|1 |\n", 100 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo1.jpeg |[RGB,168,300,3,[B@171d6d26]|1 |\n", 101 | "|file:/home/asherif844/sparkNotebooks/Ch14/football/ronaldo/ronaldo19.jpeg|[RGB,258,195,3,[B@1f1256fa]|1 |\n", 102 | "+-------------------------------------------------------------------------+---------------------------+-----+\n", 103 | "only showing top 10 rows\n", 104 | "\n" 105 | ] 106 | } 107 | ], 108 | "source": [ 109 | 
"dfRonaldo.show(n=10,truncate=False)" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": 6, 115 | "metadata": { 116 | "collapsed": true 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "trainDFmessi, testDFmessi = dfMessi.randomSplit([66.7, 33.3], seed =12)\n", 121 | "trainDFronaldo, testDFronaldo = dfRonaldo.randomSplit([66.7, 33.3], seed=12)" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": {}, 128 | "outputs": [ 129 | { 130 | "name": "stdout", 131 | "output_type": "stream", 132 | "text": [ 133 | "The number of images in trainDFmessi is 18\n", 134 | "The number of images in testDFmessi is 12\n", 135 | "The number of images in trainDFronaldo is 18\n", 136 | "The number of images in testDFronaldo is 12\n" 137 | ] 138 | } 139 | ], 140 | "source": [ 141 | "print('The number of images in trainDFmessi is {}'.format(trainDFmessi.toPandas().shape[0]))\n", 142 | "print('The number of images in testDFmessi is {}'.format(testDFmessi.toPandas().shape[0]))\n", 143 | "print('The number of images in trainDFronaldo is {}'.format(trainDFronaldo.toPandas().shape[0]))\n", 144 | "print('The number of images in testDFronaldo is {}'.format(testDFronaldo.toPandas().shape[0]))" 145 | ] 146 | }, 147 | { 148 | "cell_type": "code", 149 | "execution_count": 8, 150 | "metadata": { 151 | "collapsed": true 152 | }, 153 | "outputs": [], 154 | "source": [ 155 | "trainDF = trainDFmessi.unionAll(trainDFronaldo)\n", 156 | "testDF = testDFmessi.unionAll(testDFronaldo)" 157 | ] 158 | }, 159 | { 160 | "cell_type": "code", 161 | "execution_count": 9, 162 | "metadata": {}, 163 | "outputs": [ 164 | { 165 | "name": "stdout", 166 | "output_type": "stream", 167 | "text": [ 168 | "The number of images in the training data is 36\n", 169 | "The number of images in the testing data is 24\n" 170 | ] 171 | } 172 | ], 173 | "source": [ 174 | "print('The number of images in the training data is {}' .format(trainDF.toPandas().shape[0]))\n", 175 | "print('The number of images in the testing data is {}' .format(testDF.toPandas().shape[0]))" 176 | ] 177 | }, 178 | { 179 | "cell_type": "code", 180 | "execution_count": 10, 181 | "metadata": {}, 182 | "outputs": [ 183 | { 184 | "name": "stdout", 185 | "output_type": "stream", 186 | "text": [ 187 | "INFO:tensorflow:Froze 376 variables.\n", 188 | "Converted 376 variables to const ops.\n", 189 | "INFO:tensorflow:Froze 0 variables.\n", 190 | "Converted 0 variables to const ops.\n" 191 | ] 192 | } 193 | ], 194 | "source": [ 195 | "from pyspark.ml.classification import LogisticRegression\n", 196 | "from pyspark.ml import Pipeline\n", 197 | "\n", 198 | "vectorizer = dl.DeepImageFeaturizer(inputCol=\"image\", outputCol=\"features\", modelName='InceptionV3')\n", 199 | "logreg = LogisticRegression(maxIter=30,labelCol = \"label\", featuresCol=\"features\")\n", 200 | "pipeline = Pipeline(stages=[vectorizer, logreg])\n", 201 | "\n", 202 | "pipeline_model = pipeline.fit(trainDF)" 203 | ] 204 | }, 205 | { 206 | "cell_type": "code", 207 | "execution_count": 11, 208 | "metadata": {}, 209 | "outputs": [ 210 | { 211 | "name": "stdout", 212 | "output_type": "stream", 213 | "text": [ 214 | "INFO:tensorflow:Froze 376 variables.\n", 215 | "Converted 376 variables to const ops.\n", 216 | "INFO:tensorflow:Froze 0 variables.\n", 217 | "Converted 0 variables to const ops.\n", 218 | "+-----+----------+\n", 219 | "|label|prediction|\n", 220 | "+-----+----------+\n", 221 | "|0 |1.0 |\n", 222 | "|0 |0.0 |\n", 223 | "|0 |0.0 |\n", 224 | "|0 |0.0 
|\n", 225 | "|0 |0.0 |\n", 226 | "|0 |0.0 |\n", 227 | "|0 |0.0 |\n", 228 | "|0 |1.0 |\n", 229 | "|0 |0.0 |\n", 230 | "|0 |0.0 |\n", 231 | "|0 |0.0 |\n", 232 | "|0 |0.0 |\n", 233 | "|1 |1.0 |\n", 234 | "|1 |1.0 |\n", 235 | "|1 |1.0 |\n", 236 | "|1 |1.0 |\n", 237 | "|1 |1.0 |\n", 238 | "|1 |0.0 |\n", 239 | "|1 |1.0 |\n", 240 | "|1 |1.0 |\n", 241 | "|1 |1.0 |\n", 242 | "|1 |1.0 |\n", 243 | "|1 |1.0 |\n", 244 | "|1 |1.0 |\n", 245 | "+-----+----------+\n", 246 | "\n" 247 | ] 248 | } 249 | ], 250 | "source": [ 251 | "predictDF = pipeline_model.transform(testDF)\n", 252 | "predictDF.select('label', 'prediction').show(n = testDF.toPandas().shape[0], truncate=False)" 253 | ] 254 | }, 255 | { 256 | "cell_type": "code", 257 | "execution_count": 12, 258 | "metadata": {}, 259 | "outputs": [ 260 | { 261 | "name": "stdout", 262 | "output_type": "stream", 263 | "text": [ 264 | "+----------------+---+---+\n", 265 | "|prediction_label| 0| 1|\n", 266 | "+----------------+---+---+\n", 267 | "| 1.0| 2| 11|\n", 268 | "| 0.0| 10| 1|\n", 269 | "+----------------+---+---+\n", 270 | "\n" 271 | ] 272 | } 273 | ], 274 | "source": [ 275 | "predictDF.crosstab('prediction', 'label').show()" 276 | ] 277 | }, 278 | { 279 | "cell_type": "code", 280 | "execution_count": 13, 281 | "metadata": {}, 282 | "outputs": [ 283 | { 284 | "name": "stdout", 285 | "output_type": "stream", 286 | "text": [ 287 | "accuracy: 87.5%\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "from pyspark.ml.evaluation import MulticlassClassificationEvaluator\n", 293 | "scoring = predictDF.select(\"prediction\", \"label\")\n", 294 | "accuracy_score = MulticlassClassificationEvaluator(metricName=\"accuracy\")\n", 295 | "rate = accuracy_score.evaluate(scoring)*100\n", 296 | "print(\"accuracy: {}%\" .format(round(rate,2)))" 297 | ] 298 | }, 299 | { 300 | "cell_type": "code", 301 | "execution_count": 14, 302 | "metadata": {}, 303 | "outputs": [ 304 | { 305 | "name": "stdout", 306 | "output_type": "stream", 307 | "text": [ 308 | "accuracy: 87.5%\n" 309 | ] 310 | } 311 | ], 312 | "source": [ 313 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 314 | "\n", 315 | "binaryevaluator = BinaryClassificationEvaluator(rawPredictionCol=\"prediction\")\n", 316 | "binary_rate = binaryevaluator.evaluate(predictDF)*100\n", 317 | "print(\"accuracy: {}%\" .format(round(binary_rate,2)))" 318 | ] 319 | }, 320 | { 321 | "cell_type": "code", 322 | "execution_count": 15, 323 | "metadata": {}, 324 | "outputs": [ 325 | { 326 | "name": "stdout", 327 | "output_type": "stream", 328 | "text": [ 329 | "INFO:tensorflow:Froze 376 variables.\n", 330 | "Converted 376 variables to const ops.\n", 331 | "INFO:tensorflow:Froze 0 variables.\n", 332 | "Converted 0 variables to const ops.\n" 333 | ] 334 | } 335 | ], 336 | "source": [ 337 | "logregFT = LogisticRegression(\n", 338 | " regParam=0.05, \n", 339 | " elasticNetParam=0.3,\n", 340 | " maxIter=15,labelCol = \"label\", featuresCol=\"features\")\n", 341 | "pipelineFT = Pipeline(stages=[vectorizer, logregFT])\n", 342 | "\n", 343 | "pipeline_model_FT = pipelineFT.fit(trainDF)" 344 | ] 345 | }, 346 | { 347 | "cell_type": "code", 348 | "execution_count": 16, 349 | "metadata": {}, 350 | "outputs": [ 351 | { 352 | "name": "stdout", 353 | "output_type": "stream", 354 | "text": [ 355 | "INFO:tensorflow:Froze 376 variables.\n", 356 | "Converted 376 variables to const ops.\n", 357 | "INFO:tensorflow:Froze 0 variables.\n", 358 | "Converted 0 variables to const ops.\n", 359 | "+----------------+---+---+\n", 360 | 
"|prediction_label| 0| 1|\n", 361 | "+----------------+---+---+\n", 362 | "| 1.0| 0| 11|\n", 363 | "| 0.0| 12| 1|\n", 364 | "+----------------+---+---+\n", 365 | "\n" 366 | ] 367 | } 368 | ], 369 | "source": [ 370 | "predictDF_FT = pipeline_model_FT.transform(testDF)\n", 371 | "predictDF_FT.crosstab('prediction', 'label').show()" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 17, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "accuracy: 95.83%\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "binary_rate_FT = binaryevaluator.evaluate(predictDF_FT)*100\n", 389 | "print(\"accuracy: {}%\" .format(round(binary_rate_FT,2)))" 390 | ] 391 | } 392 | ], 393 | "metadata": { 394 | "kernelspec": { 395 | "display_name": "Python 3", 396 | "language": "python", 397 | "name": "python3" 398 | }, 399 | "language_info": { 400 | "codemirror_mode": { 401 | "name": "ipython", 402 | "version": 3 403 | }, 404 | "file_extension": ".py", 405 | "mimetype": "text/x-python", 406 | "name": "python", 407 | "nbconvert_exporter": "python", 408 | "pygments_lexer": "ipython3", 409 | "version": "3.6.1" 410 | } 411 | }, 412 | "nbformat": 4, 413 | "nbformat_minor": 2 414 | } 415 | -------------------------------------------------------------------------------- /CH03/code/MNIST+with+CNN.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "/home/asherif844/anaconda3/lib/python3.6/importlib/_bootstrap.py:205: RuntimeWarning: compiletime version 3.5 of module 'tensorflow.python.framework.fast_tensor_util' does not match runtime version 3.6\n", 13 | " return f(*args, **kwds)\n" 14 | ] 15 | } 16 | ], 17 | "source": [ 18 | "import tensorflow as tf" 19 | ] 20 | }, 21 | { 22 | "cell_type": "code", 23 | "execution_count": 2, 24 | "metadata": {}, 25 | "outputs": [ 26 | { 27 | "name": "stdout", 28 | "output_type": "stream", 29 | "text": [ 30 | "1.4.1\n" 31 | ] 32 | } 33 | ], 34 | "source": [ 35 | "print(tf.__version__)" 36 | ] 37 | }, 38 | { 39 | "cell_type": "code", 40 | "execution_count": 3, 41 | "metadata": { 42 | "scrolled": true 43 | }, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "Extracting MNIST/train-images-idx3-ubyte.gz\n", 50 | "Extracting MNIST/train-labels-idx1-ubyte.gz\n", 51 | "Extracting MNIST/t10k-images-idx3-ubyte.gz\n", 52 | "Extracting MNIST/t10k-labels-idx1-ubyte.gz\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "from tensorflow.examples.tutorials.mnist import input_data\n", 58 | "data = input_data.read_data_sets('MNIST/', one_hot=True)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": 4, 64 | "metadata": {}, 65 | "outputs": [ 66 | { 67 | "data": { 68 | "text/plain": [ 69 | "['t10k-images-idx3-ubyte.gz',\n", 70 | " 'images',\n", 71 | " 't10k-labels-idx1-ubyte.gz',\n", 72 | " 'train-labels-idx1-ubyte.gz',\n", 73 | " 'train-images-idx3-ubyte.gz']" 74 | ] 75 | }, 76 | "execution_count": 4, 77 | "metadata": {}, 78 | "output_type": "execute_result" 79 | } 80 | ], 81 | "source": [ 82 | "import os\n", 83 | "os.listdir('MNIST/')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 
95 | "Image Inventory\n", 96 | "----------\n", 97 | "Training: 55000\n", 98 | "Testing: 10000\n", 99 | "----------\n" 100 | ] 101 | } 102 | ], 103 | "source": [ 104 | "print('Image Inventory')\n", 105 | "print('----------')\n", 106 | "print('Training: {}'.format(len(data.train.labels)))\n", 107 | "print('Testing: {}'.format(len(data.test.labels)))\n", 108 | "print('----------')" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": 6, 114 | "metadata": { 115 | "collapsed": true 116 | }, 117 | "outputs": [], 118 | "source": [ 119 | "import numpy as np\n", 120 | "import matplotlib.pyplot as plt\n", 121 | "%matplotlib inline" 122 | ] 123 | }, 124 | { 125 | "cell_type": "code", 126 | "execution_count": 7, 127 | "metadata": { 128 | "scrolled": false 129 | }, 130 | "outputs": [ 131 | { 132 | "name": "stdout", 133 | "output_type": "stream", 134 | "text": [ 135 | "-----------------\n", 136 | "[0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]\n" 137 | ] 138 | }, 139 | { 140 | "data": { 141 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADZNJREFUeJzt3X+IXfWZx/HP05ig2OKPrTsMJut0/JnqH1MdpVIpXWuK\nSiEWJHbANaulUyVbjERYcYXNH/5RStJYECpTDI2lpq3UapTSNROEbMhaTSQ7469WtyQkMeaH0WSC\nYmt89o85tqPO+d7rPefcc2ae9wuGufc858fDZT5zzr3n3PM1dxeAeD5TdwMA6kH4gaAIPxAU4QeC\nIvxAUIQfCIrwA0ERfiAowg8EdUI3N2ZmXE4IVMzdrZ35Cu35zexqM/ujmb1mZncVWReA7rJOr+03\nszmS/iRpkaQ9kp6TNOTuLyWWYc8PVKwbe/7LJL3m7n92979I+qWkxQXWB6CLioT/TEm7pzzfk037\nCDMbNrNtZratwLYAlKzyD/zcfUTSiMRhP9AkRfb8eyUtmPJ8fjYNwAxQJPzPSTrXzL5gZvMkfVvS\nhnLaAlC1jg/73f19M/s3Sf8laY6kte7+YmmdAahUx6f6OtoY7/mBynXlIh8AMxfhB4Ii/EBQhB8I\nivADQRF+ICjCDwRF+IGgCD8QFOEHgiL8QFCEHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCIvxAUIQf\nCIrwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IivADQXU8RLckmdlOSROSjkt6390Hy2gKQPUK\nhT/zz+5+qIT1AOgiDvuBoIqG3yWNmtl2MxsuoyEA3VH0sP8Kd99rZv8oaaOZveLum6fOkP1T4B8D\n0DDm7uWsyGylpGPuvioxTzkbA5DL3a2d+To+7Dezk83scx8+lvQNSS90uj4A3VXksL9H0m/N7MP1\nPOzuvy+lKwCVK+2wv62NcdgPVK7yw34AMxvhB4Ii/EBQhB8IivADQRF+IKgyvtWHmt188825tVan\nct98881kfeHChcn61q1bk/UtW7Yk66gPe34gKMIPBEX4gaAIPxAU4QeCIvxAUIQfCGrWnOcfGhpK\n1i+++OJkPXWuvOlOPfXUjpc9fvx4sj5v3rxk/d13303W33nnndza+Ph4ctklS5Yk6wcPHkzWkcae\nHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeCmlG37l69enVu7fbbb08uO2fOnCKbRg2efvrpZL3VtR37\n9+8vs50Zg1t3A0gi/EBQhB8IivADQRF+ICjCDwRF+IGgWp7nN7O1kr4p6YC7X5RNO13SryT1Sdop\naYm7v9VyYwXP8+/evTu3Nn/+/OSyY2NjyXqr76VXqdW97R977LEudfLpLVq0KFm/6aabcmt9fX2F\ntt3qOoAbbrghtzab7wVQ5nn+n0m6+mPT7pK0yd3PlbQpew5gBmkZfnffLOnwxyYvlrQue7xO0nUl\n9wWgYp2+5+9x933Z4zck9ZTUD4AuKXwPP3f31Ht5MxuWNFx0OwDK1emef7+Z9UpS9vtA3ozuPuLu\ng+4+2OG2AFSg0/BvkLQ0e7xU0uPltAOgW1qG38zWS/ofSeeb2R4z+46kH0haZGavSroqew5gBplR\n3+c/77zzcmsXXnhhctnR0dFkfWJioqOekNbf359be/LJJ5PLLly4sNC277zzztxa6t4QMx3f5weQ\nRPiBoAg/EBThB4Ii/EBQhB8Iakad6sPscv311yfrjzzySKH1Hzp0KLd2xhlnFFp3k3GqD0AS4QeC\nIvxAUIQfCIrwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IivADQRF+ICjCDwRVeLguIOW2227L\nrV166aWVbvvEE0/MrV1yySXJZbdv3152O43Dnh8IivADQRF+ICjCDwRF+IGgCD8QFOEHgmp5334z\nWyvpm5IOuPtF2bSVkr4r6WA2293u/ruWG+O+/ZXo7e3Nrd14443JZZcvX152Ox+R6s2srdvLV+Lo\n0aPJ+imnnNKlTspX5n37fybp6mmmr3H3geynZfABNEvL8Lv7ZkmHu9ALgC4q8p7/+2Y2ZmZrzey0\n0joC0BWdhv8nkvolDUjaJ2l13oxmNmxm28xsW4fbAlCBjsLv7vvd/bi7fyDpp5IuS8w74u6D7j7Y\naZMAytdR+M1s6ke435L0QjntAOiWll/pNbP1kr4m6fNmtkfSf0r6mpkNSHJJOyV9r8IeAVSgZfjd\nfWiayQ9W0EtYV111VbLe6rvnw8PDubX+/v6Oeprt1q5dW3cLteMKPyAowg8ERfiBoAg/EBThB4Ii\n/EBQ3Lq7BOecc06y/sADDyTrV155ZbJe5Vdfd+3alay/9dZbhdZ/zz335Nbee++95LL3339/sn7+\n+ed31JMkvf766x0vO1uw5weCIvxAUIQfCIrwA0ERfiAowg8ERfiBoDjP36Y77rgjt7Zs2bLksmef\nfXayfuzYsWT97bffTtbvu+++3Fqr89lbt25N1ltdB1
ClI0eOFFp+YmIit/bEE08UWvdswJ4fCIrw\nA0ERfiAowg8ERfiBoAg/EBThB4LiPH+bLr/88txaq/P4GzZsSNZXr84d7UyStHnz5mR9phoYGEjW\nzzrrrELrT90v4JVXXim07tmAPT8QFOEHgiL8QFCEHwiK8ANBEX4gKMIPBNXyPL+ZLZD0kKQeSS5p\nxN1/bGanS/qVpD5JOyUtcfdiN3lvsFtvvTW3NjY2llz23nvvLbudWaHVeAc9PT2F1j86Olpo+dmu\nnT3/+5JWuPsXJX1Z0jIz+6KkuyRtcvdzJW3KngOYIVqG3933ufvz2eMJSS9LOlPSYknrstnWSbqu\nqiYBlO9Tvec3sz5JX5L0B0k97r4vK72hybcFAGaItq/tN7PPSvqNpOXufnTq+HHu7mbmOcsNSxou\n2iiAcrW15zezuZoM/i/c/dFs8n4z683qvZIOTLesu4+4+6C7D5bRMIBytAy/Te7iH5T0srv/aEpp\ng6Sl2eOlkh4vvz0AVTH3aY/W/z6D2RWS/lvSuKQPssl3a/J9/68l/ZOkXZo81Xe4xbrSG0Moq1at\nStZXrFiRrLe6pfk111yTW3vmmWeSy85k7t7WmO4t3/O7+xZJeSv7+qdpCkBzcIUfEBThB4Ii/EBQ\nhB8IivADQRF+IChu3Y1KjY+P59YuuOCCQut+6qmnkvXZfC6/DOz5gaAIPxAU4QeCIvxAUIQfCIrw\nA0ERfiAozvOjUn19fbm1E05I//kdOXIkWV+zZk0nLSHDnh8IivADQRF+ICjCDwRF+IGgCD8QFOEH\nguI8PwoZGhpK1k866aTc2sTERHLZ4eH0KG98X78Y9vxAUIQfCIrwA0ERfiAowg8ERfiBoAg/EJS5\ne3oGswWSHpLUI8kljbj7j81spaTvSjqYzXq3u/+uxbrSG0PjzJ07N1l/9tlnk/XUvfnXr1+fXPaW\nW25J1jE9d7d25mvnIp/3Ja1w9+fN7HOStpvZxqy2xt1XddokgPq0DL+775O0L3s8YWYvSzqz6sYA\nVOtTvec3sz5JX5L0h2zS981szMzWmtlpOcsMm9k2M9tWqFMApWo7/Gb2WUm/kbTc3Y9K+omkfkkD\nmjwyWD3dcu4+4u6D7j5YQr8AStJW+M1sriaD/wt3f1SS3H2/ux939w8k/VTSZdW1CaBsLcNvZibp\nQUkvu/uPpkzvnTLbtyS9UH57AKrSzqf9X5H0L5LGzWxHNu1uSUNmNqDJ0387JX2vkg5Rq1angh9+\n+OFkfceOHbm1jRs35tZQvXY+7d8iabrzhslz+gCajSv8gKAIPxAU4QeCIvxAUIQfCIrwA0G1/Epv\nqRvjK71A5dr9Si97fiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IqttDdB+StGvK889n05qoqb01tS+J\n3jpVZm9ntTtjVy/y+cTGzbY19d5+Te2tqX1J9NapunrjsB8IivADQdUd/pGat5/S1N6a2pdEb52q\npbda3/MDqE/de34ANakl/GZ2tZn90cxeM7O76ughj5ntNLNxM9tR9xBj2TBoB8zshSnTTjezjWb2\navZ72mHSauptpZntzV67HWZ2bU29LTCzp83sJTN70cxuz6bX+tol+qrldev6Yb+ZzZH0J0mLJO2R\n9JykIXd/qauN5DCznZIG3b32c8Jm9lVJxyQ95O4XZdN+KOmwu/8g+8d5mrv/e0N6WynpWN0jN2cD\nyvROHVla0nWS/lU1vnaJvpaohtetjj3/ZZJec/c/u/tfJP1S0uIa+mg8d98s6fDHJi+WtC57vE6T\nfzxdl9NbI7j7Pnd/Pns8IenDkaVrfe0SfdWijvCfKWn3lOd71Kwhv13SqJltN7PhupuZRk82bLok\nvSGpp85mptFy5OZu+tjI0o157ToZ8bpsfOD3SVe4+4CkayQtyw5vG8kn37M16XRNWyM3d8s0I0v/\nTZ2vXacjXpetjvDvlbRgyvP52bRGcPe92e8Dkn6r5o0+vP/DQVKz3wdq7udvmjRy83QjS6sBr12T\nRryuI/zPSTrXzL5gZvMkfVvShhr6+AQzOzn7IEZmdrKkb6h5ow9vkLQ0e7xU0uM19vIRTRm5OW9k\nadX82jVuxGt37/qPpGs1+Yn//0n6jzp6yOmrX9L/Zj8v1t2bpPWaPAz8qyY/G/mOpH+QtEnSq5JG\nJZ3eoN5+Lmlc0pgmg9ZbU29XaPKQfkzSjuzn2rpfu0RftbxuXOEHBMUHfkBQhB8IivADQRF+ICjC\nDwRF+IGgCD8QFOEHgvp/zdVX5KPezC0AAAAASUVORK5CYII=\n", 142 | "text/plain": [ 143 | "" 144 | ] 145 | }, 146 | "metadata": {}, 147 | "output_type": "display_data" 148 | }, 149 | { 150 | "name": "stdout", 151 | "output_type": "stream", 152 | "text": [ 153 | "-----------------\n", 154 | "[0. 0. 0. 1. 0. 0. 0. 0. 0. 
0.]\n" 155 | ] 156 | }, 157 | { 158 | "data": { 159 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAP8AAAD8CAYAAAC4nHJkAAAABHNCSVQICAgIfAhkiAAAAAlwSFlz\nAAALEgAACxIB0t1+/AAADhBJREFUeJzt3V2MVPUZx/HfU7E36IWydCWKiyYGo16gWUkvkGisKMYE\nuDG+xNBUWWOsKdqL4kusCYqmqVa4QddIxMa3BthIDNYoaZAmDeHNKu6CWoMCQRbERI0XVvfpxRya\nVff8zzBzZs4sz/eTbHbmPHNmHo/748yZ/5zzN3cXgHh+VnUDAKpB+IGgCD8QFOEHgiL8QFCEHwiK\n8ANBEX4gKMIPBDWhnS9mZnydEGgxd7d6HtfUnt/MrjGzPWb2kZktaea5ALSXNfrdfjM7SdIHkq6S\ntF/SVkk3uvtgYh32/ECLtWPPP1PSR+7+sbt/K+llSfOaeD4AbdRM+M+UtG/U/f3Zsh8wsz4z22Zm\n25p4LQAla/kHfu7eL6lf4m0/0Ema2fMfkDR11P2zsmUAxoFmwr9V0nlmdo6Z/VzSDZLWl9MWgFZr\n+G2/u39nZr+V9IakkyStcvf3S+sMQEs1PNTX0ItxzA+0XFu+5ANg/CL8QFCEHwiK8ANBEX4gKMIP\nBEX4gaAIPxAU4QeCIvxAUIQfCIrwA0ERfiAowg8ERfiBoAg/EBThB4Ii/EBQhB8IivADQRF+ICjC\nDwRF+IGgCD8QFOEHgiL8QFCEHwiK8ANBEX4gqIan6JYkM9sr6StJ30v6zt17y2gK7dPT05Os33bb\nbcn6/fffn6ynZoE2S08mOzQ0lKw/8MADyfrAwECyHl1T4c9c4e5HSngeAG3E234gqGbD75LeMrPt\nZtZXRkMA2qPZt/2z3P2Amf1C0ptmttvd3x79gOwfBf5hADpMU3t+dz+Q/R6WNCBp5hiP6Xf3Xj4M\nBDpLw+E3s4lmduqx25LmSNpVVmMAWquZt/3dkgay4ZoJkl5097+X0hWAlrPUOGzpL2bWvhcLZPLk\nybm1e++9N7nuzTffnKxPmjQpWS8aq29mnL/ob3Pfvn3J+qWXXppbO3LkxB2ddvf0hs0w1AcERfiB\noAg/EBThB4Ii/EBQhB8IiqG+caDotNmlS5fm1or+/7Z6uO3w4cPJekpXV1eyPm3atGR9cHAwt3bh\nhRc20tK4wFAfgCTCDwRF+IGgCD8QFOEHgiL8QFCEHwiKcf5xYOvWrcn6JZdckltrdpw/NVYuSVdc\ncUWy3syps7NmzUrWN23alKyn/tsnTCjjwtWdiXF+AEmEHwiK8ANBEX4gKMIPBEX4gaAIPxAU4/wd\n4Pzzz0/Wi8b5P//889xa0fn0RePwd999d7K+ePHiZH3ZsmW5tU8//TS5bpGiv92RkZHc2h133JFc\nt7+/v6GeOgHj/ACSCD8QFOEHgiL8QFCEHwiK8ANBEX4gqMJxfjNbJek6ScPuflG27HRJr0iaJmmv\npOvd/YvCF2OcvyFF3wNIjdU3OxV1X19fsr5y5cpkPTVN9o4dO5LrLliwIFlfs2ZNsp762z7jjDOS\n647nKbzLHOd/TtI1P1q2RNJGdz9P0sbsPoBxpDD87v62pKM/WjxP0urs9mpJ80vuC0CLNXrM3+3u\nB7Pbn0nqLqkfAG3S9IXM3N1Tx/Jm1icpfeAIoO0a3fMfMrMpkpT9Hs57oLv3u3uvu/c2+FoAWqDR\n8K+XtDC7vVDSq+W0A6BdCsNvZi9J+pek6Wa238xulfSYpKvM7ENJv8ruAxhHCo/53f3GnNKVJfeC\nHLt3767stYuuB7Bnz55kPXWtgaJrBSxZkh5BLppzoJXffzgR8A0/ICjCDwRF+IGgCD8QFOEHgiL8\nQFAn7jzFgcyePTu3VnQ6cNFQ3tDQULI+ffr0ZH3Lli25tcmTJyfXLTrdvKj3uXPnJuvRsecHgiL8\nQFCEHwiK8ANBEX4gKMIPBEX4gaAY5z8B3HTTTbm1RYsWJdctOi22jku7J+upsfxmTsmVpBUrViTr\nRZcGj449PxAU4QeCIvxAUIQfCIrwA0ERfiAowg8ExTj/Ca5onL7K9Tdv3pxc95577knWGcdvDnt+\nICjCDwRF+IGgCD8QFOEHgiL8QFCEHwiqcJzfzFZJuk7SsLtflC17SNIiSccunH6fu29oVZNIe/HF\nF3NrPT09yXW7urqS9aLr/k+cODFZT3nwwQeTdcbxW6uePf9zkq4ZY/lf3H1G9kPwgXGmMPzu/rak\no23oBUAbNXPMf5eZvWtmq8zstNI6AtAWjYZ/paRzJc2QdFDS43kPNLM+M9tmZtsafC0ALdBQ+N39\nkLt/7+4jkp6RNDPx2H5373X33kabBFC+hsJvZlNG3V0gaVc57QBol3qG+l6SdLmkLjPbL+mPki43\nsxmSXNJeSbe3sEcALWDNnq99XC9m1r4XQymKxvkffvjhZH3+/Pm5tZ07dybXnTt3brJedF3/qNw9\nPSFChm/4AUERfiAowg8ERfiBoAg/EBThB4JiqK9OqammDx8+nFuL7vXXX8+tXX311cl1iy7d/eST\nTzbU04mOoT4ASYQfCIrwA0ERfiAowg8ERfiBoAg/EBRTdGdmz56drD/+eO6VyrR79+7kurfccktD\nPZ0IHnnkkdzanDlzkutOnz697HYwCnt+ICjCDwRF+IGgCD8QFOEHgiL8QFCEHwgqzDh/6nx8SXrq\nqaeS9eHh4dxa5HH8oim6n3766dyaWV2nnaNF2PMDQRF+ICjCDwRF+IGgCD8QFOEHgiL8QFCF4/xm\nNlXS85K6JbmkfndfbmanS3pF0jRJeyVd7+5ftK7V5ixYsCBZLzp3fNOmTWW2M24UTdG9du3aZD21\nXYvmjCi6TgKaU8+e/ztJv3f3CyT9UtKdZnaBpCWSNrr7eZI2ZvcBjBOF4Xf3g+6+I7v9laQhSWdK\nmidpdfaw1ZLmt6pJAOU7rmN+M5sm6WJJWyR1u/vBrPSZaocFAMaJur/bb2anSForabG7fzn6e9nu\n7nnz8JlZn6S+ZhsFUK669vxmdrJqwX/B3ddliw+Z2ZSsPkXSmGe+uHu/u/e6e28ZDQMoR2H4rbaL\nf1bSkLs/Maq0XtLC7PZCSa+W3x6AVimcotvMZknaLOk9SSPZ4vtUO+7/m6SzJX2i2lDf0YLnqmyK\n7qIhq6GhoWR9cHAwt/boo4829dzbt29P1ov09PTk1i677LLkukVDoPPnpz/HLTotN/X3tXz58uS6\nRVN0Y2z1TtFdeMzv7v+UlPdkVx5PUwA6B9/wA4Ii/EBQhB8IivADQRF+ICjCDwRVOM5f6otVOM5f\nZM2aNcl6ary7mbFuSdq5c2eyXuTss8/OrU2aNCm5brO9F62fmqJ7xYoVyXWPHDmSrGNs9Y7zs+cH\ngiL8QFCEHwiK8ANBEX4gKMIPBEX4gaAY588UTeG9YcOG3Fpvb/oiRSM
jI8l6K8fai9b95ptvkvWi\ny2cvW7YsWR8YGEjWUT7G+QEkEX4gKMIPBEX4gaAIPxAU4QeCIvxAUIzz16mrqyu3tnTp0qaeu68v\nPZvZunXrkvVmznsvunY+02SPP4zzA0gi/EBQhB8IivADQRF+ICjCDwRF+IGgCsf5zWyqpOcldUty\nSf3uvtzMHpK0SNLh7KH3uXv+Se8a3+P8wHhR7zh/PeGfImmKu+8ws1MlbZc0X9L1kr529z/X2xTh\nB1qv3vBPqOOJDko6mN3+ysyGJJ3ZXHsAqnZcx/xmNk3SxZK2ZIvuMrN3zWyVmZ2Ws06fmW0zs21N\ndQqgVHV/t9/MTpG0SdIj7r7OzLolHVHtc4Clqh0a/KbgOXjbD7RYacf8kmRmJ0t6TdIb7v7EGPVp\nkl5z94sKnofwAy1W2ok9Vrs07LOShkYHP/sg8JgFknYdb5MAqlPPp/2zJG2W9J6kY9egvk/SjZJm\nqPa2f6+k27MPB1PPxZ4faLFS3/aXhfADrcf5/ACSCD8QFOEHgiL8QFCEHwiK8ANBEX4gKMIPBEX4\ngaAIPxAU4QeCIvxAUIQfCIrwA0EVXsCzZEckfTLqfle2rBN1am+d2pdEb40qs7eeeh/Y1vP5f/Li\nZtvcvbeyBhI6tbdO7Uuit0ZV1Rtv+4GgCD8QVNXh76/49VM6tbdO7Uuit0ZV0lulx/wAqlP1nh9A\nRSoJv5ldY2Z7zOwjM1tSRQ95zGyvmb1nZu9UPcVYNg3asJntGrXsdDN708w+zH6POU1aRb09ZGYH\nsm33jpldW1FvU83sH2Y2aGbvm9nvsuWVbrtEX5Vst7a/7TezkyR9IOkqSfslbZV0o7sPtrWRHGa2\nV1Kvu1c+JmxmsyV9Len5Y7MhmdmfJB1198eyfzhPc/c/dEhvD+k4Z25uUW95M0v/WhVuuzJnvC5D\nFXv+mZI+cveP3f1bSS9LmldBHx3P3d+WdPRHi+dJWp3dXq3aH0/b5fTWEdz9oLvvyG5/JenYzNKV\nbrtEX5WoIvxnSto36v5+ddaU3y7pLTPbbmZ9VTczhu5RMyN9Jqm7ymbGUDhzczv9aGbpjtl2jcx4\nXTY+8PupWe4+Q9JcSXdmb287kteO2TppuGalpHNVm8btoKTHq2wmm1l6raTF7v7l6FqV226MvirZ\nblWE/4CkqaPun5Ut6wjufiD7PSxpQLXDlE5y6Ngkqdnv4Yr7+T93P+Tu37v7iKRnVOG2y2aWXivp\nBXdfly2ufNuN1VdV262K8G+VdJ6ZnWNmP5d0g6T1FfTxE2Y2MfsgRmY2UdIcdd7sw+slLcxuL5T0\naoW9/ECnzNycN7O0Kt52HTfjtbu3/UfStap94v8fSfdX0UNOX+dK+nf2837VvUl6SbW3gf9V7bOR\nWyVNkrRR0oeS3pJ0egf19lfVZnN+V7WgTamot1mqvaV/V9I72c+1VW+7RF+VbDe+4QcExQd+QFCE\nHwiK8ANBEX4gKMIPBEX4gaAIPxAU4QeC+h9PPuXddgFbfgAAAABJRU5ErkJggg==\n", 160 | "text/plain": [ 161 | "" 162 | ] 163 | }, 164 | "metadata": {}, 165 | "output_type": "display_data" 166 | } 167 | ], 168 | "source": [ 169 | "for i in range(2):\n", 170 | " image = data.train.images[i]\n", 171 | " image = np.array(image, dtype='float')\n", 172 | " label = data.train.labels[i]\n", 173 | " pixels = image.reshape((28, 28))\n", 174 | " plt.imshow(pixels, cmap='gray')\n", 175 | " print('-----------------')\n", 176 | " print(label)\n", 177 | " plt.show()\n", 178 | " " 179 | ] 180 | }, 181 | { 182 | "cell_type": "code", 183 | "execution_count": 8, 184 | "metadata": { 185 | "collapsed": true 186 | }, 187 | "outputs": [], 188 | "source": [ 189 | "if not os.path.exists('MNIST/images'):\n", 190 | " os.makedirs('MNIST/images/')\n", 191 | "os.chdir('MNIST/images/')" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 9, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "from matplotlib import image\n", 203 | "for i in range(1,10):\n", 204 | " png = data.train.images[i]\n", 205 | " png = np.array(png, dtype='float')\n", 206 | " pixels = png.reshape((28, 28))\n", 207 | " image.imsave('image_no_{}.png'.format(i), pixels, cmap = 'gray')" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 10, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "['image_no_9.png', 'image_no_3.png', 'image_no_4.png', 'image_no_7.png', 'output', 'image_no_2.png', 'image_no_5.png', 'image_no_8.png', 'image_no_1.png', 'image_no_6.png']\n" 220 | ] 221 | } 222 | ], 223 | "source": [ 224 | "print(os.listdir())" 225 | ] 226 | }, 227 | { 228 | "cell_type": "code", 229 | "execution_count": 11, 230 | "metadata": { 231 | "collapsed": true 232 | }, 233 | "outputs": [], 234 | "source": [ 235 | "from Augmentor import Pipeline" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 
12, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Initialised with 9 image(s) found.\n", 248 | "Output directory set to /home/asherif844/sparkNotebooks/Ch03/MNIST/images/output." 249 | ] 250 | } 251 | ], 252 | "source": [ 253 | "augmentor = Pipeline('/home/asherif844/sparkNotebooks/Ch03/MNIST/images')" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 13, 259 | "metadata": { 260 | "collapsed": true 261 | }, 262 | "outputs": [], 263 | "source": [ 264 | "augmentor.rotate(probability=0.9, max_left_rotation=25, max_right_rotation=25)" 265 | ] 266 | }, 267 | { 268 | "cell_type": "code", 269 | "execution_count": 14, 270 | "metadata": {}, 271 | "outputs": [ 272 | { 273 | "name": "stderr", 274 | "output_type": "stream", 275 | "text": [ 276 | "Processing : 100%|██████████| 10/10 [00:00<00:00, 160.13 Samples/s]\n", 277 | "Processing : 100%|██████████| 10/10 [00:00<00:00, 125.24 Samples/s]\n" 278 | ] 279 | } 280 | ], 281 | "source": [ 282 | "for i in range(1,3):\n", 283 | " augmentor.sample(10)" 284 | ] 285 | }, 286 | { 287 | "cell_type": "code", 288 | "execution_count": 15, 289 | "metadata": { 290 | "collapsed": true 291 | }, 292 | "outputs": [], 293 | "source": [ 294 | "xtrain = data.train.images\n", 295 | "ytrain = np.asarray(data.train.labels)\n", 296 | "xtest = data.test.images \n", 297 | "ytest = np.asarray(data.test.labels)" 298 | ] 299 | }, 300 | { 301 | "cell_type": "code", 302 | "execution_count": 16, 303 | "metadata": {}, 304 | "outputs": [], 305 | "source": [ 306 | "xtrain = xtrain.reshape( xtrain.shape[0],28,28,1)\n", 307 | "xtest = xtest.reshape(xtest.shape[0],28,28,1)\n", 308 | "ytest= ytest.reshape(ytest.shape[0],10)\n", 309 | "ytrain = ytrain.reshape(ytrain.shape[0],10)" 310 | ] 311 | }, 312 | { 313 | "cell_type": "code", 314 | "execution_count": 17, 315 | "metadata": {}, 316 | "outputs": [ 317 | { 318 | "name": "stdout", 319 | "output_type": "stream", 320 | "text": [ 321 | "(55000, 28, 28, 1)\n", 322 | "(55000, 10)\n", 323 | "(10000, 28, 28, 1)\n", 324 | "(10000, 10)\n" 325 | ] 326 | } 327 | ], 328 | "source": [ 329 | "print(xtrain.shape)\n", 330 | "print(ytrain.shape)\n", 331 | "print(xtest.shape)\n", 332 | "print(ytest.shape)" 333 | ] 334 | }, 335 | { 336 | "cell_type": "code", 337 | "execution_count": 18, 338 | "metadata": {}, 339 | "outputs": [ 340 | { 341 | "name": "stderr", 342 | "output_type": "stream", 343 | "text": [ 344 | "Using TensorFlow backend.\n" 345 | ] 346 | } 347 | ], 348 | "source": [ 349 | "import keras\n", 350 | "import keras.backend as K\n", 351 | "from keras.models import Sequential\n", 352 | "from keras.layers import Dense, Flatten, Conv2D\n", 353 | "\n", 354 | "K.set_image_dim_ordering('tf')\n", 355 | "\n", 356 | "model = Sequential()\n", 357 | "\n", 358 | "model.add(Conv2D(32, kernel_size=(5, 5),activation='relu', input_shape=(28,28,1)))\n", 359 | "model.add(Flatten())\n", 360 | "model.add(Dense(128, activation='relu'))\n", 361 | "model.add(Dense(10, activation='sigmoid'))\n" 362 | ] 363 | }, 364 | { 365 | "cell_type": "code", 366 | "execution_count": 19, 367 | "metadata": {}, 368 | "outputs": [], 369 | "source": [ 370 | "model.compile(optimizer='adam',loss='categorical_crossentropy', \n", 371 | " metrics=['accuracy'])\n" 372 | ] 373 | }, 374 | { 375 | "cell_type": "code", 376 | "execution_count": 20, 377 | "metadata": {}, 378 | "outputs": [ 379 | { 380 | "name": "stdout", 381 | "output_type": "stream", 382 | "text": [ 383 | "Train on 55000 samples, 
validate on 10000 samples\n", 384 | "Epoch 1/5\n", 385 | "55000/55000 [==============================] - 46s 832us/step - loss: 0.3617 - acc: 0.9032 - val_loss: 0.1214 - val_acc: 0.9651\n", 386 | "Epoch 2/5\n", 387 | "55000/55000 [==============================] - 44s 797us/step - loss: 0.0928 - acc: 0.9731 - val_loss: 0.0809 - val_acc: 0.9770\n", 388 | "Epoch 3/5\n", 389 | "55000/55000 [==============================] - 44s 796us/step - loss: 0.0555 - acc: 0.9837 - val_loss: 0.0521 - val_acc: 0.9839\n", 390 | "Epoch 4/5\n", 391 | "55000/55000 [==============================] - 42s 756us/step - loss: 0.0410 - acc: 0.9881 - val_loss: 0.0521 - val_acc: 0.9823\n", 392 | "Epoch 5/5\n", 393 | "55000/55000 [==============================] - 43s 782us/step - loss: 0.0309 - acc: 0.9909 - val_loss: 0.0457 - val_acc: 0.9861\n" 394 | ] 395 | }, 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "" 400 | ] 401 | }, 402 | "execution_count": 20, 403 | "metadata": {}, 404 | "output_type": "execute_result" 405 | } 406 | ], 407 | "source": [ 408 | "model.fit(xtrain,ytrain,batch_size=512,\n", 409 | " epochs=5,\n", 410 | " validation_data=(xtest, ytest))" 411 | ] 412 | }, 413 | { 414 | "cell_type": "code", 415 | "execution_count": 21, 416 | "metadata": {}, 417 | "outputs": [ 418 | { 419 | "name": "stdout", 420 | "output_type": "stream", 421 | "text": [ 422 | "10000/10000 [==============================] - 3s 324us/step\n", 423 | "The accuracy rate is 98.6%\n", 424 | "The loss rate is 5.0%\n" 425 | ] 426 | } 427 | ], 428 | "source": [ 429 | "stats = model.evaluate(xtest, ytest)\n", 430 | "print('The accuracy rate is {}%'.format(round(stats[1],3)*100))\n", 431 | "print('The loss rate is {}%'.format(round(stats[0],2)*100))" 432 | ] 433 | }, 434 | { 435 | "cell_type": "code", 436 | "execution_count": 22, 437 | "metadata": {}, 438 | "outputs": [ 439 | { 440 | "name": "stdout", 441 | "output_type": "stream", 442 | "text": [ 443 | "_________________________________________________________________\n", 444 | "Layer (type) Output Shape Param # \n", 445 | "=================================================================\n", 446 | "conv2d_1 (Conv2D) (None, 24, 24, 32) 832 \n", 447 | "_________________________________________________________________\n", 448 | "flatten_1 (Flatten) (None, 18432) 0 \n", 449 | "_________________________________________________________________\n", 450 | "dense_1 (Dense) (None, 128) 2359424 \n", 451 | "_________________________________________________________________\n", 452 | "dense_2 (Dense) (None, 10) 1290 \n", 453 | "=================================================================\n", 454 | "Total params: 2,361,546\n", 455 | "Trainable params: 2,361,546\n", 456 | "Non-trainable params: 0\n", 457 | "_________________________________________________________________\n" 458 | ] 459 | } 460 | ], 461 | "source": [ 462 | "model.summary()" 463 | ] 464 | }, 465 | { 466 | "cell_type": "code", 467 | "execution_count": null, 468 | "metadata": { 469 | "collapsed": true 470 | }, 471 | "outputs": [], 472 | "source": [] 473 | } 474 | ], 475 | "metadata": { 476 | "kernelspec": { 477 | "display_name": "Python 3", 478 | "language": "python", 479 | "name": "python3" 480 | }, 481 | "language_info": { 482 | "codemirror_mode": { 483 | "name": "ipython", 484 | "version": 3 485 | }, 486 | "file_extension": ".py", 487 | "mimetype": "text/x-python", 488 | "name": "python", 489 | "nbconvert_exporter": "python", 490 | "pygments_lexer": "ipython3", 491 | "version": "3.6.1" 492 | } 493 | }, 494 | "nbformat": 4, 495 | 
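# --- Editor's aside (not part of the original CH03 notebook) -------------------------
# The network above ends in a sigmoid output layer while being compiled with
# categorical_crossentropy, and the evaluation cell prints the loss scaled by 100 even
# though cross-entropy is not a percentage. A minimal sketch of the more conventional
# setup for 10-class MNIST is shown below; the layer sizes and the model_alt name are
# illustrative assumptions, not the author's code.
from keras.models import Sequential
from keras.layers import Conv2D, MaxPooling2D, Flatten, Dense

model_alt = Sequential([
    Conv2D(32, kernel_size=(5, 5), activation='relu', input_shape=(28, 28, 1)),
    MaxPooling2D(pool_size=(2, 2)),   # shrinks the 24x24 feature maps to 12x12
    Flatten(),
    Dense(128, activation='relu'),
    Dense(10, activation='softmax')   # probability distribution over the 10 digit classes
])
model_alt.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# -------------------------------------------------------------------------------------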
"nbformat_minor": 2 496 | } 497 | -------------------------------------------------------------------------------- /CH06/code/CH06_LSTMs+word+level.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [ 8 | { 9 | "name": "stderr", 10 | "output_type": "stream", 11 | "text": [ 12 | "Using TensorFlow backend.\n" 13 | ] 14 | } 15 | ], 16 | "source": [ 17 | "from numpy import array\n", 18 | "from pickle import dump\n", 19 | "from keras.preprocessing.text import Tokenizer\n", 20 | "from keras.utils import to_categorical\n", 21 | "from keras.models import Sequential\n", 22 | "from keras.layers import Dense\n", 23 | "from keras.layers import LSTM\n", 24 | "from keras.layers import Embedding" 25 | ] 26 | }, 27 | { 28 | "cell_type": "code", 29 | "execution_count": 2, 30 | "metadata": {}, 31 | "outputs": [ 32 | { 33 | "data": { 34 | "text/plain": [ 35 | "'/Users/Chanti'" 36 | ] 37 | }, 38 | "execution_count": 2, 39 | "metadata": {}, 40 | "output_type": "execute_result" 41 | } 42 | ], 43 | "source": [ 44 | "pwd" 45 | ] 46 | }, 47 | { 48 | "cell_type": "code", 49 | "execution_count": 3, 50 | "metadata": {}, 51 | "outputs": [ 52 | { 53 | "name": "stdout", 54 | "output_type": "stream", 55 | "text": [ 56 | "/Users/Chanti/Desktop/Cookbook/Chapter 8\n" 57 | ] 58 | } 59 | ], 60 | "source": [ 61 | "cd '/Users/Chanti/Desktop/Cookbook/Chapter 8'" 62 | ] 63 | }, 64 | { 65 | "cell_type": "code", 66 | "execution_count": 4, 67 | "metadata": {}, 68 | "outputs": [ 69 | { 70 | "data": { 71 | "text/plain": [ 72 | "'/Users/Chanti/Desktop/Cookbook/Chapter 8'" 73 | ] 74 | }, 75 | "execution_count": 4, 76 | "metadata": {}, 77 | "output_type": "execute_result" 78 | } 79 | ], 80 | "source": [ 81 | "pwd" 82 | ] 83 | }, 84 | { 85 | "cell_type": "code", 86 | "execution_count": 5, 87 | "metadata": {}, 88 | "outputs": [], 89 | "source": [ 90 | "# load doc into memory\n", 91 | "def load_document(name):\n", 92 | " file = open(name, 'r')\n", 93 | " text = file.read()\n", 94 | " file.close()\n", 95 | " return text" 96 | ] 97 | }, 98 | { 99 | "cell_type": "code", 100 | "execution_count": 6, 101 | "metadata": {}, 102 | "outputs": [ 103 | { 104 | "name": "stdout", 105 | "output_type": "stream", 106 | "text": [ 107 | "The Project Gutenberg EBook of The Jungle Book, by Rudyard Kipling\n", 108 | "\n", 109 | "This eBook is for the use of anyone anywhere at no cost and with\n", 110 | "almost no restrictions whatsoever. 
You may copy it, give it away or\n", 111 | "re-use it under the terms of the Project Gutenberg License included\n", 112 | "with this eBook or online at www.gutenberg.org\n", 113 | "\n", 114 | "\n", 115 | "Title: The Jungle Book\n", 116 | "\n", 117 | "Author: Rudyard Kipling\n", 118 | "\n", 119 | "Release Date: January 16, 2006 [EBook #236]\n", 120 | "Last Updated: October 6, 2016\n", 121 | "\n", 122 | "Language: English\n", 123 | "\n", 124 | "Character set encoding: UTF-8\n", 125 | "\n", 126 | "*** START OF THIS PROJECT GUTENBERG EBOOK THE JUNGLE BOOK ***\n", 127 | "\n", 128 | "\n", 129 | "\n", 130 | "\n", 131 | "Produced by An Anonymous Volunteer and David Widger\n", 132 | "\n", 133 | "\n", 134 | "\n", 135 | "\n", 136 | "\n", 137 | "THE JUNGLE BOOK\n", 138 | "\n", 139 | "By Rudyard Kipling\n", 140 | "\n", 141 | "\n", 142 | "\n", 143 | "Contents\n", 144 | "\n", 145 | " Mowgli’s Brothers\n", 146 | " Hunting-Song of the Seeonee Pack\n", 147 | " Kaa’s Hunting\n", 148 | " Road-Song of the Bandar-Log\n", 149 | " “Tiger! Tiger!”\n", 150 | " Mowgli’s Song\n", 151 | " The White Seal\n", 152 | " Lukannon\n", 153 | " “Rikki-Tikki-Tavi”\n", 154 | " Darzee’s Chant\n", 155 | " Toomai of the Elephants\n", 156 | " Shiv and the Grasshopper\n", 157 | " Her Majesty’s Servants\n", 158 | " Parade Song of the Camp Animals\n", 159 | "\n", 160 | "\n", 161 | "\n", 162 | "\n", 163 | "Mowgli’s Brothers\n", 164 | "\n", 165 | " Now Rann the Kite brings home the night\n", 166 | " That Mang the Bat sets free--\n", 167 | " The herds are shut in byre and hut\n", 168 | " For loosed till dawn are we.\n", 169 | " This is the hour of pride and power,\n", 170 | " Talon and tush and claw.\n", 171 | " Oh, hear the call!--Good hunting all\n", 172 | " That keep the Jungle Law!\n", 173 | " Night-Song in the Jungle\n", 174 | "\n", 175 | "It was seven o’clock of a very warm evening in the Seeonee hills when\n", 176 | "Father Wolf woke up from his day’s rest, scratched himself, yawned, and\n", 177 | "spread out his paws one after the other to get rid of the sleepy feeling\n", 178 | "in their tips. Mother Wolf lay with her big gray nose dropped across her\n", 179 | "four tumbling, squealing cubs, and the moon shone into the mouth of the\n", 180 | "cave where they all lived. “Augrh!” said Father Wolf. “It is time to\n", 181 | "hunt again.” He was going to spring down hill when a little shadow with\n", 182 | "a bushy tail crossed the threshold and whined: “Good luck go with you, O\n", 183 | "Chief of the Wolves. 
And good luck and\n" 184 | ] 185 | } 186 | ], 187 | "source": [ 188 | "# load document\n", 189 | "input_filename = 'junglebook.txt'\n", 190 | "doc = load_document(input_filename)\n", 191 | "print(doc[:2000])" 192 | ] 193 | }, 194 | { 195 | "cell_type": "code", 196 | "execution_count": 7, 197 | "metadata": { 198 | "collapsed": true 199 | }, 200 | "outputs": [], 201 | "source": [ 202 | "import string\n", 203 | " \n", 204 | "# turn a document into clean tokens\n", 205 | "def clean_document(doc):\n", 206 | " doc = doc.replace('--', ' ')\n", 207 | " tokens = doc.split()\n", 208 | " table = str.maketrans('', '', string.punctuation)\n", 209 | " tokens = [w.translate(table) for w in tokens]\n", 210 | " tokens = [word for word in tokens if word.isalpha()]\n", 211 | " tokens = [word.lower() for word in tokens]\n", 212 | " return tokens" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": 8, 218 | "metadata": {}, 219 | "outputs": [ 220 | { 221 | "name": "stdout", 222 | "output_type": "stream", 223 | "text": [ 224 | "['project', 'gutenberg', 'ebook', 'of', 'the', 'jungle', 'book', 'by', 'rudyard', 'kipling', 'this', 'ebook', 'is', 'for', 'the', 'use', 'of', 'anyone', 'anywhere', 'at', 'no', 'cost', 'and', 'with', 'almost', 'no', 'restrictions', 'whatsoever', 'you', 'may', 'copy', 'it', 'give', 'it', 'away', 'or', 'reuse', 'it', 'under', 'the', 'terms', 'of', 'the', 'project', 'gutenberg', 'license', 'included', 'with', 'this', 'ebook', 'or', 'online', 'at', 'wwwgutenbergorg', 'title', 'the', 'jungle', 'book', 'author', 'rudyard', 'kipling', 'release', 'date', 'january', 'ebook', 'last', 'updated', 'october', 'language', 'english', 'character', 'set', 'encoding', 'start', 'of', 'this', 'project', 'gutenberg', 'ebook', 'the', 'jungle', 'book', 'produced', 'by', 'an', 'anonymous', 'volunteer', 'and', 'david', 'widger', 'the', 'jungle', 'book', 'by', 'rudyard', 'kipling', 'contents', 'brothers', 'huntingsong', 'of', 'the', 'seeonee', 'pack', 'hunting', 'roadsong', 'of', 'the', 'bandarlog', 'song', 'the', 'white', 'seal', 'lukannon', 'chant', 'toomai', 'of', 'the', 'elephants', 'shiv', 'and', 'the', 'grasshopper', 'her', 'servants', 'parade', 'song', 'of', 'the', 'camp', 'animals', 'brothers', 'now', 'rann', 'the', 'kite', 'brings', 'home', 'the', 'night', 'that', 'mang', 'the', 'bat', 'sets', 'free', 'the', 'herds', 'are', 'shut', 'in', 'byre', 'and', 'hut', 'for', 'loosed', 'till', 'dawn', 'are', 'we', 'this', 'is', 'the', 'hour', 'of', 'pride', 'and', 'power', 'talon', 'and', 'tush', 'and', 'claw', 'oh', 'hear', 'the', 'call', 'good', 'hunting', 'all', 'that', 'keep', 'the', 'jungle', 'law', 'nightsong', 'in', 'the', 'jungle', 'it', 'was', 'seven', 'of', 'a', 'very', 'warm', 'evening', 'in', 'the', 'seeonee', 'hills']\n", 225 | "Total Tokens: 51473\n", 226 | "Unique Tokens: 5027\n" 227 | ] 228 | } 229 | ], 230 | "source": [ 231 | "# clean document\n", 232 | "tokens = clean_document(doc)\n", 233 | "print(tokens[:200])\n", 234 | "print('Total Tokens: %d' % len(tokens))\n", 235 | "print('Unique Tokens: %d' % len(set(tokens)))" 236 | ] 237 | }, 238 | { 239 | "cell_type": "code", 240 | "execution_count": 9, 241 | "metadata": {}, 242 | "outputs": [ 243 | { 244 | "name": "stdout", 245 | "output_type": "stream", 246 | "text": [ 247 | "Total Sequences: 51422\n" 248 | ] 249 | } 250 | ], 251 | "source": [ 252 | "# organize into sequences (of length 50) of tokens\n", 253 | "length = 50 + 1\n", 254 | "sequences = list()\n", 255 | "for i in range(length, len(tokens)):\n", 256 | " # select 
sequence of tokens\n", 257 | " seq = tokens[i-length:i]\n", 258 | " # convert into a line\n", 259 | " line = ' '.join(seq)\n", 260 | " sequences.append(line)\n", 261 | "print('Total Sequences: %d' % len(sequences))" 262 | ] 263 | }, 264 | { 265 | "cell_type": "code", 266 | "execution_count": 10, 267 | "metadata": { 268 | "collapsed": true 269 | }, 270 | "outputs": [], 271 | "source": [ 272 | "# save tokens to file, one dialog per line\n", 273 | "def save_document(lines, name):\n", 274 | " data = '\\n'.join(lines)\n", 275 | " file = open(name, 'w')\n", 276 | " file.write(data)\n", 277 | " file.close()" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": 11, 283 | "metadata": { 284 | "collapsed": true 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "# save sequences to file\n", 289 | "output_filename = 'junglebook_sequences.txt'\n", 290 | "save_document(sequences, output_filename)" 291 | ] 292 | }, 293 | { 294 | "cell_type": "code", 295 | "execution_count": 12, 296 | "metadata": { 297 | "collapsed": true 298 | }, 299 | "outputs": [], 300 | "source": [ 301 | "# load document into memory\n", 302 | "def load_document(name):\n", 303 | " file = open(name, 'r')\n", 304 | " text = file.read()\n", 305 | " file.close()\n", 306 | " return text\n", 307 | " \n", 308 | "# load\n", 309 | "input_filename = 'junglebook_sequences.txt'\n", 310 | "doc = load_document(input_filename)\n", 311 | "lines = doc.split('\\n')" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": 13, 317 | "metadata": { 318 | "collapsed": true 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "# integer encode sequences of words\n", 323 | "tokenizer = Tokenizer()\n", 324 | "tokenizer.fit_on_texts(lines)\n", 325 | "sequences = tokenizer.texts_to_sequences(lines)" 326 | ] 327 | }, 328 | { 329 | "cell_type": "code", 330 | "execution_count": 14, 331 | "metadata": { 332 | "collapsed": true 333 | }, 334 | "outputs": [], 335 | "source": [ 336 | "# vocabulary size\n", 337 | "vocab_size = len(tokenizer.word_index) + 1 " 338 | ] 339 | }, 340 | { 341 | "cell_type": "code", 342 | "execution_count": 15, 343 | "metadata": { 344 | "collapsed": true 345 | }, 346 | "outputs": [], 347 | "source": [ 348 | "# separate into input and output\n", 349 | "sequences = array(sequences)\n", 350 | "Input, Output = sequences[:,:-1], sequences[:,-1]\n", 351 | "Output = to_categorical(Output, num_classes=vocab_size)\n", 352 | "sequence_length = Input.shape[1]" 353 | ] 354 | }, 355 | { 356 | "cell_type": "code", 357 | "execution_count": 16, 358 | "metadata": {}, 359 | "outputs": [ 360 | { 361 | "name": "stdout", 362 | "output_type": "stream", 363 | "text": [ 364 | "_________________________________________________________________\n", 365 | "Layer (type) Output Shape Param # \n", 366 | "=================================================================\n", 367 | "embedding_1 (Embedding) (None, 50, 100) 502800 \n", 368 | "_________________________________________________________________\n", 369 | "lstm_1 (LSTM) (None, 50, 200) 240800 \n", 370 | "_________________________________________________________________\n", 371 | "lstm_2 (LSTM) (None, 200) 320800 \n", 372 | "_________________________________________________________________\n", 373 | "dropout_1 (Dropout) (None, 200) 0 \n", 374 | "_________________________________________________________________\n", 375 | "dense_1 (Dense) (None, 200) 40200 \n", 376 | "_________________________________________________________________\n", 377 | "dense_2 (Dense) (None, 5028) 
1010628 \n", 378 | "=================================================================\n", 379 | "Total params: 2,115,228\n", 380 | "Trainable params: 2,115,228\n", 381 | "Non-trainable params: 0\n", 382 | "_________________________________________________________________\n", 383 | "None\n" 384 | ] 385 | } 386 | ], 387 | "source": [ 388 | "# define model\n", 389 | "from keras.layers import Dropout\n", 390 | "model = Sequential()\n", 391 | "model.add(Embedding(vocab_size, 100, input_length=sequence_length))\n", 392 | "model.add(LSTM(200, return_sequences=True))\n", 393 | "model.add(LSTM(200))\n", 394 | "model.add(Dropout(0.3))\n", 395 | "model.add(Dense(200, activation='relu'))\n", 396 | "model.add(Dense(vocab_size, activation='softmax'))\n", 397 | "print(model.summary())" 398 | ] 399 | }, 400 | { 401 | "cell_type": "code", 402 | "execution_count": 17, 403 | "metadata": {}, 404 | "outputs": [ 405 | { 406 | "name": "stdout", 407 | "output_type": "stream", 408 | "text": [ 409 | "Epoch 1/75\n", 410 | "51422/51422 [==============================] - 517s 10ms/step - loss: 6.6069 - acc: 0.0682\n", 411 | "Epoch 2/75\n", 412 | "51422/51422 [==============================] - 501s 10ms/step - loss: 6.2250 - acc: 0.0721\n", 413 | "Epoch 3/75\n", 414 | "51422/51422 [==============================] - 494s 10ms/step - loss: 6.0805 - acc: 0.0827\n", 415 | "Epoch 4/75\n", 416 | "51422/51422 [==============================] - 453s 9ms/step - loss: 5.9354 - acc: 0.0911\n", 417 | "Epoch 5/75\n", 418 | "51422/51422 [==============================] - 451s 9ms/step - loss: 5.8014 - acc: 0.1025\n", 419 | "Epoch 6/75\n", 420 | "51422/51422 [==============================] - 448s 9ms/step - loss: 5.6800 - acc: 0.1126\n", 421 | "Epoch 7/75\n", 422 | "51422/51422 [==============================] - 448s 9ms/step - loss: 5.5646 - acc: 0.1198\n", 423 | "Epoch 8/75\n", 424 | "51422/51422 [==============================] - 447s 9ms/step - loss: 5.4614 - acc: 0.1267\n", 425 | "Epoch 9/75\n", 426 | "51422/51422 [==============================] - 447s 9ms/step - loss: 5.3677 - acc: 0.1315\n", 427 | "Epoch 10/75\n", 428 | "51422/51422 [==============================] - 449s 9ms/step - loss: 5.2885 - acc: 0.1342\n", 429 | "Epoch 11/75\n", 430 | "51422/51422 [==============================] - 450s 9ms/step - loss: 5.2218 - acc: 0.1380\n", 431 | "Epoch 12/75\n", 432 | "51422/51422 [==============================] - 449s 9ms/step - loss: 5.1429 - acc: 0.1402\n", 433 | "Epoch 13/75\n", 434 | "51422/51422 [==============================] - 449s 9ms/step - loss: 5.0917 - acc: 0.1424\n", 435 | "Epoch 14/75\n", 436 | "51422/51422 [==============================] - 448s 9ms/step - loss: 5.0171 - acc: 0.1452\n", 437 | "Epoch 15/75\n", 438 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.9520 - acc: 0.1473\n", 439 | "Epoch 16/75\n", 440 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.8880 - acc: 0.1506\n", 441 | "Epoch 17/75\n", 442 | "51422/51422 [==============================] - 447s 9ms/step - loss: 4.8307 - acc: 0.1551\n", 443 | "Epoch 18/75\n", 444 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.8129 - acc: 0.1550\n", 445 | "Epoch 19/75\n", 446 | "51422/51422 [==============================] - 450s 9ms/step - loss: 4.7857 - acc: 0.1548\n", 447 | "Epoch 20/75\n", 448 | "51422/51422 [==============================] - 449s 9ms/step - loss: 4.7032 - acc: 0.1593\n", 449 | "Epoch 21/75\n", 450 | "51422/51422 [==============================] - 450s 9ms/step - 
loss: 4.6548 - acc: 0.1600\n", 451 | "Epoch 22/75\n", 452 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.5812 - acc: 0.1629\n", 453 | "Epoch 23/75\n", 454 | "51422/51422 [==============================] - 447s 9ms/step - loss: 4.5474 - acc: 0.1641\n", 455 | "Epoch 24/75\n", 456 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.4725 - acc: 0.1664\n", 457 | "Epoch 25/75\n", 458 | "51422/51422 [==============================] - 447s 9ms/step - loss: 4.5027 - acc: 0.1659\n", 459 | "Epoch 26/75\n", 460 | "51422/51422 [==============================] - 449s 9ms/step - loss: 4.4486 - acc: 0.1674\n", 461 | "Epoch 27/75\n", 462 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.3099 - acc: 0.1745\n", 463 | "Epoch 28/75\n", 464 | "51422/51422 [==============================] - 452s 9ms/step - loss: 4.2418 - acc: 0.1782\n", 465 | "Epoch 29/75\n", 466 | "51422/51422 [==============================] - 462s 9ms/step - loss: 4.2303 - acc: 0.1788\n", 467 | "Epoch 30/75\n", 468 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.1416 - acc: 0.1838\n", 469 | "Epoch 31/75\n", 470 | "51422/51422 [==============================] - 450s 9ms/step - loss: 4.0701 - acc: 0.1886\n", 471 | "Epoch 32/75\n", 472 | "51422/51422 [==============================] - 448s 9ms/step - loss: 4.0057 - acc: 0.1921\n", 473 | "Epoch 33/75\n", 474 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.9404 - acc: 0.1977\n", 475 | "Epoch 34/75\n", 476 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.8961 - acc: 0.2004\n", 477 | "Epoch 35/75\n", 478 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.8313 - acc: 0.2064\n", 479 | "Epoch 36/75\n", 480 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.7746 - acc: 0.2139\n", 481 | "Epoch 37/75\n", 482 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.7493 - acc: 0.2157\n", 483 | "Epoch 38/75\n", 484 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.6876 - acc: 0.2225\n", 485 | "Epoch 39/75\n", 486 | "51422/51422 [==============================] - 447s 9ms/step - loss: 3.6356 - acc: 0.2274\n", 487 | "Epoch 40/75\n", 488 | "51422/51422 [==============================] - 451s 9ms/step - loss: 3.5717 - acc: 0.2344\n", 489 | "Epoch 41/75\n", 490 | "51422/51422 [==============================] - 447s 9ms/step - loss: 3.5353 - acc: 0.2374\n", 491 | "Epoch 42/75\n", 492 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.4846 - acc: 0.2462\n", 493 | "Epoch 43/75\n", 494 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.4388 - acc: 0.2502\n", 495 | "Epoch 44/75\n", 496 | "51422/51422 [==============================] - 455s 9ms/step - loss: 3.3920 - acc: 0.2545\n", 497 | "Epoch 45/75\n", 498 | "51422/51422 [==============================] - 453s 9ms/step - loss: 3.3505 - acc: 0.2589\n", 499 | "Epoch 46/75\n", 500 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.3113 - acc: 0.2662\n", 501 | "Epoch 47/75\n", 502 | "51422/51422 [==============================] - 448s 9ms/step - loss: 3.3232 - acc: 0.2664\n", 503 | "Epoch 48/75\n", 504 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.2610 - acc: 0.2730\n", 505 | "Epoch 49/75\n", 506 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.2505 - acc: 0.2748\n", 507 | "Epoch 50/75\n", 508 | "51422/51422 
[==============================] - 450s 9ms/step - loss: 3.2856 - acc: 0.2771\n", 509 | "Epoch 51/75\n", 510 | "51422/51422 [==============================] - 450s 9ms/step - loss: 3.1842 - acc: 0.2860\n", 511 | "Epoch 52/75\n", 512 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.1172 - acc: 0.2948\n", 513 | "Epoch 53/75\n", 514 | "51422/51422 [==============================] - 455s 9ms/step - loss: 3.1662 - acc: 0.2918\n", 515 | "Epoch 54/75\n", 516 | "51422/51422 [==============================] - 454s 9ms/step - loss: 3.4129 - acc: 0.2656\n", 517 | "Epoch 55/75\n", 518 | "51422/51422 [==============================] - 449s 9ms/step - loss: 3.3144 - acc: 0.2733\n", 519 | "Epoch 56/75\n", 520 | "51422/51422 [==============================] - 551s 11ms/step - loss: 3.2530 - acc: 0.2807\n", 521 | "Epoch 57/75\n", 522 | "51422/51422 [==============================] - 539s 10ms/step - loss: 3.1926 - acc: 0.2868\n", 523 | "Epoch 58/75\n", 524 | "51422/51422 [==============================] - 532s 10ms/step - loss: 3.1441 - acc: 0.2928\n", 525 | "Epoch 59/75\n", 526 | "51422/51422 [==============================] - 529s 10ms/step - loss: 3.0970 - acc: 0.2979\n", 527 | "Epoch 60/75\n", 528 | "51422/51422 [==============================] - 541s 11ms/step - loss: 3.0582 - acc: 0.3036\n", 529 | "Epoch 61/75\n", 530 | "51422/51422 [==============================] - 524s 10ms/step - loss: 3.0121 - acc: 0.3111\n", 531 | "Epoch 62/75\n", 532 | "51422/51422 [==============================] - 530s 10ms/step - loss: 2.9672 - acc: 0.3175\n", 533 | "Epoch 63/75\n", 534 | "51422/51422 [==============================] - 532s 10ms/step - loss: 2.9369 - acc: 0.3231\n", 535 | "Epoch 64/75\n", 536 | "51422/51422 [==============================] - 544s 11ms/step - loss: 2.8845 - acc: 0.3300\n", 537 | "Epoch 65/75\n", 538 | "51422/51422 [==============================] - 579s 11ms/step - loss: 2.8595 - acc: 0.3357\n", 539 | "Epoch 66/75\n", 540 | "51422/51422 [==============================] - 525s 10ms/step - loss: 2.8161 - acc: 0.3400\n", 541 | "Epoch 67/75\n", 542 | "51422/51422 [==============================] - 458s 9ms/step - loss: 2.7810 - acc: 0.3441\n", 543 | "Epoch 68/75\n", 544 | "51422/51422 [==============================] - 516s 10ms/step - loss: 2.7346 - acc: 0.3547\n", 545 | "Epoch 69/75\n", 546 | "51422/51422 [==============================] - 522s 10ms/step - loss: 2.7065 - acc: 0.3570\n", 547 | "Epoch 70/75\n", 548 | "51422/51422 [==============================] - 458s 9ms/step - loss: 2.6710 - acc: 0.3642\n", 549 | "Epoch 71/75\n", 550 | "51422/51422 [==============================] - 449s 9ms/step - loss: 2.6264 - acc: 0.3716\n", 551 | "Epoch 72/75\n", 552 | "51422/51422 [==============================] - 450s 9ms/step - loss: 2.6027 - acc: 0.3766\n", 553 | "Epoch 73/75\n", 554 | "51422/51422 [==============================] - 461s 9ms/step - loss: 2.5761 - acc: 0.3784\n", 555 | "Epoch 74/75\n", 556 | "51422/51422 [==============================] - 454s 9ms/step - loss: 2.5370 - acc: 0.3874\n", 557 | "Epoch 75/75\n", 558 | "51422/51422 [==============================] - 450s 9ms/step - loss: 2.5038 - acc: 0.3938\n" 559 | ] 560 | }, 561 | { 562 | "data": { 563 | "text/plain": [ 564 | "" 565 | ] 566 | }, 567 | "execution_count": 17, 568 | "metadata": {}, 569 | "output_type": "execute_result" 570 | } 571 | ], 572 | "source": [ 573 | "# compile model\n", 574 | "model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])\n", 575 | "# fit model\n", 
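# (Editor's aside) Input holds 51,422 integer-encoded sequences of 50 words each and
# Output is the one-hot next word over the 5,028-word vocabulary, so the call below
# trains on 51,422 samples in batches of 250 for 75 epochs; at the ~450-550 s per epoch
# logged above, that is roughly ten hours of training, ending near 39% next-word accuracy.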
576 | "model.fit(Input, Output, batch_size=250, epochs=75)" 577 | ] 578 | }, 579 | { 580 | "cell_type": "code", 581 | "execution_count": 18, 582 | "metadata": { 583 | "collapsed": true 584 | }, 585 | "outputs": [], 586 | "source": [ 587 | "# save the model to file\n", 588 | "model.save('junglebook_trained.h5')\n", 589 | "# save the tokenizer\n", 590 | "dump(tokenizer, open('tokenizer.pkl', 'wb'))" 591 | ] 592 | }, 593 | { 594 | "cell_type": "code", 595 | "execution_count": 19, 596 | "metadata": {}, 597 | "outputs": [], 598 | "source": [ 599 | "# load doc into memory\n", 600 | "def load_document(name):\n", 601 | " file = open(name, 'r')\n", 602 | " text = file.read()\n", 603 | " file.close()\n", 604 | " return text\n", 605 | " \n", 606 | "# load cleaned text sequences\n", 607 | "input_filename = 'junglebook_sequences.txt'\n", 608 | "doc = load_document(input_filename)\n", 609 | "lines = doc.split('\\n')" 610 | ] 611 | }, 612 | { 613 | "cell_type": "code", 614 | "execution_count": 20, 615 | "metadata": { 616 | "collapsed": true 617 | }, 618 | "outputs": [], 619 | "source": [ 620 | "sequence_length = len(lines[0].split()) - 1" 621 | ] 622 | }, 623 | { 624 | "cell_type": "code", 625 | "execution_count": 21, 626 | "metadata": {}, 627 | "outputs": [], 628 | "source": [ 629 | "# load the model\n", 630 | "from keras.models import load_model\n", 631 | "model = load_model('junglebook_trained.h5')" 632 | ] 633 | }, 634 | { 635 | "cell_type": "code", 636 | "execution_count": 22, 637 | "metadata": {}, 638 | "outputs": [ 639 | { 640 | "name": "stdout", 641 | "output_type": "stream", 642 | "text": [ 643 | "to me not long ago with some rude talk that i was a naked cub and not fit to dig pignuts but i caught tabaqui by the tail and swung him twice against a palmtree to teach him better was foolishness for though tabaqui is a mischiefmaker he would have told\n", 644 | "\n" 645 | ] 646 | } 647 | ], 648 | "source": [ 649 | "# select a seed text\n", 650 | "from random import randint\n", 651 | "seed_text = lines[randint(0,len(lines))]\n", 652 | "print(seed_text + '\\n')" 653 | ] 654 | }, 655 | { 656 | "cell_type": "code", 657 | "execution_count": 23, 658 | "metadata": { 659 | "collapsed": true 660 | }, 661 | "outputs": [], 662 | "source": [ 663 | "encoded = tokenizer.texts_to_sequences([seed_text])[0]" 664 | ] 665 | }, 666 | { 667 | "cell_type": "code", 668 | "execution_count": 24, 669 | "metadata": { 670 | "collapsed": true 671 | }, 672 | "outputs": [], 673 | "source": [ 674 | "from random import randint\n", 675 | "from pickle import load\n", 676 | "from keras.models import load_model\n", 677 | "from keras.preprocessing.sequence import pad_sequences\n", 678 | " \n", 679 | "# load doc into memory\n", 680 | "def load_document(name):\n", 681 | " file = open(name, 'r')\n", 682 | " text = file.read()\n", 683 | " file.close()\n", 684 | " return text\n", 685 | " \n", 686 | "# generate a sequence from a language model\n", 687 | "def generate_sequence(model, tokenizer, sequence_length, seed_text, n_words):\n", 688 | "\tresult = list()\n", 689 | "\tinput_text = seed_text\n", 690 | "\t# generate a fixed number of words\n", 691 | "\tfor _ in range(n_words):\n", 692 | "\t\t# encode the text as integer\n", 693 | "\t\tencoded = tokenizer.texts_to_sequences([input_text])[0]\n", 694 | "\t\t# truncate sequences to a fixed length\n", 695 | "\t\tencoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')\n", 696 | "\t\t# predict probabilities for each word\n", 697 | "\t\tprediction = model.predict_classes(encoded, 
verbose=0)\n", 698 | "\t\t# map predicted word index to word\n", 699 | "\t\tout_word = ''\n", 700 | "\t\tfor word, index in tokenizer.word_index.items():\n", 701 | "\t\t\tif index == prediction:\n", 702 | "\t\t\t\tout_word = word\n", 703 | "\t\t\t\tbreak\n", 704 | "\t\t# append to input\n", 705 | "\t\tinput_text += ' ' + out_word\n", 706 | "\t\tresult.append(out_word)\n", 707 | "\treturn ' '.join(result)\n", 708 | " \n", 709 | "# load cleaned text sequences\n", 710 | "input_filename = 'junglebook_sequences.txt'\n", 711 | "doc = load_document(input_filename)\n", 712 | "lines = doc.split('\\n')\n", 713 | "seq_length = len(lines[0].split()) - 1" 714 | ] 715 | }, 716 | { 717 | "cell_type": "code", 718 | "execution_count": 25, 719 | "metadata": {}, 720 | "outputs": [ 721 | { 722 | "name": "stdout", 723 | "output_type": "stream", 724 | "text": [ 725 | "baskets of dried grass and put grasshoppers in them or catch two praying mantises and make them fight or string a necklace of red and black jungle nuts or watch a lizard basking on a rock or a snake hunting a frog near the wallows then they sing long long songs\n", 726 | "\n", 727 | "with odd native quavers at the end of the review and the hyaena whom he had seen the truth they feel twitched to the noises round him for a picture of the end of the ravine and snuffing bitten and best of the bulls at the dawn is a native\n" 728 | ] 729 | } 730 | ], 731 | "source": [ 732 | "# load the model\n", 733 | "model = load_model('junglebook_trained.h5')\n", 734 | " \n", 735 | "# load the tokenizer\n", 736 | "tokenizer = load(open('tokenizer.pkl', 'rb'))\n", 737 | " \n", 738 | "# select a seed text\n", 739 | "seed_text = lines[randint(0,len(lines))]\n", 740 | "print(seed_text + '\\n')\n", 741 | " \n", 742 | "# generate new text\n", 743 | "generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50)\n", 744 | "print(generated)" 745 | ] 746 | }, 747 | { 748 | "cell_type": "code", 749 | "execution_count": 26, 750 | "metadata": {}, 751 | "outputs": [ 752 | { 753 | "name": "stdout", 754 | "output_type": "stream", 755 | "text": [ 756 | "little toomai there was a splash and a trample and the rush of running water and kala nag strode through the bed of a river feeling his way at each step above the noise of the water as it swirled round the legs little toomai could hear more splashing and some\n", 757 | "\n", 758 | "trumpeting both upstream and down grass and knocked him up to the jealous moon he could see bruised of dust for the potter was rann caught him up to the plowed din of the melbourne lines where the two wolves would be forced to make themselves rifles and the sparks\n" 759 | ] 760 | } 761 | ], 762 | "source": [ 763 | "# load the model\n", 764 | "model = load_model('junglebook_trained.h5')\n", 765 | " \n", 766 | "# load the tokenizer\n", 767 | "tokenizer = load(open('tokenizer.pkl', 'rb'))\n", 768 | " \n", 769 | "# select a seed text\n", 770 | "seed_text = lines[randint(0,len(lines))]\n", 771 | "print(seed_text + '\\n')\n", 772 | " \n", 773 | "# generate new text\n", 774 | "generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50)\n", 775 | "print(generated)" 776 | ] 777 | }, 778 | { 779 | "cell_type": "code", 780 | "execution_count": 29, 781 | "metadata": {}, 782 | "outputs": [ 783 | { 784 | "name": "stdout", 785 | "output_type": "stream", 786 | "text": [ 787 | "is in their legs and he remembered the good firm beaches of novastoshnah seven thousand miles away the games his companions played the smell of the seaweed the 
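# --- Editor's aside (not part of the original CH06 notebook) -------------------------
# In generate_sequence above, the padding call uses the module-level variable seq_length
# rather than the sequence_length parameter, so the function only works because
# seq_length happens to be defined before it is called. A self-contained variant that
# pads with the parameter itself is sketched below; the _fixed suffix is illustrative.
from keras.preprocessing.sequence import pad_sequences

def generate_sequence_fixed(model, tokenizer, sequence_length, seed_text, n_words):
    result = list()
    input_text = seed_text
    for _ in range(n_words):
        # encode the running text and keep only the last sequence_length words
        encoded = tokenizer.texts_to_sequences([input_text])[0]
        encoded = pad_sequences([encoded], maxlen=sequence_length, truncating='pre')
        # predict the index of the most likely next word
        prediction = model.predict_classes(encoded, verbose=0)
        out_word = ''
        for word, index in tokenizer.word_index.items():
            if index == prediction:
                out_word = word
                break
        # append the word and slide the generation window forward
        input_text += ' ' + out_word
        result.append(out_word)
    return ' '.join(result)
# -------------------------------------------------------------------------------------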
seal roar and the fighting that very minute he turned north swimming steadily and as he went on he met scores of his\n", 788 | "\n", 789 | "mates and bound like the deck of the fighters and harness under his breath and he could not be able to stop a ship and ducked to nag wound up with scores of marble tracery showing all the regiments went twisting his head and shoulders and creepers very seldom shows\n" 790 | ] 791 | } 792 | ], 793 | "source": [ 794 | "# load the model\n", 795 | "model = load_model('junglebook_trained.h5')\n", 796 | " \n", 797 | "# load the tokenizer\n", 798 | "tokenizer = load(open('tokenizer.pkl', 'rb'))\n", 799 | " \n", 800 | "# select a seed text\n", 801 | "seed_text = lines[randint(0,len(lines))]\n", 802 | "print(seed_text + '\\n')\n", 803 | " \n", 804 | "# generate new text\n", 805 | "generated = generate_sequence(model, tokenizer, sequence_length, seed_text, 50)\n", 806 | "print(generated)" 807 | ] 808 | }, 809 | { 810 | "cell_type": "code", 811 | "execution_count": null, 812 | "metadata": { 813 | "collapsed": true 814 | }, 815 | "outputs": [], 816 | "source": [] 817 | } 818 | ], 819 | "metadata": { 820 | "kernelspec": { 821 | "display_name": "Python 3", 822 | "language": "python", 823 | "name": "python3" 824 | }, 825 | "language_info": { 826 | "codemirror_mode": { 827 | "name": "ipython", 828 | "version": 3 829 | }, 830 | "file_extension": ".py", 831 | "mimetype": "text/x-python", 832 | "name": "python", 833 | "nbconvert_exporter": "python", 834 | "pygments_lexer": "ipython3", 835 | "version": "3.6.3" 836 | } 837 | }, 838 | "nbformat": 4, 839 | "nbformat_minor": 2 840 | } 841 | -------------------------------------------------------------------------------- /CH07/code/Natural+Language+Processing+-+ChatBot.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "collapsed": true 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "spark = SparkSession.builder \\\n", 12 | " .master(\"local\") \\\n", 13 | " .appName(\"Natural Language Processing\") \\\n", 14 | " .config(\"spark.executor.memory\", \"6gb\") \\\n", 15 | " .getOrCreate()" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 2, 21 | "metadata": { 22 | "collapsed": true 23 | }, 24 | "outputs": [], 25 | "source": [ 26 | "df = spark.read.format('com.databricks.spark.csv')\\\n", 27 | " .options(header='true', inferschema='true')\\\n", 28 | " .load('TherapyBotSession.csv')\n" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 3, 34 | "metadata": { 35 | "scrolled": false 36 | }, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "+---+---------------+--------------------+----+----+----+----+\n", 43 | "| id| label| chat| _c3| _c4| _c5| _c6|\n", 44 | "+---+---------------+--------------------+----+----+----+----+\n", 45 | "| 1| escalate|I had a friend th...|null|null|null|null|\n", 46 | "| 2| escalate|\"My friend dealt ...|null|null|null|null|\n", 47 | "| 3| escalate|Friend who had bi...|null|null|null|null|\n", 48 | "| 4|do_not_escalate|Over the internet...|null|null|null|null|\n", 49 | "| 5| escalate|Having gone throu...|null|null|null|null|\n", 50 | "| 6| escalate|My now girlfriend...|null|null|null|null|\n", 51 | "| 7|do_not_escalate|\"Only really one ...|null|null|null|null|\n", 52 | "| 8|do_not_escalate|Now that I've bee...|null|null|null|null|\n", 53 | "| 9|do_not_escalate|I've always been 
...|null|null|null|null|\n", 54 | "| 10| escalate|I feel completely...|null|null|null|null|\n", 55 | "| 11|do_not_escalate|Took a week off w...|null|null|null|null|\n", 56 | "| 12| escalate|One of my best fr...|null|null|null|null|\n", 57 | "| 13| escalate|I've had some fri...|null|null|null|null|\n", 58 | "| 14|do_not_escalate|Haha. In eight gr...|null|null|null|null|\n", 59 | "| 15|do_not_escalate|Some of my friend...|null|null|null|null|\n", 60 | "| 16| escalate|I feel like depre...|null|null|null|null|\n", 61 | "| 17| escalate|i've had a couple...|null|null|null|null|\n", 62 | "| 18| escalate|I will always lis...|null|null|null|null|\n", 63 | "| 19|do_not_escalate|A lot for my frie...|null|null|null|null|\n", 64 | "| 20|do_not_escalate|When my friend ne...|null|null|null|null|\n", 65 | "+---+---------------+--------------------+----+----+----+----+\n", 66 | "only showing top 20 rows\n", 67 | "\n" 68 | ] 69 | } 70 | ], 71 | "source": [ 72 | "df.show()" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": 4, 78 | "metadata": { 79 | "collapsed": true 80 | }, 81 | "outputs": [], 82 | "source": [ 83 | "df = df.select('id', 'label', 'chat')" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": {}, 90 | "outputs": [ 91 | { 92 | "name": "stdout", 93 | "output_type": "stream", 94 | "text": [ 95 | "+---+---------------+--------------------+\n", 96 | "| id| label| chat|\n", 97 | "+---+---------------+--------------------+\n", 98 | "| 1| escalate|I had a friend th...|\n", 99 | "| 2| escalate|\"My friend dealt ...|\n", 100 | "| 3| escalate|Friend who had bi...|\n", 101 | "| 4|do_not_escalate|Over the internet...|\n", 102 | "| 5| escalate|Having gone throu...|\n", 103 | "| 6| escalate|My now girlfriend...|\n", 104 | "| 7|do_not_escalate|\"Only really one ...|\n", 105 | "| 8|do_not_escalate|Now that I've bee...|\n", 106 | "| 9|do_not_escalate|I've always been ...|\n", 107 | "| 10| escalate|I feel completely...|\n", 108 | "| 11|do_not_escalate|Took a week off w...|\n", 109 | "| 12| escalate|One of my best fr...|\n", 110 | "| 13| escalate|I've had some fri...|\n", 111 | "| 14|do_not_escalate|Haha. 
In eight gr...|\n", 112 | "| 15|do_not_escalate|Some of my friend...|\n", 113 | "| 16| escalate|I feel like depre...|\n", 114 | "| 17| escalate|i've had a couple...|\n", 115 | "| 18| escalate|I will always lis...|\n", 116 | "| 19|do_not_escalate|A lot for my frie...|\n", 117 | "| 20|do_not_escalate|When my friend ne...|\n", 118 | "+---+---------------+--------------------+\n", 119 | "only showing top 20 rows\n", 120 | "\n" 121 | ] 122 | } 123 | ], 124 | "source": [ 125 | "df.show()" 126 | ] 127 | }, 128 | { 129 | "cell_type": "code", 130 | "execution_count": 6, 131 | "metadata": { 132 | "scrolled": true 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stdout", 137 | "output_type": "stream", 138 | "text": [ 139 | "+---------------+-----+\n", 140 | "| label|count|\n", 141 | "+---------------+-----+\n", 142 | "|do_not_escalate| 65|\n", 143 | "| escalate| 35|\n", 144 | "+---------------+-----+\n", 145 | "\n" 146 | ] 147 | } 148 | ], 149 | "source": [ 150 | "df.groupBy(\"label\") \\\n", 151 | " .count() \\\n", 152 | " .orderBy(\"count\", ascending = False) \\\n", 153 | " .show()" 154 | ] 155 | }, 156 | { 157 | "cell_type": "code", 158 | "execution_count": 7, 159 | "metadata": {}, 160 | "outputs": [], 161 | "source": [ 162 | "import pyspark.sql.functions as F\n", 163 | "df = df.withColumn('word_count',F.size(F.split(F.col('chat'),' ')))" 164 | ] 165 | }, 166 | { 167 | "cell_type": "code", 168 | "execution_count": 8, 169 | "metadata": { 170 | "scrolled": true 171 | }, 172 | "outputs": [ 173 | { 174 | "name": "stdout", 175 | "output_type": "stream", 176 | "text": [ 177 | "+---+---------------+--------------------+----------+\n", 178 | "| id| label| chat|word_count|\n", 179 | "+---+---------------+--------------------+----------+\n", 180 | "| 1| escalate|I had a friend th...| 304|\n", 181 | "| 2| escalate|\"My friend dealt ...| 184|\n", 182 | "| 3| escalate|Friend who had bi...| 90|\n", 183 | "| 4|do_not_escalate|Over the internet...| 88|\n", 184 | "| 5| escalate|Having gone throu...| 71|\n", 185 | "| 6| escalate|My now girlfriend...| 73|\n", 186 | "| 7|do_not_escalate|\"Only really one ...| 74|\n", 187 | "| 8|do_not_escalate|Now that I've bee...| 62|\n", 188 | "| 9|do_not_escalate|I've always been ...| 60|\n", 189 | "| 10| escalate|I feel completely...| 56|\n", 190 | "| 11|do_not_escalate|Took a week off w...| 60|\n", 191 | "| 12| escalate|One of my best fr...| 59|\n", 192 | "| 13| escalate|I've had some fri...| 50|\n", 193 | "| 14|do_not_escalate|Haha. 
In eight gr...| 55|\n", 194 | "| 15|do_not_escalate|Some of my friend...| 49|\n", 195 | "| 16| escalate|I feel like depre...| 41|\n", 196 | "| 17| escalate|i've had a couple...| 38|\n", 197 | "| 18| escalate|I will always lis...| 41|\n", 198 | "| 19|do_not_escalate|A lot for my frie...| 44|\n", 199 | "| 20|do_not_escalate|When my friend ne...| 42|\n", 200 | "+---+---------------+--------------------+----------+\n", 201 | "only showing top 20 rows\n", 202 | "\n" 203 | ] 204 | } 205 | ], 206 | "source": [ 207 | "df.show()" 208 | ] 209 | }, 210 | { 211 | "cell_type": "code", 212 | "execution_count": 9, 213 | "metadata": {}, 214 | "outputs": [ 215 | { 216 | "name": "stdout", 217 | "output_type": "stream", 218 | "text": [ 219 | "+---------------+-----------------+\n", 220 | "| label| avg_word_count|\n", 221 | "+---------------+-----------------+\n", 222 | "| escalate| 44.0|\n", 223 | "|do_not_escalate|20.29230769230769|\n", 224 | "+---------------+-----------------+\n", 225 | "\n" 226 | ] 227 | } 228 | ], 229 | "source": [ 230 | "df.groupBy('label')\\\n", 231 | " .agg(F.avg('word_count').alias('avg_word_count'))\\\n", 232 | " .orderBy('avg_word_count', ascending = False) \\\n", 233 | " .show()\n" 234 | ] 235 | }, 236 | { 237 | "cell_type": "code", 238 | "execution_count": 10, 239 | "metadata": {}, 240 | "outputs": [], 241 | "source": [ 242 | "df_plot = df.select('id', 'word_count').toPandas()" 243 | ] 244 | }, 245 | { 246 | "cell_type": "code", 247 | "execution_count": 11, 248 | "metadata": {}, 249 | "outputs": [ 250 | { 251 | "data": { 252 | "image/png": "<base64 PNG data omitted: bar chart of word_count per chat id, y-axis 'Word Count', title 'Word Count distribution'>\n",
253 | "text/plain": [ 254 | "" 255 | ] 256 | }, 257 | "metadata": {}, 258 | "output_type": "display_data" 259 | } 260 | ], 261 | "source": [ 262 | "import matplotlib.pyplot as plt\n", 263 | "%matplotlib inline\n", 264 | "\n", 265 | "df_plot.set_index('id', inplace=True)\n", 266 | "df_plot.plot(kind='bar', figsize=(16, 6))\n", 267 | "plt.ylabel('Word Count')\n", 268 | "plt.title('Word Count distribution')\n", 269 | "plt.show()" 270 | ] 271 | }, 272 | { 273 | "cell_type": "code", 274 | "execution_count": 12, 275 | "metadata": { 276 | "collapsed": true 277 | }, 278 | "outputs": [], 279 | "source": [ 280 | "from textblob import TextBlob\n", 281 | "def sentiment_score(chat):\n", 282 | " return TextBlob(chat).sentiment.polarity" 283 | ] 284 | }, 285 | { 286 | "cell_type": "code", 287 | "execution_count": 13, 288 | "metadata": {}, 289 | "outputs": [], 290 | "source": [ 291 | "from pyspark.sql.types import FloatType\n", 292 | "sentiment_score_udf = F.udf(lambda x: sentiment_score(x), FloatType())\n" 293 | ] 294 | }, 295 | { 296 | "cell_type": "code", 297 | "execution_count": 14, 298 | "metadata": {}, 299 | "outputs": [ 300 | { 301 | "name": "stdout", 302 | "output_type": "stream", 303 | "text": [ 304 | "+---+---------------+--------------------+----------+---------------+\n", 305 | "| id| label| chat|word_count|sentiment_score|\n", 306 | "+---+---------------+--------------------+----------+---------------+\n", 307 | "| 1| escalate|I had a friend th...| 304| 0.018961353|\n", 308 | "| 2| escalate|\"My friend dealt ...| 184| 0.20601852|\n", 309 | "| 3| escalate|Friend who had bi...| 90| 0.008333334|\n", 310 | "| 4|do_not_escalate|Over the internet...| 88| 0.045833334|\n", 311 | "| 5| escalate|Having gone throu...| 71| 0.0125|\n", 312 | "| 6| escalate|My now girlfriend...| 73| 0.06333333|\n", 313 | "| 7|do_not_escalate|\"Only really one ...| 74| 0.036363635|\n", 314 | "| 8|do_not_escalate|Now that I've bee...| 62| 0.125|\n", 315 | "| 9|do_not_escalate|I've always been ...| 60| 0.31|\n", 316 | "| 10| escalate|I feel completely...| 56| -0.078125|\n", 317 | "| 11|do_not_escalate|Took a week off w...| 60| 0.16666667|\n", 318 | "| 12| escalate|One of my best fr...| 59| 0.4|\n", 319 | "| 13| escalate|I've had some fri...| 50| 0.19|\n", 320 | "| 14|do_not_escalate|Haha. 
In eight gr...| 55| 0.29666665|\n", 321 | "| 15|do_not_escalate|Some of my friend...| 49| 0.4|\n", 322 | "| 16| escalate|I feel like depre...| 41| 0.05|\n", 323 | "| 17| escalate|i've had a couple...| 38| 0.16666667|\n", 324 | "| 18| escalate|I will always lis...| 41| -0.025|\n", 325 | "| 19|do_not_escalate|A lot for my frie...| 44| 0.035858586|\n", 326 | "| 20|do_not_escalate|When my friend ne...| 42| -0.094444446|\n", 327 | "+---+---------------+--------------------+----------+---------------+\n", 328 | "only showing top 20 rows\n", 329 | "\n" 330 | ] 331 | } 332 | ], 333 | "source": [ 334 | "df = df.select('id', 'label', 'chat','word_count',\n", 335 | " sentiment_score_udf('chat').alias('sentiment_score'))\n", 336 | "df.show()" 337 | ] 338 | }, 339 | { 340 | "cell_type": "code", 341 | "execution_count": 15, 342 | "metadata": {}, 343 | "outputs": [ 344 | { 345 | "name": "stdout", 346 | "output_type": "stream", 347 | "text": [ 348 | "+---------------+--------------------+\n", 349 | "| label| avg_sentiment_score|\n", 350 | "+---------------+--------------------+\n", 351 | "| escalate| 0.06338859780558519|\n", 352 | "|do_not_escalate|0.031975071089198955|\n", 353 | "+---------------+--------------------+\n", 354 | "\n" 355 | ] 356 | } 357 | ], 358 | "source": [ 359 | "df.groupBy('label')\\\n", 360 | " .agg(F.avg('sentiment_score').alias('avg_sentiment_score'))\\\n", 361 | " .orderBy('avg_sentiment_score', ascending = False) \\\n", 362 | " .show()" 363 | ] 364 | }, 365 | { 366 | "cell_type": "code", 367 | "execution_count": 16, 368 | "metadata": {}, 369 | "outputs": [ 370 | { 371 | "name": "stdout", 372 | "output_type": "stream", 373 | "text": [ 374 | "+---+---------------+--------------------+----------+---------------+--------------------+\n", 375 | "| id| label| chat|word_count|sentiment_score| words|\n", 376 | "+---+---------------+--------------------+----------+---------------+--------------------+\n", 377 | "| 1| escalate|I had a friend th...| 304| 0.018961353|[I, had, a, frien...|\n", 378 | "| 2| escalate|\"My friend dealt ...| 184| 0.20601852|[\"My, friend, dea...|\n", 379 | "| 3| escalate|Friend who had bi...| 90| 0.008333334|[Friend, who, had...|\n", 380 | "| 4|do_not_escalate|Over the internet...| 88| 0.045833334|[Over, the, inter...|\n", 381 | "| 5| escalate|Having gone throu...| 71| 0.0125|[Having, gone, th...|\n", 382 | "| 6| escalate|My now girlfriend...| 73| 0.06333333|[My, now, girlfri...|\n", 383 | "| 7|do_not_escalate|\"Only really one ...| 74| 0.036363635|[\"Only, really, o...|\n", 384 | "| 8|do_not_escalate|Now that I've bee...| 62| 0.125|[Now, that, I've,...|\n", 385 | "| 9|do_not_escalate|I've always been ...| 60| 0.31|[I've, always, be...|\n", 386 | "| 10| escalate|I feel completely...| 56| -0.078125|[I, feel, complet...|\n", 387 | "| 11|do_not_escalate|Took a week off w...| 60| 0.16666667|[Took, a, week, o...|\n", 388 | "| 12| escalate|One of my best fr...| 59| 0.4|[One, of, my, bes...|\n", 389 | "| 13| escalate|I've had some fri...| 50| 0.19|[I've, had, some,...|\n", 390 | "| 14|do_not_escalate|Haha. 
In eight gr...| 55| 0.29666665|[Haha., In, eight...|\n", 391 | "| 15|do_not_escalate|Some of my friend...| 49| 0.4|[Some, of, my, fr...|\n", 392 | "| 16| escalate|I feel like depre...| 41| 0.05|[I, feel, like, d...|\n", 393 | "| 17| escalate|i've had a couple...| 38| 0.16666667|[i've, had, a, co...|\n", 394 | "| 18| escalate|I will always lis...| 41| -0.025|[I, will, always,...|\n", 395 | "| 19|do_not_escalate|A lot for my frie...| 44| 0.035858586|[A, lot, for, my,...|\n", 396 | "| 20|do_not_escalate|When my friend ne...| 42| -0.094444446|[When, my, friend...|\n", 397 | "+---+---------------+--------------------+----------+---------------+--------------------+\n", 398 | "only showing top 20 rows\n", 399 | "\n" 400 | ] 401 | } 402 | ], 403 | "source": [ 404 | "df = df.withColumn('words',F.split(F.col('chat'),' '))\n", 405 | "df.show()" 406 | ] 407 | }, 408 | { 409 | "cell_type": "code", 410 | "execution_count": 17, 411 | "metadata": { 412 | "collapsed": true 413 | }, 414 | "outputs": [], 415 | "source": [ 416 | "stop_words = ['i','me','my','myself','we','our','ours','ourselves',\n", 417 | " 'you','your','yours','yourself','yourselves','he','him',\n", 418 | " 'his','himself','she','her','hers','herself','it','its',\n", 419 | " 'itself','they','them','their','theirs','themselves',\n", 420 | " 'what','which','who','whom','this','that','these','those',\n", 421 | " 'am','is','are','was','were','be','been','being','have',\n", 422 | " 'has','had','having','do','does','did','doing','a','an',\n", 423 | " 'the','and','but','if','or','because','as','until','while',\n", 424 | " 'of','at','by','for','with','about','against','between',\n", 425 | " 'into','through','during','before','after','above','below',\n", 426 | " 'to','from','up','down','in','out','on','off','over','under',\n", 427 | " 'again','further','then','once','here','there','when','where',\n", 428 | " 'why','how','all','any','both','each','few','more','most',\n", 429 | " 'other','some','such','no','nor','not','only','own','same',\n", 430 | " 'so','than','too','very','can','will','just','don','should','now']" 431 | ] 432 | }, 433 | { 434 | "cell_type": "code", 435 | "execution_count": 18, 436 | "metadata": { 437 | "collapsed": true 438 | }, 439 | "outputs": [], 440 | "source": [ 441 | "from pyspark.ml.feature import StopWordsRemover " 442 | ] 443 | }, 444 | { 445 | "cell_type": "code", 446 | "execution_count": 19, 447 | "metadata": { 448 | "collapsed": true 449 | }, 450 | "outputs": [], 451 | "source": [ 452 | "stopwordsRemovalFeature = StopWordsRemover(inputCol=\"words\", \n", 453 | " outputCol=\"words without stop\").setStopWords(stop_words)" 454 | ] 455 | }, 456 | { 457 | "cell_type": "code", 458 | "execution_count": 20, 459 | "metadata": { 460 | "collapsed": true 461 | }, 462 | "outputs": [], 463 | "source": [ 464 | "from pyspark.ml import Pipeline\n", 465 | "stopWordRemovalPipeline = Pipeline(stages=[stopwordsRemovalFeature])\n", 466 | "pipelineFitRemoveStopWords = stopWordRemovalPipeline.fit(df)" 467 | ] 468 | }, 469 | { 470 | "cell_type": "code", 471 | "execution_count": 21, 472 | "metadata": { 473 | "scrolled": false 474 | }, 475 | "outputs": [ 476 | { 477 | "name": "stdout", 478 | "output_type": "stream", 479 | "text": [ 480 | "+--------------------+--------------------+\n", 481 | "| words| words without stop|\n", 482 | "+--------------------+--------------------+\n", 483 | "|[I, had, a, frien...|[friend, would, g...|\n", 484 | "|[\"My, friend, dea...|[\"My, friend, dea...|\n", 485 | "|[Friend, who, had...|[Friend, big, add...|\n", 
486 | "|[Over, the, inter...|[internet, LOT, p...|\n", 487 | "|[Having, gone, th...|[gone, depression...|\n", 488 | "+--------------------+--------------------+\n", 489 | "only showing top 5 rows\n", 490 | "\n" 491 | ] 492 | } 493 | ], 494 | "source": [ 495 | "df = pipelineFitRemoveStopWords.transform(df)\n", 496 | "df.select('words', 'words without stop').show(5)" 497 | ] 498 | }, 499 | { 500 | "cell_type": "code", 501 | "execution_count": 22, 502 | "metadata": {}, 503 | "outputs": [], 504 | "source": [ 505 | "label = F.udf(lambda x: 1.0 if x == 'escalate' else 0.0, FloatType())\n", 506 | "df = df.withColumn('label', label('label'))" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 23, 512 | "metadata": {}, 513 | "outputs": [ 514 | { 515 | "name": "stdout", 516 | "output_type": "stream", 517 | "text": [ 518 | "+-----+\n", 519 | "|label|\n", 520 | "+-----+\n", 521 | "| 1.0|\n", 522 | "| 1.0|\n", 523 | "| 1.0|\n", 524 | "| 0.0|\n", 525 | "| 1.0|\n", 526 | "| 1.0|\n", 527 | "| 0.0|\n", 528 | "| 0.0|\n", 529 | "| 0.0|\n", 530 | "| 1.0|\n", 531 | "| 0.0|\n", 532 | "| 1.0|\n", 533 | "| 1.0|\n", 534 | "| 0.0|\n", 535 | "| 0.0|\n", 536 | "| 1.0|\n", 537 | "| 1.0|\n", 538 | "| 1.0|\n", 539 | "| 0.0|\n", 540 | "| 0.0|\n", 541 | "+-----+\n", 542 | "only showing top 20 rows\n", 543 | "\n" 544 | ] 545 | } 546 | ], 547 | "source": [ 548 | "df.select('label').show()" 549 | ] 550 | }, 551 | { 552 | "cell_type": "code", 553 | "execution_count": 24, 554 | "metadata": { 555 | "collapsed": true 556 | }, 557 | "outputs": [], 558 | "source": [ 559 | "import pyspark.ml.feature as feat\n", 560 | "TF_ = feat.HashingTF(inputCol=\"words without stop\", \n", 561 | " outputCol=\"rawFeatures\", numFeatures=100000)\n", 562 | "IDF_ = feat.IDF(inputCol=\"rawFeatures\", outputCol=\"features\")" 563 | ] 564 | }, 565 | { 566 | "cell_type": "code", 567 | "execution_count": 25, 568 | "metadata": { 569 | "collapsed": true 570 | }, 571 | "outputs": [], 572 | "source": [ 573 | "pipelineTFIDF = Pipeline(stages=[TF_, IDF_])" 574 | ] 575 | }, 576 | { 577 | "cell_type": "code", 578 | "execution_count": 26, 579 | "metadata": {}, 580 | "outputs": [], 581 | "source": [ 582 | "pipelineFit = pipelineTFIDF.fit(df)\n", 583 | "df = pipelineFit.transform(df)" 584 | ] 585 | }, 586 | { 587 | "cell_type": "code", 588 | "execution_count": 27, 589 | "metadata": {}, 590 | "outputs": [ 591 | { 592 | "name": "stdout", 593 | "output_type": "stream", 594 | "text": [ 595 | "+-----+--------------------+--------------------+\n", 596 | "|label| rawFeatures| features|\n", 597 | "+-----+--------------------+--------------------+\n", 598 | "| 1.0|(100000,[76,1583,...|(100000,[76,1583,...|\n", 599 | "| 1.0|(100000,[5319,105...|(100000,[5319,105...|\n", 600 | "| 1.0|(100000,[618,7515...|(100000,[618,7515...|\n", 601 | "| 0.0|(100000,[3370,444...|(100000,[3370,444...|\n", 602 | "| 1.0|(100000,[4442,101...|(100000,[4442,101...|\n", 603 | "| 1.0|(100000,[7369,775...|(100000,[7369,775...|\n", 604 | "| 0.0|(100000,[232,6124...|(100000,[232,6124...|\n", 605 | "| 0.0|(100000,[2732,335...|(100000,[2732,335...|\n", 606 | "| 0.0|(100000,[4047,425...|(100000,[4047,425...|\n", 607 | "| 1.0|(100000,[6531,135...|(100000,[6531,135...|\n", 608 | "| 0.0|(100000,[5330,120...|(100000,[5330,120...|\n", 609 | "| 1.0|(100000,[1197,444...|(100000,[1197,444...|\n", 610 | "| 1.0|(100000,[4442,107...|(100000,[4442,107...|\n", 611 | "| 0.0|(100000,[232,4441...|(100000,[232,4441...|\n", 612 | "| 0.0|(100000,[781,3526...|(100000,[781,3526...|\n", 613 | "| 
1.0|(100000,[13806,14...|(100000,[13806,14...|\n", 614 | "| 1.0|(100000,[4442,108...|(100000,[4442,108...|\n", 615 | "| 1.0|(100000,[76,11034...|(100000,[76,11034...|\n", 616 | "| 0.0|(100000,[10001,27...|(100000,[10001,27...|\n", 617 | "| 0.0|(100000,[29385,39...|(100000,[29385,39...|\n", 618 | "+-----+--------------------+--------------------+\n", 619 | "only showing top 20 rows\n", 620 | "\n" 621 | ] 622 | } 623 | ], 624 | "source": [ 625 | "df.select('label', 'rawFeatures','features').show()" 626 | ] 627 | }, 628 | { 629 | "cell_type": "code", 630 | "execution_count": 28, 631 | "metadata": { 632 | "collapsed": true 633 | }, 634 | "outputs": [], 635 | "source": [ 636 | "(trainingDF, testDF) = df.randomSplit([0.75, 0.25], seed = 1234)" 637 | ] 638 | }, 639 | { 640 | "cell_type": "code", 641 | "execution_count": 29, 642 | "metadata": { 643 | "collapsed": true 644 | }, 645 | "outputs": [], 646 | "source": [ 647 | "from pyspark.ml.classification import LogisticRegression\n", 648 | "logreg = LogisticRegression(regParam=0.025)" 649 | ] 650 | }, 651 | { 652 | "cell_type": "code", 653 | "execution_count": 30, 654 | "metadata": {}, 655 | "outputs": [], 656 | "source": [ 657 | "logregModel = logreg.fit(trainingDF)" 658 | ] 659 | }, 660 | { 661 | "cell_type": "code", 662 | "execution_count": 31, 663 | "metadata": { 664 | "collapsed": true 665 | }, 666 | "outputs": [], 667 | "source": [ 668 | "predictionDF = logregModel.transform(testDF)" 669 | ] 670 | }, 671 | { 672 | "cell_type": "code", 673 | "execution_count": 32, 674 | "metadata": {}, 675 | "outputs": [ 676 | { 677 | "name": "stdout", 678 | "output_type": "stream", 679 | "text": [ 680 | "+-----+--------------------+----------+\n", 681 | "|label| probability|prediction|\n", 682 | "+-----+--------------------+----------+\n", 683 | "| 1.0|[0.00339966489826...| 1.0|\n", 684 | "| 1.0|[0.55815635574642...| 0.0|\n", 685 | "| 1.0|[0.03557500295368...| 1.0|\n", 686 | "| 0.0|[0.52714451276392...| 0.0|\n", 687 | "| 0.0|[0.64630042307877...| 0.0|\n", 688 | "| 0.0|[0.69042286406135...| 0.0|\n", 689 | "| 1.0|[0.44672236248681...| 1.0|\n", 690 | "| 0.0|[0.67209249316671...| 0.0|\n", 691 | "| 0.0|[0.96010780703860...| 0.0|\n", 692 | "| 1.0|[0.75210799156076...| 0.0|\n", 693 | "| 0.0|[0.90904812079420...| 0.0|\n", 694 | "| 0.0|[0.97354469378068...| 0.0|\n", 695 | "| 0.0|[0.96576753489686...| 0.0|\n", 696 | "| 0.0|[0.89685928798301...| 0.0|\n", 697 | "| 0.0|[0.92552854921657...| 0.0|\n", 698 | "| 0.0|[0.94649994610325...| 0.0|\n", 699 | "| 0.0|[0.89486269398390...| 0.0|\n", 700 | "| 0.0|[0.65225541621797...| 0.0|\n", 701 | "| 0.0|[0.95636713428689...| 0.0|\n", 702 | "| 0.0|[0.95927102608436...| 0.0|\n", 703 | "+-----+--------------------+----------+\n", 704 | "only showing top 20 rows\n", 705 | "\n" 706 | ] 707 | } 708 | ], 709 | "source": [ 710 | "predictionDF.select('label', 'probability', 'prediction').show()" 711 | ] 712 | }, 713 | { 714 | "cell_type": "code", 715 | "execution_count": 33, 716 | "metadata": {}, 717 | "outputs": [ 718 | { 719 | "name": "stdout", 720 | "output_type": "stream", 721 | "text": [ 722 | "+----------------+---+---+\n", 723 | "|label_prediction|0.0|1.0|\n", 724 | "+----------------+---+---+\n", 725 | "| 1.0| 2| 3|\n", 726 | "| 0.0| 19| 0|\n", 727 | "+----------------+---+---+\n", 728 | "\n" 729 | ] 730 | } 731 | ], 732 | "source": [ 733 | "predictionDF.crosstab('label', 'prediction').show()" 734 | ] 735 | }, 736 | { 737 | "cell_type": "code", 738 | "execution_count": 34, 739 | "metadata": { 740 | "collapsed": true 741 | }, 742 | 
"outputs": [], 743 | "source": [ 744 | "from sklearn import metrics\n", 745 | "actual = predictionDF.select('label').toPandas()\n", 746 | "predicted = predictionDF.select('prediction').toPandas()" 747 | ] 748 | }, 749 | { 750 | "cell_type": "code", 751 | "execution_count": 35, 752 | "metadata": {}, 753 | "outputs": [ 754 | { 755 | "name": "stdout", 756 | "output_type": "stream", 757 | "text": [ 758 | "accuracy score: 91.7%\n" 759 | ] 760 | } 761 | ], 762 | "source": [ 763 | "print('accuracy score: {}%'.format(round(metrics.accuracy_score(actual, predicted),3)*100))" 764 | ] 765 | }, 766 | { 767 | "cell_type": "code", 768 | "execution_count": 36, 769 | "metadata": {}, 770 | "outputs": [ 771 | { 772 | "name": "stdout", 773 | "output_type": "stream", 774 | "text": [ 775 | "The ROC score is 93.7%\n" 776 | ] 777 | } 778 | ], 779 | "source": [ 780 | "from pyspark.ml.evaluation import BinaryClassificationEvaluator\n", 781 | "\n", 782 | "scores = predictionDF.select('label', 'rawPrediction')\n", 783 | "evaluator = BinaryClassificationEvaluator()\n", 784 | "print('The ROC score is {}%'.format(round(evaluator.evaluate(scores),3)*100))" 785 | ] 786 | }, 787 | { 788 | "cell_type": "code", 789 | "execution_count": 37, 790 | "metadata": { 791 | "scrolled": true 792 | }, 793 | "outputs": [ 794 | { 795 | "name": "stdout", 796 | "output_type": "stream", 797 | "text": [ 798 | "+-------+-------------------+\n", 799 | "|summary| label|\n", 800 | "+-------+-------------------+\n", 801 | "| count| 24|\n", 802 | "| mean|0.20833333333333334|\n", 803 | "| stddev|0.41485111699905336|\n", 804 | "| min| 0.0|\n", 805 | "| max| 1.0|\n", 806 | "+-------+-------------------+\n", 807 | "\n" 808 | ] 809 | } 810 | ], 811 | "source": [ 812 | "predictionDF.describe('label').show()" 813 | ] 814 | }, 815 | { 816 | "cell_type": "code", 817 | "execution_count": null, 818 | "metadata": { 819 | "collapsed": true 820 | }, 821 | "outputs": [], 822 | "source": [] 823 | } 824 | ], 825 | "metadata": { 826 | "kernelspec": { 827 | "display_name": "Python 3", 828 | "language": "python", 829 | "name": "python3" 830 | }, 831 | "language_info": { 832 | "codemirror_mode": { 833 | "name": "ipython", 834 | "version": 3 835 | }, 836 | "file_extension": ".py", 837 | "mimetype": "text/x-python", 838 | "name": "python", 839 | "nbconvert_exporter": "python", 840 | "pygments_lexer": "ipython3", 841 | "version": "3.6.1" 842 | } 843 | }, 844 | "nbformat": 4, 845 | "nbformat_minor": 2 846 | } 847 | --------------------------------------------------------------------------------