├── .gitignore ├── LICENSE ├── README.md ├── media └── banners │ ├── amazon_logo.png │ ├── front_cover.png │ ├── packt_logo.png │ └── safari_logo.png └── notebooks ├── Ch01 - Machine Learning Fundamentals ├── battles.csv ├── credit_default.xls ├── feature_engineering_numerical_and_categorical_data.ipynb ├── feature_engineering_numerical_and_categorical_data.py ├── feature_engineering_text_data.ipynb ├── feature_engineering_text_data.py ├── game_of_thrones_eda.ipynb └── game_of_thrones_eda.py ├── Ch02 - Deep Learning Essentials └── NNBasics.ipynb ├── Ch05 - Unleash the Power of Transfer Learning ├── Basic CNN Model.ipynb ├── CNN with Image Augmentation.ipynb ├── CNN with Transfer Learning.ipynb ├── Datasets Builder.ipynb ├── Model Performance Evaluations.ipynb ├── model_evaluation_utils.py └── utils.py ├── Ch06 - Image Recognition and Classification ├── 0fc12a365adfcbb603e298b10149632a.jpg ├── CIFAR10_CNN_Classifier.ipynb ├── CIFAR10_VGG16_Transfer_Learning_Classifier.ipynb ├── Dog_Breed_EDA.ipynb ├── Dog_Breed_Transfer_Learning_Classifier.ipynb ├── cnn_utils.py ├── dog_breed_transfer_learning_classifier.py └── model_evaluation_utils.py ├── Ch07 - Text Document Categorization ├── 20_newsgrp_cnn_model.ipynb ├── IMDB_word2Vec.ipynb ├── Text_Summarization_IMDB.ipynb ├── amazon_review_model.py ├── config.py ├── dataloader │ ├── __init__.py │ ├── embeddings.py │ └── loader.py ├── imdb_model.py ├── model │ ├── __init__.py │ ├── cnn_document_model.py │ └── custom_layer.py ├── preprocessing │ ├── __init__.py │ └── utils.py ├── transfer_learning_imdb.py └── utils.py ├── Ch08 - Audio Identification and Categorization ├── Exploratory Analysis Sound Data.ipynb ├── Feature Engineering.ipynb ├── Modeling.ipynb └── Prediction Pipeline.ipynb ├── Ch09 - Deep Dream ├── DSC09296.jpg ├── Deep Dream Final.ipynb ├── blue-sky.jpg ├── labrador.jpg └── mccowan_mountain_orig.jpg ├── Ch10 - Neural Style Transfer ├── Style Transfer Model.ipynb └── Style Transfer Results HD.ipynb ├── Ch11 - Automated Image Caption Generator ├── Image feature extraction.ipynb ├── Model Test.ipynb └── Model Train.ipynb ├── Ch12 - Image Colorization ├── colorization_task.png ├── colornet_architecture.png ├── colornet_vgg16.ipynb ├── colorspaces.png └── images │ ├── 0311abf53d60705cc9605bc46589b0d6.jpeg │ ├── 060df5a829e988f6b5de7c089dc3c05d.jpeg │ ├── 07c4859f54fb3184aebca9a7b3aa5317.jpeg │ ├── 097ec13396d0593ddd00e360b7375b8f.jpeg │ ├── 0dc3e95a9954c5d50e5ddac5bf774e09.jpeg │ ├── 1261ea1079ab97b732812e328c3a5c48.jpeg │ ├── 156979415efa5edc3420558a884b3536.jpeg │ ├── 15f6abb6f801e04c880008f39a0ba558.jpeg │ ├── 1b99b787ef471af6c652e01737d883a6.jpeg │ ├── 1cc416b4897eca408ad396a09ad000cd.jpeg │ ├── 232a6c59d6965f9f466dd2390829a69a.jpeg │ ├── 2374de3c74aee73fd495c57eee8e4ab0.jpeg │ ├── 24e8e4a8b32e2ccbc1e7b2798cb8fcd8.jpeg │ ├── 2fbb562e61f6460cf9820940d61b7001.jpeg │ ├── 328e41c2fbc194b0f8900007a61c6d4a.jpeg │ ├── 3b8640a78d79be87f927a63174902346.jpeg │ ├── 529b4fb2e62f249a87b5908debaf73e7.jpeg │ ├── 56eb7309c5bc5cb9629b2db830a1b025.jpeg │ ├── 58fd582ca79217457b0d8807a6302824.jpeg │ ├── 5a0728978e9a5180076ec357bc28c92e.jpeg │ ├── 5a2987b714c15456c0e038dabcf426e2.jpeg │ ├── 62e2af22b63e4674759d811aab1b6679.jpeg │ ├── 62f4423f981dda4a508781f4845b0c09.jpeg │ ├── 663b353d812988e7635ce6709414d6aa.jpeg │ ├── 6a47074ff275acedfa82ffbc9025b703.jpeg │ ├── 6b85128b02f95c628c85437b1eef38a4.jpeg │ ├── 6d905979f4788767939883c0f8b4250b.jpeg │ ├── 7a24ba5bfdbb9602f78197e3e103feb4.jpeg │ ├── 7ed962e763ce17c16df44cfa96dfd047.jpeg │ ├── 
81491e7159595d373c027e9c337eecfa.jpeg │ ├── 83f948c28622623c088d6d7cc0d02b18.jpeg │ ├── 8587f219580ae6f207490ff02f853df3.jpeg │ ├── 8afc970abdab28220ad7a0be25457a2e.jpeg │ ├── 92e06aafe0ca084825921deb2b4c5c55.jpeg │ ├── ae69df2e9995ee238f5ea93090a3981d.jpeg │ ├── b208394092522e1a74b7ce9d8b558022.jpeg │ ├── bb9d9ece213507e6c45852633c6e61e8.jpeg │ ├── cb2af8fbb9cd6d48eb1009c68349cb1e.jpeg │ ├── cba1776f1d7129f9f83a4d9fc4b89039.jpeg │ ├── cbb19bd188a96067f478c0a9b1559844.jpeg │ ├── cc3d9a6d928111e9861bebd43b475f63.jpeg │ ├── desktop.ini │ ├── e2b2bcccdba6d1293a7e89ef3d6df112.jpeg │ ├── e72e923ffc4d51faf1a99d09bd59896a.jpeg │ ├── fe79cd9cda63a4af3924d2718b7e775a.jpeg │ └── ff20b4d22ed3e1e8829768201d110f53.jpeg └── README.md /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Apache License 2 | Version 2.0, January 2004 3 | http://www.apache.org/licenses/ 4 | 5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 6 | 7 | 1. Definitions. 8 | 9 | "License" shall mean the terms and conditions for use, reproduction, 10 | and distribution as defined by Sections 1 through 9 of this document. 11 | 12 | "Licensor" shall mean the copyright owner or entity authorized by 13 | the copyright owner that is granting the License. 14 | 15 | "Legal Entity" shall mean the union of the acting entity and all 16 | other entities that control, are controlled by, or are under common 17 | control with that entity. 
For the purposes of this definition, 18 | "control" means (i) the power, direct or indirect, to cause the 19 | direction or management of such entity, whether by contract or 20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the 21 | outstanding shares, or (iii) beneficial ownership of such entity. 22 | 23 | "You" (or "Your") shall mean an individual or Legal Entity 24 | exercising permissions granted by this License. 25 | 26 | "Source" form shall mean the preferred form for making modifications, 27 | including but not limited to software source code, documentation 28 | source, and configuration files. 29 | 30 | "Object" form shall mean any form resulting from mechanical 31 | transformation or translation of a Source form, including but 32 | not limited to compiled object code, generated documentation, 33 | and conversions to other media types. 34 | 35 | "Work" shall mean the work of authorship, whether in Source or 36 | Object form, made available under the License, as indicated by a 37 | copyright notice that is included in or attached to the work 38 | (an example is provided in the Appendix below). 39 | 40 | "Derivative Works" shall mean any work, whether in Source or Object 41 | form, that is based on (or derived from) the Work and for which the 42 | editorial revisions, annotations, elaborations, or other modifications 43 | represent, as a whole, an original work of authorship. For the purposes 44 | of this License, Derivative Works shall not include works that remain 45 | separable from, or merely link (or bind by name) to the interfaces of, 46 | the Work and Derivative Works thereof. 47 | 48 | "Contribution" shall mean any work of authorship, including 49 | the original version of the Work and any modifications or additions 50 | to that Work or Derivative Works thereof, that is intentionally 51 | submitted to Licensor for inclusion in the Work by the copyright owner 52 | or by an individual or Legal Entity authorized to submit on behalf of 53 | the copyright owner. For the purposes of this definition, "submitted" 54 | means any form of electronic, verbal, or written communication sent 55 | to the Licensor or its representatives, including but not limited to 56 | communication on electronic mailing lists, source code control systems, 57 | and issue tracking systems that are managed by, or on behalf of, the 58 | Licensor for the purpose of discussing and improving the Work, but 59 | excluding communication that is conspicuously marked or otherwise 60 | designated in writing by the copyright owner as "Not a Contribution." 61 | 62 | "Contributor" shall mean Licensor and any individual or Legal Entity 63 | on behalf of whom a Contribution has been received by Licensor and 64 | subsequently incorporated within the Work. 65 | 66 | 2. Grant of Copyright License. Subject to the terms and conditions of 67 | this License, each Contributor hereby grants to You a perpetual, 68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 69 | copyright license to reproduce, prepare Derivative Works of, 70 | publicly display, publicly perform, sublicense, and distribute the 71 | Work and such Derivative Works in Source or Object form. 72 | 73 | 3. Grant of Patent License. 
Subject to the terms and conditions of 74 | this License, each Contributor hereby grants to You a perpetual, 75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable 76 | (except as stated in this section) patent license to make, have made, 77 | use, offer to sell, sell, import, and otherwise transfer the Work, 78 | where such license applies only to those patent claims licensable 79 | by such Contributor that are necessarily infringed by their 80 | Contribution(s) alone or by combination of their Contribution(s) 81 | with the Work to which such Contribution(s) was submitted. If You 82 | institute patent litigation against any entity (including a 83 | cross-claim or counterclaim in a lawsuit) alleging that the Work 84 | or a Contribution incorporated within the Work constitutes direct 85 | or contributory patent infringement, then any patent licenses 86 | granted to You under this License for that Work shall terminate 87 | as of the date such litigation is filed. 88 | 89 | 4. Redistribution. You may reproduce and distribute copies of the 90 | Work or Derivative Works thereof in any medium, with or without 91 | modifications, and in Source or Object form, provided that You 92 | meet the following conditions: 93 | 94 | (a) You must give any other recipients of the Work or 95 | Derivative Works a copy of this License; and 96 | 97 | (b) You must cause any modified files to carry prominent notices 98 | stating that You changed the files; and 99 | 100 | (c) You must retain, in the Source form of any Derivative Works 101 | that You distribute, all copyright, patent, trademark, and 102 | attribution notices from the Source form of the Work, 103 | excluding those notices that do not pertain to any part of 104 | the Derivative Works; and 105 | 106 | (d) If the Work includes a "NOTICE" text file as part of its 107 | distribution, then any Derivative Works that You distribute must 108 | include a readable copy of the attribution notices contained 109 | within such NOTICE file, excluding those notices that do not 110 | pertain to any part of the Derivative Works, in at least one 111 | of the following places: within a NOTICE text file distributed 112 | as part of the Derivative Works; within the Source form or 113 | documentation, if provided along with the Derivative Works; or, 114 | within a display generated by the Derivative Works, if and 115 | wherever such third-party notices normally appear. The contents 116 | of the NOTICE file are for informational purposes only and 117 | do not modify the License. You may add Your own attribution 118 | notices within Derivative Works that You distribute, alongside 119 | or as an addendum to the NOTICE text from the Work, provided 120 | that such additional attribution notices cannot be construed 121 | as modifying the License. 122 | 123 | You may add Your own copyright statement to Your modifications and 124 | may provide additional or different license terms and conditions 125 | for use, reproduction, or distribution of Your modifications, or 126 | for any such Derivative Works as a whole, provided Your use, 127 | reproduction, and distribution of the Work otherwise complies with 128 | the conditions stated in this License. 129 | 130 | 5. Submission of Contributions. Unless You explicitly state otherwise, 131 | any Contribution intentionally submitted for inclusion in the Work 132 | by You to the Licensor shall be under the terms and conditions of 133 | this License, without any additional terms or conditions. 
134 | Notwithstanding the above, nothing herein shall supersede or modify 135 | the terms of any separate license agreement you may have executed 136 | with Licensor regarding such Contributions. 137 | 138 | 6. Trademarks. This License does not grant permission to use the trade 139 | names, trademarks, service marks, or product names of the Licensor, 140 | except as required for reasonable and customary use in describing the 141 | origin of the Work and reproducing the content of the NOTICE file. 142 | 143 | 7. Disclaimer of Warranty. Unless required by applicable law or 144 | agreed to in writing, Licensor provides the Work (and each 145 | Contributor provides its Contributions) on an "AS IS" BASIS, 146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or 147 | implied, including, without limitation, any warranties or conditions 148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A 149 | PARTICULAR PURPOSE. You are solely responsible for determining the 150 | appropriateness of using or redistributing the Work and assume any 151 | risks associated with Your exercise of permissions under this License. 152 | 153 | 8. Limitation of Liability. In no event and under no legal theory, 154 | whether in tort (including negligence), contract, or otherwise, 155 | unless required by applicable law (such as deliberate and grossly 156 | negligent acts) or agreed to in writing, shall any Contributor be 157 | liable to You for damages, including any direct, indirect, special, 158 | incidental, or consequential damages of any character arising as a 159 | result of this License or out of the use or inability to use the 160 | Work (including but not limited to damages for loss of goodwill, 161 | work stoppage, computer failure or malfunction, or any and all 162 | other commercial damages or losses), even if such Contributor 163 | has been advised of the possibility of such damages. 164 | 165 | 9. Accepting Warranty or Additional Liability. While redistributing 166 | the Work or Derivative Works thereof, You may choose to offer, 167 | and charge a fee for, acceptance of support, warranty, indemnity, 168 | or other liability obligations and/or rights consistent with this 169 | License. However, in accepting such obligations, You may act only 170 | on Your own behalf and on Your sole responsibility, not on behalf 171 | of any other Contributor, and only if You agree to indemnify, 172 | defend, and hold each Contributor harmless for any liability 173 | incurred by, or claims asserted against, such Contributor by reason 174 | of your accepting any such warranty or additional liability. 175 | 176 | END OF TERMS AND CONDITIONS 177 | 178 | APPENDIX: How to apply the Apache License to your work. 179 | 180 | To apply the Apache License to your work, attach the following 181 | boilerplate notice, with the fields enclosed by brackets "[]" 182 | replaced with your own identifying information. (Don't include 183 | the brackets!) The text should be enclosed in the appropriate 184 | comment syntax for the file format. We also recommend that a 185 | file or class name and description of purpose be included on the 186 | same "printed page" as the copyright notice for easier 187 | identification within third-party archives. 188 | 189 | Copyright [yyyy] [name of copyright owner] 190 | 191 | Licensed under the Apache License, Version 2.0 (the "License"); 192 | you may not use this file except in compliance with the License. 
193 | You may obtain a copy of the License at 194 | 195 | http://www.apache.org/licenses/LICENSE-2.0 196 | 197 | Unless required by applicable law or agreed to in writing, software 198 | distributed under the License is distributed on an "AS IS" BASIS, 199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 200 | See the License for the specific language governing permissions and 201 | limitations under the License. 202 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Hands-On Transfer Learning with Python 2 | ### Implement advanced deep learning and neural network models using Tensorflow and Keras 3 | 4 | With the world moving towards digitalization and automation, as a technologist/programmer it is important to keep oneself updated and learn how to leverage these tools and techniques. [*__"Hands-On Transfer Learning with Python"__*](https://github.com/dipanjanS/hands-on-transfer-learning-with-python#contents), is an attempt to help practitioners get acquainted with and equipped to use these advancements in their respective domains. This book is structured broadly into three sections: 5 | + Deep learning foundations 6 | + Essentials of transfer learning 7 | + Transfer learning case studies 8 | 9 | This repository contains all the code, notebooks and examples used in this book. We will also be adding bonus content here from time to time. So keep watching this space! 10 | 11 | 12 | ## Get the book 13 | 14 | 15 | 20 | 25 | 30 | 31 |
16 | 17 | packt 18 | 19 | 21 | 22 | safari 23 | 24 | 26 | 27 | amazon 28 | 29 |
32 | 33 | ## About the book 34 | 35 | Book Cover 36 | 37 | 38 | Transfer learning is a machine learning (ML) technique where knowledge gained during the training of one set of ML problems can be used to train other similar types of problems. The purpose of this book is two-fold. We focus on detailed coverage of deep learning and transfer learning, comparing and contrasting the two with easy-to-follow concepts and examples. The second area of focus will be on real-world examples and research problems using [`tensorflow`](https://www.tensorflow.org/), [`keras`](https://keras.io/), and the Python ecosystem with hands-on examples. 39 | 40 | The book starts with core essential concepts of ML and deep learning, followed by some depictions and coverage of important deep learning architectures, such as CNNs, DNNs, RNNs, LSTMs, and capsule networks. Our focus then shifts to transfer learning concepts and pretrained state of the art networks such as VGG, Inception, and ResNet. We also learn how these systems can be leveraged to improve performance of our deep learning models. Finally, we focus on a multitude of real-world case studies and problems in areas such as computer vision, audio analysis, and natural language processing (NLP). By the end of this book, you will be all ready to implement both deep learning and transfer learning principles in your own systems. 41 | 42 |
43 | Edition: 1st   Pages: 438   Language: English
44 | Book Title: Hands-On Transfer Learning with Python   Publisher: Packt
45 | Copyright: Sarkar, Bali & Ghosh   ISBN 13: 9781788831307
46 |
47 | 48 | 49 |
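As a quick, illustrative taste of what the book covers, the snippet below shows the feature-extraction flavour of transfer learning with Keras: a VGG-16 convolutional base pretrained on ImageNet is frozen and a small classifier is trained on top for a new task. This is only a minimal sketch in the spirit of the Chapter 5 notebooks, not the exact code from the book; the input size, layer widths, optimizer, and the binary (e.g. cats vs. dogs) task are placeholder choices.

```python
from keras.applications import vgg16
from keras.models import Model
from keras.layers import Dense, Dropout, Flatten

# Load VGG-16 pretrained on ImageNet, without its fully connected classifier head
vgg = vgg16.VGG16(include_top=False, weights='imagenet',
                  input_shape=(150, 150, 3))

# Freeze the convolutional base so only the new head gets trained
for layer in vgg.layers:
    layer.trainable = False

# Attach a small classifier on top of the frozen feature extractor
x = Flatten()(vgg.output)
x = Dense(512, activation='relu')(x)
x = Dropout(0.3)(x)
out = Dense(1, activation='sigmoid')(x)  # binary task, e.g. cats vs. dogs

model = Model(inputs=vgg.input, outputs=out)
model.compile(optimizer='rmsprop',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
# model.fit(...) would then be run on your own (typically small) labelled dataset
```

Fine-tuning (unfreezing some of the top convolutional layers and training them with a low learning rate) follows the same pattern and is the natural next step explored in the book's transfer learning chapters.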
50 | 51 | ## [Contents](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#book-contents) 52 | 53 | - [__Part I: Deep learning foundations__](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#part-i-deep-learning-foundations) 54 | - [Chapter 1: Machine Learning Fundamentals Basics](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch01%20-%20Machine%20Learning%20Fundamentals) 55 | - [Chapter 2: Deep Learning Essentials Basics](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch02%20-%20Deep%20Learning%20Essentials) 56 | - Chapter 3: Understanding Deep Learning Architectures Basics 57 | - [__Part II: Essentials of transfer learning__](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#part-ii-essentials-of-transfer-learning) 58 | - Chapter 4: Transfer Learning Fundamentals Basics 59 | - [Chapter 5: Unleashing the Power of Transfer Learning](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch05%20-%20Unleash%20the%20Power%20of%20Transfer%20Learning) 60 | - [__Part III: Transfer learning case studies__](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#part-iii-transfer-learning-case-studies) 61 | - [Chapter 6: Image Recognition and Classification](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch06%20-%20Image%20Recognition%20and%20Classification) 62 | - [Chapter 7: Text Document Categorization](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch07%20-%20Text%20Document%20Categorization) 63 | - [Chapter 8: Audio Identification and Classification](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch08%20-%20Audio%20Identification%20and%20Categorization) 64 | - [Chapter 9: Deep Dream](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch09%20-%20Deep%20Dream) 65 | - [Chapter 10: Style Transfer](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch10%20-%20Neural%20Style%20Transfer) 66 | - [Chapter 11: Automated Image Caption Generator](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch11%20-%20Automated%20Image%20Caption%20Generator) 67 | - [Chapter 12: Image Colorization](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch12%20-%20Image%20Colorization) 68 | 69 | 70 | ## Key Features: 71 | + Build deep learning models with transfer learning principles in Python 72 | + Implement transfer learning to solve real-world research problems 73 | + Perform complex operations such as image captioning neural style transfer 74 | 75 | ## What You Will Learn: 76 | + Set up your own DL environment with graphics processing unit (GPU) and Cloud support 77 | + Delve into transfer learning principles with ML and DL models 78 | + Explore various DL architectures, including CNN, LSTM, and capsule networks 79 | + Learn about data and network representation and loss functions 80 | + Get to grips with models and strategies in transfer learning 81 | + Walk through potential challenges in building complex transfer learning models from scratch 82 | + Explore real-world research problems related to computer vision and audio analysis 83 | + Understand how transfer 
learning can be leveraged in NLP 84 | 85 |
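To make the last item above (transfer learning for NLP) a little more concrete, here is a small, hedged sketch of reusing pretrained word embeddings inside a Keras text classifier, in the spirit of the Chapter 7 text categorization examples. The `embedding_matrix` below is a stand-in (initialized to zeros); in practice it would be populated from pretrained word2vec/GloVe vectors for your corpus vocabulary, and the vocabulary size, sequence length, and layer sizes are illustrative values rather than the book's exact settings.

```python
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, GlobalAveragePooling1D, Dense

vocab_size, embed_dim, max_len = 10000, 100, 200

# Placeholder for a (vocab_size x embed_dim) matrix of pretrained word vectors;
# filling it from word2vec/GloVe lookups is corpus-specific and only assumed here.
embedding_matrix = np.zeros((vocab_size, embed_dim))

model = Sequential([
    # Frozen embedding layer: the pretrained word vectors are the transferred knowledge
    Embedding(vocab_size, embed_dim,
              weights=[embedding_matrix],
              input_length=max_len,
              trainable=False),
    GlobalAveragePooling1D(),
    Dense(64, activation='relu'),
    Dense(1, activation='sigmoid')  # e.g. positive/negative sentiment
])
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])
model.summary()
```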
86 | 87 | ## Audience 88 | Hands-On Transfer Learning with Python is for data scientists, ML engineers, analysts, and developers with an interest in data and applying state-of-the-art transfer learning methodologies to solve tough real-world problems. 89 | __Basic proficiency in ML and Python is required.__ 90 | 91 | ## Acknowledgements 92 | TBA 93 | -------------------------------------------------------------------------------- /media/banners/amazon_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/amazon_logo.png -------------------------------------------------------------------------------- /media/banners/front_cover.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/front_cover.png -------------------------------------------------------------------------------- /media/banners/packt_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/packt_logo.png -------------------------------------------------------------------------------- /media/banners/safari_logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/safari_logo.png -------------------------------------------------------------------------------- /notebooks/Ch01 - Machine Learning Fundamentals/battles.csv: -------------------------------------------------------------------------------- 1 | name,year,battle_number,attacker_king,defender_king,attacker_1,attacker_2,attacker_3,attacker_4,defender_1,defender_2,defender_3,defender_4,attacker_outcome,battle_type,major_death,major_capture,attacker_size,defender_size,attacker_commander,defender_commander,summer,location,region,note Battle of the Golden Tooth,298,1,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,1,0,15000,4000,Jaime Lannister,"Clement Piper, Vance",1,Golden Tooth,The Westerlands, Battle at the Mummer's Ford,298,2,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Baratheon,,,,win,ambush,1,0,,120,Gregor Clegane,Beric Dondarrion,1,Mummer's Ford,The Riverlands, Battle of Riverrun,298,3,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,0,1,15000,10000,"Jaime Lannister, Andros Brax","Edmure Tully, Tytos Blackwood",1,Riverrun,The Riverlands, Battle of the Green Fork,298,4,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,1,18000,20000,"Roose Bolton, Wylis Manderly, Medger Cerwyn, Harrion Karstark, Halys Hornwood","Tywin Lannister, Gregor Clegane, Kevan Lannister, Addam Marbrand",1,Green Fork,The Riverlands, Battle of the Whispering Wood,298,5,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,1875,6000,"Robb Stark, Brynden Tully",Jaime Lannister,1,Whispering Wood,The Riverlands, Battle of the Camps,298,6,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,0,0,6000,12625,"Robb Stark, Tytos Blackwood, Brynden Tully","Lord Andros Brax, Forley 
Prester",1,Riverrun,The Riverlands, Sack of Darry,298,7,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Darry,,,,win,pitched battle,0,0,,,Gregor Clegane,Lyman Darry,1,Darry,The Riverlands, Battle of Moat Cailin,299,8,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,pitched battle,0,0,,,Victarion Greyjoy,,1,Moat Cailin,The North, Battle of Deepwood Motte,299,9,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,siege,0,0,1000,,Asha Greyjoy,,1,Deepwood Motte,The North, Battle of the Stony Shore,299,10,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,0,264,,Theon Greyjoy,,1,Stony Shore,The North,"Greyjoy's troop number based on the Battle of Deepwood Motte, in which Asha had 1000 soldier on 30 longships. That comes out to ~33 per longship. In the Battle of the Stony Shore, Theon has 8 longships, and just we can estimate that he has 8*33 =265 troops." Battle of Torrhen's Square,299,11,Robb Stark,Balon/Euron Greyjoy,Stark,,,,Greyjoy,,,,win,pitched battle,0,0,244,900,"Rodrik Cassel, Cley Cerwyn",Dagmer Cleftjaw,1,Torrhen's Square,The North,Greyjoy's troop number comes from the 264 estimate to have arrived on the stony shore minus the 20 Theon takes to attack Winterfell. Thus 264-20=244 Battle of Winterfell,299,12,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,1,20,,Theon Greyjoy,Bran Stark,1,Winterfell,The North,"It isn't mentioned how many Stark men are left in Winterfell, other than ""very few""." Sack of Torrhen's Square,299,13,Balon/Euron Greyjoy,Balon/Euron Greyjoy,Greyjoy,,,,Stark,,,,win,siege,0,1,,,Dagmer Cleftjaw,,1,Torrhen's Square,The North, Sack of Winterfell,299,14,Joffrey/Tommen Baratheon,Robb Stark,Bolton,Greyjoy,,,Stark,,,,win,ambush,1,0,618,2000,"Ramsay Snow, Theon Greyjoy ","Rodrik Cassel, Cley Cerwyn, Leobald Tallhart",1,Winterfell,The North,"Since House Bolton betrays the Starks for House Lannister, we code this battle as between these two houses. Greyjoy men, numbering only 20, don't play a major part in the fighting and end up dying anyway." 
Battle of Oxcross,299,15,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,6000,10000,"Robb Stark, Brynden Tully","Stafford Lannister, Roland Crakehall, Antario Jast",1,Oxcross,The Westerlands, Siege of Storm's End,299,16,Stannis Baratheon,Renly Baratheon,Baratheon,,,,Baratheon,,,,win,siege,1,0,5000,20000,"Stannis Baratheon, Davos Seaworth","Renly Baratheon, Cortnay Penrose, Loras Tyrell, Randyll Tarly, Mathis Rowan",1,Storm's End,The Stormlands, Battle of the Fords,299,17,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,loss,pitched battle,0,0,20000,10000,"Tywin Lannister, Flement Brax, Gregor Clegane, Addam Marbrand, Lyle Crakehall, Leo Lefford","Edmure Tully, Jason Mallister, Karyl Vance",1,Red Fork,The Riverlands, Sack of Harrenhal,299,18,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,1,0,100,100,"Roose Bolton, Vargo Hoat, Robett Glover",Amory Lorch,1,Harrenhal,The Riverlands, Battle of the Crag,299,19,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,0,0,6000,,"Robb Stark, Smalljon Umber, Black Walder Frey",Rolph Spicer,1,Crag,The Westerlands, Battle of the Blackwater,299,20,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,,,,Lannister,,,,loss,pitched battle,1,1,21000,7250,"Stannis Baratheon, Imry Florent, Guyard Morrigen, Rolland Storm, Salladhor Saan, Davos Seaworth","Tyrion Lannister, Jacelyn Bywater, Sandor Clegane, Tywin Lannister, Garlan Tyrell, Mace Tyrell, Randyll Tarly",1,King's Landing,The Crownlands, Siege of Darry,299,21,Robb Stark,Joffrey/Tommen Baratheon,Darry,,,,Lannister,,,,win,siege,0,0,,,Helman Tallhart,,1,Darry,The Riverlands, Battle of Duskendale,299,22,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,0,3000,,"Robertt Glover, Helman Tallhart","Randyll Tarly, Gregor Clegane",1,Duskendale,The Crownlands, Battle of the Burning Septry,299,23,,,Brotherhood without Banners,,,,Brave Companions,,,,win,pitched battle,0,0,,,,,1,,The Riverlands, Battle of the Ruby Ford,299,24,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Stark,,,,win,pitched battle,0,0,,6000,Gregor Clegane,"Roose Bolton, Wylis Manderly",,Ruby Ford,The Riverlands, Retaking of Harrenhal,299,25,Joffrey/Tommen Baratheon,,Lannister,,,,Brave Companions,,,,win,pitched battle,1,0,,,Gregor Clegane,Vargo Hoat,1,Harrenhal,The Riverlands, The Red Wedding,299,26,Joffrey/Tommen Baratheon,Robb Stark,Frey,Bolton,,,Stark,,,,win,ambush,1,1,3500,3500,"Walder Frey, Roose Bolton, Walder Rivers",Robb Stark,1,The Twins,The Riverlands,"This observation refers to the battle against the Stark men, not the attack on the wedding" Siege of Seagard,299,27,Robb Stark,Joffrey/Tommen Baratheon,Frey,,,,Mallister,,,,win,siege,0,1,,,Walder Frey,Jason Mallister,1,Seagard,The Riverlands, Battle of Castle Black,300,28,Stannis Baratheon,Mance Rayder,Free folk,Thenns,Giants,,Night's Watch,Baratheon,,,loss,siege,1,1,100000,1240,"Mance Rayder, Tormund Giantsbane, Harma Dogshead, Magnar Styr, Varamyr","Stannis Baratheon, Jon Snow, Donal Noye, Cotter Pyke",0,Castle Black,Beyond the Wall, Fall of Moat Cailin,300,29,Joffrey/Tommen Baratheon,Balon/Euron Greyjoy,Bolton,,,,Greyjoy,,,,win,siege,0,0,,,Ramsey Bolton,,0,Moat Cailin,The North, Sack of Saltpans,300,30,,,Brave Companions,,,,,,,,win,razing,0,0,,,Rorge,,0,Saltpans,The Riverlands, Retaking of Deepwood Motte,300,31,Stannis Baratheon,Balon/Euron Greyjoy,Baratheon,Karstark,Mormont,Glover,Greyjoy,,,,win,pitched battle,0,0,4500,200,"Stannis Baratheon, Alysane Mormot",Asha Greyjoy,0,Deepwood 
Motte,The North, Battle of the Shield Islands,300,32,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,pitched battle,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,Shield Islands,The Reach, "Invasion of Ryamsport, Vinetown, and Starfish Harbor",300,33,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,razing,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,"Ryamsport, Vinetown, Starfish Harbor",The Reach, Second Seige of Storm's End,300,34,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,,200,"Mace Tyrell, Mathis Rowan",Gilbert Farring,0,Storm's End,The Stormlands, Siege of Dragonstone,300,35,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,2000,,"Loras Tyrell, Raxter Redwyne",Rolland Storm,0,Dragonstone,The Stormlands, Siege of Riverrun,300,36,Joffrey/Tommen Baratheon,Robb Stark,Lannister,Frey,,,Tully,,,,win,siege,0,0,3000,,"Daven Lannister, Ryman Fey, Jaime Lannister",Brynden Tully,0,Riverrun,The Riverlands, Siege of Raventree,300,37,Joffrey/Tommen Baratheon,Robb Stark,Bracken,Lannister,,,Blackwood,,,,win,siege,0,1,1500,,"Jonos Bracken, Jaime Lannister",Tytos Blackwood,0,Raventree,The Riverlands, Siege of Winterfell,300,38,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,Karstark,Mormont,Glover,Bolton,Frey,,,,,,,5000,8000,Stannis Baratheon,Roose Bolton,0,Winterfell,The North, -------------------------------------------------------------------------------- /notebooks/Ch01 - Machine Learning Fundamentals/credit_default.xls: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/notebooks/Ch01 - Machine Learning Fundamentals/credit_default.xls -------------------------------------------------------------------------------- /notebooks/Ch01 - Machine Learning Fundamentals/feature_engineering_numerical_and_categorical_data.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Feature Engineering 5 | import numpy as np 6 | import pandas as pd 7 | 8 | # plotting 9 | import seaborn as sns 10 | import matplotlib.pyplot as plt 11 | 12 | # setting params 13 | params = {'legend.fontsize': 'x-large', 14 | 'figure.figsize': (30, 10), 15 | 'axes.labelsize': 'x-large', 16 | 'axes.titlesize':'x-large', 17 | 'xtick.labelsize':'x-large', 18 | 'ytick.labelsize':'x-large'} 19 | 20 | sns.set_style('whitegrid') 21 | sns.set_context('talk') 22 | 23 | 24 | # ## Feature Engineering : Numerical Data 25 | 26 | # load dataset 27 | credit_df = pd.read_excel('credit_default.xls', 28 | skiprows=1,index_col=0) 29 | credit_df.shape 30 | 31 | 32 | credit_df.head() 33 | 34 | 35 | # ### Extract Raw Features 36 | # Attributes which are useful in their raw form itself 37 | 38 | credit_df[['LIMIT_BAL','BILL_AMT1', 39 | 'BILL_AMT2','BILL_AMT3', 40 | 'BILL_AMT4','BILL_AMT5', 41 | 'BILL_AMT6']].head() 42 | 43 | 44 | # ### Counts 45 | # Based on requirements, count of events is also a useful attribute. 
46 | 47 | # utility function 48 | def default_month_count(row): 49 | count = 0 50 | for i in [0,2,3,4,5,6]: 51 | if row['PAY_'+str(i)] > 0: 52 | count +=1 53 | return count 54 | 55 | 56 | 57 | credit_df['number_of_default_months'] = credit_df.apply(default_month_count, 58 | axis=1) 59 | 60 | 61 | credit_df[['number_of_default_months']].head() 62 | 63 | 64 | # ### Binarization 65 | # Occurance or absence of an event is also a useful feature 66 | 67 | credit_df['has_ever_defaulted'] = credit_df.number_of_default_months.apply(lambda x: 1 if x>0 else 0) 68 | credit_df[['number_of_default_months','has_ever_defaulted']].head() 69 | 70 | 71 | # ### Binning 72 | # Also known as quantization, helps in transformin continuous features such as 73 | # age and income onto discrete scales. 74 | 75 | credit_df.AGE.plot(kind='hist',bins=60) 76 | plt.title('Age Histogram', fontsize=12) 77 | plt.ylabel('Age', fontsize=12) 78 | plt.xlabel('Frequency', fontsize=12) 79 | 80 | 81 | # #### Fixed Width Bins 82 | 83 | # Fixed Width Bins : 84 | # 85 | # ``` 86 | # Age Range: Bin 87 | # --------------- 88 | # 0 - 9 : 0 89 | # 10 - 19 : 1 90 | # 20 - 29 : 2 91 | # 30 - 39 : 3 92 | # ... and so on 93 | # ``` 94 | 95 | # Assign a bin label to each row 96 | credit_df['age_bin_fixed'] = credit_df.AGE.apply(lambda age: np.floor(age/10.)) 97 | 98 | 99 | credit_df[['AGE','age_bin_fixed']].head() 100 | 101 | 102 | # #### Quantile Based Binning 103 | # * 4-Quartile Binning 104 | 105 | ## Quantile binning 106 | quantile_list = [0, .25, .5, .75, 1.] 107 | quantiles = credit_df.AGE.quantile(quantile_list) 108 | quantiles 109 | 110 | 111 | # Plot Quartile Ranges on the Distribution 112 | 113 | fig, ax = plt.subplots() 114 | credit_df.AGE.plot(kind='hist',bins=60) 115 | 116 | for quantile in quantiles: 117 | qvl = plt.axvline(quantile, color='r') 118 | ax.legend([qvl], ['Quantiles'], fontsize=10) 119 | 120 | ax.set_title('Age Histogram with Quantiles', fontsize=12) 121 | ax.set_xlabel('Age', fontsize=12) 122 | ax.set_ylabel('Frequency', fontsize=12) 123 | 124 | 125 | # Assign Quartile Bin Labels 126 | quantile_labels = ['Q1', 'Q2', 'Q3', 'Q4'] 127 | credit_df['age_quantile_range'] = pd.qcut(credit_df['AGE'], 128 | q=quantile_list) 129 | credit_df['age_quantile_label'] = pd.qcut(credit_df['AGE'], 130 | q=quantile_list, 131 | labels=quantile_labels) 132 | 133 | 134 | credit_df[['AGE','age_quantile_range','age_quantile_label']].head() 135 | 136 | 137 | 138 | # ## Feature Engineering : Categorical Data 139 | # We have utilized multiple publicly available datasets to better understand 140 | # categorical attributes 141 | 142 | battles_df = pd.read_csv('battles.csv') 143 | battles_df.shape 144 | 145 | 146 | battles_df[['name','year','attacker_king','attacker_1']].head() 147 | 148 | 149 | # ### Transforming Nominal Features 150 | # Categorical attributes which ***do not*** have any intrinsic 151 | # ordering amongst them 152 | 153 | from sklearn.preprocessing import LabelEncoder 154 | 155 | attacker_le = LabelEncoder() 156 | attacker_labels = attacker_le.fit_transform(battles_df.attacker_1) 157 | attacker_mappings = {index: label for index, label in enumerate(attacker_le.classes_)} 158 | attacker_mappings 159 | 160 | 161 | # assign labels 162 | battles_df['attacker1_label'] = attacker_labels 163 | battles_df[['name','year','attacker_king','attacker_1','attacker1_label']].head() 164 | 165 | 166 | # ### Transforming Ordinal Features 167 | # Categorical attributes which ***have*** an intrinsic ordering amongst them 168 | 169 | sales_df = 
pd.DataFrame(data={ 170 | 'items_sold':abs(np.random.randn(7)*100), 171 | 'day_of_week':['Monday', 'Tuesday', 172 | 'Wednesday', 'Thursday', 173 | 'Friday', 'Saturday', 174 | 'Sunday']}) 175 | sales_df 176 | 177 | day_map = {'Monday': 1, 'Tuesday': 2, 178 | 'Wednesday': 3, 'Thursday': 4, 179 | 'Friday': 5, 'Saturday': 6, 180 | 'Sunday' : 7} 181 | 182 | sales_df['weekday_label'] = sales_df['day_of_week'].map(day_map) 183 | sales_df.head() 184 | 185 | 186 | # ### Encoding Categoricals 187 | 188 | # One Hot Encoder 189 | 190 | from sklearn.preprocessing import OneHotEncoder 191 | 192 | day_le = LabelEncoder() 193 | day_labels = day_le.fit_transform(sales_df['day_of_week']) 194 | sales_df['label_encoder_day_label'] = day_labels 195 | 196 | # encode day labels using one-hot encoding scheme 197 | day_ohe = OneHotEncoder() 198 | day_feature_arr = day_ohe.fit_transform(sales_df[['label_encoder_day_label']]).toarray() 199 | day_feature_labels = list(day_le.classes_) 200 | day_features = pd.DataFrame(day_feature_arr, columns=day_feature_labels) 201 | 202 | 203 | sales_ohe_df = pd.concat([sales_df, day_features], axis=1) 204 | sales_ohe_df 205 | 206 | 207 | # Dummy Encoder 208 | 209 | 210 | day_dummy_features = pd.get_dummies(sales_df['day_of_week'], drop_first=True) 211 | pd.concat([sales_df[['day_of_week','items_sold']], day_dummy_features], axis=1) 212 | 213 | -------------------------------------------------------------------------------- /notebooks/Ch01 - Machine Learning Fundamentals/feature_engineering_text_data.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Feature Engineering 5 | # Textual Data 6 | # Important Imports 7 | 8 | import numpy as np 9 | import pandas as pd 10 | 11 | from sklearn.feature_extraction.text import CountVectorizer 12 | from sklearn.feature_extraction.text import TfidfVectorizer 13 | 14 | 15 | # Prepare a Sample Corpus 16 | 17 | corpus = ['pack my box with five dozen liquor jugs.', 18 | 'pack my box', 19 | 'the quick brown fox jumps over the lazy dog.', 20 | 'the brown fox is quick and the blue dog is lazy', 21 | 'pack my box with five dozen liquor jugs and biscuits', 22 | 'the dog is lazy but the brown fox is quick'] 23 | 24 | labels = ['picnic', 'picnic', 'animals', 'animals', 'picnic', 'animals'] 25 | corpus = np.array(corpus) 26 | corpus_df = pd.DataFrame({'document': corpus, 27 | 'category': labels}) 28 | corpus_df = corpus_df[['document', 'category']] 29 | corpus_df 30 | 31 | 32 | # Bag of Words 33 | 34 | cv = CountVectorizer(min_df=0., max_df=1.) 
35 | cv_matrix = cv.fit_transform(corpus_df.document) 36 | cv_matrix = cv_matrix.toarray() 37 | cv_matrix 38 | 39 | 40 | # TF-IDF 41 | 42 | tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True) 43 | tv_matrix = tv.fit_transform(corpus_df.document) 44 | tv_matrix = tv_matrix.toarray() 45 | 46 | vocab = tv.get_feature_names() 47 | pd.DataFrame(np.round(tv_matrix, 2), columns=vocab) 48 | 49 | 50 | # N-Gram Vectorizer 51 | 52 | bv = CountVectorizer(ngram_range=(2,2)) 53 | bv_matrix = bv.fit_transform(corpus_df.document) 54 | bv_matrix = bv_matrix.toarray() 55 | vocab = bv.get_feature_names() 56 | pd.DataFrame(bv_matrix, columns=vocab) 57 | 58 | -------------------------------------------------------------------------------- /notebooks/Ch01 - Machine Learning Fundamentals/game_of_thrones_eda.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # ## Import required packages 5 | import numpy as np 6 | import pandas as pd 7 | from collections import Counter 8 | 9 | # plotting 10 | import seaborn as sns 11 | import matplotlib.pyplot as plt 12 | 13 | # setting params 14 | params = {'legend.fontsize': 'x-large', 15 | 'figure.figsize': (30, 10), 16 | 'axes.labelsize': 'x-large', 17 | 'axes.titlesize':'x-large', 18 | 'xtick.labelsize':'x-large', 19 | 'ytick.labelsize':'x-large'} 20 | 21 | sns.set_style('whitegrid') 22 | sns.set_context('talk') 23 | 24 | plt.rcParams.update(params) 25 | 26 | 27 | # ## Load Dataset 28 | # 29 | # In this step we load the ```battles.csv``` for analysis 30 | # load dataset 31 | battles_df = pd.read_csv('battles.csv') 32 | 33 | 34 | # Display sample rows 35 | print(battles_df.head()) 36 | 37 | 38 | # ## Explore raw properties 39 | print("Number of attributes available in the dataset = {}".format(battles_df.shape[1])) 40 | 41 | 42 | # View available columns and their data types 43 | print(battles_df.dtypes) 44 | 45 | 46 | # Analyze properties of numerical columns 47 | battles_df.describe() 48 | 49 | # ## Number of Battles Fought 50 | # This data is till **season 5** only 51 | 52 | print("Number of battles fought={}".format(battles_df.shape[0])) 53 | 54 | 55 | # ## Battle Distribution Across Years 56 | # The plot below shows that maximum bloodshed happened in the year 299 with 57 | # a total of 20 battles fought! 58 | 59 | sns.countplot(y='year',data=battles_df) 60 | plt.title('Battle Distribution over Years') 61 | plt.show() 62 | 63 | 64 | # ## Which Regions saw most Battles? 65 | 66 | sns.countplot(x='region',data=battles_df) 67 | plt.title('Battles by Regions') 68 | plt.show() 69 | 70 | 71 | # ### Death or Capture of Main Characters by Region 72 | 73 | # No prizes for guessing that Riverlands have seen some of the main characters 74 | # being killed or captured. Though _The Reach_ has seen 2 battles, none of the 75 | # major characters seemed to have fallen there. 76 | f, ax1 = plt.subplots() 77 | ax2 = ax1.twinx() 78 | temp_df = battles_df.groupby('region').agg({'major_death':'sum', 79 | 'major_capture':'sum'}).reset_index() 80 | temp_df.loc[:,'dummy'] = 'dummy' 81 | sns.barplot(x="region", y="major_death", 82 | hue='dummy', data=temp_df, 83 | estimator = np.sum, ax = ax1, 84 | hue_order=['dummy','other']) 85 | 86 | sns.barplot(x="region", y="major_capture", 87 | data=temp_df, hue='dummy', 88 | estimator = np.sum, ax = ax2, 89 | hue_order=['other','dummy']) 90 | 91 | ax1.legend_.remove() 92 | ax2.legend_.remove() 93 | 94 | 95 | # ## Who Attacked the most? 
96 | # The Baratheon boys love attacking as they lead the pack with 38% while 97 | # Rob Stark has been the attacker in close second with 27.8% of the battles. 98 | attacker_king = battles_df.attacker_king.value_counts() 99 | attacker_king.name='' # turn off annoying y-axis-label 100 | attacker_king.plot.pie(figsize=(6, 6),autopct='%.2f') 101 | 102 | 103 | # ## Who Defended the most? 104 | # Rob Stark and Baratheon boys are again on the top of the pack. Looks like 105 | # they have been on either sides of the war lot many times. 106 | 107 | defender_king = battles_df.defender_king.value_counts() 108 | defender_king.name='' # turn off annoying y-axis-label 109 | defender_king.plot.pie(figsize=(6, 6),autopct='%.2f') 110 | 111 | 112 | # ## Battle Style Distribution 113 | # Plenty of battles all across, yet the men of Westeros and Essos are men of honor. 114 | # This is visible in the distribution which shows **pitched battle** as the 115 | # most common style of battle. 116 | 117 | sns.countplot(y='battle_type',data=battles_df) 118 | plt.title('Battle Type Distribution') 119 | plt.show() 120 | 121 | 122 | # ## Attack or Defend? 123 | # Defending your place in Westeros isn't easy, this is clearly visible from 124 | # the fact that 32 out of 37 battles were won by attackers 125 | 126 | sns.countplot(y='attacker_outcome',data=battles_df) 127 | plt.title('Attack Win/Loss Distribution') 128 | plt.show() 129 | 130 | 131 | # ## Winners 132 | # Who remembers losers? (except if you love the Starks) 133 | # The following plot helps us understand who won how many battles and how, 134 | # by attacking or defending. 135 | 136 | attack_winners = battles_df[battles_df.\ 137 | attacker_outcome=='win']\ 138 | ['attacker_king'].\ 139 | value_counts().\ 140 | reset_index() 141 | 142 | attack_winners.rename( 143 | columns={'index':'king', 144 | 'attacker_king':'wins'}, 145 | inplace=True) 146 | 147 | attack_winners.loc[:,'win_type'] = 'attack' 148 | 149 | defend_winners = battles_df[battles_df.\ 150 | attacker_outcome=='loss']\ 151 | ['defender_king'].\ 152 | value_counts().\ 153 | reset_index() 154 | defend_winners.rename( 155 | columns={'index':'king', 156 | 'defender_king':'wins'}, 157 | inplace=True) 158 | 159 | defend_winners.loc[:,'win_type'] = 'defend' 160 | 161 | 162 | sns.barplot(x="king", 163 | y="wins", 164 | hue="win_type", 165 | data=pd.concat([attack_winners, 166 | defend_winners])) 167 | plt.title('Kings and Their Wins') 168 | plt.ylabel('wins') 169 | plt.xlabel('king') 170 | plt.show() 171 | 172 | 173 | # ## Battle Commanders 174 | # A battle requires as much brains as muscle power. 175 | # The following is a distribution of the number of commanders involved on attacking and defending sides. 176 | 177 | battles_df['attack_commander_count'] = battles_df.\ 178 | dropna(subset=['attacker_commander']).\ 179 | apply(lambda row: \ 180 | len(row['attacker_commander'].\ 181 | split()),axis=1) 182 | battles_df['defend_commander_count'] = battles_df.\ 183 | dropna(subset=['defender_commander']).\ 184 | apply(lambda row: \ 185 | len(row['defender_commander'].\ 186 | split()),axis=1) 187 | 188 | battles_df[['attack_commander_count', 189 | 'defend_commander_count']].plot(kind='box') 190 | 191 | 192 | # ## How many houses fought in a battle? 193 | # Were the battles evenly balanced? The plots tell the whole story. 
194 | battles_df['attacker_house_count'] = (4 - battles_df[['attacker_1', 195 | 'attacker_2', 196 | 'attacker_3', 197 | 'attacker_4']].\ 198 | isnull().sum(axis = 1)) 199 | 200 | battles_df['defender_house_count'] = (4 - battles_df[['defender_1', 201 | 'defender_2', 202 | 'defender_3', 203 | 'defender_4']].\ 204 | isnull().sum(axis = 1)) 205 | 206 | battles_df['total_involved_count'] = battles_df.apply(lambda row: \ 207 | row['attacker_house_count'] + \ 208 | row['defender_house_count'], 209 | axis=1) 210 | battles_df['bubble_text'] = battles_df.apply(lambda row: \ 211 | '{} had {} house(s) attacking {} house(s) '.\ 212 | format(row['name'], 213 | row['attacker_house_count'], 214 | row['defender_house_count']), 215 | axis=1) 216 | 217 | 218 | # ## Unbalanced Battles 219 | # Most battles so far have seen more houses forming alliances while attacking. 220 | # There are only a few friends when you are under attack! 221 | 222 | house_balance = battles_df[ 223 | battles_df.attacker_house_count != \ 224 | battles_df.defender_house_count][['name', 225 | 'attacker_house_count', 226 | 'defender_house_count']].\ 227 | set_index('name') 228 | house_balance.plot(kind='bar') 229 | 230 | 231 | # ## Battles and The size of Armies 232 | # Attackers don't take any chances, they come in huge numbers, keep your eyes open 233 | 234 | army_size_df = battles_df.dropna(subset=['total_involved_count', 235 | 'attacker_size', 236 | 'defender_size', 237 | 'bubble_text']) 238 | army_size_df.plot(kind='scatter', x='defender_size',y='attacker_size', 239 | s=army_size_df['total_involved_count']*150) 240 | 241 | 242 | # ## Archenemies? 243 | # The Stark-Baratheon friendship has taken a complete U-turn with a total of 19 battles and counting. Indeed there is no one to be trusted in this land. 244 | 245 | temp_df = battles_df.dropna( 246 | subset = ["attacker_king", 247 | "defender_king"])[ 248 | ["attacker_king", 249 | "defender_king"] 250 | ] 251 | 252 | archenemy_df = pd.DataFrame( 253 | list(Counter( 254 | [tuple(set(king_pair)) 255 | for king_pair in temp_df.values 256 | if len(set(king_pair))>1]). 
257 | items()), 258 | columns=['king_pair', 259 | 'battle_count']) 260 | 261 | archenemy_df['versus_text'] = archenemy_df.\ 262 | apply( 263 | lambda row: 264 | '{} Vs {}'.format( 265 | row[ 266 | 'king_pair' 267 | ][0], 268 | row[ 269 | 'king_pair' 270 | ][1]), 271 | axis=1) 272 | archenemy_df.sort_values('battle_count', 273 | inplace=True, 274 | ascending=False) 275 | 276 | 277 | archenemy_df[['versus_text', 278 | 'battle_count']].set_index('versus_text', 279 | inplace=True) 280 | sns.barplot(data=archenemy_df, 281 | x='versus_text', 282 | y='battle_count') 283 | plt.xticks(rotation=45) 284 | plt.xlabel('Archenemies') 285 | plt.ylabel('Number of Battles') 286 | plt.title('Archenemies') 287 | plt.show() -------------------------------------------------------------------------------- /notebooks/Ch05 - Unleash the Power of Transfer Learning/Datasets Builder.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import glob\n", 10 | "import numpy as np\n", 11 | "import os\n", 12 | "import shutil\n", 13 | "from utils import log_progress\n", 14 | "\n", 15 | "np.random.seed(42)" 16 | ] 17 | }, 18 | { 19 | "cell_type": "code", 20 | "execution_count": 3, 21 | "metadata": {}, 22 | "outputs": [ 23 | { 24 | "data": { 25 | "text/plain": [ 26 | "(12500, 12500)" 27 | ] 28 | }, 29 | "execution_count": 3, 30 | "metadata": {}, 31 | "output_type": "execute_result" 32 | } 33 | ], 34 | "source": [ 35 | "files = glob.glob('train/*')\n", 36 | "\n", 37 | "cat_files = [fn for fn in files if 'cat' in fn]\n", 38 | "dog_files = [fn for fn in files if 'dog' in fn]\n", 39 | "len(cat_files), len(dog_files)" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 5, 45 | "metadata": {}, 46 | "outputs": [ 47 | { 48 | "name": "stdout", 49 | "output_type": "stream", 50 | "text": [ 51 | "Cat datasets: (1500,) (500,) (500,)\n", 52 | "Dog datasets: (1500,) (500,) (500,)\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "cat_train = np.random.choice(cat_files, size=1500, replace=False)\n", 58 | "dog_train = np.random.choice(dog_files, size=1500, replace=False)\n", 59 | "cat_files = list(set(cat_files) - set(cat_train))\n", 60 | "dog_files = list(set(dog_files) - set(dog_train))\n", 61 | "\n", 62 | "cat_val = np.random.choice(cat_files, size=500, replace=False)\n", 63 | "dog_val = np.random.choice(dog_files, size=500, replace=False)\n", 64 | "cat_files = list(set(cat_files) - set(cat_val))\n", 65 | "dog_files = list(set(dog_files) - set(dog_val))\n", 66 | "\n", 67 | "cat_test = np.random.choice(cat_files, size=500, replace=False)\n", 68 | "dog_test = np.random.choice(dog_files, size=500, replace=False)\n", 69 | "\n", 70 | "print('Cat datasets:', cat_train.shape, cat_val.shape, cat_test.shape)\n", 71 | "print('Dog datasets:', dog_train.shape, dog_val.shape, dog_test.shape)" 72 | ] 73 | }, 74 | { 75 | "cell_type": "code", 76 | "execution_count": 6, 77 | "metadata": {}, 78 | "outputs": [], 79 | "source": [ 80 | "train_dir = 'training_data'\n", 81 | "val_dir = 'validation_data'\n", 82 | "test_dir = 'test_data'\n", 83 | "\n", 84 | "train_files = np.concatenate([cat_train, dog_train])\n", 85 | "validate_files = np.concatenate([cat_val, dog_val])\n", 86 | "test_files = np.concatenate([cat_test, dog_test])\n", 87 | "\n", 88 | "os.mkdir(train_dir) if not os.path.isdir(train_dir) else None\n", 89 | "os.mkdir(val_dir) if not os.path.isdir(val_dir) else 
None\n", 90 | "os.mkdir(test_dir) if not os.path.isdir(test_dir) else None\n", 91 | "\n", 92 | "for fn in log_progress(train_files, name='Training Images'):\n", 93 | " shutil.copy(fn, train_dir)\n", 94 | "\n", 95 | "for fn in log_progress(validate_files, name='Validation Images'):\n", 96 | " shutil.copy(fn, val_dir)\n", 97 | " \n", 98 | "for fn in log_progress(test_files, name='Test Images'):\n", 99 | " shutil.copy(fn, test_dir)" 100 | ] 101 | } 102 | ], 103 | "metadata": { 104 | "anaconda-cloud": {}, 105 | "kernelspec": { 106 | "display_name": "Python [conda root]", 107 | "language": "python", 108 | "name": "conda-root-py" 109 | }, 110 | "language_info": { 111 | "codemirror_mode": { 112 | "name": "ipython", 113 | "version": 3 114 | }, 115 | "file_extension": ".py", 116 | "mimetype": "text/x-python", 117 | "name": "python", 118 | "nbconvert_exporter": "python", 119 | "pygments_lexer": "ipython3", 120 | "version": "3.5.2" 121 | }, 122 | "widgets": { 123 | "state": { 124 | "707cda043c794c6bbccbf3e3f0264e09": { 125 | "views": [ 126 | { 127 | "cell_index": 5 128 | } 129 | ] 130 | }, 131 | "76235700d807469ea94675fe7f2b6187": { 132 | "views": [ 133 | { 134 | "cell_index": 5 135 | } 136 | ] 137 | }, 138 | "7aadcb865964421fb6f976525e21adca": { 139 | "views": [ 140 | { 141 | "cell_index": 5 142 | } 143 | ] 144 | } 145 | }, 146 | "version": "1.2.0" 147 | } 148 | }, 149 | "nbformat": 4, 150 | "nbformat_minor": 2 151 | } 152 | -------------------------------------------------------------------------------- /notebooks/Ch05 - Unleash the Power of Transfer Learning/model_evaluation_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jul 31 20:05:23 2017 4 | 5 | @author: DIP 6 | @Copyright: Dipanjan Sarkar 7 | """ 8 | 9 | from sklearn import metrics 10 | import numpy as np 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | from sklearn.preprocessing import LabelEncoder 14 | from sklearn.base import clone 15 | from sklearn.preprocessing import label_binarize 16 | from scipy import interp 17 | from sklearn.metrics import roc_curve, auc 18 | 19 | 20 | def get_metrics(true_labels, predicted_labels): 21 | 22 | print('Accuracy:', np.round( 23 | metrics.accuracy_score(true_labels, 24 | predicted_labels), 25 | 4)) 26 | print('Precision:', np.round( 27 | metrics.precision_score(true_labels, 28 | predicted_labels, 29 | average='weighted'), 30 | 4)) 31 | print('Recall:', np.round( 32 | metrics.recall_score(true_labels, 33 | predicted_labels, 34 | average='weighted'), 35 | 4)) 36 | print('F1 Score:', np.round( 37 | metrics.f1_score(true_labels, 38 | predicted_labels, 39 | average='weighted'), 40 | 4)) 41 | 42 | 43 | def train_predict_model(classifier, 44 | train_features, train_labels, 45 | test_features, test_labels): 46 | # build model 47 | classifier.fit(train_features, train_labels) 48 | # predict using model 49 | predictions = classifier.predict(test_features) 50 | return predictions 51 | 52 | 53 | def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]): 54 | 55 | total_classes = len(classes) 56 | level_labels = [total_classes*[0], list(range(total_classes))] 57 | 58 | cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels, 59 | labels=classes) 60 | cm_frame = pd.DataFrame(data=cm, 61 | columns=pd.MultiIndex(levels=[['Predicted:'], classes], 62 | labels=level_labels), 63 | index=pd.MultiIndex(levels=[['Actual:'], classes], 64 | labels=level_labels)) 65 | 
print(cm_frame) 66 | 67 | def display_classification_report(true_labels, predicted_labels, classes=[1,0]): 68 | 69 | report = metrics.classification_report(y_true=true_labels, 70 | y_pred=predicted_labels, 71 | labels=classes) 72 | print(report) 73 | 74 | 75 | 76 | def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]): 77 | print('Model Performance metrics:') 78 | print('-'*30) 79 | get_metrics(true_labels=true_labels, predicted_labels=predicted_labels) 80 | print('\nModel Classification report:') 81 | print('-'*30) 82 | display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, 83 | classes=classes) 84 | print('\nPrediction Confusion Matrix:') 85 | print('-'*30) 86 | display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels, 87 | classes=classes) 88 | 89 | 90 | def plot_model_decision_surface(clf, train_features, train_labels, 91 | plot_step=0.02, cmap=plt.cm.RdYlBu, 92 | markers=None, alphas=None, colors=None): 93 | 94 | if train_features.shape[1] != 2: 95 | raise ValueError("X_train should have exactly 2 columnns!") 96 | 97 | x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step 98 | y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step 99 | xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), 100 | np.arange(y_min, y_max, plot_step)) 101 | 102 | clf_est = clone(clf) 103 | clf_est.fit(train_features,train_labels) 104 | if hasattr(clf_est, 'predict_proba'): 105 | Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1] 106 | else: 107 | Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()]) 108 | Z = Z.reshape(xx.shape) 109 | cs = plt.contourf(xx, yy, Z, cmap=cmap) 110 | 111 | le = LabelEncoder() 112 | y_enc = le.fit_transform(train_labels) 113 | n_classes = len(le.classes_) 114 | plot_colors = ''.join(colors) if colors else [None] * n_classes 115 | label_names = le.classes_ 116 | markers = markers if markers else [None] * n_classes 117 | alphas = alphas if alphas else [None] * n_classes 118 | for i, color in zip(range(n_classes), plot_colors): 119 | idx = np.where(y_enc == i) 120 | plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color, 121 | label=label_names[i], cmap=cmap, edgecolors='black', 122 | marker=markers[i], alpha=alphas[i]) 123 | plt.legend() 124 | plt.show() 125 | 126 | 127 | def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None): 128 | 129 | ## Compute ROC curve and ROC area for each class 130 | fpr = dict() 131 | tpr = dict() 132 | roc_auc = dict() 133 | if hasattr(clf, 'classes_'): 134 | class_labels = clf.classes_ 135 | elif label_encoder: 136 | class_labels = label_encoder.classes_ 137 | elif class_names: 138 | class_labels = class_names 139 | else: 140 | raise ValueError('Unable to derive prediction classes, please specify class_names!') 141 | n_classes = len(class_labels) 142 | y_test = label_binarize(true_labels, classes=class_labels) 143 | if n_classes == 2: 144 | if hasattr(clf, 'predict_proba'): 145 | prob = clf.predict_proba(features) 146 | y_score = prob[:, prob.shape[1]-1] 147 | elif hasattr(clf, 'decision_function'): 148 | prob = clf.decision_function(features) 149 | y_score = prob[:, prob.shape[1]-1] 150 | else: 151 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!") 152 | 153 | fpr, tpr, _ = roc_curve(y_test, y_score) 154 | roc_auc = auc(fpr, tpr) 155 | plt.plot(fpr, tpr, label='ROC curve (area = 
{0:0.2f})' 156 | ''.format(roc_auc), 157 | linewidth=2.5) 158 | 159 | elif n_classes > 2: 160 | if hasattr(clf, 'predict_proba'): 161 | y_score = clf.predict_proba(features) 162 | elif hasattr(clf, 'decision_function'): 163 | y_score = clf.decision_function(features) 164 | else: 165 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!") 166 | 167 | for i in range(n_classes): 168 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) 169 | roc_auc[i] = auc(fpr[i], tpr[i]) 170 | 171 | ## Compute micro-average ROC curve and ROC area 172 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) 173 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 174 | 175 | ## Compute macro-average ROC curve and ROC area 176 | # First aggregate all false positive rates 177 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) 178 | # Then interpolate all ROC curves at this points 179 | mean_tpr = np.zeros_like(all_fpr) 180 | for i in range(n_classes): 181 | mean_tpr += interp(all_fpr, fpr[i], tpr[i]) 182 | # Finally average it and compute AUC 183 | mean_tpr /= n_classes 184 | fpr["macro"] = all_fpr 185 | tpr["macro"] = mean_tpr 186 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) 187 | 188 | ## Plot ROC curves 189 | plt.figure(figsize=(6, 4)) 190 | plt.plot(fpr["micro"], tpr["micro"], 191 | label='micro-average ROC curve (area = {0:0.2f})' 192 | ''.format(roc_auc["micro"]), linewidth=3) 193 | 194 | plt.plot(fpr["macro"], tpr["macro"], 195 | label='macro-average ROC curve (area = {0:0.2f})' 196 | ''.format(roc_auc["macro"]), linewidth=3) 197 | 198 | for i, label in enumerate(class_labels): 199 | plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})' 200 | ''.format(label, roc_auc[i]), 201 | linewidth=2, linestyle=':') 202 | else: 203 | raise ValueError('Number of classes should be atleast 2 or more') 204 | 205 | plt.plot([0, 1], [0, 1], 'k--') 206 | plt.xlim([0.0, 1.0]) 207 | plt.ylim([0.0, 1.05]) 208 | plt.xlabel('False Positive Rate') 209 | plt.ylabel('True Positive Rate') 210 | plt.title('Receiver Operating Characteristic (ROC) Curve') 211 | plt.legend(loc="lower right") 212 | plt.show() 213 | 214 | 215 | -------------------------------------------------------------------------------- /notebooks/Ch05 - Unleash the Power of Transfer Learning/utils.py: -------------------------------------------------------------------------------- 1 | def log_progress(sequence, every=None, size=None, name='Items'): 2 | from ipywidgets import IntProgress, HTML, VBox 3 | from IPython.display import display 4 | 5 | is_iterator = False 6 | if size is None: 7 | try: 8 | size = len(sequence) 9 | except TypeError: 10 | is_iterator = True 11 | if size is not None: 12 | if every is None: 13 | if size <= 200: 14 | every = 1 15 | else: 16 | every = int(size / 200) # every 0.5% 17 | else: 18 | assert every is not None, 'sequence is iterator, set every' 19 | 20 | if is_iterator: 21 | progress = IntProgress(min=0, max=1, value=1) 22 | progress.bar_style = 'info' 23 | else: 24 | progress = IntProgress(min=0, max=size, value=0) 25 | label = HTML() 26 | box = VBox(children=[label, progress]) 27 | display(box) 28 | 29 | index = 0 30 | try: 31 | for index, record in enumerate(sequence, 1): 32 | if index == 1 or index % every == 0: 33 | if is_iterator: 34 | label.value = '{name}: {index} / ?'.format( 35 | name=name, 36 | index=index 37 | ) 38 | else: 39 | progress.value = index 40 | label.value = u'{name}: {index} / {size}'.format( 41 | 
name=name, 42 | index=index, 43 | size=size 44 | ) 45 | yield record 46 | except: 47 | progress.bar_style = 'danger' 48 | raise 49 | else: 50 | progress.bar_style = 'success' 51 | progress.value = index 52 | label.value = "{name}: {index}".format( 53 | name=name, 54 | index=str(index or '?') 55 | ) -------------------------------------------------------------------------------- /notebooks/Ch06 - Image Recognition and Classification/0fc12a365adfcbb603e298b10149632a.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/notebooks/Ch06 - Image Recognition and Classification/0fc12a365adfcbb603e298b10149632a.jpg -------------------------------------------------------------------------------- /notebooks/Ch06 - Image Recognition and Classification/cnn_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Fri Jun 22 17:23:21 2018 4 | 5 | @author: RAGHAV 6 | """ 7 | 8 | import numpy as np 9 | import matplotlib.pyplot as plt 10 | params = {'legend.fontsize': 'x-large', 11 | 'figure.figsize': (15, 5), 12 | 'axes.labelsize': 'x-large', 13 | 'axes.titlesize':'x-large', 14 | 'xtick.labelsize':'x-large', 15 | 'ytick.labelsize':'x-large'} 16 | 17 | plt.rcParams.update(params) 18 | 19 | 20 | def make_prediction(model=None,img_vector=[], 21 | label_dict={},top_N=3, 22 | model_input_shape=None): 23 | if model: 24 | # get model input shape 25 | if not model_input_shape: 26 | model_input_shape = (1,)+model.get_input_shape_at(0)[1:] 27 | 28 | # get prediction 29 | prediction = model.predict(img_vector.reshape(model_input_shape))[0] 30 | 31 | 32 | # get top N with confidence 33 | labels_predicted = [label_dict[idx] for idx in np.argsort(prediction)[::-1][:top_N]] 34 | confidence_predicted = np.sort(prediction)[::-1][:top_N] 35 | 36 | return labels_predicted, confidence_predicted 37 | 38 | 39 | def plot_predictions(model,dataset, 40 | dataset_labels,label_dict, 41 | batch_size,grid_height,grid_width): 42 | if model: 43 | f, ax = plt.subplots(grid_width, grid_height) 44 | f.set_size_inches(12, 12) 45 | 46 | random_batch_indx = np.random.permutation(np.arange(0,len(dataset)))[:batch_size] 47 | 48 | img_idx = 0 49 | for i in range(0, grid_width): 50 | for j in range(0, grid_height): 51 | actual_label = label_dict.get(dataset_labels[random_batch_indx[img_idx]].argmax()) 52 | preds,confs_ = make_prediction(model, 53 | img_vector=dataset[random_batch_indx[img_idx]], 54 | label_dict=label_dict, 55 | top_N=1) 56 | ax[i][j].axis('off') 57 | ax[i][j].set_title('Actual:'+actual_label[:10]+\ 58 | '\nPredicted:'+preds[0] + \ 59 | '(' +str(round(confs_[0],2)) + ')') 60 | ax[i][j].imshow(dataset[random_batch_indx[img_idx]]) 61 | img_idx += 1 62 | 63 | plt.subplots_adjust(left=0, bottom=0, right=1, 64 | top=1, wspace=0.4, hspace=0.55) 65 | 66 | 67 | # source: https://github.com/keras-team/keras/issues/431#issuecomment-317397154 68 | def get_activations(model, model_inputs, 69 | print_shape_only=True, layer_name=None): 70 | import keras.backend as K 71 | print('----- activations -----') 72 | activations = [] 73 | inp = model.input 74 | 75 | model_multi_inputs_cond = True 76 | if not isinstance(inp, list): 77 | # only one input! let's wrap it in a list. 
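    # (a multi-input Keras model exposes model.input as a list of tensors, while a
    #  single-input model exposes a single tensor, so we normalise it to a list here)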
78 | inp = [inp] 79 | model_multi_inputs_cond = False 80 | # all layer outputs 81 | outputs = [layer.output for layer in model.layers if 82 | layer.name == layer_name or layer_name is None] 83 | 84 | # evaluation functions 85 | funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs] 86 | 87 | if model_multi_inputs_cond: 88 | list_inputs = [] 89 | list_inputs.extend(model_inputs) 90 | list_inputs.append(1.) 91 | else: 92 | list_inputs = [model_inputs, 1.] 93 | 94 | # Learning phase. 1 = Test mode (no dropout or batch normalization) 95 | # layer_outputs = [func([model_inputs, 1.])[0] for func in funcs] 96 | layer_outputs = [func(list_inputs)[0] for func in funcs] 97 | for layer_activations in layer_outputs: 98 | activations.append(layer_activations) 99 | if print_shape_only: 100 | print(layer_activations.shape) 101 | else: 102 | print(layer_activations) 103 | return activations 104 | 105 | # source :https://github.com/philipperemy/keras-visualize-activations/blob/master/read_activations.py 106 | def display_activations(activation_maps): 107 | batch_size = activation_maps[0].shape[0] 108 | assert batch_size == 1, 'One image at a time to visualize.' 109 | for i, activation_map in enumerate(activation_maps): 110 | print('Displaying activation map {}'.format(i)) 111 | shape = activation_map.shape 112 | if len(shape) == 4: 113 | activations = np.hstack(np.transpose(activation_map[0], (2, 0, 1))) 114 | elif len(shape) == 2: 115 | # try to make it square as much as possible. we can skip some activations. 116 | activations = activation_map[0] 117 | num_activations = len(activations) 118 | # too hard to display it on the screen. 119 | if num_activations > 1024: 120 | square_param = int(np.floor(np.sqrt(num_activations))) 121 | activations = activations[0: square_param * square_param] 122 | activations = np.reshape(activations, (square_param, square_param)) 123 | else: 124 | activations = np.expand_dims(activations, axis=0) 125 | else: 126 | raise Exception('len(shape) = 3 has not been implemented.') 127 | #plt.imshow(activations, interpolation='None', cmap='binary') 128 | fig, ax = plt.subplots(figsize=(18, 12)) 129 | ax.imshow(activations, interpolation='None', cmap='binary') 130 | plt.show() -------------------------------------------------------------------------------- /notebooks/Ch06 - Image Recognition and Classification/dog_breed_transfer_learning_classifier.py: -------------------------------------------------------------------------------- 1 | 2 | # coding: utf-8 3 | 4 | # # Dog Breed Classifier 5 | # 6 | # This notebook leverages a pretrained InceptionV3 model (on ImageNet) to prepare a _Dog Breed Classifier_ 7 | # It showcases how __Transfer Learning__ can be utilized to prepare high performing models 8 | 9 | # In[1]: 10 | 11 | # Pandas and Numpy for data structures and util fucntions 12 | import re 13 | #import tqdm 14 | import itertools 15 | import numpy as np 16 | import pandas as pd 17 | from numpy.random import rand 18 | from datetime import datetime 19 | pd.options.display.max_colwidth = 600 20 | 21 | # Scikit Imports 22 | from sklearn import preprocessing 23 | from sklearn.metrics import roc_curve, auc, precision_recall_curve 24 | from sklearn.model_selection import train_test_split 25 | 26 | # Matplot Imports 27 | import matplotlib.pyplot as plt 28 | params = {'legend.fontsize': 'x-large', 29 | 'figure.figsize': (15, 5), 30 | 'axes.labelsize': 'x-large', 31 | 'axes.titlesize':'x-large', 32 | 'xtick.labelsize':'x-large', 33 | 'ytick.labelsize':'x-large'} 34 | 35 | 
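# shared matplotlib defaults (font sizes, figure size) applied to every plot in this script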
plt.rcParams.update(params) 36 | get_ipython().run_line_magic('matplotlib', 'inline') 37 | 38 | # pandas display data frames as tables 39 | from IPython.display import display, HTML 40 | 41 | import warnings 42 | warnings.filterwarnings('ignore') 43 | 44 | 45 | # In[2]: 46 | 47 | import os 48 | import math 49 | import pathlib 50 | import shutil 51 | 52 | from keras import regularizers 53 | from keras.models import Model 54 | from keras.optimizers import Adam 55 | from keras.layers import Dropout 56 | from keras.layers import Conv2D,MaxPooling2D, GlobalAveragePooling2D 57 | from keras.layers import BatchNormalization 58 | from keras.layers import Activation,Dense,Flatten 59 | from keras.models import Sequential,load_model 60 | from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img 61 | from keras.applications.inception_v3 import InceptionV3 62 | from keras.utils.np_utils import to_categorical 63 | 64 | 65 | # ## Load Dataset 66 | 67 | # In[3]: 68 | 69 | train_folder = 'train/' 70 | test_folder = 'test/' 71 | 72 | 73 | # In[4]: 74 | 75 | data_labels = pd.read_csv('labels/labels.csv') 76 | data_labels.head() 77 | 78 | 79 | # ## Check Number of Classes in the Dataset 80 | 81 | # In[5]: 82 | 83 | target_labels = data_labels['breed'] 84 | len(set(target_labels)) 85 | 86 | 87 | # ## Prepare Labels 88 | # Deep Learning models work with one hot encoded outputs or target variables 89 | 90 | # In[6]: 91 | 92 | labels_ohe_names = pd.get_dummies(target_labels, sparse=True) 93 | labels_ohe = np.asarray(labels_ohe_names) 94 | print(labels_ohe.shape) 95 | print(labels_ohe[:2]) 96 | 97 | 98 | # We add another column to the labels dataset to identify image path 99 | 100 | # In[7]: 101 | 102 | data_labels['image_path'] = data_labels.apply( lambda row: (train_folder + row["id"] + ".jpg" ), axis=1) 103 | data_labels.head() 104 | 105 | 106 | # ## Prepare Train-Test Datasets 107 | # We use a 70-30 split to prepare the two dataset 108 | 109 | # In[8]: 110 | 111 | train_data = np.array([img_to_array( 112 | load_img(img, 113 | target_size=(299, 299)) 114 | ) for img 115 | in data_labels['image_path'].values.tolist() 116 | ]).astype('float32') 117 | 118 | 119 | # In[9]: 120 | 121 | train_data.shape 122 | 123 | 124 | # In[10]: 125 | 126 | x_train, x_test, y_train, y_test = train_test_split(train_data, 127 | target_labels, 128 | test_size=0.3, 129 | stratify=np.array(target_labels), 130 | random_state=42) 131 | 132 | 133 | # In[11]: 134 | 135 | x_train.shape, x_test.shape 136 | 137 | 138 | # Prepare Validation Dataset 139 | 140 | # In[12]: 141 | 142 | x_train, x_val, y_train, y_val = train_test_split(x_train, 143 | y_train, 144 | test_size=0.15, 145 | stratify=np.array(y_train), 146 | random_state=42) 147 | 148 | 149 | # In[13]: 150 | 151 | x_train.shape, x_val.shape 152 | 153 | 154 | # Prepare target variables for train, test and validation datasets 155 | 156 | # In[14]: 157 | 158 | y_train_ohe = pd.get_dummies(y_train.reset_index(drop=True)).as_matrix() 159 | y_val_ohe = pd.get_dummies(y_val.reset_index(drop=True)).as_matrix() 160 | y_test_ohe = pd.get_dummies(y_test.reset_index(drop=True)).as_matrix() 161 | 162 | y_train_ohe.shape, y_test_ohe.shape, y_val_ohe.shape 163 | 164 | 165 | # ## Data Augmentation 166 | # 167 | # Since number of samples per class are not very high, we utilize data augmentation to prepare different variations of different samples available. 
We do this using the ```ImageDataGenerator utility``` from ```keras``` 168 | 169 | # In[15]: 170 | 171 | BATCH_SIZE = 32 172 | 173 | 174 | # In[16]: 175 | 176 | # Create train generator. 177 | train_datagen = ImageDataGenerator(rescale=1./255, 178 | rotation_range=30, 179 | width_shift_range=0.2, 180 | height_shift_range=0.2, 181 | horizontal_flip = 'true') 182 | train_generator = train_datagen.flow(x_train, y_train_ohe, shuffle=False, batch_size=BATCH_SIZE, seed=1) 183 | 184 | 185 | # In[17]: 186 | 187 | # Create validation generator 188 | val_datagen = ImageDataGenerator(rescale = 1./255) 189 | val_generator = train_datagen.flow(x_val, y_val_ohe, shuffle=False, batch_size=BATCH_SIZE, seed=1) 190 | 191 | 192 | # ## Prepare Deep Learning Classifier 193 | # 194 | # * Load InceptionV3 pretrained on ImageNet without its top/classification layer 195 | # * Add additional custom layers on top of InceptionV3 to prepare custom classifier 196 | 197 | # In[18]: 198 | 199 | # Get the InceptionV3 model so we can do transfer learning 200 | base_inception = InceptionV3(weights='imagenet', include_top = False, input_shape=(299, 299, 3)) 201 | 202 | 203 | # In[19]: 204 | 205 | # Add a global spatial average pooling layer 206 | out = base_inception.output 207 | out = GlobalAveragePooling2D()(out) 208 | out = Dense(512, activation='relu')(out) 209 | out = Dense(512, activation='relu')(out) 210 | total_classes = y_train_ohe.shape[1] 211 | predictions = Dense(total_classes, activation='softmax')(out) 212 | 213 | 214 | # * Stack the two models (InceptionV3 and custom layers) on top of each other 215 | # * Compile the model and view its summary 216 | 217 | # In[20]: 218 | 219 | model = Model(inputs=base_inception.input, outputs=predictions) 220 | 221 | # only if we want to freeze layers 222 | for layer in base_inception.layers: 223 | layer.trainable = False 224 | 225 | # Compile 226 | model.compile(Adam(lr=.0001), loss='categorical_crossentropy', metrics=['accuracy']) 227 | 228 | model.summary() 229 | 230 | 231 | # ## Model Training 232 | # We train the model with a Batch Size of 32 for just 15 Epochs. 233 | # 234 | # The model utilizes the power of transfer learning to achieve a validation accuracy of about __81%__ ! 
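# Optional fine-tuning sketch (assumptions: run only after the frozen-base training below has
# converged; `model`, `base_inception` and `Adam` are the objects defined/imported above; the
# layer count and learning rate are illustrative values, not tuned settings).
FINE_TUNE_TOP_BLOCKS = False  # flip to True only after the initial fit_generator run below
if FINE_TUNE_TOP_BLOCKS:
    # unfreeze only the top Inception blocks and retrain with a much smaller learning rate
    for layer in base_inception.layers[-30:]:
        layer.trainable = True
    model.compile(Adam(lr=1e-5), loss='categorical_crossentropy', metrics=['accuracy'])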
235 | 236 | # In[21]: 237 | 238 | # Train the model 239 | batch_size = BATCH_SIZE 240 | train_steps_per_epoch = x_train.shape[0] // batch_size 241 | val_steps_per_epoch = x_val.shape[0] // batch_size 242 | 243 | history = model.fit_generator(train_generator, 244 | steps_per_epoch=train_steps_per_epoch, 245 | validation_data=val_generator, 246 | validation_steps=val_steps_per_epoch, 247 | epochs=15, 248 | verbose=1) 249 | 250 | 251 | # Save the Model 252 | 253 | # In[22]: 254 | 255 | model.save('dog_breed.hdf5') 256 | 257 | 258 | # ## Visualize Model Performance 259 | 260 | # In[35]: 261 | 262 | f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5)) 263 | t = f.suptitle('Deep Neural Net Performance', fontsize=12) 264 | f.subplots_adjust(top=0.85, wspace=0.3) 265 | 266 | epochs = list(range(1,16)) 267 | ax1.plot(epochs, history.history['acc'], label='Train Accuracy') 268 | ax1.plot(epochs, history.history['val_acc'], label='Validation Accuracy') 269 | ax1.set_xticks(epochs) 270 | ax1.set_ylabel('Accuracy Value') 271 | ax1.set_xlabel('Epoch') 272 | ax1.set_title('Accuracy') 273 | l1 = ax1.legend(loc="best") 274 | 275 | ax2.plot(epochs, history.history['loss'], label='Train Loss') 276 | ax2.plot(epochs, history.history['val_loss'], label='Validation Loss') 277 | ax2.set_xticks(epochs) 278 | ax2.set_ylabel('Loss Value') 279 | ax2.set_xlabel('Epoch') 280 | ax2.set_title('Loss') 281 | l2 = ax2.legend(loc="best") 282 | 283 | 284 | # ## Test Model Performance 285 | # 286 | # Step 1 is to prepare the training dataset. Since we scaled training data, test data should also be scaled in a similar manner. 287 | # 288 | # _Note: Deep Learning models are very sensitive to scaling._ 289 | 290 | # In[67]: 291 | 292 | # scaling test features 293 | x_test /= 255. 294 | 295 | 296 | # In[69]: 297 | 298 | test_predictions = model.predict(x_test) 299 | test_predictions 300 | 301 | 302 | # In[70]: 303 | 304 | predictions = pd.DataFrame(test_predictions, columns=labels_ohe_names.columns) 305 | predictions.head() 306 | 307 | 308 | # In[71]: 309 | 310 | test_labels = list(y_test) 311 | predictions = list(predictions.idxmax(axis=1)) 312 | predictions[:10] 313 | 314 | 315 | # ## Analyze Test Performance 316 | 317 | # In[72]: 318 | 319 | import model_evaluation_utils as meu 320 | 321 | 322 | # In[73]: 323 | 324 | meu.get_metrics(true_labels=test_labels, 325 | predicted_labels=predictions) 326 | 327 | 328 | # In[74]: 329 | 330 | meu.display_classification_report(true_labels=test_labels, 331 | predicted_labels=predictions, 332 | classes=list(labels_ohe_names.columns)) 333 | 334 | 335 | # In[75]: 336 | 337 | meu.display_confusion_matrix_pretty(true_labels=test_labels, 338 | predicted_labels=predictions, 339 | classes=list(labels_ohe_names.columns)) 340 | 341 | 342 | # The model achieves a test accuracy of approximately __86%__ 343 | 344 | # ## Visualize Model Performance 345 | # Visualize model performance with actual images, labels and prediction confidence 346 | 347 | # In[112]: 348 | 349 | grid_width = 5 350 | grid_height = 5 351 | f, ax = plt.subplots(grid_width, grid_height) 352 | f.set_size_inches(15, 15) 353 | batch_size = 25 354 | dataset = x_test 355 | 356 | label_dict = dict(enumerate(labels_ohe_names.columns.values)) 357 | model_input_shape = (1,)+model.get_input_shape_at(0)[1:] 358 | random_batch_indx = np.random.permutation(np.arange(0,len(dataset)))[:batch_size] 359 | 360 | img_idx = 0 361 | for i in range(0, grid_width): 362 | for j in range(0, grid_height): 363 | actual_label = 
np.array(y_test)[random_batch_indx[img_idx]] 364 | prediction = model.predict(dataset[random_batch_indx[img_idx]].reshape(model_input_shape))[0] 365 | label_idx = np.argmax(prediction) 366 | predicted_label = label_dict.get(label_idx) 367 | conf = round(prediction[label_idx], 2) 368 | ax[i][j].axis('off') 369 | ax[i][j].set_title('Actual: '+actual_label+'\nPred: '+predicted_label + '\nConf: ' +str(conf)) 370 | ax[i][j].imshow(dataset[random_batch_indx[img_idx]]) 371 | img_idx += 1 372 | 373 | plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0.5, hspace=0.55) 374 | 375 | -------------------------------------------------------------------------------- /notebooks/Ch06 - Image Recognition and Classification/model_evaluation_utils.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Jul 31 20:05:23 2017 4 | 5 | @author: DIP 6 | @Copyright: Dipanjan Sarkar 7 | """ 8 | 9 | from sklearn import metrics 10 | import numpy as np 11 | import pandas as pd 12 | import matplotlib.pyplot as plt 13 | from sklearn.preprocessing import LabelEncoder 14 | from sklearn.base import clone 15 | from sklearn.preprocessing import label_binarize 16 | from scipy import interp 17 | from sklearn.metrics import roc_curve, auc 18 | 19 | 20 | def get_metrics(true_labels, predicted_labels): 21 | 22 | print('Accuracy:', np.round( 23 | metrics.accuracy_score(true_labels, 24 | predicted_labels), 25 | 4)) 26 | print('Precision:', np.round( 27 | metrics.precision_score(true_labels, 28 | predicted_labels, 29 | average='weighted'), 30 | 4)) 31 | print('Recall:', np.round( 32 | metrics.recall_score(true_labels, 33 | predicted_labels, 34 | average='weighted'), 35 | 4)) 36 | print('F1 Score:', np.round( 37 | metrics.f1_score(true_labels, 38 | predicted_labels, 39 | average='weighted'), 40 | 4)) 41 | 42 | 43 | def train_predict_model(classifier, 44 | train_features, train_labels, 45 | test_features, test_labels): 46 | # build model 47 | classifier.fit(train_features, train_labels) 48 | # predict using model 49 | predictions = classifier.predict(test_features) 50 | return predictions 51 | 52 | 53 | def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]): 54 | 55 | total_classes = len(classes) 56 | level_labels = [total_classes*[0], list(range(total_classes))] 57 | 58 | cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels, 59 | labels=classes) 60 | cm_frame = pd.DataFrame(data=cm, 61 | columns=pd.MultiIndex(levels=[['Predicted:'], classes], 62 | labels=level_labels), 63 | index=pd.MultiIndex(levels=[['Actual:'], classes], 64 | labels=level_labels)) 65 | print(cm_frame) 66 | 67 | def display_classification_report(true_labels, predicted_labels, classes=[1,0]): 68 | 69 | report = metrics.classification_report(y_true=true_labels, 70 | y_pred=predicted_labels, 71 | labels=classes) 72 | print(report) 73 | 74 | 75 | 76 | def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]): 77 | print('Model Performance metrics:') 78 | print('-'*30) 79 | get_metrics(true_labels=true_labels, predicted_labels=predicted_labels) 80 | print('\nModel Classification report:') 81 | print('-'*30) 82 | display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels, 83 | classes=classes) 84 | print('\nPrediction Confusion Matrix:') 85 | print('-'*30) 86 | display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels, 87 | classes=classes) 88 | 89 | 90 | def 
plot_model_decision_surface(clf, train_features, train_labels, 91 | plot_step=0.02, cmap=plt.cm.RdYlBu, 92 | markers=None, alphas=None, colors=None): 93 | 94 | if train_features.shape[1] != 2: 95 | raise ValueError("X_train should have exactly 2 columnns!") 96 | 97 | x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step 98 | y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step 99 | xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step), 100 | np.arange(y_min, y_max, plot_step)) 101 | 102 | clf_est = clone(clf) 103 | clf_est.fit(train_features,train_labels) 104 | if hasattr(clf_est, 'predict_proba'): 105 | Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1] 106 | else: 107 | Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()]) 108 | Z = Z.reshape(xx.shape) 109 | cs = plt.contourf(xx, yy, Z, cmap=cmap) 110 | 111 | le = LabelEncoder() 112 | y_enc = le.fit_transform(train_labels) 113 | n_classes = len(le.classes_) 114 | plot_colors = ''.join(colors) if colors else [None] * n_classes 115 | label_names = le.classes_ 116 | markers = markers if markers else [None] * n_classes 117 | alphas = alphas if alphas else [None] * n_classes 118 | for i, color in zip(range(n_classes), plot_colors): 119 | idx = np.where(y_enc == i) 120 | plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color, 121 | label=label_names[i], cmap=cmap, edgecolors='black', 122 | marker=markers[i], alpha=alphas[i]) 123 | plt.legend() 124 | plt.show() 125 | 126 | 127 | def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None): 128 | 129 | ## Compute ROC curve and ROC area for each class 130 | fpr = dict() 131 | tpr = dict() 132 | roc_auc = dict() 133 | if hasattr(clf, 'classes_'): 134 | class_labels = clf.classes_ 135 | elif label_encoder: 136 | class_labels = label_encoder.classes_ 137 | elif class_names: 138 | class_labels = class_names 139 | else: 140 | raise ValueError('Unable to derive prediction classes, please specify class_names!') 141 | n_classes = len(class_labels) 142 | y_test = label_binarize(true_labels, classes=class_labels) 143 | if n_classes == 2: 144 | if hasattr(clf, 'predict_proba'): 145 | prob = clf.predict_proba(features) 146 | y_score = prob[:, prob.shape[1]-1] 147 | elif hasattr(clf, 'decision_function'): 148 | prob = clf.decision_function(features) 149 | y_score = prob[:, prob.shape[1]-1] 150 | else: 151 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!") 152 | 153 | fpr, tpr, _ = roc_curve(y_test, y_score) 154 | roc_auc = auc(fpr, tpr) 155 | plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})' 156 | ''.format(roc_auc), 157 | linewidth=2.5) 158 | 159 | elif n_classes > 2: 160 | if hasattr(clf, 'predict_proba'): 161 | y_score = clf.predict_proba(features) 162 | elif hasattr(clf, 'decision_function'): 163 | y_score = clf.decision_function(features) 164 | else: 165 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!") 166 | 167 | for i in range(n_classes): 168 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i]) 169 | roc_auc[i] = auc(fpr[i], tpr[i]) 170 | 171 | ## Compute micro-average ROC curve and ROC area 172 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel()) 173 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"]) 174 | 175 | ## Compute macro-average ROC curve and ROC area 176 | # First aggregate all false positive rates 177 | all_fpr = 
np.unique(np.concatenate([fpr[i] for i in range(n_classes)])) 178 | # Then interpolate all ROC curves at this points 179 | mean_tpr = np.zeros_like(all_fpr) 180 | for i in range(n_classes): 181 | mean_tpr += interp(all_fpr, fpr[i], tpr[i]) 182 | # Finally average it and compute AUC 183 | mean_tpr /= n_classes 184 | fpr["macro"] = all_fpr 185 | tpr["macro"] = mean_tpr 186 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"]) 187 | 188 | ## Plot ROC curves 189 | plt.figure(figsize=(6, 4)) 190 | plt.plot(fpr["micro"], tpr["micro"], 191 | label='micro-average ROC curve (area = {0:0.2f})' 192 | ''.format(roc_auc["micro"]), linewidth=3) 193 | 194 | plt.plot(fpr["macro"], tpr["macro"], 195 | label='macro-average ROC curve (area = {0:0.2f})' 196 | ''.format(roc_auc["macro"]), linewidth=3) 197 | 198 | for i, label in enumerate(class_labels): 199 | plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})' 200 | ''.format(label, roc_auc[i]), 201 | linewidth=2, linestyle=':') 202 | else: 203 | raise ValueError('Number of classes should be atleast 2 or more') 204 | 205 | plt.plot([0, 1], [0, 1], 'k--') 206 | plt.xlim([0.0, 1.0]) 207 | plt.ylim([0.0, 1.05]) 208 | plt.xlabel('False Positive Rate') 209 | plt.ylabel('True Positive Rate') 210 | plt.title('Receiver Operating Characteristic (ROC) Curve') 211 | plt.legend(loc="lower right") 212 | plt.show() 213 | 214 | 215 | -------------------------------------------------------------------------------- /notebooks/Ch07 - Text Document Categorization/20_newsgrp_cnn_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import config\n", 12 | "from dataloader.loader import Loader\n", 13 | "from preprocessing.utils import Preprocess, remove_empty_docs\n", 14 | "from dataloader.embeddings import GloVe\n", 15 | "from model.cnn_document_model import DocumentModel, TrainingParameters\n", 16 | "from keras.callbacks import ModelCheckpoint, EarlyStopping\n", 17 | "import numpy as np\n", 18 | "from keras.utils import to_categorical\n", 19 | "import keras.backend as K\n", 20 | "\n", 21 | "\n", 22 | "from sklearn.manifold import TSNE\n" 23 | ] 24 | }, 25 | { 26 | "cell_type": "markdown", 27 | "metadata": {}, 28 | "source": [ 29 | "## Load Data Sets for 20 News Group" 30 | ] 31 | }, 32 | { 33 | "cell_type": "code", 34 | "execution_count": null, 35 | "metadata": { 36 | "collapsed": true 37 | }, 38 | "outputs": [], 39 | "source": [ 40 | "dataset = Loader.load_20newsgroup_data(subset='train')\n", 41 | "corpus, labels = dataset.data, dataset.target\n", 42 | "corpus, labels = remove_empty_docs(corpus, labels)\n", 43 | "\n", 44 | "\n", 45 | "test_dataset = Loader.load_20newsgroup_data(subset='test')\n", 46 | "test_corpus, test_labels = test_dataset.data, test_dataset.target\n", 47 | "test_corpus, test_labels = remove_empty_docs(test_corpus, test_labels)" 48 | ] 49 | }, 50 | { 51 | "cell_type": "markdown", 52 | "metadata": {}, 53 | "source": [ 54 | "## Mapping 20 Groups to 6 High level Categories " 55 | ] 56 | }, 57 | { 58 | "cell_type": "code", 59 | "execution_count": null, 60 | "metadata": { 61 | "collapsed": true 62 | }, 63 | "outputs": [], 64 | "source": [ 65 | "six_groups = {\n", 66 | " 'comp.graphics':0,'comp.os.ms-windows.misc':0,'comp.sys.ibm.pc.hardware':0,\n", 67 | " 'comp.sys.mac.hardware':0, 'comp.windows.x':0,\n", 68 | " \n", 69 | " 
'rec.autos':1, 'rec.motorcycles':1, 'rec.sport.baseball':1, 'rec.sport.hockey':1,\n", 70 | " \n", 71 | " 'sci.crypt':2, 'sci.electronics':2,'sci.med':2, 'sci.space':2,\n", 72 | " \n", 73 | " 'misc.forsale':3,\n", 74 | " \n", 75 | " 'talk.politics.misc':4, 'talk.politics.guns':4, 'talk.politics.mideast':4,\n", 76 | " \n", 77 | " 'talk.religion.misc':5, 'alt.atheism':5, 'soc.religion.christian':5\n", 78 | " \n", 79 | "}" 80 | ] 81 | }, 82 | { 83 | "cell_type": "code", 84 | "execution_count": null, 85 | "metadata": { 86 | "collapsed": true 87 | }, 88 | "outputs": [], 89 | "source": [ 90 | "map_20_2_6 = [six_groups[dataset.target_names[i]] for i in range(20)]\n", 91 | "labels = [six_groups[dataset.target_names[i]] for i in labels] \n", 92 | "test_labels = [six_groups[dataset.target_names[i]] for i in test_labels] " 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Pre-process Text to convert it to word index sequences" 100 | ] 101 | }, 102 | { 103 | "cell_type": "code", 104 | "execution_count": null, 105 | "metadata": { 106 | "collapsed": false 107 | }, 108 | "outputs": [], 109 | "source": [ 110 | "Preprocess.MIN_WD_COUNT=5\n", 111 | "preprocessor = Preprocess(corpus=corpus)\n", 112 | "corpus_to_seq = preprocessor.fit()" 113 | ] 114 | }, 115 | { 116 | "cell_type": "code", 117 | "execution_count": null, 118 | "metadata": { 119 | "collapsed": false 120 | }, 121 | "outputs": [], 122 | "source": [ 123 | "test_corpus_to_seq = preprocessor.transform(test_corpus)" 124 | ] 125 | }, 126 | { 127 | "cell_type": "markdown", 128 | "metadata": {}, 129 | "source": [ 130 | "## Initialize Embeddings" 131 | ] 132 | }, 133 | { 134 | "cell_type": "code", 135 | "execution_count": null, 136 | "metadata": { 137 | "collapsed": false 138 | }, 139 | "outputs": [], 140 | "source": [ 141 | "glove=GloVe(50)\n", 142 | "initial_embeddings = glove.get_embedding(preprocessor.word_index)" 143 | ] 144 | }, 145 | { 146 | "cell_type": "markdown", 147 | "metadata": {}, 148 | "source": [ 149 | "## Build Model" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": { 156 | "collapsed": false 157 | }, 158 | "outputs": [], 159 | "source": [ 160 | "newsgrp_model = DocumentModel(vocab_size=preprocessor.get_vocab_size(),\n", 161 | " sent_k_maxpool = 5,\n", 162 | " sent_filters = 20,\n", 163 | " word_kernel_size = 5,\n", 164 | " word_index = preprocessor.word_index,\n", 165 | " num_sentences=Preprocess.NUM_SENTENCES, \n", 166 | " embedding_weights=initial_embeddings,\n", 167 | " conv_activation = 'relu',\n", 168 | " train_embedding = True,\n", 169 | " learn_word_conv = True,\n", 170 | " learn_sent_conv = True,\n", 171 | " sent_dropout = 0.4,\n", 172 | " hidden_dims=64, \n", 173 | " input_dropout=0.2, \n", 174 | " hidden_gaussian_noise_sd=0.5,\n", 175 | " final_layer_kernel_regularizer=0.1,\n", 176 | " num_hidden_layers=2,\n", 177 | " num_units_final_layer=6)\n" 178 | ] 179 | }, 180 | { 181 | "cell_type": "markdown", 182 | "metadata": {}, 183 | "source": [ 184 | "## Save model parameters" 185 | ] 186 | }, 187 | { 188 | "cell_type": "code", 189 | "execution_count": null, 190 | "metadata": { 191 | "collapsed": true 192 | }, 193 | "outputs": [], 194 | "source": [ 195 | "train_params = TrainingParameters('6_newsgrp_largeclass', \n", 196 | " model_file_path = config.MODEL_DIR+ '/20newsgroup/model_6_01.hdf5',\n", 197 | " model_hyper_parameters = config.MODEL_DIR+ '/20newsgroup/model_6_01.json',\n", 198 | " model_train_parameters = config.MODEL_DIR+ 
'/20newsgroup/model_6_01_meta.json',\n", 199 | " num_epochs=20,\n", 200 | " batch_size = 128,\n", 201 | " validation_split=.10,\n", 202 | " learning_rate=0.01)\n", 203 | "\n", 204 | "train_params.save()\n", 205 | "newsgrp_model._save_model(train_params.model_hyper_parameters)" 206 | ] 207 | }, 208 | { 209 | "cell_type": "markdown", 210 | "metadata": {}, 211 | "source": [ 212 | "## Compile and run model" 213 | ] 214 | }, 215 | { 216 | "cell_type": "code", 217 | "execution_count": null, 218 | "metadata": { 219 | "collapsed": true 220 | }, 221 | "outputs": [], 222 | "source": [ 223 | "newsgrp_model._model.compile(loss=\"categorical_crossentropy\", \n", 224 | " optimizer=train_params.optimizer,\n", 225 | " metrics=[\"accuracy\"])\n", 226 | "checkpointer = ModelCheckpoint(filepath=train_params.model_file_path,\n", 227 | " verbose=1,\n", 228 | " save_best_only=True,\n", 229 | " save_weights_only=True)\n", 230 | "\n", 231 | "early_stop = EarlyStopping(patience=2)" 232 | ] 233 | }, 234 | { 235 | "cell_type": "code", 236 | "execution_count": null, 237 | "metadata": { 238 | "collapsed": true 239 | }, 240 | "outputs": [], 241 | "source": [ 242 | "\n", 243 | "x_train = np.array(corpus_to_seq)\n", 244 | "y_train = to_categorical(np.array(labels))\n", 245 | "\n", 246 | "x_test = np.array(test_corpus_to_seq)\n", 247 | "y_test = to_categorical(np.array(test_labels))\n" 248 | ] 249 | }, 250 | { 251 | "cell_type": "code", 252 | "execution_count": null, 253 | "metadata": { 254 | "collapsed": false 255 | }, 256 | "outputs": [], 257 | "source": [ 258 | "#Set LR\n", 259 | "K.set_value(newsgrp_model.get_classification_model().optimizer.lr, train_params.learning_rate)\n", 260 | "\n", 261 | "newsgrp_model.get_classification_model().fit(x_train, y_train, \n", 262 | " batch_size=train_params.batch_size, \n", 263 | " epochs=train_params.num_epochs,\n", 264 | " verbose=2,\n", 265 | " validation_split=train_params.validation_split,\n", 266 | " callbacks=[checkpointer,early_stop])\n", 267 | "\n", 268 | "newsgrp_model.get_classification_model().evaluate( x_test, y_test, verbose=2)\n", 269 | "preds = newsgrp_model.get_classification_model().predict(x_test)\n", 270 | "preds_test = np.argmax(preds, axis=1)\n" 271 | ] 272 | }, 273 | { 274 | "cell_type": "markdown", 275 | "metadata": {}, 276 | "source": [ 277 | "## Evaluate Model Accuracy" 278 | ] 279 | }, 280 | { 281 | "cell_type": "code", 282 | "execution_count": null, 283 | "metadata": { 284 | "collapsed": false 285 | }, 286 | "outputs": [], 287 | "source": [ 288 | "from sklearn.metrics import classification_report,accuracy_score,confusion_matrix\n", 289 | "print(classification_report(test_labels, preds_test))\n", 290 | "print(confusion_matrix(test_labels, preds_test))\n", 291 | "print(accuracy_score(test_labels, preds_test))" 292 | ] 293 | }, 294 | { 295 | "cell_type": "markdown", 296 | "metadata": {}, 297 | "source": [ 298 | "## Visualization: Document Embeddings with tsne - what the model learned" 299 | ] 300 | }, 301 | { 302 | "cell_type": "code", 303 | "execution_count": null, 304 | "metadata": { 305 | "collapsed": false 306 | }, 307 | "outputs": [], 308 | "source": [ 309 | "from utils import scatter_plot\n", 310 | "doc_embeddings = newsgrp_model.get_document_model().predict(x_test)\n", 311 | "print(doc_embeddings.shape)" 312 | ] 313 | }, 314 | { 315 | "cell_type": "code", 316 | "execution_count": null, 317 | "metadata": { 318 | "collapsed": false 319 | }, 320 | "outputs": [], 321 | "source": [ 322 | "doc_proj = TSNE(n_components=2, random_state=42, 
).fit_transform(doc_embeddings)" 323 | ] 324 | }, 325 | { 326 | "cell_type": "code", 327 | "execution_count": null, 328 | "metadata": { 329 | "collapsed": false 330 | }, 331 | "outputs": [], 332 | "source": [ 333 | "f, ax, sc, txts = scatter_plot(doc_proj, np.array(test_labels))" 334 | ] 335 | }, 336 | { 337 | "cell_type": "code", 338 | "execution_count": null, 339 | "metadata": { 340 | "collapsed": true 341 | }, 342 | "outputs": [], 343 | "source": [ 344 | "f.savefig('nws_grp_embd.png')" 345 | ] 346 | }, 347 | { 348 | "cell_type": "code", 349 | "execution_count": null, 350 | "metadata": { 351 | "collapsed": true 352 | }, 353 | "outputs": [], 354 | "source": [] 355 | } 356 | ], 357 | "metadata": { 358 | "kernelspec": { 359 | "display_name": "Python 3", 360 | "language": "python", 361 | "name": "python3" 362 | }, 363 | "language_info": { 364 | "codemirror_mode": { 365 | "name": "ipython", 366 | "version": 3 367 | }, 368 | "file_extension": ".py", 369 | "mimetype": "text/x-python", 370 | "name": "python", 371 | "nbconvert_exporter": "python", 372 | "pygments_lexer": "ipython3", 373 | "version": "3.6.0" 374 | } 375 | }, 376 | "nbformat": 4, 377 | "nbformat_minor": 2 378 | } 379 | -------------------------------------------------------------------------------- /notebooks/Ch07 - Text Document Categorization/Text_Summarization_IMDB.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": null, 6 | "metadata": { 7 | "collapsed": false 8 | }, 9 | "outputs": [], 10 | "source": [ 11 | "import config\n", 12 | "from model.cnn_document_model import DocumentModel\n", 13 | "from preprocessing.utils import Preprocess, remove_empty_docs\n", 14 | "import numpy as np\n", 15 | "import pandas as pd\n", 16 | "from nltk.tokenize import sent_tokenize \n", 17 | "\n", 18 | "import keras.backend as K" 19 | ] 20 | }, 21 | { 22 | "cell_type": "markdown", 23 | "metadata": {}, 24 | "source": [ 25 | "## Load pre-trained IMDB model and data" 26 | ] 27 | }, 28 | { 29 | "cell_type": "code", 30 | "execution_count": null, 31 | "metadata": { 32 | "collapsed": false 33 | }, 34 | "outputs": [], 35 | "source": [ 36 | "imdb_model = DocumentModel.load_model(config.MODEL_DIR+ '/imdb/model_02.json')\n", 37 | "imdb_model.load_model_weights(config.MODEL_DIR+ '/imdb/model_02.hdf5')\n", 38 | "\n", 39 | "model = imdb_model.get_classification_model()\n", 40 | "model.compile(loss=\"binary_crossentropy\", optimizer='rmsprop', metrics=[\"accuracy\"])" 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": null, 46 | "metadata": { 47 | "collapsed": true 48 | }, 49 | "outputs": [], 50 | "source": [ 51 | "train_df = Loader.load_imdb_data(directory = 'train')\n", 52 | "print(train_df.shape)\n", 53 | "\n", 54 | "corpus = train_df['review'].tolist()\n", 55 | "target = train_df['sentiment'].tolist()\n", 56 | "corpus, target = remove_empty_docs(corpus, target)\n", 57 | "print(len(corpus))\n" 58 | ] 59 | }, 60 | { 61 | "cell_type": "markdown", 62 | "metadata": {}, 63 | "source": [ 64 | "## Pre process input and compute document embeddings" 65 | ] 66 | }, 67 | { 68 | "cell_type": "code", 69 | "execution_count": null, 70 | "metadata": { 71 | "collapsed": false 72 | }, 73 | "outputs": [], 74 | "source": [ 75 | "preprocessor = Preprocess(corpus=corpus)\n", 76 | "corpus_to_seq = preprocessor.fit()\n", 77 | "\n", 78 | "corpus = train_df['review'].tolist()\n", 79 | "target = train_df['sentiment'].tolist()\n", 80 | "corpus_to_seq = 
preprocessor.transform(corpus)\n", 81 | "\n", 82 | "x_train = np.array(corpus_to_seq)\n", 83 | "y_train = np.array(target)\n", 84 | "\n", 85 | "print(x_train.shape, y_train.shape)" 86 | ] 87 | }, 88 | { 89 | "cell_type": "code", 90 | "execution_count": null, 91 | "metadata": { 92 | "collapsed": false 93 | }, 94 | "outputs": [], 95 | "source": [ 96 | "print('Evaluating Model ...')\n", 97 | "print(model.evaluate(x_train, y_train))\n", 98 | "\n", 99 | "preds = model.predict(x_train)\n", 100 | "\n", 101 | "#invert predicted label\n", 102 | "pseudo_label = np.subtract(1,preds)" 103 | ] 104 | }, 105 | { 106 | "cell_type": "markdown", 107 | "metadata": {}, 108 | "source": [ 109 | "## Gradient Calculation of inverted output w.r.t sentence embeddings" 110 | ] 111 | }, 112 | { 113 | "cell_type": "code", 114 | "execution_count": null, 115 | "metadata": { 116 | "collapsed": false 117 | }, 118 | "outputs": [], 119 | "source": [ 120 | "#Get the learned sentence embeddings\n", 121 | "sentence_ebd = imdb_model.get_sentence_model().predict(x_train)\n", 122 | "\n", 123 | "input_tensors = [model.inputs[0], # input data\n", 124 | " model.sample_weights[0], # how much to weight each sample by\n", 125 | " model.targets[0], # labels \n", 126 | "]\n", 127 | "#variable tensor at the sentence embeding layer\n", 128 | "weights = imdb_model.get_sentence_model().outputs\n", 129 | "\n", 130 | "#calculate gradient of the total model loss w.r.t \n", 131 | "#the variables at sentence embd layer \n", 132 | "gradients = model.optimizer.get_gradients(model.total_loss, weights) \n", 133 | "get_gradients = K.function(inputs=input_tensors, outputs=gradients)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "code", 138 | "execution_count": null, 139 | "metadata": { 140 | "collapsed": false 141 | }, 142 | "outputs": [], 143 | "source": [ 144 | "document_number = 2\n", 145 | "K.set_learning_phase(0)\n", 146 | "inputs = [[x_train[document_number]], # X\n", 147 | " [1], # sample weights\n", 148 | " [[pseudo_label[document_number][0]]], # y\n", 149 | "]\n", 150 | "grad = get_gradients(inputs)\n", 151 | "\n", 152 | "sent_score = []\n", 153 | "for i in range(Preprocess.NUM_SENTENCES):\n", 154 | " #sent_score.append((i, -np.abs(np.dot(grad[0][0][i],sentence_ebd[document_number][i])))) #DECREASING\n", 155 | " sent_score.append((i, -np.linalg.norm(grad[0][0][i])))\n", 156 | "\n", 157 | "sent_score.sort(key=lambda tup: tup[1])\n", 158 | "summary_sentences = [ i for i, s in sent_score[:4]]\n", 159 | "\n", 160 | "doc = corpus[document_number]\n", 161 | "label = y_train[document_number]\n", 162 | "prediction = preds[document_number]\n", 163 | "print(doc, label , prediction)\n", 164 | "\n", 165 | "sentences = sent_tokenize(doc)\n", 166 | "for i in summary_sentences:\n", 167 | " print(i, sentences[i])\n", 168 | " \n" 169 | ] 170 | }, 171 | { 172 | "cell_type": "code", 173 | "execution_count": null, 174 | "metadata": { 175 | "collapsed": true 176 | }, 177 | "outputs": [], 178 | "source": [] 179 | }, 180 | { 181 | "cell_type": "code", 182 | "execution_count": null, 183 | "metadata": { 184 | "collapsed": true 185 | }, 186 | "outputs": [], 187 | "source": [] 188 | } 189 | ], 190 | "metadata": { 191 | "kernelspec": { 192 | "display_name": "Python 3", 193 | "language": "python", 194 | "name": "python3" 195 | }, 196 | "language_info": { 197 | "codemirror_mode": { 198 | "name": "ipython", 199 | "version": 3 200 | }, 201 | "file_extension": ".py", 202 | "mimetype": "text/x-python", 203 | "name": "python", 204 | "nbconvert_exporter": "python", 205 | 
"pygments_lexer": "ipython3", 206 | "version": "3.6.0" 207 | } 208 | }, 209 | "nbformat": 4, 210 | "nbformat_minor": 2 211 | } 212 | -------------------------------------------------------------------------------- /notebooks/Ch07 - Text Document Categorization/amazon_review_model.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Wed Apr 11 15:34:14 2018 4 | 5 | @author: tghosh 6 | """ 7 | 8 | import config 9 | from dataloader.loader import Loader 10 | from preprocessing.utils import Preprocess, remove_empty_docs 11 | from dataloader.embeddings import GloVe 12 | from model.cnn_document_model import DocumentModel, TrainingParameters 13 | from keras.callbacks import ModelCheckpoint, EarlyStopping 14 | import numpy as np 15 | 16 | 17 | train_df = Loader.load_amazon_reviews('train') 18 | print(train_df.shape) 19 | 20 | test_df = Loader.load_amazon_reviews('test') 21 | print(test_df.shape) 22 | 23 | dataset = train_df.sample(n=200000, random_state=42) 24 | dataset.sentiment.value_counts() 25 | 26 | 27 | corpus = dataset['review'].values 28 | target = dataset['sentiment'].values 29 | print(corpus.shape, target.shape) 30 | 31 | corpus, target = remove_empty_docs(corpus, target) 32 | print(len(corpus)) 33 | 34 | preprocessor = Preprocess(corpus=corpus) 35 | corpus_to_seq = preprocessor.fit() 36 | 37 | holdout_corpus = test_df['review'].values 38 | holdout_target = test_df['sentiment'].values 39 | print(holdout_corpus.shape, holdout_target.shape) 40 | 41 | holdout_corpus, holdout_target = remove_empty_docs(holdout_corpus, holdout_target) 42 | print(len(holdout_corpus)) 43 | holdout_corpus_to_seq = preprocessor.transform(holdout_corpus) 44 | 45 | glove=GloVe(50) 46 | initial_embeddings = glove.get_embedding(preprocessor.word_index) 47 | 48 | amazon_review_model = DocumentModel(vocab_size=preprocessor.get_vocab_size(), 49 | word_index = preprocessor.word_index, 50 | num_sentences=Preprocess.NUM_SENTENCES, 51 | embedding_weights=initial_embeddings, 52 | conv_activation = 'tanh', 53 | hidden_dims=64, 54 | input_dropout=0.40, 55 | hidden_gaussian_noise_sd=0.5) 56 | 57 | train_params = TrainingParameters('model_with_tanh_activation', 58 | model_file_path = config.MODEL_DIR+ '/amazonreviews/model_06.hdf5', 59 | model_hyper_parameters = config.MODEL_DIR+ '/amazonreviews/model_06.json', 60 | model_train_parameters = config.MODEL_DIR+ '/amazonreviews/model_06_meta.json', 61 | num_epochs=35) 62 | 63 | train_params.save() 64 | 65 | amazon_review_model._model.compile(loss="binary_crossentropy", 66 | optimizer=train_params.optimizer, 67 | metrics=["accuracy"]) 68 | checkpointer = ModelCheckpoint(filepath=train_params.model_file_path, 69 | verbose=1, 70 | save_best_only=True, 71 | save_weights_only=True) 72 | 73 | early_stop = EarlyStopping(patience=2) 74 | 75 | x_train = np.array(corpus_to_seq) 76 | y_train = np.array(target) 77 | 78 | x_test = np.array(holdout_corpus_to_seq) 79 | y_test = np.array(holdout_target) 80 | 81 | print(x_train.shape, y_train.shape) 82 | 83 | amazon_review_model.get_classification_model().fit(x_train, y_train, 84 | batch_size=train_params.batch_size, 85 | epochs=train_params.num_epochs, 86 | verbose=2, 87 | validation_split=train_params.validation_split, 88 | callbacks=[checkpointer]) 89 | 90 | amazon_review_model.get_classification_model().evaluate( x_test, y_test, train_params.batch_size*10, verbose=2) 91 | 92 | amazon_review_model._save_model(train_params.model_hyper_parameters) 93 | 94 | 95 | 96 | 
''' Which embeddings changes most ''' 97 | 98 | learned_embeddings = amazon_review_model.get_classification_model().get_layer('imdb_embedding').get_weights()[0] 99 | 100 | embd_change = {} 101 | for word, i in preprocessor.word_index.items(): 102 | embd_change[word] = np.linalg.norm(initial_embeddings[i]-learned_embeddings[i]) 103 | embd_change = sorted(embd_change.items(), key=lambda x: x[1], reverse=True) 104 | embd_change[0:20] 105 | 106 | -------------------------------------------------------------------------------- /notebooks/Ch07 - Text Document Categorization/config.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 9 11:06:03 2018 4 | 5 | @author: tghosh 6 | """ 7 | 8 | import pandas as pd 9 | import numpy as np 10 | import os 11 | 12 | TEXT_DATA_DIR = 'PATH/TO/DATA_ROOT' 13 | 14 | #Dataset from http://ai.stanford.edu/~amaas/data/sentiment/ 15 | IMDB_DATA = TEXT_DATA_DIR + 'aclImdb' 16 | IMDB_DATA_CSV = TEXT_DATA_DIR + 'imdb_csv' 17 | 18 | PROCESSED_20_NEWS_GRP = TEXT_DATA_DIR + '20newsgrp' 19 | 20 | AMAZON_TRAIN_DATA = TEXT_DATA_DIR+'amazonreviews/train.ft' 21 | AMAZON_TEST_DATA = TEXT_DATA_DIR+'amazonreviews/test.ft' 22 | 23 | GLOVE_DIR = TEXT_DATA_DIR+ 'glove.6B' 24 | WORD2VEC_DIR = TEXT_DATA_DIR+ 'word2vec' 25 | 26 | MODEL_DIR = 'PATH/TO/MODELDIR' 27 | -------------------------------------------------------------------------------- /notebooks/Ch07 - Text Document Categorization/dataloader/__init__.py: -------------------------------------------------------------------------------- 1 | # -------------------------------------------------------------------------------- /notebooks/Ch07 - Text Document Categorization/dataloader/embeddings.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 9 11:45:59 2018 4 | 5 | @author: tghosh 6 | """ 7 | import config 8 | import numpy as np 9 | import os 10 | 11 | class GloVe: 12 | 13 | def __init__(self, embd_dim=50): 14 | if embd_dim not in [50, 100, 200, 300]: 15 | raise ValueError('embedding dim should be one of [50, 100, 200, 300]') 16 | self.EMBEDDING_DIM = embd_dim 17 | self.embedding_matrix = None 18 | 19 | def _load(self): 20 | print('Reading {} dim GloVe vectors'.format(self.EMBEDDING_DIM)) 21 | self.embeddings_index = {} 22 | with open(os.path.join(config.GLOVE_DIR, 'glove.6B.'+str(self.EMBEDDING_DIM)+'d.txt'),encoding="utf8") as fin: 23 | for line in fin: 24 | try: 25 | values = line.split() 26 | coefs = np.asarray(values[1:], dtype='float32') 27 | word = values[0] 28 | self.embeddings_index[word] = coefs 29 | except: 30 | print(line) 31 | 32 | print('Found %s word vectors.' % len(self.embeddings_index)) 33 | 34 | def _init_embedding_matrix(self, word_index_dict, oov_words_file='OOV-Words.txt'): 35 | self.embedding_matrix = np.zeros((len(word_index_dict)+2 , self.EMBEDDING_DIM)) # +1 for the 0 word index from paddings. 36 | not_found_words=0 37 | missing_word_index = [] 38 | 39 | with open(oov_words_file, 'w') as f: 40 | for word, i in word_index_dict.items(): 41 | embedding_vector = self.embeddings_index.get(word) 42 | if embedding_vector is not None: 43 | # words not found in embedding index will be all-zeros. 
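                    # found words: copy the pretrained GloVe vector into the matrix; rows for
                    # missing words stay zero here and are re-initialised further down as the
                    # mean embedding (row 1) plus small random noise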
44 | self.embedding_matrix[i] = embedding_vector 45 | else: 46 | not_found_words+=1 47 | f.write(word + ','+str(i)+'\n') 48 | missing_word_index.append(i) 49 | 50 | #oov by average vector: 51 | self.embedding_matrix[1] = np.mean(self.embedding_matrix, axis=0) 52 | for indx in missing_word_index: 53 | self.embedding_matrix[indx] = np.random.rand(self.EMBEDDING_DIM)+ self.embedding_matrix[1] 54 | print("words not found in embeddings: {}".format(not_found_words)) 55 | 56 | 57 | def get_embedding(self, word_index_dict): 58 | if self.embedding_matrix is None: 59 | self._load() 60 | self._init_embedding_matrix(word_index_dict) 61 | return self.embedding_matrix 62 | 63 | def update_embeddings(self, word_index_dict, other_embedding, other_word_index): 64 | num_updated = 0 65 | for word, i in other_word_index.items(): 66 | if word_index_dict.get(word) is not None: 67 | embedding_vector = other_embedding[i] 68 | this_vocab_word_indx = word_index_dict.get(word) 69 | #print("BEFORE", self.embedding_matrix[this_vocab_word_indx]) 70 | self.embedding_matrix[this_vocab_word_indx] = embedding_vector 71 | #print("AFTER", self.embedding_matrix[this_vocab_word_indx]) 72 | num_updated+=1 73 | 74 | print('{} words are updated out of {}'.format(num_updated, len(word_index_dict))) 75 | 76 | class Word2Vec(GloVe): 77 | def __init__(self, embd_dim=50): 78 | super().__init__(embd_dim=embd_dim) 79 | 80 | def _load(self): 81 | print('Reading {} dim Gensim Word2Vec vectors'.format(self.EMBEDDING_DIM)) 82 | self.embeddings_index = {} 83 | with open(os.path.join(config.WORD2VEC_DIR, 'word2vec_'+str(self.EMBEDDING_DIM)+'_imdb.txt'),encoding="utf8") as fin: 84 | for line in fin: 85 | try: 86 | values = line.split() 87 | coefs = np.asarray(values[1:], dtype='float32') 88 | word = values[0] 89 | self.embeddings_index[word] = coefs 90 | except: 91 | print(line) 92 | 93 | print('Found %s word vectors.' % len(self.embeddings_index)) 94 | #test 95 | #glove=Word2Vec(50) 96 | #initial_embeddings = glove.get_embedding({'good':2, 'movie':3}) 97 | -------------------------------------------------------------------------------- /notebooks/Ch07 - Text Document Categorization/dataloader/loader.py: -------------------------------------------------------------------------------- 1 | # -*- coding: utf-8 -*- 2 | """ 3 | Created on Mon Apr 9 10:44:05 2018 4 | 5 | @author: tghosh 6 | """ 7 | import config 8 | from preprocessing import utils 9 | import re 10 | import os 11 | import numpy as np 12 | import pandas as pd 13 | from sklearn.datasets import fetch_20newsgroups 14 | from nltk.corpus import reuters 15 | from sklearn.preprocessing import LabelEncoder 16 | 17 | class Loader: 18 | 19 | amzn_reviews_kaggle_regx = re.compile(r'__label__(?P