├── .gitignore
├── LICENSE
├── README.md
├── media
│   └── banners
│       ├── amazon_logo.png
│       ├── front_cover.png
│       ├── packt_logo.png
│       └── safari_logo.png
└── notebooks
├── Ch01 - Machine Learning Fundamentals
├── battles.csv
├── credit_default.xls
├── feature_engineering_numerical_and_categorical_data.ipynb
├── feature_engineering_numerical_and_categorical_data.py
├── feature_engineering_text_data.ipynb
├── feature_engineering_text_data.py
├── game_of_thrones_eda.ipynb
└── game_of_thrones_eda.py
├── Ch02 - Deep Learning Essentials
└── NNBasics.ipynb
├── Ch05 - Unleash the Power of Transfer Learning
├── Basic CNN Model.ipynb
├── CNN with Image Augmentation.ipynb
├── CNN with Transfer Learning.ipynb
├── Datasets Builder.ipynb
├── Model Performance Evaluations.ipynb
├── model_evaluation_utils.py
└── utils.py
├── Ch06 - Image Recognition and Classification
├── 0fc12a365adfcbb603e298b10149632a.jpg
├── CIFAR10_CNN_Classifier.ipynb
├── CIFAR10_VGG16_Transfer_Learning_Classifier.ipynb
├── Dog_Breed_EDA.ipynb
├── Dog_Breed_Transfer_Learning_Classifier.ipynb
├── cnn_utils.py
├── dog_breed_transfer_learning_classifier.py
└── model_evaluation_utils.py
├── Ch07 - Text Document Categorization
├── 20_newsgrp_cnn_model.ipynb
├── IMDB_word2Vec.ipynb
├── Text_Summarization_IMDB.ipynb
├── amazon_review_model.py
├── config.py
├── dataloader
│ ├── __init__.py
│ ├── embeddings.py
│ └── loader.py
├── imdb_model.py
├── model
│ ├── __init__.py
│ ├── cnn_document_model.py
│ └── custom_layer.py
├── preprocessing
│ ├── __init__.py
│ └── utils.py
├── transfer_learning_imdb.py
└── utils.py
├── Ch08 - Audio Identification and Categorization
├── Exploratory Analysis Sound Data.ipynb
├── Feature Engineering.ipynb
├── Modeling.ipynb
└── Prediction Pipeline.ipynb
├── Ch09 - Deep Dream
├── DSC09296.jpg
├── Deep Dream Final.ipynb
├── blue-sky.jpg
├── labrador.jpg
└── mccowan_mountain_orig.jpg
├── Ch10 - Neural Style Transfer
├── Style Transfer Model.ipynb
└── Style Transfer Results HD.ipynb
├── Ch11 - Automated Image Caption Generator
├── Image feature extraction.ipynb
├── Model Test.ipynb
└── Model Train.ipynb
├── Ch12 - Image Colorization
├── colorization_task.png
├── colornet_architecture.png
├── colornet_vgg16.ipynb
├── colorspaces.png
└── images
│ ├── 0311abf53d60705cc9605bc46589b0d6.jpeg
│ ├── 060df5a829e988f6b5de7c089dc3c05d.jpeg
│ ├── 07c4859f54fb3184aebca9a7b3aa5317.jpeg
│ ├── 097ec13396d0593ddd00e360b7375b8f.jpeg
│ ├── 0dc3e95a9954c5d50e5ddac5bf774e09.jpeg
│ ├── 1261ea1079ab97b732812e328c3a5c48.jpeg
│ ├── 156979415efa5edc3420558a884b3536.jpeg
│ ├── 15f6abb6f801e04c880008f39a0ba558.jpeg
│ ├── 1b99b787ef471af6c652e01737d883a6.jpeg
│ ├── 1cc416b4897eca408ad396a09ad000cd.jpeg
│ ├── 232a6c59d6965f9f466dd2390829a69a.jpeg
│ ├── 2374de3c74aee73fd495c57eee8e4ab0.jpeg
│ ├── 24e8e4a8b32e2ccbc1e7b2798cb8fcd8.jpeg
│ ├── 2fbb562e61f6460cf9820940d61b7001.jpeg
│ ├── 328e41c2fbc194b0f8900007a61c6d4a.jpeg
│ ├── 3b8640a78d79be87f927a63174902346.jpeg
│ ├── 529b4fb2e62f249a87b5908debaf73e7.jpeg
│ ├── 56eb7309c5bc5cb9629b2db830a1b025.jpeg
│ ├── 58fd582ca79217457b0d8807a6302824.jpeg
│ ├── 5a0728978e9a5180076ec357bc28c92e.jpeg
│ ├── 5a2987b714c15456c0e038dabcf426e2.jpeg
│ ├── 62e2af22b63e4674759d811aab1b6679.jpeg
│ ├── 62f4423f981dda4a508781f4845b0c09.jpeg
│ ├── 663b353d812988e7635ce6709414d6aa.jpeg
│ ├── 6a47074ff275acedfa82ffbc9025b703.jpeg
│ ├── 6b85128b02f95c628c85437b1eef38a4.jpeg
│ ├── 6d905979f4788767939883c0f8b4250b.jpeg
│ ├── 7a24ba5bfdbb9602f78197e3e103feb4.jpeg
│ ├── 7ed962e763ce17c16df44cfa96dfd047.jpeg
│ ├── 81491e7159595d373c027e9c337eecfa.jpeg
│ ├── 83f948c28622623c088d6d7cc0d02b18.jpeg
│ ├── 8587f219580ae6f207490ff02f853df3.jpeg
│ ├── 8afc970abdab28220ad7a0be25457a2e.jpeg
│ ├── 92e06aafe0ca084825921deb2b4c5c55.jpeg
│ ├── ae69df2e9995ee238f5ea93090a3981d.jpeg
│ ├── b208394092522e1a74b7ce9d8b558022.jpeg
│ ├── bb9d9ece213507e6c45852633c6e61e8.jpeg
│ ├── cb2af8fbb9cd6d48eb1009c68349cb1e.jpeg
│ ├── cba1776f1d7129f9f83a4d9fc4b89039.jpeg
│ ├── cbb19bd188a96067f478c0a9b1559844.jpeg
│ ├── cc3d9a6d928111e9861bebd43b475f63.jpeg
│ ├── desktop.ini
│ ├── e2b2bcccdba6d1293a7e89ef3d6df112.jpeg
│ ├── e72e923ffc4d51faf1a99d09bd59896a.jpeg
│ ├── fe79cd9cda63a4af3924d2718b7e775a.jpeg
│ └── ff20b4d22ed3e1e8829768201d110f53.jpeg
└── README.md
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | Apache License
2 | Version 2.0, January 2004
3 | http://www.apache.org/licenses/
4 |
5 | TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
6 |
7 | 1. Definitions.
8 |
9 | "License" shall mean the terms and conditions for use, reproduction,
10 | and distribution as defined by Sections 1 through 9 of this document.
11 |
12 | "Licensor" shall mean the copyright owner or entity authorized by
13 | the copyright owner that is granting the License.
14 |
15 | "Legal Entity" shall mean the union of the acting entity and all
16 | other entities that control, are controlled by, or are under common
17 | control with that entity. For the purposes of this definition,
18 | "control" means (i) the power, direct or indirect, to cause the
19 | direction or management of such entity, whether by contract or
20 | otherwise, or (ii) ownership of fifty percent (50%) or more of the
21 | outstanding shares, or (iii) beneficial ownership of such entity.
22 |
23 | "You" (or "Your") shall mean an individual or Legal Entity
24 | exercising permissions granted by this License.
25 |
26 | "Source" form shall mean the preferred form for making modifications,
27 | including but not limited to software source code, documentation
28 | source, and configuration files.
29 |
30 | "Object" form shall mean any form resulting from mechanical
31 | transformation or translation of a Source form, including but
32 | not limited to compiled object code, generated documentation,
33 | and conversions to other media types.
34 |
35 | "Work" shall mean the work of authorship, whether in Source or
36 | Object form, made available under the License, as indicated by a
37 | copyright notice that is included in or attached to the work
38 | (an example is provided in the Appendix below).
39 |
40 | "Derivative Works" shall mean any work, whether in Source or Object
41 | form, that is based on (or derived from) the Work and for which the
42 | editorial revisions, annotations, elaborations, or other modifications
43 | represent, as a whole, an original work of authorship. For the purposes
44 | of this License, Derivative Works shall not include works that remain
45 | separable from, or merely link (or bind by name) to the interfaces of,
46 | the Work and Derivative Works thereof.
47 |
48 | "Contribution" shall mean any work of authorship, including
49 | the original version of the Work and any modifications or additions
50 | to that Work or Derivative Works thereof, that is intentionally
51 | submitted to Licensor for inclusion in the Work by the copyright owner
52 | or by an individual or Legal Entity authorized to submit on behalf of
53 | the copyright owner. For the purposes of this definition, "submitted"
54 | means any form of electronic, verbal, or written communication sent
55 | to the Licensor or its representatives, including but not limited to
56 | communication on electronic mailing lists, source code control systems,
57 | and issue tracking systems that are managed by, or on behalf of, the
58 | Licensor for the purpose of discussing and improving the Work, but
59 | excluding communication that is conspicuously marked or otherwise
60 | designated in writing by the copyright owner as "Not a Contribution."
61 |
62 | "Contributor" shall mean Licensor and any individual or Legal Entity
63 | on behalf of whom a Contribution has been received by Licensor and
64 | subsequently incorporated within the Work.
65 |
66 | 2. Grant of Copyright License. Subject to the terms and conditions of
67 | this License, each Contributor hereby grants to You a perpetual,
68 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
69 | copyright license to reproduce, prepare Derivative Works of,
70 | publicly display, publicly perform, sublicense, and distribute the
71 | Work and such Derivative Works in Source or Object form.
72 |
73 | 3. Grant of Patent License. Subject to the terms and conditions of
74 | this License, each Contributor hereby grants to You a perpetual,
75 | worldwide, non-exclusive, no-charge, royalty-free, irrevocable
76 | (except as stated in this section) patent license to make, have made,
77 | use, offer to sell, sell, import, and otherwise transfer the Work,
78 | where such license applies only to those patent claims licensable
79 | by such Contributor that are necessarily infringed by their
80 | Contribution(s) alone or by combination of their Contribution(s)
81 | with the Work to which such Contribution(s) was submitted. If You
82 | institute patent litigation against any entity (including a
83 | cross-claim or counterclaim in a lawsuit) alleging that the Work
84 | or a Contribution incorporated within the Work constitutes direct
85 | or contributory patent infringement, then any patent licenses
86 | granted to You under this License for that Work shall terminate
87 | as of the date such litigation is filed.
88 |
89 | 4. Redistribution. You may reproduce and distribute copies of the
90 | Work or Derivative Works thereof in any medium, with or without
91 | modifications, and in Source or Object form, provided that You
92 | meet the following conditions:
93 |
94 | (a) You must give any other recipients of the Work or
95 | Derivative Works a copy of this License; and
96 |
97 | (b) You must cause any modified files to carry prominent notices
98 | stating that You changed the files; and
99 |
100 | (c) You must retain, in the Source form of any Derivative Works
101 | that You distribute, all copyright, patent, trademark, and
102 | attribution notices from the Source form of the Work,
103 | excluding those notices that do not pertain to any part of
104 | the Derivative Works; and
105 |
106 | (d) If the Work includes a "NOTICE" text file as part of its
107 | distribution, then any Derivative Works that You distribute must
108 | include a readable copy of the attribution notices contained
109 | within such NOTICE file, excluding those notices that do not
110 | pertain to any part of the Derivative Works, in at least one
111 | of the following places: within a NOTICE text file distributed
112 | as part of the Derivative Works; within the Source form or
113 | documentation, if provided along with the Derivative Works; or,
114 | within a display generated by the Derivative Works, if and
115 | wherever such third-party notices normally appear. The contents
116 | of the NOTICE file are for informational purposes only and
117 | do not modify the License. You may add Your own attribution
118 | notices within Derivative Works that You distribute, alongside
119 | or as an addendum to the NOTICE text from the Work, provided
120 | that such additional attribution notices cannot be construed
121 | as modifying the License.
122 |
123 | You may add Your own copyright statement to Your modifications and
124 | may provide additional or different license terms and conditions
125 | for use, reproduction, or distribution of Your modifications, or
126 | for any such Derivative Works as a whole, provided Your use,
127 | reproduction, and distribution of the Work otherwise complies with
128 | the conditions stated in this License.
129 |
130 | 5. Submission of Contributions. Unless You explicitly state otherwise,
131 | any Contribution intentionally submitted for inclusion in the Work
132 | by You to the Licensor shall be under the terms and conditions of
133 | this License, without any additional terms or conditions.
134 | Notwithstanding the above, nothing herein shall supersede or modify
135 | the terms of any separate license agreement you may have executed
136 | with Licensor regarding such Contributions.
137 |
138 | 6. Trademarks. This License does not grant permission to use the trade
139 | names, trademarks, service marks, or product names of the Licensor,
140 | except as required for reasonable and customary use in describing the
141 | origin of the Work and reproducing the content of the NOTICE file.
142 |
143 | 7. Disclaimer of Warranty. Unless required by applicable law or
144 | agreed to in writing, Licensor provides the Work (and each
145 | Contributor provides its Contributions) on an "AS IS" BASIS,
146 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
147 | implied, including, without limitation, any warranties or conditions
148 | of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
149 | PARTICULAR PURPOSE. You are solely responsible for determining the
150 | appropriateness of using or redistributing the Work and assume any
151 | risks associated with Your exercise of permissions under this License.
152 |
153 | 8. Limitation of Liability. In no event and under no legal theory,
154 | whether in tort (including negligence), contract, or otherwise,
155 | unless required by applicable law (such as deliberate and grossly
156 | negligent acts) or agreed to in writing, shall any Contributor be
157 | liable to You for damages, including any direct, indirect, special,
158 | incidental, or consequential damages of any character arising as a
159 | result of this License or out of the use or inability to use the
160 | Work (including but not limited to damages for loss of goodwill,
161 | work stoppage, computer failure or malfunction, or any and all
162 | other commercial damages or losses), even if such Contributor
163 | has been advised of the possibility of such damages.
164 |
165 | 9. Accepting Warranty or Additional Liability. While redistributing
166 | the Work or Derivative Works thereof, You may choose to offer,
167 | and charge a fee for, acceptance of support, warranty, indemnity,
168 | or other liability obligations and/or rights consistent with this
169 | License. However, in accepting such obligations, You may act only
170 | on Your own behalf and on Your sole responsibility, not on behalf
171 | of any other Contributor, and only if You agree to indemnify,
172 | defend, and hold each Contributor harmless for any liability
173 | incurred by, or claims asserted against, such Contributor by reason
174 | of your accepting any such warranty or additional liability.
175 |
176 | END OF TERMS AND CONDITIONS
177 |
178 | APPENDIX: How to apply the Apache License to your work.
179 |
180 | To apply the Apache License to your work, attach the following
181 | boilerplate notice, with the fields enclosed by brackets "[]"
182 | replaced with your own identifying information. (Don't include
183 | the brackets!) The text should be enclosed in the appropriate
184 | comment syntax for the file format. We also recommend that a
185 | file or class name and description of purpose be included on the
186 | same "printed page" as the copyright notice for easier
187 | identification within third-party archives.
188 |
189 | Copyright [yyyy] [name of copyright owner]
190 |
191 | Licensed under the Apache License, Version 2.0 (the "License");
192 | you may not use this file except in compliance with the License.
193 | You may obtain a copy of the License at
194 |
195 | http://www.apache.org/licenses/LICENSE-2.0
196 |
197 | Unless required by applicable law or agreed to in writing, software
198 | distributed under the License is distributed on an "AS IS" BASIS,
199 | WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
200 | See the License for the specific language governing permissions and
201 | limitations under the License.
202 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Hands-On Transfer Learning with Python
2 | ### Implement advanced deep learning and neural network models using TensorFlow and Keras
3 |
4 | With the world moving towards digitalization and automation, it is important for technologists and programmers to keep themselves updated and learn how to leverage these tools and techniques. [*__"Hands-On Transfer Learning with Python"__*](https://github.com/dipanjanS/hands-on-transfer-learning-with-python#contents) is an attempt to help practitioners get acquainted with and equipped to use these advancements in their respective domains. This book is broadly structured into three sections:
5 | + Deep learning foundations
6 | + Essentials of transfer learning
7 | + Transfer learning case studies
8 |
9 | This repository contains all the code, notebooks and examples used in this book. We will also be adding bonus content here from time to time. So keep watching this space!
10 |
11 |
12 | ## Get the book
13 |
32 |
33 | ## About the book
34 |
35 |
36 |
37 |
38 | Transfer learning is a machine learning (ML) technique where knowledge gained while training a model on one set of ML problems is reused to solve other, similar problems. The purpose of this book is two-fold. The first is detailed coverage of deep learning and transfer learning, comparing and contrasting the two with easy-to-follow concepts and examples. The second is hands-on coverage of real-world examples and research problems using [`tensorflow`](https://www.tensorflow.org/), [`keras`](https://keras.io/), and the Python ecosystem.
39 |
40 | The book starts with the core essential concepts of ML and deep learning, followed by coverage of important deep learning architectures such as CNNs, DNNs, RNNs, LSTMs, and capsule networks. Our focus then shifts to transfer learning concepts and pretrained state-of-the-art networks such as VGG, Inception, and ResNet. We also learn how these systems can be leveraged to improve the performance of our deep learning models. Finally, we focus on a multitude of real-world case studies and problems in areas such as computer vision, audio analysis, and natural language processing (NLP). By the end of this book, you will be ready to implement both deep learning and transfer learning principles in your own systems.
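To make that workflow concrete, here is a minimal, illustrative sketch of the core transfer learning recipe the case-study chapters build on: reuse a pretrained convolutional base (VGG16 here) as a frozen feature extractor and train only a small task-specific head. This snippet is not taken from the book's notebooks; it assumes `tensorflow.keras` and hypothetical data arrays such as `train_images`/`train_labels`.

```python
from tensorflow.keras import layers, models
from tensorflow.keras.applications import VGG16

# Load the convolutional base pretrained on ImageNet, drop its classifier head,
# and freeze it so only the new head's weights are trained.
base = VGG16(weights='imagenet', include_top=False, input_shape=(150, 150, 3))
base.trainable = False

# Small task-specific head on top of the frozen features (binary task, e.g. dogs vs. cats).
model = models.Sequential([
    base,
    layers.Flatten(),
    layers.Dense(256, activation='relu'),
    layers.Dropout(0.3),
    layers.Dense(1, activation='sigmoid'),
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# model.fit(train_images, train_labels, validation_data=(val_images, val_labels), epochs=5)
```

Fine-tuning, where the top few layers of the pretrained base are unfrozen and trained with a low learning rate, follows the same pattern.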
41 |
42 |
43 | Edition: 1st · Pages: 438 · Language: English
44 | Book Title: Hands-On Transfer Learning with Python · Publisher: Packt
45 | Copyright: Sarkar, Bali & Ghosh · ISBN 13: 9781788831307
46 |
47 |
48 |
49 |
50 |
51 | ## [Contents](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#book-contents)
52 |
53 | - [__Part I: Deep learning foundations__](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#part-i-deep-learning-foundations)
54 | - [Chapter 1: Machine Learning Fundamentals](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch01%20-%20Machine%20Learning%20Fundamentals)
55 | - [Chapter 2: Deep Learning Essentials](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch02%20-%20Deep%20Learning%20Essentials)
56 | - Chapter 3: Understanding Deep Learning Architectures
57 | - [__Part II: Essentials of transfer learning__](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#part-ii-essentials-of-transfer-learning)
58 | - Chapter 4: Transfer Learning Fundamentals
59 | - [Chapter 5: Unleashing the Power of Transfer Learning](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch05%20-%20Unleash%20the%20Power%20of%20Transfer%20Learning)
60 | - [__Part III: Transfer learning case studies__](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks#part-iii-transfer-learning-case-studies)
61 | - [Chapter 6: Image Recognition and Classification](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch06%20-%20Image%20Recognition%20and%20Classification)
62 | - [Chapter 7: Text Document Categorization](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch07%20-%20Text%20Document%20Categorization)
63 | - [Chapter 8: Audio Identification and Classification](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch08%20-%20Audio%20Identification%20and%20Categorization)
64 | - [Chapter 9: Deep Dream](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch09%20-%20Deep%20Dream)
65 | - [Chapter 10: Style Transfer](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch10%20-%20Neural%20Style%20Transfer)
66 | - [Chapter 11: Automated Image Caption Generator](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch11%20-%20Automated%20Image%20Caption%20Generator)
67 | - [Chapter 12: Image Colorization](https://github.com/dipanjanS/hands-on-transfer-learning-with-python/tree/master/notebooks/Ch12%20-%20Image%20Colorization)
68 |
69 |
70 | ## Key Features:
71 | + Build deep learning models with transfer learning principles in Python
72 | + Implement transfer learning to solve real-world research problems
73 | + Perform complex operations such as image captioning and neural style transfer
74 |
75 | ## What You Will Learn:
76 | + Set up your own DL environment with graphics processing unit (GPU) and Cloud support
77 | + Delve into transfer learning principles with ML and DL models
78 | + Explore various DL architectures, including CNN, LSTM, and capsule networks
79 | + Learn about data and network representation and loss functions
80 | + Get to grips with models and strategies in transfer learning
81 | + Walk through potential challenges in building complex transfer learning models from scratch
82 | + Explore real-world research problems related to computer vision and audio analysis
83 | + Understand how transfer learning can be leveraged in NLP
84 |
85 |
86 |
87 | ## Audience
88 | Hands-On Transfer Learning with Python is for data scientists, ML engineers, analysts, and developers with an interest in data and applying state-of-the-art transfer learning methodologies to solve tough real-world problems.
89 | __Basic proficiency in ML and Python is required.__
90 |
91 | ## Acknowledgements
92 | TBA
93 |
--------------------------------------------------------------------------------
/media/banners/amazon_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/amazon_logo.png
--------------------------------------------------------------------------------
/media/banners/front_cover.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/front_cover.png
--------------------------------------------------------------------------------
/media/banners/packt_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/packt_logo.png
--------------------------------------------------------------------------------
/media/banners/safari_logo.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/media/banners/safari_logo.png
--------------------------------------------------------------------------------
/notebooks/Ch01 - Machine Learning Fundamentals/battles.csv:
--------------------------------------------------------------------------------
1 | name,year,battle_number,attacker_king,defender_king,attacker_1,attacker_2,attacker_3,attacker_4,defender_1,defender_2,defender_3,defender_4,attacker_outcome,battle_type,major_death,major_capture,attacker_size,defender_size,attacker_commander,defender_commander,summer,location,region,note
Battle of the Golden Tooth,298,1,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,1,0,15000,4000,Jaime Lannister,"Clement Piper, Vance",1,Golden Tooth,The Westerlands,
Battle at the Mummer's Ford,298,2,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Baratheon,,,,win,ambush,1,0,,120,Gregor Clegane,Beric Dondarrion,1,Mummer's Ford,The Riverlands,
Battle of Riverrun,298,3,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,win,pitched battle,0,1,15000,10000,"Jaime Lannister, Andros Brax","Edmure Tully, Tytos Blackwood",1,Riverrun,The Riverlands,
Battle of the Green Fork,298,4,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,1,18000,20000,"Roose Bolton, Wylis Manderly, Medger Cerwyn, Harrion Karstark, Halys Hornwood","Tywin Lannister, Gregor Clegane, Kevan Lannister, Addam Marbrand",1,Green Fork,The Riverlands,
Battle of the Whispering Wood,298,5,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,1875,6000,"Robb Stark, Brynden Tully",Jaime Lannister,1,Whispering Wood,The Riverlands,
Battle of the Camps,298,6,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,0,0,6000,12625,"Robb Stark, Tytos Blackwood, Brynden Tully","Lord Andros Brax, Forley Prester",1,Riverrun,The Riverlands,
Sack of Darry,298,7,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Darry,,,,win,pitched battle,0,0,,,Gregor Clegane,Lyman Darry,1,Darry,The Riverlands,
Battle of Moat Cailin,299,8,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,pitched battle,0,0,,,Victarion Greyjoy,,1,Moat Cailin,The North,
Battle of Deepwood Motte,299,9,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,siege,0,0,1000,,Asha Greyjoy,,1,Deepwood Motte,The North,
Battle of the Stony Shore,299,10,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,0,264,,Theon Greyjoy,,1,Stony Shore,The North,"Greyjoy's troop number based on the Battle of Deepwood Motte, in which Asha had 1000 soldier on 30 longships. That comes out to ~33 per longship. In the Battle of the Stony Shore, Theon has 8 longships, and just we can estimate that he has 8*33 =265 troops."
Battle of Torrhen's Square,299,11,Robb Stark,Balon/Euron Greyjoy,Stark,,,,Greyjoy,,,,win,pitched battle,0,0,244,900,"Rodrik Cassel, Cley Cerwyn",Dagmer Cleftjaw,1,Torrhen's Square,The North,Greyjoy's troop number comes from the 264 estimate to have arrived on the stony shore minus the 20 Theon takes to attack Winterfell. Thus 264-20=244
Battle of Winterfell,299,12,Balon/Euron Greyjoy,Robb Stark,Greyjoy,,,,Stark,,,,win,ambush,0,1,20,,Theon Greyjoy,Bran Stark,1,Winterfell,The North,"It isn't mentioned how many Stark men are left in Winterfell, other than ""very few""."
Sack of Torrhen's Square,299,13,Balon/Euron Greyjoy,Balon/Euron Greyjoy,Greyjoy,,,,Stark,,,,win,siege,0,1,,,Dagmer Cleftjaw,,1,Torrhen's Square,The North,
Sack of Winterfell,299,14,Joffrey/Tommen Baratheon,Robb Stark,Bolton,Greyjoy,,,Stark,,,,win,ambush,1,0,618,2000,"Ramsay Snow, Theon Greyjoy ","Rodrik Cassel, Cley Cerwyn, Leobald Tallhart",1,Winterfell,The North,"Since House Bolton betrays the Starks for House Lannister, we code this battle as between these two houses. Greyjoy men, numbering only 20, don't play a major part in the fighting and end up dying anyway."
Battle of Oxcross,299,15,Robb Stark,Joffrey/Tommen Baratheon,Stark,Tully,,,Lannister,,,,win,ambush,1,1,6000,10000,"Robb Stark, Brynden Tully","Stafford Lannister, Roland Crakehall, Antario Jast",1,Oxcross,The Westerlands,
Siege of Storm's End,299,16,Stannis Baratheon,Renly Baratheon,Baratheon,,,,Baratheon,,,,win,siege,1,0,5000,20000,"Stannis Baratheon, Davos Seaworth","Renly Baratheon, Cortnay Penrose, Loras Tyrell, Randyll Tarly, Mathis Rowan",1,Storm's End,The Stormlands,
Battle of the Fords,299,17,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Tully,,,,loss,pitched battle,0,0,20000,10000,"Tywin Lannister, Flement Brax, Gregor Clegane, Addam Marbrand, Lyle Crakehall, Leo Lefford","Edmure Tully, Jason Mallister, Karyl Vance",1,Red Fork,The Riverlands,
Sack of Harrenhal,299,18,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,1,0,100,100,"Roose Bolton, Vargo Hoat, Robett Glover",Amory Lorch,1,Harrenhal,The Riverlands,
Battle of the Crag,299,19,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,win,ambush,0,0,6000,,"Robb Stark, Smalljon Umber, Black Walder Frey",Rolph Spicer,1,Crag,The Westerlands,
Battle of the Blackwater,299,20,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,,,,Lannister,,,,loss,pitched battle,1,1,21000,7250,"Stannis Baratheon, Imry Florent, Guyard Morrigen, Rolland Storm, Salladhor Saan, Davos Seaworth","Tyrion Lannister, Jacelyn Bywater, Sandor Clegane, Tywin Lannister, Garlan Tyrell, Mace Tyrell, Randyll Tarly",1,King's Landing,The Crownlands,
Siege of Darry,299,21,Robb Stark,Joffrey/Tommen Baratheon,Darry,,,,Lannister,,,,win,siege,0,0,,,Helman Tallhart,,1,Darry,The Riverlands,
Battle of Duskendale,299,22,Robb Stark,Joffrey/Tommen Baratheon,Stark,,,,Lannister,,,,loss,pitched battle,1,0,3000,,"Robertt Glover, Helman Tallhart","Randyll Tarly, Gregor Clegane",1,Duskendale,The Crownlands,
Battle of the Burning Septry,299,23,,,Brotherhood without Banners,,,,Brave Companions,,,,win,pitched battle,0,0,,,,,1,,The Riverlands,
Battle of the Ruby Ford,299,24,Joffrey/Tommen Baratheon,Robb Stark,Lannister,,,,Stark,,,,win,pitched battle,0,0,,6000,Gregor Clegane,"Roose Bolton, Wylis Manderly",,Ruby Ford,The Riverlands,
Retaking of Harrenhal,299,25,Joffrey/Tommen Baratheon,,Lannister,,,,Brave Companions,,,,win,pitched battle,1,0,,,Gregor Clegane,Vargo Hoat,1,Harrenhal,The Riverlands,
The Red Wedding,299,26,Joffrey/Tommen Baratheon,Robb Stark,Frey,Bolton,,,Stark,,,,win,ambush,1,1,3500,3500,"Walder Frey, Roose Bolton, Walder Rivers",Robb Stark,1,The Twins,The Riverlands,"This observation refers to the battle against the Stark men, not the attack on the wedding"
Siege of Seagard,299,27,Robb Stark,Joffrey/Tommen Baratheon,Frey,,,,Mallister,,,,win,siege,0,1,,,Walder Frey,Jason Mallister,1,Seagard,The Riverlands,
Battle of Castle Black,300,28,Stannis Baratheon,Mance Rayder,Free folk,Thenns,Giants,,Night's Watch,Baratheon,,,loss,siege,1,1,100000,1240,"Mance Rayder, Tormund Giantsbane, Harma Dogshead, Magnar Styr, Varamyr","Stannis Baratheon, Jon Snow, Donal Noye, Cotter Pyke",0,Castle Black,Beyond the Wall,
Fall of Moat Cailin,300,29,Joffrey/Tommen Baratheon,Balon/Euron Greyjoy,Bolton,,,,Greyjoy,,,,win,siege,0,0,,,Ramsey Bolton,,0,Moat Cailin,The North,
Sack of Saltpans,300,30,,,Brave Companions,,,,,,,,win,razing,0,0,,,Rorge,,0,Saltpans,The Riverlands,
Retaking of Deepwood Motte,300,31,Stannis Baratheon,Balon/Euron Greyjoy,Baratheon,Karstark,Mormont,Glover,Greyjoy,,,,win,pitched battle,0,0,4500,200,"Stannis Baratheon, Alysane Mormot",Asha Greyjoy,0,Deepwood Motte,The North,
Battle of the Shield Islands,300,32,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,pitched battle,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,Shield Islands,The Reach,
"Invasion of Ryamsport, Vinetown, and Starfish Harbor",300,33,Balon/Euron Greyjoy,Joffrey/Tommen Baratheon,Greyjoy,,,,Tyrell,,,,win,razing,0,0,,,"Euron Greyjoy, Victarion Greyjoy",,0,"Ryamsport, Vinetown, Starfish Harbor",The Reach,
Second Seige of Storm's End,300,34,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,,200,"Mace Tyrell, Mathis Rowan",Gilbert Farring,0,Storm's End,The Stormlands,
Siege of Dragonstone,300,35,Joffrey/Tommen Baratheon,Stannis Baratheon,Baratheon,,,,Baratheon,,,,win,siege,0,0,2000,,"Loras Tyrell, Raxter Redwyne",Rolland Storm,0,Dragonstone,The Stormlands,
Siege of Riverrun,300,36,Joffrey/Tommen Baratheon,Robb Stark,Lannister,Frey,,,Tully,,,,win,siege,0,0,3000,,"Daven Lannister, Ryman Fey, Jaime Lannister",Brynden Tully,0,Riverrun,The Riverlands,
Siege of Raventree,300,37,Joffrey/Tommen Baratheon,Robb Stark,Bracken,Lannister,,,Blackwood,,,,win,siege,0,1,1500,,"Jonos Bracken, Jaime Lannister",Tytos Blackwood,0,Raventree,The Riverlands,
Siege of Winterfell,300,38,Stannis Baratheon,Joffrey/Tommen Baratheon,Baratheon,Karstark,Mormont,Glover,Bolton,Frey,,,,,,,5000,8000,Stannis Baratheon,Roose Bolton,0,Winterfell,The North,
--------------------------------------------------------------------------------
/notebooks/Ch01 - Machine Learning Fundamentals/credit_default.xls:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/notebooks/Ch01 - Machine Learning Fundamentals/credit_default.xls
--------------------------------------------------------------------------------
/notebooks/Ch01 - Machine Learning Fundamentals/feature_engineering_numerical_and_categorical_data.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # # Feature Engineering
5 | import numpy as np
6 | import pandas as pd
7 |
8 | # plotting
9 | import seaborn as sns
10 | import matplotlib.pyplot as plt
11 |
12 | # setting params
13 | params = {'legend.fontsize': 'x-large',
14 | 'figure.figsize': (30, 10),
15 | 'axes.labelsize': 'x-large',
16 | 'axes.titlesize':'x-large',
17 | 'xtick.labelsize':'x-large',
18 | 'ytick.labelsize':'x-large'}
19 |
20 | sns.set_style('whitegrid')
21 | sns.set_context('talk')
22 |
23 |
24 | # ## Feature Engineering : Numerical Data
25 |
26 | # load dataset
27 | credit_df = pd.read_excel('credit_default.xls',
28 | skiprows=1,index_col=0)
29 | credit_df.shape
30 |
31 |
32 | credit_df.head()
33 |
34 |
35 | # ### Extract Raw Features
36 | # Attributes that are useful in their raw form
37 |
38 | credit_df[['LIMIT_BAL','BILL_AMT1',
39 | 'BILL_AMT2','BILL_AMT3',
40 | 'BILL_AMT4','BILL_AMT5',
41 | 'BILL_AMT6']].head()
42 |
43 |
44 | # ### Counts
45 | # Depending on requirements, the count of events can also be a useful attribute.
46 |
47 | # utility function
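# Assumption about the source data: the UCI credit default sheet names its repayment-status
# columns PAY_0 and PAY_2..PAY_6 (there is no PAY_1), and a value > 0 indicates a payment
# delay in that month -- hence the indices iterated below.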
48 | def default_month_count(row):
49 | count = 0
50 | for i in [0,2,3,4,5,6]:
51 | if row['PAY_'+str(i)] > 0:
52 | count +=1
53 | return count
54 |
55 |
56 |
57 | credit_df['number_of_default_months'] = credit_df.apply(default_month_count,
58 | axis=1)
59 |
60 |
61 | credit_df[['number_of_default_months']].head()
62 |
63 |
64 | # ### Binarization
65 | # Occurrence or absence of an event is also a useful feature
66 |
67 | credit_df['has_ever_defaulted'] = credit_df.number_of_default_months.apply(lambda x: 1 if x>0 else 0)
68 | credit_df[['number_of_default_months','has_ever_defaulted']].head()
69 |
70 |
71 | # ### Binning
72 | # Also known as quantization, binning helps in transforming continuous features such as
73 | # age and income into discrete scales.
74 |
75 | credit_df.AGE.plot(kind='hist',bins=60)
76 | plt.title('Age Histogram', fontsize=12)
77 | plt.ylabel('Frequency', fontsize=12)
78 | plt.xlabel('Age', fontsize=12)
79 |
80 |
81 | # #### Fixed Width Bins
82 |
83 | # Fixed Width Bins :
84 | #
85 | # ```
86 | # Age Range: Bin
87 | # ---------------
88 | # 0 - 9 : 0
89 | # 10 - 19 : 1
90 | # 20 - 29 : 2
91 | # 30 - 39 : 3
92 | # ... and so on
93 | # ```
94 |
95 | # Assign a bin label to each row
96 | credit_df['age_bin_fixed'] = credit_df.AGE.apply(lambda age: np.floor(age/10.))
97 |
98 |
99 | credit_df[['AGE','age_bin_fixed']].head()
100 |
101 |
102 | # #### Quantile Based Binning
103 | # * 4-Quartile Binning
104 |
105 | ## Quantile binning
106 | quantile_list = [0, .25, .5, .75, 1.]
107 | quantiles = credit_df.AGE.quantile(quantile_list)
108 | quantiles
109 |
110 |
111 | # Plot Quartile Ranges on the Distribution
112 |
113 | fig, ax = plt.subplots()
114 | credit_df.AGE.plot(kind='hist',bins=60)
115 |
116 | for quantile in quantiles:
117 | qvl = plt.axvline(quantile, color='r')
118 | ax.legend([qvl], ['Quantiles'], fontsize=10)
119 |
120 | ax.set_title('Age Histogram with Quantiles', fontsize=12)
121 | ax.set_xlabel('Age', fontsize=12)
122 | ax.set_ylabel('Frequency', fontsize=12)
123 |
124 |
125 | # Assign Quartile Bin Labels
126 | quantile_labels = ['Q1', 'Q2', 'Q3', 'Q4']
127 | credit_df['age_quantile_range'] = pd.qcut(credit_df['AGE'],
128 | q=quantile_list)
129 | credit_df['age_quantile_label'] = pd.qcut(credit_df['AGE'],
130 | q=quantile_list,
131 | labels=quantile_labels)
132 |
133 |
134 | credit_df[['AGE','age_quantile_range','age_quantile_label']].head()
135 |
136 |
137 |
138 | # ## Feature Engineering : Categorical Data
139 | # We have utilized multiple publicly available datasets to better understand
140 | # categorical attributes
141 |
142 | battles_df = pd.read_csv('battles.csv')
143 | battles_df.shape
144 |
145 |
146 | battles_df[['name','year','attacker_king','attacker_1']].head()
147 |
148 |
149 | # ### Transforming Nominal Features
150 | # Categorical attributes which ***do not*** have any intrinsic
151 | # ordering amongst them
152 |
153 | from sklearn.preprocessing import LabelEncoder
154 |
155 | attacker_le = LabelEncoder()
156 | attacker_labels = attacker_le.fit_transform(battles_df.attacker_1)
157 | attacker_mappings = {index: label for index, label in enumerate(attacker_le.classes_)}
158 | attacker_mappings
159 |
160 |
161 | # assign labels
162 | battles_df['attacker1_label'] = attacker_labels
163 | battles_df[['name','year','attacker_king','attacker_1','attacker1_label']].head()
164 |
165 |
166 | # ### Transforming Ordinal Features
167 | # Categorical attributes which ***have*** an intrinsic ordering amongst them
168 |
169 | sales_df = pd.DataFrame(data={
170 | 'items_sold':abs(np.random.randn(7)*100),
171 | 'day_of_week':['Monday', 'Tuesday',
172 | 'Wednesday', 'Thursday',
173 | 'Friday', 'Saturday',
174 | 'Sunday']})
175 | sales_df
176 |
177 | day_map = {'Monday': 1, 'Tuesday': 2,
178 | 'Wednesday': 3, 'Thursday': 4,
179 | 'Friday': 5, 'Saturday': 6,
180 | 'Sunday' : 7}
181 |
182 | sales_df['weekday_label'] = sales_df['day_of_week'].map(day_map)
183 | sales_df.head()
184 |
185 |
186 | # ### Encoding Categoricals
187 |
188 | # One Hot Encoder
189 |
190 | from sklearn.preprocessing import OneHotEncoder
191 |
192 | day_le = LabelEncoder()
193 | day_labels = day_le.fit_transform(sales_df['day_of_week'])
194 | sales_df['label_encoder_day_label'] = day_labels
195 |
196 | # encode day labels using one-hot encoding scheme
197 | day_ohe = OneHotEncoder()
198 | day_feature_arr = day_ohe.fit_transform(sales_df[['label_encoder_day_label']]).toarray()
199 | day_feature_labels = list(day_le.classes_)
200 | day_features = pd.DataFrame(day_feature_arr, columns=day_feature_labels)
201 |
202 |
203 | sales_ohe_df = pd.concat([sales_df, day_features], axis=1)
204 | sales_ohe_df
205 |
206 |
207 | # Dummy Encoder
208 |
209 |
210 | day_dummy_features = pd.get_dummies(sales_df['day_of_week'], drop_first=True)
211 | pd.concat([sales_df[['day_of_week','items_sold']], day_dummy_features], axis=1)
212 |
213 |
--------------------------------------------------------------------------------
/notebooks/Ch01 - Machine Learning Fundamentals/feature_engineering_text_data.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # # Feature Engineering
5 | # Textual Data
6 | # Important Imports
7 |
8 | import numpy as np
9 | import pandas as pd
10 |
11 | from sklearn.feature_extraction.text import CountVectorizer
12 | from sklearn.feature_extraction.text import TfidfVectorizer
13 |
14 |
15 | # Prepare a Sample Corpus
16 |
17 | corpus = ['pack my box with five dozen liquor jugs.',
18 | 'pack my box',
19 | 'the quick brown fox jumps over the lazy dog.',
20 | 'the brown fox is quick and the blue dog is lazy',
21 | 'pack my box with five dozen liquor jugs and biscuits',
22 | 'the dog is lazy but the brown fox is quick']
23 |
24 | labels = ['picnic', 'picnic', 'animals', 'animals', 'picnic', 'animals']
25 | corpus = np.array(corpus)
26 | corpus_df = pd.DataFrame({'document': corpus,
27 | 'category': labels})
28 | corpus_df = corpus_df[['document', 'category']]
29 | corpus_df
30 |
31 |
32 | # Bag of Words
33 |
34 | cv = CountVectorizer(min_df=0., max_df=1.)
35 | cv_matrix = cv.fit_transform(corpus_df.document)
36 | cv_matrix = cv_matrix.toarray()
37 | cv_matrix
38 |
39 |
40 | # TF-IDF
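# tf-idf weights each term count by its inverse document frequency; with scikit-learn's
# defaults (smooth_idf=True, norm='l2'), idf(t) = ln((1 + n) / (1 + df(t))) + 1 and each
# row of the resulting matrix is L2-normalized.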
41 |
42 | tv = TfidfVectorizer(min_df=0., max_df=1., use_idf=True)
43 | tv_matrix = tv.fit_transform(corpus_df.document)
44 | tv_matrix = tv_matrix.toarray()
45 |
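# note: scikit-learn 1.2 removed get_feature_names(); on newer versions use
# get_feature_names_out() here and in the n-gram example below.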
46 | vocab = tv.get_feature_names()
47 | pd.DataFrame(np.round(tv_matrix, 2), columns=vocab)
48 |
49 |
50 | # N-Gram Vectorizer
51 |
52 | bv = CountVectorizer(ngram_range=(2,2))
53 | bv_matrix = bv.fit_transform(corpus_df.document)
54 | bv_matrix = bv_matrix.toarray()
55 | vocab = bv.get_feature_names()
56 | pd.DataFrame(bv_matrix, columns=vocab)
57 |
58 |
--------------------------------------------------------------------------------
/notebooks/Ch01 - Machine Learning Fundamentals/game_of_thrones_eda.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # ## Import required packages
5 | import numpy as np
6 | import pandas as pd
7 | from collections import Counter
8 |
9 | # plotting
10 | import seaborn as sns
11 | import matplotlib.pyplot as plt
12 |
13 | # setting params
14 | params = {'legend.fontsize': 'x-large',
15 | 'figure.figsize': (30, 10),
16 | 'axes.labelsize': 'x-large',
17 | 'axes.titlesize':'x-large',
18 | 'xtick.labelsize':'x-large',
19 | 'ytick.labelsize':'x-large'}
20 |
21 | sns.set_style('whitegrid')
22 | sns.set_context('talk')
23 |
24 | plt.rcParams.update(params)
25 |
26 |
27 | # ## Load Dataset
28 | #
29 | # In this step we load the ```battles.csv``` for analysis
30 | # load dataset
31 | battles_df = pd.read_csv('battles.csv')
32 |
33 |
34 | # Display sample rows
35 | print(battles_df.head())
36 |
37 |
38 | # ## Explore raw properties
39 | print("Number of attributes available in the dataset = {}".format(battles_df.shape[1]))
40 |
41 |
42 | # View available columns and their data types
43 | print(battles_df.dtypes)
44 |
45 |
46 | # Analyze properties of numerical columns
47 | battles_df.describe()
48 |
49 | # ## Number of Battles Fought
50 | # This data covers battles up to **season 5** only
51 |
52 | print("Number of battles fought={}".format(battles_df.shape[0]))
53 |
54 |
55 | # ## Battle Distribution Across Years
56 | # The plot below shows that maximum bloodshed happened in the year 299 with
57 | # a total of 20 battles fought!
58 |
59 | sns.countplot(y='year',data=battles_df)
60 | plt.title('Battle Distribution over Years')
61 | plt.show()
62 |
63 |
64 | # ## Which Regions saw most Battles?
65 |
66 | sns.countplot(x='region',data=battles_df)
67 | plt.title('Battles by Regions')
68 | plt.show()
69 |
70 |
71 | # ### Death or Capture of Main Characters by Region
72 |
73 | # No prizes for guessing that the Riverlands have seen some of the main characters
74 | # being killed or captured. Though _The Reach_ has seen 2 battles, none of the
75 | # major characters seems to have fallen there.
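# plotting note: the 'dummy' hue column plus the mirrored hue_order values appear to be a
# trick to offset the two bar series (deaths on ax1, captures on the twinx ax2) so they sit
# side by side instead of overlapping; the duplicate legends are then removed.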
76 | f, ax1 = plt.subplots()
77 | ax2 = ax1.twinx()
78 | temp_df = battles_df.groupby('region').agg({'major_death':'sum',
79 | 'major_capture':'sum'}).reset_index()
80 | temp_df.loc[:,'dummy'] = 'dummy'
81 | sns.barplot(x="region", y="major_death",
82 | hue='dummy', data=temp_df,
83 | estimator = np.sum, ax = ax1,
84 | hue_order=['dummy','other'])
85 |
86 | sns.barplot(x="region", y="major_capture",
87 | data=temp_df, hue='dummy',
88 | estimator = np.sum, ax = ax2,
89 | hue_order=['other','dummy'])
90 |
91 | ax1.legend_.remove()
92 | ax2.legend_.remove()
93 |
94 |
95 | # ## Who Attacked the most?
96 | # The Baratheon boys love attacking as they lead the pack with 38% while
97 | # Robb Stark has been the attacker in a close second with 27.8% of the battles.
98 | attacker_king = battles_df.attacker_king.value_counts()
99 | attacker_king.name='' # turn off annoying y-axis-label
100 | attacker_king.plot.pie(figsize=(6, 6),autopct='%.2f')
101 |
102 |
103 | # ## Who Defended the most?
104 | # Robb Stark and the Baratheon boys are again at the top of the pack. Looks like
105 | # they have been on either side of the war many times.
106 |
107 | defender_king = battles_df.defender_king.value_counts()
108 | defender_king.name='' # turn off annoying y-axis-label
109 | defender_king.plot.pie(figsize=(6, 6),autopct='%.2f')
110 |
111 |
112 | # ## Battle Style Distribution
113 | # Plenty of battles all across, yet the men of Westeros and Essos are men of honor.
114 | # This is visible in the distribution which shows **pitched battle** as the
115 | # most common style of battle.
116 |
117 | sns.countplot(y='battle_type',data=battles_df)
118 | plt.title('Battle Type Distribution')
119 | plt.show()
120 |
121 |
122 | # ## Attack or Defend?
123 | # Defending your place in Westeros isn't easy; this is clearly visible from
124 | # the fact that 32 out of 37 battles were won by attackers.
125 |
126 | sns.countplot(y='attacker_outcome',data=battles_df)
127 | plt.title('Attack Win/Loss Distribution')
128 | plt.show()
129 |
130 |
131 | # ## Winners
132 | # Who remembers losers? (except if you love the Starks)
133 | # The following plot helps us understand who won how many battles and how,
134 | # by attacking or defending.
135 |
136 | attack_winners = battles_df[battles_df.\
137 | attacker_outcome=='win']\
138 | ['attacker_king'].\
139 | value_counts().\
140 | reset_index()
141 |
142 | attack_winners.rename(
143 | columns={'index':'king',
144 | 'attacker_king':'wins'},
145 | inplace=True)
146 |
147 | attack_winners.loc[:,'win_type'] = 'attack'
148 |
149 | defend_winners = battles_df[battles_df.\
150 | attacker_outcome=='loss']\
151 | ['defender_king'].\
152 | value_counts().\
153 | reset_index()
154 | defend_winners.rename(
155 | columns={'index':'king',
156 | 'defender_king':'wins'},
157 | inplace=True)
158 |
159 | defend_winners.loc[:,'win_type'] = 'defend'
160 |
161 |
162 | sns.barplot(x="king",
163 | y="wins",
164 | hue="win_type",
165 | data=pd.concat([attack_winners,
166 | defend_winners]))
167 | plt.title('Kings and Their Wins')
168 | plt.ylabel('wins')
169 | plt.xlabel('king')
170 | plt.show()
171 |
172 |
173 | # ## Battle Commanders
174 | # A battle requires as much brains as muscle power.
175 | # The following is a distribution of the number of commanders involved on attacking and defending sides.
176 |
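# commanders are stored as comma-separated name lists (e.g. "Jaime Lannister, Andros Brax"),
# so the counts below split on ',' rather than on whitespace.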
177 | battles_df['attack_commander_count'] = battles_df.\
178 | dropna(subset=['attacker_commander']).\
179 | apply(lambda row: \
180 | len(row['attacker_commander'].\
181 | split(',')),axis=1)
182 | battles_df['defend_commander_count'] = battles_df.\
183 | dropna(subset=['defender_commander']).\
184 | apply(lambda row: \
185 | len(row['defender_commander'].\
186 | split(',')),axis=1)
187 |
188 | battles_df[['attack_commander_count',
189 | 'defend_commander_count']].plot(kind='box')
190 |
191 |
192 | # ## How many houses fought in a battle?
193 | # Were the battles evenly balanced? The plots tell the whole story.
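# each battle lists up to four attacker and four defender houses (attacker_1..attacker_4,
# defender_1..defender_4); counting the non-null entries gives the houses involved per side.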
194 | battles_df['attacker_house_count'] = (4 - battles_df[['attacker_1',
195 | 'attacker_2',
196 | 'attacker_3',
197 | 'attacker_4']].\
198 | isnull().sum(axis = 1))
199 |
200 | battles_df['defender_house_count'] = (4 - battles_df[['defender_1',
201 | 'defender_2',
202 | 'defender_3',
203 | 'defender_4']].\
204 | isnull().sum(axis = 1))
205 |
206 | battles_df['total_involved_count'] = battles_df.apply(lambda row: \
207 | row['attacker_house_count'] + \
208 | row['defender_house_count'],
209 | axis=1)
210 | battles_df['bubble_text'] = battles_df.apply(lambda row: \
211 | '{} had {} house(s) attacking {} house(s) '.\
212 | format(row['name'],
213 | row['attacker_house_count'],
214 | row['defender_house_count']),
215 | axis=1)
216 |
217 |
218 | # ## Unbalanced Battles
219 | # Most battles so far have seen more houses forming alliances while attacking.
220 | # There are only a few friends when you are under attack!
221 |
222 | house_balance = battles_df[
223 | battles_df.attacker_house_count != \
224 | battles_df.defender_house_count][['name',
225 | 'attacker_house_count',
226 | 'defender_house_count']].\
227 | set_index('name')
228 | house_balance.plot(kind='bar')
229 |
230 |
231 | # ## Battles and The size of Armies
232 | # Attackers don't take any chances; they come in huge numbers, so keep your eyes open.
233 |
234 | army_size_df = battles_df.dropna(subset=['total_involved_count',
235 | 'attacker_size',
236 | 'defender_size',
237 | 'bubble_text'])
238 | army_size_df.plot(kind='scatter', x='defender_size',y='attacker_size',
239 | s=army_size_df['total_involved_count']*150)
240 |
241 |
242 | # ## Archenemies?
243 | # The Stark-Baratheon friendship has taken a complete U-turn with a total of 19 battles and counting. Indeed there is no one to be trusted in this land.
244 |
245 | temp_df = battles_df.dropna(
246 | subset = ["attacker_king",
247 | "defender_king"])[
248 | ["attacker_king",
249 | "defender_king"]
250 | ]
251 |
252 | archenemy_df = pd.DataFrame(
253 | list(Counter(
254 | [tuple(set(king_pair))
255 | for king_pair in temp_df.values
256 | if len(set(king_pair))>1]).
257 | items()),
258 | columns=['king_pair',
259 | 'battle_count'])
260 |
261 | archenemy_df['versus_text'] = archenemy_df.\
262 | apply(
263 | lambda row:
264 | '{} Vs {}'.format(
265 | row[
266 | 'king_pair'
267 | ][0],
268 | row[
269 | 'king_pair'
270 | ][1]),
271 | axis=1)
272 | archenemy_df.sort_values('battle_count',
273 | inplace=True,
274 | ascending=False)
275 |
276 |
277 | archenemy_df[['versus_text',
278 | 'battle_count']].set_index('versus_text',
279 | inplace=True)
280 | sns.barplot(data=archenemy_df,
281 | x='versus_text',
282 | y='battle_count')
283 | plt.xticks(rotation=45)
284 | plt.xlabel('Archenemies')
285 | plt.ylabel('Number of Battles')
286 | plt.title('Archenemies')
287 | plt.show()
--------------------------------------------------------------------------------
/notebooks/Ch05 - Unleash the Power of Transfer Learning/Datasets Builder.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import glob\n",
10 | "import numpy as np\n",
11 | "import os\n",
12 | "import shutil\n",
13 | "from utils import log_progress\n",
14 | "\n",
15 | "np.random.seed(42)"
16 | ]
17 | },
18 | {
19 | "cell_type": "code",
20 | "execution_count": 3,
21 | "metadata": {},
22 | "outputs": [
23 | {
24 | "data": {
25 | "text/plain": [
26 | "(12500, 12500)"
27 | ]
28 | },
29 | "execution_count": 3,
30 | "metadata": {},
31 | "output_type": "execute_result"
32 | }
33 | ],
34 | "source": [
35 | "files = glob.glob('train/*')\n",
36 | "\n",
37 | "cat_files = [fn for fn in files if 'cat' in fn]\n",
38 | "dog_files = [fn for fn in files if 'dog' in fn]\n",
39 | "len(cat_files), len(dog_files)"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 5,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "Cat datasets: (1500,) (500,) (500,)\n",
52 | "Dog datasets: (1500,) (500,) (500,)\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "cat_train = np.random.choice(cat_files, size=1500, replace=False)\n",
58 | "dog_train = np.random.choice(dog_files, size=1500, replace=False)\n",
59 | "cat_files = list(set(cat_files) - set(cat_train))\n",
60 | "dog_files = list(set(dog_files) - set(dog_train))\n",
61 | "\n",
62 | "cat_val = np.random.choice(cat_files, size=500, replace=False)\n",
63 | "dog_val = np.random.choice(dog_files, size=500, replace=False)\n",
64 | "cat_files = list(set(cat_files) - set(cat_val))\n",
65 | "dog_files = list(set(dog_files) - set(dog_val))\n",
66 | "\n",
67 | "cat_test = np.random.choice(cat_files, size=500, replace=False)\n",
68 | "dog_test = np.random.choice(dog_files, size=500, replace=False)\n",
69 | "\n",
70 | "print('Cat datasets:', cat_train.shape, cat_val.shape, cat_test.shape)\n",
71 | "print('Dog datasets:', dog_train.shape, dog_val.shape, dog_test.shape)"
72 | ]
73 | },
74 | {
75 | "cell_type": "code",
76 | "execution_count": 6,
77 | "metadata": {},
78 | "outputs": [],
79 | "source": [
80 | "train_dir = 'training_data'\n",
81 | "val_dir = 'validation_data'\n",
82 | "test_dir = 'test_data'\n",
83 | "\n",
84 | "train_files = np.concatenate([cat_train, dog_train])\n",
85 | "validate_files = np.concatenate([cat_val, dog_val])\n",
86 | "test_files = np.concatenate([cat_test, dog_test])\n",
87 | "\n",
88 | "os.mkdir(train_dir) if not os.path.isdir(train_dir) else None\n",
89 | "os.mkdir(val_dir) if not os.path.isdir(val_dir) else None\n",
90 | "os.mkdir(test_dir) if not os.path.isdir(test_dir) else None\n",
91 | "\n",
92 | "for fn in log_progress(train_files, name='Training Images'):\n",
93 | " shutil.copy(fn, train_dir)\n",
94 | "\n",
95 | "for fn in log_progress(validate_files, name='Validation Images'):\n",
96 | " shutil.copy(fn, val_dir)\n",
97 | " \n",
98 | "for fn in log_progress(test_files, name='Test Images'):\n",
99 | " shutil.copy(fn, test_dir)"
100 | ]
101 | }
102 | ],
103 | "metadata": {
104 | "anaconda-cloud": {},
105 | "kernelspec": {
106 | "display_name": "Python [conda root]",
107 | "language": "python",
108 | "name": "conda-root-py"
109 | },
110 | "language_info": {
111 | "codemirror_mode": {
112 | "name": "ipython",
113 | "version": 3
114 | },
115 | "file_extension": ".py",
116 | "mimetype": "text/x-python",
117 | "name": "python",
118 | "nbconvert_exporter": "python",
119 | "pygments_lexer": "ipython3",
120 | "version": "3.5.2"
121 | },
122 | "widgets": {
123 | "state": {
124 | "707cda043c794c6bbccbf3e3f0264e09": {
125 | "views": [
126 | {
127 | "cell_index": 5
128 | }
129 | ]
130 | },
131 | "76235700d807469ea94675fe7f2b6187": {
132 | "views": [
133 | {
134 | "cell_index": 5
135 | }
136 | ]
137 | },
138 | "7aadcb865964421fb6f976525e21adca": {
139 | "views": [
140 | {
141 | "cell_index": 5
142 | }
143 | ]
144 | }
145 | },
146 | "version": "1.2.0"
147 | }
148 | },
149 | "nbformat": 4,
150 | "nbformat_minor": 2
151 | }
152 |
--------------------------------------------------------------------------------
/notebooks/Ch05 - Unleash the Power of Transfer Learning/model_evaluation_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Jul 31 20:05:23 2017
4 |
5 | @author: DIP
6 | @Copyright: Dipanjan Sarkar
7 | """
8 |
9 | from sklearn import metrics
10 | import numpy as np
11 | import pandas as pd
12 | import matplotlib.pyplot as plt
13 | from sklearn.preprocessing import LabelEncoder
14 | from sklearn.base import clone
15 | from sklearn.preprocessing import label_binarize
16 | from scipy import interp
17 | from sklearn.metrics import roc_curve, auc
18 |
19 |
20 | def get_metrics(true_labels, predicted_labels):
21 |
22 | print('Accuracy:', np.round(
23 | metrics.accuracy_score(true_labels,
24 | predicted_labels),
25 | 4))
26 | print('Precision:', np.round(
27 | metrics.precision_score(true_labels,
28 | predicted_labels,
29 | average='weighted'),
30 | 4))
31 | print('Recall:', np.round(
32 | metrics.recall_score(true_labels,
33 | predicted_labels,
34 | average='weighted'),
35 | 4))
36 | print('F1 Score:', np.round(
37 | metrics.f1_score(true_labels,
38 | predicted_labels,
39 | average='weighted'),
40 | 4))
41 |
42 |
43 | def train_predict_model(classifier,
44 | train_features, train_labels,
45 | test_features, test_labels):
46 | # build model
47 | classifier.fit(train_features, train_labels)
48 | # predict using model
49 | predictions = classifier.predict(test_features)
50 | return predictions
51 |
52 |
53 | def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
54 |
55 | total_classes = len(classes)
56 | level_labels = [total_classes*[0], list(range(total_classes))]
57 |
58 | cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
59 | labels=classes)
60 | cm_frame = pd.DataFrame(data=cm,
61 | columns=pd.MultiIndex(levels=[['Predicted:'], classes],
62 | labels=level_labels),
63 | index=pd.MultiIndex(levels=[['Actual:'], classes],
64 | labels=level_labels))
65 | print(cm_frame)
66 |
67 | def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
68 |
69 | report = metrics.classification_report(y_true=true_labels,
70 | y_pred=predicted_labels,
71 | labels=classes)
72 | print(report)
73 |
74 |
75 |
76 | def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
77 | print('Model Performance metrics:')
78 | print('-'*30)
79 | get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
80 | print('\nModel Classification report:')
81 | print('-'*30)
82 | display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels,
83 | classes=classes)
84 | print('\nPrediction Confusion Matrix:')
85 | print('-'*30)
86 | display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels,
87 | classes=classes)
88 |
89 |
90 | def plot_model_decision_surface(clf, train_features, train_labels,
91 | plot_step=0.02, cmap=plt.cm.RdYlBu,
92 | markers=None, alphas=None, colors=None):
93 |
94 | if train_features.shape[1] != 2:
95 |         raise ValueError("X_train should have exactly 2 columns!")
96 |
97 | x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
98 | y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
99 | xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
100 | np.arange(y_min, y_max, plot_step))
101 |
102 | clf_est = clone(clf)
103 | clf_est.fit(train_features,train_labels)
104 | if hasattr(clf_est, 'predict_proba'):
105 | Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]
106 | else:
107 | Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])
108 | Z = Z.reshape(xx.shape)
109 | cs = plt.contourf(xx, yy, Z, cmap=cmap)
110 |
111 | le = LabelEncoder()
112 | y_enc = le.fit_transform(train_labels)
113 | n_classes = len(le.classes_)
114 | plot_colors = ''.join(colors) if colors else [None] * n_classes
115 | label_names = le.classes_
116 | markers = markers if markers else [None] * n_classes
117 | alphas = alphas if alphas else [None] * n_classes
118 | for i, color in zip(range(n_classes), plot_colors):
119 | idx = np.where(y_enc == i)
120 | plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
121 | label=label_names[i], cmap=cmap, edgecolors='black',
122 | marker=markers[i], alpha=alphas[i])
123 | plt.legend()
124 | plt.show()
125 |
126 |
127 | def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None):
128 |
129 | ## Compute ROC curve and ROC area for each class
130 | fpr = dict()
131 | tpr = dict()
132 | roc_auc = dict()
133 | if hasattr(clf, 'classes_'):
134 | class_labels = clf.classes_
135 | elif label_encoder:
136 | class_labels = label_encoder.classes_
137 | elif class_names:
138 | class_labels = class_names
139 | else:
140 | raise ValueError('Unable to derive prediction classes, please specify class_names!')
141 | n_classes = len(class_labels)
142 | y_test = label_binarize(true_labels, classes=class_labels)
143 | if n_classes == 2:
144 | if hasattr(clf, 'predict_proba'):
145 | prob = clf.predict_proba(features)
146 | y_score = prob[:, prob.shape[1]-1]
147 | elif hasattr(clf, 'decision_function'):
148 | prob = clf.decision_function(features)
149 |             y_score = prob if prob.ndim == 1 else prob[:, prob.shape[1]-1]  # binary decision_function returns a 1-D array
150 | else:
151 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
152 |
153 | fpr, tpr, _ = roc_curve(y_test, y_score)
154 | roc_auc = auc(fpr, tpr)
155 | plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'
156 | ''.format(roc_auc),
157 | linewidth=2.5)
158 |
159 | elif n_classes > 2:
160 | if hasattr(clf, 'predict_proba'):
161 | y_score = clf.predict_proba(features)
162 | elif hasattr(clf, 'decision_function'):
163 | y_score = clf.decision_function(features)
164 | else:
165 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
166 |
167 | for i in range(n_classes):
168 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
169 | roc_auc[i] = auc(fpr[i], tpr[i])
170 |
171 | ## Compute micro-average ROC curve and ROC area
172 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
173 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
174 |
175 | ## Compute macro-average ROC curve and ROC area
176 | # First aggregate all false positive rates
177 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
178 |         # Then interpolate all ROC curves at these points
179 | mean_tpr = np.zeros_like(all_fpr)
180 | for i in range(n_classes):
181 | mean_tpr += interp(all_fpr, fpr[i], tpr[i])
182 | # Finally average it and compute AUC
183 | mean_tpr /= n_classes
184 | fpr["macro"] = all_fpr
185 | tpr["macro"] = mean_tpr
186 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
187 |
188 | ## Plot ROC curves
189 | plt.figure(figsize=(6, 4))
190 | plt.plot(fpr["micro"], tpr["micro"],
191 | label='micro-average ROC curve (area = {0:0.2f})'
192 | ''.format(roc_auc["micro"]), linewidth=3)
193 |
194 | plt.plot(fpr["macro"], tpr["macro"],
195 | label='macro-average ROC curve (area = {0:0.2f})'
196 | ''.format(roc_auc["macro"]), linewidth=3)
197 |
198 | for i, label in enumerate(class_labels):
199 | plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
200 | ''.format(label, roc_auc[i]),
201 | linewidth=2, linestyle=':')
202 | else:
203 |         raise ValueError('Number of classes should be at least 2')
204 |
205 | plt.plot([0, 1], [0, 1], 'k--')
206 | plt.xlim([0.0, 1.0])
207 | plt.ylim([0.0, 1.05])
208 | plt.xlabel('False Positive Rate')
209 | plt.ylabel('True Positive Rate')
210 | plt.title('Receiver Operating Characteristic (ROC) Curve')
211 | plt.legend(loc="lower right")
212 | plt.show()
213 |
214 |
215 |
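A minimal usage sketch for the helpers above, assuming a scikit-learn style estimator; the toy dataset and the LogisticRegression choice are illustrative only and not taken from the book's notebooks:

    from sklearn.datasets import make_classification
    from sklearn.linear_model import LogisticRegression
    from sklearn.model_selection import train_test_split
    import model_evaluation_utils as meu

    # toy binary classification problem
    X, y = make_classification(n_samples=500, n_features=10, random_state=42)
    X_tr, X_te, y_tr, y_te = train_test_split(X, y, test_size=0.3, random_state=42)

    clf = LogisticRegression()
    preds = meu.train_predict_model(clf, X_tr, y_tr, X_te, y_te)

    # prints accuracy/precision/recall/F1, the classification report and the confusion matrix
    meu.display_model_performance_metrics(true_labels=y_te, predicted_labels=preds, classes=[0, 1])
    meu.plot_model_roc_curve(clf, X_te, y_te)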
--------------------------------------------------------------------------------
/notebooks/Ch05 - Unleash the Power of Transfer Learning/utils.py:
--------------------------------------------------------------------------------
1 | def log_progress(sequence, every=None, size=None, name='Items'):
2 | from ipywidgets import IntProgress, HTML, VBox
3 | from IPython.display import display
4 |
5 | is_iterator = False
6 | if size is None:
7 | try:
8 | size = len(sequence)
9 | except TypeError:
10 | is_iterator = True
11 | if size is not None:
12 | if every is None:
13 | if size <= 200:
14 | every = 1
15 | else:
16 | every = int(size / 200) # every 0.5%
17 | else:
18 | assert every is not None, 'sequence is iterator, set every'
19 |
20 | if is_iterator:
21 | progress = IntProgress(min=0, max=1, value=1)
22 | progress.bar_style = 'info'
23 | else:
24 | progress = IntProgress(min=0, max=size, value=0)
25 | label = HTML()
26 | box = VBox(children=[label, progress])
27 | display(box)
28 |
29 | index = 0
30 | try:
31 | for index, record in enumerate(sequence, 1):
32 | if index == 1 or index % every == 0:
33 | if is_iterator:
34 | label.value = '{name}: {index} / ?'.format(
35 | name=name,
36 | index=index
37 | )
38 | else:
39 | progress.value = index
40 | label.value = u'{name}: {index} / {size}'.format(
41 | name=name,
42 | index=index,
43 | size=size
44 | )
45 | yield record
46 | except:
47 | progress.bar_style = 'danger'
48 | raise
49 | else:
50 | progress.bar_style = 'success'
51 | progress.value = index
52 | label.value = "{name}: {index}".format(
53 | name=name,
54 | index=str(index or '?')
55 | )
--------------------------------------------------------------------------------
/notebooks/Ch06 - Image Recognition and Classification/0fc12a365adfcbb603e298b10149632a.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dipanjanS/hands-on-transfer-learning-with-python/f6c5f6b4c2d0b5f14252f1ce9c7d4c74652b8ac6/notebooks/Ch06 - Image Recognition and Classification/0fc12a365adfcbb603e298b10149632a.jpg
--------------------------------------------------------------------------------
/notebooks/Ch06 - Image Recognition and Classification/cnn_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Fri Jun 22 17:23:21 2018
4 |
5 | @author: RAGHAV
6 | """
7 |
8 | import numpy as np
9 | import matplotlib.pyplot as plt
10 | params = {'legend.fontsize': 'x-large',
11 | 'figure.figsize': (15, 5),
12 | 'axes.labelsize': 'x-large',
13 | 'axes.titlesize':'x-large',
14 | 'xtick.labelsize':'x-large',
15 | 'ytick.labelsize':'x-large'}
16 |
17 | plt.rcParams.update(params)
18 |
19 |
20 | def make_prediction(model=None,img_vector=[],
21 | label_dict={},top_N=3,
22 | model_input_shape=None):
23 | if model:
24 | # get model input shape
25 | if not model_input_shape:
26 | model_input_shape = (1,)+model.get_input_shape_at(0)[1:]
27 |
28 | # get prediction
29 | prediction = model.predict(img_vector.reshape(model_input_shape))[0]
30 |
31 |
32 | # get top N with confidence
33 | labels_predicted = [label_dict[idx] for idx in np.argsort(prediction)[::-1][:top_N]]
34 | confidence_predicted = np.sort(prediction)[::-1][:top_N]
35 |
36 | return labels_predicted, confidence_predicted
37 |
38 |
39 | def plot_predictions(model,dataset,
40 | dataset_labels,label_dict,
41 | batch_size,grid_height,grid_width):
42 | if model:
43 | f, ax = plt.subplots(grid_width, grid_height)
44 | f.set_size_inches(12, 12)
45 |
46 | random_batch_indx = np.random.permutation(np.arange(0,len(dataset)))[:batch_size]
47 |
48 | img_idx = 0
49 | for i in range(0, grid_width):
50 | for j in range(0, grid_height):
51 | actual_label = label_dict.get(dataset_labels[random_batch_indx[img_idx]].argmax())
52 | preds,confs_ = make_prediction(model,
53 | img_vector=dataset[random_batch_indx[img_idx]],
54 | label_dict=label_dict,
55 | top_N=1)
56 | ax[i][j].axis('off')
57 | ax[i][j].set_title('Actual:'+actual_label[:10]+\
58 | '\nPredicted:'+preds[0] + \
59 | '(' +str(round(confs_[0],2)) + ')')
60 | ax[i][j].imshow(dataset[random_batch_indx[img_idx]])
61 | img_idx += 1
62 |
63 | plt.subplots_adjust(left=0, bottom=0, right=1,
64 | top=1, wspace=0.4, hspace=0.55)
65 |
66 |
67 | # source: https://github.com/keras-team/keras/issues/431#issuecomment-317397154
68 | def get_activations(model, model_inputs,
69 | print_shape_only=True, layer_name=None):
70 | import keras.backend as K
71 | print('----- activations -----')
72 | activations = []
73 | inp = model.input
74 |
75 | model_multi_inputs_cond = True
76 | if not isinstance(inp, list):
77 | # only one input! let's wrap it in a list.
78 | inp = [inp]
79 | model_multi_inputs_cond = False
80 | # all layer outputs
81 | outputs = [layer.output for layer in model.layers if
82 | layer.name == layer_name or layer_name is None]
83 |
84 | # evaluation functions
85 | funcs = [K.function(inp + [K.learning_phase()], [out]) for out in outputs]
86 |
87 | if model_multi_inputs_cond:
88 | list_inputs = []
89 | list_inputs.extend(model_inputs)
90 |         list_inputs.append(0.)
91 | else:
92 |         list_inputs = [model_inputs, 0.]
93 |
94 |     # Learning phase. 0 = Test mode (no dropout or batch normalization)
95 |     # layer_outputs = [func([model_inputs, 0.])[0] for func in funcs]
96 | layer_outputs = [func(list_inputs)[0] for func in funcs]
97 | for layer_activations in layer_outputs:
98 | activations.append(layer_activations)
99 | if print_shape_only:
100 | print(layer_activations.shape)
101 | else:
102 | print(layer_activations)
103 | return activations
104 |
105 | # source :https://github.com/philipperemy/keras-visualize-activations/blob/master/read_activations.py
106 | def display_activations(activation_maps):
107 | batch_size = activation_maps[0].shape[0]
108 | assert batch_size == 1, 'One image at a time to visualize.'
109 | for i, activation_map in enumerate(activation_maps):
110 | print('Displaying activation map {}'.format(i))
111 | shape = activation_map.shape
112 | if len(shape) == 4:
113 | activations = np.hstack(np.transpose(activation_map[0], (2, 0, 1)))
114 | elif len(shape) == 2:
115 | # try to make it square as much as possible. we can skip some activations.
116 | activations = activation_map[0]
117 | num_activations = len(activations)
118 | # too hard to display it on the screen.
119 | if num_activations > 1024:
120 | square_param = int(np.floor(np.sqrt(num_activations)))
121 | activations = activations[0: square_param * square_param]
122 | activations = np.reshape(activations, (square_param, square_param))
123 | else:
124 | activations = np.expand_dims(activations, axis=0)
125 | else:
126 | raise Exception('len(shape) = 3 has not been implemented.')
127 | #plt.imshow(activations, interpolation='None', cmap='binary')
128 | fig, ax = plt.subplots(figsize=(18, 12))
129 | ax.imshow(activations, interpolation='None', cmap='binary')
130 | plt.show()
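A quick sketch of how the two activation helpers above might be chained, assuming `model` is a trained Keras CNN and `img` is a single preprocessed image with an explicit batch dimension of 1 (both names are placeholders, not defined in this file):

    activations = get_activations(model, img, print_shape_only=True)
    display_activations(activations)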
--------------------------------------------------------------------------------
/notebooks/Ch06 - Image Recognition and Classification/dog_breed_transfer_learning_classifier.py:
--------------------------------------------------------------------------------
1 |
2 | # coding: utf-8
3 |
4 | # # Dog Breed Classifier
5 | #
6 | # This notebook leverages a pretrained InceptionV3 model (on ImageNet) to prepare a _Dog Breed Classifier_.
7 | # It showcases how __Transfer Learning__ can be utilized to prepare high-performing models.
8 |
9 | # In[1]:
10 |
11 | # Pandas and Numpy for data structures and util functions
12 | import re
13 | #import tqdm
14 | import itertools
15 | import numpy as np
16 | import pandas as pd
17 | from numpy.random import rand
18 | from datetime import datetime
19 | pd.options.display.max_colwidth = 600
20 |
21 | # Scikit Imports
22 | from sklearn import preprocessing
23 | from sklearn.metrics import roc_curve, auc, precision_recall_curve
24 | from sklearn.model_selection import train_test_split
25 |
26 | # Matplot Imports
27 | import matplotlib.pyplot as plt
28 | params = {'legend.fontsize': 'x-large',
29 | 'figure.figsize': (15, 5),
30 | 'axes.labelsize': 'x-large',
31 | 'axes.titlesize':'x-large',
32 | 'xtick.labelsize':'x-large',
33 | 'ytick.labelsize':'x-large'}
34 |
35 | plt.rcParams.update(params)
36 | get_ipython().run_line_magic('matplotlib', 'inline')
37 |
38 | # pandas display data frames as tables
39 | from IPython.display import display, HTML
40 |
41 | import warnings
42 | warnings.filterwarnings('ignore')
43 |
44 |
45 | # In[2]:
46 |
47 | import os
48 | import math
49 | import pathlib
50 | import shutil
51 |
52 | from keras import regularizers
53 | from keras.models import Model
54 | from keras.optimizers import Adam
55 | from keras.layers import Dropout
56 | from keras.layers import Conv2D,MaxPooling2D, GlobalAveragePooling2D
57 | from keras.layers import BatchNormalization
58 | from keras.layers import Activation,Dense,Flatten
59 | from keras.models import Sequential,load_model
60 | from keras.preprocessing.image import ImageDataGenerator, img_to_array, load_img
61 | from keras.applications.inception_v3 import InceptionV3
62 | from keras.utils.np_utils import to_categorical
63 |
64 |
65 | # ## Load Dataset
66 |
67 | # In[3]:
68 |
69 | train_folder = 'train/'
70 | test_folder = 'test/'
71 |
72 |
73 | # In[4]:
74 |
75 | data_labels = pd.read_csv('labels/labels.csv')
76 | data_labels.head()
77 |
78 |
79 | # ## Check Number of Classes in the Dataset
80 |
81 | # In[5]:
82 |
83 | target_labels = data_labels['breed']
84 | len(set(target_labels))
85 |
86 |
87 | # ## Prepare Labels
88 | # Deep Learning models work with one-hot encoded outputs or target variables
89 |
90 | # In[6]:
91 |
92 | labels_ohe_names = pd.get_dummies(target_labels, sparse=True)
93 | labels_ohe = np.asarray(labels_ohe_names)
94 | print(labels_ohe.shape)
95 | print(labels_ohe[:2])
96 |
97 |
98 | # We add another column to the labels dataset to identify the image path for each sample
99 |
100 | # In[7]:
101 |
102 | data_labels['image_path'] = data_labels.apply( lambda row: (train_folder + row["id"] + ".jpg" ), axis=1)
103 | data_labels.head()
104 |
105 |
106 | # ## Prepare Train-Test Datasets
107 | # We use a 70-30 split to prepare the two datasets
108 |
109 | # In[8]:
110 |
111 | train_data = np.array([img_to_array(
112 | load_img(img,
113 | target_size=(299, 299))
114 | ) for img
115 | in data_labels['image_path'].values.tolist()
116 | ]).astype('float32')
117 |
118 |
119 | # In[9]:
120 |
121 | train_data.shape
122 |
123 |
124 | # In[10]:
125 |
126 | x_train, x_test, y_train, y_test = train_test_split(train_data,
127 | target_labels,
128 | test_size=0.3,
129 | stratify=np.array(target_labels),
130 | random_state=42)
131 |
132 |
133 | # In[11]:
134 |
135 | x_train.shape, x_test.shape
136 |
137 |
138 | # Prepare Validation Dataset
139 |
140 | # In[12]:
141 |
142 | x_train, x_val, y_train, y_val = train_test_split(x_train,
143 | y_train,
144 | test_size=0.15,
145 | stratify=np.array(y_train),
146 | random_state=42)
147 |
148 |
149 | # In[13]:
150 |
151 | x_train.shape, x_val.shape
152 |
153 |
154 | # Prepare target variables for train, test and validation datasets
155 |
156 | # In[14]:
157 |
158 | y_train_ohe = pd.get_dummies(y_train.reset_index(drop=True)).as_matrix()
159 | y_val_ohe = pd.get_dummies(y_val.reset_index(drop=True)).as_matrix()
160 | y_test_ohe = pd.get_dummies(y_test.reset_index(drop=True)).as_matrix()
161 |
162 | y_train_ohe.shape, y_test_ohe.shape, y_val_ohe.shape
163 |
164 |
165 | # ## Data Augmentation
166 | #
167 | # Since the number of samples per class is not very high, we utilize data augmentation to prepare different variations of the available samples. We do this using the ```ImageDataGenerator``` utility from ```keras```
168 |
169 | # In[15]:
170 |
171 | BATCH_SIZE = 32
172 |
173 |
174 | # In[16]:
175 |
176 | # Create train generator.
177 | train_datagen = ImageDataGenerator(rescale=1./255,
178 | rotation_range=30,
179 | width_shift_range=0.2,
180 | height_shift_range=0.2,
181 |                                    horizontal_flip=True)
182 | train_generator = train_datagen.flow(x_train, y_train_ohe, shuffle=False, batch_size=BATCH_SIZE, seed=1)
183 |
184 |
185 | # In[17]:
186 |
187 | # Create validation generator
188 | val_datagen = ImageDataGenerator(rescale = 1./255)
189 | val_generator = val_datagen.flow(x_val, y_val_ohe, shuffle=False, batch_size=BATCH_SIZE, seed=1)
190 |
191 |
192 | # ## Prepare Deep Learning Classifier
193 | #
194 | # * Load InceptionV3 pretrained on ImageNet without its top/classification layer
195 | # * Add additional custom layers on top of InceptionV3 to prepare custom classifier
196 |
197 | # In[18]:
198 |
199 | # Get the InceptionV3 model so we can do transfer learning
200 | base_inception = InceptionV3(weights='imagenet', include_top = False, input_shape=(299, 299, 3))
201 |
202 |
203 | # In[19]:
204 |
205 | # Add a global spatial average pooling layer
206 | out = base_inception.output
207 | out = GlobalAveragePooling2D()(out)
208 | out = Dense(512, activation='relu')(out)
209 | out = Dense(512, activation='relu')(out)
210 | total_classes = y_train_ohe.shape[1]
211 | predictions = Dense(total_classes, activation='softmax')(out)
212 |
213 |
214 | # * Stack the two models (InceptionV3 and custom layers) on top of each other
215 | # * Compile the model and view its summary
216 |
217 | # In[20]:
218 |
219 | model = Model(inputs=base_inception.input, outputs=predictions)
220 |
221 | # only if we want to freeze layers
222 | for layer in base_inception.layers:
223 | layer.trainable = False
224 |
225 | # Compile
226 | model.compile(Adam(lr=.0001), loss='categorical_crossentropy', metrics=['accuracy'])
227 |
228 | model.summary()
229 |
230 |
231 | # ## Model Training
232 | # We train the model with a Batch Size of 32 for just 15 Epochs.
233 | #
234 | # The model utilizes the power of transfer learning to achieve a validation accuracy of about __81%__!
235 |
236 | # In[21]:
237 |
238 | # Train the model
239 | batch_size = BATCH_SIZE
240 | train_steps_per_epoch = x_train.shape[0] // batch_size
241 | val_steps_per_epoch = x_val.shape[0] // batch_size
242 |
243 | history = model.fit_generator(train_generator,
244 | steps_per_epoch=train_steps_per_epoch,
245 | validation_data=val_generator,
246 | validation_steps=val_steps_per_epoch,
247 | epochs=15,
248 | verbose=1)
249 |
250 |
251 | # Save the Model
252 |
253 | # In[22]:
254 |
255 | model.save('dog_breed.hdf5')
256 |
257 |
258 | # ## Visualize Model Performance
259 |
260 | # In[35]:
261 |
262 | f, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
263 | t = f.suptitle('Deep Neural Net Performance', fontsize=12)
264 | f.subplots_adjust(top=0.85, wspace=0.3)
265 |
266 | epochs = list(range(1,16))
267 | ax1.plot(epochs, history.history['acc'], label='Train Accuracy')
268 | ax1.plot(epochs, history.history['val_acc'], label='Validation Accuracy')
269 | ax1.set_xticks(epochs)
270 | ax1.set_ylabel('Accuracy Value')
271 | ax1.set_xlabel('Epoch')
272 | ax1.set_title('Accuracy')
273 | l1 = ax1.legend(loc="best")
274 |
275 | ax2.plot(epochs, history.history['loss'], label='Train Loss')
276 | ax2.plot(epochs, history.history['val_loss'], label='Validation Loss')
277 | ax2.set_xticks(epochs)
278 | ax2.set_ylabel('Loss Value')
279 | ax2.set_xlabel('Epoch')
280 | ax2.set_title('Loss')
281 | l2 = ax2.legend(loc="best")
282 |
283 |
284 | # ## Test Model Performance
285 | #
286 | # Step 1 is to prepare the test dataset. Since we scaled the training data, the test data should be scaled in the same manner.
287 | #
288 | # _Note: Deep Learning models are very sensitive to scaling._
289 |
290 | # In[67]:
291 |
292 | # scaling test features
293 | x_test /= 255.
294 |
295 |
296 | # In[69]:
297 |
298 | test_predictions = model.predict(x_test)
299 | test_predictions
300 |
301 |
302 | # In[70]:
303 |
304 | predictions = pd.DataFrame(test_predictions, columns=labels_ohe_names.columns)
305 | predictions.head()
306 |
307 |
308 | # In[71]:
309 |
310 | test_labels = list(y_test)
311 | predictions = list(predictions.idxmax(axis=1))
312 | predictions[:10]
313 |
314 |
315 | # ## Analyze Test Performance
316 |
317 | # In[72]:
318 |
319 | import model_evaluation_utils as meu
320 |
321 |
322 | # In[73]:
323 |
324 | meu.get_metrics(true_labels=test_labels,
325 | predicted_labels=predictions)
326 |
327 |
328 | # In[74]:
329 |
330 | meu.display_classification_report(true_labels=test_labels,
331 | predicted_labels=predictions,
332 | classes=list(labels_ohe_names.columns))
333 |
334 |
335 | # In[75]:
336 |
337 | meu.display_confusion_matrix(true_labels=test_labels,
338 |                              predicted_labels=predictions,
339 |                              classes=list(labels_ohe_names.columns))
340 |
341 |
342 | # The model achieves a test accuracy of approximately __86%__
343 |
344 | # ## Visualize Model Performance
345 | # Visualize model performance with actual images, labels and prediction confidence
346 |
347 | # In[112]:
348 |
349 | grid_width = 5
350 | grid_height = 5
351 | f, ax = plt.subplots(grid_width, grid_height)
352 | f.set_size_inches(15, 15)
353 | batch_size = 25
354 | dataset = x_test
355 |
356 | label_dict = dict(enumerate(labels_ohe_names.columns.values))
357 | model_input_shape = (1,)+model.get_input_shape_at(0)[1:]
358 | random_batch_indx = np.random.permutation(np.arange(0,len(dataset)))[:batch_size]
359 |
360 | img_idx = 0
361 | for i in range(0, grid_width):
362 | for j in range(0, grid_height):
363 | actual_label = np.array(y_test)[random_batch_indx[img_idx]]
364 | prediction = model.predict(dataset[random_batch_indx[img_idx]].reshape(model_input_shape))[0]
365 | label_idx = np.argmax(prediction)
366 | predicted_label = label_dict.get(label_idx)
367 | conf = round(prediction[label_idx], 2)
368 | ax[i][j].axis('off')
369 | ax[i][j].set_title('Actual: '+actual_label+'\nPred: '+predicted_label + '\nConf: ' +str(conf))
370 | ax[i][j].imshow(dataset[random_batch_indx[img_idx]])
371 | img_idx += 1
372 |
373 | plt.subplots_adjust(left=0, bottom=0, right=1, top=1, wspace=0.5, hspace=0.55)
374 |
375 |
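A minimal inference sketch (illustrative, not part of the original script): reload the saved model and score a single image, reusing the `label_dict` built in the visualization cell above. The image path is a placeholder, and the 1/255 rescaling mirrors the training generator:

    from keras.models import load_model
    from keras.preprocessing.image import load_img, img_to_array

    clf = load_model('dog_breed.hdf5')
    img = img_to_array(load_img('some_dog.jpg', target_size=(299, 299))) / 255.
    probs = clf.predict(img.reshape((1, 299, 299, 3)))[0]
    print(label_dict[probs.argmax()], probs.max())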
--------------------------------------------------------------------------------
/notebooks/Ch06 - Image Recognition and Classification/model_evaluation_utils.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Jul 31 20:05:23 2017
4 |
5 | @author: DIP
6 | @Copyright: Dipanjan Sarkar
7 | """
8 |
9 | from sklearn import metrics
10 | import numpy as np
11 | import pandas as pd
12 | import matplotlib.pyplot as plt
13 | from sklearn.preprocessing import LabelEncoder
14 | from sklearn.base import clone
15 | from sklearn.preprocessing import label_binarize
16 | from scipy import interp
17 | from sklearn.metrics import roc_curve, auc
18 |
19 |
20 | def get_metrics(true_labels, predicted_labels):
21 |
22 | print('Accuracy:', np.round(
23 | metrics.accuracy_score(true_labels,
24 | predicted_labels),
25 | 4))
26 | print('Precision:', np.round(
27 | metrics.precision_score(true_labels,
28 | predicted_labels,
29 | average='weighted'),
30 | 4))
31 | print('Recall:', np.round(
32 | metrics.recall_score(true_labels,
33 | predicted_labels,
34 | average='weighted'),
35 | 4))
36 | print('F1 Score:', np.round(
37 | metrics.f1_score(true_labels,
38 | predicted_labels,
39 | average='weighted'),
40 | 4))
41 |
42 |
43 | def train_predict_model(classifier,
44 | train_features, train_labels,
45 | test_features, test_labels):
46 | # build model
47 | classifier.fit(train_features, train_labels)
48 | # predict using model
49 | predictions = classifier.predict(test_features)
50 | return predictions
51 |
52 |
53 | def display_confusion_matrix(true_labels, predicted_labels, classes=[1,0]):
54 |
55 | total_classes = len(classes)
56 | level_labels = [total_classes*[0], list(range(total_classes))]
57 |
58 | cm = metrics.confusion_matrix(y_true=true_labels, y_pred=predicted_labels,
59 | labels=classes)
60 | cm_frame = pd.DataFrame(data=cm,
61 | columns=pd.MultiIndex(levels=[['Predicted:'], classes],
62 | labels=level_labels),
63 | index=pd.MultiIndex(levels=[['Actual:'], classes],
64 | labels=level_labels))
65 | print(cm_frame)
66 |
67 | def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
68 |
69 | report = metrics.classification_report(y_true=true_labels,
70 | y_pred=predicted_labels,
71 | labels=classes)
72 | print(report)
73 |
74 |
75 |
76 | def display_model_performance_metrics(true_labels, predicted_labels, classes=[1,0]):
77 | print('Model Performance metrics:')
78 | print('-'*30)
79 | get_metrics(true_labels=true_labels, predicted_labels=predicted_labels)
80 | print('\nModel Classification report:')
81 | print('-'*30)
82 | display_classification_report(true_labels=true_labels, predicted_labels=predicted_labels,
83 | classes=classes)
84 | print('\nPrediction Confusion Matrix:')
85 | print('-'*30)
86 | display_confusion_matrix(true_labels=true_labels, predicted_labels=predicted_labels,
87 | classes=classes)
88 |
89 |
90 | def plot_model_decision_surface(clf, train_features, train_labels,
91 | plot_step=0.02, cmap=plt.cm.RdYlBu,
92 | markers=None, alphas=None, colors=None):
93 |
94 | if train_features.shape[1] != 2:
95 |         raise ValueError("X_train should have exactly 2 columns!")
96 |
97 | x_min, x_max = train_features[:, 0].min() - plot_step, train_features[:, 0].max() + plot_step
98 | y_min, y_max = train_features[:, 1].min() - plot_step, train_features[:, 1].max() + plot_step
99 | xx, yy = np.meshgrid(np.arange(x_min, x_max, plot_step),
100 | np.arange(y_min, y_max, plot_step))
101 |
102 | clf_est = clone(clf)
103 | clf_est.fit(train_features,train_labels)
104 | if hasattr(clf_est, 'predict_proba'):
105 | Z = clf_est.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:,1]
106 | else:
107 | Z = clf_est.predict(np.c_[xx.ravel(), yy.ravel()])
108 | Z = Z.reshape(xx.shape)
109 | cs = plt.contourf(xx, yy, Z, cmap=cmap)
110 |
111 | le = LabelEncoder()
112 | y_enc = le.fit_transform(train_labels)
113 | n_classes = len(le.classes_)
114 | plot_colors = ''.join(colors) if colors else [None] * n_classes
115 | label_names = le.classes_
116 | markers = markers if markers else [None] * n_classes
117 | alphas = alphas if alphas else [None] * n_classes
118 | for i, color in zip(range(n_classes), plot_colors):
119 | idx = np.where(y_enc == i)
120 | plt.scatter(train_features[idx, 0], train_features[idx, 1], c=color,
121 | label=label_names[i], cmap=cmap, edgecolors='black',
122 | marker=markers[i], alpha=alphas[i])
123 | plt.legend()
124 | plt.show()
125 |
126 |
127 | def plot_model_roc_curve(clf, features, true_labels, label_encoder=None, class_names=None):
128 |
129 | ## Compute ROC curve and ROC area for each class
130 | fpr = dict()
131 | tpr = dict()
132 | roc_auc = dict()
133 | if hasattr(clf, 'classes_'):
134 | class_labels = clf.classes_
135 | elif label_encoder:
136 | class_labels = label_encoder.classes_
137 | elif class_names:
138 | class_labels = class_names
139 | else:
140 | raise ValueError('Unable to derive prediction classes, please specify class_names!')
141 | n_classes = len(class_labels)
142 | y_test = label_binarize(true_labels, classes=class_labels)
143 | if n_classes == 2:
144 | if hasattr(clf, 'predict_proba'):
145 | prob = clf.predict_proba(features)
146 | y_score = prob[:, prob.shape[1]-1]
147 | elif hasattr(clf, 'decision_function'):
148 | prob = clf.decision_function(features)
149 |             y_score = prob if prob.ndim == 1 else prob[:, prob.shape[1]-1]  # binary decision_function returns a 1-D array
150 | else:
151 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
152 |
153 | fpr, tpr, _ = roc_curve(y_test, y_score)
154 | roc_auc = auc(fpr, tpr)
155 | plt.plot(fpr, tpr, label='ROC curve (area = {0:0.2f})'
156 | ''.format(roc_auc),
157 | linewidth=2.5)
158 |
159 | elif n_classes > 2:
160 | if hasattr(clf, 'predict_proba'):
161 | y_score = clf.predict_proba(features)
162 | elif hasattr(clf, 'decision_function'):
163 | y_score = clf.decision_function(features)
164 | else:
165 | raise AttributeError("Estimator doesn't have a probability or confidence scoring system!")
166 |
167 | for i in range(n_classes):
168 | fpr[i], tpr[i], _ = roc_curve(y_test[:, i], y_score[:, i])
169 | roc_auc[i] = auc(fpr[i], tpr[i])
170 |
171 | ## Compute micro-average ROC curve and ROC area
172 | fpr["micro"], tpr["micro"], _ = roc_curve(y_test.ravel(), y_score.ravel())
173 | roc_auc["micro"] = auc(fpr["micro"], tpr["micro"])
174 |
175 | ## Compute macro-average ROC curve and ROC area
176 | # First aggregate all false positive rates
177 | all_fpr = np.unique(np.concatenate([fpr[i] for i in range(n_classes)]))
178 |         # Then interpolate all ROC curves at these points
179 | mean_tpr = np.zeros_like(all_fpr)
180 | for i in range(n_classes):
181 | mean_tpr += interp(all_fpr, fpr[i], tpr[i])
182 | # Finally average it and compute AUC
183 | mean_tpr /= n_classes
184 | fpr["macro"] = all_fpr
185 | tpr["macro"] = mean_tpr
186 | roc_auc["macro"] = auc(fpr["macro"], tpr["macro"])
187 |
188 | ## Plot ROC curves
189 | plt.figure(figsize=(6, 4))
190 | plt.plot(fpr["micro"], tpr["micro"],
191 | label='micro-average ROC curve (area = {0:0.2f})'
192 | ''.format(roc_auc["micro"]), linewidth=3)
193 |
194 | plt.plot(fpr["macro"], tpr["macro"],
195 | label='macro-average ROC curve (area = {0:0.2f})'
196 | ''.format(roc_auc["macro"]), linewidth=3)
197 |
198 | for i, label in enumerate(class_labels):
199 | plt.plot(fpr[i], tpr[i], label='ROC curve of class {0} (area = {1:0.2f})'
200 | ''.format(label, roc_auc[i]),
201 | linewidth=2, linestyle=':')
202 | else:
203 |         raise ValueError('Number of classes should be at least 2')
204 |
205 | plt.plot([0, 1], [0, 1], 'k--')
206 | plt.xlim([0.0, 1.0])
207 | plt.ylim([0.0, 1.05])
208 | plt.xlabel('False Positive Rate')
209 | plt.ylabel('True Positive Rate')
210 | plt.title('Receiver Operating Characteristic (ROC) Curve')
211 | plt.legend(loc="lower right")
212 | plt.show()
213 |
214 |
215 |
--------------------------------------------------------------------------------
/notebooks/Ch07 - Text Document Categorization/20_newsgrp_cnn_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import config\n",
12 | "from dataloader.loader import Loader\n",
13 | "from preprocessing.utils import Preprocess, remove_empty_docs\n",
14 | "from dataloader.embeddings import GloVe\n",
15 | "from model.cnn_document_model import DocumentModel, TrainingParameters\n",
16 | "from keras.callbacks import ModelCheckpoint, EarlyStopping\n",
17 | "import numpy as np\n",
18 | "from keras.utils import to_categorical\n",
19 | "import keras.backend as K\n",
20 | "\n",
21 | "\n",
22 | "from sklearn.manifold import TSNE\n"
23 | ]
24 | },
25 | {
26 | "cell_type": "markdown",
27 | "metadata": {},
28 | "source": [
29 | "## Load Data Sets for 20 News Group"
30 | ]
31 | },
32 | {
33 | "cell_type": "code",
34 | "execution_count": null,
35 | "metadata": {
36 | "collapsed": true
37 | },
38 | "outputs": [],
39 | "source": [
40 | "dataset = Loader.load_20newsgroup_data(subset='train')\n",
41 | "corpus, labels = dataset.data, dataset.target\n",
42 | "corpus, labels = remove_empty_docs(corpus, labels)\n",
43 | "\n",
44 | "\n",
45 | "test_dataset = Loader.load_20newsgroup_data(subset='test')\n",
46 | "test_corpus, test_labels = test_dataset.data, test_dataset.target\n",
47 | "test_corpus, test_labels = remove_empty_docs(test_corpus, test_labels)"
48 | ]
49 | },
50 | {
51 | "cell_type": "markdown",
52 | "metadata": {},
53 | "source": [
54 |     "## Mapping 20 Groups to 6 High-level Categories"
55 | ]
56 | },
57 | {
58 | "cell_type": "code",
59 | "execution_count": null,
60 | "metadata": {
61 | "collapsed": true
62 | },
63 | "outputs": [],
64 | "source": [
65 | "six_groups = {\n",
66 | " 'comp.graphics':0,'comp.os.ms-windows.misc':0,'comp.sys.ibm.pc.hardware':0,\n",
67 | " 'comp.sys.mac.hardware':0, 'comp.windows.x':0,\n",
68 | " \n",
69 | " 'rec.autos':1, 'rec.motorcycles':1, 'rec.sport.baseball':1, 'rec.sport.hockey':1,\n",
70 | " \n",
71 | " 'sci.crypt':2, 'sci.electronics':2,'sci.med':2, 'sci.space':2,\n",
72 | " \n",
73 | " 'misc.forsale':3,\n",
74 | " \n",
75 | " 'talk.politics.misc':4, 'talk.politics.guns':4, 'talk.politics.mideast':4,\n",
76 | " \n",
77 | " 'talk.religion.misc':5, 'alt.atheism':5, 'soc.religion.christian':5\n",
78 | " \n",
79 | "}"
80 | ]
81 | },
82 | {
83 | "cell_type": "code",
84 | "execution_count": null,
85 | "metadata": {
86 | "collapsed": true
87 | },
88 | "outputs": [],
89 | "source": [
90 | "map_20_2_6 = [six_groups[dataset.target_names[i]] for i in range(20)]\n",
91 | "labels = [six_groups[dataset.target_names[i]] for i in labels] \n",
92 | "test_labels = [six_groups[dataset.target_names[i]] for i in test_labels] "
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## Pre-process Text to convert it to word index sequences"
100 | ]
101 | },
102 | {
103 | "cell_type": "code",
104 | "execution_count": null,
105 | "metadata": {
106 | "collapsed": false
107 | },
108 | "outputs": [],
109 | "source": [
110 | "Preprocess.MIN_WD_COUNT=5\n",
111 | "preprocessor = Preprocess(corpus=corpus)\n",
112 | "corpus_to_seq = preprocessor.fit()"
113 | ]
114 | },
115 | {
116 | "cell_type": "code",
117 | "execution_count": null,
118 | "metadata": {
119 | "collapsed": false
120 | },
121 | "outputs": [],
122 | "source": [
123 | "test_corpus_to_seq = preprocessor.transform(test_corpus)"
124 | ]
125 | },
126 | {
127 | "cell_type": "markdown",
128 | "metadata": {},
129 | "source": [
130 | "## Initialize Embeddings"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": null,
136 | "metadata": {
137 | "collapsed": false
138 | },
139 | "outputs": [],
140 | "source": [
141 | "glove=GloVe(50)\n",
142 | "initial_embeddings = glove.get_embedding(preprocessor.word_index)"
143 | ]
144 | },
145 | {
146 | "cell_type": "markdown",
147 | "metadata": {},
148 | "source": [
149 | "## Build Model"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {
156 | "collapsed": false
157 | },
158 | "outputs": [],
159 | "source": [
160 | "newsgrp_model = DocumentModel(vocab_size=preprocessor.get_vocab_size(),\n",
161 | " sent_k_maxpool = 5,\n",
162 | " sent_filters = 20,\n",
163 | " word_kernel_size = 5,\n",
164 | " word_index = preprocessor.word_index,\n",
165 | " num_sentences=Preprocess.NUM_SENTENCES, \n",
166 | " embedding_weights=initial_embeddings,\n",
167 | " conv_activation = 'relu',\n",
168 | " train_embedding = True,\n",
169 | " learn_word_conv = True,\n",
170 | " learn_sent_conv = True,\n",
171 | " sent_dropout = 0.4,\n",
172 | " hidden_dims=64, \n",
173 | " input_dropout=0.2, \n",
174 | " hidden_gaussian_noise_sd=0.5,\n",
175 | " final_layer_kernel_regularizer=0.1,\n",
176 | " num_hidden_layers=2,\n",
177 | " num_units_final_layer=6)\n"
178 | ]
179 | },
180 | {
181 | "cell_type": "markdown",
182 | "metadata": {},
183 | "source": [
184 | "## Save model parameters"
185 | ]
186 | },
187 | {
188 | "cell_type": "code",
189 | "execution_count": null,
190 | "metadata": {
191 | "collapsed": true
192 | },
193 | "outputs": [],
194 | "source": [
195 | "train_params = TrainingParameters('6_newsgrp_largeclass', \n",
196 | " model_file_path = config.MODEL_DIR+ '/20newsgroup/model_6_01.hdf5',\n",
197 | " model_hyper_parameters = config.MODEL_DIR+ '/20newsgroup/model_6_01.json',\n",
198 | " model_train_parameters = config.MODEL_DIR+ '/20newsgroup/model_6_01_meta.json',\n",
199 | " num_epochs=20,\n",
200 | " batch_size = 128,\n",
201 | " validation_split=.10,\n",
202 | " learning_rate=0.01)\n",
203 | "\n",
204 | "train_params.save()\n",
205 | "newsgrp_model._save_model(train_params.model_hyper_parameters)"
206 | ]
207 | },
208 | {
209 | "cell_type": "markdown",
210 | "metadata": {},
211 | "source": [
212 | "## Compile and run model"
213 | ]
214 | },
215 | {
216 | "cell_type": "code",
217 | "execution_count": null,
218 | "metadata": {
219 | "collapsed": true
220 | },
221 | "outputs": [],
222 | "source": [
223 | "newsgrp_model._model.compile(loss=\"categorical_crossentropy\", \n",
224 | " optimizer=train_params.optimizer,\n",
225 | " metrics=[\"accuracy\"])\n",
226 | "checkpointer = ModelCheckpoint(filepath=train_params.model_file_path,\n",
227 | " verbose=1,\n",
228 | " save_best_only=True,\n",
229 | " save_weights_only=True)\n",
230 | "\n",
231 | "early_stop = EarlyStopping(patience=2)"
232 | ]
233 | },
234 | {
235 | "cell_type": "code",
236 | "execution_count": null,
237 | "metadata": {
238 | "collapsed": true
239 | },
240 | "outputs": [],
241 | "source": [
242 | "\n",
243 | "x_train = np.array(corpus_to_seq)\n",
244 | "y_train = to_categorical(np.array(labels))\n",
245 | "\n",
246 | "x_test = np.array(test_corpus_to_seq)\n",
247 | "y_test = to_categorical(np.array(test_labels))\n"
248 | ]
249 | },
250 | {
251 | "cell_type": "code",
252 | "execution_count": null,
253 | "metadata": {
254 | "collapsed": false
255 | },
256 | "outputs": [],
257 | "source": [
258 | "#Set LR\n",
259 | "K.set_value(newsgrp_model.get_classification_model().optimizer.lr, train_params.learning_rate)\n",
260 | "\n",
261 | "newsgrp_model.get_classification_model().fit(x_train, y_train, \n",
262 | " batch_size=train_params.batch_size, \n",
263 | " epochs=train_params.num_epochs,\n",
264 | " verbose=2,\n",
265 | " validation_split=train_params.validation_split,\n",
266 | " callbacks=[checkpointer,early_stop])\n",
267 | "\n",
268 | "newsgrp_model.get_classification_model().evaluate( x_test, y_test, verbose=2)\n",
269 | "preds = newsgrp_model.get_classification_model().predict(x_test)\n",
270 | "preds_test = np.argmax(preds, axis=1)\n"
271 | ]
272 | },
273 | {
274 | "cell_type": "markdown",
275 | "metadata": {},
276 | "source": [
277 | "## Evaluate Model Accuracy"
278 | ]
279 | },
280 | {
281 | "cell_type": "code",
282 | "execution_count": null,
283 | "metadata": {
284 | "collapsed": false
285 | },
286 | "outputs": [],
287 | "source": [
288 | "from sklearn.metrics import classification_report,accuracy_score,confusion_matrix\n",
289 | "print(classification_report(test_labels, preds_test))\n",
290 | "print(confusion_matrix(test_labels, preds_test))\n",
291 | "print(accuracy_score(test_labels, preds_test))"
292 | ]
293 | },
294 | {
295 | "cell_type": "markdown",
296 | "metadata": {},
297 | "source": [
298 |     "## Visualization: Document Embeddings with t-SNE - what the model learned"
299 | ]
300 | },
301 | {
302 | "cell_type": "code",
303 | "execution_count": null,
304 | "metadata": {
305 | "collapsed": false
306 | },
307 | "outputs": [],
308 | "source": [
309 | "from utils import scatter_plot\n",
310 | "doc_embeddings = newsgrp_model.get_document_model().predict(x_test)\n",
311 | "print(doc_embeddings.shape)"
312 | ]
313 | },
314 | {
315 | "cell_type": "code",
316 | "execution_count": null,
317 | "metadata": {
318 | "collapsed": false
319 | },
320 | "outputs": [],
321 | "source": [
322 |     "doc_proj = TSNE(n_components=2, random_state=42).fit_transform(doc_embeddings)"
323 | ]
324 | },
325 | {
326 | "cell_type": "code",
327 | "execution_count": null,
328 | "metadata": {
329 | "collapsed": false
330 | },
331 | "outputs": [],
332 | "source": [
333 | "f, ax, sc, txts = scatter_plot(doc_proj, np.array(test_labels))"
334 | ]
335 | },
336 | {
337 | "cell_type": "code",
338 | "execution_count": null,
339 | "metadata": {
340 | "collapsed": true
341 | },
342 | "outputs": [],
343 | "source": [
344 | "f.savefig('nws_grp_embd.png')"
345 | ]
346 | },
347 | {
348 | "cell_type": "code",
349 | "execution_count": null,
350 | "metadata": {
351 | "collapsed": true
352 | },
353 | "outputs": [],
354 | "source": []
355 | }
356 | ],
357 | "metadata": {
358 | "kernelspec": {
359 | "display_name": "Python 3",
360 | "language": "python",
361 | "name": "python3"
362 | },
363 | "language_info": {
364 | "codemirror_mode": {
365 | "name": "ipython",
366 | "version": 3
367 | },
368 | "file_extension": ".py",
369 | "mimetype": "text/x-python",
370 | "name": "python",
371 | "nbconvert_exporter": "python",
372 | "pygments_lexer": "ipython3",
373 | "version": "3.6.0"
374 | }
375 | },
376 | "nbformat": 4,
377 | "nbformat_minor": 2
378 | }
379 |
--------------------------------------------------------------------------------
/notebooks/Ch07 - Text Document Categorization/Text_Summarization_IMDB.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": null,
6 | "metadata": {
7 | "collapsed": false
8 | },
9 | "outputs": [],
10 | "source": [
11 | "import config\n",
12 | "from model.cnn_document_model import DocumentModel\n",
13 | "from preprocessing.utils import Preprocess, remove_empty_docs\n",
14 | "import numpy as np\n",
15 | "import pandas as pd\n",
16 |     "from nltk.tokenize import sent_tokenize\n",
17 |     "from dataloader.loader import Loader\n",
18 | "import keras.backend as K"
19 | ]
20 | },
21 | {
22 | "cell_type": "markdown",
23 | "metadata": {},
24 | "source": [
25 | "## Load pre-trained IMDB model and data"
26 | ]
27 | },
28 | {
29 | "cell_type": "code",
30 | "execution_count": null,
31 | "metadata": {
32 | "collapsed": false
33 | },
34 | "outputs": [],
35 | "source": [
36 | "imdb_model = DocumentModel.load_model(config.MODEL_DIR+ '/imdb/model_02.json')\n",
37 | "imdb_model.load_model_weights(config.MODEL_DIR+ '/imdb/model_02.hdf5')\n",
38 | "\n",
39 | "model = imdb_model.get_classification_model()\n",
40 | "model.compile(loss=\"binary_crossentropy\", optimizer='rmsprop', metrics=[\"accuracy\"])"
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": null,
46 | "metadata": {
47 | "collapsed": true
48 | },
49 | "outputs": [],
50 | "source": [
51 | "train_df = Loader.load_imdb_data(directory = 'train')\n",
52 | "print(train_df.shape)\n",
53 | "\n",
54 | "corpus = train_df['review'].tolist()\n",
55 | "target = train_df['sentiment'].tolist()\n",
56 | "corpus, target = remove_empty_docs(corpus, target)\n",
57 | "print(len(corpus))\n"
58 | ]
59 | },
60 | {
61 | "cell_type": "markdown",
62 | "metadata": {},
63 | "source": [
64 |     "## Pre-process input and compute document embeddings"
65 | ]
66 | },
67 | {
68 | "cell_type": "code",
69 | "execution_count": null,
70 | "metadata": {
71 | "collapsed": false
72 | },
73 | "outputs": [],
74 | "source": [
75 | "preprocessor = Preprocess(corpus=corpus)\n",
76 | "corpus_to_seq = preprocessor.fit()\n",
77 | "\n",
78 | "corpus = train_df['review'].tolist()\n",
79 | "target = train_df['sentiment'].tolist()\n",
80 | "corpus_to_seq = preprocessor.transform(corpus)\n",
81 | "\n",
82 | "x_train = np.array(corpus_to_seq)\n",
83 | "y_train = np.array(target)\n",
84 | "\n",
85 | "print(x_train.shape, y_train.shape)"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": null,
91 | "metadata": {
92 | "collapsed": false
93 | },
94 | "outputs": [],
95 | "source": [
96 | "print('Evaluating Model ...')\n",
97 | "print(model.evaluate(x_train, y_train))\n",
98 | "\n",
99 | "preds = model.predict(x_train)\n",
100 | "\n",
101 | "#invert predicted label\n",
102 | "pseudo_label = np.subtract(1,preds)"
103 | ]
104 | },
105 | {
106 | "cell_type": "markdown",
107 | "metadata": {},
108 | "source": [
109 |     "## Gradient Calculation of the inverted output w.r.t. sentence embeddings"
110 | ]
111 | },
112 | {
113 | "cell_type": "code",
114 | "execution_count": null,
115 | "metadata": {
116 | "collapsed": false
117 | },
118 | "outputs": [],
119 | "source": [
120 | "#Get the learned sentence embeddings\n",
121 | "sentence_ebd = imdb_model.get_sentence_model().predict(x_train)\n",
122 | "\n",
123 | "input_tensors = [model.inputs[0], # input data\n",
124 | " model.sample_weights[0], # how much to weight each sample by\n",
125 | " model.targets[0], # labels \n",
126 | "]\n",
127 |     "# variable tensor at the sentence embedding layer\n",
128 | "weights = imdb_model.get_sentence_model().outputs\n",
129 | "\n",
130 |     "# calculate gradient of the total model loss w.r.t.\n",
131 |     "# the variables at the sentence embedding layer\n",
132 | "gradients = model.optimizer.get_gradients(model.total_loss, weights) \n",
133 | "get_gradients = K.function(inputs=input_tensors, outputs=gradients)"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {
140 | "collapsed": false
141 | },
142 | "outputs": [],
143 | "source": [
144 | "document_number = 2\n",
145 | "K.set_learning_phase(0)\n",
146 | "inputs = [[x_train[document_number]], # X\n",
147 | " [1], # sample weights\n",
148 | " [[pseudo_label[document_number][0]]], # y\n",
149 | "]\n",
150 | "grad = get_gradients(inputs)\n",
151 | "\n",
152 | "sent_score = []\n",
153 | "for i in range(Preprocess.NUM_SENTENCES):\n",
154 | " #sent_score.append((i, -np.abs(np.dot(grad[0][0][i],sentence_ebd[document_number][i])))) #DECREASING\n",
155 | " sent_score.append((i, -np.linalg.norm(grad[0][0][i])))\n",
156 | "\n",
157 | "sent_score.sort(key=lambda tup: tup[1])\n",
158 | "summary_sentences = [ i for i, s in sent_score[:4]]\n",
159 | "\n",
160 | "doc = corpus[document_number]\n",
161 | "label = y_train[document_number]\n",
162 | "prediction = preds[document_number]\n",
163 | "print(doc, label , prediction)\n",
164 | "\n",
165 | "sentences = sent_tokenize(doc)\n",
166 | "for i in summary_sentences:\n",
167 | " print(i, sentences[i])\n",
168 | " \n"
169 | ]
170 | },
171 | {
172 | "cell_type": "code",
173 | "execution_count": null,
174 | "metadata": {
175 | "collapsed": true
176 | },
177 | "outputs": [],
178 | "source": []
179 | },
180 | {
181 | "cell_type": "code",
182 | "execution_count": null,
183 | "metadata": {
184 | "collapsed": true
185 | },
186 | "outputs": [],
187 | "source": []
188 | }
189 | ],
190 | "metadata": {
191 | "kernelspec": {
192 | "display_name": "Python 3",
193 | "language": "python",
194 | "name": "python3"
195 | },
196 | "language_info": {
197 | "codemirror_mode": {
198 | "name": "ipython",
199 | "version": 3
200 | },
201 | "file_extension": ".py",
202 | "mimetype": "text/x-python",
203 | "name": "python",
204 | "nbconvert_exporter": "python",
205 | "pygments_lexer": "ipython3",
206 | "version": "3.6.0"
207 | }
208 | },
209 | "nbformat": 4,
210 | "nbformat_minor": 2
211 | }
212 |
--------------------------------------------------------------------------------
/notebooks/Ch07 - Text Document Categorization/amazon_review_model.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Wed Apr 11 15:34:14 2018
4 |
5 | @author: tghosh
6 | """
7 |
8 | import config
9 | from dataloader.loader import Loader
10 | from preprocessing.utils import Preprocess, remove_empty_docs
11 | from dataloader.embeddings import GloVe
12 | from model.cnn_document_model import DocumentModel, TrainingParameters
13 | from keras.callbacks import ModelCheckpoint, EarlyStopping
14 | import numpy as np
15 |
16 |
17 | train_df = Loader.load_amazon_reviews('train')
18 | print(train_df.shape)
19 |
20 | test_df = Loader.load_amazon_reviews('test')
21 | print(test_df.shape)
22 |
23 | dataset = train_df.sample(n=200000, random_state=42)
24 | dataset.sentiment.value_counts()
25 |
26 |
27 | corpus = dataset['review'].values
28 | target = dataset['sentiment'].values
29 | print(corpus.shape, target.shape)
30 |
31 | corpus, target = remove_empty_docs(corpus, target)
32 | print(len(corpus))
33 |
34 | preprocessor = Preprocess(corpus=corpus)
35 | corpus_to_seq = preprocessor.fit()
36 |
37 | holdout_corpus = test_df['review'].values
38 | holdout_target = test_df['sentiment'].values
39 | print(holdout_corpus.shape, holdout_target.shape)
40 |
41 | holdout_corpus, holdout_target = remove_empty_docs(holdout_corpus, holdout_target)
42 | print(len(holdout_corpus))
43 | holdout_corpus_to_seq = preprocessor.transform(holdout_corpus)
44 |
45 | glove=GloVe(50)
46 | initial_embeddings = glove.get_embedding(preprocessor.word_index)
47 |
48 | amazon_review_model = DocumentModel(vocab_size=preprocessor.get_vocab_size(),
49 | word_index = preprocessor.word_index,
50 | num_sentences=Preprocess.NUM_SENTENCES,
51 | embedding_weights=initial_embeddings,
52 | conv_activation = 'tanh',
53 | hidden_dims=64,
54 | input_dropout=0.40,
55 | hidden_gaussian_noise_sd=0.5)
56 |
57 | train_params = TrainingParameters('model_with_tanh_activation',
58 | model_file_path = config.MODEL_DIR+ '/amazonreviews/model_06.hdf5',
59 | model_hyper_parameters = config.MODEL_DIR+ '/amazonreviews/model_06.json',
60 | model_train_parameters = config.MODEL_DIR+ '/amazonreviews/model_06_meta.json',
61 | num_epochs=35)
62 |
63 | train_params.save()
64 |
65 | amazon_review_model._model.compile(loss="binary_crossentropy",
66 | optimizer=train_params.optimizer,
67 | metrics=["accuracy"])
68 | checkpointer = ModelCheckpoint(filepath=train_params.model_file_path,
69 | verbose=1,
70 | save_best_only=True,
71 | save_weights_only=True)
72 |
73 | early_stop = EarlyStopping(patience=2)
74 |
75 | x_train = np.array(corpus_to_seq)
76 | y_train = np.array(target)
77 |
78 | x_test = np.array(holdout_corpus_to_seq)
79 | y_test = np.array(holdout_target)
80 |
81 | print(x_train.shape, y_train.shape)
82 |
83 | amazon_review_model.get_classification_model().fit(x_train, y_train,
84 | batch_size=train_params.batch_size,
85 | epochs=train_params.num_epochs,
86 | verbose=2,
87 | validation_split=train_params.validation_split,
88 | callbacks=[checkpointer])
89 |
90 | amazon_review_model.get_classification_model().evaluate( x_test, y_test, train_params.batch_size*10, verbose=2)
91 |
92 | amazon_review_model._save_model(train_params.model_hyper_parameters)
93 |
94 |
95 |
96 | ''' Which embeddings change the most '''
97 |
98 | learned_embeddings = amazon_review_model.get_classification_model().get_layer('imdb_embedding').get_weights()[0]
99 |
100 | embd_change = {}
101 | for word, i in preprocessor.word_index.items():
102 | embd_change[word] = np.linalg.norm(initial_embeddings[i]-learned_embeddings[i])
103 | embd_change = sorted(embd_change.items(), key=lambda x: x[1], reverse=True)
104 | embd_change[0:20]
105 |
106 |
--------------------------------------------------------------------------------
/notebooks/Ch07 - Text Document Categorization/config.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 9 11:06:03 2018
4 |
5 | @author: tghosh
6 | """
7 |
8 | import pandas as pd
9 | import numpy as np
10 | import os
11 |
12 | TEXT_DATA_DIR = 'PATH/TO/DATA_ROOT/'  # keep the trailing separator; paths below are built by simple string concatenation
13 |
14 | #Dataset from http://ai.stanford.edu/~amaas/data/sentiment/
15 | IMDB_DATA = TEXT_DATA_DIR + 'aclImdb'
16 | IMDB_DATA_CSV = TEXT_DATA_DIR + 'imdb_csv'
17 |
18 | PROCESSED_20_NEWS_GRP = TEXT_DATA_DIR + '20newsgrp'
19 |
20 | AMAZON_TRAIN_DATA = TEXT_DATA_DIR+'amazonreviews/train.ft'
21 | AMAZON_TEST_DATA = TEXT_DATA_DIR+'amazonreviews/test.ft'
22 |
23 | GLOVE_DIR = TEXT_DATA_DIR+ 'glove.6B'
24 | WORD2VEC_DIR = TEXT_DATA_DIR+ 'word2vec'
25 |
26 | MODEL_DIR = 'PATH/TO/MODELDIR'
27 |
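One thing to note about the constants above: the paths are built by plain string concatenation (for example `IMDB_DATA = TEXT_DATA_DIR + 'aclImdb'`), so the directory placeholders must end with a path separator. An equivalent, separator-safe variant (illustrative only) would be:

    IMDB_DATA = os.path.join(TEXT_DATA_DIR, 'aclImdb')
    GLOVE_DIR = os.path.join(TEXT_DATA_DIR, 'glove.6B')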
--------------------------------------------------------------------------------
/notebooks/Ch07 - Text Document Categorization/dataloader/__init__.py:
--------------------------------------------------------------------------------
1 | #
--------------------------------------------------------------------------------
/notebooks/Ch07 - Text Document Categorization/dataloader/embeddings.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 9 11:45:59 2018
4 |
5 | @author: tghosh
6 | """
7 | import config
8 | import numpy as np
9 | import os
10 |
11 | class GloVe:
12 |
13 | def __init__(self, embd_dim=50):
14 | if embd_dim not in [50, 100, 200, 300]:
15 | raise ValueError('embedding dim should be one of [50, 100, 200, 300]')
16 | self.EMBEDDING_DIM = embd_dim
17 | self.embedding_matrix = None
18 |
19 | def _load(self):
20 | print('Reading {} dim GloVe vectors'.format(self.EMBEDDING_DIM))
21 | self.embeddings_index = {}
22 | with open(os.path.join(config.GLOVE_DIR, 'glove.6B.'+str(self.EMBEDDING_DIM)+'d.txt'),encoding="utf8") as fin:
23 | for line in fin:
24 | try:
25 | values = line.split()
26 | coefs = np.asarray(values[1:], dtype='float32')
27 | word = values[0]
28 | self.embeddings_index[word] = coefs
29 | except:
30 | print(line)
31 |
32 | print('Found %s word vectors.' % len(self.embeddings_index))
33 |
34 | def _init_embedding_matrix(self, word_index_dict, oov_words_file='OOV-Words.txt'):
35 |         self.embedding_matrix = np.zeros((len(word_index_dict)+2, self.EMBEDDING_DIM)) # +2: index 0 is reserved for padding, index 1 for the OOV (average) vector
36 | not_found_words=0
37 | missing_word_index = []
38 |
39 | with open(oov_words_file, 'w') as f:
40 | for word, i in word_index_dict.items():
41 | embedding_vector = self.embeddings_index.get(word)
42 | if embedding_vector is not None:
43 | # words not found in embedding index will be all-zeros.
44 | self.embedding_matrix[i] = embedding_vector
45 | else:
46 | not_found_words+=1
47 | f.write(word + ','+str(i)+'\n')
48 | missing_word_index.append(i)
49 |
50 |         # handle OOV words: index 1 holds the average vector; missing words get the average plus random noise
51 | self.embedding_matrix[1] = np.mean(self.embedding_matrix, axis=0)
52 | for indx in missing_word_index:
53 | self.embedding_matrix[indx] = np.random.rand(self.EMBEDDING_DIM)+ self.embedding_matrix[1]
54 | print("words not found in embeddings: {}".format(not_found_words))
55 |
56 |
57 | def get_embedding(self, word_index_dict):
58 | if self.embedding_matrix is None:
59 | self._load()
60 | self._init_embedding_matrix(word_index_dict)
61 | return self.embedding_matrix
62 |
63 | def update_embeddings(self, word_index_dict, other_embedding, other_word_index):
64 | num_updated = 0
65 | for word, i in other_word_index.items():
66 | if word_index_dict.get(word) is not None:
67 | embedding_vector = other_embedding[i]
68 | this_vocab_word_indx = word_index_dict.get(word)
69 | #print("BEFORE", self.embedding_matrix[this_vocab_word_indx])
70 | self.embedding_matrix[this_vocab_word_indx] = embedding_vector
71 | #print("AFTER", self.embedding_matrix[this_vocab_word_indx])
72 | num_updated+=1
73 |
74 | print('{} words are updated out of {}'.format(num_updated, len(word_index_dict)))
75 |
76 | class Word2Vec(GloVe):
77 | def __init__(self, embd_dim=50):
78 | super().__init__(embd_dim=embd_dim)
79 |
80 | def _load(self):
81 | print('Reading {} dim Gensim Word2Vec vectors'.format(self.EMBEDDING_DIM))
82 | self.embeddings_index = {}
83 | with open(os.path.join(config.WORD2VEC_DIR, 'word2vec_'+str(self.EMBEDDING_DIM)+'_imdb.txt'),encoding="utf8") as fin:
84 | for line in fin:
85 | try:
86 | values = line.split()
87 | coefs = np.asarray(values[1:], dtype='float32')
88 | word = values[0]
89 | self.embeddings_index[word] = coefs
90 | except:
91 | print(line)
92 |
93 | print('Found %s word vectors.' % len(self.embeddings_index))
94 | #test
95 | #glove=Word2Vec(50)
96 | #initial_embeddings = glove.get_embedding({'good':2, 'movie':3})
97 |
--------------------------------------------------------------------------------
/notebooks/Ch07 - Text Document Categorization/dataloader/loader.py:
--------------------------------------------------------------------------------
1 | # -*- coding: utf-8 -*-
2 | """
3 | Created on Mon Apr 9 10:44:05 2018
4 |
5 | @author: tghosh
6 | """
7 | import config
8 | from preprocessing import utils
9 | import re
10 | import os
11 | import numpy as np
12 | import pandas as pd
13 | from sklearn.datasets import fetch_20newsgroups
14 | from nltk.corpus import reuters
15 | from sklearn.preprocessing import LabelEncoder
16 |
17 | class Loader:
18 |
19 | amzn_reviews_kaggle_regx = re.compile(r'__label__(?P