├── _config.yml
├── day_4
├── flask_docker_iap
│ ├── requirements.txt
│ ├── Dockerfile
│ ├── app.py
│ └── model_nlp.py
└── Exploration - FP16 in Python.ipynb
├── images
├── colab_1.jpg
├── colab_2.jpg
├── colab_3.jpg
├── colab_4.jpg
└── dl-iap-header.jpg
├── day_3
├── notebook_imgs
│ └── yoon_kim_structure.png
├── Code Lab 4C - Transfer Learning in NLP.ipynb
└── Sample - Loading and Visualising NLP Data.ipynb
├── .github
└── ISSUE_TEMPLATE
│ └── bug_report.md
├── LICENSE
├── .gitignore
├── README.md
└── day_1
├── Code Lab 0 - Hello GPU.ipynb
└── Sample - EDA Titanic.ipynb
/_config.yml:
--------------------------------------------------------------------------------
1 | theme: jekyll-theme-cayman
--------------------------------------------------------------------------------
/day_4/flask_docker_iap/requirements.txt:
--------------------------------------------------------------------------------
1 | numpy
2 | keras
3 | tensorflow
4 | nltk
5 | flask
6 |
--------------------------------------------------------------------------------
/images/colab_1.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSUTD/deeplearning-workshop-2019/master/images/colab_1.jpg
--------------------------------------------------------------------------------
/images/colab_2.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSUTD/deeplearning-workshop-2019/master/images/colab_2.jpg
--------------------------------------------------------------------------------
/images/colab_3.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSUTD/deeplearning-workshop-2019/master/images/colab_3.jpg
--------------------------------------------------------------------------------
/images/colab_4.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSUTD/deeplearning-workshop-2019/master/images/colab_4.jpg
--------------------------------------------------------------------------------
/images/dl-iap-header.jpg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSUTD/deeplearning-workshop-2019/master/images/dl-iap-header.jpg
--------------------------------------------------------------------------------
/day_3/notebook_imgs/yoon_kim_structure.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/OpenSUTD/deeplearning-workshop-2019/master/day_3/notebook_imgs/yoon_kim_structure.png
--------------------------------------------------------------------------------
/day_4/flask_docker_iap/Dockerfile:
--------------------------------------------------------------------------------
# Image for the Flask sentiment-analysis demo (app.py + model_nlp.py).
FROM python:3.6.8-slim-stretch

# MAINTAINER is deprecated; a LABEL carries the same metadata.
LABEL maintainer="Dude <dude@example.com>"

WORKDIR /app

# Copy only the dependency manifest first so the expensive pip/NLTK layers
# below stay cached when just the application code changes.
COPY requirements.txt /app/requirements.txt

RUN pip install --no-cache-dir -r requirements.txt

# Fetch the NLTK corpora used at runtime: the VADER lexicon (app.py) and the
# English stop-word list (model_nlp.py).
RUN python -c "import nltk;nltk.download('vader_lexicon');nltk.download('stopwords')"

# Application code (and the model/tokenizer files placed next to it).
COPY . /app

# app.py listens on port 5000.
EXPOSE 5000

ENTRYPOINT [ "python" ]

CMD [ "app.py" ]
--------------------------------------------------------------------------------
/.github/ISSUE_TEMPLATE/bug_report.md:
--------------------------------------------------------------------------------
1 | ---
2 | name: Bug report
3 | about: Create a report to help us improve
4 | title: ''
5 | labels: bug
6 | assignees: ''
7 |
8 | ---
9 |
10 | **Describe the bug**
11 | A clear and concise description of what the bug is.
12 |
13 | **To Reproduce**
14 | Steps to reproduce the behavior:
15 | 1. Go to '...'
16 | 2. Click on '....'
17 | 3. Scroll down to '....'
18 | 4. See error
19 |
20 | **Expected behavior**
21 | A clear and concise description of what you expected to happen.
22 |
23 | **Screenshots**
24 | If applicable, add screenshots to help explain your problem.
25 |
26 | **Desktop (please complete the following information):**
27 | - OS:
28 | - Browser:
29 |
30 | **Additional context**
31 | Add any other context about the problem here.
32 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2018 Timothy Liu
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/day_4/flask_docker_iap/app.py:
--------------------------------------------------------------------------------
1 | print(" * [i] Loading Python modules...")
2 | import time
3 | import flask
4 | import functools
5 |
6 | print(" * [i] Loading NLP models...")
7 | from model_nlp import *
8 | from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
9 |
app = flask.Flask(__name__)

# Loaded once at import time so every request reuses the same model objects.
sentiment_model = sentiment_classifier()
sia = SIA()

# Kept so the module still exposes this name; request handlers now build
# their own response dict instead of mutating this shared one.
data = {"success": False}


@functools.lru_cache(maxsize=128, typed=False)
def pred_sentiment(input_):
    """Return the sentiment label predicted by ``sentiment_model`` for ``input_``.

    The result is memoised per input string. The label is *returned* rather
    than written into a global dict: on a cache hit ``lru_cache`` skips the
    function body entirely, so a side effect placed here would silently not
    happen for repeated inputs.

    Args:
        input_: The raw text to classify (must be hashable for the cache).

    Returns:
        Whatever ``sentiment_model.predict`` returns for ``input_``.
    """
    return sentiment_model.predict(input_)


@app.route("/predict", methods=["POST"])
def predict():
    """Handle ``POST /predict?test=<text>`` and return both sentiment scores.

    Returns:
        A JSON response with the keys ``success``, ``sentiment`` (label from
        the Keras model) and ``nltk`` (VADER polarity scores). If the ``test``
        query parameter is missing, a 400 response with ``success: false``
        and an ``error`` message is returned instead.
    """
    # The route only accepts POST, so no extra method check is needed here.
    start_time = time.time()

    # Per-request dict: a module-level dict would be shared between
    # concurrently handled requests.
    response = {"success": False}

    test_text = flask.request.args.get("test")
    if test_text is None:
        # Previously this fell through to ``None.replace`` and surfaced as a
        # 500; report the client error explicitly instead.
        response["error"] = "missing required query parameter 'test'"
        return flask.jsonify(response), 400

    test_text = test_text.replace("%20", " ")

    response["sentiment"] = pred_sentiment(test_text)
    response["nltk"] = sia.polarity_scores(test_text)
    response["success"] = True

    print(" * [i] Request took", round(time.time() - start_time, 3), "seconds")

    # return the response dictionary as a JSON response
    return flask.jsonify(response)


# if file was executed by itself, start the server process
if __name__ == "__main__":
    print(" * [i] Starting Flask server")
    app.run(host='0.0.0.0', port=5000)
--------------------------------------------------------------------------------
/day_4/flask_docker_iap/model_nlp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import keras
3 | import pickle
4 | import re
5 | from nltk.corpus import stopwords
6 | from keras import preprocessing
7 |
class sentiment_classifier(object):
    """Wrap a saved Keras model and its tokenizer for sentiment prediction.

    ``predict`` takes a raw string, cleans and tokenizes it, runs the model
    and returns one of the labels in ``self.classes``.
    """

    def __init__(self, model_file="cnn.h5", tokenizer_file="tokenizer.pickle"):
        """Load the model and the pickled tokenizer from disk.

        Args:
            model_file: Path passed to ``keras.models.load_model``.
            tokenizer_file: Path of the pickled tokenizer object.
        """
        self.model = keras.models.load_model(model_file)
        # ``_make_predict_function`` is a private Keras hook, so it is not
        # guaranteed to exist on every Keras release; call it only if present.
        make_predict_function = getattr(
            self.model, "_make_predict_function", None)
        if callable(make_predict_function):
            make_predict_function()
        self.classes = ["negative", "positive"]
        # NOTE: unpickling executes arbitrary code from the file; only load
        # tokenizer files that come from a trusted source.
        with open(tokenizer_file, 'rb') as handle:
            self.tokenizer = pickle.load(handle)

    def predict(self, input_data):
        """Return the class label for ``input_data``.

        Args:
            input_data: Raw text to classify.

        Returns:
            The entry of ``self.classes`` selected by the arg-max of the
            model output for the first (only) row.
        """
        input_sequence = self.preprocess(input_data)
        preds = self.model.predict(input_sequence)
        pred = preds.argmax(axis=-1)
        return self.classes[pred[0]]

    @staticmethod
    def clean_text(text, remove_stopwords=True):
        """Normalise ``text``: drop newlines/punctuation, lowercase, trim.

        Declared as a static method because it takes no ``self``; without the
        decorator, ``self.clean_text(x)`` would bind the instance to ``text``
        and ``x`` to ``remove_stopwords``.

        Args:
            text: Input to clean; it is converted with ``str()`` first.
            remove_stopwords: When true, drop NLTK English stop words.

        Returns:
            The cleaned string with surrounding whitespace stripped and runs
            of spaces collapsed to a single space.
        """
        text = str(text).replace("\n", "")
        text = re.sub(r'[^\w\s]', '', text).lower()
        if remove_stopwords:
            # Load the stop-word list once per call instead of once per word.
            stop_words = set(stopwords.words("english"))
            text = " ".join(
                word for word in text.split(" ") if word not in stop_words)
        # Collapse repeated spaces (e.g. left behind by removed punctuation).
        return re.sub(r" {2,}", " ", text.strip())

    def preprocess(self, input_data, MAX_SEQUENCE_LENGTH=30):
        """Turn raw text into the padded integer sequence fed to the model.

        The text is cleaned, tokenized with ``self.tokenizer`` and then padded
        in two steps: first with ``padding='pre'`` to
        ``MAX_SEQUENCE_LENGTH - 5``, then with ``padding='post'`` to
        ``MAX_SEQUENCE_LENGTH``.
        NOTE(review): presumably this mirrors the padding used when the model
        was trained; confirm against the training notebook.

        Args:
            input_data: Raw text to convert.
            MAX_SEQUENCE_LENGTH: Final sequence length after padding.

        Returns:
            The array returned by the second ``pad_sequences`` call.
        """
        input_string = self.clean_text(input_data)
        input_token = self.tokenizer.texts_to_sequences([input_string])
        processed_input = preprocessing.sequence.pad_sequences(
            input_token, padding='pre', maxlen=(MAX_SEQUENCE_LENGTH - 5))
        processed_input = preprocessing.sequence.pad_sequences(
            processed_input, padding='post', maxlen=(MAX_SEQUENCE_LENGTH))
        return processed_input
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | day_1/flickr8k/images/
2 |
3 | .DS_Store
4 | .jupyter
5 | .keras
6 | .ipy*
7 | .cache
8 | .local
9 | .nv
10 | .config
11 |
12 | day_2/train
13 | day_2/val
14 | day_2/logs
15 | day_3/.DS_Store
16 | day_3/logs
17 | cache
18 | *.pickle
19 | *.h5
20 |
21 | # Byte-compiled / optimized / DLL files
22 | __pycache__/
23 | *.py[cod]
24 | *$py.class
25 |
26 | # C extensions
27 | *.so
28 |
29 | # Distribution / packaging
30 | .Python
31 | build/
32 | develop-eggs/
33 | dist/
34 | downloads/
35 | eggs/
36 | .eggs/
37 | lib/
38 | lib64/
39 | parts/
40 | sdist/
41 | var/
42 | wheels/
43 | *.egg-info/
44 | .installed.cfg
45 | *.egg
46 | MANIFEST
47 |
48 | # PyInstaller
49 | # Usually these files are written by a python script from a template
50 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
51 | *.manifest
52 | *.spec
53 |
54 | # Installer logs
55 | pip-log.txt
56 | pip-delete-this-directory.txt
57 |
58 | # Unit test / coverage reports
59 | htmlcov/
60 | .tox/
61 | .coverage
62 | .coverage.*
63 | .cache
64 | nosetests.xml
65 | coverage.xml
66 | *.cover
67 | .hypothesis/
68 | .pytest_cache/
69 |
70 | # Translations
71 | *.mo
72 | *.pot
73 |
74 | # Django stuff:
75 | *.log
76 | local_settings.py
77 | db.sqlite3
78 |
79 | # Flask stuff:
80 | instance/
81 | .webassets-cache
82 |
83 | # Scrapy stuff:
84 | .scrapy
85 |
86 | # Sphinx documentation
87 | docs/_build/
88 |
89 | # PyBuilder
90 | target/
91 |
92 | # Jupyter Notebook
93 | .ipynb_checkpoints
94 |
95 | # pyenv
96 | .python-version
97 |
98 | # celery beat schedule file
99 | celerybeat-schedule
100 |
101 | # SageMath parsed files
102 | *.sage.py
103 |
104 | # Environments
105 | .env
106 | .venv
107 | env/
108 | venv/
109 | ENV/
110 | env.bak/
111 | venv.bak/
112 |
113 | # Spyder project settings
114 | .spyderproject
115 | .spyproject
116 |
117 | # Rope project settings
118 | .ropeproject
119 |
120 | # mkdocs documentation
121 | /site
122 |
123 | # mypy
124 | .mypy_cache/
125 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | 
2 |
3 | This repository contains the material for the Deep Learning Workshop conducted during IAP 2019. The slides are also available on [Google Drive](https://bit.ly/dl-iap-1).
4 |
5 | If you do not already have a workstation set up for Deep Learning, you may want to run the notebooks in [Google Colab](https://colab.research.google.com/).
6 |
7 | If you have a workstation or cloud instance set up for Deep Learning, our recommended way to run the notebooks is to run it in a Docker container `nvaitc/ai-lab` ([learn more](https://github.com/NVAITC/ai-lab/blob/master/README.md#using-the-ai-lab-container)). For now, we will also assume that you are using Ubuntu 16.04 or 18.04, and have an NVIDIA GPU card in your workstation/instance. The instructions are [below](#b-workstation--cloud-instance).
8 |
9 |  [](https://github.com/OpenSUTD/deeplearning-workshop-2019/issues) 
10 |
11 | ## Using the Notebooks
12 |
13 | ### A. Google Colab
14 |
15 | #### 1. Open Notebook in Colab
16 |
17 | * Proceed to [Google Colab](https://colab.research.google.com) and click the "GitHub" tab.
18 | * Enter in the URL of this repository as follows and simply select which notebook you wish to open
19 |
20 | 
21 |
22 | * Change runtime type to GPU
23 |
24 | 
25 |
26 | 
27 |
28 | * On the menu bar, go to **Runtime > Run All**
29 |
30 | 
31 |
32 | * Accept the usual warning, and you will be able to run the notebook
33 | * All the notebooks should run without modification; please open an issue if you face problems.
34 |
35 | ### B. Workstation / Cloud Instance
36 |
37 | #### 1. Setting up CUDA, NVIDIA drivers, and Docker
38 |
39 | ```bash
40 | sudo su root
41 | curl https://getcuda.ml/ubuntu.sh | bash
42 | # your computer will reboot
43 | # after your computer reboots, add yourself to the docker group
44 | # if you don't want to run docker with sudo
45 | # you may need to log in and out again for this to take effect
46 | sudo usermod -aG docker $USER
47 | ```
48 |
49 | #### 2. Pulling the `nvaitc/ai-lab` Docker image
50 |
51 | * This container includes many data science, machine learning and deep learning packages that are preconfigured and ready to use.
52 | * **This is a 6GB download**!
53 | * Find out more about the image at its [GitHub repository](https://github.com/NVAITC/ai-lab).
54 |
55 | ```bash
56 | docker pull nvaitc/ai-lab:latest
57 | ```
58 |
59 | #### 3. Download the code labs
60 |
61 | ```bash
62 | git clone https://github.com/OpenSUTD/deeplearning-workshop-2019
63 | # take note of where you cloned the files to!
64 | # we will assume it's at /home/$USER/deeplearning-workshop-2019
65 | ```
66 |
67 | Alternatively, you may download this repository as a zip file from the GitHub web interface.
68 |
69 | #### 4. Start the container and mount the folder
70 |
71 | Please change the path `/home/$USER/deeplearning-workshop-2019` to wherever you downloaded the files in **Step 3**.
72 |
73 | ```bash
74 | nvidia-docker run --rm -p 8888:8888 -v /home/$USER/deeplearning-workshop-2019:/home/jovyan/ nvaitc/ai-lab
75 | ```
76 |
77 | This will print a chunk of output in the Terminal. Take note of the last few lines.
78 |
79 | Open your web browser and point to `localhost:8888`. You will be asked to enter a token. This can be found in the last few lines of the Terminal output.
80 |
81 | ## Workshop Authors
82 |
83 | * Soh Jun De
84 | * Aiden Chia
85 | * Timothy Liu
86 |
--------------------------------------------------------------------------------
/day_4/Exploration - FP16 in Python.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "Ever wonder what can go wrong with `float16`?"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "import numpy as np"
17 | ]
18 | },
19 | {
20 | "cell_type": "markdown",
21 | "metadata": {},
22 | "source": [
23 | "First off, let's look at the correct `float32` results"
24 | ]
25 | },
26 | {
27 | "cell_type": "code",
28 | "execution_count": 2,
29 | "metadata": {},
30 | "outputs": [
31 | {
32 | "data": {
33 | "text/plain": [
34 | "1.0001"
35 | ]
36 | },
37 | "execution_count": 2,
38 | "metadata": {},
39 | "output_type": "execute_result"
40 | }
41 | ],
42 | "source": [
43 | "np.add(1, 0.0001, dtype=\"float32\")"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 7,
49 | "metadata": {},
50 | "outputs": [
51 | {
52 | "data": {
53 | "text/plain": [
54 | "66001.0"
55 | ]
56 | },
57 | "execution_count": 7,
58 | "metadata": {},
59 | "output_type": "execute_result"
60 | }
61 | ],
62 | "source": [
63 | "np.add(1, 66000, dtype=\"float32\")"
64 | ]
65 | },
66 | {
67 | "cell_type": "markdown",
68 | "metadata": {},
69 | "source": [
70 | "Now for `float16`"
71 | ]
72 | },
73 | {
74 | "cell_type": "code",
75 | "execution_count": 3,
76 | "metadata": {},
77 | "outputs": [
78 | {
79 | "data": {
80 | "text/plain": [
81 | "1.0"
82 | ]
83 | },
84 | "execution_count": 3,
85 | "metadata": {},
86 | "output_type": "execute_result"
87 | }
88 | ],
89 | "source": [
90 | "np.add(1, 0.0001, dtype=\"float16\")"
91 | ]
92 | },
93 | {
94 | "cell_type": "code",
95 | "execution_count": 4,
96 | "metadata": {},
97 | "outputs": [
98 | {
99 | "data": {
100 | "text/plain": [
101 | "inf"
102 | ]
103 | },
104 | "execution_count": 4,
105 | "metadata": {},
106 | "output_type": "execute_result"
107 | }
108 | ],
109 | "source": [
110 | "np.add(1, 66000, dtype=\"float16\")"
111 | ]
112 | },
113 | {
114 | "cell_type": "code",
115 | "execution_count": 5,
116 | "metadata": {},
117 | "outputs": [
118 | {
119 | "data": {
120 | "text/plain": [
121 | "0.0"
122 | ]
123 | },
124 | "execution_count": 5,
125 | "metadata": {},
126 | "output_type": "execute_result"
127 | }
128 | ],
129 | "source": [
130 | "np.add(1e-8, 1e-8, dtype=\"float16\")"
131 | ]
132 | },
133 | {
134 | "cell_type": "code",
135 | "execution_count": 6,
136 | "metadata": {},
137 | "outputs": [
138 | {
139 | "data": {
140 | "text/plain": [
141 | "2e-08"
142 | ]
143 | },
144 | "execution_count": 6,
145 | "metadata": {},
146 | "output_type": "execute_result"
147 | }
148 | ],
149 | "source": [
150 | "np.add(1e-8, 1e-8, dtype=\"float32\")"
151 | ]
152 | },
153 | {
154 | "cell_type": "code",
155 | "execution_count": null,
156 | "metadata": {},
157 | "outputs": [],
158 | "source": []
159 | }
160 | ],
161 | "metadata": {
162 | "kernelspec": {
163 | "display_name": "Python 3",
164 | "language": "python",
165 | "name": "python3"
166 | },
167 | "language_info": {
168 | "codemirror_mode": {
169 | "name": "ipython",
170 | "version": 3
171 | },
172 | "file_extension": ".py",
173 | "mimetype": "text/x-python",
174 | "name": "python",
175 | "nbconvert_exporter": "python",
176 | "pygments_lexer": "ipython3",
177 | "version": "3.6.7"
178 | }
179 | },
180 | "nbformat": 4,
181 | "nbformat_minor": 2
182 | }
183 |
--------------------------------------------------------------------------------
/day_1/Code Lab 0 - Hello GPU.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Hello, GPU!"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [
15 | {
16 | "name": "stdout",
17 | "output_type": "stream",
18 | "text": [
19 | "Mon Mar 25 06:23:26 2019 \r\n",
20 | "+-----------------------------------------------------------------------------+\r\n",
21 | "| NVIDIA-SMI 418.39 Driver Version: 418.39 CUDA Version: 10.1 |\r\n",
22 | "|-------------------------------+----------------------+----------------------+\r\n",
23 | "| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\r\n",
24 | "| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\r\n",
25 | "|===============================+======================+======================|\r\n",
26 | "| 0 GeForce GTX 108... On | 00000000:01:00.0 On | N/A |\r\n",
27 | "| 49% 50C P0 65W / 250W | 729MiB / 11175MiB | 2% Default |\r\n",
28 | "+-------------------------------+----------------------+----------------------+\r\n",
29 | " \r\n",
30 | "+-----------------------------------------------------------------------------+\r\n",
31 | "| Processes: GPU Memory |\r\n",
32 | "| GPU PID Type Process name Usage |\r\n",
33 | "|=============================================================================|\r\n",
34 | "+-----------------------------------------------------------------------------+\r\n"
35 | ]
36 | }
37 | ],
38 | "source": [
39 | "!nvidia-smi"
40 | ]
41 | },
42 | {
43 | "cell_type": "code",
44 | "execution_count": 2,
45 | "metadata": {},
46 | "outputs": [
47 | {
48 | "name": "stdout",
49 | "output_type": "stream",
50 | "text": [
51 | "\u001b[1m\u001b[37m0e2ea7bd894e\u001b[m Mon Mar 25 06:23:27 2019\r\n",
52 | "\u001b[36m[0]\u001b[m \u001b[34mGeForce GTX 1080 Ti\u001b[m |\u001b[1m\u001b[31m 50'C\u001b[m, \u001b[32m 1 %\u001b[m | \u001b[36m\u001b[1m\u001b[33m 729\u001b[m / \u001b[33m11175\u001b[m MB |\r\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "!gpustat"
58 | ]
59 | },
60 | {
61 | "cell_type": "code",
62 | "execution_count": 3,
63 | "metadata": {},
64 | "outputs": [
65 | {
66 | "name": "stdout",
67 | "output_type": "stream",
68 | "text": [
69 | "[name: \"/device:CPU:0\"\n",
70 | "device_type: \"CPU\"\n",
71 | "memory_limit: 268435456\n",
72 | "locality {\n",
73 | "}\n",
74 | "incarnation: 4503158194805763585\n",
75 | ", name: \"/device:XLA_CPU:0\"\n",
76 | "device_type: \"XLA_CPU\"\n",
77 | "memory_limit: 17179869184\n",
78 | "locality {\n",
79 | "}\n",
80 | "incarnation: 3695275088634348821\n",
81 | "physical_device_desc: \"device: XLA_CPU device\"\n",
82 | ", name: \"/device:XLA_GPU:0\"\n",
83 | "device_type: \"XLA_GPU\"\n",
84 | "memory_limit: 17179869184\n",
85 | "locality {\n",
86 | "}\n",
87 | "incarnation: 3548020933932946827\n",
88 | "physical_device_desc: \"device: XLA_GPU device\"\n",
89 | ", name: \"/device:GPU:0\"\n",
90 | "device_type: \"GPU\"\n",
91 | "memory_limit: 10189963264\n",
92 | "locality {\n",
93 | " bus_id: 1\n",
94 | " links {\n",
95 | " }\n",
96 | "}\n",
97 | "incarnation: 18420482472046241236\n",
98 | "physical_device_desc: \"device: 0, name: GeForce GTX 1080 Ti, pci bus id: 0000:01:00.0, compute capability: 6.1\"\n",
99 | "]\n"
100 | ]
101 | }
102 | ],
103 | "source": [
104 | "import tensorflow as tf\n",
105 | "from tensorflow.python.client import device_lib\n",
106 | "print(device_lib.list_local_devices())"
107 | ]
108 | }
109 | ],
110 | "metadata": {
111 | "kernelspec": {
112 | "display_name": "Python 3",
113 | "language": "python",
114 | "name": "python3"
115 | },
116 | "language_info": {
117 | "codemirror_mode": {
118 | "name": "ipython",
119 | "version": 3
120 | },
121 | "file_extension": ".py",
122 | "mimetype": "text/x-python",
123 | "name": "python",
124 | "nbconvert_exporter": "python",
125 | "pygments_lexer": "ipython3",
126 | "version": "3.6.7"
127 | }
128 | },
129 | "nbformat": 4,
130 | "nbformat_minor": 2
131 | }
132 |
--------------------------------------------------------------------------------
/day_3/Code Lab 4C - Transfer Learning in NLP.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Code Lab 4C - Transfer Learning in NLP\n",
8 | "\n",
9 | "## Fine-tuning ELMo for Text Classification\n",
10 | "\n",
11 | "In this Code Lab, we are going to make use of a pre-trained ELMo model from [TensorFlow Hub](https://www.tensorflow.org/hub/). ELMo is a model that makes use of a **language model** (a more complex representation compared to word embeddings) to achieve state-of-the-art results (until recently, but that's how fast things are moving).\n",
12 | "\n",
13 | "**More on ELMo**\n",
14 | "\n",
15 | "* https://allennlp.org/elmo\n",
16 | "* [ArXiv: Deep contextualized word representations](https://arxiv.org/abs/1802.05365)\n",
17 | "\n",
18 | "This notebook consists of 4 main sections:\n",
19 | "\n",
20 | "1. Preparing the data\n",
21 | "2. Implementing a simple CNN model\n",
22 | "3. Training the model\n",
23 | "4. Evaluating the model"
24 | ]
25 | },
26 | {
27 | "cell_type": "markdown",
28 | "metadata": {},
29 | "source": [
30 | "**Key Model Parameters**"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": null,
36 | "metadata": {},
37 | "outputs": [],
38 | "source": [
39 | "MAX_NB_WORDS = 100000 # max no. of words for tokenizer\n",
40 | "MAX_SEQUENCE_LENGTH = 20 # max length of each entry (sentence), including padding\n",
41 | "VALIDATION_SPLIT = 0.3 # data for validation (not used in training)"
42 | ]
43 | },
44 | {
45 | "cell_type": "code",
46 | "execution_count": null,
47 | "metadata": {},
48 | "outputs": [],
49 | "source": [
50 | "import numpy as np\n",
51 | "import re, sys, csv, pickle\n",
52 | "from tqdm import tqdm_notebook\n",
53 | "\n",
54 | "import tensorflow as tf\n",
55 | "import tensorflow_hub as hub\n",
56 | "\n",
57 | "import keras\n",
58 | "from keras import regularizers, initializers, optimizers, callbacks\n",
59 | "from keras.utils.np_utils import to_categorical\n",
60 | "from keras.layers import *\n",
61 | "from keras.models import Model\n",
62 | "from keras import backend as K"
63 | ]
64 | },
65 | {
66 | "cell_type": "markdown",
67 | "metadata": {},
68 | "source": [
69 | "### 1. Prepare the data"
70 | ]
71 | },
72 | {
73 | "cell_type": "markdown",
74 | "metadata": {},
75 | "source": [
76 | "**Preprocessing Step**\n",
77 | "\n",
78 | "Removing [stopwords](https://nlp.stanford.edu/IR-book/html/htmledition/dropping-common-terms-stop-words-1.html), punctuation and making everything lowercase."
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": null,
84 | "metadata": {},
85 | "outputs": [],
86 | "source": [
87 | "from nltk.corpus import stopwords\n",
88 | "def clean_text(text, remove_stopwords=True):\n",
89 | " output = \"\"\n",
90 | " text = str(text).replace(\"\\n\", \"\")\n",
91 | " text = re.sub(r'[^\\w\\s]','',text).lower()\n",
92 | " if remove_stopwords:\n",
93 | " text = text.split(\" \")\n",
94 | " for word in text:\n",
95 | " if word not in stopwords.words(\"english\"):\n",
96 | " output = output + \" \" + word\n",
97 | " else:\n",
98 | " output = text\n",
99 | " return str(output.strip())[1:-3].replace(\" \", \" \")"
100 | ]
101 | },
102 | {
103 | "cell_type": "markdown",
104 | "metadata": {},
105 | "source": [
106 | "**Reading from Dataset**"
107 | ]
108 | },
109 | {
110 | "cell_type": "code",
111 | "execution_count": null,
112 | "metadata": {},
113 | "outputs": [],
114 | "source": [
115 | "DATA_URL = \"https://s3-ap-southeast-1.amazonaws.com/deeplearning-mat/stanford_movie.zip\"\n",
116 | "DATA_DIR = keras.utils.get_file(\"stanford_movie.zip\", DATA_URL, cache_subdir='datasets', extract=True)\n",
117 | "print(\"Dataset present at\", DATA_DIR)\n",
118 | "DATA_DIR = DATA_DIR.replace(\".zip\", \"\")"
119 | ]
120 | },
121 | {
122 | "cell_type": "code",
123 | "execution_count": null,
124 | "metadata": {},
125 | "outputs": [],
126 | "source": [
127 | "texts = [] # empty list for model input: the movie reviews\n",
128 | "labels = [] # empty lists model output: sentiment labels\n",
129 | "\n",
130 | "data_neg = open(DATA_DIR+\"/stanford_movie_neg.txt\", \"rb\") \n",
131 | "for line in tqdm_notebook(data_neg, total=5331): \n",
132 | " texts.append(clean_text(line, remove_stopwords=False))\n",
133 | " labels.append(int(0))"
134 | ]
135 | },
136 | {
137 | "cell_type": "code",
138 | "execution_count": null,
139 | "metadata": {},
140 | "outputs": [],
141 | "source": [
142 | "data_pos = open(DATA_DIR+\"/stanford_movie_pos.txt\", \"rb\") \n",
143 | "for line in tqdm_notebook(data_pos, total=5331): \n",
144 | " texts.append(clean_text(line, remove_stopwords=False))\n",
145 | " labels.append(int(1))"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": null,
151 | "metadata": {},
152 | "outputs": [],
153 | "source": [
154 | "print(\"Sample negative:\", texts[0], labels[0])\n",
155 | "print(\"Sample positive:\", texts[9000], labels[9000])"
156 | ]
157 | },
158 | {
159 | "cell_type": "markdown",
160 | "metadata": {},
161 | "source": [
162 | "**Generate the array of sentences from dataset**"
163 | ]
164 | },
165 | {
166 | "cell_type": "code",
167 | "execution_count": null,
168 | "metadata": {},
169 | "outputs": [],
170 | "source": [
171 | "all_text = [' '.join(t.split()[0:150]) for t in texts]\n",
172 | "all_text = np.array(all_text, dtype=object)[:, np.newaxis]\n",
173 | "\n",
174 | "labels = to_categorical(np.asarray(labels)) # convert the category label to one-hot encoding\n",
175 | "print('[i] Shape of data tensor:', all_text.shape)\n",
176 | "print('[i] Shape of label tensor:', labels.shape)\n",
177 | "\n",
178 | "indices = np.arange(all_text.shape[0])\n",
179 | "np.random.shuffle(indices)\n",
180 | "all_text = all_text[indices]\n",
181 | "labels = labels[indices]\n",
182 | "nb_validation_samples = int(VALIDATION_SPLIT * all_text.shape[0])\n",
183 | "x_train = all_text[:-nb_validation_samples]\n",
184 | "y_train = labels[:-nb_validation_samples]\n",
185 | "x_val = all_text[-nb_validation_samples:]\n",
186 | "y_val = labels[-nb_validation_samples:]\n",
187 | "\n",
188 | "print('[i] Number of entries in each category:')\n",
189 | "print(\"[+] Training:\",y_train.sum(axis=0))\n",
190 | "print(\"[+] Validation:\",y_val.sum(axis=0))"
191 | ]
192 | },
193 | {
194 | "cell_type": "markdown",
195 | "metadata": {},
196 | "source": [
197 | "**What does the data look like?**"
198 | ]
199 | },
200 | {
201 | "cell_type": "code",
202 | "execution_count": null,
203 | "metadata": {},
204 | "outputs": [],
205 | "source": [
206 | "print(\"Sentence input\", all_text[0])\n",
207 | "print(\"\")\n",
208 | "print(\"One-hot label\", labels[0])"
209 | ]
210 | },
211 | {
212 | "cell_type": "markdown",
213 | "metadata": {},
214 | "source": [
215 | "### 2. Create the model\n",
216 | "\n",
217 | "We will now start to create the model in `Keras`."
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {},
223 | "source": [
224 | "**Create the `ELMo` layer**\n",
225 | "\n",
226 | "Computes deep contextualized word representations using character-based word representations and bidirectional LSTMs. Paper: [Deep contextualized word representations](https://arxiv.org/abs/1802.05365)"
227 | ]
228 | },
229 | {
230 | "cell_type": "code",
231 | "execution_count": null,
232 | "metadata": {},
233 | "outputs": [],
234 | "source": [
235 | "# Initialize session\n",
236 | "sess = tf.Session()\n",
237 | "K.set_session(sess)"
238 | ]
239 | },
240 | {
241 | "cell_type": "code",
242 | "execution_count": null,
243 | "metadata": {},
244 | "outputs": [],
245 | "source": [
246 | "elmo_model = hub.Module(\"https://tfhub.dev/google/elmo/2\", trainable=False)\n",
247 | "sess.run(tf.global_variables_initializer())\n",
248 | "sess.run(tf.tables_initializer())\n",
249 | "\n",
250 | "def ElmoEmbedding(x):\n",
251 | " return elmo_model(tf.squeeze(tf.cast(x, tf.string)), signature=\"default\", as_dict=True)[\"default\"]"
252 | ]
253 | },
254 | {
255 | "cell_type": "markdown",
256 | "metadata": {},
257 | "source": [
258 | "**Rest of the model**"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {},
265 | "outputs": [],
266 | "source": [
267 | "sequence_input = Input(shape=(1,), dtype=tf.string)\n",
268 | "embedded_sequences = Lambda(ElmoEmbedding, output_shape=(1024,))(sequence_input)\n",
269 | "embedded_sequences = Reshape((1024, 1,))(embedded_sequences)"
270 | ]
271 | },
272 | {
273 | "cell_type": "code",
274 | "execution_count": null,
275 | "metadata": {},
276 | "outputs": [],
277 | "source": [
278 | "l_drop = Dropout(0.5)(embedded_sequences)\n",
279 | "l_flat = Flatten()(l_drop)\n",
280 | "l_dense = Dense(32, activation='relu')(l_flat)\n",
281 | "preds = Dense(2, activation='softmax')(l_dense) #follows the number of classes"
282 | ]
283 | },
284 | {
285 | "cell_type": "markdown",
286 | "metadata": {},
287 | "source": [
288 | "**Compile the model into a static graph for training**"
289 | ]
290 | },
291 | {
292 | "cell_type": "code",
293 | "execution_count": null,
294 | "metadata": {},
295 | "outputs": [],
296 | "source": [
297 | "model = Model(sequence_input, preds)\n",
298 | "model.compile(loss='binary_crossentropy',\n",
299 | " optimizer=\"rmsprop\",\n",
300 | " metrics=['acc'])\n",
301 | "model.summary()"
302 | ]
303 | },
304 | {
305 | "cell_type": "markdown",
306 | "metadata": {},
307 | "source": [
308 | "**Visualisation**"
309 | ]
310 | },
311 | {
312 | "cell_type": "code",
313 | "execution_count": null,
314 | "metadata": {},
315 | "outputs": [],
316 | "source": [
317 | "from IPython.display import SVG\n",
318 | "from keras.utils.vis_utils import model_to_dot\n",
319 | "SVG(model_to_dot(model, show_shapes=True).create(prog='dot', format='svg'))"
320 | ]
321 | },
322 | {
323 | "cell_type": "markdown",
324 | "metadata": {},
325 | "source": [
326 | "### 3. Train the model"
327 | ]
328 | },
329 | {
330 | "cell_type": "code",
331 | "execution_count": null,
332 | "metadata": {
333 | "scrolled": false
334 | },
335 | "outputs": [],
336 | "source": [
337 | "print(\"Training Progress:\")\n",
338 | "model_log = model.fit(x_train, y_train, validation_data=(x_val, y_val),\n",
339 | " epochs=15, batch_size=64)"
340 | ]
341 | },
342 | {
343 | "cell_type": "markdown",
344 | "metadata": {},
345 | "source": [
346 | "### 4. Evaluate the model"
347 | ]
348 | },
349 | {
350 | "cell_type": "code",
351 | "execution_count": null,
352 | "metadata": {},
353 | "outputs": [],
354 | "source": [
355 | "import matplotlib.pyplot as plt\n",
356 | "%matplotlib inline\n",
357 | "%config InlineBackend.figure_format = 'retina'\n",
358 | "\n",
359 | "plt.plot(model_log.history['acc'])\n",
360 | "plt.plot(model_log.history['val_acc'])\n",
361 | "plt.title('Accuracy (Higher Better)')\n",
362 | "plt.ylabel('Accuracy')\n",
363 | "plt.xlabel('Epoch')\n",
364 | "plt.legend(['train', 'validation'], loc='upper left')\n",
365 | "plt.show()\n",
366 | "\n",
367 | "plt.plot(model_log.history['loss'])\n",
368 | "plt.plot(model_log.history['val_loss'])\n",
369 | "plt.title('Loss (Lower Better)')\n",
370 | "plt.ylabel('Loss')\n",
371 | "plt.xlabel('Epoch')\n",
372 | "plt.legend(['train', 'validation'], loc='upper left')\n",
373 | "plt.show()"
374 | ]
375 | },
376 | {
377 | "cell_type": "code",
378 | "execution_count": null,
379 | "metadata": {},
380 | "outputs": [],
381 | "source": [
382 | "from sklearn.metrics import classification_report, confusion_matrix\n",
383 | "import itertools\n",
384 | "\n",
385 | "classes = [\"positive\", \"negative\"]"
386 | ]
387 | },
388 | {
389 | "cell_type": "code",
390 | "execution_count": null,
391 | "metadata": {},
392 | "outputs": [],
393 | "source": [
394 | "Y_test = np.argmax(y_val, axis=1) # Convert one-hot to index\n",
395 | "y_pred = model.predict(x_val)\n",
396 | "y_pred_class = np.argmax(y_pred,axis=1)\n",
397 | "print(classification_report(Y_test, y_pred_class, target_names=classes))"
398 | ]
399 | },
400 | {
401 | "cell_type": "code",
402 | "execution_count": null,
403 | "metadata": {},
404 | "outputs": [],
405 | "source": [
406 | "plt.style.use('seaborn-dark')\n",
407 | "def plot_confusion_matrix(cm, labels,\n",
408 | " normalize=True,\n",
409 | " title='Confusion Matrix (Validation Set)',\n",
410 | " cmap=plt.cm.Blues):\n",
411 | " \"\"\"\n",
412 | " This function prints and plots the confusion matrix.\n",
413 | " Normalization can be applied by setting `normalize=True`.\n",
414 | " \"\"\"\n",
415 | " if normalize:\n",
416 | " cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]\n",
417 | " #print(\"Normalized confusion matrix\")\n",
418 | " else:\n",
419 | " #print('Confusion matrix, without normalization')\n",
420 | " pass\n",
421 | "\n",
422 | " #print(cm)\n",
423 | "\n",
424 | " plt.imshow(cm, interpolation='nearest', cmap=cmap)\n",
425 | " plt.title(title)\n",
426 | " plt.colorbar()\n",
427 | " tick_marks = np.arange(len(labels))\n",
428 | " plt.xticks(tick_marks, labels, rotation=45)\n",
429 | " plt.yticks(tick_marks, labels)\n",
430 | "\n",
431 | " fmt = '.2f' if normalize else 'd'\n",
432 | " thresh = cm.max() / 2.\n",
433 | " for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):\n",
434 | " plt.text(j, i, format(cm[i, j], fmt),\n",
435 | " horizontalalignment=\"center\",\n",
436 | " color=\"white\" if cm[i, j] > thresh else \"black\")\n",
437 | "\n",
438 | " plt.tight_layout()\n",
439 | " plt.ylabel('True label')\n",
440 | " plt.xlabel('Predicted label')\n",
441 | "\n",
442 | "plt.figure(figsize=(14,7))\n",
443 | "cnf_matrix = confusion_matrix(Y_test, y_pred_class)\n",
444 | "plot_confusion_matrix(cnf_matrix, labels=classes)"
445 | ]
446 | },
447 | {
448 | "cell_type": "code",
449 | "execution_count": null,
450 | "metadata": {},
451 | "outputs": [],
452 | "source": []
453 | }
454 | ],
455 | "metadata": {
456 | "kernelspec": {
457 | "display_name": "Python 3",
458 | "language": "python",
459 | "name": "python3"
460 | },
461 | "language_info": {
462 | "codemirror_mode": {
463 | "name": "ipython",
464 | "version": 3
465 | },
466 | "file_extension": ".py",
467 | "mimetype": "text/x-python",
468 | "name": "python",
469 | "nbconvert_exporter": "python",
470 | "pygments_lexer": "ipython3",
471 | "version": "3.6.7"
472 | }
473 | },
474 | "nbformat": 4,
475 | "nbformat_minor": 2
476 | }
477 |
--------------------------------------------------------------------------------
/day_1/Sample - EDA Titanic.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "Ffy0K-fYmF1E"
8 | },
9 | "source": [
10 | "# EDA: Titanic Dataset\n",
11 | "\n",
12 | "Who lives and who dies?"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {
19 | "colab": {
20 | "base_uri": "https://localhost:8080/",
21 | "height": 459
22 | },
23 | "colab_type": "code",
24 | "executionInfo": {
25 | "elapsed": 7780,
26 | "status": "ok",
27 | "timestamp": 1546957324044,
28 | "user": {
29 | "displayName": "Timothy Liu",
30 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
31 | "userId": "03413426750796061565"
32 | },
33 | "user_tz": -480
34 | },
35 | "id": "gOprH9WElm1S",
36 | "outputId": "b151df53-b07a-4af5-d4e8-02278bbd46ed"
37 | },
38 | "outputs": [
39 | {
40 | "name": "stdout",
41 | "output_type": "stream",
42 | "text": [
43 | "Requirement already up-to-date: seaborn in /opt/conda/lib/python3.6/site-packages (0.9.0)\n",
44 | "Requirement already up-to-date: pandas in /opt/conda/lib/python3.6/site-packages (0.24.2)\n",
45 | "Requirement already satisfied, skipping upgrade: matplotlib>=1.4.3 in /opt/conda/lib/python3.6/site-packages (from seaborn) (3.0.3)\n",
46 | "Requirement already satisfied, skipping upgrade: numpy>=1.9.3 in /opt/conda/lib/python3.6/site-packages (from seaborn) (1.16.2)\n",
47 | "Requirement already satisfied, skipping upgrade: scipy>=0.14.0 in /opt/conda/lib/python3.6/site-packages (from seaborn) (1.2.1)\n",
48 | "Requirement already satisfied, skipping upgrade: pytz>=2011k in /opt/conda/lib/python3.6/site-packages (from pandas) (2018.9)\n",
49 | "Requirement already satisfied, skipping upgrade: python-dateutil>=2.5.0 in /opt/conda/lib/python3.6/site-packages (from pandas) (2.8.0)\n",
50 | "Requirement already satisfied, skipping upgrade: cycler>=0.10 in /opt/conda/lib/python3.6/site-packages (from matplotlib>=1.4.3->seaborn) (0.10.0)\n",
51 | "Requirement already satisfied, skipping upgrade: kiwisolver>=1.0.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib>=1.4.3->seaborn) (1.0.1)\n",
52 | "Requirement already satisfied, skipping upgrade: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.1 in /opt/conda/lib/python3.6/site-packages (from matplotlib>=1.4.3->seaborn) (2.3.1)\n",
53 | "Requirement already satisfied, skipping upgrade: six>=1.5 in /opt/conda/lib/python3.6/site-packages (from python-dateutil>=2.5.0->pandas) (1.12.0)\n",
54 | "Requirement already satisfied, skipping upgrade: setuptools in /opt/conda/lib/python3.6/site-packages (from kiwisolver>=1.0.1->matplotlib>=1.4.3->seaborn) (40.8.0)\n"
55 | ]
56 | }
57 | ],
58 | "source": [
59 | "# this cell is needed in Colab\n",
60 | "!pip install seaborn pandas -U"
61 | ]
62 | },
63 | {
64 | "cell_type": "code",
65 | "execution_count": 2,
66 | "metadata": {
67 | "colab": {},
68 | "colab_type": "code",
69 | "id": "LF7fKrycl3uJ"
70 | },
71 | "outputs": [],
72 | "source": [
73 | "import pandas as pd\n",
74 | "\n",
75 | "import matplotlib.pyplot as plt\n",
76 | "import seaborn\n",
77 | "\n",
78 | "%matplotlib inline"
79 | ]
80 | },
81 | {
82 | "cell_type": "code",
83 | "execution_count": 3,
84 | "metadata": {},
85 | "outputs": [
86 | {
87 | "name": "stderr",
88 | "output_type": "stream",
89 | "text": [
90 | "Using TensorFlow backend.\n"
91 | ]
92 | },
93 | {
94 | "name": "stdout",
95 | "output_type": "stream",
96 | "text": [
97 | "Dataset present at: /home/jovyan/.keras/datasets/titanic.csv\n"
98 | ]
99 | }
100 | ],
101 | "source": [
102 | "import keras\n",
103 | "\n",
104 | "DATA_URL = \"https://s3-ap-southeast-1.amazonaws.com/deeplearning-mat/titanic.csv\"\n",
105 | "DATA_DIR = keras.utils.get_file(\"titanic.csv\", DATA_URL, cache_subdir='datasets', extract=True)\n",
106 | "\n",
107 | "print(\"Dataset present at:\", DATA_DIR)"
108 | ]
109 | },
110 | {
111 | "cell_type": "code",
112 | "execution_count": 4,
113 | "metadata": {
114 | "colab": {
115 | "base_uri": "https://localhost:8080/",
116 | "height": 204
117 | },
118 | "colab_type": "code",
119 | "executionInfo": {
120 | "elapsed": 7768,
121 | "status": "ok",
122 | "timestamp": 1546957324046,
123 | "user": {
124 | "displayName": "Timothy Liu",
125 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
126 | "userId": "03413426750796061565"
127 | },
128 | "user_tz": -480
129 | },
130 | "id": "YGAGobyQl5sD",
131 | "outputId": "a377dbb8-906d-428c-987a-68f3bcfd4d2d"
132 | },
133 | "outputs": [
134 | {
135 | "data": {
136 | "text/html": [
137 | "
\n",
138 | "\n",
151 | "
\n",
152 | " \n",
153 | " \n",
154 | " | \n",
155 | " PassengerId | \n",
156 | " Survived | \n",
157 | " Pclass | \n",
158 | " Name | \n",
159 | " Sex | \n",
160 | " Age | \n",
161 | " SibSp | \n",
162 | " Parch | \n",
163 | " Ticket | \n",
164 | " Fare | \n",
165 | " Cabin | \n",
166 | " Embarked | \n",
167 | "
\n",
168 | " \n",
169 | " \n",
170 | " \n",
171 | " | 0 | \n",
172 | " 1 | \n",
173 | " 0 | \n",
174 | " 3 | \n",
175 | " Braund, Mr. Owen Harris | \n",
176 | " male | \n",
177 | " 22.0 | \n",
178 | " 1 | \n",
179 | " 0 | \n",
180 | " A/5 21171 | \n",
181 | " 7.2500 | \n",
182 | " NaN | \n",
183 | " S | \n",
184 | "
\n",
185 | " \n",
186 | " | 1 | \n",
187 | " 2 | \n",
188 | " 1 | \n",
189 | " 1 | \n",
190 | " Cumings, Mrs. John Bradley (Florence Briggs Th... | \n",
191 | " female | \n",
192 | " 38.0 | \n",
193 | " 1 | \n",
194 | " 0 | \n",
195 | " PC 17599 | \n",
196 | " 71.2833 | \n",
197 | " C85 | \n",
198 | " C | \n",
199 | "
\n",
200 | " \n",
201 | " | 2 | \n",
202 | " 3 | \n",
203 | " 1 | \n",
204 | " 3 | \n",
205 | " Heikkinen, Miss. Laina | \n",
206 | " female | \n",
207 | " 26.0 | \n",
208 | " 0 | \n",
209 | " 0 | \n",
210 | " STON/O2. 3101282 | \n",
211 | " 7.9250 | \n",
212 | " NaN | \n",
213 | " S | \n",
214 | "
\n",
215 | " \n",
216 | " | 3 | \n",
217 | " 4 | \n",
218 | " 1 | \n",
219 | " 1 | \n",
220 | " Futrelle, Mrs. Jacques Heath (Lily May Peel) | \n",
221 | " female | \n",
222 | " 35.0 | \n",
223 | " 1 | \n",
224 | " 0 | \n",
225 | " 113803 | \n",
226 | " 53.1000 | \n",
227 | " C123 | \n",
228 | " S | \n",
229 | "
\n",
230 | " \n",
231 | " | 4 | \n",
232 | " 5 | \n",
233 | " 0 | \n",
234 | " 3 | \n",
235 | " Allen, Mr. William Henry | \n",
236 | " male | \n",
237 | " 35.0 | \n",
238 | " 0 | \n",
239 | " 0 | \n",
240 | " 373450 | \n",
241 | " 8.0500 | \n",
242 | " NaN | \n",
243 | " S | \n",
244 | "
\n",
245 | " \n",
246 | "
\n",
247 | "
"
248 | ],
249 | "text/plain": [
250 | " PassengerId Survived Pclass \\\n",
251 | "0 1 0 3 \n",
252 | "1 2 1 1 \n",
253 | "2 3 1 3 \n",
254 | "3 4 1 1 \n",
255 | "4 5 0 3 \n",
256 | "\n",
257 | " Name Sex Age SibSp \\\n",
258 | "0 Braund, Mr. Owen Harris male 22.0 1 \n",
259 | "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n",
260 | "2 Heikkinen, Miss. Laina female 26.0 0 \n",
261 | "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n",
262 | "4 Allen, Mr. William Henry male 35.0 0 \n",
263 | "\n",
264 | " Parch Ticket Fare Cabin Embarked \n",
265 | "0 0 A/5 21171 7.2500 NaN S \n",
266 | "1 0 PC 17599 71.2833 C85 C \n",
267 | "2 0 STON/O2. 3101282 7.9250 NaN S \n",
268 | "3 0 113803 53.1000 C123 S \n",
269 | "4 0 373450 8.0500 NaN S "
270 | ]
271 | },
272 | "execution_count": 4,
273 | "metadata": {},
274 | "output_type": "execute_result"
275 | }
276 | ],
277 | "source": [
278 | "# use Pandas to load the csv file\n",
279 | "df = pd.read_csv(DATA_DIR)\n",
280 | "\n",
281 | "# look at the top few rows of the dataset\n",
282 | "df.head()"
283 | ]
284 | },
285 | {
286 | "cell_type": "markdown",
287 | "metadata": {
288 | "colab_type": "text",
289 | "id": "p8CTz7qvqkjE"
290 | },
291 | "source": [
292 | "### Search for Missing Data\n",
293 | "\n",
294 | "We will want to find the missing values and fill them in."
295 | ]
296 | },
297 | {
298 | "cell_type": "code",
299 | "execution_count": 5,
300 | "metadata": {
301 | "colab": {
302 | "base_uri": "https://localhost:8080/",
303 | "height": 204
304 | },
305 | "colab_type": "code",
306 | "executionInfo": {
307 | "elapsed": 7763,
308 | "status": "ok",
309 | "timestamp": 1546957324047,
310 | "user": {
311 | "displayName": "Timothy Liu",
312 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
313 | "userId": "03413426750796061565"
314 | },
315 | "user_tz": -480
316 | },
317 | "id": "-wnHsqMxotCf",
318 | "outputId": "a62191e3-78d2-4dac-e297-211a32fa8ee9"
319 | },
320 | "outputs": [
321 | {
322 | "data": {
323 | "text/html": [
324 | "\n",
325 | "\n",
338 | "
\n",
339 | " \n",
340 | " \n",
341 | " | \n",
342 | " PassengerId | \n",
343 | " Survived | \n",
344 | " Pclass | \n",
345 | " Name | \n",
346 | " Sex | \n",
347 | " Age | \n",
348 | " SibSp | \n",
349 | " Parch | \n",
350 | " Ticket | \n",
351 | " Fare | \n",
352 | " Cabin | \n",
353 | " Embarked | \n",
354 | "
\n",
355 | " \n",
356 | " \n",
357 | " \n",
358 | " | 0 | \n",
359 | " False | \n",
360 | " False | \n",
361 | " False | \n",
362 | " False | \n",
363 | " False | \n",
364 | " False | \n",
365 | " False | \n",
366 | " False | \n",
367 | " False | \n",
368 | " False | \n",
369 | " True | \n",
370 | " False | \n",
371 | "
\n",
372 | " \n",
373 | " | 1 | \n",
374 | " False | \n",
375 | " False | \n",
376 | " False | \n",
377 | " False | \n",
378 | " False | \n",
379 | " False | \n",
380 | " False | \n",
381 | " False | \n",
382 | " False | \n",
383 | " False | \n",
384 | " False | \n",
385 | " False | \n",
386 | "
\n",
387 | " \n",
388 | " | 2 | \n",
389 | " False | \n",
390 | " False | \n",
391 | " False | \n",
392 | " False | \n",
393 | " False | \n",
394 | " False | \n",
395 | " False | \n",
396 | " False | \n",
397 | " False | \n",
398 | " False | \n",
399 | " True | \n",
400 | " False | \n",
401 | "
\n",
402 | " \n",
403 | " | 3 | \n",
404 | " False | \n",
405 | " False | \n",
406 | " False | \n",
407 | " False | \n",
408 | " False | \n",
409 | " False | \n",
410 | " False | \n",
411 | " False | \n",
412 | " False | \n",
413 | " False | \n",
414 | " False | \n",
415 | " False | \n",
416 | "
\n",
417 | " \n",
418 | " | 4 | \n",
419 | " False | \n",
420 | " False | \n",
421 | " False | \n",
422 | " False | \n",
423 | " False | \n",
424 | " False | \n",
425 | " False | \n",
426 | " False | \n",
427 | " False | \n",
428 | " False | \n",
429 | " True | \n",
430 | " False | \n",
431 | "
\n",
432 | " \n",
433 | "
\n",
434 | "
"
435 | ],
436 | "text/plain": [
437 | " PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket \\\n",
438 | "0 False False False False False False False False False \n",
439 | "1 False False False False False False False False False \n",
440 | "2 False False False False False False False False False \n",
441 | "3 False False False False False False False False False \n",
442 | "4 False False False False False False False False False \n",
443 | "\n",
444 | " Fare Cabin Embarked \n",
445 | "0 False True False \n",
446 | "1 False False False \n",
447 | "2 False True False \n",
448 | "3 False False False \n",
449 | "4 False True False "
450 | ]
451 | },
452 | "execution_count": 5,
453 | "metadata": {},
454 | "output_type": "execute_result"
455 | }
456 | ],
457 | "source": [
458 | "df.isnull().head()"
459 | ]
460 | },
461 | {
462 | "cell_type": "code",
463 | "execution_count": 6,
464 | "metadata": {
465 | "colab": {
466 | "base_uri": "https://localhost:8080/",
467 | "height": 221
468 | },
469 | "colab_type": "code",
470 | "executionInfo": {
471 | "elapsed": 7759,
472 | "status": "ok",
473 | "timestamp": 1546957324047,
474 | "user": {
475 | "displayName": "Timothy Liu",
476 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
477 | "userId": "03413426750796061565"
478 | },
479 | "user_tz": -480
480 | },
481 | "id": "vWIOwpVApFwS",
482 | "outputId": "2b2917bc-e7d9-48a4-d121-1d4d6e087af3"
483 | },
484 | "outputs": [
485 | {
486 | "name": "stdout",
487 | "output_type": "stream",
488 | "text": [
489 | "Column PassengerId has null value is: False\n",
490 | "Column Survived has null value is: False\n",
491 | "Column Pclass has null value is: False\n",
492 | "Column Name has null value is: False\n",
493 | "Column Sex has null value is: False\n",
494 | "Column Age has null value is: True\n",
495 | "Column SibSp has null value is: False\n",
496 | "Column Parch has null value is: False\n",
497 | "Column Ticket has null value is: False\n",
498 | "Column Fare has null value is: False\n",
499 | "Column Cabin has null value is: True\n",
500 | "Column Embarked has null value is: True\n"
501 | ]
502 | }
503 | ],
504 | "source": [
505 | "for col in df.columns:\n",
506 | " col_has_null = df[col].isnull().values.any()\n",
507 | " print(\"Column\", col, \"has null value is:\", col_has_null)"
508 | ]
509 | },
510 | {
511 | "cell_type": "code",
512 | "execution_count": 7,
513 | "metadata": {
514 | "colab": {},
515 | "colab_type": "code",
516 | "id": "ihopIciiphHk"
517 | },
518 | "outputs": [],
519 | "source": [
520 | "# fill missing ages/fares with mean value\n",
521 | "\n",
522 | "df[\"Age\"].fillna(df[\"Age\"].mean(), inplace=True)\n",
523 | "df[\"Fare\"].fillna(df[\"Fare\"].mean(), inplace=True)\n",
524 | "\n",
525 | "# fill cabin and embarked with \"unknown\" category\n",
526 | "\n",
527 | "df[\"Cabin\"].fillna(\"\", inplace=True)\n",
528 | "df[\"Embarked\"].fillna(\"\", inplace=True)"
529 | ]
530 | },
531 | {
532 | "cell_type": "code",
533 | "execution_count": 8,
534 | "metadata": {
535 | "colab": {
536 | "base_uri": "https://localhost:8080/",
537 | "height": 306
538 | },
539 | "colab_type": "code",
540 | "executionInfo": {
541 | "elapsed": 8066,
542 | "status": "ok",
543 | "timestamp": 1546957324363,
544 | "user": {
545 | "displayName": "Timothy Liu",
546 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
547 | "userId": "03413426750796061565"
548 | },
549 | "user_tz": -480
550 | },
551 | "id": "GFjtirLNl8Wa",
552 | "outputId": "2113975c-42ab-4c10-e366-e3e515a330b5"
553 | },
554 | "outputs": [
555 | {
556 | "name": "stdout",
557 | "output_type": "stream",
558 | "text": [
559 | "<class 'pandas.core.frame.DataFrame'>\n",
560 | "RangeIndex: 891 entries, 0 to 890\n",
561 | "Data columns (total 12 columns):\n",
562 | "PassengerId 891 non-null int64\n",
563 | "Survived 891 non-null int64\n",
564 | "Pclass 891 non-null int64\n",
565 | "Name 891 non-null object\n",
566 | "Sex 891 non-null object\n",
567 | "Age 891 non-null float64\n",
568 | "SibSp 891 non-null int64\n",
569 | "Parch 891 non-null int64\n",
570 | "Ticket 891 non-null object\n",
571 | "Fare 891 non-null float64\n",
572 | "Cabin 891 non-null object\n",
573 | "Embarked 891 non-null object\n",
574 | "dtypes: float64(2), int64(5), object(5)\n",
575 | "memory usage: 83.6+ KB\n"
576 | ]
577 | }
578 | ],
579 | "source": [
580 | "df.info()"
581 | ]
582 | },
583 | {
584 | "cell_type": "code",
585 | "execution_count": 9,
586 | "metadata": {
587 | "colab": {
588 | "base_uri": "https://localhost:8080/",
589 | "height": 297
590 | },
591 | "colab_type": "code",
592 | "executionInfo": {
593 | "elapsed": 8062,
594 | "status": "ok",
595 | "timestamp": 1546957324365,
596 | "user": {
597 | "displayName": "Timothy Liu",
598 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
599 | "userId": "03413426750796061565"
600 | },
601 | "user_tz": -480
602 | },
603 | "id": "PmZfjwa0ma3T",
604 | "outputId": "770ef5fd-b69d-42d6-f82c-7b1aa7bac759"
605 | },
606 | "outputs": [
607 | {
608 | "data": {
609 | "text/html": [
610 | "\n",
611 | "\n",
624 | "
\n",
625 | " \n",
626 | " \n",
627 | " | \n",
628 | " PassengerId | \n",
629 | " Survived | \n",
630 | " Pclass | \n",
631 | " Age | \n",
632 | " SibSp | \n",
633 | " Parch | \n",
634 | " Fare | \n",
635 | "
\n",
636 | " \n",
637 | " \n",
638 | " \n",
639 | " | count | \n",
640 | " 891.000000 | \n",
641 | " 891.000000 | \n",
642 | " 891.000000 | \n",
643 | " 891.000000 | \n",
644 | " 891.000000 | \n",
645 | " 891.000000 | \n",
646 | " 891.000000 | \n",
647 | "
\n",
648 | " \n",
649 | " | mean | \n",
650 | " 446.000000 | \n",
651 | " 0.383838 | \n",
652 | " 2.308642 | \n",
653 | " 29.699118 | \n",
654 | " 0.523008 | \n",
655 | " 0.381594 | \n",
656 | " 32.204208 | \n",
657 | "
\n",
658 | " \n",
659 | " | std | \n",
660 | " 257.353842 | \n",
661 | " 0.486592 | \n",
662 | " 0.836071 | \n",
663 | " 13.002015 | \n",
664 | " 1.102743 | \n",
665 | " 0.806057 | \n",
666 | " 49.693429 | \n",
667 | "
\n",
668 | " \n",
669 | " | min | \n",
670 | " 1.000000 | \n",
671 | " 0.000000 | \n",
672 | " 1.000000 | \n",
673 | " 0.420000 | \n",
674 | " 0.000000 | \n",
675 | " 0.000000 | \n",
676 | " 0.000000 | \n",
677 | "
\n",
678 | " \n",
679 | " | 25% | \n",
680 | " 223.500000 | \n",
681 | " 0.000000 | \n",
682 | " 2.000000 | \n",
683 | " 22.000000 | \n",
684 | " 0.000000 | \n",
685 | " 0.000000 | \n",
686 | " 7.910400 | \n",
687 | "
\n",
688 | " \n",
689 | " | 50% | \n",
690 | " 446.000000 | \n",
691 | " 0.000000 | \n",
692 | " 3.000000 | \n",
693 | " 29.699118 | \n",
694 | " 0.000000 | \n",
695 | " 0.000000 | \n",
696 | " 14.454200 | \n",
697 | "
\n",
698 | " \n",
699 | " | 75% | \n",
700 | " 668.500000 | \n",
701 | " 1.000000 | \n",
702 | " 3.000000 | \n",
703 | " 35.000000 | \n",
704 | " 1.000000 | \n",
705 | " 0.000000 | \n",
706 | " 31.000000 | \n",
707 | "
\n",
708 | " \n",
709 | " | max | \n",
710 | " 891.000000 | \n",
711 | " 1.000000 | \n",
712 | " 3.000000 | \n",
713 | " 80.000000 | \n",
714 | " 8.000000 | \n",
715 | " 6.000000 | \n",
716 | " 512.329200 | \n",
717 | "
\n",
718 | " \n",
719 | "
\n",
720 | "
"
721 | ],
722 | "text/plain": [
723 | " PassengerId Survived Pclass Age SibSp \\\n",
724 | "count 891.000000 891.000000 891.000000 891.000000 891.000000 \n",
725 | "mean 446.000000 0.383838 2.308642 29.699118 0.523008 \n",
726 | "std 257.353842 0.486592 0.836071 13.002015 1.102743 \n",
727 | "min 1.000000 0.000000 1.000000 0.420000 0.000000 \n",
728 | "25% 223.500000 0.000000 2.000000 22.000000 0.000000 \n",
729 | "50% 446.000000 0.000000 3.000000 29.699118 0.000000 \n",
730 | "75% 668.500000 1.000000 3.000000 35.000000 1.000000 \n",
731 | "max 891.000000 1.000000 3.000000 80.000000 8.000000 \n",
732 | "\n",
733 | " Parch Fare \n",
734 | "count 891.000000 891.000000 \n",
735 | "mean 0.381594 32.204208 \n",
736 | "std 0.806057 49.693429 \n",
737 | "min 0.000000 0.000000 \n",
738 | "25% 0.000000 7.910400 \n",
739 | "50% 0.000000 14.454200 \n",
740 | "75% 0.000000 31.000000 \n",
741 | "max 6.000000 512.329200 "
742 | ]
743 | },
744 | "execution_count": 9,
745 | "metadata": {},
746 | "output_type": "execute_result"
747 | }
748 | ],
749 | "source": [
750 | "df.describe()"
751 | ]
752 | },
753 | {
754 | "cell_type": "code",
755 | "execution_count": 10,
756 | "metadata": {
757 | "colab": {
758 | "base_uri": "https://localhost:8080/",
759 | "height": 300
760 | },
761 | "colab_type": "code",
762 | "executionInfo": {
763 | "elapsed": 8059,
764 | "status": "ok",
765 | "timestamp": 1546957324366,
766 | "user": {
767 | "displayName": "Timothy Liu",
768 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
769 | "userId": "03413426750796061565"
770 | },
771 | "user_tz": -480
772 | },
773 | "id": "Ca4KtBwzmeoG",
774 | "outputId": "7cbbf65f-d284-441e-c3a6-a0a88bc35b8e"
775 | },
776 | "outputs": [
777 | {
778 | "name": "stdout",
779 | "output_type": "stream",
780 | "text": [
781 | "Survival ratio: 0.3838383838383838\n"
782 | ]
783 | },
784 | {
785 | "data": {
786 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYgAAAEKCAYAAAAIO8L1AAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADl0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uIDMuMC4zLCBodHRwOi8vbWF0cGxvdGxpYi5vcmcvnQurowAAD9ZJREFUeJzt3X+s3Xddx/Hna+0YwmCuaTdL29mFVLRTGNl1KvtDZMbVKHRBO4tMizYpf1SERIVOE0FJ44xIXJAZGwU6FLoCzlWCwCxWQBe2VgdbO+caNtbasnZDhBFTaX37x/k2O3Sfe3u63e89d73PR9J8v9/P+Xy+532Wm/Pa5/vrpKqQJOlU54y7AEnS7GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktQ0f9wFPBMLFy6s5cuXj7sMSXpW2bNnz2NVteh0/Z7VAbF8+XJ279497jIk6VklyVdG6echJklSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUtOz+k7q6XDFb90y7hI0C+35o18edwnS2DmDkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlq6jUgkjyc5N4k9yTZ3bUtSHJHkge75YVD/W9Isj/JA0mu6bM2SdLUZmIG8RNVdXlVTXTbm4CdVbUC2Nltk2QlsBa4DFgF3Jxk3gzUJ0lqGMchptXA1m59K3DtUPu2qjpWVQ8B+4Erx1CfJIn+A6KATyfZk2RD13ZxVR0G6JYXde1LgANDYw92bd8hyYYku5PsPnr0aI+lS9Lc1vdPjl5VVYeSXATckeTfp+ibRls9paFqC7AFYGJi4imvS5KmR68ziKo61C2PALcxOGT0aJLFAN3ySNf9ILBsaPhS4FCf9UmSJtdbQCR5fpIXnFwHfgq4D9gBrOu6rQNu79Z3AGuTnJfkUmAFcFdf9UmSptbnIaaLgduSnHyfD1XVJ5PcDWxPsh54BFgDUFV7k2wH9gHHgY1VdaLH+iRJU+gtIKrqy8DLGu2PA1dPMmYzsLmvmiRJo/NOaklSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKmp94BIMi/JvyX5eLe9IMkdSR7slhcO9b0hyf4kDyS5pu/aJEmTm4kZxJuB+4e2NwE7q2oFsLPbJslKYC1wGbAKuDnJvBmoT5LU0GtAJFkK/AzwF0PNq4Gt3fpW4Nqh9m1VdayqHgL2A1f2WZ8kaXJ9zyD+BHgr8H9DbRdX1WGAbnlR174EODDU72DXJkkag94CIsnPAkeqas+oQxpt1djvhiS7k+w+evToM6pRkjS5PmcQVwGvSfIwsA14VZK/Ah5NshigWx7p+h8Elg2NXwocOnWnVbWlqiaqamLRokU9li9Jc1tvAVFVN1TV0qpazuDk82eq6npgB7Cu67YOuL1b3wGsTXJekkuBFcBdfdUnSZra/DG8543A9iTrgUeANQBVtTfJdmAfcBzYWFUnxlCfJIkZCoiq2gXs6tYfB66epN9mYPNM1CRJmpp3UkuSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaxvGTo5JG8Mjv/9C4S9AsdMnv3jtj7+UMQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNY0UEEl2jtImSTp7THmjXJLnAs8DFia5EEj30guBF/VcmyRpjE53J/UbgbcwCIM9PBkQ
3wDe22NdkqQxmzIgquom4KYkb6qq98xQTZKkWWCkZzFV1XuSvAJYPjymqm6ZbEx3eOqzwHndmI9W1duTLABu7fb1MHBdVf1XN+YGYD1wAvj1qvrUmX8kSdJ0GCkgknwQeDFwD4Mvb4ACJg0I4Bjwqqp6Ism5wOeT/D3wWmBnVd2YZBOwCXhbkpXAWuAyBoe0/iHJ91XVicneQJLUn1Gf5joBrKyqGnXHXd8nus1zu38FrAZe2bVvBXYBb+vat1XVMeChJPuBK4E7R31PSdL0GfU+iPuA7znTnSeZl+Qe4AhwR1V9Abi4qg4DdMuLuu5LgANDww92bZKkMRh1BrEQ2JfkLgaHjgCoqtdMNag7PHR5ku8Gbkvyg1N0T6PtKTOWJBuADQCXXHLJCKVLkp6OUQPiHc/kTarq60l2AauAR5MsrqrDSRYzmF3AYMawbGjYUuBQY19bgC0AExMTIx/ykiSdmVGvYvqnM91xkkXAt7tw+C7gJ4E/BHYA64Abu+Xt3ZAdwIeSvJvBSeoVwF1n+r6SpOkx6lVM3+TJwz3PYXDC+VtV9cIphi0GtiaZx+Bcx/aq+niSO4HtSdYDjwBrAKpqb5LtwD7gOLDRK5gkaXxGnUG8YHg7ybUMrjCaasyXgJc32h8Hrp5kzGZg8yg1SZL69bSe5lpVfwu8apprkSTNIqMeYnrt0OY5DO6L8ASxJJ3FRr2K6dVD68cZPCJj9bRXI0maNUY9B/ErfRciSZpdRv3BoKVJbktyJMmjST6WZGnfxUmSxmfUk9TvZ3CfwosYPP7i77o2SdJZatSAWFRV76+q492/DwCLeqxLkjRmowbEY0mu7x6+Ny/J9cDjfRYmSRqvUQPiV4HrgK8Ch4GfBzxxLUlnsVEvc30nsG7ol98WAO9iEBySpLPQqDOIl54MB4Cq+hqNx2hIks4eowbEOUkuPLnRzSBGnX1Ikp6FRv2S/2PgX5J8lMEjNq7Dh+pJ0llt1Dupb0mym8ED+gK8tqr29VqZJGmsRj5M1AWCoSBJc8TTety3JOnsZ0BIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpqbeASLIsyT8muT/J3iRv7toXJLkjyYPdcviX6m5Isj/JA0mu6as2SdLp9TmDOA78RlX9APCjwMYkK4FNwM6qWgHs7LbpXlsLXAasAm5OMq/H+iRJU+gtIKrqcFX9a7f+TeB+YAmwGtjaddsKXNutrwa2VdWxqnoI2A9c2Vd9kqSpzcg5iCTLgZcDXwAurqrDMAgR4KKu2xLgwNCwg13bqfvakGR3kt1Hjx7ts2xJmtN6D4gk5wMfA95SVd+YqmujrZ7SULWlqiaqamLRokXTVaYk6RS9BkSScxmEw19X1d90zY8mWdy9vhg40rUfBJYNDV8KHOqzPknS5Pq8iinAXwL3V9W7h17aAazr1tcBtw+1r01yXpJLgRXAXX3VJ0ma2vwe930V8EvAvUnu6dp+G7gR2J5kPfAIsAagqvYm2Q7sY3AF1MaqOtFjfZKkKfQWEFX1edrnFQCunmTMZmBzXzVJkkbnndSSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSU28BkeR9SY4kuW+obUGSO5I82C0vHHrthiT7kzyQ5Jq+6pIkjabPGcQHgFWntG0CdlbVCmBnt02SlcBa4LJuzM1J5vVYmyTpNHoLiKr6LPC1U5pXA1u79a3AtUPt26rqWFU9BOwHruyrNknS6c30OYiLq+owQLe8qGtfAhwY6newa5MkjclsOUmdRls1OyYbkuxOsvvo0aM9lyVJc9dMB8SjSRYDdMsjXftBYNlQ
v6XAodYOqmpLVU1U1cSiRYt6LVaS5rKZDogdwLpufR1w+1D72iTnJbkUWAHcNcO1SZKGzO9rx0k+DLwSWJjkIPB24EZge5L1wCPAGoCq2ptkO7APOA5srKoTfdUmSTq93gKiql43yUtXT9J/M7C5r3okSWdmtpykliTNMgaEJKnJgJAkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSpyYCQJDUZEJKkJgNCktRkQEiSmgwISVKTASFJajIgJElNBoQkqcmAkCQ1GRCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICRJTQaEJKnJgJAkNRkQkqQmA0KS1DTrAiLJqiQPJNmfZNO465GkuWpWBUSSecB7gZ8GVgKvS7JyvFVJ0tw0qwICuBLYX1Vfrqr/BbYBq8dckyTNSbMtIJYAB4a2D3ZtkqQZNn/cBZwijbb6jg7JBmBDt/lEkgd6r2ruWAg8Nu4iZoO8a924S9B38m/zpLe3vibP2PeO0mm2BcRBYNnQ9lLg0HCHqtoCbJnJouaKJLuramLcdUin8m9zPGbbIaa7gRVJLk3yHGAtsGPMNUnSnDSrZhBVdTzJrwGfAuYB76uqvWMuS5LmpFkVEABV9QngE+OuY47y0J1mK/82xyBVdfpekqQ5Z7adg5AkzRIGhHy8iWatJO9LciTJfeOuZS4yIOY4H2+iWe4DwKpxFzFXGRDy8Saatarqs8DXxl3HXGVAyMebSGoyIHTax5tImpsMCJ328SaS5iYDQj7eRFKTATHHVdVx4OTjTe4Htvt4E80WST4M3Am8JMnBJOvHXdNc4p3UkqQmZxCSpCYDQpLUZEBIkpoMCElSkwEhSWoyICQgye8k2ZvkS0nuSfIj07DP10zX03GTPDEd+5HOhJe5as5L8mPAu4FXVtWxJAuB51TVae8oTzK/u5ek7xqfqKrz+34faZgzCAkWA49V1TGAqnqsqg4lebgLC5JMJNnVrb8jyZYknwZuSfKFJJed3FmSXUmuSPKGJH+a5IJuX+d0rz8vyYEk5yZ5cZJPJtmT5HNJvr/rc2mSO5PcneSdM/zfQwIMCAng08CyJP+R5OYkPz7CmCuA1VX1iwwekX4dQJLFwIuqas/JjlX138AXgZP7fTXwqar6NoPfWn5TVV0B/CZwc9fnJuDPquqHga8+408oPQ0GhOa8qnqCwRf+BuAocGuSN5xm2I6q+p9ufTuwplu/DvhIo/+twC9062u79zgfeAXwkST3AH/OYDYDcBXw4W79g2f0gaRpMn/cBUizQVWdAHYBu5LcC6wDjvPk/0Q995Qh3xoa+59JHk/yUgYh8MbGW+wA/iDJAgZh9Bng+cDXq+ryycp6mh9HmhbOIDTnJXlJkhVDTZcDXwEeZvBlDvBzp9nNNuCtwAVVde+pL3azlLsYHDr6eFWdqKpvAA8lWdPVkSQv64b8M4OZBsDrz/xTSc+cASHB+cDWJPuSfInBb3O/A/g94KYknwNOnGYfH2Xwhb59ij63Atd3y5NeD6xP8kVgL0/+3OubgY1J7gYuOLOPI00PL3OVJDU5g5AkNRkQkqQmA0KS1GRASJKaDAhJUpMBIUlqMiAkSU0GhCSp6f8BOOTYJ3/9PtYAAAAASUVORK5CYII=\n",
787 | "text/plain": [
788 | "<Figure size 432x288 with 1 Axes>"
789 | ]
790 | },
791 | "metadata": {
792 | "needs_background": "light"
793 | },
794 | "output_type": "display_data"
795 | }
796 | ],
797 | "source": [
798 | "ratio = df.Survived.sum()/df.Survived.count()\n",
799 | "print(\"Survival ratio:\", ratio)\n",
800 | "\n",
801 | "seaborn.countplot(x='Survived', data=df);"
802 | ]
803 | },
804 | {
805 | "cell_type": "markdown",
806 | "metadata": {
807 | "colab_type": "text",
808 | "id": "-kRnCezfnMeZ"
809 | },
810 | "source": [
811 | "### Let's guess: did more females survive?"
812 | ]
813 | },
814 | {
815 | "cell_type": "code",
816 | "execution_count": 11,
817 | "metadata": {
818 | "colab": {
819 | "base_uri": "https://localhost:8080/",
820 | "height": 369
821 | },
822 | "colab_type": "code",
823 | "executionInfo": {
824 | "elapsed": 8336,
825 | "status": "ok",
826 | "timestamp": 1546957324649,
827 | "user": {
828 | "displayName": "Timothy Liu",
829 | "photoUrl": "https://lh4.googleusercontent.com/-dGSoF3PTUms/AAAAAAAAAAI/AAAAAAAAEjo/onM3a4Ivxls/s64/photo.jpg",
830 | "userId": "03413426750796061565"
831 | },
832 | "user_tz": -480
833 | },
834 | "id": "bcqRN18ZnH-y",
835 | "outputId": "40c060cc-9ef6-4500-fca9-79ba4d9072fb"
836 | },
837 | "outputs": [
838 | {
839 | "data": {
840 | "image/png": "\n",
841 | "text/plain": [
842 | "<Figure size 720x360 with 2 Axes>"
843 | ]
844 | },
845 | "metadata": {
846 | "needs_background": "light"
847 | },
848 | "output_type": "display_data"
849 | }
850 | ],
851 | "source": [
852 | "seaborn.catplot(x='Sex', col='Survived', kind='count', data=df);"
853 | ]
854 | },
855 | {
856 | "cell_type": "code",
857 | "execution_count": null,
858 | "metadata": {
859 | "colab": {},
860 | "colab_type": "code",
861 | "id": "uGcm_1qhnJAg"
862 | },
863 | "outputs": [],
864 | "source": []
865 | }
866 | ],
867 | "metadata": {
868 | "colab": {
869 | "collapsed_sections": [],
870 | "name": "4 - EDA Titanic.ipynb",
871 | "provenance": [],
872 | "version": "0.3.2"
873 | },
874 | "kernelspec": {
875 | "display_name": "Python 3",
876 | "language": "python",
877 | "name": "python3"
878 | },
879 | "language_info": {
880 | "codemirror_mode": {
881 | "name": "ipython",
882 | "version": 3
883 | },
884 | "file_extension": ".py",
885 | "mimetype": "text/x-python",
886 | "name": "python",
887 | "nbconvert_exporter": "python",
888 | "pygments_lexer": "ipython3",
889 | "version": "3.6.7"
890 | }
891 | },
892 | "nbformat": 4,
893 | "nbformat_minor": 1
894 | }
895 |
--------------------------------------------------------------------------------
/day_3/Sample - Loading and Visualising NLP Data.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {
7 | "colab": {
8 | "base_uri": "https://localhost:8080/",
9 | "height": 34
10 | },
11 | "colab_type": "code",
12 | "executionInfo": {
13 | "elapsed": 2053,
14 | "status": "ok",
15 | "timestamp": 1547020080104,
16 | "user": {
17 | "displayName": "Soh Jun De",
18 | "photoUrl": "",
19 | "userId": "15246530694083866298"
20 | },
21 | "user_tz": -480
22 | },
23 | "id": "3c_NsS9gS8Vl",
24 | "outputId": "8f0ac2ac-168a-43df-fd0c-2571e40f162f"
25 | },
26 | "outputs": [
27 | {
28 | "name": "stderr",
29 | "output_type": "stream",
30 | "text": [
31 | "Using TensorFlow backend.\n"
32 | ]
33 | }
34 | ],
35 | "source": [
36 | "import csv\n",
37 | "import numpy as np\n",
38 | "import matplotlib.pyplot as plt\n",
39 | "\n",
40 | "import keras\n",
41 | "\n",
42 | "%matplotlib inline\n",
43 | "#%config InlineBackend.figure_format = 'retina'"
44 | ]
45 | },
46 | {
47 | "cell_type": "code",
48 | "execution_count": 2,
49 | "metadata": {
50 | "colab": {
51 | "base_uri": "https://localhost:8080/",
52 | "height": 51
53 | },
54 | "colab_type": "code",
55 | "executionInfo": {
56 | "elapsed": 9369,
57 | "status": "ok",
58 | "timestamp": 1547020087422,
59 | "user": {
60 | "displayName": "Soh Jun De",
61 | "photoUrl": "",
62 | "userId": "15246530694083866298"
63 | },
64 | "user_tz": -480
65 | },
66 | "id": "KdCxcCmwUFHG",
67 | "outputId": "8c01ebcd-0c8c-4810-f9cc-0c89285e1f1f"
68 | },
69 | "outputs": [
70 | {
71 | "name": "stdout",
72 | "output_type": "stream",
73 | "text": [
74 | "Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz\n",
75 | "17465344/17464789 [==============================] - 16s 1us/step\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "from keras.datasets import imdb\n",
81 | "\n",
82 | "(x_train, y_train), (x_test, y_test) = imdb.load_data(path=\"imdb.npz\", num_words=None,\n",
83 | " skip_top=0, maxlen=None,\n",
84 | " start_char=1, oov_char=2,\n",
85 | " index_from=3)"
86 | ]
87 | },
88 | {
89 | "cell_type": "code",
90 | "execution_count": 3,
91 | "metadata": {
92 | "colab": {
93 | "base_uri": "https://localhost:8080/",
94 | "height": 88
95 | },
96 | "colab_type": "code",
97 | "executionInfo": {
98 | "elapsed": 9354,
99 | "status": "ok",
100 | "timestamp": 1547020087423,
101 | "user": {
102 | "displayName": "Soh Jun De",
103 | "photoUrl": "",
104 | "userId": "15246530694083866298"
105 | },
106 | "user_tz": -480
107 | },
108 | "id": "DFpqExwBUYyd",
109 | "outputId": "cadf9037-e4fa-4714-f77b-f4f8a4a83ccb"
110 | },
111 | "outputs": [
112 | {
113 | "name": "stdout",
114 | "output_type": "stream",
115 | "text": [
116 | "Training set: (25000,) (25000,)\n",
117 | "Evaluation set: (25000,) (25000,)\n",
118 | "Example: [1, 14, 22, 16, 43, 530, 973, 1622, 1385, 65, 458, 4468, 66, 3941, 4, 173, 36, 256, 5, 25, 100, 43, 838, 112, 50, 670, 22665, 9, 35, 480, 284, 5, 150, 4, 172, 112, 167, 21631, 336, 385, 39, 4, 172, 4536, 1111, 17, 546, 38, 13, 447, 4, 192, 50, 16, 6, 147, 2025, 19, 14, 22, 4, 1920, 4613, 469, 4, 22, 71, 87, 12, 16, 43, 530, 38, 76, 15, 13, 1247, 4, 22, 17, 515, 17, 12, 16, 626, 18, 19193, 5, 62, 386, 12, 8, 316, 8, 106, 5, 4, 2223, 5244, 16, 480, 66, 3785, 33, 4, 130, 12, 16, 38, 619, 5, 25, 124, 51, 36, 135, 48, 25, 1415, 33, 6, 22, 12, 215, 28, 77, 52, 5, 14, 407, 16, 82, 10311, 8, 4, 107, 117, 5952, 15, 256, 4, 31050, 7, 3766, 5, 723, 36, 71, 43, 530, 476, 26, 400, 317, 46, 7, 4, 12118, 1029, 13, 104, 88, 4, 381, 15, 297, 98, 32, 2071, 56, 26, 141, 6, 194, 7486, 18, 4, 226, 22, 21, 134, 476, 26, 480, 5, 144, 30, 5535, 18, 51, 36, 28, 224, 92, 25, 104, 4, 226, 65, 16, 38, 1334, 88, 12, 16, 283, 5, 16, 4472, 113, 103, 32, 15, 16, 5345, 19, 178, 32] 1\n"
119 | ]
120 | }
121 | ],
122 | "source": [
123 | "print(\"Training set:\", x_train.shape, y_train.shape)\n",
124 | "print(\"Evaluation set:\", x_test.shape, y_test.shape)\n",
125 | "\n",
126 | "print(\"Example:\", x_train[0], y_train[0])"
127 | ]
128 | },
129 | {
130 | "cell_type": "code",
131 | "execution_count": 4,
132 | "metadata": {
133 | "colab": {},
134 | "colab_type": "code",
135 | "id": "yRFVdzD0TpY6"
136 | },
137 | "outputs": [],
138 | "source": [
139 | "def plot_distribution(data, labels):\n",
140 | " plt.style.use('ggplot')\n",
141 | " \n",
142 | " # distribution of text lengths\n",
143 | " lengths = np.array([len(row) for row in data])\n",
144 | " summary = \"mean: \"+str(int(np.mean(lengths)))+\" , min/max: \"+str(np.min(lengths))+\"/\"+str(np.max(lengths))+\" (95%: \"+ str(round(np.percentile(lengths, 95), 2)) + \")\"\n",
145 | " plt.figure(1, figsize=(10,6))\n",
146 | " plt.hist(lengths, bins='auto')\n",
147 | " plt.title(\"Distribution of text lengths\")\n",
148 | " plt.xlabel(\"Text Length: \" + summary); plt.ylabel(\"Examples\")\n",
149 | " plt.axvline(np.mean(lengths), ls=\"-\", color=\"k\")\n",
150 | " plt.axvline(np.percentile(lengths, 95), ls=\"--\", color=\"k\")\n",
151 | " plt.xlim(0, int(np.percentile(lengths, 99)))\n",
152 | " plt.show()\n",
153 | " \n",
154 | " # distribution of label counts\n",
155 | " labels = [str(label) for label in labels]\n",
156 | " plt.figure(2, figsize=(10,4))\n",
157 | " plt.hist(labels, bins='auto')\n",
158 | " plt.title(\"Distribution of labels\")\n",
159 | " plt.xlabel(\"Labels\"); plt.ylabel(\"Examples\")\n",
160 | " plt.show()"
161 | ]
162 | },
163 | {
164 | "cell_type": "code",
165 | "execution_count": 5,
166 | "metadata": {
167 | "colab": {
168 | "base_uri": "https://localhost:8080/",
169 | "height": 688
170 | },
171 | "colab_type": "code",
172 | "executionInfo": {
173 | "elapsed": 10386,
174 | "status": "ok",
175 | "timestamp": 1547020088465,
176 | "user": {
177 | "displayName": "Soh Jun De",
178 | "photoUrl": "",
179 | "userId": "15246530694083866298"
180 | },
181 | "user_tz": -480
182 | },
183 | "id": "5VNVKzXsVEBM",
184 | "outputId": "98baa8c2-086a-423e-dbfb-dc60602757a9"
185 | },
186 | "outputs": [
187 | {
188 | "data": {
189 | "image/png": "\n",
190 | "text/plain": [
191 | ""
192 | ]
193 | },
194 | "metadata": {},
195 | "output_type": "display_data"
196 | },
197 | {
198 | "data": {
199 | "image/png": "\n",
200 | "text/plain": [
201 | ""
202 | ]
203 | },
204 | "metadata": {},
205 | "output_type": "display_data"
206 | }
207 | ],
208 | "source": [
209 | "plot_distribution(x_train, y_train)"
210 | ]
211 | },
212 | {
213 | "cell_type": "code",
214 | "execution_count": 6,
215 | "metadata": {
216 | "colab": {
217 | "base_uri": "https://localhost:8080/",
218 | "height": 688
219 | },
220 | "colab_type": "code",
221 | "executionInfo": {
222 | "elapsed": 11487,
223 | "status": "ok",
224 | "timestamp": 1547020089577,
225 | "user": {
226 | "displayName": "Soh Jun De",
227 | "photoUrl": "",
228 | "userId": "15246530694083866298"
229 | },
230 | "user_tz": -480
231 | },
232 | "id": "5kRn4fohVhCo",
233 | "outputId": "044303f3-d758-4f97-faa3-0a9d84fdda68"
234 | },
235 | "outputs": [
236 | {
237 | "data": {
238 | "image/png": "\n",
239 | "text/plain": [
240 | ""
241 | ]
242 | },
243 | "metadata": {},
244 | "output_type": "display_data"
245 | },
246 | {
247 | "data": {
248 | "image/png": "\n",
249 | "text/plain": [
250 | ""
251 | ]
252 | },
253 | "metadata": {},
254 | "output_type": "display_data"
255 | }
256 | ],
257 | "source": [
258 | "plot_distribution(x_test, y_test)"
259 | ]
260 | },
261 | {
262 | "cell_type": "code",
263 | "execution_count": null,
264 | "metadata": {
265 | "colab": {},
266 | "colab_type": "code",
267 | "id": "RE0SK-ERV00G"
268 | },
269 | "outputs": [],
270 | "source": []
271 | }
272 | ],
273 | "metadata": {
274 | "colab": {
275 | "collapsed_sections": [],
276 | "name": "2_Loading and visualisation notebook.ipynb",
277 | "provenance": [],
278 | "version": "0.3.2"
279 | },
280 | "kernelspec": {
281 | "display_name": "Python 3",
282 | "language": "python",
283 | "name": "python3"
284 | },
285 | "language_info": {
286 | "codemirror_mode": {
287 | "name": "ipython",
288 | "version": 3
289 | },
290 | "file_extension": ".py",
291 | "mimetype": "text/x-python",
292 | "name": "python",
293 | "nbconvert_exporter": "python",
294 | "pygments_lexer": "ipython3",
295 | "version": "3.6.7"
296 | }
297 | },
298 | "nbformat": 4,
299 | "nbformat_minor": 1
300 | }
301 |
--------------------------------------------------------------------------------