├── Few-shot
    └── SetFit_SST_2_Few_shot.ipynb
├── OCR-Document-Processing
    └── Doc_Visual_QA_and_Bill_extraction_demo.ipynb
├── README.md
├── classification
    ├── BERT_Fine_Tuning_Sentence_Classification_v2.ipynb
    ├── Bert_Classification_Pt.ipynb
    ├── Generic_Transformer_Classification.ipynb
    ├── large_scale_multilabelclassification.ipynb
    └── transformers_multilabel_text_classification_with_problem_type.ipynb
├── docs
    ├── BERT_Fine_Tuning_Sentence_Classification_v2.md
    ├── BERT_Fine_Tuning_Sentence_Classification_v2_files
    │   └── BERT_Fine_Tuning_Sentence_Classification_v2_92_0.png
    ├── Bert_Classification_Pt.md
    ├── Bert_Classification_Pt_files
    │   └── Bert_Classification_Pt_28_0.png
    ├── Bert_Pre_Training.md
    ├── Doc_Visual_QA_and_Bill_extraction_demo.md
    ├── Doc_Visual_QA_and_Bill_extraction_demo_files
    │   ├── Doc_Visual_QA_and_Bill_extraction_demo_15_0.png
    │   ├── Doc_Visual_QA_and_Bill_extraction_demo_26_0.png
    │   └── Doc_Visual_QA_and_Bill_extraction_demo_35_0.png
    ├── GPT_2_on_Onnx_CPU.md
    ├── Generic_Transformer_Classification.md
    ├── Question_Answering_with_a_Fine_Tuned_BERT.md
    ├── Question_Answering_with_a_Fine_Tuned_BERT_files
    │   ├── Question_Answering_with_a_Fine_Tuned_BERT_44_0.png
    │   ├── Question_Answering_with_a_Fine_Tuned_BERT_46_0.png
    │   └── Question_Answering_with_a_Fine_Tuned_BERT_49_0.png
    ├── Seq2Seq_Pytorch.md
    ├── SetFit_SST_2_Few_shot.md
    ├── SetFit_SST_2_Few_shot_files
    │   └── SetFit_SST_2_Few_shot_22_0.png
    ├── Simpletransformers_2.md
    ├── TAPAS_fine_tuning_in_tf.md
    ├── Using_Transformers_with_Fastai_Tutorial.md
    ├── Wikipedia_answer_retrieval_DPR.md
    ├── contextual_topic_modeling.md
    ├── index.md
    ├── knowledge_distillation_exploration.md
    ├── large_scale_multilabelclassification.md
    ├── simpletransformers_intro.md
    ├── token_classification_transformers_zenml.md
    └── token_classification_transformers_zenml_files
    │   └── token_classification_transformers_zenml_36_1.svg
├── knowledge-distillation
    └── knowledge_distillation_exploration.ipynb
├── machine-translation
    └── Seq2Seq_Pytorch.ipynb
├── mkdocs.yml
├── nlp-onnx
    └── GPT_2_on_Onnx_CPU.ipynb
├── nlp-package-exploration
    ├── Simpletransformers_2.ipynb
    ├── Using_Transformers_with_Fastai_Tutorial.ipynb
    └── simpletransformers_intro.ipynb
├── opendomain-qa
    └── Wikipedia_answer_retrieval_DPR.ipynb
├── pretraining
    └── Bert_Pre_Training.ipynb
├── question-answering
    └── Question_Answering_with_a_Fine_Tuned_BERT.ipynb
├── table-qa
    └── TAPAS_fine_tuning_in_tf.ipynb
├── token-classification
    └── token_classification_transformers_zenml.ipynb
└── topic-modeling
    └── contextual_topic_modeling.ipynb


/README.md:
--------------------------------------------------------------------------------
 1 | # NLP Notebooks
 2 | 
 3 | ![NLP](https://www.upwork.com/catalog-images/c0717a4a34e39d0ff4391b01b6898cd1)
 4 | 
 5 | ## Pretraining
 6 | 
 7 | * BERT Mask Language Modeling | Pretraining[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/pretraining/Bert_Pre_Training.ipynb)
 8 | 
 9 | ## Classification
10 | 
11 | * BERT Classification in Pytorch[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/classification/Bert_Classification_Pt.ipynb)
12 | 
13 | * BERT Sentence Classification 2 in Pytorch[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/classification/BERT_Fine_Tuning_Sentence_Classification_v2.ipynb)
14 | 
15 | * Generic Class for Classification using transformers[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/classification/Generic_Transformer_Classification.ipynb)
16 | 
17 | * Large Scale Multi Label Classification[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/classification/large_scale_multilabelclassification.ipynb)
18 | 
19 | ## Question Answering
20 | 
21 | * Question Answering with a Fine Tuned BERT [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/question-answering/Question_Answering_with_a_Fine_Tuned_BERT.ipynb)
22 | 
23 | 
24 | ## Machine Translation
25 | 
26 | * Machine Translation using transformers [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/machine-translation/Seq2Seq_Pytorch.ipynb)
27 | 
28 | 
29 | ## Topic Modeling
30 | 
31 | * Contextual Topic Modeling [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/topic-modeling/contextual_topic_modeling.ipynb)
32 | 
33 | 
34 | ## Knowledge Distillation
35 | 
36 | * BERT Knowledge Distillation in LSTM [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/knowledge-distillation/knowledge_distillation_exploration.ipynb)
37 | 
38 | 
39 | ## NLP Package Exploration
40 | 
41 | * Simpletransformers Exploration [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/nlp-package-exploration/Simpletransformers_2.ipynb)
42 | 
43 | * Fastai + Transformers Exploration [![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/nlp-package-exploration/Using_Transformers_with_Fastai_Tutorial.ipynb)
44 | 


--------------------------------------------------------------------------------
/docs/BERT_Fine_Tuning_Sentence_Classification_v2_files/BERT_Fine_Tuning_Sentence_Classification_v2_92_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/BERT_Fine_Tuning_Sentence_Classification_v2_files/BERT_Fine_Tuning_Sentence_Classification_v2_92_0.png


--------------------------------------------------------------------------------
/docs/Bert_Classification_Pt_files/Bert_Classification_Pt_28_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/Bert_Classification_Pt_files/Bert_Classification_Pt_28_0.png


--------------------------------------------------------------------------------
/docs/Bert_Pre_Training.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/Bert_Pre_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | 
  4 | ```
  5 | !pip install transformers
  6 | ```
  7 | 
  8 |     Collecting transformers
  9 |     [?25l  Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)
 10 |     [K     |████████████████████████████████| 890kB 3.4MB/s 
 11 |     [?25hCollecting sacremoses
 12 |     [?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
 13 |     [K     |████████████████████████████████| 890kB 16.3MB/s 
 14 |     [?25hRequirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)
 15 |     Requirement already satisfied: dataclasses; python_version < "3.7" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)
 16 |     Collecting tokenizers==0.8.1.rc2
 17 |     [?25l  Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
 18 |     [K     |████████████████████████████████| 3.0MB 25.3MB/s 
 19 |     [?25hRequirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)
 20 |     Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)
 21 |     Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
 22 |     Collecting sentencepiece!=0.1.92
 23 |     [?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
 24 |     [K     |████████████████████████████████| 1.1MB 44.6MB/s 
 25 |     [?25hRequirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)
 26 |     Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
 27 |     Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)
 28 |     Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)
 29 |     Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.16.0)
 30 |     Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
 31 |     Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)
 32 |     Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)
 33 |     Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
 34 |     Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)
 35 |     Building wheels for collected packages: sacremoses
 36 |       Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
 37 |       Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=b4d9f604b99e77f4dc2b8892460fc931726fa7d37c4fc51dfcb98c01d6d08797
 38 |       Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
 39 |     Successfully built sacremoses
 40 |     Installing collected packages: sacremoses, tokenizers, sentencepiece, transformers
 41 |     Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.8.1rc2 transformers-3.1.0
 42 | 
 43 | 
 44 | 
 45 | ```
 46 | MAX_LEN = 128
 47 | BATCH_SIZE = 16 # per TPU core
 48 | TOTAL_STEPS = 2000  # thats approx 4 epochs
 49 | EVALUATE_EVERY = 200
 50 | LR =  1e-5
 51 | 
 52 | PRETRAINED_MODEL = 'bert-base-uncased'
 53 | 
 54 | 
 55 | import os
 56 | import numpy as np
 57 | import pandas as pd
 58 | import tensorflow as tf
 59 | print(tf.__version__)
 60 | from tensorflow.keras.optimizers import Adam
 61 | import transformers
 62 | from transformers import TFAutoModelWithLMHead, AutoTokenizer
 63 | import logging
 64 | 
 65 | AUTO = tf.data.experimental.AUTOTUNE
 66 | ```
 67 | 
 68 |     2.3.0
 69 | 
 70 | 
 71 | 
 72 | ```
 73 | def connect_to_TPU():
 74 |     """Detect hardware, return appropriate distribution strategy"""
 75 |     try:
 76 |         # TPU detection. No parameters necessary if TPU_NAME environment variable is
 77 |         # set: this is always the case on Kaggle.
 78 |         tpu = tf.distribute.cluster_resolver.TPUClusterResolver()
 79 |         print('Running on TPU ', tpu.master())
 80 |     except ValueError:
 81 |         tpu = None
 82 | 
 83 |     if tpu:
 84 |         tf.config.experimental_connect_to_cluster(tpu)
 85 |         tf.tpu.experimental.initialize_tpu_system(tpu)
 86 |         strategy = tf.distribute.experimental.TPUStrategy(tpu)
 87 |     else:
 88 |         # Default distribution strategy in Tensorflow. Works on CPU and single GPU.
 89 |         strategy = tf.distribute.get_strategy()
 90 | 
 91 |     global_batch_size = BATCH_SIZE * strategy.num_replicas_in_sync
 92 | 
 93 |     return tpu, strategy, global_batch_size
 94 | 
 95 | 
 96 | tpu, strategy, global_batch_size = connect_to_TPU()
 97 | print("REPLICAS: ", strategy.num_replicas_in_sync)
 98 | ```
 99 | 
100 |     INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0
101 | 
102 | 
103 |     Running on TPU  grpc://10.19.232.114:8470
104 |     INFO:tensorflow:Initializing the TPU system: grpc://10.19.232.114:8470
105 | 
106 | 
107 |     INFO:tensorflow:Initializing the TPU system: grpc://10.19.232.114:8470
108 | 
109 | 
110 |     INFO:tensorflow:Clearing out eager caches
111 | 
112 | 
113 |     INFO:tensorflow:Clearing out eager caches
114 | 
115 | 
116 |     INFO:tensorflow:Finished initializing TPU system.
117 | 
118 | 
119 |     INFO:tensorflow:Finished initializing TPU system.
120 |     WARNING:absl:`tf.distribute.experimental.TPUStrategy` is deprecated, please use  the non experimental symbol `tf.distribute.TPUStrategy` instead.
121 | 
122 | 
123 |     INFO:tensorflow:Found TPU system:
124 | 
125 | 
126 |     INFO:tensorflow:Found TPU system:
127 | 
128 | 
129 |     INFO:tensorflow:*** Num TPU Cores: 8
130 | 
131 | 
132 |     INFO:tensorflow:*** Num TPU Cores: 8
133 | 
134 | 
135 |     INFO:tensorflow:*** Num TPU Workers: 1
136 | 
137 | 
138 |     INFO:tensorflow:*** Num TPU Workers: 1
139 | 
140 | 
141 |     INFO:tensorflow:*** Num TPU Cores Per Worker: 8
142 | 
143 | 
144 |     INFO:tensorflow:*** Num TPU Cores Per Worker: 8
145 | 
146 | 
147 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
148 | 
149 | 
150 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)
151 | 
152 | 
153 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
154 | 
155 | 
156 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
157 | 
158 | 
159 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)
160 | 
161 | 
162 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)
163 | 
164 | 
165 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)
166 | 
167 | 
168 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)
169 | 
170 | 
171 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)
172 | 
173 | 
174 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)
175 | 
176 | 
177 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)
178 | 
179 | 
180 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)
181 | 
182 | 
183 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)
184 | 
185 | 
186 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)
187 | 
188 | 
189 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)
190 | 
191 | 
192 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)
193 | 
194 | 
195 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)
196 | 
197 | 
198 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)
199 | 
200 | 
201 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)
202 | 
203 | 
204 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)
205 | 
206 | 
207 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)
208 | 
209 | 
210 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)
211 | 
212 | 
213 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)
214 | 
215 | 
216 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)
217 | 
218 | 
219 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
220 | 
221 | 
222 |     INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
223 | 
224 | 
225 |     REPLICAS:  8
226 | 
227 | 
228 | 
229 | ```
230 | !wget https://raw.githubusercontent.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/master/imdb_tr.csv
231 | ```
232 | 
233 |     --2020-09-02 10:33:57--  https://raw.githubusercontent.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/master/imdb_tr.csv
234 |     Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
235 |     Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
236 |     HTTP request sent, awaiting response... 200 OK
237 |     Length: 23677025 (23M) [text/plain]
238 |     Saving to: ‘imdb_tr.csv’
239 |     
240 |     imdb_tr.csv         100%[===================>]  22.58M  49.2MB/s    in 0.5s    
241 |     
242 |     2020-09-02 10:33:58 (49.2 MB/s) - ‘imdb_tr.csv’ saved [23677025/23677025]
243 |     
244 | 
245 | 
246 | 
247 | ```
248 | data = pd.read_csv('imdb_tr.csv', encoding = "ISO-8859-1")
249 | ```
250 | 
251 | 
252 | ```
253 | data.head()
254 | ```
255 | 
256 | 
257 | 
258 | 
259 | <div>
260 | <style scoped>
261 |     .dataframe tbody tr th:only-of-type {
262 |         vertical-align: middle;
263 |     }
264 | 
265 |     .dataframe tbody tr th {
266 |         vertical-align: top;
267 |     }
268 | 
269 |     .dataframe thead th {
270 |         text-align: right;
271 |     }
272 | </style>
273 | <table border="1" class="dataframe">
274 |   <thead>
275 |     <tr style="text-align: right;">
276 |       <th></th>
277 |       <th>row_Number</th>
278 |       <th>text</th>
279 |       <th>polarity</th>
280 |     </tr>
281 |   </thead>
282 |   <tbody>
283 |     <tr>
284 |       <th>0</th>
285 |       <td>2148</td>
286 |       <td>first think another Disney movie, might good, ...</td>
287 |       <td>1</td>
288 |     </tr>
289 |     <tr>
290 |       <th>1</th>
291 |       <td>23577</td>
292 |       <td>Put aside Dr. House repeat missed, Desperate H...</td>
293 |       <td>0</td>
294 |     </tr>
295 |     <tr>
296 |       <th>2</th>
297 |       <td>1319</td>
298 |       <td>big fan Stephen King's work, film made even gr...</td>
299 |       <td>1</td>
300 |     </tr>
301 |     <tr>
302 |       <th>3</th>
303 |       <td>13358</td>
304 |       <td>watched horrid thing TV. Needless say one movi...</td>
305 |       <td>0</td>
306 |     </tr>
307 |     <tr>
308 |       <th>4</th>
309 |       <td>9495</td>
310 |       <td>truly enjoyed film. acting terrific plot. Jeff...</td>
311 |       <td>1</td>
312 |     </tr>
313 |   </tbody>
314 | </table>
315 | </div>
316 | 
317 | 
318 | 
319 | 
320 | ```
321 | #data = data.sample(1000)
322 | ```
323 | 
324 | 
325 | ```
326 | %%time
327 | 
328 | def regular_encode(texts, tokenizer, maxlen=512):
329 |     enc_di = tokenizer.batch_encode_plus(
330 |         texts, 
331 |         return_attention_mask=False, 
332 |         return_token_type_ids=False,
333 |         pad_to_max_length=True,
334 |         max_length=maxlen,
335 |         truncation=True
336 |     )
337 |     
338 |     return np.array(enc_di['input_ids'])
339 |     
340 | 
341 | tokenizer = AutoTokenizer.from_pretrained(PRETRAINED_MODEL)
342 | X_data = regular_encode(data.text.values, tokenizer, maxlen=MAX_LEN)
343 | ```
344 | 
345 | 
346 |     HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…
347 | 
348 | 
349 |     
350 | 
351 | 
352 | 
353 |     HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…
354 | 
355 | 
356 |     
357 | 
358 | 
359 |     /usr/local/lib/python3.6/dist-packages/transformers/tokenization_utils_base.py:1770: FutureWarning: The `pad_to_max_length` argument is deprecated and will be removed in a future version, use `padding=True` or `padding='longest'` to pad to the longest sequence in the batch, or use `padding='max_length'` to pad to a max length. In this case, you can give a specific length with `max_length` (e.g. `max_length=45`) or leave max_length to None to pad to the maximal input size of the model (e.g. 512 for Bert).
360 |       FutureWarning,
361 | 
362 | 
363 |     CPU times: user 1min 4s, sys: 233 ms, total: 1min 4s
364 |     Wall time: 1min 5s
365 | 
366 | 
367 | 
368 | ```
369 | def prepare_mlm_input_and_labels(X):
370 |     # 15% BERT masking
371 |     inp_mask = np.random.rand(*X.shape)<0.15 
372 |     # do not mask special tokens
373 |     inp_mask[X<=2] = False
374 |     # set targets to -1 by default, it means ignore
375 |     labels =  -1 * np.ones(X.shape, dtype=int)
376 |     # set labels for masked tokens
377 |     labels[inp_mask] = X[inp_mask]
378 |     
379 |     # prepare input
380 |     X_mlm = np.copy(X)
381 |     # set input to [MASK] which is the last token for the 90% of tokens
382 |     # this means leaving 10% unchanged
383 |     inp_mask_2mask = inp_mask  & (np.random.rand(*X.shape)<0.90)
384 |     X_mlm[inp_mask_2mask] = tokenizer.mask_token_id  # mask token is the last in the dict
385 | 
386 |     # set 10% to a random token
387 |     inp_mask_2random = inp_mask_2mask  & (np.random.rand(*X.shape) < 1/9)
388 |     X_mlm[inp_mask_2random] = np.random.randint(3, tokenizer.mask_token_id, inp_mask_2random.sum())
389 |     
390 |     return X_mlm, labels
391 | 
392 | 
393 | # use validation and test data for mlm
394 | X_train_mlm = np.vstack(X_data)
395 | # masks and labels
396 | X_train_mlm, y_train_mlm = prepare_mlm_input_and_labels(X_train_mlm)
397 | ```
398 | 
399 | 
400 | ```
401 | def create_dist_dataset(X, y=None, training=False):
402 |     dataset = tf.data.Dataset.from_tensor_slices(X)
403 | 
404 |     ### Add y if present ###
405 |     if y is not None:
406 |         dataset_y = tf.data.Dataset.from_tensor_slices(y)
407 |         dataset = tf.data.Dataset.zip((dataset, dataset_y))
408 |         
409 |     ### Repeat if training ###
410 |     if training:
411 |         dataset = dataset.shuffle(len(X)).repeat()
412 | 
413 |     dataset = dataset.batch(global_batch_size).prefetch(AUTO)
414 | 
415 |     ### make it distributed  ###
416 |     dist_dataset = strategy.experimental_distribute_dataset(dataset)
417 | 
418 |     return dist_dataset
419 |     
420 |     
421 | train_dist_dataset = create_dist_dataset(X_train_mlm, y_train_mlm, True)
422 | 
423 | ```
424 | 
425 | 
426 | ```
427 | %%time
428 | 
429 | def create_mlm_model_and_optimizer():
430 |     with strategy.scope():
431 |         model = TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL)
432 |         optimizer = tf.keras.optimizers.Adam(learning_rate=LR)
433 |     return model, optimizer
434 | 
435 | 
436 | mlm_model, optimizer = create_mlm_model_and_optimizer()
437 | mlm_model.summary()
438 | ```
439 | 
440 |     /usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_auto.py:788: FutureWarning: The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.
441 |       FutureWarning,
442 | 
443 | 
444 | 
445 |     HBox(children=(FloatProgress(value=0.0, description='Downloading', max=536063208.0, style=ProgressStyle(descri…
446 | 
447 | 
448 |     
449 | 
450 | 
451 |     Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertForMaskedLM: ['nsp___cls']
452 |     - This IS expected if you are initializing TFBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
453 |     - This IS NOT expected if you are initializing TFBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
454 |     All the weights of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
455 |     If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
456 | 
457 | 
458 |     Model: "tf_bert_for_masked_lm"
459 |     _________________________________________________________________
460 |     Layer (type)                 Output Shape              Param #   
461 |     =================================================================
462 |     bert (TFBertMainLayer)       multiple                  109482240 
463 |     _________________________________________________________________
464 |     mlm___cls (TFBertMLMHead)    multiple                  24459834  
465 |     =================================================================
466 |     Total params: 110,104,890
467 |     Trainable params: 110,104,890
468 |     Non-trainable params: 0
469 |     _________________________________________________________________
470 |     CPU times: user 14.5 s, sys: 15 s, total: 29.5 s
471 |     Wall time: 58.1 s
472 | 
473 | 
474 | 
475 | ```
476 | def define_mlm_loss_and_metrics():
477 |     with strategy.scope():
478 |         mlm_loss_object = masked_sparse_categorical_crossentropy
479 | 
480 |         def compute_mlm_loss(labels, predictions):
481 |             per_example_loss = mlm_loss_object(labels, predictions)
482 |             loss = tf.nn.compute_average_loss(
483 |                 per_example_loss, global_batch_size = global_batch_size)
484 |             return loss
485 | 
486 |         train_mlm_loss_metric = tf.keras.metrics.Mean()
487 |         
488 |     return compute_mlm_loss, train_mlm_loss_metric
489 | 
490 | 
491 | def masked_sparse_categorical_crossentropy(y_true, y_pred):
492 |     y_true_masked = tf.boolean_mask(y_true, tf.not_equal(y_true, -1))
493 |     y_pred_masked = tf.boolean_mask(y_pred, tf.not_equal(y_true, -1))
494 |     loss = tf.keras.losses.sparse_categorical_crossentropy(y_true_masked,
495 |                                                           y_pred_masked,
496 |                                                           from_logits=True)
497 |     return loss
498 | 
499 |             
500 |             
501 | def train_mlm(train_dist_dataset, total_steps=2000, evaluate_every=200):
502 |     step = 0
503 |     ### Training lopp ###
504 |     for tensor in train_dist_dataset:
505 |         distributed_mlm_train_step(tensor) 
506 |         step+=1
507 | 
508 |         if (step % evaluate_every == 0):   
509 |             ### Print train metrics ###  
510 |             train_metric = train_mlm_loss_metric.result().numpy()
511 |             print("Step %d, train loss: %.2f" % (step, train_metric))     
512 | 
513 |             ### Reset  metrics ###
514 |             train_mlm_loss_metric.reset_states()
515 |             
516 |         if step  == total_steps:
517 |             break
518 | 
519 | 
520 | @tf.function
521 | def distributed_mlm_train_step(data):
522 |     strategy.experimental_run_v2(mlm_train_step, args=(data,))
523 | 
524 | 
525 | @tf.function
526 | def mlm_train_step(inputs):
527 |     features, labels = inputs
528 | 
529 |     with tf.GradientTape() as tape:
530 |         predictions = mlm_model(features, training=True)[0]
531 |         loss = compute_mlm_loss(labels, predictions)
532 | 
533 |     gradients = tape.gradient(loss, mlm_model.trainable_variables)
534 |     optimizer.apply_gradients(zip(gradients, mlm_model.trainable_variables))
535 | 
536 |     train_mlm_loss_metric.update_state(loss)
537 |     
538 | 
539 | compute_mlm_loss, train_mlm_loss_metric = define_mlm_loss_and_metrics()
540 | ```
541 | 
542 | 
543 | ```
544 | %%time
545 | train_mlm(train_dist_dataset, TOTAL_STEPS, EVALUATE_EVERY)
546 | ```
547 | 
548 |     WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/multi_device_iterator_ops.py:601: get_next_as_optional (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version.
549 |     Instructions for updating:
550 |     Use `tf.data.Iterator.get_next_as_optional()` instead.
551 | 
552 | 
553 |     WARNING:tensorflow:From /usr/local/lib/python3.6/dist-packages/tensorflow/python/data/ops/multi_device_iterator_ops.py:601: get_next_as_optional (from tensorflow.python.data.ops.iterator_ops) is deprecated and will be removed in a future version.
554 |     Instructions for updating:
555 |     Use `tf.data.Iterator.get_next_as_optional()` instead.
556 | 
557 | 
558 |     WARNING:tensorflow:From <ipython-input-12-d78fc23ea715>:47: StrategyBase.experimental_run_v2 (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
559 |     Instructions for updating:
560 |     renamed to `run`
561 | 
562 | 
563 |     WARNING:tensorflow:From <ipython-input-12-d78fc23ea715>:47: StrategyBase.experimental_run_v2 (from tensorflow.python.distribute.distribute_lib) is deprecated and will be removed in a future version.
564 |     Instructions for updating:
565 |     renamed to `run`
566 | 
567 | 
568 |     WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.
569 | 
570 | 
571 |     WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.
572 | 
573 | 
574 |     WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.
575 | 
576 | 
577 |     WARNING:tensorflow:Gradients do not exist for variables ['tf_bert_for_masked_lm/bert/pooler/dense/kernel:0', 'tf_bert_for_masked_lm/bert/pooler/dense/bias:0'] when minimizing the loss.
578 | 
579 | 
580 |     Step 200, train loss: 8.89
581 |     Step 400, train loss: 8.03
582 |     Step 600, train loss: 7.68
583 |     Step 800, train loss: 7.43
584 |     Step 1000, train loss: 7.22
585 |     Step 1200, train loss: 7.00
586 |     Step 1400, train loss: 6.86
587 |     Step 1600, train loss: 6.68
588 |     Step 1800, train loss: 6.54
589 |     Step 2000, train loss: 6.38
590 |     CPU times: user 1min 23s, sys: 13.4 s, total: 1min 37s
591 |     Wall time: 9min 3s
592 | 
593 | 
594 | 
595 | ```
596 | mlm_model.save_pretrained('imdb_bert_uncased')
597 | ```
598 | 
599 | # Load and Test
600 | 
601 | 
602 | ```
603 | from transformers import *
604 | from pprint import pprint
605 | ```
606 | 
607 | 
608 | ```
609 | pretrained_model = TFAutoModelWithLMHead.from_pretrained(PRETRAINED_MODEL)
610 | nlp = pipeline("fill-mask",model=pretrained_model, tokenizer=tokenizer ,framework='tf')
611 | pprint(nlp(f"I watched {nlp.tokenizer.mask_token} and that was awesome"))
612 | ```
613 | 
614 |     /usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_auto.py:788: FutureWarning: The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.
615 |       FutureWarning,
616 |     Some weights of the model checkpoint at bert-base-uncased were not used when initializing TFBertForMaskedLM: ['nsp___cls']
617 |     - This IS expected if you are initializing TFBertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPretraining model).
618 |     - This IS NOT expected if you are initializing TFBertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
619 |     All the weights of TFBertForMaskedLM were initialized from the model checkpoint at bert-base-uncased.
620 |     If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
621 | 
622 | 
623 |     [{'score': 0.31239137053489685,
624 |       'sequence': '[CLS] i watched him and that was awesome [SEP]',
625 |       'token': 2032,
626 |       'token_str': 'him'},
627 |      {'score': 0.1729636937379837,
628 |       'sequence': '[CLS] i watched her and that was awesome [SEP]',
629 |       'token': 2014,
630 |       'token_str': 'her'},
631 |      {'score': 0.13816313445568085,
632 |       'sequence': '[CLS] i watched it and that was awesome [SEP]',
633 |       'token': 2009,
634 |       'token_str': 'it'},
635 |      {'score': 0.08374697715044022,
636 |       'sequence': '[CLS] i watched, and that was awesome [SEP]',
637 |       'token': 1010,
638 |       'token_str': ','},
639 |      {'score': 0.06438492983579636,
640 |       'sequence': '[CLS] i watched them and that was awesome [SEP]',
641 |       'token': 2068,
642 |       'token_str': 'them'}]
643 | 
644 | 
645 | 
646 | ```
647 | movie_mlm_model = TFAutoModelWithLMHead.from_pretrained('imdb_bert_uncased')
648 | nlp = pipeline("fill-mask",model=movie_mlm_model, tokenizer=tokenizer ,framework='tf')
649 | pprint(nlp(f"I watched {nlp.tokenizer.mask_token} and that was awesome"))
650 | ```
651 | 
652 |     /usr/local/lib/python3.6/dist-packages/transformers/modeling_tf_auto.py:788: FutureWarning: The class `TFAutoModelWithLMHead` is deprecated and will be removed in a future version. Please use `TFAutoModelForCausalLM` for causal language models, `TFAutoModelForMaskedLM` for masked language models and `TFAutoModelForSeq2SeqLM` for encoder-decoder models.
653 |       FutureWarning,
654 |     All model checkpoint weights were used when initializing TFBertForMaskedLM.
655 |     
656 |     All the weights of TFBertForMaskedLM were initialized from the model checkpoint at imdb_bert_uncased.
657 |     If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForMaskedLM for predictions without further training.
658 | 
659 | 
660 |     [{'score': 0.4467789828777313,
661 |       'sequence': '[CLS] i watched it and that was awesome [SEP]',
662 |       'token': 2009,
663 |       'token_str': 'it'},
664 |      {'score': 0.06318594515323639,
665 |       'sequence': '[CLS] i watched movie and that was awesome [SEP]',
666 |       'token': 3185,
667 |       'token_str': 'movie'},
668 |      {'score': 0.056345004588365555,
669 |       'sequence': '[CLS] i watched, and that was awesome [SEP]',
670 |       'token': 1010,
671 |       'token_str': ','},
672 |      {'score': 0.013144557364284992,
673 |       'sequence': '[CLS] i watched this and that was awesome [SEP]',
674 |       'token': 2023,
675 |       'token_str': 'this'},
676 |      {'score': 0.012886741198599339,
677 |       'sequence': '[CLS] i watched one and that was awesome [SEP]',
678 |       'token': 2028,
679 |       'token_str': 'one'}]
680 | 
681 | 
682 | 
683 | ```
684 | 
685 | ```
686 | 


--------------------------------------------------------------------------------
/docs/Doc_Visual_QA_and_Bill_extraction_demo.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/nlp_notebooks/blob/master/OCR-Document-Processing/Doc_Visual_QA_and_Bill_extraction_demo.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | 
  4 | ```python
  5 | !wget --no-check-certificate https://datasets.cvc.uab.es/rrc/DocVQA/train.tar.gz
  6 | ```
  7 | 
  8 |     --2022-05-01 13:20:26--  https://datasets.cvc.uab.es/rrc/DocVQA/train.tar.gz
  9 |     Resolving datasets.cvc.uab.es (datasets.cvc.uab.es)... 158.109.8.18
 10 |     Connecting to datasets.cvc.uab.es (datasets.cvc.uab.es)|158.109.8.18|:443... connected.
 11 |     WARNING: cannot verify datasets.cvc.uab.es's certificate, issued by ‘CN=GEANT OV RSA CA 4,O=GEANT Vereniging,C=NL’:
 12 |       Unable to locally verify the issuer's authority.
 13 |     HTTP request sent, awaiting response... 200 OK
 14 |     Length: 7122739200 (6.6G) [application/x-gzip]
 15 |     Saving to: ‘train.tar.gz.1’
 16 |     
 17 |     train.tar.gz.1        0%[                    ]  39.45M   638KB/s    eta 2h 26m 
 18 | 
 19 | 
 20 | ```python
 21 | !wget --no-check-certificate https://datasets.cvc.uab.es/rrc/DocVQA/val.tar.gz
 22 | ```
 23 | 
 24 | 
 25 | ```python
 26 | !wget --no-check-certificate https://datasets.cvc.uab.es/rrc/DocVQA/test.tar.gz
 27 | ```
 28 | 
 29 | # Install Packages
 30 | 
 31 | 
 32 | ```python
 33 | !pip install -q transformers
 34 | ```
 35 | 
 36 |     [K     |████████████████████████████████| 4.0 MB 5.3 MB/s 
 37 |     [K     |████████████████████████████████| 895 kB 45.4 MB/s 
 38 |     [K     |████████████████████████████████| 6.6 MB 31.6 MB/s 
 39 |     [K     |████████████████████████████████| 77 kB 4.5 MB/s 
 40 |     [K     |████████████████████████████████| 596 kB 43.9 MB/s 
 41 |     [?25h
 42 | 
 43 | 
 44 | ```python
 45 | !pip install pyyaml==5.1
 46 | # workaround: install old version of pytorch since detectron2 hasn't released packages for pytorch 1.9 (issue: https://github.com/facebookresearch/detectron2/issues/3158)
 47 | !pip install torch==1.8.0+cu101 torchvision==0.9.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
 48 | 
 49 | # install detectron2 that matches pytorch 1.8
 50 | # See https://detectron2.readthedocs.io/tutorials/install.html for instructions
 51 | !pip install -q detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu101/torch1.8/index.html
 52 | # exit(0)  # After installation, you need to "restart runtime" in Colab. This line can also restart runtime
 53 | ```
 54 | 
 55 |     Collecting pyyaml==5.1
 56 |       Downloading PyYAML-5.1.tar.gz (274 kB)
 57 |     [K     |████████████████████████████████| 274 kB 5.3 MB/s 
 58 |     [?25hBuilding wheels for collected packages: pyyaml
 59 |       Building wheel for pyyaml (setup.py) ... [?25l[?25hdone
 60 |       Created wheel for pyyaml: filename=PyYAML-5.1-cp37-cp37m-linux_x86_64.whl size=44092 sha256=923c6817c78b049bf1912c6f36e26c890af93770cb6627b50efc1b77ac4eeeae
 61 |       Stored in directory: /root/.cache/pip/wheels/77/f5/10/d00a2bd30928b972790053b5de0c703ca87324f3fead0f2fd9
 62 |     Successfully built pyyaml
 63 |     Installing collected packages: pyyaml
 64 |       Attempting uninstall: pyyaml
 65 |         Found existing installation: PyYAML 6.0
 66 |         Uninstalling PyYAML-6.0:
 67 |           Successfully uninstalled PyYAML-6.0
 68 |     Successfully installed pyyaml-5.1
 69 |     Looking in links: https://download.pytorch.org/whl/torch_stable.html
 70 |     Collecting torch==1.8.0+cu101
 71 |       Downloading https://download.pytorch.org/whl/cu101/torch-1.8.0%2Bcu101-cp37-cp37m-linux_x86_64.whl (763.5 MB)
 72 |     [K     |████████████████████████████████| 763.5 MB 16 kB/s 
 73 |     [?25hCollecting torchvision==0.9.0+cu101
 74 |       Downloading https://download.pytorch.org/whl/cu101/torchvision-0.9.0%2Bcu101-cp37-cp37m-linux_x86_64.whl (17.3 MB)
 75 |     [K     |████████████████████████████████| 17.3 MB 795 kB/s 
 76 |     [?25hRequirement already satisfied: numpy in /usr/local/lib/python3.7/dist-packages (from torch==1.8.0+cu101) (1.21.6)
 77 |     Requirement already satisfied: typing-extensions in /usr/local/lib/python3.7/dist-packages (from torch==1.8.0+cu101) (4.2.0)
 78 |     Requirement already satisfied: pillow>=4.1.1 in /usr/local/lib/python3.7/dist-packages (from torchvision==0.9.0+cu101) (7.1.2)
 79 |     Installing collected packages: torch, torchvision
 80 |       Attempting uninstall: torch
 81 |         Found existing installation: torch 1.11.0+cu113
 82 |         Uninstalling torch-1.11.0+cu113:
 83 |           Successfully uninstalled torch-1.11.0+cu113
 84 |       Attempting uninstall: torchvision
 85 |         Found existing installation: torchvision 0.12.0+cu113
 86 |         Uninstalling torchvision-0.12.0+cu113:
 87 |           Successfully uninstalled torchvision-0.12.0+cu113
 88 |     [31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
 89 |     torchtext 0.12.0 requires torch==1.11.0, but you have torch 1.8.0+cu101 which is incompatible.
 90 |     torchaudio 0.11.0+cu113 requires torch==1.11.0, but you have torch 1.8.0+cu101 which is incompatible.[0m
 91 |     Successfully installed torch-1.8.0+cu101 torchvision-0.9.0+cu101
 92 |     [K     |████████████████████████████████| 6.3 MB 897 kB/s 
 93 |     [K     |████████████████████████████████| 74 kB 2.1 MB/s 
 94 |     [K     |████████████████████████████████| 50 kB 5.2 MB/s 
 95 |     [K     |████████████████████████████████| 147 kB 10.7 MB/s 
 96 |     [K     |████████████████████████████████| 130 kB 33.3 MB/s 
 97 |     [K     |████████████████████████████████| 749 kB 42.3 MB/s 
 98 |     [K     |████████████████████████████████| 843 kB 34.7 MB/s 
 99 |     [K     |████████████████████████████████| 112 kB 46.3 MB/s 
100 |     [?25h  Building wheel for fvcore (setup.py) ... [?25l[?25hdone
101 |       Building wheel for antlr4-python3-runtime (setup.py) ... [?25l[?25hdone
102 | 
103 | 
104 | 
105 | ```python
106 | !pip install -q datasets
107 | ```
108 | 
109 |     [K     |████████████████████████████████| 325 kB 5.4 MB/s 
110 |     [K     |████████████████████████████████| 212 kB 43.4 MB/s 
111 |     [K     |████████████████████████████████| 136 kB 43.2 MB/s 
112 |     [K     |████████████████████████████████| 1.1 MB 35.9 MB/s 
113 |     [K     |████████████████████████████████| 127 kB 45.1 MB/s 
114 |     [K     |████████████████████████████████| 144 kB 46.5 MB/s 
115 |     [K     |████████████████████████████████| 94 kB 2.6 MB/s 
116 |     [K     |████████████████████████████████| 271 kB 47.1 MB/s 
117 |     [31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
118 |     datascience 0.10.6 requires folium==0.2.1, but you have folium 0.8.3 which is incompatible.[0m
119 |     [?25h
120 | 
121 | 
122 | ```python
123 | !sudo apt install tesseract-ocr
124 | !pip install -q pytesseract
125 | ```
126 | 
127 |     Reading package lists... Done
128 |     Building dependency tree       
129 |     Reading state information... Done
130 |     The following packages were automatically installed and are no longer required:
131 |       libnvidia-common-460 nsight-compute-2020.2.0
132 |     Use 'sudo apt autoremove' to remove them.
133 |     The following additional packages will be installed:
134 |       tesseract-ocr-eng tesseract-ocr-osd
135 |     The following NEW packages will be installed:
136 |       tesseract-ocr tesseract-ocr-eng tesseract-ocr-osd
137 |     0 upgraded, 3 newly installed, 0 to remove and 42 not upgraded.
138 |     Need to get 4,795 kB of archives.
139 |     After this operation, 15.8 MB of additional disk space will be used.
140 |     Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-eng all 4.00~git24-0e00fe6-1.2 [1,588 kB]
141 |     Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr-osd all 4.00~git24-0e00fe6-1.2 [2,989 kB]
142 |     Get:3 http://archive.ubuntu.com/ubuntu bionic/universe amd64 tesseract-ocr amd64 4.00~git2288-10f4998a-2 [218 kB]
143 |     Fetched 4,795 kB in 1s (3,752 kB/s)
144 |     debconf: unable to initialize frontend: Dialog
145 |     debconf: (No usable dialog-like program is installed, so the dialog based frontend cannot be used. at /usr/share/perl5/Debconf/FrontEnd/Dialog.pm line 76, <> line 3.)
146 |     debconf: falling back to frontend: Readline
147 |     debconf: unable to initialize frontend: Readline
148 |     debconf: (This frontend requires a controlling tty.)
149 |     debconf: falling back to frontend: Teletype
150 |     dpkg-preconfigure: unable to re-open stdin: 
151 |     Selecting previously unselected package tesseract-ocr-eng.
152 |     (Reading database ... 155202 files and directories currently installed.)
153 |     Preparing to unpack .../tesseract-ocr-eng_4.00~git24-0e00fe6-1.2_all.deb ...
154 |     Unpacking tesseract-ocr-eng (4.00~git24-0e00fe6-1.2) ...
155 |     Selecting previously unselected package tesseract-ocr-osd.
156 |     Preparing to unpack .../tesseract-ocr-osd_4.00~git24-0e00fe6-1.2_all.deb ...
157 |     Unpacking tesseract-ocr-osd (4.00~git24-0e00fe6-1.2) ...
158 |     Selecting previously unselected package tesseract-ocr.
159 |     Preparing to unpack .../tesseract-ocr_4.00~git2288-10f4998a-2_amd64.deb ...
160 |     Unpacking tesseract-ocr (4.00~git2288-10f4998a-2) ...
161 |     Setting up tesseract-ocr-osd (4.00~git24-0e00fe6-1.2) ...
162 |     Setting up tesseract-ocr-eng (4.00~git24-0e00fe6-1.2) ...
163 |     Setting up tesseract-ocr (4.00~git2288-10f4998a-2) ...
164 |     Processing triggers for man-db (2.8.3-2ubuntu0.1) ...
165 |     [K     |████████████████████████████████| 4.3 MB 5.4 MB/s 
166 |     [31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
167 |     albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
168 |     [?25h
169 | 
170 | 
171 | ```python
172 | !pip install Pillow==9.0.0
173 | ```
174 | 
175 |     Collecting Pillow==9.0.0
176 |       Downloading Pillow-9.0.0-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (4.3 MB)
177 |     [K     |████████████████████████████████| 4.3 MB 5.3 MB/s 
178 |     [?25hInstalling collected packages: Pillow
179 |       Attempting uninstall: Pillow
180 |         Found existing installation: Pillow 9.1.0
181 |         Uninstalling Pillow-9.1.0:
182 |           Successfully uninstalled Pillow-9.1.0
183 |     [31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
184 |     albumentations 0.1.12 requires imgaug<0.2.7,>=0.2.5, but you have imgaug 0.2.9 which is incompatible.[0m
185 |     Successfully installed Pillow-9.0.0
186 | 
187 | 
188 | 
189 | 
190 | # DocVQA Demo
191 | 
192 | 
193 | ```python
194 | from transformers import LayoutLMv2Processor
195 | processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
196 | ```
197 | 
198 | 
199 |     Downloading:   0%|          | 0.00/135 [00:00<?, ?B/s]
200 | 
201 | 
202 | 
203 |     Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]
204 | 
205 | 
206 | 
207 |     Downloading:   0%|          | 0.00/707 [00:00<?, ?B/s]
208 | 
209 | 
210 | 
211 | ```python
212 | from transformers import AutoModelForQuestionAnswering
213 | model = AutoModelForQuestionAnswering.from_pretrained("tiennvcs/layoutlmv2-base-uncased-finetuned-docvqa")
214 | ```
215 | 
216 | 
217 |     Downloading:   0%|          | 0.00/2.69k [00:00<?, ?B/s]
218 | 
219 | 
220 | 
221 |     Downloading:   0%|          | 0.00/765M [00:00<?, ?B/s]
222 | 
223 | 
224 | 
225 | ```python
226 | import torch
227 | device = torch.device("cuda")
228 | model.to(device)
229 | ```
230 | 
231 | 
232 | ```python
233 | def run_qa(image, question):
234 |   # step 1: encoding
235 |   encoding = processor(image, question, return_tensors="pt", truncation=True)
236 | 
237 |   # step 2: forward pass
238 | 
239 |   for k,v in encoding.items():
240 |     encoding[k] = v.to(model.device)
241 | 
242 |   outputs = model(**encoding)
243 | 
244 |   # step 3: get start_logits and end_logits
245 |   start_logits = outputs.start_logits
246 |   end_logits = outputs.end_logits
247 | 
248 |   # step 4: get largest logit for both
249 |   predicted_start_idx = start_logits.argmax(-1).item()
250 |   predicted_end_idx = end_logits.argmax(-1).item()
251 |   print("Predicted start idx:", predicted_start_idx)
252 |   print("Predicted end idx:", predicted_end_idx)
253 | 
254 |   # step 5: decode the predicted answer
255 |   return processor.tokenizer.decode(encoding.input_ids.squeeze()[predicted_start_idx:predicted_end_idx+1])
256 | 
257 | ```
258 | 
259 | 
260 | ```python
261 | from PIL import Image
262 | image = Image.open("image-2.jpeg").convert("RGB")
263 | image
264 | ```
265 | 
266 | 
267 | 
268 | 
269 |     
270 | ![png](Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_15_0.png)
271 |     
272 | 
273 | 
274 | 
275 | 
276 | ```python
277 | question = "Where to call?"
278 | run_qa(image, question)
279 | ```
280 | 
281 |     Predicted start idx: 77
282 |     Predicted end idx: 82
283 | 
284 | 
285 | 
286 | 
287 | 
288 |     'ext. 7240.'
289 | 
290 | 
291 | 
292 | # Bill Information extraction Demo
293 | 
294 | 
295 | ```python
296 | import numpy as np
297 | from transformers import LayoutLMv2Processor, LayoutLMv2ForTokenClassification
298 | from datasets import load_dataset
299 | from PIL import Image, ImageDraw, ImageFont
300 | ```
301 | 
302 | 
303 | ```python
304 | processor = LayoutLMv2Processor.from_pretrained("microsoft/layoutlmv2-base-uncased")
305 | model = LayoutLMv2ForTokenClassification.from_pretrained("Theivaprakasham/layoutlmv2-finetuned-sroie")
306 | ```
307 | 
308 | 
309 |     Downloading:   0%|          | 0.00/3.07k [00:00<?, ?B/s]
310 | 
311 | 
312 | 
313 |     Downloading:   0%|          | 0.00/765M [00:00<?, ?B/s]
314 | 
315 | 
316 | 
317 | ```python
318 | dataset = load_dataset("darentang/sroie", split="test")
319 | ```
320 | 
321 | 
322 |     Downloading builder script:   0%|          | 0.00/4.25k [00:00<?, ?B/s]
323 | 
324 | 
325 |     Downloading and preparing dataset sroie/sroie to /root/.cache/huggingface/datasets/darentang___sroie/sroie/1.0.0/26ed9374c9a15a1d2f44fd8886f679076e1a1fd7da2d53726d6e58a99436c506...
326 | 
327 | 
328 | 
329 |     Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]
330 | 
331 | 
332 | 
333 |     Downloading data:   0%|          | 0.00/456M [00:00<?, ?B/s]
334 | 
335 | 
336 | 
337 |     Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]
338 | 
339 | 
340 | 
341 |     Generating train split: 0 examples [00:00, ? examples/s]
342 | 
343 | 
344 | 
345 |     Generating test split: 0 examples [00:00, ? examples/s]
346 | 
347 | 
348 |     Dataset sroie downloaded and prepared to /root/.cache/huggingface/datasets/darentang___sroie/sroie/1.0.0/26ed9374c9a15a1d2f44fd8886f679076e1a1fd7da2d53726d6e58a99436c506. Subsequent calls will reuse this data.
349 | 
350 | 
351 | 
352 | ```python
353 | Image.open(dataset[50]["image_path"]).convert("RGB").save("example1.png")
354 | ```
355 | 
356 | 
357 | ```python
358 | ls
359 | ```
360 | 
361 |     example1.png  image-2.jpeg  [0m[01;34msample_data[0m/  train.tar.gz  train.tar.gz.1
362 | 
363 | 
364 | 
365 | ```python
366 | Image.open(dataset[100]["image_path"]).convert("RGB").save("example2.png")
367 | ```
368 | 
369 | 
370 | ```python
371 | # define id2label, label2color
372 | labels = dataset.features['ner_tags'].feature.names
373 | id2label = {v: k for v, k in enumerate(labels)}
374 | label2color = {'B-ADDRESS': 'blue',
375 |  'B-COMPANY': 'green',
376 |  'B-DATE': 'red',
377 |  'B-TOTAL': 'red',
378 |  'I-ADDRESS': "blue",
379 |  'I-COMPANY': 'green',
380 |  'I-DATE': 'red',
381 |  'I-TOTAL': 'red',
382 |  'O': 'green'}
383 | 
384 | label2color = dict((k.lower(), v.lower()) for k,v in label2color.items())
385 | 
386 | ```
387 | 
388 | 
389 | ```python
390 | def unnormalize_box(bbox, width, height):
391 |      return [
392 |          width * (bbox[0] / 1000),
393 |          height * (bbox[1] / 1000),
394 |          width * (bbox[2] / 1000),
395 |          height * (bbox[3] / 1000),
396 |      ]
397 | 
398 | def iob_to_label(label):
399 |     return label
400 | ```
401 | 
402 | 
403 | ```python
404 | image = Image.open("example2.png").convert("RGB")
405 | image
406 | ```
407 | 
408 | 
409 | 
410 | 
411 |     
412 | ![png](Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_26_0.png)
413 |     
414 | 
415 | 
416 | 
417 | 
418 | ```python
419 | width, height = image.size
420 | width, height
421 | ```
422 | 
423 | 
424 | 
425 | 
426 |     (932, 2216)
427 | 
428 | 
429 | 
430 | 
431 | ```python
432 | encoding = processor(image, truncation=True, return_offsets_mapping=True, return_tensors="pt")
433 | offset_mapping = encoding.pop('offset_mapping')
434 | ```
435 | 
436 | 
437 | ```python
438 | outputs = model(**encoding)
439 | ```
440 | 
441 | 
442 | ```python
443 | # get predictions
444 | predictions = outputs.logits.argmax(-1).squeeze().tolist()
445 | token_boxes = encoding.bbox.squeeze().tolist()
446 | ```
447 | 
448 | 
449 | ```python
450 | # only keep non-subword predictions
451 | is_subword = np.array(offset_mapping.squeeze().tolist())[:,0] != 0
452 | true_predictions = [id2label[pred] for idx, pred in enumerate(predictions) if not is_subword[idx]]
453 | true_boxes = [unnormalize_box(box, width, height) for idx, box in enumerate(token_boxes) if not is_subword[idx]]
454 | 
455 | ```
456 | 
457 | 
458 | ```python
459 | true_boxes
460 | ```
461 | 
462 | 
463 | 
464 | 
465 |     [[0.0, 0.0, 0.0, 0.0],
466 |      [430.584, 26.592000000000002, 523.7840000000001, 90.85600000000001],
467 |      [680.36, 33.24, 722.3000000000001, 79.776],
468 |      [163.1, 259.272, 296.376, 294.728],
469 |      [315.94800000000004, 261.488, 462.272, 294.728],
470 |      [480.91200000000003, 261.488, 581.568, 296.944],
471 |      [596.48, 261.488, 752.124, 296.944],
472 |      [335.52, 301.37600000000003, 582.5, 334.616],
473 |      [258.16400000000004, 352.344, 319.67600000000004, 378.93600000000004],
474 |      [332.724, 352.344, 386.78, 378.93600000000004],
475 |      [398.896, 354.56, 499.552, 378.93600000000004],
476 |      [514.464, 354.56, 660.788, 381.152],
477 |      [229.272, 398.88, 323.404, 427.688],
478 |      [335.52, 398.88, 480.91200000000003, 427.688],
479 |      [501.41600000000005, 401.096, 685.952, 429.904],
480 |      [144.46, 474.224, 203.176, 500.81600000000003],
481 |      [217.156, 480.872, 219.952, 483.088],
482 |      [233.0, 474.224, 433.38, 503.03200000000004],
483 |      [481.844, 476.44, 543.356, 503.03200000000004],
484 |      [556.404, 500.81600000000003, 558.268, 500.81600000000003],
485 |      [571.316, 476.44, 772.6279999999999, 505.248],
486 |      [275.872, 525.192, 342.976, 551.784],
487 |      [355.092, 525.192, 385.84799999999996, 554.0],
488 |      [398.896, 531.84, 400.76, 551.784],
489 |      [414.74, 525.192, 641.2159999999999, 554.0],
490 |      [32.620000000000005, 582.808, 876.0799999999999, 622.696],
491 |      [46.6, 624.9119999999999, 83.88, 651.504],
492 |      [95.996, 627.1279999999999, 147.256, 653.7199999999999],
493 |      [45.668, 675.88, 121.16000000000001, 704.688],
494 |      [132.344, 678.096, 227.408, 704.688],
495 |      [239.524, 678.096, 329.928, 706.904],
496 |      [340.18, 678.096, 553.608, 706.904],
497 |      [45.668, 720.2, 93.2, 749.008],
498 |      [108.11200000000001, 722.416, 117.432, 749.008],
499 |      [133.27599999999998, 722.416, 234.864, 749.008],
500 |      [247.912, 722.416, 341.11199999999997, 751.224],
501 |      [354.16, 724.6320000000001, 370.93600000000004, 749.008],
502 |      [44.736000000000004, 764.52, 158.44, 791.112],
503 |      [172.42, 764.52, 259.096, 791.112],
504 |      [271.212, 766.736, 364.41200000000003, 793.328],
505 |      [45.668, 813.2719999999999, 189.19600000000003, 839.864],
506 |      [202.244, 813.2719999999999, 387.712, 842.08],
507 |      [44.736000000000004, 859.808, 244.184, 888.6160000000001],
508 |      [46.6, 899.696, 169.624, 932.9359999999999],
509 |      [285.192, 901.9119999999999, 483.708, 932.9359999999999],
510 |      [602.072, 908.56, 677.564, 935.1519999999999],
511 |      [703.66, 904.1279999999999, 875.1479999999999, 935.1519999999999],
512 |      [45.668, 952.88, 161.236, 981.688],
513 |      [259.096, 979.472, 261.89200000000005, 981.688],
514 |      [289.852, 957.312, 378.39200000000005, 983.904],
515 |      [598.344, 957.312, 675.6999999999999, 983.904],
516 |      [703.66, 959.528, 833.208, 986.12],
517 |      [47.532, 1003.8480000000001, 230.204, 1037.088],
518 |      [259.096, 1014.928, 261.89200000000005, 1032.656],
519 |      [610.46, 1008.2800000000001, 677.564, 1034.872],
520 |      [58.716, 1054.816, 131.41199999999998, 1059.248],
521 |      [527.512, 1076.9759999999999, 607.664, 1108.0],
522 |      [700.864, 1059.248, 832.2760000000001, 1108.0],
523 |      [32.620000000000005, 1110.216, 188.264, 1143.4560000000001],
524 |      [189.19600000000003, 1112.432, 367.208, 1145.672],
525 |      [369.072, 1112.432, 481.844, 1145.672],
526 |      [520.988, 1110.216, 615.12, 1136.808],
527 |      [0.0, 0.0, 932.0, 2216.0],
528 |      [674.768, 1112.432, 779.1519999999999, 1136.808],
529 |      [830.412, 1116.864, 877.944, 1150.104],
530 |      [47.532, 1150.104, 139.79999999999998, 1174.48],
531 |      [305.696, 1156.752, 312.22, 1176.6960000000001],
532 |      [399.828, 1154.536, 470.66, 1178.912],
533 |      [531.24, 1154.536, 602.072, 1178.912],
534 |      [711.116, 1156.752, 782.88, 1181.1280000000002],
535 |      [841.596, 1156.752, 877.944, 1181.1280000000002],
536 |      [46.6, 1198.856, 127.68400000000001, 1223.2320000000002],
537 |      [140.732, 1198.856, 230.204, 1223.2320000000002],
538 |      [241.388, 1198.856, 280.532, 1227.6640000000002],
539 |      [49.396, 1252.04, 138.868, 1274.1999999999998],
540 |      [303.832, 1252.04, 315.94800000000004, 1276.416],
541 |      [408.216, 1254.2559999999999, 462.272, 1276.416],
542 |      [540.56, 1254.2559999999999, 595.548, 1278.6319999999998],
543 |      [711.116, 1256.472, 740.94, 1280.848],
544 |      [752.124, 1256.472, 781.948, 1280.848],
545 |      [842.528, 1258.6879999999999, 877.944, 1280.848],
546 |      [45.668, 1298.576, 118.364, 1322.952],
547 |      [122.092, 1298.576, 194.78799999999998, 1322.952],
548 |      [207.836, 1300.792, 259.096, 1325.168],
549 |      [31.688000000000002, 1338.464, 59.648, 1342.896],
550 |      [62.444, 1340.68, 166.828, 1373.92],
551 |      [165.896, 1340.68, 229.272, 1380.568],
552 |      [238.592, 1325.168, 876.0799999999999, 1378.352],
553 |      [165.896, 1424.8880000000001, 235.796, 1451.48],
554 |      [247.912, 1427.104, 327.132, 1453.6960000000001],
555 |      [339.248, 1427.104, 488.368, 1462.5600000000002],
556 |      [500.48400000000004, 1429.32, 578.772, 1460.344],
557 |      [607.664, 1435.968, 610.46, 1455.912],
558 |      [703.66, 1429.32, 786.608, 1455.912],
559 |      [451.08799999999997, 1486.9360000000001, 579.704, 1513.528],
560 |      [607.664, 1493.584, 610.46, 1513.528],
561 |      [722.3000000000001, 1486.9360000000001, 786.608, 1515.7440000000001],
562 |      [431.516, 1544.552, 501.41600000000005, 1571.144],
563 |      [513.532, 1546.7679999999998, 579.704, 1571.144],
564 |      [607.664, 1553.416, 610.46, 1571.144],
565 |      [722.3000000000001, 1548.984, 785.6759999999999, 1573.36],
566 |      [442.7, 1599.952, 578.772, 1635.408],
567 |      [607.664, 1608.816, 610.46, 1628.76],
568 |      [722.3000000000001, 1602.168, 785.6759999999999, 1628.76],
569 |      [99.724, 1657.568, 179.876, 1684.16],
570 |      [192.92399999999998, 1657.568, 279.59999999999997, 1684.16],
571 |      [290.784, 1659.784, 448.292, 1690.808],
572 |      [459.476, 1662.0, 490.232, 1686.376],
573 |      [501.41600000000005, 1662.0, 580.636, 1695.24],
574 |      [607.664, 1668.648, 611.392, 1688.592],
575 |      [702.728, 1662.0, 785.6759999999999, 1690.808],
576 |      [484.64000000000004, 1719.616, 554.54, 1746.208],
577 |      [558.268, 1719.616, 611.392, 1746.208],
578 |      [702.728, 1721.832, 785.6759999999999, 1748.424],
579 |      [453.884, 1763.9360000000001, 558.268, 1797.1760000000002],
580 |      [562.928, 1770.584, 611.392, 1790.528],
581 |      [721.368, 1763.9360000000001, 785.6759999999999, 1792.7440000000001],
582 |      [101.588, 1854.792, 169.624, 1890.248],
583 |      [181.74, 1863.656, 355.092, 1892.464],
584 |      [528.444, 1859.224, 594.616, 1863.656],
585 |      [107.18, 1919.056, 169.624, 1950.08],
586 |      [174.284, 1919.056, 258.16400000000004, 1952.296],
587 |      [355.092, 1921.272, 381.188, 1954.512],
588 |      [471.592, 1923.488, 620.712, 1956.728],
589 |      [0.0, 0.0, 932.0, 2216.0],
590 |      [697.136, 1923.488, 839.732, 1958.944],
591 |      [110.908, 1963.376, 153.78, 1989.968],
592 |      [359.752, 1965.592, 375.596, 1989.968],
593 |      [537.764, 1967.808, 619.7800000000001, 1992.184],
594 |      [775.424, 1967.808, 838.8000000000001, 1994.4],
595 |      [298.24, 2012.1280000000002, 378.39200000000005, 2040.9360000000001],
596 |      [393.304, 2020.992, 397.964, 2040.9360000000001],
597 |      [534.968, 2014.344, 616.984, 2040.9360000000001],
598 |      [775.424, 2016.5600000000002, 839.732, 2043.152],
599 |      [94.132, 2098.5519999999997, 202.244, 2122.928],
600 |      [214.36, 2098.5519999999997, 290.784, 2122.928],
601 |      [300.104, 2098.5519999999997, 358.82, 2122.928],
602 |      [369.072, 2100.768, 427.788, 2122.928],
603 |      [439.904, 2100.768, 633.76, 2127.36],
604 |      [645.876, 2102.984, 738.144, 2125.144],
605 |      [749.3280000000001, 2102.984, 816.432, 2125.144],
606 |      [932.0, 2216.0, 932.0, 2216.0]]
607 | 
608 | 
609 | 
610 | 
611 | ```python
612 | true_predictions
613 | ```
614 | 
615 | 
616 | 
617 | 
618 |     ['O',
619 |      'O',
620 |      'O',
621 |      'B-COMPANY',
622 |      'I-COMPANY',
623 |      'I-COMPANY',
624 |      'I-COMPANY',
625 |      'O',
626 |      'B-ADDRESS',
627 |      'I-ADDRESS',
628 |      'I-ADDRESS',
629 |      'I-ADDRESS',
630 |      'I-ADDRESS',
631 |      'I-ADDRESS',
632 |      'I-ADDRESS',
633 |      'O',
634 |      'O',
635 |      'O',
636 |      'O',
637 |      'O',
638 |      'O',
639 |      'O',
640 |      'O',
641 |      'O',
642 |      'O',
643 |      'O',
644 |      'O',
645 |      'O',
646 |      'O',
647 |      'O',
648 |      'O',
649 |      'O',
650 |      'O',
651 |      'O',
652 |      'O',
653 |      'O',
654 |      'O',
655 |      'O',
656 |      'O',
657 |      'O',
658 |      'O',
659 |      'O',
660 |      'O',
661 |      'O',
662 |      'O',
663 |      'O',
664 |      'B-DATE',
665 |      'O',
666 |      'O',
667 |      'O',
668 |      'O',
669 |      'O',
670 |      'O',
671 |      'O',
672 |      'O',
673 |      'O',
674 |      'O',
675 |      'O',
676 |      'O',
677 |      'O',
678 |      'O',
679 |      'O',
680 |      'O',
681 |      'O',
682 |      'O',
683 |      'O',
684 |      'O',
685 |      'O',
686 |      'O',
687 |      'O',
688 |      'O',
689 |      'O',
690 |      'O',
691 |      'O',
692 |      'O',
693 |      'O',
694 |      'O',
695 |      'O',
696 |      'O',
697 |      'O',
698 |      'O',
699 |      'O',
700 |      'O',
701 |      'O',
702 |      'O',
703 |      'O',
704 |      'O',
705 |      'O',
706 |      'O',
707 |      'O',
708 |      'O',
709 |      'O',
710 |      'O',
711 |      'O',
712 |      'O',
713 |      'O',
714 |      'O',
715 |      'O',
716 |      'O',
717 |      'O',
718 |      'O',
719 |      'O',
720 |      'O',
721 |      'O',
722 |      'O',
723 |      'O',
724 |      'O',
725 |      'O',
726 |      'O',
727 |      'O',
728 |      'B-TOTAL',
729 |      'O',
730 |      'O',
731 |      'O',
732 |      'O',
733 |      'O',
734 |      'O',
735 |      'O',
736 |      'O',
737 |      'O',
738 |      'O',
739 |      'O',
740 |      'O',
741 |      'O',
742 |      'O',
743 |      'O',
744 |      'O',
745 |      'O',
746 |      'O',
747 |      'O',
748 |      'O',
749 |      'O',
750 |      'O',
751 |      'O',
752 |      'O',
753 |      'O',
754 |      'O',
755 |      'O',
756 |      'O',
757 |      'O',
758 |      'O',
759 |      'O']
760 | 
761 | 
762 | 
763 | 
764 | ```python
765 | # draw predictions over the image
766 | draw = ImageDraw.Draw(image)
767 | font = ImageFont.load_default()
768 | for prediction, box in zip(true_predictions, true_boxes):
769 |     predicted_label = iob_to_label(prediction).lower()
770 |     draw.rectangle(box, outline=label2color[predicted_label])
771 |     draw.text((box[0]+10, box[1]-10), text=predicted_label, fill=label2color[predicted_label], font=font)
772 | 
773 | ```
774 | 
775 | 
776 | ```python
777 | image
778 | ```
779 | 
780 | 
781 | 
782 | 
783 |     
784 | ![png](Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_35_0.png)
785 |     
786 | 
787 | 
788 | 
789 | 
790 | ```python
791 | 
792 | ```
793 | 


--------------------------------------------------------------------------------
/docs/Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_15_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_15_0.png


--------------------------------------------------------------------------------
/docs/Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_26_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_26_0.png


--------------------------------------------------------------------------------
/docs/Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_35_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/Doc_Visual_QA_and_Bill_extraction_demo_files/Doc_Visual_QA_and_Bill_extraction_demo_35_0.png


--------------------------------------------------------------------------------
/docs/GPT_2_on_Onnx_CPU.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/nlp_notebooks/blob/master/nlp-onnx/GPT_2_on_Onnx_CPU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | 
  4 | ```python
  5 | !pip install onnxruntime==1.8.1 onnx==1.9.0 onnxconverter_common==1.8.1 transformers==4.8.2 psutil pytz pandas py-cpuinfo py3nvml
  6 | ```
  7 | 
  8 |         Uninstalling transformers-4.14.1:
  9 |           Successfully uninstalled transformers-4.14.1
 10 |       Attempting uninstall: onnxruntime
 11 |         Found existing installation: onnxruntime 1.10.0
 12 |         Uninstalling onnxruntime-1.10.0:
 13 |           Successfully uninstalled onnxruntime-1.10.0
 14 |       Attempting uninstall: onnxconverter-common
 15 |         Found existing installation: onnxconverter-common 1.9.0
 16 |         Uninstalling onnxconverter-common-1.9.0:
 17 |           Successfully uninstalled onnxconverter-common-1.9.0
 18 |     Successfully installed huggingface-hub-0.0.12 onnx-1.9.0 onnxconverter-common-1.8.1 onnxruntime-1.8.1 transformers-4.8.2
 19 | 
 20 | 
 21 | 
 22 | 
 23 | 
 24 | ```python
 25 | import os
 26 | 
 27 | # Create a cache directory to store pretrained model.
 28 | cache_dir = os.path.join(".", "cache_models")
 29 | if not os.path.exists(cache_dir):
 30 |     os.makedirs(cache_dir)
 31 | ```
 32 | 
 33 | 
 34 | ```python
 35 | !lscpu
 36 | ```
 37 | 
 38 |     Architecture:        x86_64
 39 |     CPU op-mode(s):      32-bit, 64-bit
 40 |     Byte Order:          Little Endian
 41 |     CPU(s):              2
 42 |     On-line CPU(s) list: 0,1
 43 |     Thread(s) per core:  2
 44 |     Core(s) per socket:  1
 45 |     Socket(s):           1
 46 |     NUMA node(s):        1
 47 |     Vendor ID:           GenuineIntel
 48 |     CPU family:          6
 49 |     Model:               79
 50 |     Model name:          Intel(R) Xeon(R) CPU @ 2.20GHz
 51 |     Stepping:            0
 52 |     CPU MHz:             2199.998
 53 |     BogoMIPS:            4399.99
 54 |     Hypervisor vendor:   KVM
 55 |     Virtualization type: full
 56 |     L1d cache:           32K
 57 |     L1i cache:           32K
 58 |     L2 cache:            256K
 59 |     L3 cache:            56320K
 60 |     NUMA node0 CPU(s):   0,1
 61 |     Flags:               fpu vme de pse tsc msr pae mce cx8 apic sep mtrr pge mca cmov pat pse36 clflush mmx fxsr sse sse2 ss ht syscall nx pdpe1gb rdtscp lm constant_tsc rep_good nopl xtopology nonstop_tsc cpuid tsc_known_freq pni pclmulqdq ssse3 fma cx16 pcid sse4_1 sse4_2 x2apic movbe popcnt aes xsave avx f16c rdrand hypervisor lahf_lm abm 3dnowprefetch invpcid_single ssbd ibrs ibpb stibp fsgsbase tsc_adjust bmi1 hle avx2 smep bmi2 erms invpcid rtm rdseed adx smap xsaveopt arat md_clear arch_capabilities
 62 | 
 63 | 
 64 | 
 65 | ```python
 66 | !pip install coloredlogs
 67 | ```
 68 | 
 69 |     Requirement already satisfied: coloredlogs in /usr/local/lib/python3.7/dist-packages (15.0.1)
 70 |     Requirement already satisfied: humanfriendly>=9.1 in /usr/local/lib/python3.7/dist-packages (from coloredlogs) (10.0)
 71 | 
 72 | 
 73 | 
 74 | ```python
 75 | from onnxruntime.transformers.gpt2_beamsearch_helper import Gpt2BeamSearchHelper, GPT2LMHeadModel_BeamSearchStep
 76 | from transformers import AutoConfig
 77 | import torch
 78 | ```
 79 | 
 80 | 
 81 | ```python
 82 | model_name_or_path = "gpt2"
 83 | config = AutoConfig.from_pretrained(model_name_or_path, cache_dir=cache_dir)
 84 | model = GPT2LMHeadModel_BeamSearchStep.from_pretrained(model_name_or_path, config=config, batch_size=1, beam_size=4, cache_dir=cache_dir)
 85 | device = torch.device("cpu")
 86 | model.eval().to(device)
 87 | 
 88 | print(model.config)
 89 | 
 90 | num_attention_heads = model.config.n_head
 91 | hidden_size = model.config.n_embd
 92 | num_layer = model.config.n_layer
 93 | ```
 94 | 
 95 |     GPT2Config {
 96 |       "_name_or_path": "gpt2",
 97 |       "activation_function": "gelu_new",
 98 |       "architectures": [
 99 |         "GPT2LMHeadModel"
100 |       ],
101 |       "attn_pdrop": 0.1,
102 |       "batch_size": 1,
103 |       "beam_size": 4,
104 |       "bos_token_id": 50256,
105 |       "embd_pdrop": 0.1,
106 |       "eos_token_id": 50256,
107 |       "gradient_checkpointing": false,
108 |       "initializer_range": 0.02,
109 |       "layer_norm_epsilon": 1e-05,
110 |       "model_type": "gpt2",
111 |       "n_ctx": 1024,
112 |       "n_embd": 768,
113 |       "n_head": 12,
114 |       "n_inner": null,
115 |       "n_layer": 12,
116 |       "n_positions": 1024,
117 |       "resid_pdrop": 0.1,
118 |       "scale_attn_weights": true,
119 |       "summary_activation": null,
120 |       "summary_first_dropout": 0.1,
121 |       "summary_proj_to_labels": true,
122 |       "summary_type": "cls_index",
123 |       "summary_use_proj": true,
124 |       "task_specific_params": {
125 |         "text-generation": {
126 |           "do_sample": true,
127 |           "max_length": 50
128 |         }
129 |       },
130 |       "transformers_version": "4.8.2",
131 |       "use_cache": true,
132 |       "vocab_size": 50257
133 |     }
134 |     
135 | 
136 | 
137 | 
138 | ```python
139 | onnx_model_path = "gpt2_one_step_search.onnx"
140 | Gpt2BeamSearchHelper.export_onnx(model, device, onnx_model_path) # add parameter use_external_data_format=True when model size > 2 GB
141 | ```
142 | 
143 |     /usr/local/lib/python3.7/dist-packages/onnxruntime/transformers/gpt2_beamsearch_helper.py:91: UserWarning: __floordiv__ is deprecated, and its behavior will change in a future version of pytorch. It currently rounds toward 0 (like the 'trunc' function NOT 'floor'). This results in incorrect rounding for negative values. To keep the current behavior, use torch.div(a, b, rounding_mode='trunc'), or for actual floor division, use torch.div(a, b, rounding_mode='floor').
144 |       selected_input_seq = selected_index_flat // self.config.beam_size
145 |     /usr/local/lib/python3.7/dist-packages/torch/onnx/utils.py:100: UserWarning: `example_outputs' is deprecated and ignored. Will be removed in next PyTorch release.
146 |       warnings.warn("`example_outputs' is deprecated and ignored. Will be removed in "
147 |     /usr/local/lib/python3.7/dist-packages/torch/onnx/utils.py:103: UserWarning: `use_external_data_format' is deprecated and ignored. Will be removed in next PyTorch release. The code will work as it is False if models are not larger than 2GB, Otherwise set to False because of size limits imposed by Protocol Buffers.
148 |       warnings.warn("`use_external_data_format' is deprecated and ignored. Will be removed in next "
149 |     /usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/modeling_gpt2.py:698: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
150 |       assert batch_size > 0, "batch_size has to be defined and > 0"
151 |     /usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/modeling_gpt2.py:249: TracerWarning: Iterating over a tensor might cause the trace to be incorrect. Passing a tensor of different shape won't change the number of iterations executed (and might lead to errors or silently give incorrect results).
152 |       past_key, past_value = layer_past
153 |     /usr/local/lib/python3.7/dist-packages/transformers/models/gpt2/modeling_gpt2.py:181: TracerWarning: Converting a tensor to a Python float might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!
154 |       attn_weights = attn_weights / (float(value.size(-1)) ** 0.5)
155 | 
156 | 
157 | 
158 | ```python
159 | import onnxruntime
160 | import numpy
161 | from transformers import AutoTokenizer
162 | 
163 | EXAMPLE_Text = ['best hotel in bay area.']
164 | 
165 | def get_tokenizer(model_name_or_path, cache_dir):
166 |     tokenizer = AutoTokenizer.from_pretrained(model_name_or_path, cache_dir=cache_dir)
167 |     tokenizer.padding_side = "left"
168 |     tokenizer.pad_token = tokenizer.eos_token
169 |     #okenizer.add_special_tokens({'pad_token': '[PAD]'})
170 |     return tokenizer
171 | 
172 | def get_example_inputs(prompt_text=EXAMPLE_Text):    
173 |     tokenizer = get_tokenizer(model_name_or_path, cache_dir)
174 |     encodings_dict = tokenizer.batch_encode_plus(prompt_text, padding=True)
175 | 
176 |     input_ids = torch.tensor(encodings_dict['input_ids'], dtype=torch.int64)
177 |     attention_mask = torch.tensor(encodings_dict['attention_mask'], dtype=torch.float32)
178 |     position_ids = (attention_mask.long().cumsum(-1) - 1)
179 |     position_ids.masked_fill_(position_ids < 0, 0)
180 | 
181 |     #Empty Past State for generating first word
182 |     empty_past = []
183 |     batch_size = input_ids.size(0)
184 |     sequence_length = input_ids.size(1)
185 |     past_shape = [2, batch_size, num_attention_heads, 0, hidden_size // num_attention_heads]
186 |     for i in range(num_layer):
187 |         empty_past.append(torch.empty(past_shape).type(torch.float32).to(device))
188 |        
189 |     return input_ids, attention_mask, position_ids, empty_past
190 | 
191 | input_ids, attention_mask, position_ids, empty_past = get_example_inputs()
192 | beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()
193 | input_log_probs = torch.zeros([input_ids.shape[0], 1])
194 | input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
195 | prev_step_scores = torch.zeros([input_ids.shape[0], 1])
196 | 
197 | onnx_model_path = "gpt2_one_step_search.onnx"
198 | session = onnxruntime.InferenceSession(onnx_model_path)
199 | ort_inputs = {
200 |               'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),
201 |               'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),
202 |               'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),
203 |               'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),
204 |               'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),
205 |               'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),
206 |               'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),
207 |               'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),
208 |              }
209 | for i, past_i in enumerate(empty_past):
210 |     ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())
211 | ort_outputs = session.run(None, ort_inputs)
212 | ```
213 | 
214 | 
215 | ```python
216 | def inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, past, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores, step, context_len):
217 |     output_shapes = Gpt2BeamSearchHelper.get_output_shapes(batch_size=1,
218 |                                                            context_len=context_len,
219 |                                                            past_sequence_length=past[0].size(3),
220 |                                                            sequence_length=input_ids.size(1),
221 |                                                            beam_size=4,
222 |                                                            step=step,
223 |                                                            config=config,
224 |                                                            model_class="GPT2LMHeadModel_BeamSearchStep")
225 |     output_buffers = Gpt2BeamSearchHelper.get_output_buffers(output_shapes, device)
226 | 
227 |     io_binding = Gpt2BeamSearchHelper.prepare_io_binding(session, input_ids, position_ids, attention_mask, past, output_buffers, output_shapes, beam_select_idx, input_log_probs, input_unfinished_sents, prev_step_results, prev_step_scores)
228 |     session.run_with_iobinding(io_binding)
229 | 
230 |     outputs = Gpt2BeamSearchHelper.get_outputs_from_io_binding_buffer(session, output_buffers, output_shapes, return_numpy=False)
231 |     return outputs
232 | ```
233 | 
234 | 
235 | ```python
236 | input_ids, attention_mask, position_ids, empty_past = get_example_inputs()
237 | beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()
238 | input_log_probs = torch.zeros([input_ids.shape[0], 1])
239 | input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
240 | prev_step_scores = torch.zeros([input_ids.shape[0], 1])
241 | outputs = inference_with_io_binding(session, config, input_ids, position_ids, attention_mask, empty_past, beam_select_idx, input_log_probs, input_unfinished_sents, input_ids, prev_step_scores, 0, input_ids.shape[-1])
242 | assert torch.eq(outputs[-2], torch.from_numpy(ort_outputs[-2])).all()
243 | print("IO Binding result is good")
244 | ```
245 | 
246 |     IO Binding result is good
247 | 
248 | 
249 | 
250 | ```python
251 | def update(output, step, batch_size, beam_size, context_length, prev_attention_mask, device):
252 |     """
253 |     Update the inputs for next inference.
254 |     """
255 |     last_state = (torch.from_numpy(output[0]).to(device)
256 |                         if isinstance(output[0], numpy.ndarray) else output[0].clone().detach().cpu())
257 | 
258 |     input_ids = last_state.view(batch_size * beam_size, -1).to(device)
259 | 
260 |     input_unfinished_sents_id = -3
261 |     prev_step_results = (torch.from_numpy(output[-2]).to(device) if isinstance(output[-2], numpy.ndarray)
262 |                                 else output[-2].clone().detach().to(device))
263 |     position_ids = (torch.tensor([context_length + step - 1
264 |                                         ]).unsqueeze(0).repeat(batch_size * beam_size, 1).to(device))
265 | 
266 |     if prev_attention_mask.shape[0] != (batch_size * beam_size):
267 |         prev_attention_mask = prev_attention_mask.repeat(batch_size * beam_size, 1)
268 |     attention_mask = torch.cat(
269 |         [
270 |             prev_attention_mask,
271 |             torch.ones([batch_size * beam_size, 1]).type_as(prev_attention_mask),
272 |         ],
273 |         1,
274 |     ).to(device)
275 | 
276 |     beam_select_idx = (torch.from_numpy(output[input_unfinished_sents_id - 2]).to(device) if isinstance(
277 |         output[input_unfinished_sents_id - 2], numpy.ndarray) else output[input_unfinished_sents_id - 2].clone().detach().to(device))
278 |     input_log_probs = (torch.from_numpy(output[input_unfinished_sents_id - 1]).to(device) if isinstance(
279 |         output[input_unfinished_sents_id - 1], numpy.ndarray) else output[input_unfinished_sents_id - 1].clone().detach().to(device))
280 |     input_unfinished_sents = (torch.from_numpy(output[input_unfinished_sents_id]).to(device) if isinstance(
281 |         output[input_unfinished_sents_id], numpy.ndarray) else
282 |                                     output[input_unfinished_sents_id].clone().detach().to(device))
283 |     prev_step_scores = (torch.from_numpy(output[-1]).to(device)
284 |                                 if isinstance(output[-1], numpy.ndarray) else output[-1].clone().detach().to(device))
285 | 
286 |     past = []
287 |     if isinstance(output[1], tuple):  # past in torch output is tuple
288 |         past = list(output[1])
289 |     else:
290 |         for i in range(model.config.n_layer):
291 |             past_i = (torch.from_numpy(output[i + 1])
292 |                         if isinstance(output[i + 1], numpy.ndarray) else output[i + 1].clone().detach())
293 |             past.append(past_i.to(device)) 
294 | 
295 |     inputs = {
296 |         'input_ids': input_ids,
297 |         'attention_mask' : attention_mask,
298 |         'position_ids': position_ids,
299 |         'beam_select_idx': beam_select_idx,
300 |         'input_log_probs': input_log_probs,
301 |         'input_unfinished_sents': input_unfinished_sents,
302 |         'prev_step_results': prev_step_results,
303 |         'prev_step_scores': prev_step_scores,
304 |     }
305 |     ort_inputs = {
306 |         'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),
307 |         'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),
308 |         'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),
309 |         'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),
310 |         'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),
311 |         'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),
312 |         'prev_step_results': numpy.ascontiguousarray(prev_step_results.cpu().numpy()),
313 |         'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),
314 |     }
315 |     for i, past_i in enumerate(past):
316 |         ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())
317 |     
318 |     return inputs, ort_inputs, past
319 | 
320 | def test_generation(tokenizer, input_text, use_onnxruntime_io, ort_session = None, num_tokens_to_produce = 30):
321 |     print("Text generation using", "OnnxRuntime with IO binding" if use_onnxruntime_io else "OnnxRuntime", "...")    
322 |     input_ids, attention_mask, position_ids, past = get_example_inputs(input_text)
323 |     beam_select_idx = torch.zeros([1, input_ids.shape[0]]).long()
324 |     input_log_probs = torch.zeros([input_ids.shape[0], 1])
325 |     input_unfinished_sents = torch.ones([input_ids.shape[0], 1], dtype=torch.bool)
326 |     prev_step_scores = torch.zeros([input_ids.shape[0], 1])
327 |     inputs = {
328 |         'input_ids': input_ids,
329 |         'attention_mask' : attention_mask,
330 |         'position_ids': position_ids,
331 |         'beam_select_idx': beam_select_idx,
332 |         'input_log_probs': input_log_probs,
333 |         'input_unfinished_sents': input_unfinished_sents,
334 |         'prev_step_results': input_ids,
335 |         'prev_step_scores': prev_step_scores,
336 |     }
337 |     ort_inputs = {
338 |         'input_ids': numpy.ascontiguousarray(input_ids.cpu().numpy()),
339 |         'attention_mask' : numpy.ascontiguousarray(attention_mask.cpu().numpy()),
340 |         'position_ids': numpy.ascontiguousarray(position_ids.cpu().numpy()),
341 |         'beam_select_idx': numpy.ascontiguousarray(beam_select_idx.cpu().numpy()),
342 |         'input_log_probs': numpy.ascontiguousarray(input_log_probs.cpu().numpy()),
343 |         'input_unfinished_sents': numpy.ascontiguousarray(input_unfinished_sents.cpu().numpy()),
344 |         'prev_step_results': numpy.ascontiguousarray(input_ids.cpu().numpy()),
345 |         'prev_step_scores': numpy.ascontiguousarray(prev_step_scores.cpu().numpy()),
346 |     }
347 |     for i, past_i in enumerate(past):
348 |         ort_inputs[f'past_{i}'] = numpy.ascontiguousarray(past_i.cpu().numpy())
349 |     batch_size = input_ids.size(0)
350 |     beam_size = 4
351 |     context_length = input_ids.size(-1)
352 | 
353 |     for step in range(num_tokens_to_produce):
354 |         if use_onnxruntime_io:
355 |             outputs = inference_with_io_binding(ort_session, config, inputs['input_ids'], inputs['position_ids'], inputs['attention_mask'], past, inputs['beam_select_idx'], inputs['input_log_probs'], inputs['input_unfinished_sents'], inputs['prev_step_results'], inputs['prev_step_scores'], step, context_length)
356 |         else:
357 |             outputs = ort_session.run(None, ort_inputs) 
358 |         inputs, ort_inputs, past = update(outputs, step, batch_size, beam_size, context_length, inputs['attention_mask'], device)
359 | 
360 |         if not inputs['input_unfinished_sents'].any():
361 |             break
362 | 
363 |     print("------------")
364 |     print(tokenizer.decode(inputs['prev_step_results'][0], skip_special_tokens=True))
365 | ```
366 | 
367 | 
368 | ```python
369 | tokenizer = get_tokenizer(model_name_or_path, cache_dir)
370 | input_text = EXAMPLE_Text
371 | test_generation(tokenizer, input_text, use_onnxruntime_io=False, ort_session=session)
372 | ```
373 | 
374 |     Text generation using OnnxRuntime ...
375 |     ------------
376 |     best hotel in bay area.
377 |     
378 |     "It's a great place to stay," he said.
379 | 
380 | 
381 | 
382 | ```python
383 | test_generation(tokenizer, input_text, use_onnxruntime_io=True, ort_session=session)
384 | ```
385 | 
386 |     Text generation using OnnxRuntime with IO binding ...
387 |     ------------
388 |     best hotel in bay area.
389 |     
390 |     "It's a great place to stay," he said.
391 | 
392 | 
393 | 
394 | ```python
395 | 
396 | ```
397 | 


--------------------------------------------------------------------------------
/docs/Generic_Transformer_Classification.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/Generic_Transformer_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | 
  4 | ```
  5 | !pip install transformers
  6 | ```
  7 | 
  8 |     Collecting transformers
  9 |     [?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
 10 |     [K     |████████████████████████████████| 778kB 2.8MB/s 
 11 |     [?25hRequirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)
 12 |     Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)
 13 |     Collecting sentencepiece!=0.1.92
 14 |     [?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
 15 |     [K     |████████████████████████████████| 1.1MB 13.3MB/s 
 16 |     [?25hRequirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
 17 |     Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)
 18 |     Requirement already satisfied: dataclasses; python_version < "3.7" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)
 19 |     Collecting sacremoses
 20 |     [?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
 21 |     [K     |████████████████████████████████| 890kB 19.7MB/s 
 22 |     [?25hRequirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
 23 |     Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)
 24 |     Collecting tokenizers==0.8.1.rc1
 25 |     [?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
 26 |     [K     |████████████████████████████████| 3.0MB 20.6MB/s 
 27 |     [?25hRequirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)
 28 |     Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
 29 |     Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
 30 |     Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)
 31 |     Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (1.15.0)
 32 |     Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)
 33 |     Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.16.0)
 34 |     Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)
 35 |     Building wheels for collected packages: sacremoses
 36 |       Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
 37 |       Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=6bda503b3bbdbf7626ff38a3e3235c3a20e948d97c40c78a681f5c87b90ed237
 38 |       Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
 39 |     Successfully built sacremoses
 40 |     Installing collected packages: sentencepiece, sacremoses, tokenizers, transformers
 41 |     Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.8.1rc1 transformers-3.0.2
 42 | 
 43 | 
 44 | 
 45 | ```
 46 | import os, pandas as pd
 47 | from sklearn.model_selection import train_test_split
 48 | import logging
 49 | from transformers import *
 50 | import torch
 51 | from torch.utils.data import Dataset, DataLoader
 52 | import torch.nn as nn
 53 | from tqdm.autonotebook import tqdm
 54 | ```
 55 | 
 56 | # 1. Set Configuration
 57 | 
 58 | 
 59 | ```
 60 | class Config:
 61 |   train_file = './data.csv'
 62 |   eval_file = './eval.csv'
 63 |   max_seq_len = 128
 64 |   batch_size = 32
 65 |   epochs = 5
 66 |   model_name = 'bert-base-uncased'
 67 |   learning_rate = 2e-5
 68 |   n_classes = 3
 69 |   device = 'cpu'
 70 |   
 71 | 
 72 | 
 73 | flags = Config
 74 | ```
 75 | 
 76 | # 2. Build Dataset Pipeline
 77 | 
 78 | 
 79 | ```
 80 | class TextLabelDataset(Dataset):
 81 | 
 82 |     def __init__(self, texts, labels, tokenizer, max_len):
 83 |         self.texts = texts
 84 |         self.labels = labels
 85 |         self.tokenizer = tokenizer
 86 |         self.max_len = max_len
 87 |   
 88 |     def __len__(self):
 89 |         return len(self.texts)
 90 |   
 91 |     def __getitem__(self, item):
 92 |         text = str(self.texts[item])
 93 |         label = self.labels[item]
 94 | 
 95 |         encoding = self.tokenizer.encode_plus(
 96 |           text,
 97 |           add_special_tokens=True,
 98 |           max_length=self.max_len,
 99 |           return_token_type_ids=False,
100 |           pad_to_max_length=True,
101 |           return_attention_mask=True,
102 |           return_tensors='pt',
103 |           truncation=True
104 |         )
105 | 
106 |         return {
107 |           'texts': text,
108 |           'input_ids': encoding['input_ids'].flatten(),
109 |           'attention_mask': encoding['attention_mask'].flatten(),
110 |           'targets': torch.tensor(label, dtype=torch.long)
111 |         }
112 | 
113 | def create_data_loader(df, tokenizer, max_len, batch_size, is_prediction=False):
114 | 
115 |   if isinstance(df, str):
116 |     df = pd.read_csv(df)
117 |   else:
118 |     pass
119 | 
120 |   if is_prediction:
121 |     ds = TextLabelDataset(
122 |         texts=df.text.to_numpy(),
123 |         labels=np.array([-1]*len(df.text.values)),
124 |         tokenizer=tokenizer,
125 |         max_len=max_len
126 |         )
127 |   else:
128 |     ds = TextLabelDataset(
129 |         texts=df.text.to_numpy(),
130 |         labels=df.labels.to_numpy(),
131 |         tokenizer=tokenizer,
132 |         max_len=max_len
133 |         )
134 | 
135 |     return DataLoader(
136 |         ds,
137 |         batch_size=batch_size,
138 |         num_workers=4
139 |         )
140 | ```
141 | 
142 | # 3. Build Model 
143 | 
144 | 
145 | ```
146 | class Classifier(nn.Module):
147 | 
148 |   def __init__(self, model_name, n_classes):
149 |       super(Classifier, self).__init__()
150 |       self.bert = AutoModel.from_pretrained(model_name)
151 |       self.drop = nn.Dropout(p=0.3)
152 |       self.out = nn.Linear(self.bert.config.hidden_size, n_classes)
153 | 
154 |   def forward(self, input_ids, attention_mask):
155 |       _, pooled_output = self.bert(
156 |         input_ids=input_ids,
157 |         attention_mask=attention_mask
158 |       )
159 |       output = self.drop(pooled_output)
160 |       return self.out(output)
161 | ```
162 | 
163 | 
164 | ```
165 | class ClassificationModel:
166 | 
167 |   def __init__(self, flags):
168 |     self.flags = flags
169 |     self.tokenizer = BertTokenizer.from_pretrained(self.flags.model_name)
170 |     self.model = Classifier(self.flags.model_name, self.flags.n_classes)
171 |     self.model = self.model.to(self.flags.device)
172 | 
173 |   def train(self):
174 | 
175 |     train_data_loader = create_data_loader(self.flags.train_file, self.tokenizer, self.flags.max_seq_len, self.flags.batch_size)
176 |     val_data_loader = create_data_loader(self.flags.eval_file, self.tokenizer, self.flags.max_seq_len, self.flags.batch_size)
177 | 
178 |     optimizer = AdamW(self.model.parameters(), lr=self.flags.learning_rate, correct_bias=False)
179 |     total_steps = len(train_data_loader) * self.flags.epochs
180 | 
181 |     scheduler = get_linear_schedule_with_warmup(
182 |       optimizer,
183 |       num_warmup_steps=0,
184 |       num_training_steps=total_steps
185 |     )
186 | 
187 |     loss_fn = nn.CrossEntropyLoss().to(self.flags.device)
188 | 
189 |     history = defaultdict(list)
190 |     best_accuracy = 0
191 | 
192 |     if isinstance(self.flags.train_file, str):
193 |       train_df = pd.read_csv(self.flags.train_file)
194 | 
195 |     if isinstance(self.flags.eval_file, str):
196 |       eval_df = pd.read_csv(self.flags.eval_file)
197 | 
198 |     for epoch in range(self.flags.epochs):
199 | 
200 |       print(f'Epoch {epoch + 1}/{self.flags.epochs}')
201 |       print('-' * 10)
202 | 
203 |       train_acc, train_loss = self.train_epoch(
204 |         self.model,
205 |         train_data_loader,    
206 |         loss_fn, 
207 |         optimizer, 
208 |         self.flags.device, 
209 |         scheduler, 
210 |         len(train_df)
211 |       )
212 | 
213 |       print(f'Train loss {train_loss} accuracy {train_acc}')
214 | 
215 |       val_acc, val_loss = self.eval_model(
216 |         self.model,
217 |         val_data_loader,
218 |         loss_fn, 
219 |         self.flags.device, 
220 |         len(eval_df)
221 |       )
222 | 
223 |       print(f'Val   loss {val_loss} accuracy {val_acc}')
224 |       print()
225 | 
226 |       history['train_acc'].append(train_acc)
227 |       history['train_loss'].append(train_loss)
228 |       history['val_acc'].append(val_acc)
229 |       history['val_loss'].append(val_loss)
230 | 
231 |       if val_acc > best_accuracy:
232 |         torch.save(self.model.state_dict(), 'best_model_state.bin')
233 |         best_accuracy = val_acc
234 | 
235 | 
236 |   def train_epoch(self, model, data_loader, loss_fn, optimizer, device, scheduler, n_examples):
237 |     model = model.train()
238 | 
239 |     losses = []
240 |     correct_predictions = 0
241 |     tk0 = tqdm(data_loader, total=len(data_loader), desc="Training")
242 |     for bi, d in enumerate(tk0):
243 |       input_ids = d["input_ids"].to(device)
244 |       attention_mask = d["attention_mask"].to(device)
245 |       targets = d["targets"].to(device)
246 | 
247 |       outputs = model(
248 |         input_ids=input_ids,
249 |         attention_mask=attention_mask
250 |       )
251 | 
252 |       _, preds = torch.max(outputs, dim=1)
253 |       loss = loss_fn(outputs, targets)
254 | 
255 |       correct_predictions += torch.sum(preds == targets)
256 |       losses.append(loss.item())
257 | 
258 |       loss.backward()
259 |       nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
260 |       optimizer.step()
261 |       scheduler.step()
262 |       optimizer.zero_grad()
263 | 
264 |     return correct_predictions.double() / n_examples, np.mean(losses)
265 | 
266 |   def eval_model(self, model, data_loader, loss_fn, device, n_examples):
267 |     model = model.eval()
268 | 
269 |     losses = []
270 |     correct_predictions = 0
271 | 
272 |     with torch.no_grad():
273 |       tk0 = tqdm(data_loader, total=len(data_loader), desc="Evaluating")
274 |       for bi, d in enumerate(tk0):
275 |         input_ids = d["input_ids"].to(device)
276 |         attention_mask = d["attention_mask"].to(device)
277 |         targets = d["targets"].to(device)
278 | 
279 |         outputs = model(
280 |           input_ids=input_ids,
281 |           attention_mask=attention_mask
282 |         )
283 |         _, preds = torch.max(outputs, dim=1)
284 | 
285 |         loss = loss_fn(outputs, targets)
286 | 
287 |         correct_predictions += torch.sum(preds == targets)
288 |         losses.append(loss.item())
289 | 
290 |     return correct_predictions.double() / n_examples, np.mean(losses)
291 | 
292 | 
293 |     
294 | ```
295 | 
296 | ### Download Data and Preparation
297 | 
298 | 
299 | ```
300 | !wget https://raw.githubusercontent.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/master/imdb_tr.csv
301 | ```
302 | 
303 |     --2020-08-26 15:57:14--  https://raw.githubusercontent.com/SrinidhiRaghavan/AI-Sentiment-Analysis-on-IMDB-Dataset/master/imdb_tr.csv
304 |     Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
305 |     Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
306 |     HTTP request sent, awaiting response... 200 OK
307 |     Length: 23677025 (23M) [text/plain]
308 |     Saving to: ‘imdb_tr.csv’
309 |     
310 |     imdb_tr.csv         100%[===================>]  22.58M  39.9MB/s    in 0.6s    
311 |     
312 |     2020-08-26 15:57:16 (39.9 MB/s) - ‘imdb_tr.csv’ saved [23677025/23677025]
313 |     
314 | 
315 | 
316 | 
317 | ```
318 | data = pd.read_csv('imdb_tr.csv', encoding = "ISO-8859-1")
319 | ```
320 | 
321 | 
322 | ```
323 | data.columns = ['row_Number', 'text', 'labels']
324 | ```
325 | 
326 | 
327 | ```
328 | train_data = data.sample(1000)
329 | test_data = data.sample(100)
330 | 
331 | ```
332 | 
333 | 
334 | ```
335 | train_data.to_csv('data.csv', index=False)
336 | test_data.to_csv('eval.csv', index=False)
337 | ```
338 | 
339 | # Training
340 | 
341 | 
342 | ```
343 | from collections import defaultdict
344 | import numpy as np
345 | ```
346 | 
347 | 
348 | ```
349 | class Config:
350 |   train_file = './data.csv'
351 |   eval_file = './eval.csv'
352 |   max_seq_len = 128
353 |   batch_size = 32
354 |   epochs = 5
355 |   model_name = 'bert-base-uncased'
356 |   learning_rate = 2e-5
357 |   n_classes = 2
358 |   device = 'cuda'
359 |   
360 | flags = Config
361 | ```
362 | 
363 | 
364 | ```
365 | classification = ClassificationModel(flags)
366 | ```
367 | 
368 | 
369 | ```
370 | classification.train()
371 | ```
372 | 
373 |     Epoch 1/5
374 |     ----------
375 | 
376 | 
377 | 
378 |     HBox(children=(FloatProgress(value=0.0, description='Training', max=32.0, style=ProgressStyle(description_widt…
379 | 
380 | 
381 |     
382 |     Train loss 0.6540139000862837 accuracy 0.622
383 | 
384 | 
385 | 
386 |     HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=4.0, style=ProgressStyle(description_wid…
387 | 
388 | 
389 |     
390 |     Val   loss 0.4141501262784004 accuracy 0.78
391 |     
392 |     Epoch 2/5
393 |     ----------
394 | 
395 | 
396 | 
397 |     HBox(children=(FloatProgress(value=0.0, description='Training', max=32.0, style=ProgressStyle(description_widt…
398 | 
399 | 
400 |     
401 |     Train loss 0.3276493112789467 accuracy 0.864
402 | 
403 | 
404 | 
405 |     HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=4.0, style=ProgressStyle(description_wid…
406 | 
407 | 
408 |     
409 |     Val   loss 0.3254726273007691 accuracy 0.87
410 |     
411 |     Epoch 3/5
412 |     ----------
413 | 
414 | 
415 | 
416 |     HBox(children=(FloatProgress(value=0.0, description='Training', max=32.0, style=ProgressStyle(description_widt…
417 | 
418 | 
419 |     
420 |     Train loss 0.12970392164424993 accuracy 0.9530000000000001
421 | 
422 | 
423 | 
424 |     HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=4.0, style=ProgressStyle(description_wid…
425 | 
426 | 
427 |     
428 |     Val   loss 0.4319960339926183 accuracy 0.8300000000000001
429 |     
430 |     Epoch 4/5
431 |     ----------
432 | 
433 | 
434 | 
435 |     HBox(children=(FloatProgress(value=0.0, description='Training', max=32.0, style=ProgressStyle(description_widt…
436 | 
437 | 
438 |     
439 |     Train loss 0.0639086696319282 accuracy 0.982
440 | 
441 | 
442 | 
443 |     HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=4.0, style=ProgressStyle(description_wid…
444 | 
445 | 
446 |     
447 |     Val   loss 0.5208611574489623 accuracy 0.8300000000000001
448 |     
449 |     Epoch 5/5
450 |     ----------
451 | 
452 | 
453 | 
454 |     HBox(children=(FloatProgress(value=0.0, description='Training', max=32.0, style=ProgressStyle(description_widt…
455 | 
456 | 
457 |     
458 |     Train loss 0.01748604617023375 accuracy 0.996
459 | 
460 | 
461 | 
462 |     HBox(children=(FloatProgress(value=0.0, description='Evaluating', max=4.0, style=ProgressStyle(description_wid…
463 | 
464 | 
465 |     
466 |     Val   loss 0.4388579736405518 accuracy 0.9
467 |     
468 | 
469 | 
470 | 
471 | ```
472 | 
473 | ```
474 | 


--------------------------------------------------------------------------------
/docs/Question_Answering_with_a_Fine_Tuned_BERT.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/Question_Answering_with_a_Fine_Tuned_BERT.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | # Question Answering with a Fine-Tuned BERT
  4 | *by Ankur Singh*
  5 | 
  6 | # Part 1: How BERT is applied to Question Answering
  7 | 
  8 | ## The SQuAD v1.1 Benchmark
  9 | 
 10 | When someone mentions "Question Answering" as an application of BERT, what they are really referring to is applying BERT to the Stanford Question Answering Dataset (SQuAD).
 11 | 
 12 | The task posed by the SQuAD benchmark is a little different than you might think. Given a question, and *a passage of text containing the answer*, BERT needs to highlight the "span" of text corresponding to the correct answer. 
 13 | 
 14 | The SQuAD homepage has a fantastic tool for exploring the questions and reference text for this dataset, and even shows the predictions made by top-performing models.
 15 | 
 16 | For example, here are some [interesting examples](https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/Super_Bowl_50.html?model=r-net+%20(ensemble)%20(Microsoft%20Research%20Asia)&version=1.1) on the topic of Super Bowl 50.
 17 | 
 18 | 
 19 | ## BERT Input Format
 20 | 
 21 | To feed a QA task into BERT, we pack both the question and the reference text into the input.
 22 | 
 23 | ![Input format for QA](http://www.mccormickml.com/assets/BERT/SQuAD/input_formatting.png)
 24 | 
 25 | The two pieces of text are separated by the special `[SEP]` token. 
 26 | 
 27 | BERT also uses "Segment Embeddings" to differentiate the question from the reference text. These are simply two embeddings (for segments "A" and "B") that BERT learned, and which it adds to the token embeddings before feeding them into the input layer. 
 28 | 
 29 | ## Start & End Token Classifiers
 30 | 
 31 | BERT needs to highlight a "span" of text containing the answer--this is represented as simply predicting which token marks the start of the answer, and which token marks the end.
 32 | 
 33 | ![Start token classification](http://www.mccormickml.com/assets/BERT/SQuAD/start_token_classification.png)
 34 | 
 35 | For every token in the text, we feed its final embedding into the start token classifier. The start token classifier only has a single set of weights (represented by the blue "start" rectangle in the above illustration) which it applies to every word.
 36 | 
 37 | After taking the dot product between the output embeddings and the 'start' weights, we apply the softmax activation to produce a probability distribution over all of the words. Whichever word has the highest probability of being the start token is the one that we pick.
 38 | 
 39 | We repeat this process for the end token--we have a separate weight vector this.
 40 | 
 41 | ![End token classification](http://www.mccormickml.com/assets/BERT/SQuAD/end_token_classification.png)
 42 | 
 43 | # Part 2: Example Code
 44 | 
 45 | In the example code below, we'll be downloading a model that's *already been fine-tuned* for question answering, and try it out on our own text.
 46 | 
 47 | If you do want to fine-tune on your own dataset, it is possible to fine-tune BERT for question answering yourself. See [run_squad.py](https://github.com/huggingface/transformers/blob/master/examples/run_squad.py) in the `transformers` library. However,you may find that the below "fine-tuned-on-squad" model already does a good job, even if your text is from a different domain. 
 48 | 
 49 | > Note: The example code in this Notebook is a commented and expanded version of the short example provided in the `transformers` documentation [here](https://huggingface.co/transformers/model_doc/bert.html?highlight=bertforquestionanswering#transformers.BertForQuestionAnswering).
 50 | 
 51 | ## 1. Install huggingface transformers library
 52 | 
 53 | This example uses the `transformers` [library](https://github.com/huggingface/transformers/) by huggingface. We'll start by installing the package.
 54 | 
 55 | 
 56 | ```
 57 | !pip install transformers
 58 | ```
 59 | 
 60 |     Collecting transformers
 61 |     [?25l  Downloading https://files.pythonhosted.org/packages/27/3c/91ed8f5c4e7ef3227b4119200fc0ed4b4fd965b1f0172021c25701087825/transformers-3.0.2-py3-none-any.whl (769kB)
 62 |     [K     |████████████████████████████████| 778kB 8.6MB/s 
 63 |     [?25hRequirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers) (20.4)
 64 |     Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers) (2019.12.20)
 65 |     Requirement already satisfied: dataclasses; python_version < "3.7" in /usr/local/lib/python3.6/dist-packages (from transformers) (0.7)
 66 |     Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers) (2.23.0)
 67 |     Collecting sacremoses
 68 |     [?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
 69 |     [K     |████████████████████████████████| 890kB 14.4MB/s 
 70 |     [?25hCollecting tokenizers==0.8.1.rc1
 71 |     [?25l  Downloading https://files.pythonhosted.org/packages/40/d0/30d5f8d221a0ed981a186c8eb986ce1c94e3a6e87f994eae9f4aa5250217/tokenizers-0.8.1rc1-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)
 72 |     [K     |████████████████████████████████| 3.0MB 43.4MB/s 
 73 |     [?25hCollecting sentencepiece!=0.1.92
 74 |     [?25l  Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)
 75 |     [K     |████████████████████████████████| 1.1MB 44.5MB/s 
 76 |     [?25hRequirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers) (3.0.12)
 77 |     Requirement already satisfied: tqdm>=4.27 in /usr/local/lib/python3.6/dist-packages (from transformers) (4.41.1)
 78 |     Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from transformers) (1.18.5)
 79 |     Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (1.15.0)
 80 |     Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers) (2.4.7)
 81 |     Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2020.6.20)
 82 |     Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (2.10)
 83 |     Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (3.0.4)
 84 |     Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers) (1.24.3)
 85 |     Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (7.1.2)
 86 |     Requirement already satisfied: joblib in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers) (0.16.0)
 87 |     Building wheels for collected packages: sacremoses
 88 |       Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
 89 |       Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=09966dce42bbfaa2aa5abc0f5f82653b74458d122a6fefeb7c895da49b40ce50
 90 |       Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45
 91 |     Successfully built sacremoses
 92 |     Installing collected packages: sacremoses, tokenizers, sentencepiece, transformers
 93 |     Successfully installed sacremoses-0.0.43 sentencepiece-0.1.91 tokenizers-0.8.1rc1 transformers-3.0.2
 94 | 
 95 | 
 96 | 
 97 | ```
 98 | import torch
 99 | ```
100 | 
101 | ## 2. Load Fine-Tuned BERT-large
102 | 
103 | For Question Answering we use the `BertForQuestionAnswering` class from the `transformers` library.
104 | 
105 | This class supports fine-tuning, but for this example we will keep things simpler and load a BERT model that has already been fine-tuned for the SQuAD benchmark.
106 | 
107 | The `transformers` library has a large collection of pre-trained models which you can reference by name and load easily. The full list is in their documentation [here](https://huggingface.co/transformers/pretrained_models.html).
108 | 
109 | For Question Answering, they have a version of BERT-large that has already been fine-tuned for the SQuAD benchmark. 
110 | 
111 | BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance. 
112 | 
113 | (Note that this download is not using your own network bandwidth--it's between the Google instance and wherever the model is stored on the web).
114 | 
115 | Note: I believe this model was trained on version 1 of SQuAD, since it's not outputting whether the question is "impossible" to answer from the text (which is part of the task in v2 of SQuAD).
116 | 
117 | 
118 | 
119 | ```
120 | from transformers import BertForQuestionAnswering
121 | 
122 | model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
123 | 
124 | ```
125 | 
126 | 
127 |     HBox(children=(FloatProgress(value=0.0, description='Downloading', max=443.0, style=ProgressStyle(description_…
128 | 
129 | 
130 |     
131 | 
132 | 
133 | 
134 |     HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1340675298.0, style=ProgressStyle(descr…
135 | 
136 | 
137 |     
138 | 
139 | 
140 | Load the tokenizer as well. 
141 | 
142 | Side note: Apparently the vocabulary of this model is identicaly to the one in bert-base-uncased. You can load the tokenizer from `bert-base-uncased` and that works just as well.
143 | 
144 | 
145 | ```
146 | from transformers import BertTokenizer
147 | 
148 | tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')
149 | ```
150 | 
151 | 
152 |     HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…
153 | 
154 | 
155 |     
156 | 
157 | 
158 | ## 3. Ask a Question
159 | 
160 | Now we're ready to feed in an example!
161 | 
162 | A QA example consists of a question and a passage of text containing the answer to that question.
163 | 
164 | Let's try an example using the text in this tutorial!
165 | 
166 | 
167 | ```
168 | question = "How many parameters does BERT-large have?"
169 | answer_text = "BERT-large is really big... it has 24-layers and an embedding size of 1,024, for a total of 340M parameters! Altogether it is 1.34GB, so expect it to take a couple minutes to download to your Colab instance."
170 | ```
171 | 
172 | We'll need to run the BERT tokenizer against both the `question` and the `answer_text`. To feed these into BERT, we actually concatenate them together and place the special [SEP] token in between.
173 | 
174 | 
175 | 
176 | ```
177 | # Apply the tokenizer to the input text, treating them as a text-pair.
178 | input_ids = tokenizer.encode(question, answer_text)
179 | 
180 | print('The input has a total of {:} tokens.'.format(len(input_ids)))
181 | ```
182 | 
183 |     The input has a total of 70 tokens.
184 | 
185 | 
186 | Just to see exactly what the tokenizer is doing, let's print out the tokens with their IDs.
187 | 
188 | 
189 | ```
190 | # BERT only needs the token IDs, but for the purpose of inspecting the 
191 | # tokenizer's behavior, let's also get the token strings and display them.
192 | tokens = tokenizer.convert_ids_to_tokens(input_ids)
193 | 
194 | # For each token and its id...
195 | for token, id in zip(tokens, input_ids):
196 |     
197 |     # If this is the [SEP] token, add some space around it to make it stand out.
198 |     if id == tokenizer.sep_token_id:
199 |         print('')
200 |     
201 |     # Print the token string and its ID in two columns.
202 |     print('{:<12} {:>6,}'.format(token, id))
203 | 
204 |     if id == tokenizer.sep_token_id:
205 |         print('')
206 |     
207 | ```
208 | 
209 |     [CLS]           101
210 |     how           2,129
211 |     many          2,116
212 |     parameters   11,709
213 |     does          2,515
214 |     bert         14,324
215 |     -             1,011
216 |     large         2,312
217 |     have          2,031
218 |     ?             1,029
219 |     
220 |     [SEP]           102
221 |     
222 |     bert         14,324
223 |     -             1,011
224 |     large         2,312
225 |     is            2,003
226 |     really        2,428
227 |     big           2,502
228 |     .             1,012
229 |     .             1,012
230 |     .             1,012
231 |     it            2,009
232 |     has           2,038
233 |     24            2,484
234 |     -             1,011
235 |     layers        9,014
236 |     and           1,998
237 |     an            2,019
238 |     em            7,861
239 |     ##bed         8,270
240 |     ##ding        4,667
241 |     size          2,946
242 |     of            1,997
243 |     1             1,015
244 |     ,             1,010
245 |     02            6,185
246 |     ##4           2,549
247 |     ,             1,010
248 |     for           2,005
249 |     a             1,037
250 |     total         2,561
251 |     of            1,997
252 |     340          16,029
253 |     ##m           2,213
254 |     parameters   11,709
255 |     !               999
256 |     altogether   10,462
257 |     it            2,009
258 |     is            2,003
259 |     1             1,015
260 |     .             1,012
261 |     34            4,090
262 |     ##gb         18,259
263 |     ,             1,010
264 |     so            2,061
265 |     expect        5,987
266 |     it            2,009
267 |     to            2,000
268 |     take          2,202
269 |     a             1,037
270 |     couple        3,232
271 |     minutes       2,781
272 |     to            2,000
273 |     download      8,816
274 |     to            2,000
275 |     your          2,115
276 |     cola         15,270
277 |     ##b           2,497
278 |     instance      6,013
279 |     .             1,012
280 |     
281 |     [SEP]           102
282 |     
283 | 
284 | 
285 | We've concatenated the `question` and `answer_text` together, but BERT still needs a way to distinguish them. BERT has two special "Segment" embeddings, one for segment "A" and one for segment "B". Before the word embeddings go into the BERT layers, the segment A embedding needs to be added to the `question` tokens, and the segment B embedding needs to be added to each of the `answer_text` tokens. 
286 | 
287 | These additions are handled for us by the `transformer` library, and all we need to do is specify a '0' or '1' for each token. 
288 | 
289 | Note: In the `transformers` library, huggingface likes to call these `token_type_ids`, but I'm going with `segment_ids` since this seems clearer, and is consistent with the BERT paper.
290 | 
291 | 
292 | ```
293 | # Search the input_ids for the first instance of the `[SEP]` token.
294 | sep_index = input_ids.index(tokenizer.sep_token_id)
295 | 
296 | # The number of segment A tokens includes the [SEP] token istelf.
297 | num_seg_a = sep_index + 1
298 | 
299 | # The remainder are segment B.
300 | num_seg_b = len(input_ids) - num_seg_a
301 | 
302 | # Construct the list of 0s and 1s.
303 | segment_ids = [0]*num_seg_a + [1]*num_seg_b
304 | 
305 | # There should be a segment_id for every input token.
306 | assert len(segment_ids) == len(input_ids)
307 | ```
308 | 
309 | >*Side Note: Where's the padding?*
310 | >
311 | > The original [example code](https://huggingface.co/transformers/model_doc/bert.html?highlight=bertforquestionanswering#transformers.BertForQuestionAnswering) does not perform any padding. I suspect that this is because we are only feeding in a *single example*. If we instead fed in a batch of examples, then we would need to pad or truncate all of the samples in the batch to a single length, and supply an attention mask to tell BERT to ignore the padding tokens. 
312 | 
313 | We're ready to feed our example into the model!
314 | 
315 | 
316 | 
317 | 
318 | ```
319 | # Run our example through the model.
320 | start_scores, end_scores = model(torch.tensor([input_ids]), # The tokens representing our input text.
321 |                                  token_type_ids=torch.tensor([segment_ids])) # The segment IDs to differentiate question from answer_text
322 | 
323 | ```
324 | 
325 | Now we can highlight the answer just by looking at the most probable start and end words. 
326 | 
327 | 
328 | ```
329 | # Find the tokens with the highest `start` and `end` scores.
330 | answer_start = torch.argmax(start_scores)
331 | answer_end = torch.argmax(end_scores)
332 | 
333 | # Combine the tokens in the answer and print it out.
334 | answer = ' '.join(tokens[answer_start:answer_end+1])
335 | 
336 | print('Answer: "' + answer + '"')
337 | ```
338 | 
339 |     Answer: "340 ##m"
340 | 
341 | 
342 | It got it right! Awesome :)
343 | 
344 | > *Side Note: It's a little naive to pick the highest scores for start and end--what if it predicts an end word that's before the start word?! The correct implementation is to pick the highest total score for which end >= start.*
345 | 
346 | With a little more effort, we can reconstruct any words that got broken down into subwords.
347 | 
348 | 
349 | ```
350 | # Start with the first token.
351 | answer = tokens[answer_start]
352 | 
353 | # Select the remaining answer tokens and join them with whitespace.
354 | for i in range(answer_start + 1, answer_end + 1):
355 |     
356 |     # If it's a subword token, then recombine it with the previous token.
357 |     if tokens[i][0:2] == '##':
358 |         answer += tokens[i][2:]
359 |     
360 |     # Otherwise, add a space then the token.
361 |     else:
362 |         answer += ' ' + tokens[i]
363 | 
364 | print('Answer: "' + answer + '"')
365 | ```
366 | 
367 |     Answer: "340m"
368 | 
369 | 
370 | ## 4. Visualizing Scores
371 | 
372 | I was curious to see what the scores were for all of the words. The following cells generate bar plots showing the start and end scores for every word in the input.
373 | 
374 | 
375 | ```
376 | import matplotlib.pyplot as plt
377 | import seaborn as sns
378 | 
379 | # Use plot styling from seaborn.
380 | sns.set(style='darkgrid')
381 | 
382 | # Increase the plot size and font size.
383 | #sns.set(font_scale=1.5)
384 | plt.rcParams["figure.figsize"] = (16,8)
385 | ```
386 | 
387 | Retrieve all of the start and end scores, and use all of the tokens as x-axis labels.
388 | 
389 | 
390 | ```
391 | # Pull the scores out of PyTorch Tensors and convert them to 1D numpy arrays.
392 | s_scores = start_scores.detach().numpy().flatten()
393 | e_scores = end_scores.detach().numpy().flatten()
394 | 
395 | # We'll use the tokens as the x-axis labels. In order to do that, they all need
396 | # to be unique, so we'll add the token index to the end of each one.
397 | token_labels = []
398 | for (i, token) in enumerate(tokens):
399 |     token_labels.append('{:} - {:>2}'.format(token, i))
400 | 
401 | ```
402 | 
403 | Create a bar plot showing the score for every input word being the "start" word.
404 | 
405 | 
406 | ```
407 | # Create a barplot showing the start word score for all of the tokens.
408 | ax = sns.barplot(x=token_labels, y=s_scores, ci=None)
409 | 
410 | # Turn the xlabels vertical.
411 | ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center")
412 | 
413 | # Turn on the vertical grid to help align words to scores.
414 | ax.grid(True)
415 | 
416 | plt.title('Start Word Scores')
417 | 
418 | plt.show()
419 | ```
420 | 
421 | 
422 |     
423 | ![png](Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_44_0.png)
424 |     
425 | 
426 | 
427 | Create a second bar plot showing the score for every input word being the "end" word.
428 | 
429 | 
430 | ```
431 | # Create a barplot showing the end word score for all of the tokens.
432 | ax = sns.barplot(x=token_labels, y=e_scores, ci=None)
433 | 
434 | # Turn the xlabels vertical.
435 | ax.set_xticklabels(ax.get_xticklabels(), rotation=90, ha="center")
436 | 
437 | # Turn on the vertical grid to help align words to scores.
438 | ax.grid(True)
439 | 
440 | plt.title('End Word Scores')
441 | 
442 | plt.show()
443 | ```
444 | 
445 | 
446 |     
447 | ![png](Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_46_0.png)
448 |     
449 | 
450 | 
451 | **Alternate View**
452 | 
453 | I also tried visualizing both the start and end scores on a single bar plot, but I think it may actually be more confusing then seeing them separately. 
454 | 
455 | 
456 | ```
457 | import pandas as pd
458 | 
459 | # Store the tokens and scores in a DataFrame. 
460 | # Each token will have two rows, one for its start score and one for its end
461 | # score. The "marker" column will differentiate them. A little wacky, I know.
462 | scores = []
463 | for (i, token_label) in enumerate(token_labels):
464 | 
465 |     # Add the token's start score as one row.
466 |     scores.append({'token_label': token_label, 
467 |                    'score': s_scores[i],
468 |                    'marker': 'start'})
469 |     
470 |     # Add  the token's end score as another row.
471 |     scores.append({'token_label': token_label, 
472 |                    'score': e_scores[i],
473 |                    'marker': 'end'})
474 |     
475 | df = pd.DataFrame(scores)
476 | 
477 | ```
478 | 
479 | 
480 | ```
481 | # Draw a grouped barplot to show start and end scores for each word.
482 | # The "hue" parameter is where we tell it which datapoints belong to which
483 | # of the two series.
484 | g = sns.catplot(x="token_label", y="score", hue="marker", data=df,
485 |                 kind="bar", height=6, aspect=4)
486 | 
487 | # Turn the xlabels vertical.
488 | g.set_xticklabels(g.ax.get_xticklabels(), rotation=90, ha="center")
489 | 
490 | # Turn on the vertical grid to help align words to scores.
491 | g.ax.grid(True)
492 | 
493 | ```
494 | 
495 | 
496 |     
497 | ![png](Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_49_0.png)
498 |     
499 | 
500 | 
501 | ## 5. More Examples
502 | 
503 | Turn the QA process into a function so we can easily try out other examples.
504 | 
505 | 
506 | ```
507 | def answer_question(question, answer_text):
508 |     '''
509 |     Takes a `question` string and an `answer_text` string (which contains the
510 |     answer), and identifies the words within the `answer_text` that are the
511 |     answer. Prints them out.
512 |     '''
513 |     # ======== Tokenize ========
514 |     # Apply the tokenizer to the input text, treating them as a text-pair.
515 |     input_ids = tokenizer.encode(question, answer_text)
516 | 
517 |     # Report how long the input sequence is.
518 |     print('Query has {:,} tokens.\n'.format(len(input_ids)))
519 | 
520 |     # ======== Set Segment IDs ========
521 |     # Search the input_ids for the first instance of the `[SEP]` token.
522 |     sep_index = input_ids.index(tokenizer.sep_token_id)
523 | 
524 |     # The number of segment A tokens includes the [SEP] token istelf.
525 |     num_seg_a = sep_index + 1
526 | 
527 |     # The remainder are segment B.
528 |     num_seg_b = len(input_ids) - num_seg_a
529 | 
530 |     # Construct the list of 0s and 1s.
531 |     segment_ids = [0]*num_seg_a + [1]*num_seg_b
532 | 
533 |     # There should be a segment_id for every input token.
534 |     assert len(segment_ids) == len(input_ids)
535 | 
536 |     # ======== Evaluate ========
537 |     # Run our example question through the model.
538 |     start_scores, end_scores = model(torch.tensor([input_ids]), # The tokens representing our input text.
539 |                                     token_type_ids=torch.tensor([segment_ids])) # The segment IDs to differentiate question from answer_text
540 | 
541 |     # ======== Reconstruct Answer ========
542 |     # Find the tokens with the highest `start` and `end` scores.
543 |     answer_start = torch.argmax(start_scores)
544 |     answer_end = torch.argmax(end_scores)
545 | 
546 |     # Get the string versions of the input tokens.
547 |     tokens = tokenizer.convert_ids_to_tokens(input_ids)
548 | 
549 |     # Start with the first token.
550 |     answer = tokens[answer_start]
551 | 
552 |     # Select the remaining answer tokens and join them with whitespace.
553 |     for i in range(answer_start + 1, answer_end + 1):
554 |         
555 |         # If it's a subword token, then recombine it with the previous token.
556 |         if tokens[i][0:2] == '##':
557 |             answer += tokens[i][2:]
558 |         
559 |         # Otherwise, add a space then the token.
560 |         else:
561 |             answer += ' ' + tokens[i]
562 | 
563 |     print('Answer: "' + answer + '"')
564 | ```
565 | 
566 | As our reference text, I've taken the Abstract of the [BERT paper](https://arxiv.org/pdf/1810.04805.pdf).
567 | 
568 | 
569 | 
570 | ```
571 | import textwrap
572 | 
573 | # Wrap text to 80 characters.
574 | wrapper = textwrap.TextWrapper(width=80) 
575 | 
576 | bert_abstract = "We introduce a new language representation model called BERT, which stands for Bidirectional Encoder Representations from Transformers. Unlike recent language representation models (Peters et al., 2018a; Radford et al., 2018), BERT is designed to pretrain deep bidirectional representations from unlabeled text by jointly conditioning on both left and right context in all layers. As a result, the pre-trained BERT model can be finetuned with just one additional output layer to create state-of-the-art models for a wide range of tasks, such as question answering and language inference, without substantial taskspecific architecture modifications. BERT is conceptually simple and empirically powerful. It obtains new state-of-the-art results on eleven natural language processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1 question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD v2.0 Test F1 to 83.1 (5.1 point absolute improvement)."
577 | 
578 | print(wrapper.fill(bert_abstract))
579 | ```
580 | 
581 |     We introduce a new language representation model called BERT, which stands for
582 |     Bidirectional Encoder Representations from Transformers. Unlike recent language
583 |     representation models (Peters et al., 2018a; Radford et al., 2018), BERT is
584 |     designed to pretrain deep bidirectional representations from unlabeled text by
585 |     jointly conditioning on both left and right context in all layers. As a result,
586 |     the pre-trained BERT model can be finetuned with just one additional output
587 |     layer to create state-of-the-art models for a wide range of tasks, such as
588 |     question answering and language inference, without substantial taskspecific
589 |     architecture modifications. BERT is conceptually simple and empirically
590 |     powerful. It obtains new state-of-the-art results on eleven natural language
591 |     processing tasks, including pushing the GLUE score to 80.5% (7.7% point absolute
592 |     improvement), MultiNLI accuracy to 86.7% (4.6% absolute improvement), SQuAD v1.1
593 |     question answering Test F1 to 93.2 (1.5 point absolute improvement) and SQuAD
594 |     v2.0 Test F1 to 83.1 (5.1 point absolute improvement).
595 | 
596 | 
597 | -----------------------------
598 | Ask BERT what its name stands for (the answer is in the first sentence of the abstract).
599 | 
600 | 
601 | ```
602 | question = "What does the 'B' in BERT stand for?"
603 | 
604 | answer_question(question, bert_abstract)
605 | ```
606 | 
607 |     Query has 258 tokens.
608 |     
609 |     Answer: "bidirectional encoder representations from transformers"
610 | 
611 | 
612 | ---------------------
613 | Ask BERT about example applications of itself :)
614 | 
615 | The answer to the question comes from this passage from the abstract: 
616 | 
617 | > "...BERT model can be finetuned with just one additional output
618 | layer to create state-of-the-art models for **a wide range of tasks, such as
619 | question answering and language inference,** without substantial taskspecific
620 | architecture modifications."
621 | 
622 | 
623 | ```
624 | question = "What are some example applications of BERT?"
625 | 
626 | answer_question(question, bert_abstract)
627 | ```
628 | 
629 |     Query has 255 tokens.
630 |     
631 |     Answer: "question answering and language inference"
632 | 
633 | 
634 | 
635 | ```
636 | 
637 | ```
638 | 


--------------------------------------------------------------------------------
/docs/Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_44_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_44_0.png


--------------------------------------------------------------------------------
/docs/Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_46_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_46_0.png


--------------------------------------------------------------------------------
/docs/Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_49_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/Question_Answering_with_a_Fine_Tuned_BERT_files/Question_Answering_with_a_Fine_Tuned_BERT_49_0.png


--------------------------------------------------------------------------------
/docs/Seq2Seq_Pytorch.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/Seq2Seq_Pytorch.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | 
  4 | ```
  5 | !pip install spacy --upgrade
  6 | ```
  7 | 
  8 |     Collecting spacy
  9 |     [?25l  Downloading https://files.pythonhosted.org/packages/10/b5/c7a92c7ce5d4b353b70b4b5b4385687206c8b230ddfe08746ab0fd310a3a/spacy-2.3.2-cp36-cp36m-manylinux1_x86_64.whl (9.9MB)
 10 |     [K     |████████████████████████████████| 10.0MB 3.9MB/s 
 11 |     [?25hRequirement already satisfied, skipping upgrade: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.0.2)
 12 |     Requirement already satisfied, skipping upgrade: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (0.7.1)
 13 |     Requirement already satisfied, skipping upgrade: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.0.0)
 14 |     Requirement already satisfied, skipping upgrade: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy) (2.0.3)
 15 |     Requirement already satisfied, skipping upgrade: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy) (49.6.0)
 16 |     Collecting thinc==7.4.1
 17 |     [?25l  Downloading https://files.pythonhosted.org/packages/10/ae/ef3ae5e93639c0ef8e3eb32e3c18341e511b3c515fcfc603f4b808087651/thinc-7.4.1-cp36-cp36m-manylinux1_x86_64.whl (2.1MB)
 18 |     [K     |████████████████████████████████| 2.1MB 18.3MB/s 
 19 |     [?25hRequirement already satisfied, skipping upgrade: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (2.23.0)
 20 |     Requirement already satisfied, skipping upgrade: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (4.41.1)
 21 |     Requirement already satisfied, skipping upgrade: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.1.3)
 22 |     Requirement already satisfied, skipping upgrade: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (0.4.1)
 23 |     Requirement already satisfied, skipping upgrade: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.0.2)
 24 |     Requirement already satisfied, skipping upgrade: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy) (1.18.5)
 25 |     Requirement already satisfied, skipping upgrade: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy) (3.0.2)
 26 |     Requirement already satisfied, skipping upgrade: importlib-metadata>=0.20; python_version < "3.8" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy) (1.7.0)
 27 |     Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (3.0.4)
 28 |     Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2.10)
 29 |     Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (1.24.3)
 30 |     Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy) (2020.6.20)
 31 |     Requirement already satisfied, skipping upgrade: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy) (3.1.0)
 32 |     Installing collected packages: thinc, spacy
 33 |       Found existing installation: thinc 7.4.0
 34 |         Uninstalling thinc-7.4.0:
 35 |           Successfully uninstalled thinc-7.4.0
 36 |       Found existing installation: spacy 2.2.4
 37 |         Uninstalling spacy-2.2.4:
 38 |           Successfully uninstalled spacy-2.2.4
 39 |     Successfully installed spacy-2.3.2 thinc-7.4.1
 40 | 
 41 | 
 42 | 
 43 | ```
 44 | !python -m spacy download en
 45 | !python -m spacy download de
 46 | !python -m spacy download hi
 47 | ```
 48 | 
 49 |     Collecting en_core_web_sm==2.3.1
 50 |     [?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.3.1/en_core_web_sm-2.3.1.tar.gz (12.0MB)
 51 |     [K     |████████████████████████████████| 12.1MB 803kB/s 
 52 |     [?25hRequirement already satisfied: spacy<2.4.0,>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from en_core_web_sm==2.3.1) (2.3.2)
 53 |     Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (0.7.1)
 54 |     Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.2)
 55 |     Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (0.4.1)
 56 |     Requirement already satisfied: thinc==7.4.1 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (7.4.1)
 57 |     Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.2)
 58 |     Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.0.0)
 59 |     Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.18.5)
 60 |     Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (3.0.2)
 61 |     Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.1.3)
 62 |     Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (4.41.1)
 63 |     Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.23.0)
 64 |     Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (49.6.0)
 65 |     Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.0.3)
 66 |     Requirement already satisfied: importlib-metadata>=0.20; python_version < "3.8" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.7.0)
 67 |     Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (3.0.4)
 68 |     Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (1.24.3)
 69 |     Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2020.6.20)
 70 |     Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (2.10)
 71 |     Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->en_core_web_sm==2.3.1) (3.1.0)
 72 |     Building wheels for collected packages: en-core-web-sm
 73 |       Building wheel for en-core-web-sm (setup.py) ... [?25l[?25hdone
 74 |       Created wheel for en-core-web-sm: filename=en_core_web_sm-2.3.1-cp36-none-any.whl size=12047109 sha256=c566e8eddcd63bc8259a784a72fae15d14da6e23d183b7a16bf6032b9aeaeed2
 75 |       Stored in directory: /tmp/pip-ephem-wheel-cache-76hzqxdi/wheels/2b/3f/41/f0b92863355c3ba34bb32b37d8a0c662959da0058202094f46
 76 |     Successfully built en-core-web-sm
 77 |     Installing collected packages: en-core-web-sm
 78 |       Found existing installation: en-core-web-sm 2.2.5
 79 |         Uninstalling en-core-web-sm-2.2.5:
 80 |           Successfully uninstalled en-core-web-sm-2.2.5
 81 |     Successfully installed en-core-web-sm-2.3.1
 82 |     [38;5;2m✔ Download and installation successful[0m
 83 |     You can now load the model via spacy.load('en_core_web_sm')
 84 |     [38;5;2m✔ Linking successful[0m
 85 |     /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
 86 |     /usr/local/lib/python3.6/dist-packages/spacy/data/en
 87 |     You can now load the model via spacy.load('en')
 88 |     Collecting de_core_news_sm==2.3.0
 89 |     [?25l  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_sm-2.3.0/de_core_news_sm-2.3.0.tar.gz (14.9MB)
 90 |     [K     |████████████████████████████████| 14.9MB 833kB/s 
 91 |     [?25hRequirement already satisfied: spacy<2.4.0,>=2.3.0 in /usr/local/lib/python3.6/dist-packages (from de_core_news_sm==2.3.0) (2.3.2)
 92 |     Requirement already satisfied: numpy>=1.15.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (1.18.5)
 93 |     Requirement already satisfied: tqdm<5.0.0,>=4.38.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (4.41.1)
 94 |     Requirement already satisfied: requests<3.0.0,>=2.13.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (2.23.0)
 95 |     Requirement already satisfied: catalogue<1.1.0,>=0.0.7 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (1.0.0)
 96 |     Requirement already satisfied: murmurhash<1.1.0,>=0.28.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (1.0.2)
 97 |     Requirement already satisfied: plac<1.2.0,>=0.9.6 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (1.1.3)
 98 |     Requirement already satisfied: blis<0.5.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (0.4.1)
 99 |     Requirement already satisfied: srsly<1.1.0,>=1.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (1.0.2)
100 |     Requirement already satisfied: setuptools in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (49.6.0)
101 |     Requirement already satisfied: thinc==7.4.1 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (7.4.1)
102 |     Requirement already satisfied: preshed<3.1.0,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (3.0.2)
103 |     Requirement already satisfied: wasabi<1.1.0,>=0.4.0 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (0.7.1)
104 |     Requirement already satisfied: cymem<2.1.0,>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (2.0.3)
105 |     Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (2020.6.20)
106 |     Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (3.0.4)
107 |     Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (2.10)
108 |     Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests<3.0.0,>=2.13.0->spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (1.24.3)
109 |     Requirement already satisfied: importlib-metadata>=0.20; python_version < "3.8" in /usr/local/lib/python3.6/dist-packages (from catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (1.7.0)
110 |     Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.6/dist-packages (from importlib-metadata>=0.20; python_version < "3.8"->catalogue<1.1.0,>=0.0.7->spacy<2.4.0,>=2.3.0->de_core_news_sm==2.3.0) (3.1.0)
111 |     Building wheels for collected packages: de-core-news-sm
112 |       Building wheel for de-core-news-sm (setup.py) ... [?25l[?25hdone
113 |       Created wheel for de-core-news-sm: filename=de_core_news_sm-2.3.0-cp36-none-any.whl size=14907580 sha256=959d3b6df2936d8e86bbc601dbd80eddfb310d3af7aca8895529c319cb58b539
114 |       Stored in directory: /tmp/pip-ephem-wheel-cache-59e5ledg/wheels/db/f3/1e/0df0f27eee12bd1aaa94bcfef11b01eca62f90b9b9a0ce08fd
115 |     Successfully built de-core-news-sm
116 |     Installing collected packages: de-core-news-sm
117 |       Found existing installation: de-core-news-sm 2.2.5
118 |         Uninstalling de-core-news-sm-2.2.5:
119 |           Successfully uninstalled de-core-news-sm-2.2.5
120 |     Successfully installed de-core-news-sm-2.3.0
121 |     [38;5;2m✔ Download and installation successful[0m
122 |     You can now load the model via spacy.load('de_core_news_sm')
123 |     [38;5;2m✔ Linking successful[0m
124 |     /usr/local/lib/python3.6/dist-packages/de_core_news_sm -->
125 |     /usr/local/lib/python3.6/dist-packages/spacy/data/de
126 |     You can now load the model via spacy.load('de')
127 |     
128 |     [38;5;1m✘ No compatible model found for 'hi' (spaCy v2.3.2).[0m
129 |     
130 | 
131 | 
132 | 
133 | ```
134 | import torch
135 | import torch.nn as nn
136 | import torch.optim as optim
137 | import spacy
138 | from torch.utils.tensorboard import SummaryWriter
139 | from torchtext.datasets import Multi30k
140 | from torchtext.data import Field, BucketIterator
141 | from spacy.lang.hi import Hindi
142 | ```
143 | 
144 | 
145 | ```
146 | spacy_ger = spacy.load("de")
147 | spacy_eng = spacy.load("en")
148 | 
149 | spacy_hi = Hindi()
150 | def tokenize_hi(text):
151 |     return [tok.text for tok in spacy_hi.tokenizer(text)]
152 | 
153 | def tokenize_ger(text):
154 |     return [tok.text for tok in spacy_ger.tokenizer(text)]
155 | 
156 | 
157 | def tokenize_eng(text):
158 |     return [tok.text for tok in spacy_eng.tokenizer(text)]
159 | 
160 | 
161 | german = Field(tokenize=tokenize_ger, lower=True, init_token="<sos>", eos_token="<eos>")
162 | 
163 | english = Field(
164 |     tokenize=tokenize_eng, lower=True, init_token="<sos>", eos_token="<eos>"
165 | )
166 | 
167 | train_data, valid_data, test_data = Multi30k.splits(
168 |     exts=(".de", ".en"), fields=(german, english)
169 | )
170 | 
171 | german.build_vocab(train_data, max_size=10000, min_freq=2)
172 | english.build_vocab(train_data, max_size=10000, min_freq=2)
173 | ```
174 | 
175 | 
176 | ```
177 | class Transformer(nn.Module):
178 |     def __init__(
179 |         self,
180 |         embedding_size,
181 |         src_vocab_size,
182 |         trg_vocab_size,
183 |         src_pad_idx,
184 |         num_heads,
185 |         num_encoder_layers,
186 |         num_decoder_layers,
187 |         forward_expansion,
188 |         dropout,
189 |         max_len,
190 |         device,
191 |     ):
192 |         super(Transformer, self).__init__()
193 |         self.src_word_embedding = nn.Embedding(src_vocab_size, embedding_size)
194 |         self.src_position_embedding = nn.Embedding(max_len, embedding_size)
195 |         self.trg_word_embedding = nn.Embedding(trg_vocab_size, embedding_size)
196 |         self.trg_position_embedding = nn.Embedding(max_len, embedding_size)
197 | 
198 |         self.device = device
199 |         self.transformer = nn.Transformer(
200 |             embedding_size,
201 |             num_heads,
202 |             num_encoder_layers,
203 |             num_decoder_layers,
204 |             forward_expansion,
205 |             dropout,
206 |         )
207 |         self.fc_out = nn.Linear(embedding_size, trg_vocab_size)
208 |         self.dropout = nn.Dropout(dropout)
209 |         self.src_pad_idx = src_pad_idx
210 | 
211 |     def make_src_mask(self, src):
212 |         src_mask = src.transpose(0, 1) == self.src_pad_idx
213 | 
214 |         # (N, src_len)
215 |         return src_mask.to(self.device)
216 | 
217 |     def forward(self, src, trg):
218 |         src_seq_length, N = src.shape
219 |         trg_seq_length, N = trg.shape
220 | 
221 |         src_positions = (
222 |             torch.arange(0, src_seq_length)
223 |             .unsqueeze(1)
224 |             .expand(src_seq_length, N)
225 |             .to(self.device)
226 |         )
227 | 
228 |         trg_positions = (
229 |             torch.arange(0, trg_seq_length)
230 |             .unsqueeze(1)
231 |             .expand(trg_seq_length, N)
232 |             .to(self.device)
233 |         )
234 | 
235 |         embed_src = self.dropout(
236 |             (self.src_word_embedding(src) + self.src_position_embedding(src_positions))
237 |         )
238 |         embed_trg = self.dropout(
239 |             (self.trg_word_embedding(trg) + self.trg_position_embedding(trg_positions))
240 |         )
241 | 
242 |         src_padding_mask = self.make_src_mask(src)
243 |         trg_mask = self.transformer.generate_square_subsequent_mask(trg_seq_length).to(
244 |             self.device
245 |         )
246 | 
247 |         out = self.transformer(
248 |             embed_src,
249 |             embed_trg,
250 |             src_key_padding_mask=src_padding_mask,
251 |             tgt_mask=trg_mask,
252 |         )
253 |         out = self.fc_out(out)
254 |         return out
255 | 
256 | ```
257 | 
258 | 
259 | ```
260 | # We're ready to define everything we need for training our Seq2Seq model
261 | device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
262 | 
263 | load_model = False
264 | save_model = True
265 | 
266 | # Training hyperparameters
267 | num_epochs = 10000
268 | learning_rate = 3e-4
269 | batch_size = 32
270 | 
271 | # Model hyperparameters
272 | src_vocab_size = len(german.vocab)
273 | trg_vocab_size = len(english.vocab)
274 | embedding_size = 512
275 | num_heads = 8
276 | num_encoder_layers = 3
277 | num_decoder_layers = 3
278 | dropout = 0.10
279 | max_len = 100
280 | forward_expansion = 4
281 | src_pad_idx = english.vocab.stoi["<pad>"]
282 | 
283 | # Tensorboard to get nice loss plot
284 | writer = SummaryWriter("runs/loss_plot")
285 | step = 0
286 | ```
287 | 
288 | 
289 | ```
290 | train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
291 |     (train_data, valid_data, test_data),
292 |     batch_size=batch_size,
293 |     sort_within_batch=True,
294 |     sort_key=lambda x: len(x.src),
295 |     device=device,
296 | )
297 | 
298 | ```
299 | 
300 | 
301 | ```
302 | model = Transformer(
303 |     embedding_size,
304 |     src_vocab_size,
305 |     trg_vocab_size,
306 |     src_pad_idx,
307 |     num_heads,
308 |     num_encoder_layers,
309 |     num_decoder_layers,
310 |     forward_expansion,
311 |     dropout,
312 |     max_len,
313 |     device,
314 | ).to(device)
315 | ```
316 | 
317 | 
318 | ```
319 | def translate_sentence(model, sentence, german, english, device, max_length=50):
320 |     # Load german tokenizer
321 |     spacy_ger = spacy.load("de")
322 | 
323 |     # Create tokens using spacy and everything in lower case (which is what our vocab is)
324 |     if type(sentence) == str:
325 |         tokens = [token.text.lower() for token in spacy_ger(sentence)]
326 |     else:
327 |         tokens = [token.lower() for token in sentence]
328 | 
329 |     # Add <SOS> and <EOS> in beginning and end respectively
330 |     tokens.insert(0, german.init_token)
331 |     tokens.append(german.eos_token)
332 | 
333 |     # Go through each german token and convert to an index
334 |     text_to_indices = [german.vocab.stoi[token] for token in tokens]
335 | 
336 |     # Convert to Tensor
337 |     sentence_tensor = torch.LongTensor(text_to_indices).unsqueeze(1).to(device)
338 | 
339 |     outputs = [english.vocab.stoi["<sos>"]]
340 |     for i in range(max_length):
341 |         trg_tensor = torch.LongTensor(outputs).unsqueeze(1).to(device)
342 | 
343 |         with torch.no_grad():
344 |             output = model(sentence_tensor, trg_tensor)
345 | 
346 |         best_guess = output.argmax(2)[-1, :].item()
347 |         outputs.append(best_guess)
348 | 
349 |         if best_guess == english.vocab.stoi["<eos>"]:
350 |             break
351 | 
352 |     translated_sentence = [english.vocab.itos[idx] for idx in outputs]
353 |     # remove start token
354 |     return translated_sentence[1:]
355 | 
356 | 
357 | def bleu(data, model, german, english, device):
358 |     targets = []
359 |     outputs = []
360 | 
361 |     for example in data:
362 |         src = vars(example)["src"]
363 |         trg = vars(example)["trg"]
364 | 
365 |         prediction = translate_sentence(model, src, german, english, device)
366 |         prediction = prediction[:-1]  # remove <eos> token
367 | 
368 |         targets.append([trg])
369 |         outputs.append(prediction)
370 | 
371 |     return bleu_score(outputs, targets)
372 | 
373 | 
374 | def save_checkpoint(state, filename="my_checkpoint.pth.tar"):
375 |     print("=> Saving checkpoint")
376 |     torch.save(state, filename)
377 | 
378 | 
379 | def load_checkpoint(checkpoint, model, optimizer):
380 |     print("=> Loading checkpoint")
381 |     model.load_state_dict(checkpoint["state_dict"])
382 |     optimizer.load_state_dict(checkpoint["optimizer"])
383 | 
384 | 
385 | optimizer = optim.Adam(model.parameters(), lr=learning_rate)
386 | 
387 | scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
388 |     optimizer, factor=0.1, patience=10, verbose=True
389 | )
390 | 
391 | pad_idx = english.vocab.stoi["<pad>"]
392 | criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
393 | 
394 | if load_model:
395 |     load_checkpoint(torch.load("my_checkpoint.pth.tar"), model, optimizer)
396 | 
397 | sentence = "ein pferd geht unter einer brücke neben einem boot."
398 | ```
399 | 
400 | 
401 | ```
402 | for epoch in range(num_epochs):
403 |     print(f"[Epoch {epoch} / {num_epochs}]")
404 | 
405 |     if save_model:
406 |         checkpoint = {
407 |             "state_dict": model.state_dict(),
408 |             "optimizer": optimizer.state_dict(),
409 |         }
410 |         save_checkpoint(checkpoint)
411 | 
412 |     model.eval()
413 |     translated_sentence = translate_sentence(
414 |         model, sentence, german, english, device, max_length=50
415 |     )
416 | 
417 |     print(f"Translated example sentence: \n {translated_sentence}")
418 |     model.train()
419 |     losses = []
420 | 
421 |     for batch_idx, batch in enumerate(train_iterator):
422 |         # Get input and targets and get to cuda
423 |         inp_data = batch.src.to(device)
424 |         target = batch.trg.to(device)
425 | 
426 |         # Forward prop
427 |         output = model(inp_data, target[:-1, :])
428 | 
429 |         # Output is of shape (trg_len, batch_size, output_dim) but Cross Entropy Loss
430 |         # doesn't take input in that form. For example if we have MNIST we want to have
431 |         # output to be: (N, 10) and targets just (N). Here we can view it in a similar
432 |         # way that we have output_words * batch_size that we want to send in into
433 |         # our cost function, so we need to do some reshapin.
434 |         # Let's also remove the start token while we're at it
435 |         output = output.reshape(-1, output.shape[2])
436 |         target = target[1:].reshape(-1)
437 | 
438 |         optimizer.zero_grad()
439 | 
440 |         loss = criterion(output, target)
441 |         losses.append(loss.item())
442 | 
443 |         # Back prop
444 |         loss.backward()
445 |         # Clip to avoid exploding gradient issues, makes sure grads are
446 |         # within a healthy range
447 |         torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
448 | 
449 |         # Gradient descent step
450 |         optimizer.step()
451 | 
452 |         # plot to tensorboard
453 |         writer.add_scalar("Training loss", loss, global_step=step)
454 |         step += 1
455 | 
456 |     mean_loss = sum(losses) / len(losses)
457 |     scheduler.step(mean_loss)
458 | 
459 | # running on entire test data takes a while
460 | score = bleu(test_data[1:100], model, german, english, device)
461 | print(f"Bleu score {score * 100:.2f}")
462 | ```
463 | 
464 |     [Epoch 0 / 10000]
465 |     => Saving checkpoint
466 |     Translated example sentence: 
467 |      ['.', 'secures', 'secures', 'half', 'secures', 'secures', 'half', '.', 'secures', 'toddler', 'secures', 'secures', 'half', '.', '.', 'olympians', 'half', '.', 'olympians', 'helmet', '.', 'secures', 'toddler', 'secures', 'secures', 'toddler', 'secures', 'secures', '.', 'secures', 'secures', '.', 'secures', '.', 'secures', 'secures', 'secures', 'secures', 'half', 'half', '.', 'secures', 'toddler', 'secures', 'secures', 'secures', 'mosaic', 'secures', 'secures', 'toddler']
468 |     [Epoch 1 / 10000]
469 |     => Saving checkpoint
470 |     Translated example sentence: 
471 |      ['a', 'horse', 'walking', 'under', 'a', 'boat', 'next', 'to', 'a', 'boat', '.', '<eos>']
472 |     [Epoch 2 / 10000]
473 |     => Saving checkpoint
474 |     Translated example sentence: 
475 |      ['a', 'horse', 'is', 'walking', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
476 |     [Epoch 3 / 10000]
477 |     => Saving checkpoint
478 |     Translated example sentence: 
479 |      ['a', 'horse', 'is', 'walking', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
480 |     [Epoch 4 / 10000]
481 |     => Saving checkpoint
482 |     Translated example sentence: 
483 |      ['a', 'horse', 'walks', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
484 |     [Epoch 5 / 10000]
485 |     => Saving checkpoint
486 |     Translated example sentence: 
487 |      ['a', 'horse', 'walks', 'under', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
488 |     [Epoch 6 / 10000]
489 |     => Saving checkpoint
490 |     Translated example sentence: 
491 |      ['a', 'horse', 'walks', 'underneath', 'a', 'bridge', 'next', 'to', 'a', 'boat', '.', '<eos>']
492 |     [Epoch 7 / 10000]
493 |     => Saving checkpoint
494 |     Translated example sentence: 
495 |      ['a', 'horse', 'is', 'walking', 'under', 'a', 'bridge', 'beside', 'a', 'boat', '.', '<eos>']
496 | 
497 | 
498 | 
499 | ```
500 | 
501 | ```
502 | 


--------------------------------------------------------------------------------
/docs/SetFit_SST_2_Few_shot.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/nlp_notebooks/blob/master/Few-shot/SetFit_SST_2_Few_shot.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | Reference Links:
  4 | 
  5 | 1. https://towardsdatascience.com/sentence-transformer-fine-tuning-setfit-outperforms-gpt-3-on-few-shot-text-classification-while-d9a3788f0b4e
  6 | 
  7 | 2. https://arxiv.org/pdf/2109.14076.pdf
  8 | 
  9 | 3. https://huggingface.co/spaces/ought/raft-leaderboard
 10 | 
 11 | 4. https://github.com/timoschick/pet
 12 | 
 13 | 
 14 | # Init
 15 | 
 16 | 
 17 | ```python
 18 | !pip install sentence_transformers -q
 19 | ```
 20 | 
 21 |     [K     |████████████████████████████████| 78 kB 3.4 MB/s 
 22 |     [K     |████████████████████████████████| 3.4 MB 11.4 MB/s 
 23 |     [K     |████████████████████████████████| 3.3 MB 36.4 MB/s 
 24 |     [K     |████████████████████████████████| 1.2 MB 29.8 MB/s 
 25 |     [K     |████████████████████████████████| 61 kB 370 kB/s 
 26 |     [K     |████████████████████████████████| 596 kB 25.4 MB/s 
 27 |     [K     |████████████████████████████████| 895 kB 32.0 MB/s 
 28 |     [?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
 29 | 
 30 | 
 31 | 
 32 | ```python
 33 | from sklearn.metrics import accuracy_score, f1_score
 34 | from sklearn.linear_model import LogisticRegression
 35 | from sentence_transformers import SentenceTransformer, InputExample, losses, models, datasets, evaluation
 36 | from torch.utils.data import DataLoader
 37 | 
 38 | from sklearn.manifold import TSNE
 39 | from matplotlib import pyplot as plt
 40 | 
 41 | # import warnings filter
 42 | from warnings import simplefilter
 43 | # ignore all future warnings
 44 | simplefilter(action='ignore', category=FutureWarning)
 45 | 
 46 | import pandas as pd
 47 | import numpy as np
 48 | 
 49 | import torch
 50 | import random
 51 | import torch
 52 | 
 53 | def set_seed(seed):
 54 |   random.seed(seed)
 55 |   np.random.seed(seed)
 56 |   torch.manual_seed(seed)
 57 | ```
 58 | 
 59 | 
 60 | ```python
 61 | def sentence_pairs_generation(sentences, labels, pairs):
 62 | 	# initialize two empty lists to hold the (sentence, sentence) pairs and
 63 | 	# labels to indicate if a pair is positive or negative
 64 | 
 65 |   numClassesList = np.unique(labels)
 66 |   idx = [np.where(labels == i)[0] for i in numClassesList]
 67 | 
 68 |   for idxA in range(len(sentences)):      
 69 |     currentSentence = sentences[idxA]
 70 |     label = labels[idxA]
 71 |     idxB = np.random.choice(idx[np.where(numClassesList==label)[0][0]])
 72 |     posSentence = sentences[idxB]
 73 | 		  # prepare a positive pair and update the sentences and labels
 74 | 		  # lists, respectively
 75 |     pairs.append(InputExample(texts=[currentSentence, posSentence], label=1.0))
 76 | 
 77 |     negIdx = np.where(labels != label)[0]
 78 |     negSentence = sentences[np.random.choice(negIdx)]
 79 | 		  # prepare a negative pair of images and update our lists
 80 |     pairs.append(InputExample(texts=[currentSentence, negSentence], label=0.0))
 81 |   
 82 | 	# return a 2-tuple of our image pairs and labels
 83 |   return (pairs)
 84 | ```
 85 | 
 86 | 
 87 | ```python
 88 | #SST-2
 89 | # Load SST-2 dataset into a pandas dataframe.
 90 | 
 91 | train_df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/train.tsv', delimiter='\t', header=None)
 92 | 
 93 | # Load the test dataset into a pandas dataframe.
 94 | eval_df = pd.read_csv('https://github.com/clairett/pytorch-sentiment-classification/raw/master/data/SST2/test.tsv', delimiter='\t', header=None)
 95 | 
 96 | text_col=train_df.columns.values[0] 
 97 | category_col=train_df.columns.values[1]
 98 | 
 99 | x_eval = eval_df[text_col].values.tolist()
100 | y_eval = eval_df[category_col].values.tolist()
101 | ```
102 | 
103 | 
104 | ```python
105 | train_df.head()
106 | ```
107 | 
108 | 
109 | 
110 | 
111 | 
112 |   <div id="df-ad145a91-fe65-4f27-90e9-8193b3dfe59a">
113 |     <div class="colab-df-container">
114 |       <div>
115 | <style scoped>
116 |     .dataframe tbody tr th:only-of-type {
117 |         vertical-align: middle;
118 |     }
119 | 
120 |     .dataframe tbody tr th {
121 |         vertical-align: top;
122 |     }
123 | 
124 |     .dataframe thead th {
125 |         text-align: right;
126 |     }
127 | </style>
128 | <table border="1" class="dataframe">
129 |   <thead>
130 |     <tr style="text-align: right;">
131 |       <th></th>
132 |       <th>0</th>
133 |       <th>1</th>
134 |     </tr>
135 |   </thead>
136 |   <tbody>
137 |     <tr>
138 |       <th>0</th>
139 |       <td>a stirring , funny and finally transporting re...</td>
140 |       <td>1</td>
141 |     </tr>
142 |     <tr>
143 |       <th>1</th>
144 |       <td>apparently reassembled from the cutting room f...</td>
145 |       <td>0</td>
146 |     </tr>
147 |     <tr>
148 |       <th>2</th>
149 |       <td>they presume their audience wo n't sit still f...</td>
150 |       <td>0</td>
151 |     </tr>
152 |     <tr>
153 |       <th>3</th>
154 |       <td>this is a visually stunning rumination on love...</td>
155 |       <td>1</td>
156 |     </tr>
157 |     <tr>
158 |       <th>4</th>
159 |       <td>jonathan parker 's bartleby should have been t...</td>
160 |       <td>1</td>
161 |     </tr>
162 |   </tbody>
163 | </table>
164 | </div>
165 |       <button class="colab-df-convert" onclick="convertToInteractive('df-ad145a91-fe65-4f27-90e9-8193b3dfe59a')"
166 |               title="Convert this dataframe to an interactive table."
167 |               style="display:none;">
168 | 
169 |   <svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
170 |        width="24px">
171 |     <path d="M0 0h24v24H0V0z" fill="none"/>
172 |     <path d="M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z"/><path d="M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z"/>
173 |   </svg>
174 |       </button>
175 | 
176 |   <style>
177 |     .colab-df-container {
178 |       display:flex;
179 |       flex-wrap:wrap;
180 |       gap: 12px;
181 |     }
182 | 
183 |     .colab-df-convert {
184 |       background-color: #E8F0FE;
185 |       border: none;
186 |       border-radius: 50%;
187 |       cursor: pointer;
188 |       display: none;
189 |       fill: #1967D2;
190 |       height: 32px;
191 |       padding: 0 0 0 0;
192 |       width: 32px;
193 |     }
194 | 
195 |     .colab-df-convert:hover {
196 |       background-color: #E2EBFA;
197 |       box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
198 |       fill: #174EA6;
199 |     }
200 | 
201 |     [theme=dark] .colab-df-convert {
202 |       background-color: #3B4455;
203 |       fill: #D2E3FC;
204 |     }
205 | 
206 |     [theme=dark] .colab-df-convert:hover {
207 |       background-color: #434B5C;
208 |       box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
209 |       filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
210 |       fill: #FFFFFF;
211 |     }
212 |   </style>
213 | 
214 |       <script>
215 |         const buttonEl =
216 |           document.querySelector('#df-ad145a91-fe65-4f27-90e9-8193b3dfe59a button.colab-df-convert');
217 |         buttonEl.style.display =
218 |           google.colab.kernel.accessAllowed ? 'block' : 'none';
219 | 
220 |         async function convertToInteractive(key) {
221 |           const element = document.querySelector('#df-ad145a91-fe65-4f27-90e9-8193b3dfe59a');
222 |           const dataTable =
223 |             await google.colab.kernel.invokeFunction('convertToInteractive',
224 |                                                      [key], {});
225 |           if (!dataTable) return;
226 | 
227 |           const docLinkHtml = 'Like what you see? Visit the ' +
228 |             '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
229 |             + ' to learn more about interactive tables.';
230 |           element.innerHTML = '';
231 |           dataTable['output_type'] = 'display_data';
232 |           await google.colab.output.renderOutput(dataTable, element);
233 |           const docLink = document.createElement('div');
234 |           docLink.innerHTML = docLinkHtml;
235 |           element.appendChild(docLink);
236 |         }
237 |       </script>
238 |     </div>
239 |   </div>
240 | 
241 | 
242 | 
243 | 
244 | # SetFit
245 | 
246 | 
247 | ```python
248 | #@title SetFit
249 | st_model = 'paraphrase-mpnet-base-v2' #@param ['paraphrase-mpnet-base-v2', 'all-mpnet-base-v1', 'all-mpnet-base-v2', 'stsb-mpnet-base-v2', 'all-MiniLM-L12-v2', 'paraphrase-albert-small-v2', 'all-roberta-large-v1']
250 | num_training = 32 #@param ["8", "16", "32", "54", "128", "256", "512"] {type:"raw"}
251 | num_itr = 5 #@param ["1", "2", "3", "4", "5", "10"] {type:"raw"}
252 | plot2d_checkbox = True #@param {type: 'boolean'}
253 | 
254 | ```
255 | 
256 | 
257 | ```python
258 | set_seed(0)
259 | # Equal samples per class training
260 | train_df_sample = pd.concat([train_df[train_df[1]==0].sample(num_training), train_df[train_df[1]==1].sample(num_training)])
261 | x_train = train_df_sample[text_col].values.tolist()
262 | y_train = train_df_sample[category_col].values.tolist()
263 | 
264 | ```
265 | 
266 | 
267 | ```python
268 | train_df_sample.shape
269 | ```
270 | 
271 | 
272 | 
273 | 
274 |     (64, 2)
275 | 
276 | 
277 | 
278 | 
279 | ```python
280 | x_train[0:5], y_train[0:5]
281 | ```
282 | 
283 | 
284 | 
285 | 
286 |     (['makes a joke out of car chases for an hour and then gives us half an hour of car chases',
287 |       "so mind numbingly awful that you hope britney wo n't do it one more time , as far as movies are concerned",
288 |       "maid in manhattan proves that it 's easier to change the sheets than to change hackneyed concepts when it comes to dreaming up romantic comedies",
289 |       'if you go , pack your knitting needles',
290 |       'time of favor could have given audiences the time of day by concentrating on the elements of a revealing alienation among a culture of people who sadly are at hostile odds with one another through recklessness and retaliation'],
291 |      [0, 0, 0, 0, 0])
292 | 
293 | 
294 | 
295 | 
296 | ```python
297 | train_examples = [] 
298 | for x in range(num_itr):
299 |   train_examples = sentence_pairs_generation(np.array(x_train), np.array(y_train), train_examples)
300 | ```
301 | 
302 | 
303 | ```python
304 | len(train_examples)
305 | ```
306 | 
307 | 
308 | 
309 | 
310 |     640
311 | 
312 | 
313 | 
314 | 
315 | ```python
316 | i_example = train_examples[0]
317 | ```
318 | 
319 | 
320 | ```python
321 | i_example.texts, i_example.label
322 | ```
323 | 
324 | 
325 | 
326 | 
327 |     (['makes a joke out of car chases for an hour and then gives us half an hour of car chases',
328 |       'it is that rare combination of bad writing , bad direction and bad acting the trifecta of badness'],
329 |      1.0)
330 | 
331 | 
332 | 
333 | 
334 | ```python
335 | orig_model = SentenceTransformer(st_model)
336 | model = SentenceTransformer(st_model)
337 | ```
338 | 
339 | 
340 |     Downloading:   0%|          | 0.00/690 [00:00<?, ?B/s]
341 | 
342 | 
343 | 
344 |     Downloading:   0%|          | 0.00/3.70k [00:00<?, ?B/s]
345 | 
346 | 
347 | 
348 |     Downloading:   0%|          | 0.00/594 [00:00<?, ?B/s]
349 | 
350 | 
351 | 
352 |     Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]
353 | 
354 | 
355 | 
356 |     Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]
357 | 
358 | 
359 | 
360 |     Downloading:   0%|          | 0.00/438M [00:00<?, ?B/s]
361 | 
362 | 
363 | 
364 |     Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]
365 | 
366 | 
367 | 
368 |     Downloading:   0%|          | 0.00/239 [00:00<?, ?B/s]
369 | 
370 | 
371 | 
372 |     Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]
373 | 
374 | 
375 | 
376 |     Downloading:   0%|          | 0.00/1.19k [00:00<?, ?B/s]
377 | 
378 | 
379 | 
380 |     Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]
381 | 
382 | 
383 | 
384 |     Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]
385 | 
386 | 
387 | 
388 | ```python
389 | # S-BERT adaptation 
390 | train_dataloader = DataLoader(train_examples, shuffle=True, batch_size=16)
391 | train_loss = losses.CosineSimilarityLoss(model)
392 | model.fit(train_objectives=[(train_dataloader, train_loss)], epochs=1, warmup_steps=10, show_progress_bar=True)
393 | 
394 | ```
395 | 
396 | 
397 |     Epoch:   0%|          | 0/1 [00:00<?, ?it/s]
398 | 
399 | 
400 | 
401 |     Iteration:   0%|          | 0/40 [00:00<?, ?it/s]
402 | 
403 | 
404 | 
405 | ```python
406 | len(y_train)
407 | ```
408 | 
409 | 
410 | 
411 | 
412 |     64
413 | 
414 | 
415 | 
416 | 
417 | ```python
418 | # No Fit
419 | X_train_noFT = orig_model.encode(x_train)
420 | X_eval_noFT = orig_model.encode(x_eval)
421 | 
422 | sgd =  LogisticRegression()
423 | sgd.fit(X_train_noFT, y_train)
424 | y_pred_eval_sgd = sgd.predict(X_eval_noFT)
425 | 
426 | print('Acc. No Fit', accuracy_score(y_eval, y_pred_eval_sgd))
427 | ```
428 | 
429 |     Acc. No Fit 0.8390993959362988
430 | 
431 | 
432 | 
433 | ```python
434 | # With Fit (SetFit)
435 | X_train = model.encode(x_train)
436 | X_eval = model.encode(x_eval)
437 | 
438 | sgd =  LogisticRegression()
439 | sgd.fit(X_train, y_train)
440 | y_pred_eval_sgd = sgd.predict(X_eval)
441 | 
442 | print('Acc. SetFit', accuracy_score(y_eval, y_pred_eval_sgd))
443 | ```
444 | 
445 |     Acc. SetFit 0.9082921471718836
446 | 
447 | 
448 | 
449 | ```python
450 | #Plot 2-D 2x2 figures
451 | if plot2d_checkbox:   
452 | 
453 |   plt.figure(figsize=(20,10))
454 | 
455 | #Plot X_train_noFit
456 |   X_embedded = TSNE(n_components=2).fit_transform(np.array(X_train_noFT))
457 |   plt.subplot(221)
458 |   plt.title('X_train No Fit')
459 | 
460 |   for i, t in enumerate(set(np.array(y_train))):
461 |       idx = np.array(y_train) == t
462 |       plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)   
463 | 
464 |   plt.legend(bbox_to_anchor=(1, 1));
465 | 
466 | #Plot X_eval noFit
467 |   X_embedded = TSNE(n_components=2).fit_transform(np.array(X_eval_noFT))
468 |   plt.subplot(223)
469 |   plt.title('X_eval No Fit')
470 | 
471 |   for i, t in enumerate(set(np.array(y_eval))):
472 |       idx = np.array(y_eval) == t
473 |       plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)   
474 | 
475 |   plt.legend(bbox_to_anchor=(1, 1));
476 | 
477 | 
478 | #Plot X_train SetFit
479 |   X_embedded = TSNE(n_components=2).fit_transform(np.array(X_train))
480 | 
481 |   plt.subplot(222)
482 |   plt.title('X_train SetFit')
483 | 
484 |   for i, t in enumerate(set(np.array(y_train))):
485 |       idx = np.array(y_train) == t
486 |       plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)   
487 | 
488 |   plt.legend(bbox_to_anchor=(1, 1));
489 | 
490 | #Plot X_eval SetFit
491 |   X_embedded = TSNE(n_components=2).fit_transform(np.array(X_eval))
492 |   plt.subplot(224)
493 |   plt.title('X_eval SetFit')
494 | 
495 |   for i, t in enumerate(set(np.array(y_eval))):
496 |       idx = np.array(y_eval) == t
497 |       plt.scatter(X_embedded[idx, 0], X_embedded[idx, 1], label=t)   
498 | 
499 |   plt.legend(bbox_to_anchor=(1, 1));
500 | 
501 | ```
502 | 
503 | 
504 |     
505 | ![png](SetFit_SST_2_Few_shot_files/SetFit_SST_2_Few_shot_22_0.png)
506 |     
507 | 
508 | 
509 | 
510 | ```python
511 | 
512 | ```
513 | 


--------------------------------------------------------------------------------
/docs/SetFit_SST_2_Few_shot_files/SetFit_SST_2_Few_shot_22_0.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/Ankur3107/nlp_notebooks/db671902eebb52f7174df5f302cbc700fe0b0b97/docs/SetFit_SST_2_Few_shot_files/SetFit_SST_2_Few_shot_22_0.png


--------------------------------------------------------------------------------
/docs/Wikipedia_answer_retrieval_DPR.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/nlp_notebooks/blob/master/opendomain-qa/Wikipedia_answer_retrieval_DPR.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | 
  4 | ```python
  5 | !pip install wikipedia transformers sentence_transformers faiss-cpu -q
  6 | ```
  7 | 
  8 |     [K     |████████████████████████████████| 3.4 MB 36.1 MB/s 
  9 |     [K     |████████████████████████████████| 78 kB 5.8 MB/s 
 10 |     [K     |████████████████████████████████| 8.6 MB 60.3 MB/s 
 11 |     [K     |████████████████████████████████| 3.3 MB 54.2 MB/s 
 12 |     [K     |████████████████████████████████| 895 kB 66.2 MB/s 
 13 |     [K     |████████████████████████████████| 67 kB 4.6 MB/s 
 14 |     [K     |████████████████████████████████| 596 kB 64.9 MB/s 
 15 |     [K     |████████████████████████████████| 1.2 MB 54.0 MB/s 
 16 |     [?25h  Building wheel for wikipedia (setup.py) ... [?25l[?25hdone
 17 |       Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
 18 | 
 19 | 
 20 | 
 21 | ```python
 22 | import wikipedia
 23 | from wikipedia.exceptions import DisambiguationError
 24 | from transformers import pipeline
 25 | ```
 26 | 
 27 | 
 28 | ```python
 29 | def divide_chunks(l, n):
 30 |     # looping till length l
 31 |     for i in range(0, len(l), n): 
 32 |         yield l[i:i + n]
 33 | 
 34 | def get_passages(text, k=100):
 35 |     tokens = text.split(" ")
 36 |     tokens_chunks = list(divide_chunks(tokens, k))
 37 |     passages = [" ".join(c) for c in tokens_chunks]
 38 |     return passages
 39 | 
 40 | def get_passage_for_question(question, wiki_hits=3, passage_len=100, debug=False):
 41 |   top_hits = wikipedia.search(question, wiki_hits)
 42 |   if debug:
 43 |     print("Top Wiki hits :", top_hits)
 44 |   passages = []
 45 |   for hit in top_hits:
 46 |     try:
 47 |       html_page = wikipedia.page(title = hit, auto_suggest = False)
 48 |     except DisambiguationError:
 49 |       continue
 50 |     hit_passages = get_passages(html_page.content, k=passage_len)
 51 |     passages.extend(hit_passages)
 52 | 
 53 |   return passages
 54 | ```
 55 | 
 56 | 
 57 | ```python
 58 | qa = pipeline("question-answering", model="ankur310794/roberta-base-squad2-nq")
 59 | ```
 60 | 
 61 | 
 62 |     Downloading:   0%|          | 0.00/643 [00:00<?, ?B/s]
 63 | 
 64 | 
 65 |     The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
 66 |     The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
 67 | 
 68 | 
 69 | 
 70 |     Downloading:   0%|          | 0.00/473M [00:00<?, ?B/s]
 71 | 
 72 | 
 73 | 
 74 |     Downloading:   0%|          | 0.00/1.28k [00:00<?, ?B/s]
 75 | 
 76 | 
 77 |     The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
 78 | 
 79 | 
 80 | 
 81 |     Downloading:   0%|          | 0.00/780k [00:00<?, ?B/s]
 82 | 
 83 | 
 84 | 
 85 |     Downloading:   0%|          | 0.00/446k [00:00<?, ?B/s]
 86 | 
 87 | 
 88 | 
 89 |     Downloading:   0%|          | 0.00/772 [00:00<?, ?B/s]
 90 | 
 91 | 
 92 |     The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
 93 |     The `xla_device` argument has been deprecated in v4.4.0 of Transformers. It is ignored and you can safely remove it from your `config.json` file.
 94 | 
 95 | 
 96 | 
 97 | ```python
 98 | from transformers import TFAutoModel, AutoTokenizer
 99 | ```
100 | 
101 | 
102 | ```python
103 | def combine_results(passages, k=4):
104 |   passages_list = list(divide_chunks(passages, k))
105 |   passages_str = [" ".join(p) for p in passages_list]
106 |   return passages_str
107 | ```
108 | 
109 | 
110 | ```python
111 | passage_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
112 | query_encoder = TFAutoModel.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")
113 | 
114 | p_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2")
115 | q_tokenizer = AutoTokenizer.from_pretrained("nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2")
116 | 
117 | ```
118 | 
119 | 
120 |     Downloading:   0%|          | 0.00/658 [00:00<?, ?B/s]
121 | 
122 | 
123 | 
124 |     Downloading:   0%|          | 0.00/16.8M [00:00<?, ?B/s]
125 | 
126 | 
127 |     All model checkpoint layers were used when initializing TFBertModel.
128 |     
129 |     All the layers of TFBertModel were initialized from the model checkpoint at nlpconnect/dpr-ctx_encoder_bert_uncased_L-2_H-128_A-2.
130 |     If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
131 | 
132 | 
133 | 
134 |     Downloading:   0%|          | 0.00/656 [00:00<?, ?B/s]
135 | 
136 | 
137 | 
138 |     Downloading:   0%|          | 0.00/16.8M [00:00<?, ?B/s]
139 | 
140 | 
141 |     All model checkpoint layers were used when initializing TFBertModel.
142 |     
143 |     All the layers of TFBertModel were initialized from the model checkpoint at nlpconnect/dpr-question_encoder_bert_uncased_L-2_H-128_A-2.
144 |     If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions without further training.
145 | 
146 | 
147 | 
148 |     Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]
149 | 
150 | 
151 | 
152 |     Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]
153 | 
154 | 
155 | 
156 |     Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]
157 | 
158 | 
159 | 
160 |     Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]
161 | 
162 | 
163 | 
164 |     Downloading:   0%|          | 0.00/360 [00:00<?, ?B/s]
165 | 
166 | 
167 | 
168 |     Downloading:   0%|          | 0.00/226k [00:00<?, ?B/s]
169 | 
170 | 
171 | 
172 |     Downloading:   0%|          | 0.00/455k [00:00<?, ?B/s]
173 | 
174 | 
175 | 
176 |     Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]
177 | 
178 | 
179 | 
180 | ```python
181 | import numpy as np
182 | def extracted_passage_embeddings(processed_passages, max_length=156):
183 |     passage_inputs = p_tokenizer.batch_encode_plus(
184 |                     processed_passages,
185 |                     add_special_tokens=True,
186 |                     truncation=True,
187 |                     padding="max_length",
188 |                     max_length=max_length,
189 |                     return_token_type_ids=True
190 |                 )
191 |     passage_embeddings = passage_encoder.predict([np.array(passage_inputs['input_ids']), 
192 |                                                 np.array(passage_inputs['attention_mask']), 
193 |                                                 np.array(passage_inputs['token_type_ids'])], 
194 |                                                 batch_size=1024, 
195 |                                                 verbose=1)
196 |     return passage_embeddings
197 | 
198 | def extracted_query_embeddings(queries, max_length=64):
199 |     query_inputs = q_tokenizer.batch_encode_plus(
200 |                     queries,
201 |                     add_special_tokens=True,
202 |                     truncation=True,
203 |                     padding="max_length",
204 |                     max_length=max_length,
205 |                     return_token_type_ids=True
206 |                 )
207 |     query_embeddings = query_encoder.predict([np.array(query_inputs['input_ids']), 
208 |                                                 np.array(query_inputs['attention_mask']), 
209 |                                                 np.array(query_inputs['token_type_ids'])], 
210 |                                                 batch_size=1, 
211 |                                                 verbose=1)
212 |     return query_embeddings
213 | ```
214 | 
215 | 
216 | ```python
217 | import faiss
218 | import spacy
219 | nlp = spacy.load("en")
220 | ```
221 | 
222 | 
223 | ```python
224 | def get_answer_full_sent(m_passages, answer_dict):
225 |   all_sents = list(nlp(m_passages).sents)
226 |   all_sents = [s.text for s in all_sents]
227 | 
228 |   for i in range(len(all_sents)):
229 |     if len("".join(all_sents[0:i])[answer_dict['start']:answer_dict['end']])>2:
230 |       answer_dict['answer_sentence'] = all_sents[i-1]
231 |       return answer_dict
232 |   return answer_dict
233 | ```
234 | 
235 | 
236 | ```python
237 | from sentence_transformers import CrossEncoder
238 | ranking_model = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2', max_length=196)
239 | def get_reranked_passage(passages, question, top_rr):
240 |   passage_question_pair = [(question, p) for p in passages]
241 |   scores = ranking_model.predict(passage_question_pair)
242 |   shorted_index = np.argpartition(scores, -top_rr)[::-1]
243 |   shorted_scores = np.array([scores[i] for i in shorted_index])
244 |   return [passages[i] for i in shorted_index[0:top_rr]]
245 | ```
246 | 
247 | 
248 | ```python
249 | # end to end with dpr
250 | topk_r=30
251 | topk_rr=8
252 | import pandas as pd
253 | 
254 | 
255 | def get_answer_dpr(question):
256 |   passages = get_passage_for_question(question, debug=True)
257 |   print("Total passages: ", len(passages))
258 |   passage_embeddings = extracted_passage_embeddings(passages)
259 |   query_embeddings = extracted_query_embeddings([question])
260 |   faiss_index = faiss.IndexFlatL2(128)
261 |   faiss_index.add(passage_embeddings.pooler_output)
262 |   prob, index = faiss_index.search(query_embeddings.pooler_output, k=topk_r)
263 |   r_passages = [passages[i] for i in index[0]]
264 |   print("Top k retrieved passages :", len(r_passages))
265 |   rr_passages = get_reranked_passage(r_passages, question, topk_rr)
266 |   print("Top k reranked passages :", len(rr_passages))
267 |   m_passages = combine_results(rr_passages)
268 |   print("Merged passages :", len(m_passages))
269 |   results = qa(question=[question]*len(m_passages), context=m_passages, max_seq_len=512)
270 |   if isinstance(results, dict):
271 |     results = [results]
272 |   output_results = [get_answer_full_sent(m_passages[i],results[i]) for i in range(len(results))]
273 |   return pd.DataFrame(output_results)[['answer', 'answer_sentence', 'score']].sort_values("score", ascending=False)
274 | ```
275 | 
276 | 
277 | ```python
278 | results= get_answer_dpr("where was tara located in gone with the wind?")
279 | results
280 | ```
281 | 
282 |     Top Wiki hits : ['Tara (plantation)', 'Margaret Mitchell', 'RKO Forty Acres']
283 |     Total passages:  95
284 |     1/1 [==============================] - 0s 470ms/step
285 |     1/1 [==============================] - 0s 21ms/step
286 |     Top k retrieved passages : 30
287 |     Top k reranked passages : 8
288 |     Merged passages : 2
289 | 
290 | 
291 |     /usr/local/lib/python3.7/dist-packages/numpy/core/_asarray.py:83: VisibleDeprecationWarning: Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray
292 |       return array(a, dtype, copy=False, order=order)
293 | 
294 | 
295 | 
296 | 
297 | 
298 | 
299 |   <div id="df-17c68291-5890-4bd1-924f-9423f8d0f7f7">
300 |     <div class="colab-df-container">
301 |       <div>
302 | <style scoped>
303 |     .dataframe tbody tr th:only-of-type {
304 |         vertical-align: middle;
305 |     }
306 | 
307 |     .dataframe tbody tr th {
308 |         vertical-align: top;
309 |     }
310 | 
311 |     .dataframe thead th {
312 |         text-align: right;
313 |     }
314 | </style>
315 | <table border="1" class="dataframe">
316 |   <thead>
317 |     <tr style="text-align: right;">
318 |       <th></th>
319 |       <th>answer</th>
320 |       <th>answer_sentence</th>
321 |       <th>score</th>
322 |     </tr>
323 |   </thead>
324 |   <tbody>
325 |     <tr>
326 |       <th>0</th>
327 |       <td>Talmadge Farms</td>
328 |       <td>Now the Tara facade is still located at Talmad...</td>
329 |       <td>0.740677</td>
330 |     </tr>
331 |     <tr>
332 |       <th>1</th>
333 |       <td>virtually the same</td>
334 |       <td>In the 2007 novel by Donald McCaig, Rhett Butl...</td>
335 |       <td>0.159294</td>
336 |     </tr>
337 |   </tbody>
338 | </table>
339 | </div>
340 |       <button class="colab-df-convert" onclick="convertToInteractive('df-17c68291-5890-4bd1-924f-9423f8d0f7f7')"
341 |               title="Convert this dataframe to an interactive table."
342 |               style="display:none;">
343 | 
344 |   <svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
345 |        width="24px">
346 |     <path d="M0 0h24v24H0V0z" fill="none"/>
347 |     <path d="M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z"/><path d="M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z"/>
348 |   </svg>
349 |       </button>
350 | 
351 |   <style>
352 |     .colab-df-container {
353 |       display:flex;
354 |       flex-wrap:wrap;
355 |       gap: 12px;
356 |     }
357 | 
358 |     .colab-df-convert {
359 |       background-color: #E8F0FE;
360 |       border: none;
361 |       border-radius: 50%;
362 |       cursor: pointer;
363 |       display: none;
364 |       fill: #1967D2;
365 |       height: 32px;
366 |       padding: 0 0 0 0;
367 |       width: 32px;
368 |     }
369 | 
370 |     .colab-df-convert:hover {
371 |       background-color: #E2EBFA;
372 |       box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
373 |       fill: #174EA6;
374 |     }
375 | 
376 |     [theme=dark] .colab-df-convert {
377 |       background-color: #3B4455;
378 |       fill: #D2E3FC;
379 |     }
380 | 
381 |     [theme=dark] .colab-df-convert:hover {
382 |       background-color: #434B5C;
383 |       box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
384 |       filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
385 |       fill: #FFFFFF;
386 |     }
387 |   </style>
388 | 
389 |       <script>
390 |         const buttonEl =
391 |           document.querySelector('#df-17c68291-5890-4bd1-924f-9423f8d0f7f7 button.colab-df-convert');
392 |         buttonEl.style.display =
393 |           google.colab.kernel.accessAllowed ? 'block' : 'none';
394 | 
395 |         async function convertToInteractive(key) {
396 |           const element = document.querySelector('#df-17c68291-5890-4bd1-924f-9423f8d0f7f7');
397 |           const dataTable =
398 |             await google.colab.kernel.invokeFunction('convertToInteractive',
399 |                                                      [key], {});
400 |           if (!dataTable) return;
401 | 
402 |           const docLinkHtml = 'Like what you see? Visit the ' +
403 |             '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
404 |             + ' to learn more about interactive tables.';
405 |           element.innerHTML = '';
406 |           dataTable['output_type'] = 'display_data';
407 |           await google.colab.output.renderOutput(dataTable, element);
408 |           const docLink = document.createElement('div');
409 |           docLink.innerHTML = docLinkHtml;
410 |           element.appendChild(docLink);
411 |         }
412 |       </script>
413 |     </div>
414 |   </div>
415 | 
416 | 
417 | 
418 | 
419 | 
420 | ```python
421 | results.sort_values("score", ascending=False)
422 | ```
423 | 
424 | 
425 | 
426 | 
427 | 
428 |   <div id="df-e023e6a4-4c3f-46d9-9958-2b8b0e640726">
429 |     <div class="colab-df-container">
430 |       <div>
431 | <style scoped>
432 |     .dataframe tbody tr th:only-of-type {
433 |         vertical-align: middle;
434 |     }
435 | 
436 |     .dataframe tbody tr th {
437 |         vertical-align: top;
438 |     }
439 | 
440 |     .dataframe thead th {
441 |         text-align: right;
442 |     }
443 | </style>
444 | <table border="1" class="dataframe">
445 |   <thead>
446 |     <tr style="text-align: right;">
447 |       <th></th>
448 |       <th>answer</th>
449 |       <th>answer_sentence</th>
450 |       <th>score</th>
451 |     </tr>
452 |   </thead>
453 |   <tbody>
454 |     <tr>
455 |       <th>0</th>
456 |       <td>Talmadge Farms</td>
457 |       <td>Now the Tara facade is still located at Talmad...</td>
458 |       <td>0.740677</td>
459 |     </tr>
460 |     <tr>
461 |       <th>1</th>
462 |       <td>virtually the same</td>
463 |       <td>In the 2007 novel by Donald McCaig, Rhett Butl...</td>
464 |       <td>0.159294</td>
465 |     </tr>
466 |   </tbody>
467 | </table>
468 | </div>
469 |       <button class="colab-df-convert" onclick="convertToInteractive('df-e023e6a4-4c3f-46d9-9958-2b8b0e640726')"
470 |               title="Convert this dataframe to an interactive table."
471 |               style="display:none;">
472 | 
473 |   <svg xmlns="http://www.w3.org/2000/svg" height="24px"viewBox="0 0 24 24"
474 |        width="24px">
475 |     <path d="M0 0h24v24H0V0z" fill="none"/>
476 |     <path d="M18.56 5.44l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94zm-11 1L8.5 8.5l.94-2.06 2.06-.94-2.06-.94L8.5 2.5l-.94 2.06-2.06.94zm10 10l.94 2.06.94-2.06 2.06-.94-2.06-.94-.94-2.06-.94 2.06-2.06.94z"/><path d="M17.41 7.96l-1.37-1.37c-.4-.4-.92-.59-1.43-.59-.52 0-1.04.2-1.43.59L10.3 9.45l-7.72 7.72c-.78.78-.78 2.05 0 2.83L4 21.41c.39.39.9.59 1.41.59.51 0 1.02-.2 1.41-.59l7.78-7.78 2.81-2.81c.8-.78.8-2.07 0-2.86zM5.41 20L4 18.59l7.72-7.72 1.47 1.35L5.41 20z"/>
477 |   </svg>
478 |       </button>
479 | 
480 |   <style>
481 |     .colab-df-container {
482 |       display:flex;
483 |       flex-wrap:wrap;
484 |       gap: 12px;
485 |     }
486 | 
487 |     .colab-df-convert {
488 |       background-color: #E8F0FE;
489 |       border: none;
490 |       border-radius: 50%;
491 |       cursor: pointer;
492 |       display: none;
493 |       fill: #1967D2;
494 |       height: 32px;
495 |       padding: 0 0 0 0;
496 |       width: 32px;
497 |     }
498 | 
499 |     .colab-df-convert:hover {
500 |       background-color: #E2EBFA;
501 |       box-shadow: 0px 1px 2px rgba(60, 64, 67, 0.3), 0px 1px 3px 1px rgba(60, 64, 67, 0.15);
502 |       fill: #174EA6;
503 |     }
504 | 
505 |     [theme=dark] .colab-df-convert {
506 |       background-color: #3B4455;
507 |       fill: #D2E3FC;
508 |     }
509 | 
510 |     [theme=dark] .colab-df-convert:hover {
511 |       background-color: #434B5C;
512 |       box-shadow: 0px 1px 3px 1px rgba(0, 0, 0, 0.15);
513 |       filter: drop-shadow(0px 1px 2px rgba(0, 0, 0, 0.3));
514 |       fill: #FFFFFF;
515 |     }
516 |   </style>
517 | 
518 |       <script>
519 |         const buttonEl =
520 |           document.querySelector('#df-e023e6a4-4c3f-46d9-9958-2b8b0e640726 button.colab-df-convert');
521 |         buttonEl.style.display =
522 |           google.colab.kernel.accessAllowed ? 'block' : 'none';
523 | 
524 |         async function convertToInteractive(key) {
525 |           const element = document.querySelector('#df-e023e6a4-4c3f-46d9-9958-2b8b0e640726');
526 |           const dataTable =
527 |             await google.colab.kernel.invokeFunction('convertToInteractive',
528 |                                                      [key], {});
529 |           if (!dataTable) return;
530 | 
531 |           const docLinkHtml = 'Like what you see? Visit the ' +
532 |             '<a target="_blank" href=https://colab.research.google.com/notebooks/data_table.ipynb>data table notebook</a>'
533 |             + ' to learn more about interactive tables.';
534 |           element.innerHTML = '';
535 |           dataTable['output_type'] = 'display_data';
536 |           await google.colab.output.renderOutput(dataTable, element);
537 |           const docLink = document.createElement('div');
538 |           docLink.innerHTML = docLinkHtml;
539 |           element.appendChild(docLink);
540 |         }
541 |       </script>
542 |     </div>
543 |   </div>
544 | 
545 | 
546 | 
547 | 
548 | 
549 | ```python
550 | !pip install gradio -q
551 | ```
552 | 
553 |     [K     |████████████████████████████████| 865 kB 21.2 MB/s 
554 |     [K     |████████████████████████████████| 2.0 MB 37.0 MB/s 
555 |     [K     |████████████████████████████████| 210 kB 50.3 MB/s 
556 |     [K     |████████████████████████████████| 61 kB 345 kB/s 
557 |     [K     |████████████████████████████████| 856 kB 42.0 MB/s 
558 |     [K     |████████████████████████████████| 3.6 MB 54.9 MB/s 
559 |     [?25h  Building wheel for ffmpy (setup.py) ... [?25l[?25hdone
560 |       Building wheel for flask-cachebuster (setup.py) ... [?25l[?25hdone
561 | 
562 | 
563 | 
564 | ```python
565 | import gradio as gr
566 | inp = gr.inputs.Textbox(lines=2, default='what is coronavirus?', label="Question")
567 | out = gr.outputs.Dataframe(label="Answers")#gr.outputs.Textbox(label="Answers")
568 | gr.Interface(fn=get_answer_dpr, inputs=inp, outputs=out).launch()
569 | ```
570 | 
571 |     Colab notebook detected. To show errors in colab notebook, set `debug=True` in `launch()`
572 |     Running on public URL: https://21615.gradio.app
573 |     
574 |     This share link expires in 72 hours. For free permanent hosting, check out Spaces (https://huggingface.co/spaces)
575 | 
576 | 
577 | 
578 | 
579 | <iframe
580 |     width="900"
581 |     height="500"
582 |     src="https://21615.gradio.app"
583 |     frameborder="0"
584 |     allowfullscreen
585 | ></iframe>
586 | 
587 | 
588 | 
589 | 
590 | 
591 | 
592 |     (<Flask 'gradio.networking'>,
593 |      'http://127.0.0.1:7860/',
594 |      'https://21615.gradio.app')
595 | 
596 | 
597 | 
598 | 
599 | ```python
600 | 
601 | ```
602 | 


--------------------------------------------------------------------------------
/docs/index.md:
--------------------------------------------------------------------------------
1 | # NLP-Notebooks Collection


--------------------------------------------------------------------------------
/docs/knowledge_distillation_exploration.md:
--------------------------------------------------------------------------------
  1 | <a href="https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/knowledge_distillation_exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>
  2 | 
  3 | 
  4 | ```
  5 | import tensorflow as tf
  6 | from tensorflow import keras
  7 | from tensorflow.keras import layers
  8 | ```
  9 | 
 10 | 
 11 | ```
 12 | vocab_size = 20000  # Only consider the top 20k words
 13 | maxlen = 200  # Only consider the first 200 words of each movie review
 14 | (x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)
 15 | print(len(x_train), "Training sequences")
 16 | print(len(x_val), "Validation sequences")
 17 | x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)
 18 | x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)
 19 | ```
 20 | 
 21 |     25000 Training sequences
 22 |     25000 Validation sequences
 23 | 
 24 | 
 25 | 
 26 | ```
 27 | class Distiller(keras.Model):
 28 |     def __init__(self, student, teacher):
 29 |         super(Distiller, self).__init__()
 30 |         self.teacher = student
 31 |         self.student = teacher
 32 | 
 33 |     def compile(
 34 |         self,
 35 |         optimizer,
 36 |         metrics,
 37 |         student_loss_fn,
 38 |         distillation_loss_fn,
 39 |         alpha=0.1,
 40 |         temperature=3,
 41 |     ):
 42 |         """ Configure the distiller.
 43 | 
 44 |         Args:
 45 |             optimizer: Keras optimizer for the student weights
 46 |             metrics: Keras metrics for evaluation
 47 |             student_loss_fn: Loss function of difference between student
 48 |                 predictions and ground-truth
 49 |             distillation_loss_fn: Loss function of difference between soft
 50 |                 student predictions and soft teacher predictions
 51 |             alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn
 52 |             temperature: Temperature for softening probability distributions.
 53 |                 Larger temperature gives softer distributions.
 54 |         """
 55 |         super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)
 56 |         self.student_loss_fn = student_loss_fn
 57 |         self.distillation_loss_fn = distillation_loss_fn
 58 |         self.alpha = alpha
 59 |         self.temperature = temperature
 60 | 
 61 |     def train_step(self, data):
 62 |         # Unpack data
 63 |         x, y = data
 64 | 
 65 |         # Forward pass of teacher
 66 |         teacher_predictions = self.teacher(x, training=False)
 67 | 
 68 |         with tf.GradientTape() as tape:
 69 |             # Forward pass of student
 70 |             student_predictions = self.student(x, training=True)
 71 | 
 72 |             # Compute losses
 73 |             student_loss = self.student_loss_fn(y, student_predictions)
 74 |             distillation_loss = self.distillation_loss_fn(
 75 |                 tf.nn.softmax(teacher_predictions / self.temperature, axis=1),
 76 |                 tf.nn.softmax(student_predictions / self.temperature, axis=1),
 77 |             )
 78 |             loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss
 79 | 
 80 |         # Compute gradients
 81 |         trainable_vars = self.student.trainable_variables
 82 |         gradients = tape.gradient(loss, trainable_vars)
 83 | 
 84 |         # Update weights
 85 |         self.optimizer.apply_gradients(zip(gradients, trainable_vars))
 86 | 
 87 |         # Update the metrics configured in `compile()`.
 88 |         self.compiled_metrics.update_state(y, student_predictions)
 89 | 
 90 |         # Return a dict of performance
 91 |         results = {m.name: m.result() for m in self.metrics}
 92 |         results.update(
 93 |             {"student_loss": student_loss, "distillation_loss": distillation_loss}
 94 |         )
 95 |         return results
 96 | 
 97 |     def test_step(self, data):
 98 |         # Unpack the data
 99 |         x, y = data
100 | 
101 |         # Compute predictions
102 |         y_prediction = self.student(x, training=False)
103 | 
104 |         # Calculate the loss
105 |         student_loss = self.student_loss_fn(y, y_prediction)
106 | 
107 |         # Update the metrics.
108 |         self.compiled_metrics.update_state(y, y_prediction)
109 | 
110 |         # Return a dict of performance
111 |         results = {m.name: m.result() for m in self.metrics}
112 |         results.update({"student_loss": student_loss})
113 |         return results
114 | ```
115 | 
116 | 
117 | ```
118 | # Create the teacher
119 | # Input for variable-length sequences of integers
120 | inputs = keras.Input(shape=(maxlen,), dtype="int32")
121 | # Embed each integer in a 128-dimensional vector
122 | x = layers.Embedding(vocab_size, 128)(inputs)
123 | # Add 2 bidirectional LSTMs and GRUs
124 | x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)
125 | x = layers.Bidirectional(layers.GRU(128))(x)
126 | # Add a classifier
127 | outputs = layers.Dense(2)(x)
128 | teacher = keras.Model(inputs, outputs, name='teacher')
129 | teacher.summary()
130 | 
131 | 
132 | # Create the student
133 | # Input for variable-length sequences of integers
134 | inputs = keras.Input(shape=(maxlen,), dtype="int32")
135 | # Embed each integer in a 128-dimensional vector
136 | x = layers.Embedding(vocab_size, 64)(inputs)
137 | # Add 2 bidirectional LSTMs and GRUs
138 | x = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(x)
139 | x = layers.Bidirectional(layers.GRU(32))(x)
140 | # Add a classifier
141 | outputs = layers.Dense(2)(x)
142 | student = keras.Model(inputs, outputs, name='student')
143 | student.summary()
144 | 
145 | # Clone student for later comparison
146 | student_scratch = keras.models.clone_model(student)
147 | ```
148 | 
149 |     Model: "teacher"
150 |     _________________________________________________________________
151 |     Layer (type)                 Output Shape              Param #   
152 |     =================================================================
153 |     input_1 (InputLayer)         [(None, 200)]             0         
154 |     _________________________________________________________________
155 |     embedding (Embedding)        (None, 200, 128)          2560000   
156 |     _________________________________________________________________
157 |     bidirectional (Bidirectional (None, 200, 128)          98816     
158 |     _________________________________________________________________
159 |     bidirectional_1 (Bidirection (None, 256)               198144    
160 |     _________________________________________________________________
161 |     dense (Dense)                (None, 2)                 514       
162 |     =================================================================
163 |     Total params: 2,857,474
164 |     Trainable params: 2,857,474
165 |     Non-trainable params: 0
166 |     _________________________________________________________________
167 |     Model: "student"
168 |     _________________________________________________________________
169 |     Layer (type)                 Output Shape              Param #   
170 |     =================================================================
171 |     input_2 (InputLayer)         [(None, 200)]             0         
172 |     _________________________________________________________________
173 |     embedding_1 (Embedding)      (None, 200, 64)           1280000   
174 |     _________________________________________________________________
175 |     bidirectional_2 (Bidirection (None, 200, 64)           24832     
176 |     _________________________________________________________________
177 |     bidirectional_3 (Bidirection (None, 64)                18816     
178 |     _________________________________________________________________
179 |     dense_1 (Dense)              (None, 2)                 130       
180 |     =================================================================
181 |     Total params: 1,323,778
182 |     Trainable params: 1,323,778
183 |     Non-trainable params: 0
184 |     _________________________________________________________________
185 | 
186 | 
187 | 
188 | ```
189 | # Train teacher as usual
190 | teacher.compile(
191 |     optimizer=keras.optimizers.Adam(),
192 |     loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
193 |     metrics=[keras.metrics.SparseCategoricalAccuracy()],
194 | )
195 | 
196 | # Train and evaluate teacher on data.
197 | teacher.fit(x_train, y_train, epochs=3)
198 | teacher.evaluate(x_val, y_val)
199 | ```
200 | 
201 |     Epoch 1/3
202 |     782/782 [==============================] - 43s 55ms/step - loss: 0.3714 - sparse_categorical_accuracy: 0.8351
203 |     Epoch 2/3
204 |     782/782 [==============================] - 43s 55ms/step - loss: 0.2004 - sparse_categorical_accuracy: 0.9232
205 |     Epoch 3/3
206 |     782/782 [==============================] - 42s 54ms/step - loss: 0.1297 - sparse_categorical_accuracy: 0.9519
207 |     782/782 [==============================] - 12s 15ms/step - loss: 0.4922 - sparse_categorical_accuracy: 0.8590
208 | 
209 | 
210 | 
211 | 
212 | 
213 |     [0.49218621850013733, 0.8590400218963623]
214 | 
215 | 
216 | 
217 | 
218 | ```
219 | # Initialize and compile distiller
220 | distiller = Distiller(student=student, teacher=teacher)
221 | distiller.compile(
222 |     optimizer=keras.optimizers.Adam(),
223 |     metrics=[keras.metrics.SparseCategoricalAccuracy()],
224 |     student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
225 |     distillation_loss_fn=keras.losses.KLDivergence(),
226 |     alpha=0.1,
227 |     temperature=10,
228 | )
229 | 
230 | # Distill teacher to student
231 | distiller.fit(x_train, y_train, epochs=3)
232 | 
233 | # Evaluate student on test dataset
234 | distiller.evaluate(x_val, y_val)
235 | ```
236 | 
237 |     Epoch 1/3
238 |     782/782 [==============================] - 53s 68ms/step - sparse_categorical_accuracy: 0.9664 - student_loss: 0.1382 - distillation_loss: 0.0084
239 |     Epoch 2/3
240 |     782/782 [==============================] - 54s 69ms/step - sparse_categorical_accuracy: 0.9781 - student_loss: 0.1119 - distillation_loss: 0.0085
241 |     Epoch 3/3
242 |     782/782 [==============================] - 54s 69ms/step - sparse_categorical_accuracy: 0.9868 - student_loss: 0.0937 - distillation_loss: 0.0088
243 |     782/782 [==============================] - 12s 15ms/step - sparse_categorical_accuracy: 0.8476 - student_loss: 0.3908
244 | 
245 | 
246 | 
247 | 
248 | 
249 |     0.8475599884986877
250 | 
251 | 
252 | 
253 | 
254 | ```
255 | # Train student as doen usually
256 | student_scratch.compile(
257 |     optimizer=keras.optimizers.Adam(),
258 |     loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
259 |     metrics=[keras.metrics.SparseCategoricalAccuracy()],
260 | )
261 | 
262 | # Train and evaluate student trained from scratch.
263 | student_scratch.fit(x_train, y_train, epochs=3)
264 | 
265 | ```
266 | 
267 |     Epoch 1/3
268 |     782/782 [==============================] - 31s 39ms/step - loss: 0.3796 - sparse_categorical_accuracy: 0.8246
269 |     Epoch 2/3
270 |     782/782 [==============================] - 30s 39ms/step - loss: 0.1918 - sparse_categorical_accuracy: 0.9283
271 |     Epoch 3/3
272 |     782/782 [==============================] - 30s 39ms/step - loss: 0.1123 - sparse_categorical_accuracy: 0.9602
273 | 
274 | 
275 | 
276 |     ---------------------------------------------------------------------------
277 | 
278 |     NameError                                 Traceback (most recent call last)
279 | 
280 |     <ipython-input-7-2e22fcfd29d9> in <module>()
281 |           8 # Train and evaluate student trained from scratch.
282 |           9 student_scratch.fit(x_train, y_train, epochs=3)
283 |     ---> 10 student_scratch.evaluate(x_test, y_test)
284 |     
285 | 
286 |     NameError: name 'x_test' is not defined
287 | 
288 | 
289 | 
290 | ```
291 | student_scratch.evaluate(x_val, y_val)
292 | ```
293 | 
294 |     782/782 [==============================] - 12s 15ms/step - loss: 0.3720 - sparse_categorical_accuracy: 0.8596
295 | 
296 | 
297 | 
298 | 
299 | 
300 |     [0.37198370695114136, 0.8595600128173828]
301 | 
302 | 
303 | 
304 | 
305 | ```
306 | 
307 | ```
308 | 


--------------------------------------------------------------------------------
/docs/token_classification_transformers_zenml_files/token_classification_transformers_zenml_36_1.svg:
--------------------------------------------------------------------------------
  1 | <?xml version="1.0" encoding="UTF-8" standalone="no"?>
  2 | <!DOCTYPE svg PUBLIC "-//W3C//DTD SVG 1.1//EN"
  3 |  "http://www.w3.org/Graphics/SVG/1.1/DTD/svg11.dtd">
  4 | <!-- Generated by graphviz version 2.40.1 (20161225.0304)
  5 |  -->
  6 | <!-- Title: %3 Pages: 1 -->
  7 | <svg width="562pt" height="556pt"
  8 |  viewBox="0.00 0.00 562.25 556.00" xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink">
  9 | <g id="graph0" class="graph" transform="scale(1 1) rotate(0) translate(4 552)">
 10 | <title>%3</title>
 11 | <polygon fill="#ffffff" stroke="transparent" points="-4,4 -4,-552 558.2538,-552 558.2538,4 -4,4"/>
 12 | <!-- step_1 -->
 13 | <g id="node1" class="node">
 14 | <title>step_1</title>
 15 | <ellipse fill="none" stroke="#000000" cx="415.5" cy="-530" rx="62.2891" ry="18"/>
 16 | <text text-anchor="middle" x="415.5" y="-526.3" font-family="Times,serif" font-size="14.00" fill="#000000">data_importer</text>
 17 | </g>
 18 | <!-- artifact_1 -->
 19 | <g id="node2" class="node">
 20 | <title>artifact_1</title>
 21 | <polygon fill="none" stroke="#000000" points="520,-476 311,-476 311,-438 520,-438 520,-476"/>
 22 | <text text-anchor="middle" x="415.5" y="-460.8" font-family="Times,serif" font-size="14.00" fill="#000000">output </text>
 23 | <text text-anchor="middle" x="415.5" y="-445.8" font-family="Times,serif" font-size="14.00" fill="#000000">(datasets.dataset_dict.DatasetDict)</text>
 24 | </g>
 25 | <!-- step_1&#45;&gt;artifact_1 -->
 26 | <g id="edge1" class="edge">
 27 | <title>step_1&#45;&gt;artifact_1</title>
 28 | <path fill="none" stroke="#000000" d="M415.5,-511.9551C415.5,-504.2107 415.5,-494.9622 415.5,-486.2811"/>
 29 | <polygon fill="#000000" stroke="#000000" points="419.0001,-486.1128 415.5,-476.1128 412.0001,-486.1129 419.0001,-486.1128"/>
 30 | </g>
 31 | <!-- step_2 -->
 32 | <g id="node3" class="node">
 33 | <title>step_2</title>
 34 | <ellipse fill="none" stroke="#000000" cx="415.5" cy="-384" rx="56.59" ry="18"/>
 35 | <text text-anchor="middle" x="415.5" y="-380.3" font-family="Times,serif" font-size="14.00" fill="#000000">tokenization</text>
 36 | </g>
 37 | <!-- artifact_1&#45;&gt;step_2 -->
 38 | <g id="edge3" class="edge">
 39 | <title>artifact_1&#45;&gt;step_2</title>
 40 | <path fill="none" stroke="#000000" d="M415.5,-437.8201C415.5,-430.0975 415.5,-421.0282 415.5,-412.5585"/>
 41 | <polygon fill="#000000" stroke="#000000" points="419.0001,-412.2973 415.5,-402.2973 412.0001,-412.2974 419.0001,-412.2973"/>
 42 | </g>
 43 | <!-- artifact_2 -->
 44 | <g id="node4" class="node">
 45 | <title>artifact_2</title>
 46 | <polygon fill="none" stroke="#000000" points="520,-330 311,-330 311,-292 520,-292 520,-330"/>
 47 | <text text-anchor="middle" x="415.5" y="-314.8" font-family="Times,serif" font-size="14.00" fill="#000000">output </text>
 48 | <text text-anchor="middle" x="415.5" y="-299.8" font-family="Times,serif" font-size="14.00" fill="#000000">(datasets.dataset_dict.DatasetDict)</text>
 49 | </g>
 50 | <!-- step_2&#45;&gt;artifact_2 -->
 51 | <g id="edge2" class="edge">
 52 | <title>step_2&#45;&gt;artifact_2</title>
 53 | <path fill="none" stroke="#000000" d="M415.5,-365.9551C415.5,-358.2107 415.5,-348.9622 415.5,-340.2811"/>
 54 | <polygon fill="#000000" stroke="#000000" points="419.0001,-340.1128 415.5,-330.1128 412.0001,-340.1129 419.0001,-340.1128"/>
 55 | </g>
 56 | <!-- step_3 -->
 57 | <g id="node5" class="node">
 58 | <title>step_3</title>
 59 | <ellipse fill="none" stroke="#000000" cx="384.5" cy="-238" rx="34.394" ry="18"/>
 60 | <text text-anchor="middle" x="384.5" y="-234.3" font-family="Times,serif" font-size="14.00" fill="#000000">trainer</text>
 61 | </g>
 62 | <!-- artifact_2&#45;&gt;step_3 -->
 63 | <g id="edge5" class="edge">
 64 | <title>artifact_2&#45;&gt;step_3</title>
 65 | <path fill="none" stroke="#000000" d="M407.3551,-291.8201C403.8772,-283.6302 399.7561,-273.9257 395.9776,-265.0279"/>
 66 | <polygon fill="#000000" stroke="#000000" points="399.189,-263.6358 392.0587,-255.7994 392.7459,-266.372 399.189,-263.6358"/>
 67 | </g>
 68 | <!-- step_4 -->
 69 | <g id="node7" class="node">
 70 | <title>step_4</title>
 71 | <ellipse fill="none" stroke="#000000" cx="415.5" cy="-92" rx="44.6926" ry="18"/>
 72 | <text text-anchor="middle" x="415.5" y="-88.3" font-family="Times,serif" font-size="14.00" fill="#000000">evaluator</text>
 73 | </g>
 74 | <!-- artifact_2&#45;&gt;step_4 -->
 75 | <g id="edge7" class="edge">
 76 | <title>artifact_2&#45;&gt;step_4</title>
 77 | <path fill="none" stroke="#000000" d="M449.9792,-291.9148C499.6053,-261.9085 581.3649,-201.3963 544.5,-146 527.5488,-120.5277 496.0254,-107.0481 468.3848,-99.9266"/>
 78 | <polygon fill="#000000" stroke="#000000" points="468.9471,-96.4634 458.4113,-97.5801 467.344,-103.2773 468.9471,-96.4634"/>
 79 | </g>
 80 | <!-- artifact_3 -->
 81 | <g id="node6" class="node">
 82 | <title>artifact_3</title>
 83 | <polygon fill="none" stroke="#000000" points="535,-184 0,-184 0,-146 535,-146 535,-184"/>
 84 | <text text-anchor="middle" x="267.5" y="-168.8" font-family="Times,serif" font-size="14.00" fill="#000000">output </text>
 85 | <text text-anchor="middle" x="267.5" y="-153.8" font-family="Times,serif" font-size="14.00" fill="#000000">(transformers.models.distilbert.modeling_tf_distilbert.TFDistilBertForTokenClassification)</text>
 86 | </g>
 87 | <!-- step_3&#45;&gt;artifact_3 -->
 88 | <g id="edge4" class="edge">
 89 | <title>step_3&#45;&gt;artifact_3</title>
 90 | <path fill="none" stroke="#000000" d="M362.2178,-224.0974C346.5349,-214.3123 325.1265,-200.955 306.6303,-189.4146"/>
 91 | <polygon fill="#000000" stroke="#000000" points="308.4327,-186.4138 298.0959,-184.0897 304.7272,-192.3527 308.4327,-186.4138"/>
 92 | </g>
 93 | <!-- artifact_3&#45;&gt;step_4 -->
 94 | <g id="edge8" class="edge">
 95 | <title>artifact_3&#45;&gt;step_4</title>
 96 | <path fill="none" stroke="#000000" d="M306.3853,-145.8201C328.5671,-134.8791 356.2289,-121.235 378.1504,-110.4225"/>
 97 | <polygon fill="#000000" stroke="#000000" points="379.741,-113.5406 387.1611,-105.978 376.6444,-107.2627 379.741,-113.5406"/>
 98 | </g>
 99 | <!-- artifact_4 -->
100 | <g id="node8" class="node">
101 | <title>artifact_4</title>
102 | <polygon fill="none" stroke="#000000" points="464.5,-38 366.5,-38 366.5,0 464.5,0 464.5,-38"/>
103 | <text text-anchor="middle" x="415.5" y="-22.8" font-family="Times,serif" font-size="14.00" fill="#000000">output </text>
104 | <text text-anchor="middle" x="415.5" y="-7.8" font-family="Times,serif" font-size="14.00" fill="#000000">(builtins.float)</text>
105 | </g>
106 | <!-- step_4&#45;&gt;artifact_4 -->
107 | <g id="edge6" class="edge">
108 | <title>step_4&#45;&gt;artifact_4</title>
109 | <path fill="none" stroke="#000000" d="M415.5,-73.9551C415.5,-66.2107 415.5,-56.9622 415.5,-48.2811"/>
110 | <polygon fill="#000000" stroke="#000000" points="419.0001,-48.1128 415.5,-38.1128 412.0001,-48.1129 419.0001,-48.1128"/>
111 | </g>
112 | </g>
113 | </svg>
114 | 


--------------------------------------------------------------------------------
/knowledge-distillation/knowledge_distillation_exploration.ipynb:
--------------------------------------------------------------------------------
  1 | {
  2 |   "nbformat": 4,
  3 |   "nbformat_minor": 0,
  4 |   "metadata": {
  5 |     "colab": {
  6 |       "name": "knowledge_distillation_exploration.ipynb",
  7 |       "provenance": [],
  8 |       "authorship_tag": "ABX9TyPqqa3lzi3AUvvZifiQJtP8",
  9 |       "include_colab_link": true
 10 |     },
 11 |     "kernelspec": {
 12 |       "name": "python3",
 13 |       "display_name": "Python 3"
 14 |     },
 15 |     "accelerator": "GPU"
 16 |   },
 17 |   "cells": [
 18 |     {
 19 |       "cell_type": "markdown",
 20 |       "metadata": {
 21 |         "id": "view-in-github",
 22 |         "colab_type": "text"
 23 |       },
 24 |       "source": [
 25 |         "<a href=\"https://colab.research.google.com/github/Ankur3107/colab_notebooks/blob/master/knowledge_distillation_exploration.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
 26 |       ]
 27 |     },
 28 |     {
 29 |       "cell_type": "code",
 30 |       "metadata": {
 31 |         "id": "ykxHS6TBX6eS",
 32 |         "colab_type": "code",
 33 |         "colab": {}
 34 |       },
 35 |       "source": [
 36 |         "import tensorflow as tf\n",
 37 |         "from tensorflow import keras\n",
 38 |         "from tensorflow.keras import layers"
 39 |       ],
 40 |       "execution_count": 1,
 41 |       "outputs": []
 42 |     },
 43 |     {
 44 |       "cell_type": "code",
 45 |       "metadata": {
 46 |         "id": "ryolGkbbZJ2b",
 47 |         "colab_type": "code",
 48 |         "colab": {
 49 |           "base_uri": "https://localhost:8080/",
 50 |           "height": 51
 51 |         },
 52 |         "outputId": "e3242dc0-00a2-47e1-9d64-1d8212a07153"
 53 |       },
 54 |       "source": [
 55 |         "vocab_size = 20000  # Only consider the top 20k words\n",
 56 |         "maxlen = 200  # Only consider the first 200 words of each movie review\n",
 57 |         "(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size)\n",
 58 |         "print(len(x_train), \"Training sequences\")\n",
 59 |         "print(len(x_val), \"Validation sequences\")\n",
 60 |         "x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=maxlen)\n",
 61 |         "x_val = keras.preprocessing.sequence.pad_sequences(x_val, maxlen=maxlen)"
 62 |       ],
 63 |       "execution_count": 2,
 64 |       "outputs": [
 65 |         {
 66 |           "output_type": "stream",
 67 |           "text": [
 68 |             "25000 Training sequences\n",
 69 |             "25000 Validation sequences\n"
 70 |           ],
 71 |           "name": "stdout"
 72 |         }
 73 |       ]
 74 |     },
 75 |     {
 76 |       "cell_type": "code",
 77 |       "metadata": {
 78 |         "id": "WGiLAkFRZP3S",
 79 |         "colab_type": "code",
 80 |         "colab": {}
 81 |       },
 82 |       "source": [
 83 |         "class Distiller(keras.Model):\n",
 84 |         "    def __init__(self, student, teacher):\n",
 85 |         "        super(Distiller, self).__init__()\n",
 86 |         "        self.teacher = student\n",
 87 |         "        self.student = teacher\n",
 88 |         "\n",
 89 |         "    def compile(\n",
 90 |         "        self,\n",
 91 |         "        optimizer,\n",
 92 |         "        metrics,\n",
 93 |         "        student_loss_fn,\n",
 94 |         "        distillation_loss_fn,\n",
 95 |         "        alpha=0.1,\n",
 96 |         "        temperature=3,\n",
 97 |         "    ):\n",
 98 |         "        \"\"\" Configure the distiller.\n",
 99 |         "\n",
100 |         "        Args:\n",
101 |         "            optimizer: Keras optimizer for the student weights\n",
102 |         "            metrics: Keras metrics for evaluation\n",
103 |         "            student_loss_fn: Loss function of difference between student\n",
104 |         "                predictions and ground-truth\n",
105 |         "            distillation_loss_fn: Loss function of difference between soft\n",
106 |         "                student predictions and soft teacher predictions\n",
107 |         "            alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn\n",
108 |         "            temperature: Temperature for softening probability distributions.\n",
109 |         "                Larger temperature gives softer distributions.\n",
110 |         "        \"\"\"\n",
111 |         "        super(Distiller, self).compile(optimizer=optimizer, metrics=metrics)\n",
112 |         "        self.student_loss_fn = student_loss_fn\n",
113 |         "        self.distillation_loss_fn = distillation_loss_fn\n",
114 |         "        self.alpha = alpha\n",
115 |         "        self.temperature = temperature\n",
116 |         "\n",
117 |         "    def train_step(self, data):\n",
118 |         "        # Unpack data\n",
119 |         "        x, y = data\n",
120 |         "\n",
121 |         "        # Forward pass of teacher\n",
122 |         "        teacher_predictions = self.teacher(x, training=False)\n",
123 |         "\n",
124 |         "        with tf.GradientTape() as tape:\n",
125 |         "            # Forward pass of student\n",
126 |         "            student_predictions = self.student(x, training=True)\n",
127 |         "\n",
128 |         "            # Compute losses\n",
129 |         "            student_loss = self.student_loss_fn(y, student_predictions)\n",
130 |         "            distillation_loss = self.distillation_loss_fn(\n",
131 |         "                tf.nn.softmax(teacher_predictions / self.temperature, axis=1),\n",
132 |         "                tf.nn.softmax(student_predictions / self.temperature, axis=1),\n",
133 |         "            )\n",
134 |         "            loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss\n",
135 |         "\n",
136 |         "        # Compute gradients\n",
137 |         "        trainable_vars = self.student.trainable_variables\n",
138 |         "        gradients = tape.gradient(loss, trainable_vars)\n",
139 |         "\n",
140 |         "        # Update weights\n",
141 |         "        self.optimizer.apply_gradients(zip(gradients, trainable_vars))\n",
142 |         "\n",
143 |         "        # Update the metrics configured in `compile()`.\n",
144 |         "        self.compiled_metrics.update_state(y, student_predictions)\n",
145 |         "\n",
146 |         "        # Return a dict of performance\n",
147 |         "        results = {m.name: m.result() for m in self.metrics}\n",
148 |         "        results.update(\n",
149 |         "            {\"student_loss\": student_loss, \"distillation_loss\": distillation_loss}\n",
150 |         "        )\n",
151 |         "        return results\n",
152 |         "\n",
153 |         "    def test_step(self, data):\n",
154 |         "        # Unpack the data\n",
155 |         "        x, y = data\n",
156 |         "\n",
157 |         "        # Compute predictions\n",
158 |         "        y_prediction = self.student(x, training=False)\n",
159 |         "\n",
160 |         "        # Calculate the loss\n",
161 |         "        student_loss = self.student_loss_fn(y, y_prediction)\n",
162 |         "\n",
163 |         "        # Update the metrics.\n",
164 |         "        self.compiled_metrics.update_state(y, y_prediction)\n",
165 |         "\n",
166 |         "        # Return a dict of performance\n",
167 |         "        results = {m.name: m.result() for m in self.metrics}\n",
168 |         "        results.update({\"student_loss\": student_loss})\n",
169 |         "        return results"
170 |       ],
171 |       "execution_count": 3,
172 |       "outputs": []
173 |     },
174 |     {
175 |       "cell_type": "code",
176 |       "metadata": {
177 |         "id": "Hbo44nFoZclu",
178 |         "colab_type": "code",
179 |         "colab": {
180 |           "base_uri": "https://localhost:8080/",
181 |           "height": 629
182 |         },
183 |         "outputId": "2cad14a0-0bb4-4cf6-ba87-eba58ed59309"
184 |       },
185 |       "source": [
186 |         "# Create the teacher\n",
187 |         "# Input for variable-length sequences of integers\n",
188 |         "inputs = keras.Input(shape=(maxlen,), dtype=\"int32\")\n",
189 |         "# Embed each integer in a 128-dimensional vector\n",
190 |         "x = layers.Embedding(vocab_size, 128)(inputs)\n",
191 |         "# Add 2 bidirectional LSTMs and GRUs\n",
192 |         "x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x)\n",
193 |         "x = layers.Bidirectional(layers.GRU(128))(x)\n",
194 |         "# Add a classifier\n",
195 |         "outputs = layers.Dense(2)(x)\n",
196 |         "teacher = keras.Model(inputs, outputs, name='teacher')\n",
197 |         "teacher.summary()\n",
198 |         "\n",
199 |         "\n",
200 |         "# Create the student\n",
201 |         "# Input for variable-length sequences of integers\n",
202 |         "inputs = keras.Input(shape=(maxlen,), dtype=\"int32\")\n",
203 |         "# Embed each integer in a 128-dimensional vector\n",
204 |         "x = layers.Embedding(vocab_size, 64)(inputs)\n",
205 |         "# Add 2 bidirectional LSTMs and GRUs\n",
206 |         "x = layers.Bidirectional(layers.LSTM(32, return_sequences=True))(x)\n",
207 |         "x = layers.Bidirectional(layers.GRU(32))(x)\n",
208 |         "# Add a classifier\n",
209 |         "outputs = layers.Dense(2)(x)\n",
210 |         "student = keras.Model(inputs, outputs, name='student')\n",
211 |         "student.summary()\n",
212 |         "\n",
213 |         "# Clone student for later comparison\n",
214 |         "student_scratch = keras.models.clone_model(student)"
215 |       ],
216 |       "execution_count": 4,
217 |       "outputs": [
218 |         {
219 |           "output_type": "stream",
220 |           "text": [
221 |             "Model: \"teacher\"\n",
222 |             "_________________________________________________________________\n",
223 |             "Layer (type)                 Output Shape              Param #   \n",
224 |             "=================================================================\n",
225 |             "input_1 (InputLayer)         [(None, 200)]             0         \n",
226 |             "_________________________________________________________________\n",
227 |             "embedding (Embedding)        (None, 200, 128)          2560000   \n",
228 |             "_________________________________________________________________\n",
229 |             "bidirectional (Bidirectional (None, 200, 128)          98816     \n",
230 |             "_________________________________________________________________\n",
231 |             "bidirectional_1 (Bidirection (None, 256)               198144    \n",
232 |             "_________________________________________________________________\n",
233 |             "dense (Dense)                (None, 2)                 514       \n",
234 |             "=================================================================\n",
235 |             "Total params: 2,857,474\n",
236 |             "Trainable params: 2,857,474\n",
237 |             "Non-trainable params: 0\n",
238 |             "_________________________________________________________________\n",
239 |             "Model: \"student\"\n",
240 |             "_________________________________________________________________\n",
241 |             "Layer (type)                 Output Shape              Param #   \n",
242 |             "=================================================================\n",
243 |             "input_2 (InputLayer)         [(None, 200)]             0         \n",
244 |             "_________________________________________________________________\n",
245 |             "embedding_1 (Embedding)      (None, 200, 64)           1280000   \n",
246 |             "_________________________________________________________________\n",
247 |             "bidirectional_2 (Bidirection (None, 200, 64)           24832     \n",
248 |             "_________________________________________________________________\n",
249 |             "bidirectional_3 (Bidirection (None, 64)                18816     \n",
250 |             "_________________________________________________________________\n",
251 |             "dense_1 (Dense)              (None, 2)                 130       \n",
252 |             "=================================================================\n",
253 |             "Total params: 1,323,778\n",
254 |             "Trainable params: 1,323,778\n",
255 |             "Non-trainable params: 0\n",
256 |             "_________________________________________________________________\n"
257 |           ],
258 |           "name": "stdout"
259 |         }
260 |       ]
261 |     },
262 |     {
263 |       "cell_type": "code",
264 |       "metadata": {
265 |         "id": "C0t_Mc07aBH1",
266 |         "colab_type": "code",
267 |         "colab": {
268 |           "base_uri": "https://localhost:8080/",
269 |           "height": 153
270 |         },
271 |         "outputId": "84e82f1d-9dcd-44d4-e078-87440bccd1cb"
272 |       },
273 |       "source": [
274 |         "# Train teacher as usual\n",
275 |         "teacher.compile(\n",
276 |         "    optimizer=keras.optimizers.Adam(),\n",
277 |         "    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
278 |         "    metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
279 |         ")\n",
280 |         "\n",
281 |         "# Train and evaluate teacher on data.\n",
282 |         "teacher.fit(x_train, y_train, epochs=3)\n",
283 |         "teacher.evaluate(x_val, y_val)"
284 |       ],
285 |       "execution_count": 5,
286 |       "outputs": [
287 |         {
288 |           "output_type": "stream",
289 |           "text": [
290 |             "Epoch 1/3\n",
291 |             "782/782 [==============================] - 43s 55ms/step - loss: 0.3714 - sparse_categorical_accuracy: 0.8351\n",
292 |             "Epoch 2/3\n",
293 |             "782/782 [==============================] - 43s 55ms/step - loss: 0.2004 - sparse_categorical_accuracy: 0.9232\n",
294 |             "Epoch 3/3\n",
295 |             "782/782 [==============================] - 42s 54ms/step - loss: 0.1297 - sparse_categorical_accuracy: 0.9519\n",
296 |             "782/782 [==============================] - 12s 15ms/step - loss: 0.4922 - sparse_categorical_accuracy: 0.8590\n"
297 |           ],
298 |           "name": "stdout"
299 |         },
300 |         {
301 |           "output_type": "execute_result",
302 |           "data": {
303 |             "text/plain": [
304 |               "[0.49218621850013733, 0.8590400218963623]"
305 |             ]
306 |           },
307 |           "metadata": {
308 |             "tags": []
309 |           },
310 |           "execution_count": 5
311 |         }
312 |       ]
313 |     },
314 |     {
315 |       "cell_type": "code",
316 |       "metadata": {
317 |         "id": "6diJumxgaU1e",
318 |         "colab_type": "code",
319 |         "colab": {
320 |           "base_uri": "https://localhost:8080/",
321 |           "height": 153
322 |         },
323 |         "outputId": "827b0b52-e59a-4c83-81e7-8252a0a6287d"
324 |       },
325 |       "source": [
326 |         "# Initialize and compile distiller\n",
327 |         "distiller = Distiller(student=student, teacher=teacher)\n",
328 |         "distiller.compile(\n",
329 |         "    optimizer=keras.optimizers.Adam(),\n",
330 |         "    metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
331 |         "    student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
332 |         "    distillation_loss_fn=keras.losses.KLDivergence(),\n",
333 |         "    alpha=0.1,\n",
334 |         "    temperature=10,\n",
335 |         ")\n",
336 |         "\n",
337 |         "# Distill teacher to student\n",
338 |         "distiller.fit(x_train, y_train, epochs=3)\n",
339 |         "\n",
340 |         "# Evaluate student on test dataset\n",
341 |         "distiller.evaluate(x_val, y_val)"
342 |       ],
343 |       "execution_count": 6,
344 |       "outputs": [
345 |         {
346 |           "output_type": "stream",
347 |           "text": [
348 |             "Epoch 1/3\n",
349 |             "782/782 [==============================] - 53s 68ms/step - sparse_categorical_accuracy: 0.9664 - student_loss: 0.1382 - distillation_loss: 0.0084\n",
350 |             "Epoch 2/3\n",
351 |             "782/782 [==============================] - 54s 69ms/step - sparse_categorical_accuracy: 0.9781 - student_loss: 0.1119 - distillation_loss: 0.0085\n",
352 |             "Epoch 3/3\n",
353 |             "782/782 [==============================] - 54s 69ms/step - sparse_categorical_accuracy: 0.9868 - student_loss: 0.0937 - distillation_loss: 0.0088\n",
354 |             "782/782 [==============================] - 12s 15ms/step - sparse_categorical_accuracy: 0.8476 - student_loss: 0.3908\n"
355 |           ],
356 |           "name": "stdout"
357 |         },
358 |         {
359 |           "output_type": "execute_result",
360 |           "data": {
361 |             "text/plain": [
362 |               "0.8475599884986877"
363 |             ]
364 |           },
365 |           "metadata": {
366 |             "tags": []
367 |           },
368 |           "execution_count": 6
369 |         }
370 |       ]
371 |     },
372 |     {
373 |       "cell_type": "code",
374 |       "metadata": {
375 |         "id": "v_sZMYFXafAc",
376 |         "colab_type": "code",
377 |         "colab": {
378 |           "base_uri": "https://localhost:8080/",
379 |           "height": 300
380 |         },
381 |         "outputId": "98b839c7-b5f5-4a40-c822-fefd510eedb9"
382 |       },
383 |       "source": [
384 |         "# Train student as doen usually\n",
385 |         "student_scratch.compile(\n",
386 |         "    optimizer=keras.optimizers.Adam(),\n",
387 |         "    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),\n",
388 |         "    metrics=[keras.metrics.SparseCategoricalAccuracy()],\n",
389 |         ")\n",
390 |         "\n",
391 |         "# Train and evaluate student trained from scratch.\n",
392 |         "student_scratch.fit(x_train, y_train, epochs=3)\n"
393 |       ],
394 |       "execution_count": 7,
395 |       "outputs": [
396 |         {
397 |           "output_type": "stream",
398 |           "text": [
399 |             "Epoch 1/3\n",
400 |             "782/782 [==============================] - 31s 39ms/step - loss: 0.3796 - sparse_categorical_accuracy: 0.8246\n",
401 |             "Epoch 2/3\n",
402 |             "782/782 [==============================] - 30s 39ms/step - loss: 0.1918 - sparse_categorical_accuracy: 0.9283\n",
403 |             "Epoch 3/3\n",
404 |             "782/782 [==============================] - 30s 39ms/step - loss: 0.1123 - sparse_categorical_accuracy: 0.9602\n"
405 |           ],
406 |           "name": "stdout"
407 |         },
408 |         {
409 |           "output_type": "error",
410 |           "ename": "NameError",
411 |           "evalue": "ignored",
412 |           "traceback": [
413 |             "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
414 |             "\u001b[0;31mNameError\u001b[0m                                 Traceback (most recent call last)",
415 |             "\u001b[0;32m<ipython-input-7-2e22fcfd29d9>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m()\u001b[0m\n\u001b[1;32m      8\u001b[0m \u001b[0;31m# Train and evaluate student trained from scratch.\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m      9\u001b[0m \u001b[0mstudent_scratch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mfit\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_train\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mepochs\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;36m3\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m---> 10\u001b[0;31m \u001b[0mstudent_scratch\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mevaluate\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mx_test\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0my_test\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
416 |             "\u001b[0;31mNameError\u001b[0m: name 'x_test' is not defined"
417 |           ]
418 |         }
419 |       ]
420 |     },
421 |     {
422 |       "cell_type": "code",
423 |       "metadata": {
424 |         "id": "xmd5LaNpakwp",
425 |         "colab_type": "code",
426 |         "colab": {
427 |           "base_uri": "https://localhost:8080/",
428 |           "height": 51
429 |         },
430 |         "outputId": "f19a9c16-5ff2-4a24-ebe0-0bec4e90e0f9"
431 |       },
432 |       "source": [
433 |         "student_scratch.evaluate(x_val, y_val)"
434 |       ],
435 |       "execution_count": 9,
436 |       "outputs": [
437 |         {
438 |           "output_type": "stream",
439 |           "text": [
440 |             "782/782 [==============================] - 12s 15ms/step - loss: 0.3720 - sparse_categorical_accuracy: 0.8596\n"
441 |           ],
442 |           "name": "stdout"
443 |         },
444 |         {
445 |           "output_type": "execute_result",
446 |           "data": {
447 |             "text/plain": [
448 |               "[0.37198370695114136, 0.8595600128173828]"
449 |             ]
450 |           },
451 |           "metadata": {
452 |             "tags": []
453 |           },
454 |           "execution_count": 9
455 |         }
456 |       ]
457 |     },
458 |     {
459 |       "cell_type": "code",
460 |       "metadata": {
461 |         "id": "GUPSHu9Td6CI",
462 |         "colab_type": "code",
463 |         "colab": {}
464 |       },
465 |       "source": [
466 |         ""
467 |       ],
468 |       "execution_count": null,
469 |       "outputs": []
470 |     }
471 |   ]
472 | }


--------------------------------------------------------------------------------
/mkdocs.yml:
--------------------------------------------------------------------------------
 1 | site_name: NLP-Notebooks
 2 | nav:
 3 |     - BERT_Fine_Tuning_Sentence_Classification_v2.md
 4 |     - Bert_Classification_Pt.md
 5 |     - Bert_Pre_Training.md
 6 |     - Doc_Visual_QA_and_Bill_extraction_demo.md
 7 |     - GPT_2_on_Onnx_CPU.md
 8 |     - Generic_Transformer_Classification.md
 9 |     - Question_Answering_with_a_Fine_Tuned_BERT.md
10 |     - Seq2Seq_Pytorch.md
11 |     - SetFit_SST_2_Few_shot.md
12 |     - Simpletransformers_2.md
13 |     - TAPAS_fine_tuning_in_tf.md
14 |     - Using_Transformers_with_Fastai_Tutorial.md
15 |     - Wikipedia_answer_retrieval_DPR.md
16 |     - contextual_topic_modeling.md
17 |     - knowledge_distillation_exploration.md
18 |     - large_scale_multilabelclassification.md
19 |     - simpletransformers_intro.md
20 |     - token_classification_transformers_zenml.md
21 | theme: readthedocs


--------------------------------------------------------------------------------