├── .gitignore ├── LICENCE ├── README.md ├── augment.png ├── examples ├── aeda_example.ipynb ├── eda_example.ipynb ├── fasttext_example.ipynb ├── mixup_example_using_IMDB_sentiment.ipynb └── word2vec_example.ipynb ├── requirements.txt ├── setup.py ├── tests ├── test_translate.py ├── test_word2vec.py └── test_wordnet.py └── textaugment ├── __init__.py ├── aeda.py ├── constants.py ├── eda.py ├── mixup.py ├── translate.py ├── word2vec.py └── wordnet.py /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 105 | -------------------------------------------------------------------------------- /LICENCE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Joseph Sefara 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 
14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | 2 | 3 | # [TextAugment: Improving Short Text Classification through Global Augmentation Methods](https://arxiv.org/abs/1907.03752) 4 | 5 | [![licence](https://img.shields.io/github/license/dsfsi/textaugment.svg?maxAge=3600)](https://github.com/dsfsi/textaugment/blob/master/LICENCE) [![GitHub release](https://img.shields.io/github/release/dsfsi/textaugment.svg?maxAge=3600)](https://github.com/dsfsi/textaugment/releases) [![Wheel](https://img.shields.io/pypi/wheel/textaugment.svg?maxAge=3600)](https://pypi.python.org/pypi/textaugment) [![python](https://img.shields.io/pypi/pyversions/textaugment.svg?maxAge=3600)](https://pypi.org/project/textaugment/) [![TotalDownloads](https://pepy.tech/badge/textaugment)](https://pypi.org/project/textaugment/) [![Downloads](https://static.pepy.tech/badge/textaugment/month)](https://pypi.org/project/textaugment/) [![LNCS](https://img.shields.io/badge/LNCS-Book%20Chapter-B31B1B.svg)](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21) [![arxiv](https://img.shields.io/badge/cs.CL-arXiv%3A1907.03752-B31B1B.svg)](https://arxiv.org/abs/1907.03752) 6 | 7 | 8 | ## You have just found TextAugment. 9 | 10 | TextAugment is a Python 3 library for augmenting text for natural language processing applications. 
TextAugment stands on the giant shoulders of [NLTK](https://www.nltk.org/), [Gensim v3.x](https://radimrehurek.com/gensim/), and [TextBlob](https://textblob.readthedocs.io/) and plays nicely with them. 11 | 12 | ## Acknowledgements 13 | Cite this [paper](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21) when using this library. [Arxiv Version](https://arxiv.org/abs/1907.03752) 14 | 15 | ``` 16 | @inproceedings{marivate2020improving, 17 | title={Improving short text classification through global augmentation methods}, 18 | author={Marivate, Vukosi and Sefara, Tshephisho}, 19 | booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction}, 20 | pages={385--399}, 21 | year={2020}, 22 | organization={Springer} 23 | } 24 | ``` 25 | 26 | # Table of Contents 27 | 28 | - [Features](#Features) 29 | - [Citation Paper](#citation-paper) 30 | - [Requirements](#Requirements) 31 | - [Installation](#Installation) 32 | - [How to use](#How-to-use) 33 | - [Word2vec-based augmentation](#Word2vec-based-augmentation) 34 | - [WordNet-based augmentation](#WordNet-based-augmentation) 35 | - [RTT-based augmentation](#RTT-based-augmentation) 36 | - [Easy data augmentation (EDA)](#eda-easy-data-augmentation-techniques-for-boosting-performance-on-text-classification-tasks) 37 | - [An easier data augmentation (AEDA)](#aeda-an-easier-data-augmentation-technique-for-text-classification) 38 | - [Mixup augmentation](#mixup-augmentation) 39 | - [Implementation](#Implementation) 40 | - [Acknowledgements](#Acknowledgements) 41 | 42 | ## Features 43 | 44 | - Generate synthetic data for improving model performance without manual effort 45 | - Simple, lightweight, easy-to-use library. 46 | - Plug and play to any machine learning frameworks (e.g. 
PyTorch, TensorFlow, Scikit-learn) 47 | - Support textual data 48 | 49 | ## Citation Paper 50 | 51 | **[Improving short text classification through global augmentation methods](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21)**. 52 | 53 | 54 | 55 | ![alt text](https://raw.githubusercontent.com/dsfsi/textaugment/master/augment.png "Augmentation methods") 56 | 57 | ### Requirements 58 | 59 | * Python 3 60 | 61 | The following software packages are dependencies and will be installed automatically. 62 | 63 | ```shell 64 | $ pip install numpy nltk gensim==3.8.3 textblob googletrans 65 | 66 | ``` 67 | The following code downloads NLTK corpus for [wordnet](http://www.nltk.org/howto/wordnet.html). 68 | ```python 69 | nltk.download('wordnet') 70 | ``` 71 | The following code downloads [NLTK tokenizer](https://www.nltk.org/_modules/nltk/tokenize/punkt.html). This tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences. 72 | ```python 73 | nltk.download('punkt') 74 | ``` 75 | The following code downloads default [NLTK part-of-speech tagger](https://www.nltk.org/_modules/nltk/tag.html) model. A part-of-speech tagger processes a sequence of words, and attaches a part of speech tag to each word. 76 | ```python 77 | nltk.download('averaged_perceptron_tagger') 78 | ``` 79 | Use gensim to load a pre-trained word2vec model. Like [Google News from Google drive](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit). 
80 | ```python 81 | import gensim 82 | model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True) 83 | ``` 84 | You can also use gensim to load Facebook's Fasttext [English](https://fasttext.cc/docs/en/english-vectors.html) and [Multilingual models](https://fasttext.cc/docs/en/crawl-vectors.html) 85 | ``` 86 | import gensim 87 | model = gensim.models.fasttext.load_facebook_model('./cc.en.300.bin.gz') 88 | ``` 89 | 90 | Or training one from scratch using your data or the following public dataset: 91 | 92 | - [Text8 Wiki](http://mattmahoney.net/dc/enwik9.zip) 93 | 94 | - [Dataset from "One Billion Word Language Modeling Benchmark"](http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz) 95 | 96 | ### Installation 97 | 98 | Install from pip [Recommended] 99 | ```sh 100 | $ pip install textaugment 101 | or install latest release 102 | $ pip install git+https://github.com/dsfsi/textaugment.git 103 | ``` 104 | 105 | Install from source 106 | ```sh 107 | $ git clone git@github.com:dsfsi/textaugment.git 108 | $ cd textaugment 109 | $ python setup.py install 110 | ``` 111 | 112 | ### How to use 113 | 114 | There are four types of augmentations which can be used: 115 | 116 | - word2vec 117 | 118 | ```python 119 | from textaugment import Word2vec 120 | ``` 121 | - fasttext 122 | 123 | ```python 124 | from textaugment import Fasttext 125 | ``` 126 | 127 | - wordnet 128 | ```python 129 | from textaugment import Wordnet 130 | ``` 131 | - translate (This will require internet access) 132 | ```python 133 | from textaugment import Translate 134 | ``` 135 | #### Fasttext/Word2vec-based augmentation 136 | 137 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/word2vec_example.ipynb) 138 | 139 | **Basic example** 140 | 141 | ```python 142 | >>> from textaugment import Word2vec, Fasttext 143 | >>> t = Word2vec(model='path/to/gensim/model' or 'gensim model 
itself') 144 | >>> t.augment('The stories are good') 145 | The films are good 146 | >>> t = Fasttext(model='path/to/gensim/model' or 'gensim model itself') 147 | >>> t.augment('The stories are good') 148 | The films are good 149 | ``` 150 | **Advanced example** 151 | 152 | ```python 153 | >>> runs = 1 # By default. 154 | >>> v = False # verbose mode to replace all the words. If enabled runs is not effective. Used in this paper (https://www.cs.cmu.edu/~diyiy/docs/emnlp_wang_2015.pdf) 155 | >>> p = 0.5 # The probability of success of an individual trial. (0.1<p<1) 156 | 157 | >>> word = Word2vec(model='path/to/gensim/model' or 'gensim model itself', runs=5, v=False, p=0.5) 158 | >>> word.augment('The stories are good', top_n=10) 159 | The movies are excellent 160 | >>> fast = Fasttext(model='path/to/gensim/model' or 'gensim model itself', runs=5, v=False, p=0.5) 161 | >>> fast.augment('The stories are good', top_n=10) 162 | The movies are excellent 163 | ``` 164 | #### WordNet-based augmentation 165 | **Basic example** 166 | ```python 167 | >>> import nltk 168 | >>> nltk.download('punkt') 169 | >>> nltk.download('wordnet') 170 | >>> from textaugment import Wordnet 171 | >>> t = Wordnet() 172 | >>> t.augment('In the afternoon, John is going to town') 173 | In the afternoon, John is walking to town 174 | ``` 175 | **Advanced example** 176 | 177 | ```python 178 | >>> v = True # enable verbs augmentation. By default is True. 179 | >>> n = False # enable nouns augmentation. By default is False. 180 | >>> runs = 1 # number of times to augment a sentence. By default is 1. 181 | >>> p = 0.5 # The probability of success of an individual trial. (0.1<p<1) 182 | 183 | >>> t = Wordnet(v=False, n=True, p=0.5) 184 | >>> t.augment('In the afternoon, John is going to town', top_n=10) 185 | In the afternoon, Joseph is going to town. 
186 | ``` 187 | #### RTT-based augmentation 188 | **Example** 189 | ```python 190 | >>> src = "en" # source language of the sentence 191 | >>> to = "fr" # target language 192 | >>> from textaugment import Translate 193 | >>> t = Translate(src="en", to="fr") 194 | >>> t.augment('In the afternoon, John is going to town') 195 | In the afternoon John goes to town 196 | ``` 197 | # EDA: Easy data augmentation techniques for boosting performance on text classification tasks 198 | ## This is the implementation of EDA by Jason Wei and Kai Zou. 199 | 200 | https://www.aclweb.org/anthology/D19-1670.pdf 201 | 202 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/eda_example.ipynb) 203 | 204 | #### Synonym Replacement 205 | Randomly choose *n* words from the sentence that are not stop words. Replace each of these words with 206 | one of its synonyms chosen at random. 207 | 208 | **Basic example** 209 | ```python 210 | >>> from textaugment import EDA 211 | >>> t = EDA() 212 | >>> t.synonym_replacement("John is going to town", top_n=10) 213 | John is give out to town 214 | ``` 215 | 216 | #### Random Deletion 217 | Randomly remove each word in the sentence with probability *p*. 218 | 219 | **Basic example** 220 | ```python 221 | >>> from textaugment import EDA 222 | >>> t = EDA() 223 | >>> t.random_deletion("John is going to town", p=0.2) 224 | is going to town 225 | ``` 226 | 227 | #### Random Swap 228 | Randomly choose two words in the sentence and swap their positions. Do this n times. 229 | 230 | **Basic example** 231 | ```python 232 | >>> from textaugment import EDA 233 | >>> t = EDA() 234 | >>> t.random_swap("John is going to town") 235 | John town going to is 236 | ``` 237 | 238 | #### Random Insertion 239 | Find a random synonym of a random word in the sentence that is not a stop word. Insert that synonym into a random position in the sentence. 
Do this n times. 240 | 241 | **Basic example** 242 | ```python 243 | >>> from textaugment import EDA 244 | >>> t = EDA() 245 | >>> t.random_insertion("John is going to town") 246 | John is going to make up town 247 | ``` 248 | 249 | # AEDA: An easier data augmentation technique for text classification 250 | 251 | This is the implementation of AEDA by Karimi et al., a variant of EDA. It is based on the random insertion of punctuation marks. 252 | 253 | https://aclanthology.org/2021.findings-emnlp.234.pdf 254 | 255 | ## Implementation 256 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/aeda_example.ipynb) 257 | 258 | #### Random Insertion of Punctuation Marks 259 | 260 | **Basic example** 261 | ```python 262 | >>> from textaugment import AEDA 263 | >>> t = AEDA() 264 | >>> t.punct_insertion("John is going to town") 265 | ! John is going to town 266 | ``` 267 | 268 | # Mixup augmentation 269 | 270 | This is the implementation of mixup augmentation by [Hongyi Zhang, Moustapha Cisse, Yann Dauphin, David Lopez-Paz](https://openreview.net/forum?id=r1Ddp1-Rb) adapted to NLP. 271 | 272 | Used in [Augmenting Data with Mixup for Sentence Classification: An Empirical Study](https://arxiv.org/abs/1905.08941). 273 | 274 | Mixup is a generic and straightforward data augmentation principle. In essence, mixup trains a neural network on convex combinations of pairs of examples and their labels. By doing so, mixup regularises the neural network to favour simple linear behaviour in-between training examples. 
275 | 276 | ## Implementation 277 | 278 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/mixup_example_using_IMDB_sentiment.ipynb) 279 | 280 | ## Built with ❤ on 281 | * [Python](http://python.org/) 282 | 283 | ## Authors 284 | * [Joseph Sefara](https://za.linkedin.com/in/josephsefara) (http://www.speechtech.co.za) 285 | * [Vukosi Marivate](http://www.vima.co.za) (http://www.vima.co.za) 286 | 287 | ## Acknowledgements 288 | Cite this [paper](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21) when using this library. [Arxiv Version](https://arxiv.org/abs/1907.03752) 289 | 290 | ``` 291 | @inproceedings{marivate2020improving, 292 | title={Improving short text classification through global augmentation methods}, 293 | author={Marivate, Vukosi and Sefara, Tshephisho}, 294 | booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction}, 295 | pages={385--399}, 296 | year={2020}, 297 | organization={Springer} 298 | } 299 | ``` 300 | 301 | ## Licence 302 | MIT licensed. See the bundled [LICENCE](https://github.com/dsfsi/textaugment/blob/master/LICENCE) file for more details. 
303 | -------------------------------------------------------------------------------- /augment.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/dsfsi/textaugment/02c63e07f0b4dcdf95d9700722509e1512963d6a/augment.png -------------------------------------------------------------------------------- /examples/aeda_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# AEDA example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "try:\n", 17 | " from textaugment import AEDA\n", 18 | "except ModuleNotFoundError:\n", 19 | " !pip install textaugment\n", 20 | " from textaugment import AEDA" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "t = AEDA(random_state=1)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Punctuation Insertion\n", 37 | "1. Randomly select the amount of punctuation to be inserted, between 1 and 1/3 of the length of the sentence.\n", 38 | "2. Randomly select the punctuation to be inserted.\n", 39 | "3. Randomly select the position of the punctuation to be inserted.\n", 40 | "4. Insert the punctuation at the selected position." 41 | ] 42 | }, 43 | { 44 | "cell_type": "code", 45 | "execution_count": 3, 46 | "metadata": {}, 47 | "outputs": [ 48 | { 49 | "name": "stdout", 50 | "output_type": "stream", 51 | "text": [ 52 | "! 
John is going to town\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "output = t.punct_insertion(\"John is going to town\")\n", 58 | "print(output)" 59 | ] 60 | }, 61 | { 62 | "cell_type": "markdown", 63 | "metadata": {}, 64 | "source": [ 65 | "## Cite the paper\n", 66 | "```\n", 67 | "@article{marivate2019improving,\n", 68 | " title={Improving short text classification through global augmentation methods},\n", 69 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n", 70 | " journal={arXiv preprint arXiv:1907.03752},\n", 71 | " year={2019}\n", 72 | "}```\n", 73 | "\n", 74 | "https://arxiv.org/abs/1907.03752" 75 | ] 76 | } 77 | ], 78 | "metadata": { 79 | "kernelspec": { 80 | "display_name": "Python 3", 81 | "language": "python", 82 | "name": "python3" 83 | }, 84 | "language_info": { 85 | "codemirror_mode": { 86 | "name": "ipython", 87 | "version": 3 88 | }, 89 | "file_extension": ".py", 90 | "mimetype": "text/x-python", 91 | "name": "python", 92 | "nbconvert_exporter": "python", 93 | "pygments_lexer": "ipython3", 94 | "version": "3.7.7" 95 | } 96 | }, 97 | "nbformat": 4, 98 | "nbformat_minor": 4 99 | } 100 | -------------------------------------------------------------------------------- /examples/eda_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# EDA example" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "try:\n", 17 | " from textaugment import EDA\n", 18 | "except ModuleNotFoundError:\n", 19 | " !pip install textaugment\n", 20 | " from textaugment import EDA" 21 | ] 22 | }, 23 | { 24 | "cell_type": "code", 25 | "execution_count": 2, 26 | "metadata": {}, 27 | "outputs": [], 28 | "source": [ 29 | "t = EDA(random_state=1)" 30 | ] 31 | }, 32 | { 33 | "cell_type": "markdown", 34 | "metadata": {}, 35 | "source": [ 36 | "## Synonym 
Replacement\n", 37 | "Randomly choose *n* words from the sentence that are not stop words. Replace each of these words with one of its synonyms chosen at random" 38 | ] 39 | }, 40 | { 41 | "cell_type": "code", 42 | "execution_count": 3, 43 | "metadata": {}, 44 | "outputs": [ 45 | { 46 | "name": "stdout", 47 | "output_type": "stream", 48 | "text": [ 49 | "John is choke to town\n" 50 | ] 51 | } 52 | ], 53 | "source": [ 54 | "output = t.synonym_replacement(\"John is going to town\", top_n=10)\n", 55 | "print(output)" 56 | ] 57 | }, 58 | { 59 | "cell_type": "markdown", 60 | "metadata": {}, 61 | "source": [ 62 | "## Random Insertion\n", 63 | "Find a random synonym of a random word in the sentence that is not a stop word. Insert that synonym into a random position in the sentence. Do this *n* times." 64 | ] 65 | }, 66 | { 67 | "cell_type": "code", 68 | "execution_count": 4, 69 | "metadata": {}, 70 | "outputs": [ 71 | { 72 | "name": "stdout", 73 | "output_type": "stream", 74 | "text": [ 75 | "John is going to lead town\n" 76 | ] 77 | } 78 | ], 79 | "source": [ 80 | "output = t.random_insertion(\"John is going to town\")\n", 81 | "print(output)" 82 | ] 83 | }, 84 | { 85 | "cell_type": "markdown", 86 | "metadata": {}, 87 | "source": [ 88 | "## Random Swap\n", 89 | "Randomly choose two words in the sentence and swap their positions. Do this *n* times." 90 | ] 91 | }, 92 | { 93 | "cell_type": "code", 94 | "execution_count": 5, 95 | "metadata": {}, 96 | "outputs": [ 97 | { 98 | "name": "stdout", 99 | "output_type": "stream", 100 | "text": [ 101 | "John is to going town\n" 102 | ] 103 | } 104 | ], 105 | "source": [ 106 | "output = t.random_swap(\"John is going to town\")\n", 107 | "print(output)" 108 | ] 109 | }, 110 | { 111 | "cell_type": "markdown", 112 | "metadata": {}, 113 | "source": [ 114 | "## Random Deletion\n", 115 | "Randomly remove each word in the sentence with probability *p*." 
116 | ] 117 | }, 118 | { 119 | "cell_type": "code", 120 | "execution_count": 6, 121 | "metadata": {}, 122 | "outputs": [ 123 | { 124 | "name": "stdout", 125 | "output_type": "stream", 126 | "text": [ 127 | "John going to town\n" 128 | ] 129 | } 130 | ], 131 | "source": [ 132 | "output = t.random_deletion(\"John is going to town\", p=0.2)\n", 133 | "print(output)" 134 | ] 135 | }, 136 | { 137 | "cell_type": "markdown", 138 | "metadata": {}, 139 | "source": [ 140 | "## Cite the paper\n", 141 | "```\n", 142 | "@article{marivate2019improving,\n", 143 | " title={Improving short text classification through global augmentation methods},\n", 144 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n", 145 | " journal={arXiv preprint arXiv:1907.03752},\n", 146 | " year={2019}\n", 147 | "}```\n", 148 | "\n", 149 | "https://arxiv.org/abs/1907.03752" 150 | ] 151 | }, 152 | { 153 | "cell_type": "code", 154 | "execution_count": null, 155 | "metadata": {}, 156 | "outputs": [], 157 | "source": [] 158 | } 159 | ], 160 | "metadata": { 161 | "kernelspec": { 162 | "display_name": "Python 3", 163 | "language": "python", 164 | "name": "python3" 165 | }, 166 | "language_info": { 167 | "codemirror_mode": { 168 | "name": "ipython", 169 | "version": 3 170 | }, 171 | "file_extension": ".py", 172 | "mimetype": "text/x-python", 173 | "name": "python", 174 | "nbconvert_exporter": "python", 175 | "pygments_lexer": "ipython3", 176 | "version": "3.7.7" 177 | } 178 | }, 179 | "nbformat": 4, 180 | "nbformat_minor": 4 181 | } 182 | -------------------------------------------------------------------------------- /examples/fasttext_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": {}, 6 | "source": [ 7 | "# Example for using Fasttext" 8 | ] 9 | }, 10 | { 11 | "cell_type": "code", 12 | "execution_count": 1, 13 | "metadata": {}, 14 | "outputs": [], 15 | "source": [ 16 | "# Import 
libraries\n", 17 | "try:\n", 18 | " import textaugment, gensim\n", 19 | "except ModuleNotFoundError:\n", 20 | " !pip -q install textaugment gensim\n", 21 | " import textaugment, gensim" 22 | ] 23 | }, 24 | { 25 | "cell_type": "markdown", 26 | "metadata": {}, 27 | "source": [ 28 | "# Load Fasttext Embeddings \n", 29 | "\n", 30 | "Fasttext has Pre-trained word vectors on English webcrawl and Wikipedia which you can find [here](https://fasttext.cc/docs/en/english-vectors.html) as well as Pre-trained models for 157 different languages which you can find [here](https://fasttext.cc/docs/en/crawl-vectors.html)" 31 | ] 32 | }, 33 | { 34 | "cell_type": "code", 35 | "execution_count": 3, 36 | "metadata": {}, 37 | "outputs": [ 38 | { 39 | "name": "stdout", 40 | "output_type": "stream", 41 | "text": [ 42 | "--2020-09-01 10:11:28-- https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\n", 43 | "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...\n", 44 | "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.\n", 45 | "HTTP request sent, awaiting response... 
200 OK\n", 46 | "Length: 4503593528 (4.2G) [application/octet-stream]\n", 47 | "Saving to: ‘cc.en.300.bin.gz’\n", 48 | "\n", 49 | "cc.en.300.bin.gz 100%[===================>] 4.19G 4.32MB/s in 9m 57s \n", 50 | "\n", 51 | "2020-09-01 10:21:26 (7.20 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]\n", 52 | "\n" 53 | ] 54 | } 55 | ], 56 | "source": [ 57 | "# Download the FastText embeddings in the language of your choice\n", 58 | "!wget \"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\"" 59 | ] 60 | }, 61 | { 62 | "cell_type": "code", 63 | "execution_count": null, 64 | "metadata": {}, 65 | "outputs": [], 66 | "source": [ 67 | "# save path to your pre-trained model\n", 68 | "from gensim.test.utils import datapath\n", 69 | "pretrained_path = datapath('./cc.en.300.bin.gz')\n", 70 | "\n", 71 | "# load model\n", 72 | "model = gensim.models.fasttext.load_facebook_model(pretrained_path)" 73 | ] 74 | }, 75 | { 76 | "cell_type": "code", 77 | "execution_count": null, 78 | "metadata": {}, 79 | "outputs": [], 80 | "source": [ 81 | "from textaugment import Word2vec\n", 82 | "t = Word2vec(model = model)\n", 83 | "output = t.augment('The stories are good', top_n=10)" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": null, 89 | "metadata": {}, 90 | "outputs": [], 91 | "source": [ 92 | "print(output)" 93 | ] 94 | }, 95 | { 96 | "cell_type": "markdown", 97 | "metadata": {}, 98 | "source": [ 99 | "## Cite the paper\n", 100 | "```\n", 101 | "@article{marivate2019improving,\n", 102 | " title={Improving short text classification through global augmentation methods},\n", 103 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n", 104 | " journal={arXiv preprint arXiv:1907.03752},\n", 105 | " year={2019}\n", 106 | "}```\n", 107 | "\n", 108 | "https://arxiv.org/abs/1907.03752\n" 109 | ] 110 | }, 111 | { 112 | "cell_type": "code", 113 | "execution_count": null, 114 | "metadata": {}, 115 | "outputs": [], 116 | "source": [] 117 | } 118 | ], 
119 | "metadata": { 120 | "kernelspec": { 121 | "display_name": "Python 3", 122 | "language": "python", 123 | "name": "python3" 124 | }, 125 | "language_info": { 126 | "codemirror_mode": { 127 | "name": "ipython", 128 | "version": 3 129 | }, 130 | "file_extension": ".py", 131 | "mimetype": "text/x-python", 132 | "name": "python", 133 | "nbconvert_exporter": "python", 134 | "pygments_lexer": "ipython3", 135 | "version": "3.7.7" 136 | } 137 | }, 138 | "nbformat": 4, 139 | "nbformat_minor": 4 140 | } 141 | -------------------------------------------------------------------------------- /examples/mixup_example_using_IMDB_sentiment.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "kMccmZPoWd_h" 8 | }, 9 | "source": [ 10 | "# Mixup augmentation for NLP\n", 11 | "\n", 12 | "Using IMDB sentiment classification dataset" 13 | ] 14 | }, 15 | { 16 | "cell_type": "code", 17 | "execution_count": 1, 18 | "metadata": { 19 | "colab": { 20 | "base_uri": "https://localhost:8080/", 21 | "height": 527 22 | }, 23 | "colab_type": "code", 24 | "id": "YhKEHbrxWd_n", 25 | "outputId": "368747f0-47d5-439f-f4b3-d4db6d6a2d18" 26 | }, 27 | "outputs": [ 28 | { 29 | "name": "stdout", 30 | "output_type": "stream", 31 | "text": [ 32 | "Collecting textaugment\n", 33 | " Downloading https://files.pythonhosted.org/packages/d5/87/906c855827f99a65ab91b22afbfa91731bd4397b5e3ca344de571e5c7651/textaugment-1.3-py3-none-any.whl\n", 34 | "Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (from textaugment) (3.2.5)\n", 35 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from textaugment) (1.18.4)\n", 36 | "Requirement already satisfied: textblob in /usr/local/lib/python3.6/dist-packages (from textaugment) (0.15.3)\n", 37 | "Requirement already satisfied: gensim in /usr/local/lib/python3.6/dist-packages 
(from textaugment) (3.6.0)\n", 38 | "Collecting googletrans\n", 39 | " Downloading https://files.pythonhosted.org/packages/fd/f0/a22d41d3846d1f46a4f20086141e0428ccc9c6d644aacbfd30990cf46886/googletrans-2.4.0.tar.gz\n", 40 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk->textaugment) (1.12.0)\n", 41 | "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from gensim->textaugment) (1.4.1)\n", 42 | "Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.6/dist-packages (from gensim->textaugment) (2.0.0)\n", 43 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from googletrans->textaugment) (2.23.0)\n", 44 | "Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim->textaugment) (1.13.13)\n", 45 | "Requirement already satisfied: boto in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim->textaugment) (2.49.0)\n", 46 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (2.9)\n", 47 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (1.24.3)\n", 48 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (3.0.4)\n", 49 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (2020.4.5.1)\n", 50 | "Requirement already satisfied: botocore<1.17.0,>=1.16.13 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim->textaugment) (1.16.13)\n", 51 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim->textaugment) (0.10.0)\n", 52 | 
"Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim->textaugment) (0.3.3)\n", 53 | "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.17.0,>=1.16.13->boto3->smart-open>=1.2.1->gensim->textaugment) (0.15.2)\n", 54 | "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.17.0,>=1.16.13->boto3->smart-open>=1.2.1->gensim->textaugment) (2.8.1)\n", 55 | "Building wheels for collected packages: googletrans\n", 56 | " Building wheel for googletrans (setup.py) ... \u001b[?25l\u001b[?25hdone\n", 57 | " Created wheel for googletrans: filename=googletrans-2.4.0-cp36-none-any.whl size=15777 sha256=4de7ce4b52a5c57a680d9c96137d12291609a418bf5fdd1cf158003f747c7589\n", 58 | " Stored in directory: /root/.cache/pip/wheels/50/d6/e7/a8efd5f2427d5eb258070048718fa56ee5ac57fd6f53505f95\n", 59 | "Successfully built googletrans\n", 60 | "Installing collected packages: googletrans, textaugment\n", 61 | "Successfully installed googletrans-2.4.0 textaugment-1.3\n" 62 | ] 63 | } 64 | ], 65 | "source": [ 66 | "# Import libraries\n", 67 | "try:\n", 68 | " import textaugment\n", 69 | "except ModuleNotFoundError:\n", 70 | " !pip install textaugment\n", 71 | " import textaugment\n", 72 | "\n", 73 | "import pandas as pd\n", 74 | "\n", 75 | "import tensorflow as tf\n", 76 | "from tensorflow.keras.preprocessing import sequence\n", 77 | "from tensorflow.keras.models import Sequential\n", 78 | "from tensorflow.keras.layers import Dense, Dropout, Activation\n", 79 | "from tensorflow.keras.layers import Embedding\n", 80 | "from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D\n", 81 | "from tensorflow.keras.datasets import imdb\n", 82 | "\n", 83 | "from textaugment import MIXUP\n", 84 | "%matplotlib inline" 85 | ] 86 | }, 87 | { 88 | "cell_type": "code", 89 | "execution_count": 2, 90 | 
"metadata": { 91 | "colab": { 92 | "base_uri": "https://localhost:8080/", 93 | "height": 34 94 | }, 95 | "colab_type": "code", 96 | "id": "JeMsxayIWd_r", 97 | "outputId": "814596bf-e5ca-47f1-c2ce-257e761e96c4" 98 | }, 99 | "outputs": [ 100 | { 101 | "data": { 102 | "text/plain": [ 103 | "'2.2.0'" 104 | ] 105 | }, 106 | "execution_count": 2, 107 | "metadata": { 108 | "tags": [] 109 | }, 110 | "output_type": "execute_result" 111 | } 112 | ], 113 | "source": [ 114 | "tf.__version__" 115 | ] 116 | }, 117 | { 118 | "cell_type": "code", 119 | "execution_count": 3, 120 | "metadata": { 121 | "colab": { 122 | "base_uri": "https://localhost:8080/", 123 | "height": 34 124 | }, 125 | "colab_type": "code", 126 | "id": "_FbvA0uwRdEZ", 127 | "outputId": "8e912f45-8b7e-4ee7-a3ad-f342c3f090c7" 128 | }, 129 | "outputs": [ 130 | { 131 | "data": { 132 | "text/plain": [ 133 | "'1.3'" 134 | ] 135 | }, 136 | "execution_count": 3, 137 | "metadata": { 138 | "tags": [] 139 | }, 140 | "output_type": "execute_result" 141 | } 142 | ], 143 | "source": [ 144 | "textaugment.__version__" 145 | ] 146 | }, 147 | { 148 | "cell_type": "markdown", 149 | "metadata": { 150 | "colab_type": "text", 151 | "id": "Oz8O8tISRdEg" 152 | }, 153 | "source": [ 154 | "## Initialize constant variables" 155 | ] 156 | }, 157 | { 158 | "cell_type": "code", 159 | "execution_count": null, 160 | "metadata": { 161 | "colab": {}, 162 | "colab_type": "code", 163 | "id": "mg1AcYIWWd_w" 164 | }, 165 | "outputs": [], 166 | "source": [ 167 | "# set parameters:\n", 168 | "max_features = 5000\n", 169 | "maxlen = 400\n", 170 | "batch_size = 32\n", 171 | "embedding_dims = 50\n", 172 | "filters = 250\n", 173 | "kernel_size = 3\n", 174 | "hidden_dims = 250\n", 175 | "epochs = 10\n", 176 | "runs = 1" 177 | ] 178 | }, 179 | { 180 | "cell_type": "code", 181 | "execution_count": 5, 182 | "metadata": { 183 | "colab": { 184 | "base_uri": "https://localhost:8080/", 185 | "height": 153 186 | }, 187 | "colab_type": "code", 188 | "id": 
"ZRuNNVstWd_0", 189 | "outputId": "bc4ce3b2-5a12-4600-d1a8-b466615018df" 190 | }, 191 | "outputs": [ 192 | { 193 | "name": "stdout", 194 | "output_type": "stream", 195 | "text": [ 196 | "Loading data...\n", 197 | "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz\n", 198 | "17465344/17464789 [==============================] - 0s 0us/step\n", 199 | "25000 train sequences\n", 200 | "25000 test sequences\n", 201 | "Pad sequences (samples x time)\n", 202 | "x_train shape: (25000, 400)\n", 203 | "x_test shape: (25000, 400)\n" 204 | ] 205 | } 206 | ], 207 | "source": [ 208 | "print('Loading data...')\n", 209 | "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n", 210 | "print(len(x_train), 'train sequences')\n", 211 | "print(len(x_test), 'test sequences')\n", 212 | "\n", 213 | "print('Pad sequences (samples x time)')\n", 214 | "x_train = sequence.pad_sequences(x_train, maxlen=maxlen)\n", 215 | "x_test = sequence.pad_sequences(x_test, maxlen=maxlen)\n", 216 | "print('x_train shape:', x_train.shape)\n", 217 | "print('x_test shape:', x_test.shape)" 218 | ] 219 | }, 220 | { 221 | "cell_type": "markdown", 222 | "metadata": { 223 | "colab_type": "text", 224 | "id": "Tx73Y-asRdEz" 225 | }, 226 | "source": [ 227 | "## Initialize mixup" 228 | ] 229 | }, 230 | { 231 | "cell_type": "code", 232 | "execution_count": null, 233 | "metadata": { 234 | "colab": {}, 235 | "colab_type": "code", 236 | "id": "xvuxODUxRdE1" 237 | }, 238 | "outputs": [], 239 | "source": [ 240 | "mixup = MIXUP()\n", 241 | "generator, step = mixup.flow(x_train, y_train, batch_size=batch_size, runs=runs)" 242 | ] 243 | }, 244 | { 245 | "cell_type": "code", 246 | "execution_count": 7, 247 | "metadata": { 248 | "colab": { 249 | "base_uri": "https://localhost:8080/", 250 | "height": 476 251 | }, 252 | "colab_type": "code", 253 | "id": "6cm1o_fAWd_4", 254 | "outputId": "ea793754-100c-4c12-8acf-7798c096c399" 255 | }, 256 | "outputs": [ 257 | { 258 | 
"name": "stdout", 259 | "output_type": "stream", 260 | "text": [ 261 | "Build model...\n", 262 | "Model: \"sequential\"\n", 263 | "_________________________________________________________________\n", 264 | "Layer (type) Output Shape Param # \n", 265 | "=================================================================\n", 266 | "embedding (Embedding) (None, 400, 50) 250000 \n", 267 | "_________________________________________________________________\n", 268 | "dropout (Dropout) (None, 400, 50) 0 \n", 269 | "_________________________________________________________________\n", 270 | "conv1d (Conv1D) (None, 398, 250) 37750 \n", 271 | "_________________________________________________________________\n", 272 | "global_max_pooling1d (Global (None, 250) 0 \n", 273 | "_________________________________________________________________\n", 274 | "dense (Dense) (None, 250) 62750 \n", 275 | "_________________________________________________________________\n", 276 | "dropout_1 (Dropout) (None, 250) 0 \n", 277 | "_________________________________________________________________\n", 278 | "activation (Activation) (None, 250) 0 \n", 279 | "_________________________________________________________________\n", 280 | "dense_1 (Dense) (None, 1) 251 \n", 281 | "_________________________________________________________________\n", 282 | "activation_1 (Activation) (None, 1) 0 \n", 283 | "=================================================================\n", 284 | "Total params: 350,751\n", 285 | "Trainable params: 350,751\n", 286 | "Non-trainable params: 0\n", 287 | "_________________________________________________________________\n" 288 | ] 289 | } 290 | ], 291 | "source": [ 292 | "print('Build model...')\n", 293 | "model = Sequential()\n", 294 | "\n", 295 | "# we start off with an efficient embedding layer which maps\n", 296 | "# our vocab indices into embedding_dims dimensions\n", 297 | "model.add(Embedding(max_features,\n", 298 | " embedding_dims,\n", 299 | " 
input_length=maxlen))\n", 300 | "model.add(Dropout(0.2))\n", 301 | "\n", 302 | "# we add a Convolution1D, which will learn filters\n", 303 | "# word group filters of size filter_length:\n", 304 | "model.add(Conv1D(filters,\n", 305 | " kernel_size,\n", 306 | " padding='valid',\n", 307 | " activation='relu',\n", 308 | " strides=1))\n", 309 | "# we use max pooling:\n", 310 | "model.add(GlobalMaxPooling1D())\n", 311 | "\n", 312 | "# We add a vanilla hidden layer:\n", 313 | "model.add(Dense(hidden_dims))\n", 314 | "model.add(Dropout(0.2))\n", 315 | "model.add(Activation('relu'))\n", 316 | "\n", 317 | "# We project onto a single unit output layer, and squash it with a sigmoid:\n", 318 | "model.add(Dense(1))\n", 319 | "model.add(Activation('sigmoid'))\n", 320 | "\n", 321 | "model.compile(loss='binary_crossentropy',\n", 322 | " optimizer='adam',\n", 323 | " metrics=['accuracy'])\n", 324 | "model.summary()" 325 | ] 326 | }, 327 | { 328 | "cell_type": "markdown", 329 | "metadata": { 330 | "colab_type": "text", 331 | "id": "b5zRyuq8UKmR" 332 | }, 333 | "source": [ 334 | "## Train model using mixup augmentation" 335 | ] 336 | }, 337 | { 338 | "cell_type": "code", 339 | "execution_count": 8, 340 | "metadata": { 341 | "colab": { 342 | "base_uri": "https://localhost:8080/", 343 | "height": 357 344 | }, 345 | "colab_type": "code", 346 | "id": "oGLSfzcUWeAB", 347 | "outputId": "81464964-8fd3-4249-b901-0e05cb664436" 348 | }, 349 | "outputs": [ 350 | { 351 | "name": "stdout", 352 | "output_type": "stream", 353 | "text": [ 354 | "Epoch 1/10\n", 355 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6867 - accuracy: 0.2859 - val_loss: 0.6408 - val_accuracy: 0.6537\n", 356 | "Epoch 2/10\n", 357 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6655 - accuracy: 0.3081 - val_loss: 0.6140 - val_accuracy: 0.6620\n", 358 | "Epoch 3/10\n", 359 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6443 - accuracy: 0.3267 - val_loss: 0.5688 - 
val_accuracy: 0.7233\n", 360 | "Epoch 4/10\n", 361 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6250 - accuracy: 0.3287 - val_loss: 0.5167 - val_accuracy: 0.7434\n", 362 | "Epoch 5/10\n", 363 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6140 - accuracy: 0.3337 - val_loss: 0.5154 - val_accuracy: 0.7534\n", 364 | "Epoch 6/10\n", 365 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6029 - accuracy: 0.3338 - val_loss: 0.4763 - val_accuracy: 0.7765\n", 366 | "Epoch 7/10\n", 367 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5976 - accuracy: 0.3314 - val_loss: 0.4659 - val_accuracy: 0.7810\n", 368 | "Epoch 8/10\n", 369 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5857 - accuracy: 0.3423 - val_loss: 0.4551 - val_accuracy: 0.7873\n", 370 | "Epoch 9/10\n", 371 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5800 - accuracy: 0.3488 - val_loss: 0.4502 - val_accuracy: 0.7927\n", 372 | "Epoch 10/10\n", 373 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5793 - accuracy: 0.3402 - val_loss: 0.4653 - val_accuracy: 0.7927\n" 374 | ] 375 | } 376 | ], 377 | "source": [ 378 | "h1 = model.fit(generator, steps_per_epoch=step,\n", 379 | " epochs=epochs,\n", 380 | " validation_data=(x_test, y_test))" 381 | ] 382 | }, 383 | { 384 | "cell_type": "code", 385 | "execution_count": 9, 386 | "metadata": { 387 | "colab": { 388 | "base_uri": "https://localhost:8080/", 389 | "height": 298 390 | }, 391 | "colab_type": "code", 392 | "id": "XKrXdkt8XeYo", 393 | "outputId": "0d463439-1718-4f90-bc24-b32f6dae7eda" 394 | }, 395 | "outputs": [ 396 | { 397 | "data": { 398 | "text/plain": [ 399 | "" 400 | ] 401 | }, 402 | "execution_count": 9, 403 | "metadata": { 404 | "tags": [] 405 | }, 406 | "output_type": "execute_result" 407 | }, 408 | { 409 | "data": { 410 | "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deXxU1f3/8dcnCwmQBLISIIEkEAgIKBJQhITFDa2CK4gruH0VFbditS5Vq21/2lqtRa1FcUMBcQNBESsQQEUCguxbCJAAZgECCWSd8/vjTmCIARKY5E5mPs/HYx7J3GXmk1He98y5554rxhiUUkp5Lz+7C1BKKdWwNOiVUsrLadArpZSX06BXSikvp0GvlFJeToNeKaW8nAa98joikiYiG0+wPkFEjIgENND7rxWRwQ3x2kqdCg165fFE5DER+arGss3HWXadMWaRMaary/JsEbmgseo1xpxhjFnQWO+n1Mlo0KumIAM4T0T8AUSkLRAI9K6xrLNzW6WUCw161RQswwr2s5zP04D5wMYay7YaY3aJyGARyQEQkfeBDsAsESkWkUdcXvcGEdkhIgUi8vjx3lxE3hGR10TkK+drLBGRWBF5WUT2icgGEentsv2RbxAiMkdE/uGybqqIvO38/WkR+cBl3TFdSiKyQET+KiI/icgBEflCRCJO8TNUPkyDXnk8Y0w5sBRIdy5KBxYBi2ss+01r3hhzE7ADuNwYE2KMecFl9UCgK3A+8JSIdDtBGSOBJ4AooAz4AVjhfD4DeOk4+90K3CQiQ0XkBqAfcP8J/+Bj3ex8jbZAJfCveuyrFKBBr5qOhRwN9TSsoF9UY9nCer7mM8aYw8aYVcAq4MwTbPuZMWa5MaYU+AwoNca8Z4ypAqYBvWvbyRizB7gbeBd4BbjZGHOwHjW+b4xZY4wpAZ4ERlZ3VylVVxr0qqnIAAY6uy6ijTGbge+x+u4jgB7Uv39+j8vvh4CQE2z7q8vvh2t5fqJ9ZwH+wEZjzOJ61rjT5fftWF1YUfV8DeXjNOhVU/ED0Aq4A1gCYIw5AOxyLttljNl2nH3tnqL1eWA90FZERrssLwFauDyPrWXfeJffOwAVQIHbK1ReTYNeNQnGmMNAJvAQVpdNtcXOZSdqzf8KJDVcdccnIunAWKy+9luAV0WkvXP1SiBdRDqISCvgsVpe4kYR6S4iLYBngRnO7iKl6kyDXjUlC4EYrHCvtsi57ERB/1fgCRHZLyK/b8D6jiEiYcB7wL3GmFxjzCLgLWCyiIgxZh5W//4vwHLgy1pe5n3gHaxupmBgfGPUrryL6I1HlPJMIrIA+MAYM8nuWlTTpi16pZTychr0Sinl5bTrRimlvJy26JVSyss1yDStpyMqKsokJCTYXYZSSjUpy5cvLzDGRNe2zuOCPiEhgczMTLvLUEqpJkVEth9vnXbdKKWUl9OgV0opL6dBr5RSXs7j+uiVUr6poqKCnJwcSktL7S7FowUHBxMXF0dgYGCd99GgV0p5hJycHEJDQ0lISEBE7C7HIxljKCwsJCcnh8TExDrvp103SimPUFpaSmRkpIb8CYgIkZGR9f7Wo0GvlPIYGvIndyqfkdcEvTGGv8xZz4Y9B+wuRSmlPIrXBH124SE++mkHl7yyiPun/kx2QYndJSmlmpiQkBPdEbLp8pqgT4xqyeJHhnL3oE58s/ZXLnhpIX/8bDV7ivQMvlLKt3lN0AO0ahHII8NSWPjIYG44pwMfZ+5k0IvzeX72OvaWlNtdnlKqiTDGMGHCBHr06EHPnj2ZNm0aALt37yY9PZ2zzjqLHj16sGjRIqqqqhgzZsyRbf/5z3/aXP1veeXwypjQYJ4Z0YPb05J4+dvNvLV4Gx/9tJPb0xK5bWAiocF1H3+qlGp8z8xay7pd7j3f1r1dGH+6/Iw6bfvpp5+ycuVKVq1aRUFBAX379iU9PZ0PP/yQiy++mMcff5yqqioOHTrEypUryc3
NZc2aNQDs37/frXW7g1e16GuKj2jBP0aeydwH0hnYOYqXv91M+gvzmbQoi9IKvb+yUqp2ixcvZvTo0fj7+9OmTRsGDRrEsmXL6Nu3L5MnT+bpp59m9erVhIaGkpSURFZWFvfddx9ff/01YWFhdpf/G17Zoq8puU0ob9zUh1U79/P3bzby3Oz1TFq0jfsvSOaaPnEE+nv18U6pJqeuLe/Glp6eTkZGBrNnz2bMmDE89NBD3HzzzaxatYq5c+fyxhtvMH36dN5++227Sz2GTyXcmfGtef+2c/jojnNp1zqYxz5dzYUvLeSLlbk4HHqnLaWUJS0tjWnTplFVVUV+fj4ZGRn069eP7du306ZNG+644w5uv/12VqxYQUFBAQ6Hg6uvvprnnnuOFStW2F3+b/hEi76m/p0i+eTu8/huQx4vzt3I/VNX8vqCrUy4uCtDU2L0og2lfNyVV17JDz/8wJlnnomI8MILLxAbG8u7777Liy++SGBgICEhIbz33nvk5uYyduxYHA4HAH/9619trv63PO6esampqaYxbzzicBhm/bKLf87bRHbhIc7u0JoJF6fQv1Nko9WglIL169fTrVs3u8toEmr7rERkuTEmtbbtfarrpjZ+fsKIs9oz76FB/OXKnuzaX8ro//7ITW8t5Zcczzt7rpRS9eXzQV8t0N+P68/pwIIJg3n80m6syS1i+L+XcNf7y9n860G7y1NKqVPmk330JxIc6M8d6Ulc1y+etxZvY9KibXyzbg9X9o7jgQuSiY9oYXeJSilVL9qiP47Q4EAeuKALGY8M4baBicz6ZRdD/7GAp75YQ95BnVZBKdV0aNCfRETLZjz+u+4snDCYa1PjmbJ0B+kvzOf/fb2BokMVdpenlFInpUFfR21bNecvV/bkfw8N4uIzYnlj4VYGvvAdE+dvoaSs0u7ylFLquDTo6ykhqiWvXNebOePTOCcxkhfnbmTQi/OZvGQbZZU6rYJSyvNo0J+ibm3DmHRLKp/cfR6dY0J4ZtY6hv59IdMzd1JZ5bC7PKVUAzvR3PXZ2dn06NGjEas5MQ3609SnYzgf3XEu79/Wj8iQZjwy4xcufjmDOat367QKSimPoMMr3UBESEuOZmDnKOau/ZW/f7ORcVNW0KN9GL+/qCuDukTrtApK1cdXj8Ke1e59zdiecMnfjrv60UcfJT4+nnvuuQeAp59+moCAAObPn8++ffuoqKjgueeeY8SIEfV629LSUu6++24yMzMJCAjgpZdeYsiQIaxdu5axY8dSXl6Ow+Hgk08+oV27dowcOZKcnByqqqp48sknGTVq1Gn92aBB71YiwrAesVzYvQ2f/5zLP7/dxJjJy+iXEMGEYV3pmxBhd4lKqeMYNWoUDzzwwJGgnz59OnPnzmX8+PGEhYVRUFDAueeey/Dhw+vVcJs4cSIiwurVq9mwYQMXXXQRmzZt4o033uD+++/nhhtuoLy8nKqqKubMmUO7du2YPXs2AEVFRW752zToG4C/n3B1nzguP7Md05bt4F/fbeHaN35gcNdofn9RV3q0b2V3iUp5thO0vBtK7969ycvLY9euXeTn5xMeHk5sbCwPPvggGRkZ+Pn5kZuby6+//kpsbGydX3fx4sXcd999AKSkpNCxY0c2bdpE//79ef7558nJyeGqq64iOTmZnj178vDDD/OHP/yByy67jLS0NLf8bdpH34CaBfhxU/8EMiYM4dFLUvh5x34ue3Ux90xZwZa8YrvLU0rVcO211zJjxgymTZvGqFGjmDJlCvn5+SxfvpyVK1fSpk0bSkvdc8Hk9ddfz8yZM2nevDmXXnop3333HV26dGHFihX07NmTJ554gmeffdYt76Ut+kbQvJk/dw3qxOh+HXhrURaTFm/jqzW7uaZPHOPPTyYuXKdVUMoTjBo1ijvuuIOCggIWLlzI9OnTiYmJITAwkPnz57N9+/Z6v2ZaWhpTpkxh6NChbNq0iR07dtC1a1eysrJISkpi/Pjx7Ni
xg19++YWUlBQiIiK48cYbad26NZMmTXLL36VB34haNQ/koYu6cvN5Cby+YCvv/7idz3/exfXndOCeIZ2JDg2yu0SlfNoZZ5zBwYMHad++PW3btuWGG27g8ssvp2fPnqSmppKSklLv1xw3bhx33303PXv2JCAggHfeeYegoCCmT5/O+++/T2BgILGxsfzxj39k2bJlTJgwAT8/PwIDA3n99dfd8nf5/Hz0dtq1/zCvfreZ6Zk5NPP349aBCdyZ1olWLfTm5cr36Hz0dafz0Tch7Vo3569X9WLeg+lc0L0NE+dvJc05rcKhcp1WQSnlHtp14wGSokN4dXRv7hqUxEvfbOLFuRuZvCSbe4d0YvQ5HQgK8Le7RKVULVavXs1NN910zLKgoCCWLl1qU0W106D3IGe0a8VbY/qyfPteXvh6I0/PWsd/F23jgQuSubJ3ewL89QuY8m7GmCZ1cWHPnj1ZuXJlo77nqXS31yk5RGSYiGwUkS0i8uhxthkpIutEZK2IfOiyvEpEVjofM+tdoQ/q0zGCqXcenVZhgk6roHxAcHAwhYWFpxRkvsIYQ2FhIcHBwfXa76QnY0XEH9gEXAjkAMuA0caYdS7bJAPTgaHGmH0iEmOMyXOuKzbGHH/2nxp86WRsXRhjmLt2D3//ZhNb8op1WgXltSoqKsjJyXHbOHVvFRwcTFxcHIGBxw7aONHJ2Lp03fQDthhjspwvNhUYAaxz2eYOYKIxZh9Adcir02dNq9CWC7vH6rQKyqsFBgaSmJhodxleqS5dN+2BnS7Pc5zLXHUBuojIEhH5UUSGuawLFpFM5/IransDEbnTuU1mfn5+vf4AX1E9rcJ3Dw/mzyPOYFthCde+8QNjJv/Emlz3zIehlPJO7jq7FwAkA4OB0cB/RaS1c11H59eJ64GXRaRTzZ2NMW8aY1KNManR0dFuKsk76bQKSqn6qkvQ5wLxLs/jnMtc5QAzjTEVxphtWH36yQDGmFznzyxgAdD7NGtWHJ1WIeORIYwf2pn5G/O46J8LeWTGKnL2HbK7PKWUB6lL0C8DkkUkUUSaAdcBNUfPfI7VmkdEorC6crJEJFxEglyWD+DYvn11mqqnVch4ZAhjByTy+cpdDP37Qp6euZb8g2V2l6eU8gAnDXpjTCVwLzAXWA9MN8asFZFnRWS4c7O5QKGIrAPmAxOMMYVANyBTRFY5l//NdbSOcp+okCCevKw7C34/mKvObs/7P24n7YXveGbWWnbtP2x3eUopG+lcN14qK7+YifO38vnKXPwEruodx/8NSiIpus4jXZVSTciJhld6V9DvXgWxvUDHlx+xc+8h/rsoi2nLdlJe5eDSnm0ZN7gTZ7TTm58o5U18I+gLtsBr50BCGgz/F7Tu4P7imrD8g2W8vWQb7/+wneKySoZ0jWbckM46Dl8pL+EbQe9wwPLJMO8p6/mFz0KfseCn88O4Kjpcwfs/ZPP2kmz2lpTTLyGCcUM66ZW2SjVxvhH01fbvgJn3QdYCSEyH4a9CeIK7yvMah8urmLpsB29mZLG7qJQz2oUxbnBnhvWIxd9PA1+ppsa3gh7AGFjxLsx9AowDLnwGUm/T1n0tyisdfP5zLm8s3EpWQQlJUS25a1AnrujdnmYB+nkp1VT4XtBX278TZo2Hrd9Bx4Ew4lWISHLPa3uZKofh6zV7mDh/C+t2H6Btq2DuTE/iur4daN5M58NXytP5btCD1br/+QOY+0dwVMIFT0PfO7R1fxzGGBZuyue1+Vv5KXsvES2bceuABG7qn0Cr5nqLQ6U8lW8HfbWiXJh1P2yZBx3OgxH/hsjfTLujXCzL3str87cwf2M+IUEB3NS/I7cOSNSbmCvlgTToqxkDKz+Erx+DqnI4/yk45//AT7smTmTtriJeW7CVOat308zfj1F947kzPYm48BZ2l6aUctKgr+nALpj1AGyeC/HnwoiJENW5Yd/TC2TlF/OfhVl8+nMOxsCIs9pz9+AkOseE2l2aUj5
Pg742xsAv0+CrR6CyDIY+AeeO09Z9HewuOsx/M7bx4U/bKat0cHH3WMYN6USvuNYn31kp1SA06E/k4B748kHYOAfi+sKI1yC6S+O9fxNWWFzGO99n8+732RworSQtOYpxgztzblKEXnylVCPToD8ZY2D1DPhqApQfgqGPQ/97tXVfRwdLK5iydAeTFm2joLiMszu05p4hnRmaEqOBr1Qj0aCvq4O/wuyHYMOX0L6P1bqPSbGnliaotKKKjzN38p+MLHL2HSYlNpS7B3fidz3bEuCvw1mVakga9PVhDKz5BOZMgPJiGPwYnDce/OtyH3UFUFHlYNaqXby2YCtb8oqJCQ0iLTma9C5RDOgcRVSIDs9Uyt006E9FcR7MfhjWz4R2va3WfZvudlfVpDgchnnrf2Xmql0s2VLA/kMVAHRvG0ZalyjSOkeTmhBOcKB2kSl1ujToT8faz6zALz0Ag/8AAx4Af71CtL6qHIa1u4pYtLmARZvzWb59HxVVhqAAP85JiiStcxRpXaLo2iZU+/WVOgUa9KerpADm/N4K/dhecMXrENvD7qqatJKySpZuK3QGfwFb8ooBiA4NOhL6AzpHERMabHOlSjUNGvTusu4Lq3V/eD+kT4C0h7R17ya7iw4fCf0lWwrYW1IOQEpsKOldohnYOYp+iRHazaPUcWjQu1NJoXWR1ZoZENvT6rtv28vuqryKw2FYt/sAGZvzWby5gMzsfZRXOWgW4Mc5iREM7BxFWnI03dpqN49S1TToG8L6WfDlQ3B4L6Q9DGm/h4BmdlfllQ6VV7J0214WO/v3N/1qdfNEhQQxsHMkacnRpCVHEROm3TzKd2nQN5RDe+HrR62pFNr0sObMaXeW3VV5vT1FpSzeYoX+4s0FFDq7ebq2CSUtOYq0LtH0S4jQefSVT9Ggb2gb5sCXD1gnbdMesvrvA3SseGNwOAzr9xxg0eYCFm8u4KfsvZRXOmjm70ffxHDSkq3+/e5tw/DTWyQqL6ZB3xgO7bVubrLqI4jpDtdPh9bxdlflcw6XV/FT9l4Wb85n0eYCNuw5CEBky2YMTLb69tO1m0d5IQ36xrRpLnxyO4S2hVu/hhYRdlfk0/IOWN08GZvyWbylgILiY0fzpCVH0TdBR/Oopk+DvrFtWwQfXAXtzoabP4fA5nZXpDi2mydjU/6R0TzVF22lJ0eR3iWa5JgQHc2jmhwNejus+RRm3Aopv4OR7+lMmB7oUHklS7P2krE5n4xN+WzNLwEgNiyYNGfoD+wcRXhLHU2lPJ8GvV1+fAO+/gOk3gq/ewm0lejRcvcfZtEmq29/8ZYCig5XIAI927ci3TmE8+yO4QTqTJzKA2nQ22neU7DkFRjyBAyaYHc1qo6qHIZfcvaTsckaxvnzzv1UOQwhQQGcmxTJoC5Wi79jZEu7S1UK0KC3l8MBn99ljbUf/m84+ya7K1KnoOhwBT9sLTzSzZOz7zAAHSJakN7FGs3Tv1MkYcE6JYayhwa93SrL4aNRkLUQRn8EXS62uyJ1GowxZBceYpEz9H/YWkhJeRX+fsLZHVo7596Ppmf7Vvjr2H3VSDToPUHZQXjnMsjfCGO+hLha/3uoJqi80sGKHfucwV/Aml1FGAOtWwQyoHMU6c7x++1a6+gr1XA06D1FcR68daE1t/1t8yCqs90VqQZQWFzmnKLBGsaZd7AMgM4xIaQnR9O9XRjx4c2Jj2hBm7BgbfUrtzjtoBeRYcArgD8wyRjzt1q2GQk8DRhglTHmeufyW4AnnJs9Z4x590Tv5dVBD1C41Qr7Zi3htm8htI3dFakGZIxh06/FZGzKJ2NzPj9t20tZpePI+kB/IS68BXHO4I8Pb0F8RHM6OH9v3SJQx/SrOjmtoBcRf2ATcCGQAywDRhtj1rlskwxMB4YaY/aJSIwxJk9EIoBMIBXrALAc6GOM2Xe89/P6oAfIXW5140R2hjGzITjM7opUIymvdJC7/zA79x5i575D7Nx72PnTeux
z3m6xWkhQwJGDgBX+zgOC80CgE7epaicK+rrc8bofsMUYk+V8sanACGCdyzZ3ABOrA9wYk+dcfjEwzxiz17nvPGAY8NGp/CFeo30f6yKqD0fBtBvhhhk6xbGPaBbgR2JUSxKjah+WebC0gpx9h9nhDP6cfdZBYXthCYs251Na4Thm+6iQIOIjmh/5JhAf7jwgRLSgbatgAnTMv6JuQd8e2OnyPAc4p8Y2XQBEZAlW987Txpivj7Nv+5pvICJ3AncCdOjQoa61N23JF8LwV+GLcdbjyjfBT/9R+rrQ4EC6tQ2kW9vffsszxlBQXH7kG0DOvsPsKLS+Gfy8cx+zV++mynH0G7q/n9C2VbBL+FvfBuLCW9AxsgVRITrDqq+oS9DX9XWSgcFAHJAhIj3rurMx5k3gTbC6btxUk+frfQMc3A3f/RlCY+Gi5+yuSHkwESE6NIjo0CDO7hD+m/WVVQ52F5X+pltox95D/G9DHgXFZcdsnxwTwtCUGIakxNBHr/j1anUJ+lzAdb7dOOcyVznAUmNMBbBNRDZhBX8uVvi77rvgVIv1SmkPw8E98P2r1oyX/e+xuyLVRAX4+x3pv6/N4fIqcvZZB4EtecVkbCrg7SXb+E9GFmHBAaR3iWZoSgyDu8YQofP7eJW6nIwNwDoZez5WcC8DrjfGrHXZZhjWCdpbRCQK+Bk4i6MnYM92broC62Ts3uO9n0+cjK3JUQUfj4H1M+Hqt6DnNXZXpHxEcVklizfn892GPOZvzCf/YBki0Du+9ZHWfve2YTrypwlwx/DKS4GXsfrf3zbGPC8izwKZxpiZYv1f8A+sE61VwPPGmKnOfW8F/uh8qeeNMZNP9F4+GfQAFaXw/pWQswxu/ASSBtldkfIxDodhza4iK/Q35LEqpwiwZvMckhLD0JQYBnSOpEUzd/X4KnfSC6aaisP74O1LoCgHbv0KYut8mkMpt8s7WMqCjfnM35DHos0FFJdV0izAj3OTIjnfGfzH6yZSjU+DvikpyoG3LrK6c277BsI72l2RUpRXOliWvfdIaz+rwJq7v7PzhO5QPaFrOw36piZvPbx9MbSMscJeb0eoPMy2gpIjob90WyEVVYbQ6hO6XWMY3DWaSB2+2ag06Jui7d/De1dA215w80xopl+RlWc63gnds+JbM7SrdUL3jHZ6QrehadA3VetmwvSbocswGPUB+OtJMOXZTnxCN5ohXWMYmBylJ3QbgAZ9U/bTf2HO7+HsW+DyV/R2hKpJqfWErr8f53aKZGjX6CN36dIZPE/f6c51o+zU7w7r6tlF/4CwdjD4UbsrUqrOYkKDGZkaz8jU+N+c0H16ljVdVjN/P+IjmpMY1ZKOkS1JiGxBQlRLEiJb0q51cz0IuIG26JsCY+CLe2DlFKtV32eM3RUpddq2FZSwNKuQbYUlbC84RHZhCdmFJcdM3BboL8RHtCAx0nkQiGpBQmT1QUAnbXOlLfqmTsQK+OI8+PJBazROyqV2V6XUaaltFk9jDL8eKCO7sITthSVsKzjk/FnC91sLOVxRdWTbQH8hPtxq/XeMdB4AoqxvBO1bN9eDgAtt0TclZcXw7uXW8MtbZkJ8P7srUqrRGGPIO1hGdkEJ2wsPWd8EXA4Gh8qPHgQC/KxvAkcOAC7dQXHh3nkQ0JOx3qSkwLpD1eF9cOs3EN3F7oqUsp0xhvyDZWQXOruAqg8GBdbBoKTGQSAuvDkdI1s6zwtYc/cH+vvh7ycE+PkR4C8E+An+fuKy/LfPA4753drXT7BlKKkGvbfZu80K+4Dm1gVVYW3trkgpj2WMIb+4jO2Fh8guKHGeC3D+XnDsQcBdAv2l1oNG9fMjBweX54F+fnSJDeG5K05t6hPto/c2EYlww8fW7QinXANj50BwK7urUsojiQgxocHEhAbTN+HYq8yrb+aSd7CUKoehospQ5TBUOhxUHvndUOVwuKwzVFY5nMsNFVUOl+XWtkfXHX1eWeXyWg5DlfN5pcO5f5W
hodrdGvRNVbveztsRjoSpN1gzXgboJedK1YfrzVy8mfedkfAlnc+HEa9B9iL47C5wOE6+j1LK52iLvqk7c5R1QdW3f7LuUDXsL3ZXpJTyMBr03mDA/VbY/zjROjF73n12V6SU8iAa9N5ABC7+KxT/Ct88ASFtoNdIu6tSSnkIDXpv4ecHV/7HGmf/+ThoGQ2dhthdlVLKA+jJWG8SEATXTYGoLjDtRti9yu6KlFIeQIPe2wS3ghtnQPNwmHItHNhld0VKKZtp0HujsHbWBVVlxfDxGKgst7sipZSNNOi9VUw3GPFv2LkU5j1pdzVKKRtp0HuzHlfBueNg6Ruweobd1SilbKJB7+0ufBY69IeZ91nTGyulfI4GvbfzD4Rr34FmIdZInNIDdleklGpkGvS+IDTWCvu92+CLcTTYFHlKKY+kQe8rEgbAhc/A+lnw/at2V6OUakQa9L6k/73QfQR8+zRkL7a7GqVUI9Gg9yUiMGIiRCTBx2PhwG67K1JKNQINel8TFAqjPoDyEutiqqoKuytSSjUwDXpfFJMCI16FnT/CN3oxlVLeToPeV/W4Gs65G5a+rhdTKeXl6hT0IjJMRDaKyBYRebSW9WNEJF9EVjoft7usq3JZPtOdxavTdNGfIf5cmDke8jbYXY1SqoGcNOhFxB+YCFwCdAdGi0j3WjadZow5y/mY5LL8sMvy4e4pW7mFfyBcOxmatdCLqZTyYnVp0fcDthhjsowx5cBUYETDlqUaTVg7uGYy7M2CL+7Ri6mU8kJ1Cfr2wE6X5znOZTVdLSK/iMgMEYl3WR4sIpki8qOIXFHbG4jInc5tMvPz8+tevXKPxDS44E+wfib88G+7q1FKuZm7TsbOAhKMMb2AecC7Lus6GmNSgeuBl0WkU82djTFvGmNSjTGp0dHRbipJ1ct546Hb5TDvT5C9xO5qlFJuVJegzwVcW+hxzmVHGGMKjTFlzqeTgD4u63KdP7OABUDv06hXNRQRGPEaRCRa4+v1YiqlvEZdgn4ZkCwiiSLSDLgOOGb0jIi0dXk6HFjvXB4uIkHO36OAAcA6dxSuGkBwmPNiqmK9mEopL3LSoDfGVAL3AnOxAny6MWatiDwrItWjaPrbDvkAAA+rSURBVMaLyFoRWQWMB8Y4l3cDMp3L5wN/M8Zo0HuymG4w3Hkx1byn7K5GKeUGYjxslEVqaqrJzMy0uww15xH46T/WiJweV9ldjVLqJERkufN86G/olbGqdhc9B3H94It79WIqpZo4DXpVu4BmMPJd62Kq6TdB2UG7K1JKnSINenV8Ye3gmrehcIvVsvewbj6lVN1o0KsTS0yH8/8E6z6HH1+zuxql1CnQoFcnN+B+SLnMmtJ4+/d2V6OUqicNenVyInDFaxCeYI2vP7jH7oqUUvWgQa/qJriVdTFV2UHrNoR6MZVSTYYGvaq7Nt3h8n/Bju+tG4wrpZoEDXpVP72uhX53WrNcrv3M7mqUUnWgQa/q76Lnj15Mlb/R7mqUUiehQa/qL6AZXPsOBATDtJugrNjuipRSJ6BBr05Nq/bOi6k2w0y9mEopT6ZBr05d0iA4/ymrr/7H1+2uRil1HBr06vQMeMC6mGrek7D9B7urUUrVQoNenZ7qi6lad3ReTPWr3RUppWrQoFenL7gVjHofSotghl5MpZSn0aBX7tHmDBj+L9i+RC+mUsrDaNAr9+k1Evre7ryY6nO7q1FKOWnQK/e6+C/QPhW+uAfyN9ldjVIKDXrlbgFB1p2pAoJg2o16MZVSHkCDXrlfqziXi6nu04uplLJZgN0FKC+VNBiGPgH/exZaRkNsT2vKhICgoz8Dmx/7PKDGcxG7/wqlvIIGvWo4Ax6EPavhp/+c2v7+QRAYXOMAUfN5zQPGcQ4agc2hQ39oHe/ev1GpJkCDXjUcPz+4ZjJc8iJUlkJlGVQedv4sPbqswnWZ67rSY5fX3O5Q4fG3M1W/rad5BNz8ObQ9s/E/C6VspEGvGpYIhEQ
3/vtWVR57ADi4Bz6+Bd69HG78FOJSG78mpWyiJ2OVd/IPgKAQaBlpzbQZ1wfGfmW16t+7QuflUT5Fg175jtbxVtiHtYUProKsBXZXpFSj0KBXviWsLYyZDeGJMGUkbJ5nd0VKNTgNeuV7QmJgzJcQkwIfjYYNs+2uSKkGpUGvfFOLCLh5pjUCZ/rNsOZTuytSqsFo0Cvf1by1Ndwyrh98chusmmp3RUo1CA165duCQuHGGZCYDp/dBcvfsbsipdxOg16pZi1h9DRIvhBm3Q9L37S7IqXcqk5BLyLDRGSjiGwRkUdrWT9GRPJFZKXzcbvLultEZLPzcYs7i1fKbQKDYdQH1v1vv5oAS16xuyKl3OakV8aKiD8wEbgQyAGWichMY8y6GptOM8bcW2PfCOBPQCpggOXOffe5pXql3CkgCK59Bz77P5j3lHVV7aBH7K5KqdNWlykQ+gFbjDFZACIyFRgB1Az62lwMzDPG7HXuOw8YBnx0auUq1cD8A+Gq/1oTqs1/3po+YeiTOpOmatLq0nXTHtjp8jzHuaymq0XkFxGZISLVUwTWaV8RuVNEMkUkMz8/v46lK9VA/PxhxEToMwYW/QPmPq5z6qsmzV0nY2cBCcaYXsA84N367GyMedMYk2qMSY2OtmECLKVq8vODy16Gc+6CHyfC7IfB4bC7KqVOSV2CPhdwncQ7zrnsCGNMoTGmzPl0EtCnrvsq5bFEYNjfYMD9kPkWzLoPHLVMf6yUh6tL0C8DkkUkUUSaAdcBM103EJG2Lk+HA+udv88FLhKRcBEJBy5yLlOqaRCBC56BQY/Czx9YJ2qrKu2uSql6OenJWGNMpYjcixXQ/sDbxpi1IvIskGmMmQmMF5HhQCWwFxjj3HeviPwZ62AB8Gz1iVmlmgwRGPKYNSrnf89Yo3GufgsCmtldmVJ1IsbDTjKlpqaazMxMu8tQqnY/vAZzH4Muw+Dad63x90p5ABFZboyp9Y46emWsUvXRfxz87h+w6WuYOhrKD9ldkVInpUGvVH31vd0afrl1Pnw4EsqK7a5IqRPSoFfqVPS+0bqwavv31t2qSovsrkip49KgV+pU9boWrp0MuSvgvRFwSMcZKM+kQa/U6eg+wpoM7de18O5wKCmwuyKlfkODXqnT1XUYjJ4KhVtg8qVwcI/dFSl1DA16pdyh8/nWDUyKcmDyJdZPpTyEBr1S7pIwEG76zOq+mXwJ7Mu2uyKlAA16pdyrwzlw8xdQesDqxinYYndFSmnQK+V27c+GMV9aUyW8cynkbbC7IuXjNOiVagixPWHMbECssN+z2u6KlA/ToFeqocSkwNg5EBAM71wGucvtrkj5KA16pRpSZCcr7INbwXtXwI4f7a5I+SANeqUaWngCjP0KWkbD+1dBxouwc5nOa68aTV1uDq6UOl2t2lst++m3wHfPAc9Bs1Do2B8S0iAx3erX9/O3u1LlhTTolWosobFw21wozofsRdZjWwZs/sZaH9zKCv2ENEhMg+hu1r1rlTpNGvRKNbaQaOhxlfUAOLALshdbob8tAzZ8aS1vEWVdhJWYBomDILKzdbcrpepJg14pu4W1g14jrQfA/h2wzdnaz14E6z63lofEOkM/3Wr1hydo8Ks60VsJKuXJjIG9WUdDf9siKMmz1rXqYAV/dVdPqzh7a1W2OtGtBLVFr5QnE7GGaEZ2gtSxVvDnbzzav79xDqycYm0bkXT0xG5CGoS2sbd25TG0Ra9UU+ZwQN7ao10925dA2QFrXVRXK/SrW/0tIuytVTWoE7XoNeiV8iaOKti96mhXz/YfoKLEWtem59E+/o7nWaN8lNfQoFfKV1VVWLc6zM6wWv07l0JlKYgftDsbkgZbj/h+EBBkb62+rKoCtnxr3Wi+17Wn9BIa9EopS0Up5GZaLf6sBZCTCaYKAltYrfykIVbwtzlDR/Q0hj2rYeVHsHo6lORDbC+4a9EpvZQGvVKqdqUHrDH8WQsgaz4UbLKWt4w+2tp
PGmJd2avcozgfVn8Mqz60gt4v0Lod5Vk3QOcLwD/wlF5WR90opWoXHAYpl1oPgKJcZ+g7H6s/tpZHJkMnZ2s/YaD279dXZTlsngsrP7SuhHZUQrvecMmL0POaBj9Rri16pVTtjIG8dVbgb51vjeipOATiD+37WKHfaQi0T4WAZjYX64GMgd0rrXBfPQMO77Uueus1Es66HmK6ufXttOtGKXX6Kssh56ejwb9rBRgHBLa0WvlJg63gj07x7f79g3vgl+lWwOevB/8gSPmdFe5JQ8C/YTpSNOiVUu53eL+zf3++Ff6FzvvjhrQ52refNBjC2tpWYqOpKLUuXlv1kTV6xjggrq8V7mdcCc3DG7wE7aNXSrlf89bQ7TLrAbB/59G+/S3/g1+mWcujU44Gf8IACAq1p153M8YatbTqQ1jzCZQWQVh7GPCAFfBRyXZXeIS26JVS7ld9xe5WZ2t/+/dQeRj8Aqw+/eoTu+37nPIoE9sU5cIvU61hkYWbIaA5dLsczhptzTJq0z0FtOtGKWWvitIa/fs/A8YKydYdoHW89bNVjZ8hbTxjTv7yQ7BhtjWvUNYCwECH/lbLvfsV1uglm2nQK6U8y6G91hQNO3+C/dutqZn377RGprjyb2Z1hxw5EDgPCtUHgrB2DfeNwBjrHr+rPoQ1n0H5Qev9zxoNZ15nTSLnQU67j15EhgGvAP7AJGPM346z3dXADKCvMSZTRBKA9cBG5yY/GmPuql/5Simv0yICuo+wHq7KiqEoxwr+Imf4F+20nm/+For3HLu9+EFou2PD3/X3VnEQ2Lx+te3fAaumWidW92ZZo4q6j7Ba7x0HeMY3jHo6adCLiD8wEbgQyAGWichMY8y6GtuFAvcDS2u8xFZjzFluqlcp5c2CQiAmxXrUprLM5UCw89gDwY4frZOipurYfVrG1DgQVHcNOZcFh1kHmPUzrSGR2c4pCBLSIH0CdBtu1dWE1aVF3w/YYozJAhCRqcAIYF2N7f4M/D9gglsrVEqpagFBR+fnr01VJRzcXeNAsMN6vmc1bPwKqsqO3Se4NVSVWxeDhSfCkMeh1ygI79jwf08jqUvQtwd2ujzPAc5x3UBEzgbijTGzRaRm0CeKyM/AAeAJY8xvZuwRkTuBOwE6dOhQj/KVUsqFf4CzPz++9vUOh3WHriMHAOc3AgR6XA0dzvXKi71Oexy9iPgBLwFjalm9G+hgjCkUkT7A5yJyhjHmgOtGxpg3gTfBOhl7ujUppVSt/PwgNNZ6xPe1u5pGU5ezCrmA6+ExzrmsWijQA1ggItnAucBMEUk1xpQZYwoBjDHLga1AF3cUrpRSqm7qEvTLgGQRSRSRZsB1wMzqlcaYImNMlDEmwRiTAPwIDHeOuol2nsxFRJKAZCDL7X+FUkqp4zpp140xplJE7gXmYg2vfNsYs1ZEngUyjTEzT7B7OvCsiFQADuAuY8zeE2yvlFLKzfSCKaWU8gInumCq6Y38V0opVS8a9Eop5eU06JVSystp0CullJfzuJOxIpIPbD+Nl4gCCtxUTlOnn8Wx9PM4ln4eR3nDZ9HRGBNd2wqPC/rTJSKZxzvz7Gv0sziWfh7H0s/jKG//LLTrRimlvJwGvVJKeTlvDPo37S7Ag+hncSz9PI6ln8dRXv1ZeF0fvVJKqWN5Y4teKaWUCw16pZTycl4T9CIyTEQ2isgWEXnU7nrsJCLxIjJfRNaJyFoRud/umuwmIv4i8rOIfGl3LXYTkdYiMkNENojIehHpb3dNdhKRB53/TtaIyEciEmx3Te7mFUHvcgPzS4DuwGgR6W5vVbaqBB42xnTHuhHMPT7+eYB14/r1dhfhIV4BvjbGpABn4sOfi4i0B8YDqcaYHlhTsV9nb1Xu5xVBj8sNzI0x5UD1Dcx9kjFmtzFmhfP3g1j/kNvbW5V9RCQO+B0wye5a7CYirbDuE/EWgDGm3Biz396qbBcANBeRAKAFsMvmetzOW4K+thuY+2ywuRKRBKA
3sNTeSmz1MvAI1s1vfF0ikA9MdnZlTRKRlnYXZRdjTC7wd2AH1j2ui4wx39hblft5S9CrWohICPAJ8EDNG7L7ChG5DMhz3rNYWa3Xs4HXjTG9gRLAZ89piUg41rf/RKAd0FJEbrS3KvfzlqA/2Q3MfY6IBGKF/BRjzKd212OjAcBw543rpwJDReQDe0uyVQ6QY4yp/oY3Ayv4fdUFwDZjTL4xpgL4FDjP5prczluC/oQ3MPc1IiJYfbDrjTEv2V2PnYwxjxlj4pw3rr8O+M4Y43UttroyxuwBdopIV+ei84F1NpZktx3AuSLSwvnv5ny88OT0SW8O3hQc7wbmNpdlpwHATcBqEVnpXPZHY8wcG2tSnuM+YIqzUZQFjLW5HtsYY5aKyAxgBdZotZ/xwukQdAoEpZTyct7SdaOUUuo4NOiVUsrLadArpZSX06BXSikvp0GvlFJeToNeKaW8nAa9Ukp5uf8P55TflB+UlTEAAAAASUVORK5CYII=\n", 411 | "text/plain": [ 412 | "
" 413 | ] 414 | }, 415 | "metadata": { 416 | "needs_background": "light", 417 | "tags": [] 418 | }, 419 | "output_type": "display_data" 420 | } 421 | ], 422 | "source": [ 423 | "pd.DataFrame(h1.history)[['loss','val_loss']].plot(title=\"With mixup\")" 424 | ] 425 | }, 426 | { 427 | "cell_type": "code", 428 | "execution_count": 10, 429 | "metadata": { 430 | "colab": { 431 | "base_uri": "https://localhost:8080/", 432 | "height": 476 433 | }, 434 | "colab_type": "code", 435 | "id": "Iiv7ahP8WeAF", 436 | "outputId": "0ad04311-b497-4830-dd50-a832daf583ac" 437 | }, 438 | "outputs": [ 439 | { 440 | "name": "stdout", 441 | "output_type": "stream", 442 | "text": [ 443 | "Build model...\n", 444 | "Model: \"sequential_1\"\n", 445 | "_________________________________________________________________\n", 446 | "Layer (type) Output Shape Param # \n", 447 | "=================================================================\n", 448 | "embedding_1 (Embedding) (None, 400, 50) 250000 \n", 449 | "_________________________________________________________________\n", 450 | "dropout_2 (Dropout) (None, 400, 50) 0 \n", 451 | "_________________________________________________________________\n", 452 | "conv1d_1 (Conv1D) (None, 398, 250) 37750 \n", 453 | "_________________________________________________________________\n", 454 | "global_max_pooling1d_1 (Glob (None, 250) 0 \n", 455 | "_________________________________________________________________\n", 456 | "dense_2 (Dense) (None, 250) 62750 \n", 457 | "_________________________________________________________________\n", 458 | "dropout_3 (Dropout) (None, 250) 0 \n", 459 | "_________________________________________________________________\n", 460 | "activation_2 (Activation) (None, 250) 0 \n", 461 | "_________________________________________________________________\n", 462 | "dense_3 (Dense) (None, 1) 251 \n", 463 | "_________________________________________________________________\n", 464 | "activation_3 (Activation) (None, 1) 0 \n", 465 | 
"=================================================================\n", 466 | "Total params: 350,751\n", 467 | "Trainable params: 350,751\n", 468 | "Non-trainable params: 0\n", 469 | "_________________________________________________________________\n" 470 | ] 471 | } 472 | ], 473 | "source": [ 474 | "print('Build model...')\n", 475 | "model2 = Sequential()\n", 476 | "\n", 477 | "# we start off with an efficient embedding layer which maps\n", 478 | "# our vocab indices into embedding_dims dimensions\n", 479 | "model2.add(Embedding(max_features,\n", 480 | " embedding_dims,\n", 481 | " input_length=maxlen))\n", 482 | "model2.add(Dropout(0.2))\n", 483 | "\n", 484 | "# we add a Convolution1D, which will learn filters\n", 485 | "# word group filters of size filter_length:\n", 486 | "model2.add(Conv1D(filters,\n", 487 | " kernel_size,\n", 488 | " padding='valid',\n", 489 | " activation='relu',\n", 490 | " strides=1))\n", 491 | "# we use max pooling:\n", 492 | "model2.add(GlobalMaxPooling1D())\n", 493 | "\n", 494 | "# We add a vanilla hidden layer:\n", 495 | "model2.add(Dense(hidden_dims))\n", 496 | "model2.add(Dropout(0.2))\n", 497 | "model2.add(Activation('relu'))\n", 498 | "\n", 499 | "# We project onto a single unit output layer, and squash it with a sigmoid:\n", 500 | "model2.add(Dense(1))\n", 501 | "model2.add(Activation('sigmoid'))\n", 502 | "\n", 503 | "model2.compile(loss='binary_crossentropy',\n", 504 | " optimizer='adam',\n", 505 | " metrics=['accuracy'])\n", 506 | "model2.summary()" 507 | ] 508 | }, 509 | { 510 | "cell_type": "code", 511 | "execution_count": 11, 512 | "metadata": { 513 | "colab": { 514 | "base_uri": "https://localhost:8080/", 515 | "height": 357 516 | }, 517 | "colab_type": "code", 518 | "id": "ygNHmhGMWeAI", 519 | "outputId": "1592613d-52d2-409b-e210-cceddb7f5bbd" 520 | }, 521 | "outputs": [ 522 | { 523 | "name": "stdout", 524 | "output_type": "stream", 525 | "text": [ 526 | "Epoch 1/10\n", 527 | "782/782 [==============================] - 8s 
10ms/step - loss: 0.4057 - accuracy: 0.7964 - val_loss: 0.2819 - val_accuracy: 0.8825\n", 528 | "Epoch 2/10\n", 529 | "782/782 [==============================] - 8s 10ms/step - loss: 0.2260 - accuracy: 0.9100 - val_loss: 0.2540 - val_accuracy: 0.8957\n", 530 | "Epoch 3/10\n", 531 | "782/782 [==============================] - 8s 10ms/step - loss: 0.1579 - accuracy: 0.9409 - val_loss: 0.2806 - val_accuracy: 0.8874\n", 532 | "Epoch 4/10\n", 533 | "782/782 [==============================] - 8s 10ms/step - loss: 0.1056 - accuracy: 0.9625 - val_loss: 0.3103 - val_accuracy: 0.8897\n", 534 | "Epoch 5/10\n", 535 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0732 - accuracy: 0.9730 - val_loss: 0.3593 - val_accuracy: 0.8838\n", 536 | "Epoch 6/10\n", 537 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0539 - accuracy: 0.9808 - val_loss: 0.3938 - val_accuracy: 0.8884\n", 538 | "Epoch 7/10\n", 539 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0419 - accuracy: 0.9854 - val_loss: 0.4444 - val_accuracy: 0.8817\n", 540 | "Epoch 8/10\n", 541 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0340 - accuracy: 0.9876 - val_loss: 0.4842 - val_accuracy: 0.8870\n", 542 | "Epoch 9/10\n", 543 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0388 - accuracy: 0.9857 - val_loss: 0.4686 - val_accuracy: 0.8863\n", 544 | "Epoch 10/10\n", 545 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0314 - accuracy: 0.9887 - val_loss: 0.6685 - val_accuracy: 0.8559\n" 546 | ] 547 | } 548 | ], 549 | "source": [ 550 | "h2 = model2.fit(x_train, y_train,\n", 551 | " batch_size=batch_size,\n", 552 | " epochs=epochs,\n", 553 | " validation_data=(x_test, y_test))" 554 | ] 555 | }, 556 | { 557 | "cell_type": "code", 558 | "execution_count": 12, 559 | "metadata": { 560 | "colab": { 561 | "base_uri": "https://localhost:8080/", 562 | "height": 298 563 | }, 564 | "colab_type": "code", 565 | "id": 
"DzJEhaPrWeAM", 566 | "outputId": "aec6c655-c5f8-434b-bb16-d1e1056adc03" 567 | }, 568 | "outputs": [ 569 | { 570 | "data": { 571 | "text/plain": [ 572 | "" 573 | ] 574 | }, 575 | "execution_count": 12, 576 | "metadata": { 577 | "tags": [] 578 | }, 579 | "output_type": "execute_result" 580 | }, 581 | { 582 | "data": { 583 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEICAYAAABPgw/pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deXxU5b3H8c8vk8m+sIYEEvYlbEU0LLIKiOIG1mrBrdWr0uu+9GptXapo21t7a9GKtRSt1qVI1SoqSpUdQSTsWwiQsCQsSYAkLNnz3D/OAENMwiTM5MxMfu/XKy/mLJzzm1G+8+Q553mOGGNQSikV+ELsLkAppZR3aKArpVSQ0EBXSqkgoYGulFJBQgNdKaWChAa6UkoFCQ105XdEZKSIbK9ne2cRMSIS2pR1Nca53otS3qSBrnxORH4pIl/UWLejjnVTjDHLjDG93NbvFpFLm6jWN0XkeW8dr+Z7UcqXNNBVU1gKDBMRB4CIJAFOYGCNdd1d+yqlGkEDXTWF1VgBfoFreSSwCNheY90uY8x+EblERHIARORtoCPwqYgcF5HH3I57s4jsFZECEXni1EoRCReR6SKy3/UzXUTCXdtuE5Hl7sW5um+6i8hU4GbgMde5Pq3tzbj2v8f1G8UxEXlORLqJyAoRKRaROSIS5trX/b10E5EjInKha7m9iOSLyCWu5bN+ExGRZ0TkHdfrU91MU13v6YCI/E/D/jOoYKeBrnzOGFMOrAJGuVaNApYBy2us+17r3BhzK7AXuMYYE2OMecFt8wigFzAOeFpEervWPwEMxfqyGAAMBp70oM6ZwLvAC65zXVPP7pcDF7nO8xgwE7gFSAH6ATfWcvxdwC+Ad0QkCvg78JYxZvG5anMzBugBXAb8oqm6olRg0EBXTWUJZ8J7JFagL6uxbkkDj/msMabEGLMB2IAV3mC1sqcZY/KMMfnAs8Ct51N8LV4wxhQbY7YAm4H/GGOyjDFFwBfAwNr+kjHmb8BOrC+4JKwvn4Z41hhzwhizCesL4XtfHKr50kBXTWUpMEJEWgFtjTE7gBVYfeutsFq1De0/P+j2+iQQ43rdHtjjtm2Pa503HXJ7XVLLcgx1+xvW+/2zMaasgefd5/baF+9LBTANdNVUVgLxwF3ANwDGmGJgv2vdfmNMdh1/t6FTgu4HOrktd3StAzgBRJ3aICKJ53muBhGRGGA68DrwjOvL7JSzagNq1gZWl84p7u9LKQ101TSMMSVAOvAIVlfLKctd6+prnR8CujbgdP8EnhSRtiLSBngaeMe1bQPQV0QuEJEI4JnzPFdDvQSkG2PuBD4HXnPbth6YIiJOEUkDrq/l7z8lIlEi0he4HXjfh7WqAKOBrprSEiABK8RPWeZaV1+g/w4roAs9vLPjeawvj43AJmCtax3GmExgGvA1sKNGLWC1nPu4zvWxB+fymIhMAiYAd7tWPQJcKCI3u5afAroBR7H6/d+r5TBLsPrgFwD/Z4z5jzdrVIFN9AEXSvk/EekMZANOY0ylvdUof6UtdKWUChIeBbqITBCR7SKyU0Qer2X7n0RkvesnU0QKvV+qUkqp+pyzy8U1NDsTGA/kYI36u9EYs7WO/e8HBhpj/svLtSqllKqHJy30wcBO16CJcmA2M
Kme/W/EustAKaVUE/Jk+tEOnD2YIQcYUtuOItIJ6AIsrGP7VGAqQHR09EWpqakNKlYppZq7NWvWFBhj2ta2zdvzSU8BPjDGVNW20TVXxkyAtLQ0k56e7uXTK6VUcBORPXVt86TLJZezR6clu9bVZgra3aKUUrbwJNBXAz1EpItrStApwNyaO4lIKtASa4i3UkqpJnbOQHcNYrgPmA9sA+YYY7aIyDQRmei26xRgttGRSkopZQuP+tCNMfOAeTXWPV1j+ZnzLaaiooKcnBxKS0vP91BBLSIiguTkZJxOp92lKKX8iF89ZDcnJ4fY2Fg6d+6MiNhdjl8yxnD48GFycnLo0qWL3eUopfyIXw39Ly0tpXXr1hrm9RARWrdurb/FKKW+x68CHdAw94B+Rkqp2vhdoCulVNCqrob5T0Behk8Or4FeQ0xMfU8OU0qp87D137DyFdi/1ieH10BXSqmmUFkGXz8LCX3hB5N9cgoN9DoYY3j00Ufp168f/fv35/33rSd9HThwgFGjRnHBBRfQr18/li1bRlVVFbfddtvpff/0pz/ZXL1Syu+kvwGFe2D8NAhx+OQUfnXbortnP93C1v3FXj1mn/Zx/Pqavh7t+9FHH7F+/Xo2bNhAQUEBgwYNYtSoUbz33ntcfvnlPPHEE1RVVXHy5EnWr19Pbm4umzdvBqCwUKeDV0q5KS2CJS9Al1HQfZzPTqMt9DosX76cG2+8EYfDQbt27Rg9ejSrV69m0KBB/P3vf+eZZ55h06ZNxMbG0rVrV7Kysrj//vv58ssviYuLs7t8pZQ/WT4dSo5YrXMf3qXmty10T1vSTW3UqFEsXbqUzz//nNtuu41HHnmEn/zkJ2zYsIH58+fz2muvMWfOHN544w27S1VK+YOiXPj2Veh/A7Qf6NNTaQu9DiNHjuT999+nqqqK/Px8li5dyuDBg9mzZw/t2rXjrrvu4s4772Tt2rUUFBRQXV3Nj370I55//nnWrvXNFWylVABa/Fsw1TD2SZ+fym9b6Hb74Q9/yMqVKxkwYAAiwgsvvEBiYiJvvfUWf/jDH3A6ncTExPCPf/yD3Nxcbr/9dqqrqwH43e9+Z3P1Sim/cGgrrH8PhtwNLTv7/HTnfKaor9T2gItt27bRu3dvW+oJNPpZKRUA3v0x7P0WHlwPUa28ckgRWWOMSattm3a5KKWUL2Qvgx3zYeTDXgvzc9FAV0opb6uuhq+egrgOMOS/m+y02oeulFLetvXfsH8dTHoVnJFNdlptoSullDdVlsOCadYQ/wFTmvTU2kJXSilvSn8Dju6Gmz/w2RD/umgLXSmlvKW0CJaeGuJ/aZOfXgNdKaW85ZuX4ORhnw/xr4sG+nmob+703bt3069fvyasRillq+L9sPJV6He9z4f410UDXSmlvGHRb6G6EsY9ZVsJHl0UFZEJwEuAA5hljPnfWvb5MfAMYIANxpibzquyLx6Hg5vO6xDfk9gfrvhe6ac9/vjjpKSkcO+99wLwzDPPEBoayqJFizh69CgVFRU8//zzTJo0qUGnLS0t5e677yY9PZ3Q0FBefPFFxowZw5YtW7j99tspLy+nurqaDz/8kPbt2/PjH/+YnJwcqqqqeOqpp5g82TeT4SulvCRvG6x/17rnvAmG+NflnIEuIg5gBjAeyAFWi8hcY8xWt316AL8EhhtjjopIgq8K9qXJkyfz0EMPnQ70OXPmMH/+fB544AHi4uIoKChg6NChTJw4sUEPap4xYwYiwqZNm8jIyOCyyy4jMzOT1157jQcffJCbb76Z8vJyqqqqmDdvHu3bt+fzzz8HoKioyCfvVSnlRV8/A2GxMOpRW8vwpIU+GNhpjMkCEJHZwCRgq9s+dwEzjDFHAYwxeeddWT0taV8ZOHAgeXl57N+/n/z8fFq2bEliYiIPP/wwS5cuJSQkhNzcXA4dOkRiYqLHx12+fDn3338/AKmpqXTq1InMzEwuvvhifvOb35CTk8N1111Hjx496N+/P
z//+c/5xS9+wdVXX83IkSN99XaVUt6wezlkfgnjft1kQ/zr4kkfegdgn9tyjmudu55ATxH5RkS+dXXRBKQbbriBDz74gPfff5/Jkyfz7rvvkp+fz5o1a1i/fj3t2rWjtLTUK+e66aabmDt3LpGRkVx55ZUsXLiQnj17snbtWvr378+TTz7JtGnTvHIupZQPGAP/cQ3xH3q33dV4bWBRKNADuARIBpaKSH9jzFnPYhORqcBUgI4dO3rp1N41efJk7rrrLgoKCliyZAlz5swhISEBp9PJokWL2LNnT4OPOXLkSN59913Gjh1LZmYme/fupVevXmRlZdG1a1ceeOAB9u7dy8aNG0lNTaVVq1bccssttGjRglmzZvngXSqlvGLLv2H/Wpg0o0mH+NfFk0DPBVLclpNd69zlAKuMMRVAtohkYgX8avedjDEzgZlgTZ/b2KJ9qW/fvhw7dowOHTqQlJTEzTffzDXXXEP//v1JS0sjNTW1wce85557uPvuu+nfvz+hoaG8+eabhIeHM2fOHN5++22cTieJiYn86le/YvXq1Tz66KOEhITgdDr5y1/+4oN3qZQ6b6eH+PeBATfaXQ3gwXzoIhIKZALjsIJ8NXCTMWaL2z4TgBuNMT8VkTbAOuACY8zhuo6r86GfH/2slLLZqr/CF4/BTf+Cnpc12WnPaz50Y0wlcB8wH9gGzDHGbBGRaSIy0bXbfOCwiGwFFgGP1hfmSikV0EqLYcnvofNI6DHe7mpO86gP3RgzD5hXY93Tbq8N8Ijrp1nZtGkTt95661nrwsPDWbVqlU0VKaV8zuYh/nXxu9kWjTENusfbbv3792f9+vVNek67HhuolAKKD8DKGdDvR9DhQrurOYtfDf2PiIjg8OHDGlj1MMZw+PBhIiIi7C5FqeZpsWuI/1j7hvjXxa9a6MnJyeTk5JCfn293KX4tIiKC5ORku8tQqvnJy4B178Dgn0GrLnZX8z1+FehOp5MuXfzvQ1JKKcA1xD/G9iH+dfGrLhellPJbu7+BzC9gxEMQ3druamqlga6UUudiDHz1FMS2hyH2D/Gvi191uSillF/a+jHkrrGG+IdF2V1NnbSFrpRS9fHDIf510Ra6UkrVZ82bcCTLGuIf4rC7mnppC10pperip0P866KBrpRSdVnxMpwsgPHP+tUQ/7pooCulVG1ODfHvex10uMjuajyiga6UUrVZ/DuoqoBx/jfEvy4a6EopVVNeBqx7GwbdCa262l2NxzTQlVKqpgXP+vUQ/7pooCullLs9K2D7PL8e4l8XDXSllDrFGPiP/w/xr4sOLFJKqVO2fgK56TDxFb8e4l8XbaErpRRYd7QseBba9oYLbrK7mkbRFrpSSoHbEP85fj/Evy7aQldKqbJjsPh/XUP8L7O7mkbTQFdKqW8Ca4h/XTwKdBGZICLbRWSniDxey/bbRCRfRNa7fu70fqlKKeUDxw7CylcCaoh/Xc7Zhy4iDmAGMB7IAVaLyFxjzNYau75vjLnPBzUqpZTvBOAQ/7p40kIfDOw0xmQZY8qB2cAk35allFJNIH87rP0HDLojoIb418WTQO8A7HNbznGtq+lHIrJRRD4QkRSvVKeUUr709bPgjA64If518dZF0U+BzsaYHwBfAW/VtpOITBWRdBFJz8/P99KplVKqEfashO2fu4b4t7G7Gq/wJNBzAfcWd7Jr3WnGmMPGmDLX4iyg1isLxpiZxpg0Y0xa27ZtG1OvUkqdP2Pgq6cgNgmG3mN3NV7jSaCvBnqISBcRCQOmAHPddxCRJLfFicA275WolFJetm0u5KyGMb8KyCH+dTnnXS7GmEoRuQ+YDziAN4wxW0RkGpBujJkLPCAiE4FK4Ahwmw9rVkoFkrwM6x7vVl0hJhFCbB7+UlVh9Z237Q0DAnOIf108GvpvjJkHzKux7mm3178Efund0pRSAevYIdj0L9g4Gw5uOrM+NNIK9lZdoHU3aNXNWm7drenCfs2bcGQX3Pg+O
IJr9pPgejdKKfuUn7TmEd/wT9i1EEw1tB8IE34PbXta86QczrL+LMiEHf+BqvIzf/9U2Lfu6gp9t7CPTfLOCM5TQ/w7jYCel5//8fyMBrpSqvGqq2HPctgwG7bOhfJjEJcMwx+CAVOgba8z+3YbW+PvVkFRjtVadg/7/O2QOf/cYd/aFfgNCfsVf3YN8Z8W0EP866KBrpRquPztVohvnAPFORAWC30mwYDJVuvXk66TEAe07GT9eBT2u6zzbv8SqivO7OuMgpZdXGHv1qqvGfbHDlqB3veHkBzYQ/zrooGulPLM8XzY/KHVL75/HUgIdBtnTWjV60rv3i1yzrDf5wr6XXAku/6wP9Vnf+Kwa4j/0wQrDXSlVN0qSq1+8Y3vw46vwFRB4g/g8t9Cv+shtl3T1xTigJadrR9Pwz4vAwr3wMX3BMUQ/7pooCulzlZdDXtXWi3xLZ9AWZHVdTHsPvjBFGjXx+4K61Zf2BsTlP3m7jTQlVKWgp1WiG98Hwr3WnOc9JkIP5gMXUYF7FN8TgvyMAcNdKWat5NHrH7xDbOthyNLCHS9BMY8Cb2vhrBouytUDaCBrlRzU1lm3Ra4YbZ1L3h1BST0hfHPQf8bIC7p3MdQfkkDXanmwBjY953VpbL5IygthJh2MORn1v3iif3trlB5gQa6UsHsSJZ1r/iG2XA02xqg0/tqK8S7XBJ0Q9+bO/2vqVSwMQayl8Dy6ZC1CBDoMhJGPwa9r4HwWLsrVD6iga5UsKiugq2fwDcvwYH1VpfK2CdhwI0Qn2x3daoJaKArFegqSmD9e9aw9qPZ1vD3a16y7hl3RthdnWpCGuhKBaqSo7D6dVj1GpzIhw4XWZNOpV4V+PeMq0bRQFcq0BTlwrevWvN6lx+H7pdasxt2HtEsBs+oummgKxUo8jJgxcvWXSumGvpdB8Mf1FsO1Wka6Er5u73fWnesZH5h3XaY9l9w8b3WTIRKudFAV8ofVVfDjvlWkO/7FiJbwujHYfBUiG5td3XKT2mgK+VPKsutZ3GueBnyMyC+I1zxAgy8RedVUeekga6UPyg7Bmvesi52FudCu35w3d+sp+s4nHZXpwKEBrpSdjqeZ912uHoWlBZB55FwzcvQfZzesaIazKNAF5EJwEuAA5hljPnfOvb7EfABMMgYk+61KmsoOllBfJS2WlQAO5JlDQRa9671MOTeV1u3Hian2V2ZCmDnDHQRcQAzgPFADrBaROYaY7bW2C8WeBBY5YtCT5m1LItXF+9iwSOjaRkd5stTKeV9+9dZFzq3zYWQUGtY/rD7oU0PuytTQcCDR3MzGNhpjMkyxpQDs4FJtez3HPB7oNSL9X3PiB5tKCqp4IX5Gb48jVLeYwzsWghvTYSZl1ivhz0AD22CiS9rmCuv8aTLpQOwz205BxjivoOIXAikGGM+F5FHvVjf96QmxnH7sM7MWp7NDWkpXNixpS9Pp1TjVVXC1o+tybIOboSYRLj0WUi7HSLi7a5OBSFPWuj1EpEQ4EXg5x7sO1VE0kUkPT8/v9HnfGh8T9rFhfPUx5upqjaNPo5SPlFSCN/9DV65CD68w5o8a+Kf4aGNMOIhDXPlM5600HOBFLflZNe6U2KBfsBisa7KJwJzRWRizQujxpiZwEyAtLS0RidxTHgoT13dh/veW8c73+7hp8M6N/ZQSnnHsYOQ8TlkfAbZS6G6EjqkwWXPQ6+rIOS8205KnZMngb4a6CEiXbCCfApw06mNxpgioM2pZRFZDPyPL+9yAbiqfxKzu+/j/+Zv54r+iSTE6jShqokdyYJtn1khvu87wECrrjD0HugzyZr9UG89VE3onIFujKkUkfuA+Vi3Lb5hjNkiItOAdGPMXF8XWRsRYdqkvkyYvozfzcvgT5MvsKMM1ZwYA4c2WyG+7VPI22KtT+wPl/zSehpQQm8NcWUbj+5DN8bMA+bVWPd0Hftecv5leaZr2ximjurKK4t2MnlQCkO76hwXysuqq6zWd4Yrx
Av3AAIdL4bLf2vNPd6ys91VKgUEwUjRe8d059/rcnnq483Me3AkTof2VarzVFlu9YNnfAoZ8+BEHoQ4oeslMPIR6HUlxCTYXaVS3xPwgR4Z5uCZiX256x/pvLE8m5+N7mZ3SSoQlR2HnV9brfAd/4GyYnBGQ4/xVldKj8sgIs7uKpWqV8AHOsD4Pu24tHcCLy3YwcQL2pMUH2l3SSoQnDhszTG+7TNrsE9VGUS1hj4TIfUaq0Wuz+RUASQoAh3g19f05dIXl/DcZ1t59eaL7C5H+auiHOv2wm2fwp4VYKogLtka7JN6tdU37giafxaqmQma/3NTWkVx35ju/PGrTJZk5jO6Z1u7S1L+Ij/T6g/f9qk1lwpAm14w4mFrUqykC/TOFBUUgibQAaaO7spH63L59Seb+fKhUUQ49cnnzZIxsH/tmXvECzKt9R0ugnG/tvrEdf4UFYSCKtDDQx1Mm9SXW1//jplLs3hgnP6jbVYKdsDq162WeHEOiAM6D4dBd1m3F8Z3sLtCpXwqqAIdYGSPtlzVP4kZi3Zy7QUd6Ng6yu6SlC8ZA7uXw8pXIPNLcIRbD4cY+wT0nABRreyuUKkmE3SBDvDU1X1YvD2PZz7dwus/TUO0fzT4VFXAlo9h5Z/hwAbr7pTRj8OgOyFGr5+o5ikoAz0xPoKHLu3Jb+Zt46uth7isb6LdJSlvKS2ynr256q9Wt0rrHnD1dBgwBZx6u6pq3oIy0AFuG96Zf63Zx7OfbmVEjzZEhQXtW20eCvfCt6/B2n9A+THr2ZtX/dEa8KMzGSoFeGE+dH/ldITw3KR+5BaW8MrCnXaXoxordy188F/w0gXWw5R7TYCpi+G2z6zXGuZKnRbUzdYhXVtz3YUd+NuyLK67MJnuCTF2l6Q8UV1tXeBc+Qrs+QbC4+Die2DIf0N8st3VKeW3gjrQAX55RW++2nqIpz/ZzLt3DtELpP6s/CRs+Cd8+yoc3gnxKXDZb+DCn+g8Kkp5IOgDvW1sOI9e3ounP9nCpxsPMHFAe7tLUjUdz7Me2bZ6FpQcgfYD4UevQ59rdRi+Ug3QLP613DykE3PS9/H8Z1sZ06stsRFOu0tSAHkZVrfKxjlQVQ69roCL74NOw3QovlKN0CyuKDlChOev7U/+8TKmf73D7nKaN2MgazG8cz28OgQ2/QsG3gz3pcON/7RGdmqYK9UozaKFDnBBSgumDOrImyt2c/1FyfRO0j7ZJlVZDls+slrkBzdBdFsY8wSk3QHR+qQppbyhWbTQT3ns8l7ERzp56uPNVFcbu8tpHkoKYfl0eGkA/Ptn1gjPiX+GhzbD6Mc0zJXyombTQgdoGR3G4xNSeezDjXy4Nocb0lLsLil4Hd1tDQRa9zaUH4cuo2Hiy9BtnN47rpSPNKtAB7j+omRmr97L777IYHyfdrSICrO7pOCSkw4r/gzb5oKEQL/r4eJ7IekHdlemVNBrdk2lkBDhuWv7UXiynD/M3253OcGhtAg2fwSvXw6zxsGuRTDsfnhwI1z3Vw1zpZqIRy10EZkAvAQ4gFnGmP+tsf2/gXuBKuA4MNUYs9XLtXpN3/bx/HRYZ95csZsfp6UwIKWF3SUFlqpK6wESuxZaPznp1qPcWnSCCb+HgbdAuI7KVaqpiTH1XxwUEQeQCYwHcoDVwI3ugS0iccaYYtfricA9xpgJ9R03LS3NpKenn2f5jVdcWsG4Py4hMS6Cj+8djiNEb5Wr15HsMwGevQzKigCBDhdCt7HQdQykDNGBQEr5mIisMcak1bbNk399g4Gdxpgs18FmA5OA04F+KsxdogG/v4UkLsLJk1f15sHZ63nvu73cOrST3SX5l9IiyF7qCvFFcDTbWh+fAn2vhW5jrAud+gAJpfyGJ4HeAdjntpwDDKm5k4jcCzwChAFjvVKdj00c0J7Z3+3jD19mcEW/RNrEhNtdkn2qKiF3DWQtOrsbJSzGmqp26D1WS7x1Nx34o5Sf8trvx8aYGcAMEbkJe
BL4ac19RGQqMBWgY8eO3jp1o4kIz13blyteWsbv5mXwxx8PsLukplVfN8rIR6wATx4EDp0qQalA4Emg5wLuN2wnu9bVZTbwl9o2GGNmAjPB6kP3sEaf6p4Qyx0juvLakl1MGZzCoM5B3IVQUgi7l50J8aO7rfWnu1HGQpdR2o2iVIDyJNBXAz1EpAtWkE8BbnLfQUR6GGNOTZJyFRBQE6Y8MK47c9fn8tTHm/ns/hGEOoLkbs5T3SinAjw3HUy11Y3SZRQMvVe7UZQKIucMdGNMpYjcB8zHum3xDWPMFhGZBqQbY+YC94nIpUAFcJRaulv8WVRYKE9f05f/fmcNb67YzZ0ju9pdUuMdyTpzITN7KZQVWwN82g+EkT/XbhSlgtg5b1v0lUbftrhnBWTOt1qYHYdCWLRX6jHGcPubq1mdfYQFP7+ExPgIrxzX5+rsRulo3Ymi3ShKBZXzvW3Rv+xfb83Y9810CHFCcpoVWJ1HWi1PZ+OCWER4dmJfxv9pKc9/vpVXbrrQy4V7yYkC67Fse1ZYfx7cDJgz3SgX32eFeKuu2o2iVDMTeC10gLLjsO9bq0shexkcWG/1DYdGQMpgK9i6jLa6GRrYtTD960ymf72Dd+4YwogebRpXnzcV7z8T3ru/gQLXdAWhkdZ77TQcOo+wXms3ilJBr74WemAGek0lhbB3pSvgl8Khzdb6sBjoeDF0GWmFfOIPIMRR76FKK6q4fPpSHCJ88dBIwkPr39+rjIHCPVaA7/7GCvFTA3rCYq0upk7DrABPugBCdWIxpZqb4A/0mk4ctvqVdy+zAr4g01ofEQ+dRrha8COhbe9ap3JdtD2P2/++mkcv78W9Y7r7pkawAvzwTti93NUKXwHFOda2yJZW67vTMOunXX8dVq+UCrI+dE9Et7buq+57rbVcfMAKzewlVshv/9xaH9XGCvbOI60uGtfte2N6JTChbyJ/XriDiQPak9Iqyjt1VVdD3lZXeLtC/ES+q+YE6/FrnR6ygrxtqs4brpRqkOBsoZ/L0T2u1rurBX9sv7U+NsnVeh/FoVaDuGRWNsO7t2HWT2v9Mjy3qko4uNHtIuYKKC20tsWnuLXAh+u94EopjzS/LpeGMMa6dzt7yZmAP1kAQHFEB7443oO+w6+m3/CrIS6p/mNVllvTyp66gLlvlfW0HoBW3c6Ed+fh0ML+qQ+UUoFHA70hjIG8bbB7GdVZSzixfTGxnLC2te5xpv+980hwRlmjL/essLp0clZDZam1b9veri6UYdBx2Lm/DJRSygMa6Odh5Y48nn9jDo/1PMTosAwrvE+1ukNCobrSGomZ2P9MF0rHYfrwY6WUTzS/i6JedHGPBLoPGM5dmw4y/+HH6dIyzBrclL3ECvaOw6DjEOsOGqWUspEGugeeuLI3C7fl8fQnm/nHfw1GUgZByiC7y1JKqbPofXEeSIiL4JHLehgNMIgAABBVSURBVLJsRwFfbD5odzlKKVUrDXQP3Tq0E32S4pj26VaOl1XaXY5SSn2PBrqHQh0hPHdtPw4Wl/LygoCa7l0p1UxooDfARZ1aMjkthTeWZ7P94DG7y1FKqbNooDfQL65IJSYilKc+2Yxdt3wqpVRtNNAbqFV0GI9dnsp32Uf497r6Hq2qlFJNSwO9EaYMSuGClBY8/tEmXl6wg7LKKrtLUkopDfTGCAkR/vaTNMb3aceLX2Vy5UvLWJV12O6ylFLNnAZ6I7WNDWfGTRfy99sHUVZZzeSZ3/LYBxs4eqLc7tKUUs2UBvp5GtMrga8eHs3PRnflw7W5jHtxCR+tzdELpkqpJqeB7gWRYQ5+eUVvPrt/BB1bRfHInA3cPGsV2QUn7C5NKdWMeBToIjJBRLaLyE4RebyW7Y+IyFYR2SgiC0Skk/dL9X+9k+L46O5hPH9tPzblFnH59KV60VQp1WTOGegi4gBmAFcAfYAbRaRPjd3WAWnGmB8AHwAveLvQQBESItwytBMLHhmtF02VUk3Kk
xb6YGCnMSbLGFMOzAYmue9gjFlkjDnpWvwWSPZumYEnIS5CL5oqpZqUJ4HeAdjntpzjWleXO4AvzqeoYKIXTZVSTcWrF0VF5BYgDfhDHduniki6iKTn5+d789R+TS+aKqWagieBngukuC0nu9adRUQuBZ4AJhpjymo7kDFmpjEmzRiT1rZt28bUG9BOXTR97tp+bMrRi6ZKKe/yJNBXAz1EpIuIhAFTgLnuO4jIQOCvWGGe5/0yg0dIiHDr0E4s+LleNFVKedc5A90YUwncB8wHtgFzjDFbRGSaiEx07fYHIAb4l4isF5G5dRxOuehFU6WUt4ldF+fS0tJMenq6Lef2NyXlVUxfkMmsZdnERzp58qre/HBgB0TE7tKUUn5GRNYYY9Jq26YjRf1AXRdNs/KP212aUiqAaKD7kZoXTSe8tEwvmiqlPKaB7mf0oqlSqrE00P3U6Yumtw2itEIvmiqlzk0D3c+NSU3gq0dG6UhTpdQ5aaAHgKiwUL1oqpQ6Jw30AKIXTZVS9dFADzC1XTQd+39L+MfK3ZRWaLAr1ZzpwKIAtzQzn+lfZ7J2byFtYsK5a2QXbh7aiZjwULtLU0r5QH0DizTQg4AxhpVZh3l10S6W7ywgPtLJ7cM7c9uwzrSICrO7PKWUF2mgNyPr9h5lxqJdfL3tENFhDm4Z2ok7RnYhITbC7tKUUl6ggd4MbTtQzKuLd/H5xv04HSFMHpTC1FFdSW4ZZXdpSqnzoIHejGUXnOC1xbv4aF0OxsC1Aztw9yXd6NY2xu7SlFKNoIGu2F9YwsylWfzzu72UV1VzZf8k7r2kO33ax9ldmlKqATTQ1WkFx8t4fXk2b6/cw/GySsamJnDvmO5c1Kml3aUppTygga6+p+hkBW+t3M0b32RTeLKCi7u25r6x3RnWrbXOw66UH9NAV3U6UVbJP7/by8ylWeQdK+OClBbcO6Y7l/ZO0GBXyg9poKtzKq2o4oM1Oby2ZBc5R0tITYzlnjHduap/Eo4QDXal/IUGuvJYRVU1c9fv59XFO9mVf4IubaK5e3Q3rh3YgbBQnSlCKbtpoKsGq642zN9ykFcW7WTL/mLax0cwdVRXpgzuSITTYXd5SjVbGuiq0YwxLM7MZ8bCnaTvOUqbmDDuGNGVW4Z2JDbCaXd5SjU7GujKK1ZlHeaVRTtZtqOAuIhQbhvWmduHd6FltM4Xo1RTqS/QPeoUFZEJIrJdRHaKyOO1bB8lImtFpFJErj/fgpV/GtK1NW/fMYRP7h3O0K6teXnhTob/fiG/+Xwrh4pL7S5PqWbvnC10EXEAmcB4IAdYDdxojNnqtk9nIA74H2CuMeaDc51YW+iBb/vBY/xl8U7mbthPaEgIY1MTGNs7gTG9EmgbG253eUoFpfpa6J5Mmj0Y2GmMyXIdbDYwCTgd6MaY3a5t1eddrQoYvRJjmT5lIA+P78kby7OZv+UQX245CMCAlBaMS01gXO8E+iTF6T3tSjUBTwK9A7DPbTkHGOKbclQg6tQ6mmcn9eOZiX3ZeqCYhdvyWJCRx5++zuTFrzJJjItgbO8ExqUmMKxbGyLD9C4ZpXyhSR9rIyJTgakAHTt2bMpTqyYgIvRtH0/f9vHcP64H+cfKWLw9jwXb8vhkXS7vrdpLeGgIw7u3sbpnUhNo3yLS7rKVChqeBHoukOK2nOxa12DGmJnATLD60BtzDBU42saGc0NaCjekpVBWWcV32UdYsC2PBRmHWJiRB0CfpDjG9bbCfUByC0J0VKpSjebJRdFQrIui47CCfDVwkzFmSy37vgl8phdFVX2MMezKP+4K9zzW7DlKVbWhTUwYl/SyumZG9Gij97krVYvzvg9dRK4EpgMO4A1jzG9EZBqQboyZKyKDgH8DLYFS4KAxpm99x9RAV6cUnixnSWY+CzPyWLw9n6KSCpwOYUiX1ox1XVjt1Dra7jKV8gs6sEgFjMqqatbsOcrCDKv1vjPvOADd2kYzrnc7xqUmcFGnloQ6d
F4Z1TxpoKuAtefwCRZm5LEwI49vsw5TUWWIiwi1umZ6JzC6Z1taROlIVdV8aKCroHC8rJLlO/JZsC2PRdvzKDheTohAWqdWp2+L7J4Qo/e8q6Cmga6CTnW1YUNOodU1sy2PrQeKAUhuGcmAlBaktoslNSmO1MRYOrSI1LtnVNDQQFdB70BRCQsz8liamc+2A8fYe+Tk6W3RYQ56JcbSKzGO3kmx9GoXS2piHPFReheNCjwa6KrZOV5WSeahY2w/eIyMA8VkHDxGxsFjFJVUnN4nKT6CXolWuKcmxtIrMZZubWP0QR7Kr53vXC5KBZyY8FAu7NiSCzu2PL3OGMOh4jIyDloBv90V8t/szKKiymrYhIYI3drGWEGfFEuqK/CT4iO0b175PQ101WyICInxESTGR3BJr4TT6yuqqskuOMG2A8VsdwX9mj1Hmbth/+l9YiNCT4d7r8RYeifF0rNdrA5+Un5FA101e05HCD3bWQHtrqikgsxDx1yt+WIyDhzj43W5HCurPL1PhxaRVr+8W9dNlzbRep+8soUGulJ1iI90MqhzKwZ1bnV6nTGG/UWlZ/XLbz9YzOLt+VRWW902YY4QuraNpkOLSBLjI0iKjyAxPpL2rt8OkuIjdcZJ5RMa6Eo1gIjQoUUkHVpEMq53u9Pryyqr2JV3gu2HrJb8zrzjHCgqZd2+Qo6cKP/eceIjnSS5hX2SK+zbx5/5EogO13+eqmH0/xilvCA81EGf9nH0aR8HA8/eVlpRxcGiUg4UlXKwuIQDRaUcKDyzvCm3iILj3w/92IhQV+ifCfyay9qHr9xpoCvlYxFOB53bRNO5Td0TjJVVVnGoqIwDRSUcLC51hX6JK/RL2XqgmILjZdS8yzgmPNQt6L/f2k+IDScu0olDB1Y1CxroSvmB8FAHHVtH0bF1VJ37lFdWc6i49HTgHyxya+0Xl5J5KJ+8Y98PfRGri6dFpJP4qDBaRlmvW0SF0SLKSUvXny2iwmgRaS3HRzmJiwgN+Fs1jTGUVVZTUl7FyYoqwkNDiI904gzSi9Ya6EoFiLDQEFJaRZHSqu7Qr6iqJv+Y1dI/UFRKXnEZhSUVFJ0s5+jJCgpLKjhyopys/BMcPVnOsdLKOo/lCBHri8D1BXAq6Fu6gr9F9JkvAOsLwfpSiA5zNPiLoPx06FZyoqzKel1eyckK6/WJskpKKqo4We76KTuz7WR55Zn15VWUnLVcSXUtYyejwxzERzqJc72/+Ein672GnVkf6b7e+jM2wr9/29FAVyqIOB0htG8R6fGj/SqrqikqsYK+8GQ5hScrrOB3vS4ssb4Iik5WcLC4lIyDxyg8Wc6J8qp6ahDiI0+1/p3ER4bhdMjpgLVCt+qs5craUrcekU4HUWEOIsMcRIeFEhlmLbeMchIZFkqU00FUuLUuKiz09P7lVdUUnqyw3rPrz+KSCrILTlBUYi2XVtT9rHsRiA0PJT7KSYvIsNOBH+/+peD2RXBqfWO/6BpKA12pZizUEULrmHBax4Q36O+VV7q+CE6WU1hSwdET5d/7UigqKefoiQpyjp6kqtqcDuDEuIjTARzlCuPoMIcVxK71kU4H0eFngjrK6dov3EFEqMOnk62VVlRR7Ap367cbt9euL4DCk+WnvwD2F5W41lXU+8UUGiKnW/4Pje/JxAHtvV67BrpSqsHCQkNoGxtO29iGfREEggingwing4S4iAb9PWMMJ8urzmr9Wz9nwv/U+pY+mhhOA10ppbxARIgODyU6PNTjLi9vC85LvUop1QxpoCulVJDQQFdKqSChga6UUkHCo0AXkQkisl1EdorI47VsDxeR913bV4lIZ28XqpRSqn7nDHQRcQAzgCuAPsCNItKnxm53AEeNMd2BPwG/93ahSiml6udJC30wsNMYk2WMKQdmA5Nq7DMJeMv1+gNgnAT6JBBKKRVgPAn0DsA+t+Uc17pa9zHGVAJFQGtvFKiUUsozTTqwSESmAlNdi8dFZHsjD9UGK
PBOVUFBP4+z6edxhn4WZwuGz6NTXRs8CfRcIMVtOdm1rrZ9ckQkFIgHDtc8kDFmJjDTg3PWS0TSjTFp53ucYKGfx9n08zhDP4uzBfvn4UmXy2qgh4h0EZEwYAowt8Y+c4Gful5fDyw0puaszEoppXzpnC10Y0yliNwHzAccwBvGmC0iMg1IN8bMBV4H3haRncARrNBXSinVhDzqQzfGzAPm1Vj3tNvrUuAG75ZWr/Putgky+nmcTT+PM/SzOFtQfx6iPSNKKRUcdOi/UkoFCQ10pZQKEgEX6OeaV6a5EJEUEVkkIltFZIuIPGh3Tf5ARBwisk5EPrO7FruJSAsR+UBEMkRkm4hcbHdNdhGRh13/TjaLyD9FpGGPIwoQARXoHs4r01xUAj83xvQBhgL3NuPPwt2DwDa7i/ATLwFfGmNSgQE0089FRDoADwBpxph+WHfrBeWdeAEV6Hg2r0yzYIw5YIxZ63p9DOsfa80pGZoVEUkGrgJm2V2L3UQkHhiFdUsxxphyY0yhvVXZKhSIdA18jAL221yPTwRaoHsyr0yz45queCCwyt5KbDcdeAyotrsQP9AFyAf+7uqCmiUi0XYXZQdjTC7wf8Be4ABQZIz5j71V+UagBbqqQURigA+Bh4wxxXbXYxcRuRrIM8assbsWPxEKXAj8xRgzEDgBNMtrTiLSEus3+S5AeyBaRG6xtyrfCLRA92RemWZDRJxYYf6uMeYju+ux2XBgoojsxuqKGysi79hbkq1ygBxjzKnf2j7ACvjm6FIg2xiTb4ypAD4Chtlck08EWqB7Mq9Ms+Cab/51YJsx5kW767GbMeaXxphkY0xnrP8vFhpjgrIV5gljzEFgn4j0cq0aB2y1sSQ77QWGikiU69/NOIL0AnGTTp97vuqaV8bmsuwyHLgV2CQi613rfuWapkEpgPuBd12NnyzgdpvrsYUxZpWIfACsxbo7bB1BOgWADv1XSqkgEWhdLkoppeqgga6UUkFCA10ppYKEBrpSSgUJDXSllAoSGuhKKRUkNNCVUipI/D8sPu6lM8OD1gAAAABJRU5ErkJggg==\n", 584 | "text/plain": [ 585 | "
" 586 | ] 587 | }, 588 | "metadata": { 589 | "needs_background": "light", 590 | "tags": [] 591 | }, 592 | "output_type": "display_data" 593 | } 594 | ], 595 | "source": [ 596 | "pd.DataFrame(h2.history)[['loss','val_loss']].plot(title=\"Without mixup\")" 597 | ] 598 | }, 599 | { 600 | "cell_type": "markdown", 601 | "metadata": { 602 | "colab_type": "text", 603 | "id": "M2HDERJbGr2a" 604 | }, 605 | "source": [ 606 | "# Comparison\n", 607 | "See the loss curve with mixup does not overfit." 608 | ] 609 | }, 610 | { 611 | "cell_type": "markdown", 612 | "metadata": { 613 | "colab": {}, 614 | "colab_type": "code", 615 | "id": "hqteWafKRdF1" 616 | }, 617 | "source": [ 618 | "## Cite the paper\n", 619 | "```\n", 620 | "@article{marivate2019improving,\n", 621 | " title={Improving short text classification through global augmentation methods},\n", 622 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n", 623 | " journal={arXiv preprint arXiv:1907.03752},\n", 624 | " year={2019}\n", 625 | "}```\n", 626 | "\n", 627 | "https://arxiv.org/abs/1907.03752" 628 | ] 629 | }, 630 | { 631 | "cell_type": "code", 632 | "execution_count": null, 633 | "metadata": {}, 634 | "outputs": [], 635 | "source": [] 636 | } 637 | ], 638 | "metadata": { 639 | "accelerator": "GPU", 640 | "colab": { 641 | "collapsed_sections": [], 642 | "name": "mixup_example_using_IMDB_sentiment.ipynb", 643 | "provenance": [] 644 | }, 645 | "kernelspec": { 646 | "display_name": "Python 3", 647 | "language": "python", 648 | "name": "python3" 649 | }, 650 | "language_info": { 651 | "codemirror_mode": { 652 | "name": "ipython", 653 | "version": 3 654 | }, 655 | "file_extension": ".py", 656 | "mimetype": "text/x-python", 657 | "name": "python", 658 | "nbconvert_exporter": "python", 659 | "pygments_lexer": "ipython3", 660 | "version": "3.7.7" 661 | } 662 | }, 663 | "nbformat": 4, 664 | "nbformat_minor": 4 665 | } 666 | -------------------------------------------------------------------------------- 
/examples/word2vec_example.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "markdown", 5 | "metadata": { 6 | "colab_type": "text", 7 | "id": "JHDJLKDuJkcB" 8 | }, 9 | "source": [ 10 | "# Example for using word2vec" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 1, 16 | "metadata": { 17 | "colab": {}, 18 | "colab_type": "code", 19 | "id": "9m8ChZsdAx41" 20 | }, 21 | "outputs": [], 22 | "source": [ 23 | "# Import libraries\n", 24 | "try:\n", 25 | " import textaugment, gensim\n", 26 | "except ModuleNotFoundError:\n", 27 | " !pip -q install textaugment gensim\n", 28 | " import textaugment, gensim" 29 | ] 30 | }, 31 | { 32 | "cell_type": "code", 33 | "execution_count": 4, 34 | "metadata": { 35 | "colab": { 36 | "base_uri": "https://localhost:8080/", 37 | "height": 153 38 | }, 39 | "colab_type": "code", 40 | "id": "ux6Bc4QSrYA8", 41 | "outputId": "9f2b8af1-3b22-455c-dd85-d1ac173a5317" 42 | }, 43 | "outputs": [ 44 | { 45 | "name": "stdout", 46 | "output_type": "stream", 47 | "text": [ 48 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n", 49 | "[nltk_data] Unzipping corpora/wordnet.zip.\n", 50 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n", 51 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n", 52 | "[nltk_data] Downloading package averaged_perceptron_tagger to\n", 53 | "[nltk_data] /root/nltk_data...\n", 54 | "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n" 55 | ] 56 | }, 57 | { 58 | "data": { 59 | "text/plain": [ 60 | "True" 61 | ] 62 | }, 63 | "execution_count": 4, 64 | "metadata": { 65 | "tags": [] 66 | }, 67 | "output_type": "execute_result" 68 | } 69 | ], 70 | "source": [ 71 | "# Import NLTK and download data\n", 72 | "import nltk\n", 73 | "nltk.download(['wordnet','punkt','averaged_perceptron_tagger'])" 74 | ] 75 | }, 76 | { 77 | "cell_type": "markdown", 78 | "metadata": { 79 | "colab_type": "text", 80 | 
"id": "8AUt-F5MtiuI" 81 | }, 82 | "source": [ 83 | "## Load Google Word2vec embeddings" 84 | ] 85 | }, 86 | { 87 | "cell_type": "code", 88 | "execution_count": 5, 89 | "metadata": { 90 | "colab": { 91 | "base_uri": "https://localhost:8080/", 92 | "height": 204 93 | }, 94 | "colab_type": "code", 95 | "id": "1xq4dJtSr4RM", 96 | "outputId": "1ff32743-04a9-4b8a-eda3-8dcf55e711ca" 97 | }, 98 | "outputs": [ 99 | { 100 | "name": "stdout", 101 | "output_type": "stream", 102 | "text": [ 103 | "--2020-05-23 18:06:47-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n", 104 | "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.178.197\n", 105 | "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.178.197|:443... connected.\n", 106 | "HTTP request sent, awaiting response... 200 OK\n", 107 | "Length: 1647046227 (1.5G) [application/x-gzip]\n", 108 | "Saving to: ‘GoogleNews-vectors-negative300.bin.gz’\n", 109 | "\n", 110 | "GoogleNews-vectors- 100%[===================>] 1.53G 36.2MB/s in 44s \n", 111 | "\n", 112 | "2020-05-23 18:07:31 (35.7 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n", 113 | "\n" 114 | ] 115 | } 116 | ], 117 | "source": [ 118 | "# Download Google Word2vec embeddings\n", 119 | "!wget \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\"" 120 | ] 121 | }, 122 | { 123 | "cell_type": "code", 124 | "execution_count": 8, 125 | "metadata": { 126 | "colab": { 127 | "base_uri": "https://localhost:8080/", 128 | "height": 71 129 | }, 130 | "colab_type": "code", 131 | "id": "q2wxTNhwrjK-", 132 | "outputId": "e30ff6b7-96a3-4d59-c486-65def436cbd8" 133 | }, 134 | "outputs": [ 135 | { 136 | "name": "stderr", 137 | "output_type": "stream", 138 | "text": [ 139 | "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. 
See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n", 140 | " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n" 141 | ] 142 | } 143 | ], 144 | "source": [ 145 | "model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)" 146 | ] 147 | }, 148 | { 149 | "cell_type": "code", 150 | "execution_count": 13, 151 | "metadata": { 152 | "colab": { 153 | "base_uri": "https://localhost:8080/", 154 | "height": 71 155 | }, 156 | "colab_type": "code", 157 | "id": "3uHnRL77uATl", 158 | "outputId": "de09c7ff-47bc-4e21-d2eb-89fcadb4d2bd" 159 | }, 160 | "outputs": [ 161 | { 162 | "name": "stderr", 163 | "output_type": "stream", 164 | "text": [ 165 | "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n", 166 | " if np.issubdtype(vec.dtype, np.int):\n" 167 | ] 168 | } 169 | ], 170 | "source": [ 171 | "from textaugment import Word2vec\n", 172 | "t = Word2vec(model=model)\n", 173 | "output = t.augment('The stories are good', top_n=10)" 174 | ] 175 | }, 176 | { 177 | "cell_type": "code", 178 | "execution_count": 14, 179 | "metadata": { 180 | "colab": { 181 | "base_uri": "https://localhost:8080/", 182 | "height": 34 183 | }, 184 | "colab_type": "code", 185 | "id": "BhVYt8V3uAwk", 186 | "outputId": "7c36d302-db66-4837-ff6b-ea1793a088d9" 187 | }, 188 | "outputs": [ 189 | { 190 | "name": "stdout", 191 | "output_type": "stream", 192 | "text": [ 193 | "the stories are excellent\n" 194 | ] 195 | } 196 | ], 197 | "source": [ 198 | "print(output)" 199 | ] 200 | }, 201 | { 202 | "cell_type": "markdown", 203 | "metadata": { 204 | "colab": {}, 205 | "colab_type": "code", 206 | "id": "IWoNJrZfy94n" 207 | }, 208 | "source": [ 209 | "## Cite the paper\n", 
210 | "```\n", 211 | "@article{marivate2019improving,\n", 212 | " title={Improving short text classification through global augmentation methods},\n", 213 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n", 214 | " journal={arXiv preprint arXiv:1907.03752},\n", 215 | " year={2019}\n", 216 | "}```\n", 217 | "\n", 218 | "https://arxiv.org/abs/1907.03752" 219 | ] 220 | }, 221 | { 222 | "cell_type": "code", 223 | "execution_count": null, 224 | "metadata": {}, 225 | "outputs": [], 226 | "source": [] 227 | } 228 | ], 229 | "metadata": { 230 | "accelerator": "GPU", 231 | "colab": { 232 | "collapsed_sections": [], 233 | "name": "word2vec example.ipynb", 234 | "provenance": [] 235 | }, 236 | "kernelspec": { 237 | "display_name": "Python 3", 238 | "language": "python", 239 | "name": "python3" 240 | }, 241 | "language_info": { 242 | "codemirror_mode": { 243 | "name": "ipython", 244 | "version": 3 245 | }, 246 | "file_extension": ".py", 247 | "mimetype": "text/x-python", 248 | "name": "python", 249 | "nbconvert_exporter": "python", 250 | "pygments_lexer": "ipython3", 251 | "version": "3.7.7" 252 | } 253 | }, 254 | "nbformat": 4, 255 | "nbformat_minor": 4 256 | } 257 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | gensim>=4.0 2 | googletrans>=2 3 | nltk 4 | numpy 5 | textblob 6 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | import setuptools 4 | import re 5 | 6 | 7 | def find_version(fname): 8 | """Attempts to find the version number in the file named fname. 9 | Raises RuntimeError if not found. 
10 | """ 11 | version = '' 12 | with open(fname, 'r') as fp: 13 | reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]') 14 | for line in fp: 15 | m = reg.match(line) 16 | if m: 17 | version = m.group(1) 18 | break 19 | if not version: 20 | raise RuntimeError('Cannot find version information') 21 | return version 22 | 23 | 24 | __version__ = find_version('textaugment/__init__.py') 25 | 26 | 27 | def read(fname): 28 | with open(fname, "r") as fh: 29 | content = fh.read() 30 | return content 31 | 32 | 33 | setuptools.setup( 34 | name='textaugment', 35 | version=__version__, 36 | packages=setuptools.find_packages(exclude=('test*', )), 37 | author='Joseph Sefara', 38 | author_email='sefaratj@gmail.com', 39 | license='MIT', 40 | keywords=['text augmentation', 'python', 'natural language processing', 'nlp'], 41 | url='https://github.com/dsfsi/textaugment', 42 | description='A library for augmenting text for natural language processing applications.', 43 | long_description=read("README.md"), 44 | long_description_content_type="text/markdown", 45 | install_requires=['nltk', 'gensim>=4.0', 'textblob', 'numpy', 'googletrans>=2'], 46 | classifiers=[ 47 | "Intended Audience :: Developers", 48 | "Natural Language :: English", 49 | "License :: OSI Approved :: MIT License", 50 | "Operating System :: OS Independent", 51 | "Programming Language :: Python :: 3", 52 | "Programming Language :: Python :: Implementation :: PyPy", 53 | "Topic :: Text Processing :: Linguistic", 54 | ] 55 | ) 56 | -------------------------------------------------------------------------------- /tests/test_translate.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | from textaugment.translate import Translate 4 | from textaugment import translate 5 | 6 | 7 | class InputTestCase(unittest.TestCase): 8 | 9 | def setUp(self): 10 | self.t = Translate(src="en", to="es") 11 | 12 | def test_geometric(self): 13 | with 
self.assertRaises(ValueError, msg="Parameters must be set"): 14 | Translate() 15 | 16 | with self.assertRaises(KeyError, msg="Value of parameters must be correct"): 17 | Translate(to=7, src="hello") # Test parameter, type 18 | 19 | with self.assertRaises(TypeError, msg="Only strings are allowed"): 20 | self.t.augment(45) 21 | 22 | def test_translate(self): 23 | self.assertTrue(translate.LANGUAGES, msg="File exists") 24 | 25 | 26 | class OutputTestCase(unittest.TestCase): 27 | 28 | def setUp(self): 29 | self.t = Translate(src="en", to="es") 30 | self.data = "He walks" 31 | 32 | def test_augment(self): 33 | self.assertEqual(self.t.augment(self.data), self.data) 34 | 35 | self.assertEqual(self.t.augment("4"), "4") 36 | 37 | 38 | class PlatformTestCase(unittest.TestCase): 39 | 40 | def test_platform(self): 41 | self.assertEqual(sys.version_info[0], 3, msg="Must be using Python 3") 42 | 43 | 44 | if __name__ == '__main__': 45 | unittest.main() 46 | 47 | -------------------------------------------------------------------------------- /tests/test_word2vec.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | from textaugment.word2vec import Word2vec 4 | 5 | 6 | class InputTestCase(unittest.TestCase): 7 | 8 | def setUp(self): 9 | self.path = "/home/tjs/dev/papu/models/gensim_cbow_sepedi" 10 | self.wrongpath = "/home/tjs/dev/papu/models/gensim_cbow_sepedi-wrong" 11 | self.w = Word2vec(model=self.path) 12 | 13 | def test_augment(self): 14 | with self.assertRaises(TypeError, msg="Value for p should be float"): 15 | Word2vec(model=self.path, p="foo") 16 | 17 | with self.assertRaises(TypeError, msg="Value for runs should be integer"): 18 | Word2vec(model=self.path, runs="foo") 19 | 20 | with self.assertRaises(FileNotFoundError, msg="The model is not found"): 21 | Word2vec(model=self.wrongpath) 22 | 23 | with self.assertRaises(TypeError, msg="Input should not be lists"): 24 | self.w.augment(["hello"]) 25 | 26
| with self.assertRaises(TypeError, msg="Input should not be numbers"): 27 | self.w.augment(45) 28 | 29 | 30 | class OutputTestCase(unittest.TestCase): 31 | 32 | def setUp(self): 33 | self.path = "/home/tjs/dev/papu/models/gensim_cbow_sepedi" 34 | self.w = Word2vec(model=self.path) 35 | self.data = "We are testing" 36 | 37 | def test_augment(self): 38 | self.assertIsInstance(self.w.augment(self.data), str, msg="Output must be a string") 39 | self.assertEqual(self.w.augment("4"), "4", msg="Input should not be numbers") 40 | 41 | 42 | class PlatformTestCase(unittest.TestCase): 43 | 44 | def test_platform(self): 45 | self.assertEqual(sys.version_info[0], 3, msg="Must be using Python 3") 46 | 47 | 48 | if __name__ == '__main__': 49 | unittest.main() 50 | 51 | -------------------------------------------------------------------------------- /tests/test_wordnet.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | import numpy as np 4 | from textaugment.wordnet import Wordnet 5 | 6 | 7 | class InputTestCase(unittest.TestCase): 8 | def setUp(self): 9 | self.p = 0.8 10 | self.data = ["I", "am", "testing"] 11 | self.w = Wordnet(p=self.p) 12 | 13 | def test_geometric(self): 14 | with self.assertRaises(TypeError, msg="Receives one parameter"): 15 | self.w.geometric(p=self.p, data=self.data) 16 | 17 | with self.assertRaises(TypeError, msg="Receives one parameter"): 18 | self.w.geometric() 19 | 20 | with self.assertRaises(IndexError, msg="Data must be set using; data='data string'"): 21 | self.w.geometric(data=0) 22 | 23 | def test_augment(self): 24 | with self.assertRaises(TypeError, msg="Expect string not list"): 25 | self.w.augment(self.data) 26 | 27 | with self.assertRaises(TypeError, msg="Expect string not integer"): 28 | self.w.augment(data=0) 29 | 30 | 31 | class OutputTestCase(unittest.TestCase): 32 | 33 | def setUp(self): 34 | self.p = 0.8 35 | self.data = ["I", "am", "testing"] 36 | self.data2 = 
#!/usr/bin/env python
# TextAugment: AEDA
#
# Copyright (C) 2023
# Author: Juhwan Choi
#
# URL:
# For license information, see LICENSE
#
"""
This module is an implementation of the original AEDA algorithm (2021) [1].
"""
import random


class AEDA:
    """
    This class is an implementation of the original AEDA algorithm (2021) [1].

    [1] Karimi et al., 2021, November.
    AEDA: An Easier Data Augmentation Technique for Text Classification.
    In Findings of the Association for Computational Linguistics: EMNLP 2021 (pp. 2748-2754).
    https://aclanthology.org/2021.findings-emnlp.234.pdf

    Example usage: ::
        >>> from textaugment import AEDA
        >>> t = AEDA()
        >>> t.punct_insertion("John is going to town")
        ! John is going to town
    """

    # Punctuation marks used when the caller does not supply any.
    DEFAULT_PUNCTUATIONS = ['.', ';', '?', ':', '!', ',']

    @staticmethod
    def validate(**kwargs):
        """Validate input data.

        :raises TypeError: if ``sentence`` is not a non-empty string.
        """
        if 'sentence' in kwargs:
            # Check the type BEFORE calling .strip(): a non-string argument
            # used to raise AttributeError instead of the documented TypeError.
            if not isinstance(kwargs['sentence'], str) or len(kwargs['sentence'].strip()) == 0:
                raise TypeError("sentence must be a valid sentence")

    def __init__(self, punctuations=None, random_state=1):
        """A method to initialize parameters.

        :type punctuations: list
        :param punctuations: (optional) Punctuations to be inserted.
            Defaults to ['.', ';', '?', ':', '!', ','].
        :type random_state: int
        :param random_state: (optional) Seed.

        :rtype: None
        :return: Constructor does not return.
        :raises TypeError: if ``random_state`` is not an int.
        """
        # None default instead of a mutable list default: a shared list
        # default would be aliased by every instance that mutates it.
        self.punctuations = list(self.DEFAULT_PUNCTUATIONS) if punctuations is None else punctuations
        self.random_state = random_state
        if isinstance(self.random_state, int):
            random.seed(self.random_state)
        else:
            raise TypeError("random_state must have type int")

    def punct_insertion(self, sentence: str):
        """Insert random punctuation marks into the sentence.

        Between 1 and len(sentence)/3 punctuation marks (always at least one)
        are inserted at random positions.

        :type sentence: str
        :param sentence: Sentence

        :rtype: str
        :return: Augmented sentence
        """
        self.validate(sentence=sentence)

        words = sentence.strip().split(' ')
        # Fix: for sentences shorter than 3 words len(words) // 3 == 0 and
        # random.randint(1, 0) raised ValueError; clamp the upper bound to 1.
        num_punctuations = random.randint(1, max(1, len(words) // 3))
        augmented = words.copy()

        # Insert random punctuations at random positions.
        for _ in range(num_punctuations):
            punct = random.choice(self.punctuations)          # mark to insert
            pos = random.randint(0, len(augmented) - 1)       # insertion index
            augmented = augmented[:pos] + [punct] + augmented[pos:]

        return ' '.join(augmented)
# Language codes supported by the translation backends, mapped to
# lower-case English language names.
LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'am': 'amharic',
    'ar': 'arabic',
    'hy': 'armenian',
    'az': 'azerbaijani',
    'eu': 'basque',
    'be': 'belarusian',
    'bn': 'bengali',
    'bs': 'bosnian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'ceb': 'cebuano',
    'ny': 'chichewa',
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'co': 'corsican',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'fy': 'frisian',
    'gl': 'galician',
    'ka': 'georgian',
    'de': 'german',
    'el': 'greek',
    'gu': 'gujarati',
    'ht': 'haitian creole',
    'ha': 'hausa',
    'haw': 'hawaiian',
    'iw': 'hebrew',
    'hi': 'hindi',
    'hmn': 'hmong',
    'hu': 'hungarian',
    'is': 'icelandic',
    'ig': 'igbo',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'jw': 'javanese',
    'kn': 'kannada',
    'kk': 'kazakh',
    'km': 'khmer',
    'ko': 'korean',
    'ku': 'kurdish (kurmanji)',
    'ky': 'kyrgyz',
    'lo': 'lao',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'lb': 'luxembourgish',
    'mk': 'macedonian',
    'mg': 'malagasy',
    'ms': 'malay',
    'ml': 'malayalam',
    'mt': 'maltese',
    'mi': 'maori',
    'mr': 'marathi',
    'mn': 'mongolian',
    'my': 'myanmar (burmese)',
    'ne': 'nepali',
    'no': 'norwegian',
    'ps': 'pashto',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'pa': 'punjabi',
    'ro': 'romanian',
    'ru': 'russian',
    'sm': 'samoan',
    'gd': 'scots gaelic',
    'sr': 'serbian',
    'st': 'sesotho',
    'sn': 'shona',
    'sd': 'sindhi',
    'si': 'sinhala',
    'sk': 'slovak',
    'sl': 'slovenian',
    'so': 'somali',
    'es': 'spanish',
    'su': 'sundanese',
    'sw': 'swahili',
    'sv': 'swedish',
    'tg': 'tajik',
    'ta': 'tamil',
    'te': 'telugu',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'ur': 'urdu',
    'uz': 'uzbek',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'xh': 'xhosa',
    'yi': 'yiddish',
    'yo': 'yoruba',
    'zu': 'zulu',
    'fil': 'Filipino',
    'he': 'Hebrew'
}

# Reverse lookup: language name -> code.  Note the capitalised duplicates
# ('Filipino', 'Hebrew') coexist with lower-case entries under other codes.
LANGCODES = {name: code for code, name in LANGUAGES.items()}
LANGUAGES.items())) 111 | -------------------------------------------------------------------------------- /textaugment/eda.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # TextAugment: EDA 3 | # 4 | # Copyright (C) 2018-2023 5 | # Author: Joseph Sefara 6 | # 7 | # URL: 8 | # For license information, see LICENSE 9 | # 10 | """ 11 | This module is an implementation of the original EDA algorithm (2019) [1]. 12 | """ 13 | import nltk 14 | from nltk.corpus import wordnet, stopwords 15 | import random 16 | 17 | 18 | class EDA: 19 | """ 20 | This class is an implementation of the original EDA algorithm (2019) [1]. 21 | 22 | [1] Wei, J. and Zou, K., 2019, November. EDA: Easy Data Augmentation Techniques for Boosting Performance on 23 | Text Classification Tasks. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing 24 | and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) (pp. 6383-6389). 
25 | https://www.aclweb.org/anthology/D19-1670.pdf 26 | 27 | Example usage: :: 28 | >>> from textaugment import EDA 29 | >>> t = EDA() 30 | >>> t.synonym_replacement("John is going to town",top_n=3) 31 | John is give out to town 32 | >>> t.random_deletion("John is going to town", p=0.2) 33 | is going to town 34 | >>> t.random_swap("John is going to town") 35 | John town going to is 36 | >>> t.random_insertion("John is going to town") 37 | John is going to make up town 38 | """ 39 | 40 | @staticmethod 41 | def _get_synonyms(word): 42 | """Generate synonym""" 43 | synonyms = set() 44 | for syn in wordnet.synsets(word): 45 | for lemma in syn.lemmas(): 46 | synonym = lemma.name().replace("_", " ").replace("-", " ").lower() 47 | synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm']) 48 | synonyms.add(synonym) 49 | if word in synonyms: 50 | synonyms.remove(word) 51 | synonyms = sorted(list(synonyms)) 52 | random.shuffle(synonyms) 53 | return synonyms 54 | 55 | 56 | @staticmethod 57 | def swap_word(new_words): 58 | """Swap words""" 59 | random_idx_1 = random.randint(0, len(new_words) - 1) 60 | random_idx_2 = random_idx_1 61 | counter = 0 62 | while random_idx_2 == random_idx_1: 63 | random_idx_2 = random.randint(0, len(new_words) - 1) 64 | counter += 1 65 | if counter > 3: 66 | return new_words 67 | new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1] 68 | return new_words 69 | 70 | @staticmethod 71 | def validate(**kwargs): 72 | """Validate input data""" 73 | 74 | if 'p' in kwargs: 75 | if kwargs['p'] > 1 or kwargs['p'] < 0: 76 | raise TypeError("p must be a fraction between 0 and 1") 77 | if 'sentence' in kwargs: 78 | if not isinstance(kwargs['sentence'].strip(), str) or len(kwargs['sentence'].strip()) == 0: 79 | raise TypeError("sentence must be a valid sentence") 80 | if 'n' in kwargs: 81 | if not isinstance(kwargs['n'], int): 82 | raise TypeError("n must be a valid integer") 83 | 84 | def 
__init__(self, stop_words=None, random_state=1): 85 | """A method to initialize parameters 86 | 87 | :type random_state: int 88 | :param random_state: (optional) Seed 89 | :type stop_words: list 90 | :param stop_words: (optional) List of stopwords 91 | 92 | :rtype: None 93 | :return: Constructer do not return. 94 | """ 95 | self.stopwords = stopwords.words('english') if stop_words is None else stop_words 96 | self.sentence = None 97 | self.p = None 98 | self.n = None 99 | self.random_state = random_state 100 | if isinstance(self.random_state, int): 101 | random.seed(self.random_state) 102 | else: 103 | raise TypeError("random_state must have type int") 104 | 105 | def add_word(self, new_words): 106 | """Insert word""" 107 | synonyms = list() 108 | counter = 0 109 | while len(synonyms) < 1: 110 | random_word_list = list([word for word in new_words if word not in self.stopwords]) 111 | random_word = random_word_list[random.randint(0, len(random_word_list) - 1)] 112 | synonyms = self._get_synonyms(random_word) 113 | counter += 1 114 | if counter >= 10: 115 | return new_words # See Issue 14 for details 116 | random_synonym = synonyms[0] # TODO 117 | random_idx = random.randint(0, len(new_words) - 1) 118 | new_words.insert(random_idx, random_synonym) 119 | return new_words 120 | 121 | # def synonym_replacement_top_n(self, 122 | # sentence: str, 123 | # n: int = 1, 124 | # top_n: int = None, 125 | # stopwords: list = None, 126 | # lang: str = 'eng'): 127 | # 128 | # """Replace n words in the sentence with top_n synonyms from wordnet 129 | # 130 | # :type sentence: str 131 | # :param sentence: Sentence 132 | # :type n: int 133 | # :param n: Number of repetitions to replace 134 | # :type top_n: int 135 | # :param top_n: top_n of synonyms to randomly choose from 136 | # :type stopwords: list 137 | # :param stopwords: stopwords 138 | # :type lang: str 139 | # :param lang: lang 140 | # 141 | # :rtype: str 142 | # :return: Augmented sentence 143 | # """ 144 | # 145 | # 
stopwords = stopwords if stopwords else self.stopwords 146 | # 147 | # def get_synonyms(w, pos): 148 | # morphy_tag = { 149 | # 'NN': wordnet.NOUN, 150 | # 'JJ': wordnet.ADJ, 151 | # 'VB': wordnet.VERB, 152 | # 'RB': wordnet.ADV 153 | # } 154 | # for sunset in wordnet.synsets(w, 155 | # lang=lang, 156 | # pos=morphy_tag[pos[:2]] if pos[:2] in morphy_tag else None): 157 | # for lemma in sunset.lemmas(lang=lang): 158 | # yield lemma.name() 159 | # 160 | # new_words = list() 161 | # for index, (word, tag) in enumerate(nltk.pos_tag(nltk.word_tokenize(sentence))): 162 | # synonyms = sorted(set(synonym for synonym in get_synonyms(word, tag) if synonym != word)) 163 | # synonyms = synonyms[:top_n if top_n else len(synonyms)] 164 | # new_words.append({ 165 | # "index": index, 166 | # "word": word, 167 | # "new_word": random.choice(synonyms) if len(synonyms) > 0 else "", 168 | # "synonyms": synonyms, 169 | # "in_stopwords": word in stopwords 170 | # }) 171 | # 172 | # replaced_index = random.choices([word["index"] for word in new_words 173 | # if not word["in_stopwords"] and len(word["synonyms"]) > 0], k=n) 174 | # 175 | # return ' '.join([word["new_word" if word["index"] in replaced_index else "word"] for word in new_words]) 176 | 177 | def synonym_replacement(self, sentence: str, n: int = 1, top_n: int = None): 178 | """Replace n words in the sentence with synonyms from wordnet 179 | 180 | :type sentence: str 181 | :param sentence: Sentence 182 | :type n: int 183 | :param n: Number of repetitions to replace 184 | :type top_n: int 185 | :param top_n: top_n of synonyms to randomly choose from 186 | 187 | :rtype: str 188 | :return: Augmented sentence 189 | """ 190 | self.validate(sentence=sentence, n=n) 191 | self.n = n 192 | self.sentence = sentence 193 | words = sentence.split() 194 | new_words = words.copy() 195 | random_word_list = sorted(set([word for word in words if word not in self.stopwords])) 196 | random.shuffle(random_word_list) 197 | replaced = 0 198 | for 
random_word in random_word_list: 199 | synonyms = self._get_synonyms(random_word) 200 | if len(synonyms) > 0: 201 | synonyms = synonyms[:top_n if top_n else len(synonyms)] # use top n or all synonyms 202 | synonym = random.choice(synonyms) 203 | new_words = [synonym if word == random_word else word for word in new_words] 204 | replaced += 1 205 | if replaced >= self.n: 206 | break 207 | sentence = ' '.join(new_words) 208 | 209 | return sentence 210 | 211 | def random_deletion(self, sentence: str, p: float = 0.1): 212 | """Randomly delete words from the sentence with probability p 213 | 214 | :type sentence: str 215 | :param sentence: Sentence 216 | :type p: int 217 | :param p: Probability between 0 and 1 218 | 219 | :rtype: str 220 | :return: Augmented sentence 221 | """ 222 | self.validate(sentence=sentence, p=p) 223 | self.p = p 224 | self.sentence = sentence 225 | words = sentence.split() 226 | if len(words) == 1: 227 | return words[0] 228 | new_words = list() 229 | for word in words: 230 | r = random.uniform(0, 1) 231 | if r > self.p: 232 | new_words.append(word) 233 | # if all words are deleted, just return a random word 234 | if len(new_words) == 0: 235 | return random.choice(words) 236 | 237 | return " ".join(new_words) 238 | 239 | def random_swap(self, sentence: str, n: int = 1): 240 | """Randomly swap two words in the sentence n times 241 | 242 | :type sentence: str 243 | :param sentence: Sentence 244 | :type n: int 245 | :param n: Number of repetitions to swap 246 | 247 | :rtype: str 248 | :return: Augmented sentence 249 | """ 250 | self.validate(sentence=sentence, n=n) 251 | self.n = n 252 | self.sentence = sentence 253 | words = sentence.split() 254 | new_words = words.copy() 255 | for _ in range(self.n): 256 | new_words = self.swap_word(new_words) 257 | return " ".join(new_words) 258 | 259 | def random_insertion(self, sentence: str, n: int = 1): 260 | """Randomly insert n words into the sentence 261 | 262 | :type sentence: str 263 | :param sentence: 
#!/usr/bin/env python
# TextAugment: mixup
#
# Copyright (C) 2018-2023
# Authors: Joseph Sefara, Vukosi Marivate
#
# URL:
# For license information, see LICENSE
import numpy as np
import random


class MIXUP:
    """
    This class implements the mixup algorithm [1] for natural language processing.

    [1] Zhang, Hongyi, Moustapha Cisse, Yann N. Dauphin, and David Lopez-Paz. "mixup: Beyond empirical risk
    minimization." in International Conference on Learning Representations (2018).
    https://openreview.net/forum?id=r1Ddp1-Rb
    """

    @staticmethod
    def validate(**kwargs):
        """Validate input data types; raise TypeError on unsupported types.

        Lists are accepted for ``data``/``labels`` and converted to arrays by
        the caller (the old in-place conversion here was discarded, which made
        list inputs crash later on fancy indexing -- see flow()).
        """
        if 'data' in kwargs:
            if not isinstance(kwargs['data'], (list, np.ndarray)):
                raise TypeError("data must be numpy array. Found " + str(type(kwargs['data'])))
        if 'labels' in kwargs:
            if not isinstance(kwargs['labels'], (list, np.ndarray, type(None))):
                raise TypeError("labels must be numpy array. Found " + str(type(kwargs['labels'])))
        if 'batch_size' in kwargs:
            if not isinstance(kwargs['batch_size'], int):
                raise TypeError("batch_size must be a valid integer. Found " + str(type(kwargs['batch_size'])))
        if 'shuffle' in kwargs:
            if not isinstance(kwargs['shuffle'], bool):
                raise TypeError("shuffle must be a boolean. Found " + str(type(kwargs['shuffle'])))
        if 'runs' in kwargs:
            if not isinstance(kwargs['runs'], int):
                raise TypeError("runs must be a valid integer. Found " + str(type(kwargs['runs'])))

    def __init__(self, random_state=1, runs=1):
        """Initialize seed and number of augmentation runs.

        :type random_state: int
        :param random_state: (optional) Seed for random and numpy.random.
        :type runs: int
        :param runs: (optional) Number of mixup passes per batch.
        :raises TypeError: if ``random_state`` is not an int.
        """
        self.random_state = random_state
        self.runs = runs
        if isinstance(self.random_state, int):
            random.seed(self.random_state)
            np.random.seed(self.random_state)
        else:
            raise TypeError("random_state must have type int")

    def mixup_data(self, x, y=None, alpha=0.2):
        """This method performs mixup. If runs = 1 it just does 1 mixup with whole batch, any n of runs
        creates many mixup matches.

        :type x: Numpy array
        :param x: Data array
        :type y: Numpy array
        :param y: (optional) labels
        :type alpha: float
        :param alpha: Beta-distribution parameter for the mixing weights

        :rtype: tuple
        :return: Mixed inputs (and mixed targets when ``y`` is given).
        """
        if self.runs is None:
            self.runs = 1
        output_x = []
        output_y = []
        batch_size = x.shape[0]
        for i in range(self.runs):
            # One mixing weight per sample, drawn from Beta(alpha, alpha).
            lam_vector = np.random.beta(alpha, alpha, batch_size)
            index = np.random.permutation(batch_size)
            mixed_x = (x.T * lam_vector).T + (x[index, :].T * (1.0 - lam_vector)).T
            output_x.append(mixed_x)
            if y is None:
                return np.concatenate(output_x, axis=0)
            mixed_y = (y.T * lam_vector).T + (y[index].T * (1.0 - lam_vector)).T
            output_y.append(mixed_y)
        return np.concatenate(output_x, axis=0), np.concatenate(output_y, axis=0)

    def flow(self, data, labels=None, batch_size=32, shuffle=True, runs=1):
        """This function implements the batch iterator and specifically calls mixup.

        :param data: Input data. Numpy ndarray or list of lists.
        :param labels: Labels. Numpy ndarray or list of lists.
        :param batch_size: Int (default: 32).
        :param shuffle: Boolean (default: True).
        :param runs: Int (default: 1). Number of augmentations

        :rtype: tuple
        :return: (batch generator, number of batches per epoch)."""

        self.validate(data=data, labels=labels, batch_size=batch_size, shuffle=shuffle, runs=runs)

        # Fix: actually convert list inputs to arrays. validate() used to
        # convert only its local kwargs copy, so a list crashed below on
        # data[shuffle_indices] despite being documented as supported.
        data = np.asarray(data)
        if labels is not None:
            labels = np.asarray(labels)

        self.runs = runs

        num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1

        def data_generator():
            data_size = len(data)
            while True:
                # Shuffle the data at each epoch.
                if shuffle:
                    shuffle_indices = np.random.permutation(np.arange(data_size))
                    shuffled_data = data[shuffle_indices]
                    if labels is not None:
                        shuffled_labels = labels[shuffle_indices]
                else:
                    shuffled_data = data
                    if labels is not None:
                        shuffled_labels = labels
                for batch_num in range(num_batches_per_epoch):
                    start_index = batch_num * batch_size
                    end_index = min((batch_num + 1) * batch_size, data_size)
                    X = shuffled_data[start_index: end_index]
                    if labels is None:
                        X = self.mixup_data(X, y=None)
                        yield X
                    else:
                        y = shuffled_labels[start_index: end_index]
                        X, y = self.mixup_data(X, y)
                        yield X, y

        return data_generator(), num_batches_per_epoch
#!/usr/bin/env python
# Translation-based data augmentation
#
# Copyright (C) 2020
# Author: Joseph Sefara
# URL:
# For license information, see LICENSE

from .constants import LANGUAGES
from textblob import TextBlob
from textblob.translate import NotTranslated
from googletrans import Translator


class Translate:
    """
    Paraphrase text by round-trip (back) translation.

    The text is translated from ``src`` to ``to`` and then back to ``src``;
    the result usually differs slightly from the input, which makes it usable
    as augmented data.  Supported language codes are the keys of
    ``textaugment.constants.LANGUAGES`` (e.g. ``en``, ``es``, ``fr``).

    Example usage: ::
        >>> from textaugment import Translate
        >>> t = Translate(src="en", to="es")
        >>> t.augment('I love school')
        i adore school
    """

    def __init__(self, **kwargs):
        """A method to initialize parameters.

        :type src: str
        :param src: Source language of the text.
        :type to: str
        :param to: Destination language to translate to. The language should be a
            family of the source language for better results. The text will then
            be translated back to the source language.
        :rtype: None
        :return: Constructor does not return.
        :raises ValueError: if ``src`` or ``to`` is missing.
        :raises KeyError: if a language code is not supported.
        """
        try:
            if "to" not in kwargs:
                raise ValueError("'to' missing")
            elif "src" not in kwargs:
                raise ValueError("'src' missing")
            if kwargs['to'] not in LANGUAGES:
                raise KeyError("Value of to is not supported. See help(Translate)")
            if kwargs['src'] not in LANGUAGES:
                raise KeyError("Value of src is not supported. See help(Translate)")
        except (ValueError, KeyError):
            print("The values of the keys 'to' and 'src' are required. E.g Translate(src='en', to='es')")
            raise
        else:
            self.to = kwargs['to']
            self.src = kwargs['src']

    def augment(self, data):
        """A method to paraphrase a sentence via back-translation.

        :type data: str
        :param data: sentence used for data augmentation
        :rtype: str
        :return: The augmented (lower-cased) sentence.
        :raises TypeError: if ``data`` is not a string.
        """
        if not isinstance(data, str):
            raise TypeError("DataType must be a string")
        data = TextBlob(data.lower())
        try:
            data = data.translate(from_lang=self.src, to=self.to)
            data = data.translate(from_lang=self.to, to=self.src)
        except NotTranslated:
            try:  # Fall back to googletrans when TextBlob cannot translate.
                translator = Translator()
                # Fix: googletrans expects a plain string; ``data`` is still a
                # TextBlob at this point.
                text = translator.translate(str(data), dest=self.to, src=self.src).text
                data = translator.translate(text, dest=self.src, src=self.to).text
            except Exception:
                print("Error Not translated.\n")
                raise

        return str(data).lower()
(0.1>> from textaugment import Fasttext 162 | >>> t = Fasttext('path/to/gensim/model'or 'gensim model itself') 163 | >>> t.augment('I love school', top_n=10) 164 | i adore school 165 | """ 166 | pass 167 | -------------------------------------------------------------------------------- /textaugment/wordnet.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # WordNet-based data augmentation 3 | # 4 | # Copyright (C) 2023 5 | # Author: Joseph Sefara 6 | # URL: 7 | # For license information, see LICENSE 8 | 9 | import numpy as np 10 | import nltk 11 | from itertools import chain 12 | from nltk.corpus import wordnet 13 | 14 | 15 | class Wordnet: 16 | """ 17 | A set of functions used to augment data. 18 | 19 | Typical usage: :: 20 | >>> import nltk 21 | >>> nltk.download('punkt') 22 | >>> nltk.download('wordnet') 23 | >>> nltk.download('averaged_perceptron_tagger') 24 | >>> from textaugment import Wordnet 25 | >>> t = Wordnet(v=True,n=True,p=0.5) 26 | >>> t.augment('I love school') 27 | i adore school 28 | """ 29 | 30 | def __init__(self, **kwargs): 31 | """ 32 | A method to initialize parameters 33 | 34 | :type random_state: int 35 | :param random_state: seed 36 | :type v: bool 37 | :param v: Verb, default is True 38 | :type n: bool 39 | :param n: Noun 40 | :type runs: int 41 | :param runs: Number of repetition on single text 42 | :type p: float, optional 43 | :param p: The probability of success of an individual trial. 
(0.1= 1: # There are synonyms 126 | for word in words: 127 | synonyms1 = wordnet.synsets(word[1], wordnet.VERB, lang=lang) # Return verbs only 128 | synonyms = list(set(chain.from_iterable([syn.lemma_names(lang=lang) for syn in synonyms1]))) 129 | synonyms_ = [] # Synonyms with no underscores goes here 130 | for w in synonyms: 131 | if '_' not in w: 132 | synonyms_.append(w) # Remove words with underscores 133 | if len(synonyms_) >= 1: 134 | synonyms_ = synonyms_[:top_n if top_n else len(synonyms_)] # use top n or all synonyms 135 | synonym = self.geometric(data=synonyms_).tolist() 136 | if synonym: # There is a synonym 137 | data[int(word[0])] = synonym[0].lower() # Take the first success 138 | 139 | if self.n: 140 | for loop in range(self.runs): 141 | words = [[i, x] for i, x, y in data_tokens if y[0] == 'N'] 142 | words = [i for i in self.geometric(data=words)] # List of selected words 143 | if len(words) >= 1: # There are synonyms 144 | for word in words: 145 | synonyms1 = wordnet.synsets(word[1], wordnet.NOUN, lang=lang) # Return nouns only 146 | synonyms = list(set(chain.from_iterable([syn.lemma_names(lang=lang) for syn in synonyms1]))) 147 | synonyms_ = [] # Synonyms with no underscores goes here 148 | for w in synonyms: 149 | if '_' not in w: 150 | synonyms_.append(w) # Remove words with underscores 151 | if len(synonyms_) >= 1: 152 | synonyms_ = synonyms_[:top_n if top_n else len(synonyms_)] # use top n or all synonyms 153 | synonym = self.geometric(data=synonyms_).tolist() 154 | if synonym: # There is a synonym 155 | data[int(word[0])] = synonym[0].lower() # Take the first success 156 | 157 | return " ".join(data) 158 | 159 | def augment(self, data, lang="eng", top_n=10): 160 | """ 161 | Data augmentation for text. Generate new dataset based on verb/nouns synonyms. 
162 | 163 | :type data: str 164 | :param data: sentence used for data augmentation 165 | :rtype: str 166 | :return: The augmented data 167 | :type lang: str 168 | :param lang: choose lang 169 | :type top_n: int 170 | :param top_n: top_n of synonyms to randomly choose from 171 | 172 | :rtype: str 173 | :return: The augmented data 174 | """ 175 | # Error handling 176 | if type(data) is not str: 177 | raise TypeError("Only strings are supported") 178 | if type(lang) is not str: 179 | raise TypeError("Only strings are supported") 180 | if type(top_n) is not int: 181 | raise TypeError("Only integers are supported") 182 | 183 | data = self.replace(data, lang, top_n) 184 | return data 185 | --------------------------------------------------------------------------------