├── .gitignore
├── LICENCE
├── README.md
├── augment.png
├── examples
├── aeda_example.ipynb
├── eda_example.ipynb
├── fasttext_example.ipynb
├── mixup_example_using_IMDB_sentiment.ipynb
└── word2vec_example.ipynb
├── requirements.txt
├── setup.py
├── tests
├── test_translate.py
├── test_word2vec.py
└── test_wordnet.py
└── textaugment
├── __init__.py
├── aeda.py
├── constants.py
├── eda.py
├── mixup.py
├── translate.py
├── word2vec.py
└── wordnet.py
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
105 |
--------------------------------------------------------------------------------
/LICENCE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Joseph Sefara
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 |
2 |
3 | # [TextAugment: Improving Short Text Classification through Global Augmentation Methods](https://arxiv.org/abs/1907.03752)
4 |
5 | [](https://github.com/dsfsi/textaugment/blob/master/LICENCE) [](https://github.com/dsfsi/textaugment/releases) [](https://pypi.python.org/pypi/textaugment) [](https://pypi.org/project/textaugment/) [](https://pypi.org/project/textaugment/) [](https://pypi.org/project/textaugment/) [](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21) [](https://arxiv.org/abs/1907.03752)
6 |
7 |
8 | ## You have just found TextAugment.
9 |
10 | TextAugment is a Python 3 library for augmenting text for natural language processing applications. TextAugment stands on the giant shoulders of [NLTK](https://www.nltk.org/), [Gensim v3.x](https://radimrehurek.com/gensim/), and [TextBlob](https://textblob.readthedocs.io/) and plays nicely with them.
11 |
12 | ## Acknowledgements
13 | Cite this [paper](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21) when using this library. [Arxiv Version](https://arxiv.org/abs/1907.03752)
14 |
15 | ```
16 | @inproceedings{marivate2020improving,
17 | title={Improving short text classification through global augmentation methods},
18 | author={Marivate, Vukosi and Sefara, Tshephisho},
19 | booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction},
20 | pages={385--399},
21 | year={2020},
22 | organization={Springer}
23 | }
24 | ```
25 |
26 | # Table of Contents
27 |
28 | - [Features](#Features)
29 | - [Citation Paper](#citation-paper)
30 | - [Requirements](#Requirements)
31 | - [Installation](#Installation)
32 | - [How to use](#How-to-use)
33 | - [Word2vec-based augmentation](#Word2vec-based-augmentation)
34 | - [WordNet-based augmentation](#WordNet-based-augmentation)
35 | - [RTT-based augmentation](#RTT-based-augmentation)
36 | - [Easy data augmentation (EDA)](#eda-easy-data-augmentation-techniques-for-boosting-performance-on-text-classification-tasks)
37 | - [An easier data augmentation (AEDA)](#aeda-an-easier-data-augmentation-technique-for-text-classification)
38 | - [Mixup augmentation](#mixup-augmentation)
39 | - [Implementation](#Implementation)
40 | - [Acknowledgements](#Acknowledgements)
41 |
42 | ## Features
43 |
44 | - Generate synthetic data for improving model performance without manual effort
45 | - Simple, lightweight, easy-to-use library.
46 | - Plug and play to any machine learning frameworks (e.g. PyTorch, TensorFlow, Scikit-learn)
47 | - Supports textual data
48 |
49 | ## Citation Paper
50 |
51 | **[Improving short text classification through global augmentation methods](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21)**.
52 |
53 |
54 |
55 | 
56 |
57 | ### Requirements
58 |
59 | * Python 3
60 |
61 | The following software packages are dependencies and will be installed automatically.
62 |
63 | ```shell
64 | $ pip install numpy nltk gensim==3.8.3 textblob googletrans
65 |
66 | ```
67 | The following code downloads NLTK corpus for [wordnet](http://www.nltk.org/howto/wordnet.html).
68 | ```python
69 | nltk.download('wordnet')
70 | ```
71 | The following code downloads [NLTK tokenizer](https://www.nltk.org/_modules/nltk/tokenize/punkt.html). This tokenizer divides a text into a list of sentences by using an unsupervised algorithm to build a model for abbreviation words, collocations, and words that start sentences.
72 | ```python
73 | nltk.download('punkt')
74 | ```
75 | The following code downloads default [NLTK part-of-speech tagger](https://www.nltk.org/_modules/nltk/tag.html) model. A part-of-speech tagger processes a sequence of words, and attaches a part of speech tag to each word.
76 | ```python
77 | nltk.download('averaged_perceptron_tagger')
78 | ```
79 | Use gensim to load a pre-trained word2vec model. Like [Google News from Google drive](https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit).
80 | ```python
81 | import gensim
82 | model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)
83 | ```
84 | You can also use gensim to load Facebook's Fasttext [English](https://fasttext.cc/docs/en/english-vectors.html) and [Multilingual models](https://fasttext.cc/docs/en/crawl-vectors.html)
85 | ```
86 | import gensim
87 | model = gensim.models.fasttext.load_facebook_model('./cc.en.300.bin.gz')
88 | ```
89 |
90 | Or training one from scratch using your data or the following public dataset:
91 |
92 | - [Text8 Wiki](http://mattmahoney.net/dc/enwik9.zip)
93 |
94 | - [Dataset from "One Billion Word Language Modeling Benchmark"](http://www.statmt.org/lm-benchmark/1-billion-word-language-modeling-benchmark-r13output.tar.gz)
95 |
96 | ### Installation
97 |
98 | Install from pip [Recommended]
99 | ```sh
100 | $ pip install textaugment
101 | or install latest release
102 | $ pip install git+git@github.com:dsfsi/textaugment.git
103 | ```
104 |
105 | Install from source
106 | ```sh
107 | $ git clone git@github.com:dsfsi/textaugment.git
108 | $ cd textaugment
109 | $ python setup.py install
110 | ```
111 |
112 | ### How to use
113 |
114 | There are four types of augmentations which can be used:
115 |
116 | - word2vec
117 |
118 | ```python
119 | from textaugment import Word2vec
120 | ```
121 | - fasttext
122 |
123 | ```python
124 | from textaugment import Fasttext
125 | ```
126 |
127 | - wordnet
128 | ```python
129 | from textaugment import Wordnet
130 | ```
131 | - translate (This will require internet access)
132 | ```python
133 | from textaugment import Translate
134 | ```
135 | #### Fasttext/Word2vec-based augmentation
136 |
137 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/word2vec_example.ipynb)
138 |
139 | **Basic example**
140 |
141 | ```python
142 | >>> from textaugment import Word2vec, Fasttext
143 | >>> t = Word2vec(model='path/to/gensim/model' or 'gensim model itself')
144 | >>> t.augment('The stories are good')
145 | The films are good
146 | >>> t = Fasttext(model='path/to/gensim/model' or 'gensim model itself')
147 | >>> t.augment('The stories are good')
148 | The films are good
149 | ```
150 | **Advanced example**
151 |
152 | ```python
153 | >>> runs = 1 # By default.
154 | >>> v = False # verbose mode to replace all the words. If enabled runs is not effective. Used in this paper (https://www.cs.cmu.edu/~diyiy/docs/emnlp_wang_2015.pdf)
155 | >>> p = 0.5 # The probability of success of an individual trial. (0.1 < p < 1.0)
156 | >>> word = Word2vec(model='path/to/gensim/model' or 'gensim model itself', runs=5, v=False, p=0.5)
158 | >>> word.augment('The stories are good', top_n=10)
159 | The movies are excellent
160 | >>> fast = Fasttext(model='path/to/gensim/model' or 'gensim model itself', runs=5, v=False, p=0.5)
161 | >>> fast.augment('The stories are good', top_n=10)
162 | The movies are excellent
163 | ```
164 | #### WordNet-based augmentation
165 | **Basic example**
166 | ```python
167 | >>> import nltk
168 | >>> nltk.download('punkt')
169 | >>> nltk.download('wordnet')
170 | >>> from textaugment import Wordnet
171 | >>> t = Wordnet()
172 | >>> t.augment('In the afternoon, John is going to town')
173 | In the afternoon, John is walking to town
174 | ```
175 | **Advanced example**
176 |
177 | ```python
178 | >>> v = True # enable verbs augmentation. By default is True.
179 | >>> n = False # enable nouns augmentation. By default is False.
180 | >>> runs = 1 # number of times to augment a sentence. By default is 1.
181 | >>> p = 0.5 # The probability of success of an individual trial. (0.1 < p < 1.0)
182 | >>> t = Wordnet(v=False, n=True, p=0.5)
184 | >>> t.augment('In the afternoon, John is going to town', top_n=10)
185 | In the afternoon, Joseph is going to town.
186 | ```
187 | #### RTT-based augmentation
188 | **Example**
189 | ```python
190 | >>> src = "en" # source language of the sentence
191 | >>> to = "fr" # target language
192 | >>> from textaugment import Translate
193 | >>> t = Translate(src="en", to="fr")
194 | >>> t.augment('In the afternoon, John is going to town')
195 | In the afternoon John goes to town
196 | ```
197 | # EDA: Easy data augmentation techniques for boosting performance on text classification tasks
198 | ## This is the implementation of EDA by Jason Wei and Kai Zou.
199 |
200 | https://www.aclweb.org/anthology/D19-1670.pdf
201 |
202 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/eda_example.ipynb)
203 |
204 | #### Synonym Replacement
205 | Randomly choose *n* words from the sentence that are not stop words. Replace each of these words with
206 | one of its synonyms chosen at random.
207 |
208 | **Basic example**
209 | ```python
210 | >>> from textaugment import EDA
211 | >>> t = EDA()
212 | >>> t.synonym_replacement("John is going to town", top_n=10)
213 | John is give out to town
214 | ```
215 |
216 | #### Random Deletion
217 | Randomly remove each word in the sentence with probability *p*.
218 |
219 | **Basic example**
220 | ```python
221 | >>> from textaugment import EDA
222 | >>> t = EDA()
223 | >>> t.random_deletion("John is going to town", p=0.2)
224 | is going to town
225 | ```
226 |
227 | #### Random Swap
228 | Randomly choose two words in the sentence and swap their positions. Do this n times.
229 |
230 | **Basic example**
231 | ```python
232 | >>> from textaugment import EDA
233 | >>> t = EDA()
234 | >>> t.random_swap("John is going to town")
235 | John town going to is
236 | ```
237 |
238 | #### Random Insertion
239 | Find a random synonym of a random word in the sentence that is not a stop word. Insert that synonym into a random position in the sentence. Do this n times
240 |
241 | **Basic example**
242 | ```python
243 | >>> from textaugment import EDA
244 | >>> t = EDA()
245 | >>> t.random_insertion("John is going to town")
246 | John is going to make up town
247 | ```
248 |
249 | # AEDA: An easier data augmentation technique for text classification
250 |
251 | This is the implementation of AEDA by Karimi et al, a variant of EDA. It is based on the random insertion of punctuation marks.
252 |
253 | https://aclanthology.org/2021.findings-emnlp.234.pdf
254 |
255 | ## Implementation
256 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/aeda_example.ipynb)
257 |
258 | #### Random Insertion of Punctuation Marks
259 |
260 | **Basic example**
261 | ```python
262 | >>> from textaugment import AEDA
263 | >>> t = AEDA()
264 | >>> t.punct_insertion("John is going to town")
265 | ! John is going to town
266 | ```
267 |
268 | # Mixup augmentation
269 |
270 | This is the implementation of mixup augmentation by [Hongyi Zhang, Moustapha Cisse, Yann Dauphin, David Lopez-Paz](https://openreview.net/forum?id=r1Ddp1-Rb) adapted to NLP.
271 |
272 | Used in [Augmenting Data with Mixup for Sentence Classification: An Empirical Study](https://arxiv.org/abs/1905.08941).
273 |
274 | Mixup is a generic and straightforward data augmentation principle. In essence, mixup trains a neural network on convex combinations of pairs of examples and their labels. By doing so, mixup regularises the neural network to favour simple linear behaviour in-between training examples.
275 |
276 | ## Implementation
277 |
278 | [See this notebook for an example](https://github.com/dsfsi/textaugment/blob/master/examples/mixup_example_using_IMDB_sentiment.ipynb)
279 |
280 | ## Built with ❤ on
281 | * [Python](http://python.org/)
282 |
283 | ## Authors
284 | * [Joseph Sefara](https://za.linkedin.com/in/josephsefara) (http://www.speechtech.co.za)
285 | * [Vukosi Marivate](http://www.vima.co.za) (http://www.vima.co.za)
286 |
287 | ## Acknowledgements
288 | Cite this [paper](https://link.springer.com/chapter/10.1007%2F978-3-030-57321-8_21) when using this library. [Arxiv Version](https://arxiv.org/abs/1907.03752)
289 |
290 | ```
291 | @inproceedings{marivate2020improving,
292 | title={Improving short text classification through global augmentation methods},
293 | author={Marivate, Vukosi and Sefara, Tshephisho},
294 | booktitle={International Cross-Domain Conference for Machine Learning and Knowledge Extraction},
295 | pages={385--399},
296 | year={2020},
297 | organization={Springer}
298 | }
299 | ```
300 |
301 | ## Licence
302 | MIT licensed. See the bundled [LICENCE](https://github.com/dsfsi/textaugment/blob/master/LICENCE) file for more details.
303 |
--------------------------------------------------------------------------------
/augment.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/dsfsi/textaugment/02c63e07f0b4dcdf95d9700722509e1512963d6a/augment.png
--------------------------------------------------------------------------------
/examples/aeda_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# AEDA example"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "try:\n",
17 | " from textaugment import AEDA\n",
18 | "except ModuleNotFoundError:\n",
19 | " !pip install textaugment\n",
20 | " from textaugment import AEDA"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "t = AEDA(random_state=1)"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "## Punctuation Insertion\n",
37 | "1. Randomly select the amount of punctuation to be inserted, between 1 and 1/3 of the length of the sentence.\n",
38 | "2. Randomly select the punctuation to be inserted.\n",
39 | "3. Randomly select the position of the punctuation to be inserted.\n",
40 | "4. Insert the punctuation at the selected position."
41 | ]
42 | },
43 | {
44 | "cell_type": "code",
45 | "execution_count": 3,
46 | "metadata": {},
47 | "outputs": [
48 | {
49 | "name": "stdout",
50 | "output_type": "stream",
51 | "text": [
52 | "! John is going to town\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "output = t.punct_insertion(\"John is going to town\")\n",
58 | "print(output)"
59 | ]
60 | },
61 | {
62 | "cell_type": "markdown",
63 | "metadata": {},
64 | "source": [
65 | "## Cite the paper\n",
66 | "```\n",
67 | "@article{marivate2019improving,\n",
68 | " title={Improving short text classification through global augmentation methods},\n",
69 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n",
70 | " journal={arXiv preprint arXiv:1907.03752},\n",
71 | " year={2019}\n",
72 | "}```\n",
73 | "\n",
74 | "https://arxiv.org/abs/1907.03752"
75 | ]
76 | }
77 | ],
78 | "metadata": {
79 | "kernelspec": {
80 | "display_name": "Python 3",
81 | "language": "python",
82 | "name": "python3"
83 | },
84 | "language_info": {
85 | "codemirror_mode": {
86 | "name": "ipython",
87 | "version": 3
88 | },
89 | "file_extension": ".py",
90 | "mimetype": "text/x-python",
91 | "name": "python",
92 | "nbconvert_exporter": "python",
93 | "pygments_lexer": "ipython3",
94 | "version": "3.7.7"
95 | }
96 | },
97 | "nbformat": 4,
98 | "nbformat_minor": 4
99 | }
100 |
--------------------------------------------------------------------------------
/examples/eda_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# EDA example"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "try:\n",
17 | " from textaugment import EDA\n",
18 | "except ModuleNotFoundError:\n",
19 | " !pip install textaugment\n",
20 | " from textaugment import EDA"
21 | ]
22 | },
23 | {
24 | "cell_type": "code",
25 | "execution_count": 2,
26 | "metadata": {},
27 | "outputs": [],
28 | "source": [
29 | "t = EDA(random_state=1)"
30 | ]
31 | },
32 | {
33 | "cell_type": "markdown",
34 | "metadata": {},
35 | "source": [
36 | "## Synonym Replacement\n",
37 | "Randomly choose *n* words from the sentence that are not stop words. Replace each of these words with one of its synonyms chosen at random"
38 | ]
39 | },
40 | {
41 | "cell_type": "code",
42 | "execution_count": 3,
43 | "metadata": {},
44 | "outputs": [
45 | {
46 | "name": "stdout",
47 | "output_type": "stream",
48 | "text": [
49 | "John is choke to town\n"
50 | ]
51 | }
52 | ],
53 | "source": [
54 | "output = t.synonym_replacement(\"John is going to town\", top_n=10)\n",
55 | "print(output)"
56 | ]
57 | },
58 | {
59 | "cell_type": "markdown",
60 | "metadata": {},
61 | "source": [
62 | "## Random Insertion\n",
63 | "Find a random synonym of a random word in the sentence that is not a stop word. Insert that synonym into a random position in the sentence. Do this *n* times."
64 | ]
65 | },
66 | {
67 | "cell_type": "code",
68 | "execution_count": 4,
69 | "metadata": {},
70 | "outputs": [
71 | {
72 | "name": "stdout",
73 | "output_type": "stream",
74 | "text": [
75 | "John is going to lead town\n"
76 | ]
77 | }
78 | ],
79 | "source": [
80 | "output = t.random_insertion(\"John is going to town\")\n",
81 | "print(output)"
82 | ]
83 | },
84 | {
85 | "cell_type": "markdown",
86 | "metadata": {},
87 | "source": [
88 | "## Random Swap\n",
89 | "Randomly choose two words in the sentence and swap their positions. Do this *n* times."
90 | ]
91 | },
92 | {
93 | "cell_type": "code",
94 | "execution_count": 5,
95 | "metadata": {},
96 | "outputs": [
97 | {
98 | "name": "stdout",
99 | "output_type": "stream",
100 | "text": [
101 | "John is to going town\n"
102 | ]
103 | }
104 | ],
105 | "source": [
106 | "output = t.random_swap(\"John is going to town\")\n",
107 | "print(output)"
108 | ]
109 | },
110 | {
111 | "cell_type": "markdown",
112 | "metadata": {},
113 | "source": [
114 | "## Random Deletion\n",
115 | "Randomly remove each word in the sentence with probability *p*."
116 | ]
117 | },
118 | {
119 | "cell_type": "code",
120 | "execution_count": 6,
121 | "metadata": {},
122 | "outputs": [
123 | {
124 | "name": "stdout",
125 | "output_type": "stream",
126 | "text": [
127 | "John going to town\n"
128 | ]
129 | }
130 | ],
131 | "source": [
132 | "output = t.random_deletion(\"John is going to town\", p=0.2)\n",
133 | "print(output)"
134 | ]
135 | },
136 | {
137 | "cell_type": "markdown",
138 | "metadata": {},
139 | "source": [
140 | "## Cite the paper\n",
141 | "```\n",
142 | "@article{marivate2019improving,\n",
143 | " title={Improving short text classification through global augmentation methods},\n",
144 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n",
145 | " journal={arXiv preprint arXiv:1907.03752},\n",
146 | " year={2019}\n",
147 | "}```\n",
148 | "\n",
149 | "https://arxiv.org/abs/1907.03752"
150 | ]
151 | },
152 | {
153 | "cell_type": "code",
154 | "execution_count": null,
155 | "metadata": {},
156 | "outputs": [],
157 | "source": []
158 | }
159 | ],
160 | "metadata": {
161 | "kernelspec": {
162 | "display_name": "Python 3",
163 | "language": "python",
164 | "name": "python3"
165 | },
166 | "language_info": {
167 | "codemirror_mode": {
168 | "name": "ipython",
169 | "version": 3
170 | },
171 | "file_extension": ".py",
172 | "mimetype": "text/x-python",
173 | "name": "python",
174 | "nbconvert_exporter": "python",
175 | "pygments_lexer": "ipython3",
176 | "version": "3.7.7"
177 | }
178 | },
179 | "nbformat": 4,
180 | "nbformat_minor": 4
181 | }
182 |
--------------------------------------------------------------------------------
/examples/fasttext_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {},
6 | "source": [
7 | "# Example for using Fasttext"
8 | ]
9 | },
10 | {
11 | "cell_type": "code",
12 | "execution_count": 1,
13 | "metadata": {},
14 | "outputs": [],
15 | "source": [
16 | "# Import libraries\n",
17 | "try:\n",
18 | " import textaugment, gensim\n",
19 | "except ModuleNotFoundError:\n",
20 | " !pip -q install textaugment gensim\n",
21 | " import textaugment, gensim"
22 | ]
23 | },
24 | {
25 | "cell_type": "markdown",
26 | "metadata": {},
27 | "source": [
28 | "# Load Fasttext Embeddings \n",
29 | "\n",
30 | "Fasttext has Pre-trained word vectors on English webcrawl and Wikipedia which you can find [here](https://fasttext.cc/docs/en/english-vectors.html) as well as Pre-trained models for 157 different languages which you can find [here](https://fasttext.cc/docs/en/crawl-vectors.html)"
31 | ]
32 | },
33 | {
34 | "cell_type": "code",
35 | "execution_count": 3,
36 | "metadata": {},
37 | "outputs": [
38 | {
39 | "name": "stdout",
40 | "output_type": "stream",
41 | "text": [
42 | "--2020-09-01 10:11:28-- https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\n",
43 | "Resolving dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)... 104.22.75.142, 104.22.74.142, 172.67.9.4, ...\n",
44 | "Connecting to dl.fbaipublicfiles.com (dl.fbaipublicfiles.com)|104.22.75.142|:443... connected.\n",
45 | "HTTP request sent, awaiting response... 200 OK\n",
46 | "Length: 4503593528 (4.2G) [application/octet-stream]\n",
47 | "Saving to: ‘cc.en.300.bin.gz’\n",
48 | "\n",
49 | "cc.en.300.bin.gz 100%[===================>] 4.19G 4.32MB/s in 9m 57s \n",
50 | "\n",
51 | "2020-09-01 10:21:26 (7.20 MB/s) - ‘cc.en.300.bin.gz’ saved [4503593528/4503593528]\n",
52 | "\n"
53 | ]
54 | }
55 | ],
56 | "source": [
57 | "# Download the FastText embeddings in the language of your choice\n",
58 | "!wget \"https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz\""
59 | ]
60 | },
61 | {
62 | "cell_type": "code",
63 | "execution_count": null,
64 | "metadata": {},
65 | "outputs": [],
66 | "source": [
67 | "# save path to your pre-trained model\n",
68 | "from gensim.test.utils import datapath\n",
69 | "pretrained_path = datapath('./cc.en.300.bin.gz')\n",
70 | "\n",
71 | "# load model\n",
72 | "model = gensim.models.fasttext.load_facebook_model(pretrained_path)"
73 | ]
74 | },
75 | {
76 | "cell_type": "code",
77 | "execution_count": null,
78 | "metadata": {},
79 | "outputs": [],
80 | "source": [
81 | "from textaugment import Word2vec\n",
82 | "t = Word2vec(model = model)\n",
83 | "output = t.augment('The stories are good', top_n=10)"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": null,
89 | "metadata": {},
90 | "outputs": [],
91 | "source": [
92 | "print(output)"
93 | ]
94 | },
95 | {
96 | "cell_type": "markdown",
97 | "metadata": {},
98 | "source": [
99 | "## Cite the paper\n",
100 | "```\n",
101 | "@article{marivate2019improving,\n",
102 | " title={Improving short text classification through global augmentation methods},\n",
103 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n",
104 | " journal={arXiv preprint arXiv:1907.03752},\n",
105 | " year={2019}\n",
106 | "}```\n",
107 | "\n",
108 | "https://arxiv.org/abs/1907.03752\n"
109 | ]
110 | },
111 | {
112 | "cell_type": "code",
113 | "execution_count": null,
114 | "metadata": {},
115 | "outputs": [],
116 | "source": []
117 | }
118 | ],
119 | "metadata": {
120 | "kernelspec": {
121 | "display_name": "Python 3",
122 | "language": "python",
123 | "name": "python3"
124 | },
125 | "language_info": {
126 | "codemirror_mode": {
127 | "name": "ipython",
128 | "version": 3
129 | },
130 | "file_extension": ".py",
131 | "mimetype": "text/x-python",
132 | "name": "python",
133 | "nbconvert_exporter": "python",
134 | "pygments_lexer": "ipython3",
135 | "version": "3.7.7"
136 | }
137 | },
138 | "nbformat": 4,
139 | "nbformat_minor": 4
140 | }
141 |
--------------------------------------------------------------------------------
/examples/mixup_example_using_IMDB_sentiment.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "kMccmZPoWd_h"
8 | },
9 | "source": [
10 | "# Mixup augmentation for NLP\n",
11 | "\n",
12 | "Using IMDB sentiment classification dataset"
13 | ]
14 | },
15 | {
16 | "cell_type": "code",
17 | "execution_count": 1,
18 | "metadata": {
19 | "colab": {
20 | "base_uri": "https://localhost:8080/",
21 | "height": 527
22 | },
23 | "colab_type": "code",
24 | "id": "YhKEHbrxWd_n",
25 | "outputId": "368747f0-47d5-439f-f4b3-d4db6d6a2d18"
26 | },
27 | "outputs": [
28 | {
29 | "name": "stdout",
30 | "output_type": "stream",
31 | "text": [
32 | "Collecting textaugment\n",
33 | " Downloading https://files.pythonhosted.org/packages/d5/87/906c855827f99a65ab91b22afbfa91731bd4397b5e3ca344de571e5c7651/textaugment-1.3-py3-none-any.whl\n",
34 | "Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (from textaugment) (3.2.5)\n",
35 | "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from textaugment) (1.18.4)\n",
36 | "Requirement already satisfied: textblob in /usr/local/lib/python3.6/dist-packages (from textaugment) (0.15.3)\n",
37 | "Requirement already satisfied: gensim in /usr/local/lib/python3.6/dist-packages (from textaugment) (3.6.0)\n",
38 | "Collecting googletrans\n",
39 | " Downloading https://files.pythonhosted.org/packages/fd/f0/a22d41d3846d1f46a4f20086141e0428ccc9c6d644aacbfd30990cf46886/googletrans-2.4.0.tar.gz\n",
40 | "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk->textaugment) (1.12.0)\n",
41 | "Requirement already satisfied: scipy>=0.18.1 in /usr/local/lib/python3.6/dist-packages (from gensim->textaugment) (1.4.1)\n",
42 | "Requirement already satisfied: smart-open>=1.2.1 in /usr/local/lib/python3.6/dist-packages (from gensim->textaugment) (2.0.0)\n",
43 | "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from googletrans->textaugment) (2.23.0)\n",
44 | "Requirement already satisfied: boto3 in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim->textaugment) (1.13.13)\n",
45 | "Requirement already satisfied: boto in /usr/local/lib/python3.6/dist-packages (from smart-open>=1.2.1->gensim->textaugment) (2.49.0)\n",
46 | "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (2.9)\n",
47 | "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (1.24.3)\n",
48 | "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (3.0.4)\n",
49 | "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->googletrans->textaugment) (2020.4.5.1)\n",
50 | "Requirement already satisfied: botocore<1.17.0,>=1.16.13 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim->textaugment) (1.16.13)\n",
51 | "Requirement already satisfied: jmespath<1.0.0,>=0.7.1 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim->textaugment) (0.10.0)\n",
52 | "Requirement already satisfied: s3transfer<0.4.0,>=0.3.0 in /usr/local/lib/python3.6/dist-packages (from boto3->smart-open>=1.2.1->gensim->textaugment) (0.3.3)\n",
53 | "Requirement already satisfied: docutils<0.16,>=0.10 in /usr/local/lib/python3.6/dist-packages (from botocore<1.17.0,>=1.16.13->boto3->smart-open>=1.2.1->gensim->textaugment) (0.15.2)\n",
54 | "Requirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.17.0,>=1.16.13->boto3->smart-open>=1.2.1->gensim->textaugment) (2.8.1)\n",
55 | "Building wheels for collected packages: googletrans\n",
56 | " Building wheel for googletrans (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
57 | " Created wheel for googletrans: filename=googletrans-2.4.0-cp36-none-any.whl size=15777 sha256=4de7ce4b52a5c57a680d9c96137d12291609a418bf5fdd1cf158003f747c7589\n",
58 | " Stored in directory: /root/.cache/pip/wheels/50/d6/e7/a8efd5f2427d5eb258070048718fa56ee5ac57fd6f53505f95\n",
59 | "Successfully built googletrans\n",
60 | "Installing collected packages: googletrans, textaugment\n",
61 | "Successfully installed googletrans-2.4.0 textaugment-1.3\n"
62 | ]
63 | }
64 | ],
65 | "source": [
66 | "# Import libraries\n",
67 | "try:\n",
68 | " import textaugment\n",
69 | "except ModuleNotFoundError:\n",
70 | " !pip install textaugment\n",
71 | " import textaugment\n",
72 | "\n",
73 | "import pandas as pd\n",
74 | "\n",
75 | "import tensorflow as tf\n",
76 | "from tensorflow.keras.preprocessing import sequence\n",
77 | "from tensorflow.keras.models import Sequential\n",
78 | "from tensorflow.keras.layers import Dense, Dropout, Activation\n",
79 | "from tensorflow.keras.layers import Embedding\n",
80 | "from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D\n",
81 | "from tensorflow.keras.datasets import imdb\n",
82 | "\n",
83 | "from textaugment import MIXUP\n",
84 | "%matplotlib inline"
85 | ]
86 | },
87 | {
88 | "cell_type": "code",
89 | "execution_count": 2,
90 | "metadata": {
91 | "colab": {
92 | "base_uri": "https://localhost:8080/",
93 | "height": 34
94 | },
95 | "colab_type": "code",
96 | "id": "JeMsxayIWd_r",
97 | "outputId": "814596bf-e5ca-47f1-c2ce-257e761e96c4"
98 | },
99 | "outputs": [
100 | {
101 | "data": {
102 | "text/plain": [
103 | "'2.2.0'"
104 | ]
105 | },
106 | "execution_count": 2,
107 | "metadata": {
108 | "tags": []
109 | },
110 | "output_type": "execute_result"
111 | }
112 | ],
113 | "source": [
114 | "tf.__version__"
115 | ]
116 | },
117 | {
118 | "cell_type": "code",
119 | "execution_count": 3,
120 | "metadata": {
121 | "colab": {
122 | "base_uri": "https://localhost:8080/",
123 | "height": 34
124 | },
125 | "colab_type": "code",
126 | "id": "_FbvA0uwRdEZ",
127 | "outputId": "8e912f45-8b7e-4ee7-a3ad-f342c3f090c7"
128 | },
129 | "outputs": [
130 | {
131 | "data": {
132 | "text/plain": [
133 | "'1.3'"
134 | ]
135 | },
136 | "execution_count": 3,
137 | "metadata": {
138 | "tags": []
139 | },
140 | "output_type": "execute_result"
141 | }
142 | ],
143 | "source": [
144 | "textaugment.__version__"
145 | ]
146 | },
147 | {
148 | "cell_type": "markdown",
149 | "metadata": {
150 | "colab_type": "text",
151 | "id": "Oz8O8tISRdEg"
152 | },
153 | "source": [
154 | "## Initialize constant variables"
155 | ]
156 | },
157 | {
158 | "cell_type": "code",
159 | "execution_count": null,
160 | "metadata": {
161 | "colab": {},
162 | "colab_type": "code",
163 | "id": "mg1AcYIWWd_w"
164 | },
165 | "outputs": [],
166 | "source": [
167 | "# set parameters:\n",
168 | "max_features = 5000\n",
169 | "maxlen = 400\n",
170 | "batch_size = 32\n",
171 | "embedding_dims = 50\n",
172 | "filters = 250\n",
173 | "kernel_size = 3\n",
174 | "hidden_dims = 250\n",
175 | "epochs = 10\n",
176 | "runs = 1"
177 | ]
178 | },
179 | {
180 | "cell_type": "code",
181 | "execution_count": 5,
182 | "metadata": {
183 | "colab": {
184 | "base_uri": "https://localhost:8080/",
185 | "height": 153
186 | },
187 | "colab_type": "code",
188 | "id": "ZRuNNVstWd_0",
189 | "outputId": "bc4ce3b2-5a12-4600-d1a8-b466615018df"
190 | },
191 | "outputs": [
192 | {
193 | "name": "stdout",
194 | "output_type": "stream",
195 | "text": [
196 | "Loading data...\n",
197 | "Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz\n",
198 | "17465344/17464789 [==============================] - 0s 0us/step\n",
199 | "25000 train sequences\n",
200 | "25000 test sequences\n",
201 | "Pad sequences (samples x time)\n",
202 | "x_train shape: (25000, 400)\n",
203 | "x_test shape: (25000, 400)\n"
204 | ]
205 | }
206 | ],
207 | "source": [
208 | "print('Loading data...')\n",
209 | "(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)\n",
210 | "print(len(x_train), 'train sequences')\n",
211 | "print(len(x_test), 'test sequences')\n",
212 | "\n",
213 | "print('Pad sequences (samples x time)')\n",
214 | "x_train = sequence.pad_sequences(x_train, maxlen=maxlen)\n",
215 | "x_test = sequence.pad_sequences(x_test, maxlen=maxlen)\n",
216 | "print('x_train shape:', x_train.shape)\n",
217 | "print('x_test shape:', x_test.shape)"
218 | ]
219 | },
220 | {
221 | "cell_type": "markdown",
222 | "metadata": {
223 | "colab_type": "text",
224 | "id": "Tx73Y-asRdEz"
225 | },
226 | "source": [
227 | "## Initialize mixup"
228 | ]
229 | },
230 | {
231 | "cell_type": "code",
232 | "execution_count": null,
233 | "metadata": {
234 | "colab": {},
235 | "colab_type": "code",
236 | "id": "xvuxODUxRdE1"
237 | },
238 | "outputs": [],
239 | "source": [
240 | "mixup = MIXUP()\n",
241 | "generator, step = mixup.flow(x_train, y_train, batch_size=batch_size, runs=runs)"
242 | ]
243 | },
244 | {
245 | "cell_type": "code",
246 | "execution_count": 7,
247 | "metadata": {
248 | "colab": {
249 | "base_uri": "https://localhost:8080/",
250 | "height": 476
251 | },
252 | "colab_type": "code",
253 | "id": "6cm1o_fAWd_4",
254 | "outputId": "ea793754-100c-4c12-8acf-7798c096c399"
255 | },
256 | "outputs": [
257 | {
258 | "name": "stdout",
259 | "output_type": "stream",
260 | "text": [
261 | "Build model...\n",
262 | "Model: \"sequential\"\n",
263 | "_________________________________________________________________\n",
264 | "Layer (type) Output Shape Param # \n",
265 | "=================================================================\n",
266 | "embedding (Embedding) (None, 400, 50) 250000 \n",
267 | "_________________________________________________________________\n",
268 | "dropout (Dropout) (None, 400, 50) 0 \n",
269 | "_________________________________________________________________\n",
270 | "conv1d (Conv1D) (None, 398, 250) 37750 \n",
271 | "_________________________________________________________________\n",
272 | "global_max_pooling1d (Global (None, 250) 0 \n",
273 | "_________________________________________________________________\n",
274 | "dense (Dense) (None, 250) 62750 \n",
275 | "_________________________________________________________________\n",
276 | "dropout_1 (Dropout) (None, 250) 0 \n",
277 | "_________________________________________________________________\n",
278 | "activation (Activation) (None, 250) 0 \n",
279 | "_________________________________________________________________\n",
280 | "dense_1 (Dense) (None, 1) 251 \n",
281 | "_________________________________________________________________\n",
282 | "activation_1 (Activation) (None, 1) 0 \n",
283 | "=================================================================\n",
284 | "Total params: 350,751\n",
285 | "Trainable params: 350,751\n",
286 | "Non-trainable params: 0\n",
287 | "_________________________________________________________________\n"
288 | ]
289 | }
290 | ],
291 | "source": [
292 | "print('Build model...')\n",
293 | "model = Sequential()\n",
294 | "\n",
295 | "# we start off with an efficient embedding layer which maps\n",
296 | "# our vocab indices into embedding_dims dimensions\n",
297 | "model.add(Embedding(max_features,\n",
298 | " embedding_dims,\n",
299 | " input_length=maxlen))\n",
300 | "model.add(Dropout(0.2))\n",
301 | "\n",
302 | "# we add a Convolution1D, which will learn filters\n",
303 | "# word group filters of size filter_length:\n",
304 | "model.add(Conv1D(filters,\n",
305 | " kernel_size,\n",
306 | " padding='valid',\n",
307 | " activation='relu',\n",
308 | " strides=1))\n",
309 | "# we use max pooling:\n",
310 | "model.add(GlobalMaxPooling1D())\n",
311 | "\n",
312 | "# We add a vanilla hidden layer:\n",
313 | "model.add(Dense(hidden_dims))\n",
314 | "model.add(Dropout(0.2))\n",
315 | "model.add(Activation('relu'))\n",
316 | "\n",
317 | "# We project onto a single unit output layer, and squash it with a sigmoid:\n",
318 | "model.add(Dense(1))\n",
319 | "model.add(Activation('sigmoid'))\n",
320 | "\n",
321 | "model.compile(loss='binary_crossentropy',\n",
322 | " optimizer='adam',\n",
323 | " metrics=['accuracy'])\n",
324 | "model.summary()"
325 | ]
326 | },
327 | {
328 | "cell_type": "markdown",
329 | "metadata": {
330 | "colab_type": "text",
331 | "id": "b5zRyuq8UKmR"
332 | },
333 | "source": [
334 | "## Train model using mixup augmentation"
335 | ]
336 | },
337 | {
338 | "cell_type": "code",
339 | "execution_count": 8,
340 | "metadata": {
341 | "colab": {
342 | "base_uri": "https://localhost:8080/",
343 | "height": 357
344 | },
345 | "colab_type": "code",
346 | "id": "oGLSfzcUWeAB",
347 | "outputId": "81464964-8fd3-4249-b901-0e05cb664436"
348 | },
349 | "outputs": [
350 | {
351 | "name": "stdout",
352 | "output_type": "stream",
353 | "text": [
354 | "Epoch 1/10\n",
355 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6867 - accuracy: 0.2859 - val_loss: 0.6408 - val_accuracy: 0.6537\n",
356 | "Epoch 2/10\n",
357 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6655 - accuracy: 0.3081 - val_loss: 0.6140 - val_accuracy: 0.6620\n",
358 | "Epoch 3/10\n",
359 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6443 - accuracy: 0.3267 - val_loss: 0.5688 - val_accuracy: 0.7233\n",
360 | "Epoch 4/10\n",
361 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6250 - accuracy: 0.3287 - val_loss: 0.5167 - val_accuracy: 0.7434\n",
362 | "Epoch 5/10\n",
363 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6140 - accuracy: 0.3337 - val_loss: 0.5154 - val_accuracy: 0.7534\n",
364 | "Epoch 6/10\n",
365 | "782/782 [==============================] - 8s 10ms/step - loss: 0.6029 - accuracy: 0.3338 - val_loss: 0.4763 - val_accuracy: 0.7765\n",
366 | "Epoch 7/10\n",
367 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5976 - accuracy: 0.3314 - val_loss: 0.4659 - val_accuracy: 0.7810\n",
368 | "Epoch 8/10\n",
369 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5857 - accuracy: 0.3423 - val_loss: 0.4551 - val_accuracy: 0.7873\n",
370 | "Epoch 9/10\n",
371 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5800 - accuracy: 0.3488 - val_loss: 0.4502 - val_accuracy: 0.7927\n",
372 | "Epoch 10/10\n",
373 | "782/782 [==============================] - 8s 10ms/step - loss: 0.5793 - accuracy: 0.3402 - val_loss: 0.4653 - val_accuracy: 0.7927\n"
374 | ]
375 | }
376 | ],
377 | "source": [
378 | "h1 = model.fit(generator, steps_per_epoch=step,\n",
379 | " epochs=epochs,\n",
380 | " validation_data=(x_test, y_test))"
381 | ]
382 | },
383 | {
384 | "cell_type": "code",
385 | "execution_count": 9,
386 | "metadata": {
387 | "colab": {
388 | "base_uri": "https://localhost:8080/",
389 | "height": 298
390 | },
391 | "colab_type": "code",
392 | "id": "XKrXdkt8XeYo",
393 | "outputId": "0d463439-1718-4f90-bc24-b32f6dae7eda"
394 | },
395 | "outputs": [
396 | {
397 | "data": {
398 | "text/plain": [
399 | ""
400 | ]
401 | },
402 | "execution_count": 9,
403 | "metadata": {
404 | "tags": []
405 | },
406 | "output_type": "execute_result"
407 | },
408 | {
409 | "data": {
410 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXoAAAEICAYAAABRSj9aAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deXxU1f3/8dcnCwmQBLISIIEkEAgIKBJQhITFDa2CK4gruH0VFbditS5Vq21/2lqtRa1FcUMBcQNBESsQQEUCguxbCJAAZgECCWSd8/vjTmCIARKY5E5mPs/HYx7J3GXmk1He98y5554rxhiUUkp5Lz+7C1BKKdWwNOiVUsrLadArpZSX06BXSikvp0GvlFJeToNeKaW8nAa98joikiYiG0+wPkFEjIgENND7rxWRwQ3x2kqdCg165fFE5DER+arGss3HWXadMWaRMaary/JsEbmgseo1xpxhjFnQWO+n1Mlo0KumIAM4T0T8AUSkLRAI9K6xrLNzW6WUCw161RQswwr2s5zP04D5wMYay7YaY3aJyGARyQEQkfeBDsAsESkWkUdcXvcGEdkhIgUi8vjx3lxE3hGR10TkK+drLBGRWBF5WUT2icgGEentsv2RbxAiMkdE/uGybqqIvO38/WkR+cBl3TFdSiKyQET+KiI/icgBEflCRCJO8TNUPkyDXnk8Y0w5sBRIdy5KBxYBi2ss+01r3hhzE7ADuNwYE2KMecFl9UCgK3A+8JSIdDtBGSOBJ4AooAz4AVjhfD4DeOk4+90K3CQiQ0XkBqAfcP8J/+Bj3ex8jbZAJfCveuyrFKBBr5qOhRwN9TSsoF9UY9nCer7mM8aYw8aYVcAq4MwTbPuZMWa5MaYU+AwoNca8Z4ypAqYBvWvbyRizB7gbeBd4BbjZGHOwHjW+b4xZY4wpAZ4ERlZ3VylVVxr0qqnIAAY6uy6ijTGbge+x+u4jgB7Uv39+j8vvh4CQE2z7q8vvh2t5fqJ9ZwH+wEZjzOJ61rjT5fftWF1YUfV8DeXjNOhVU/ED0Aq4A1gCYIw5AOxyLttljNl2nH3tnqL1eWA90FZERrssLwFauDyPrWXfeJffOwAVQIHbK1ReTYNeNQnGmMNAJvAQVpdNtcXOZSdqzf8KJDVcdccnIunAWKy+9luAV0WkvXP1SiBdRDqISCvgsVpe4kYR6S4iLYBngRnO7iKl6kyDXjUlC4EYrHCvtsi57ERB/1fgCRHZLyK/b8D6jiEiYcB7wL3GmFxjzCLgLWCyiIgxZh5W//4vwHLgy1pe5n3gHaxupmBgfGPUrryL6I1HlPJMIrIA+MAYM8nuWlTTpi16pZTychr0Sinl5bTrRimlvJy26JVSyss1yDStpyMqKsokJCTYXYZSSjUpy5cvLzDGRNe2zuOCPiEhgczMTLvLUEqpJkVEth9vnXbdKKWUl9OgV0opL6dBr5RSXs7j+uiVUr6poqKCnJwcSktL7S7FowUHBxMXF0dgYGCd99GgV0p5hJycHEJDQ0lISEBE7C7HIxljKCwsJCcnh8TExDrvp103SimPUFpaSmRkpIb8CYgIkZGR9f7Wo0GvlPIYGvIndyqfkdcEvTGGv8xZz4Y9B+wuRSmlPIrXBH124SE++mkHl7yyiPun/kx2QYndJSmlmpiQkBPdEbLp8pqgT4xqyeJHhnL3oE58s/ZXLnhpIX/8bDV7ivQMvlLKt3lN0AO0ahHII8NSWPjIYG44pwMfZ+5k0IvzeX72OvaWlNtdnlKqiTDGMGHCBHr06EHPnj2ZNm0aALt37yY9PZ2zzjqLHj16sGjRIqqqqhgzZsyRbf/5z3/aXP1veeXwypjQYJ4Z0YPb05J4+dvNvLV4Gx/9tJPb0xK5bWAiocF1H3+qlGp8z8xay7pd7j3f1r1dGH+6/Iw6bfvpp5+ycuVKVq1aRUFBAX379iU9PZ0PP/yQiy++mMcf
f5yqqioOHTrEypUryc3NZc2aNQDs37/frXW7g1e16GuKj2jBP0aeydwH0hnYOYqXv91M+gvzmbQoi9IKvb+yUqp2ixcvZvTo0fj7+9OmTRsGDRrEsmXL6Nu3L5MnT+bpp59m9erVhIaGkpSURFZWFvfddx9ff/01YWFhdpf/G17Zoq8puU0ob9zUh1U79/P3bzby3Oz1TFq0jfsvSOaaPnEE+nv18U6pJqeuLe/Glp6eTkZGBrNnz2bMmDE89NBD3HzzzaxatYq5c+fyxhtvMH36dN5++227Sz2GTyXcmfGtef+2c/jojnNp1zqYxz5dzYUvLeSLlbk4HHqnLaWUJS0tjWnTplFVVUV+fj4ZGRn069eP7du306ZNG+644w5uv/12VqxYQUFBAQ6Hg6uvvprnnnuOFStW2F3+b/hEi76m/p0i+eTu8/huQx4vzt3I/VNX8vqCrUy4uCtDU2L0og2lfNyVV17JDz/8wJlnnomI8MILLxAbG8u7777Liy++SGBgICEhIbz33nvk5uYyduxYHA4HAH/9619trv63PO6esampqaYxbzzicBhm/bKLf87bRHbhIc7u0JoJF6fQv1Nko9WglIL169fTrVs3u8toEmr7rERkuTEmtbbtfarrpjZ+fsKIs9oz76FB/OXKnuzaX8ro//7ITW8t5Zcczzt7rpRS9eXzQV8t0N+P68/pwIIJg3n80m6syS1i+L+XcNf7y9n860G7y1NKqVPmk330JxIc6M8d6Ulc1y+etxZvY9KibXyzbg9X9o7jgQuSiY9oYXeJSilVL9qiP47Q4EAeuKALGY8M4baBicz6ZRdD/7GAp75YQ95BnVZBKdV0aNCfRETLZjz+u+4snDCYa1PjmbJ0B+kvzOf/fb2BokMVdpenlFInpUFfR21bNecvV/bkfw8N4uIzYnlj4VYGvvAdE+dvoaSs0u7ylFLquDTo6ykhqiWvXNebOePTOCcxkhfnbmTQi/OZvGQbZZU6rYJSyvNo0J+ibm3DmHRLKp/cfR6dY0J4ZtY6hv59IdMzd1JZ5bC7PKVUAzvR3PXZ2dn06NGjEas5MQ3609SnYzgf3XEu79/Wj8iQZjwy4xcufjmDOat367QKSimPoMMr3UBESEuOZmDnKOau/ZW/f7ORcVNW0KN9GL+/qCuDukTrtApK1cdXj8Ke1e59zdiecMnfjrv60UcfJT4+nnvuuQeAp59+moCAAObPn8++ffuoqKjgueeeY8SIEfV629LSUu6++24yMzMJCAjgpZdeYsiQIaxdu5axY8dSXl6Ow+Hgk08+oV27dowcOZKcnByqqqp48sknGTVq1Gn92aBB71YiwrAesVzYvQ2f/5zLP7/dxJjJy+iXEMGEYV3pmxBhd4lKqeMYNWoUDzzwwJGgnz59OnPnzmX8+PGEhYVRUFDAueeey/Dhw+vVcJs4cSIiwurVq9mwYQMXXXQRmzZt4o033uD+++/nhhtuoLy8nKqqKubMmUO7du2YPXs2AEVFRW752zToG4C/n3B1nzguP7Md05bt4F/fbeHaN35gcNdofn9RV3q0b2V3iUp5thO0vBtK7969ycvLY9euXeTn5xMeHk5sbCwPPvggGRkZ+Pn5kZuby6+//kpsbGydX3fx4sXcd999AKSkpNCxY0c2bdpE//79ef7558nJyeGqq64iOTmZnj178vDDD/OHP/yByy67jLS0NLf8bdpH34CaBfhxU/8EMiYM4dFLUvh5x34ue3Ux90xZwZa8YrvLU0rVcO211zJjxgymTZvGqFGjmDJlCvn5+SxfvpyVK1fSpk0bSkvdc8Hk9ddfz8yZM2nevDmXXnop3333HV26dGHFihX07NmTJ554gmeffdYt76Ut+kbQvJk/dw3qxOh+HXhrURaTFm/jqzW7uaZPHOPPTyYuXKdVUMoTjBo1ijvuuIOCggIWLlzI9OnTiYmJITAwkPnz57N9+/Z6v2ZaWhpTpkxh6NChbNq0iR07dtC1
a1eysrJISkpi/Pjx7Nixg19++YWUlBQiIiK48cYbad26NZMmTXLL36VB34haNQ/koYu6cvN5Cby+YCvv/7idz3/exfXndOCeIZ2JDg2yu0SlfNoZZ5zBwYMHad++PW3btuWGG27g8ssvp2fPnqSmppKSklLv1xw3bhx33303PXv2JCAggHfeeYegoCCmT5/O+++/T2BgILGxsfzxj39k2bJlTJgwAT8/PwIDA3n99dfd8nf5/Hz0dtq1/zCvfreZ6Zk5NPP349aBCdyZ1olWLfTm5cr36Hz0dafz0Tch7Vo3569X9WLeg+lc0L0NE+dvJc05rcKhcp1WQSnlHtp14wGSokN4dXRv7hqUxEvfbOLFuRuZvCSbe4d0YvQ5HQgK8Le7RKVULVavXs1NN910zLKgoCCWLl1qU0W106D3IGe0a8VbY/qyfPteXvh6I0/PWsd/F23jgQuSubJ3ewL89QuY8m7GmCZ1cWHPnj1ZuXJlo77nqXS31yk5RGSYiGwUkS0i8uhxthkpIutEZK2IfOiyvEpEVjofM+tdoQ/q0zGCqXcenVZhgk6roHxAcHAwhYWFpxRkvsIYQ2FhIcHBwfXa76QnY0XEH9gEXAjkAMuA0caYdS7bJAPTgaHGmH0iEmOMyXOuKzbGHH/2nxp86WRsXRhjmLt2D3//ZhNb8op1WgXltSoqKsjJyXHbOHVvFRwcTFxcHIGBxw7aONHJ2Lp03fQDthhjspwvNhUYAaxz2eYOYKIxZh9Adcir02dNq9CWC7vH6rQKyqsFBgaSmJhodxleqS5dN+2BnS7Pc5zLXHUBuojIEhH5UUSGuawLFpFM5/IransDEbnTuU1mfn5+vf4AX1E9rcJ3Dw/mzyPOYFthCde+8QNjJv/Emlz3zIehlPJO7jq7FwAkA4OB0cB/RaS1c11H59eJ64GXRaRTzZ2NMW8aY1KNManR0dFuKsk76bQKSqn6qkvQ5wLxLs/jnMtc5QAzjTEVxphtWH36yQDGmFznzyxgAdD7NGtWHJ1WIeORIYwf2pn5G/O46J8LeWTGKnL2HbK7PKWUB6lL0C8DkkUkUUSaAdcBNUfPfI7VmkdEorC6crJEJFxEglyWD+DYvn11mqqnVch4ZAhjByTy+cpdDP37Qp6euZb8g2V2l6eU8gAnDXpjTCVwLzAXWA9MN8asFZFnRWS4c7O5QKGIrAPmAxOMMYVANyBTRFY5l//NdbSOcp+okCCevKw7C34/mKvObs/7P24n7YXveGbWWnbtP2x3eUopG+lcN14qK7+YifO38vnKXPwEruodx/8NSiIpus4jXZVSTciJhld6V9DvXgWxvUDHlx+xc+8h/rsoi2nLdlJe5eDSnm0ZN7gTZ7TTm58o5U18I+gLtsBr50BCGgz/F7Tu4P7imrD8g2W8vWQb7/+wneKySoZ0jWbckM46Dl8pL+EbQe9wwPLJMO8p6/mFz0KfseCn88O4Kjpcwfs/ZPP2kmz2lpTTLyGCcUM66ZW2SjVxvhH01fbvgJn3QdYCSEyH4a9CeIK7yvMah8urmLpsB29mZLG7qJQz2oUxbnBnhvWIxd9PA1+ppsa3gh7AGFjxLsx9AowDLnwGUm/T1n0tyisdfP5zLm8s3EpWQQlJUS25a1AnrujdnmYB+nkp1VT4XtBX278TZo2Hrd9Bx4Ew4lWISHLPa3uZKofh6zV7mDh/C+t2H6Btq2DuTE/iur4daN5M58NXytP5btCD1br/+QOY+0dwVMIFT0PfO7R1fxzGGBZuyue1+Vv5KXsvES2bceuABG7qn0Cr5nqLQ6U8lW8HfbWiXJh1P2yZBx3OgxH/hsjfTLujXCzL3str87cwf2M+IUEB3NS/I7cOSNSbmCvlgTToqxkDKz+Erx+DqnI4/yk45//AT7smTmTtriJeW7CVOat308zfj1F947kzPYm48BZ2l6aUctKgr+nALpj1AGyeC/HnwoiJENW5Yd/TC2TlF/OfhVl8+nMOxsCI
s9pz9+AkOseE2l2aUj5Pg742xsAv0+CrR6CyDIY+AeeO09Z9HewuOsx/M7bx4U/bKat0cHH3WMYN6USvuNYn31kp1SA06E/k4B748kHYOAfi+sKI1yC6S+O9fxNWWFzGO99n8+732RworSQtOYpxgztzblKEXnylVCPToD8ZY2D1DPhqApQfgqGPQ/97tXVfRwdLK5iydAeTFm2joLiMszu05p4hnRmaEqOBr1Qj0aCvq4O/wuyHYMOX0L6P1bqPSbGnliaotKKKjzN38p+MLHL2HSYlNpS7B3fidz3bEuCvw1mVakga9PVhDKz5BOZMgPJiGPwYnDce/OtyH3UFUFHlYNaqXby2YCtb8oqJCQ0iLTma9C5RDOgcRVSIDs9Uyt006E9FcR7MfhjWz4R2va3WfZvudlfVpDgchnnrf2Xmql0s2VLA/kMVAHRvG0ZalyjSOkeTmhBOcKB2kSl1ujToT8faz6zALz0Ag/8AAx4Af71CtL6qHIa1u4pYtLmARZvzWb59HxVVhqAAP85JiiStcxRpXaLo2iZU+/WVOgUa9KerpADm/N4K/dhecMXrENvD7qqatJKySpZuK3QGfwFb8ooBiA4NOhL6AzpHERMabHOlSjUNGvTusu4Lq3V/eD+kT4C0h7R17ya7iw4fCf0lWwrYW1IOQEpsKOldohnYOYp+iRHazaPUcWjQu1NJoXWR1ZoZENvT6rtv28vuqryKw2FYt/sAGZvzWby5gMzsfZRXOWgW4Mc5iREM7BxFWnI03dpqN49S1TToG8L6WfDlQ3B4L6Q9DGm/h4BmdlfllQ6VV7J0214WO/v3N/1qdfNEhQQxsHMkacnRpCVHEROm3TzKd2nQN5RDe+HrR62pFNr0sObMaXeW3VV5vT1FpSzeYoX+4s0FFDq7ebq2CSUtOYq0LtH0S4jQefSVT9Ggb2gb5sCXD1gnbdMesvrvA3SseGNwOAzr9xxg0eYCFm8u4KfsvZRXOmjm70ffxHDSkq3+/e5tw/DTWyQqL6ZB3xgO7bVubrLqI4jpDtdPh9bxdlflcw6XV/FT9l4Wb85n0eYCNuw5CEBky2YMTLb69tO1m0d5IQ36xrRpLnxyO4S2hVu/hhYRdlfk0/IOWN08GZvyWbylgILiY0fzpCVH0TdBR/Oopk+DvrFtWwQfXAXtzoabP4fA5nZXpDi2mydjU/6R0TzVF22lJ0eR3iWa5JgQHc2jmhwNejus+RRm3Aopv4OR7+lMmB7oUHklS7P2krE5n4xN+WzNLwEgNiyYNGfoD+wcRXhLHU2lPJ8GvV1+fAO+/gOk3gq/ewm0lejRcvcfZtEmq29/8ZYCig5XIAI927ci3TmE8+yO4QTqTJzKA2nQ22neU7DkFRjyBAyaYHc1qo6qHIZfcvaTsckaxvnzzv1UOQwhQQGcmxTJoC5Wi79jZEu7S1UK0KC3l8MBn99ljbUf/m84+ya7K1KnoOhwBT9sLTzSzZOz7zAAHSJakN7FGs3Tv1MkYcE6JYayhwa93SrL4aNRkLUQRn8EXS62uyJ1GowxZBceYpEz9H/YWkhJeRX+fsLZHVo7596Ppmf7Vvjr2H3VSDToPUHZQXjnMsjfCGO+hLha/3uoJqi80sGKHfucwV/Aml1FGAOtWwQyoHMU6c7x++1a6+gr1XA06D1FcR68daE1t/1t8yCqs90VqQZQWFzmnKLBGsaZd7AMgM4xIaQnR9O9XRjx4c2Jj2hBm7BgbfUrtzjtoBeRYcArgD8wyRjzt1q2GQk8DRhglTHmeufyW4AnnJs9Z4x590Tv5dVBD1C41Qr7Zi3htm8htI3dFakGZIxh06/FZGzKJ2NzPj9t20tZpePI+kB/IS68BXHO4I8Pb0F8RHM6OH9v3SJQx/SrOjmtoBcRf2ATcCGQAywDRhtj1rlskwxMB4YaY/aJSIwxJk9EIoBMIBXrALAc6GOM2Xe89/P6oAfIXW5140R2hjGzITjM7opUIymvdJC7/zA7
9x5i575D7Nx72PnTeuxz3m6xWkhQwJGDgBX+zgOC80CgE7epaicK+rrc8bofsMUYk+V8sanACGCdyzZ3ABOrA9wYk+dcfjEwzxiz17nvPGAY8NGp/CFeo30f6yKqD0fBtBvhhhk6xbGPaBbgR2JUSxKjah+WebC0gpx9h9nhDP6cfdZBYXthCYs251Na4Thm+6iQIOIjmh/5JhAf7jwgRLSgbatgAnTMv6JuQd8e2OnyPAc4p8Y2XQBEZAlW987Txpivj7Nv+5pvICJ3AncCdOjQoa61N23JF8LwV+GLcdbjyjfBT/9R+rrQ4EC6tQ2kW9vffsszxlBQXH7kG0DOvsPsKLS+Gfy8cx+zV++mynH0G7q/n9C2VbBL+FvfBuLCW9AxsgVRITrDqq+oS9DX9XWSgcFAHJAhIj3rurMx5k3gTbC6btxUk+frfQMc3A3f/RlCY+Gi5+yuSHkwESE6NIjo0CDO7hD+m/WVVQ52F5X+pltox95D/G9DHgXFZcdsnxwTwtCUGIakxNBHr/j1anUJ+lzAdb7dOOcyVznAUmNMBbBNRDZhBX8uVvi77rvgVIv1SmkPw8E98P2r1oyX/e+xuyLVRAX4+x3pv6/N4fIqcvZZB4EtecVkbCrg7SXb+E9GFmHBAaR3iWZoSgyDu8YQofP7eJW6nIwNwDoZez5WcC8DrjfGrHXZZhjWCdpbRCQK+Bk4i6MnYM92broC62Ts3uO9n0+cjK3JUQUfj4H1M+Hqt6DnNXZXpHxEcVklizfn892GPOZvzCf/YBki0Du+9ZHWfve2YTrypwlwx/DKS4GXsfrf3zbGPC8izwKZxpiZYv1f8A+sE61VwPPGmKnOfW8F/uh8qeeNMZNP9F4+GfQAFaXw/pWQswxu/ASSBtldkfIxDodhza4iK/Q35LEqpwiwZvMckhLD0JQYBnSOpEUzd/X4KnfSC6aaisP74O1LoCgHbv0KYut8mkMpt8s7WMqCjfnM35DHos0FFJdV0izAj3OTIjnfGfzH6yZSjU+DvikpyoG3LrK6c277BsI72l2RUpRXOliWvfdIaz+rwJq7v7PzhO5QPaFrOw36piZvPbx9MbSMscJeb0eoPMy2gpIjob90WyEVVYbQ6hO6XWMY3DWaSB2+2ag06Jui7d/De1dA215w80xopl+RlWc63gnds+JbM7SrdUL3jHZ6QrehadA3VetmwvSbocswGPUB+OtJMOXZTnxCN5ohXWMYmBylJ3QbgAZ9U/bTf2HO7+HsW+DyV/R2hKpJqfWErr8f53aKZGjX6CN36dIZPE/f6c51o+zU7w7r6tlF/4CwdjD4UbsrUqrOYkKDGZkaz8jU+N+c0H16ljVdVjN/P+IjmpMY1ZKOkS1JiGxBQlRLEiJb0q51cz0IuIG26JsCY+CLe2DlFKtV32eM3RUpddq2FZSwNKuQbYUlbC84RHZhCdmFJcdM3BboL8RHtCAx0nkQiGpBQmT1QUAnbXOlLfqmTsQK+OI8+PJBazROyqV2V6XUaaltFk9jDL8eKCO7sITthSVsKzjk/FnC91sLOVxRdWTbQH8hPtxq/XeMdB4AoqxvBO1bN9eDgAtt0TclZcXw7uXW8MtbZkJ8P7srUqrRGGPIO1hGdkEJ2wsPWd8EXA4Gh8qPHgQC/KxvAkcOAC7dQXHh3nkQ0JOx3qSkwLpD1eF9cOs3EN3F7oqUsp0xhvyDZWQXOruAqg8GBdbBoKTGQSAuvDkdI1s6zwtYc/cH+vvh7ycE+PkR4C8E+An+fuKy/LfPA4753drXT7BlKKkGvbfZu80K+4Dm1gVVYW3trkgpj2WMIb+4jO2Fh8guKHGeC3D+XnDsQcBdAv2l1oNG9fMjBweX54F+fnSJDeG5K05t6hPto/c2EYlww8fW7QinXANj50BwK7urUsojiQgxocHEhAbTN+HYq8yrb+aSd7CUKoehospQ5TBUOhxUHvndUOVwuKwzVFY5nMsNFVUOl+XWtkfXHX1e
WeXyWg5DlfN5pcO5f5WhodrdGvRNVbveztsRjoSpN1gzXgboJedK1YfrzVy8mfedkfAlnc+HEa9B9iL47C5wOE6+j1LK52iLvqk7c5R1QdW3f7LuUDXsL3ZXpJTyMBr03mDA/VbY/zjROjF73n12V6SU8iAa9N5ABC7+KxT/Ct88ASFtoNdIu6tSSnkIDXpv4ecHV/7HGmf/+ThoGQ2dhthdlVLKA+jJWG8SEATXTYGoLjDtRti9yu6KlFIeQIPe2wS3ghtnQPNwmHItHNhld0VKKZtp0HujsHbWBVVlxfDxGKgst7sipZSNNOi9VUw3GPFv2LkU5j1pdzVKKRtp0HuzHlfBueNg6Ruweobd1SilbKJB7+0ufBY69IeZ91nTGyulfI4GvbfzD4Rr34FmIdZInNIDdleklGpkGvS+IDTWCvu92+CLcTTYFHlKKY+kQe8rEgbAhc/A+lnw/at2V6OUakQa9L6k/73QfQR8+zRkL7a7GqVUI9Gg9yUiMGIiRCTBx2PhwG67K1JKNQINel8TFAqjPoDyEutiqqoKuytSSjUwDXpfFJMCI16FnT/CN3oxlVLeToPeV/W4Gs65G5a+rhdTKeXl6hT0IjJMRDaKyBYRebSW9WNEJF9EVjoft7usq3JZPtOdxavTdNGfIf5cmDke8jbYXY1SqoGcNOhFxB+YCFwCdAdGi0j3WjadZow5y/mY5LL8sMvy4e4pW7mFfyBcOxmatdCLqZTyYnVp0fcDthhjsowx5cBUYETDlqUaTVg7uGYy7M2CL+7Ri6mU8kJ1Cfr2wE6X5znOZTVdLSK/iMgMEYl3WR4sIpki8qOIXFHbG4jInc5tMvPz8+tevXKPxDS44E+wfib88G+7q1FKuZm7TsbOAhKMMb2AecC7Lus6GmNSgeuBl0WkU82djTFvGmNSjTGp0dHRbipJ1ct546Hb5TDvT5C9xO5qlFJuVJegzwVcW+hxzmVHGGMKjTFlzqeTgD4u63KdP7OABUDv06hXNRQRGPEaRCRa4+v1YiqlvEZdgn4ZkCwiiSLSDLgOOGb0jIi0dXk6HFjvXB4uIkHO36OAAcA6dxSuGkBwmPNiqmK9mEopL3LSoDfGVAL3AnOxAny6MWatiDwrItWjaPrbDvkAAA+rSURBVMaLyFoRWQWMB8Y4l3cDMp3L5wN/M8Zo0HuymG4w3Hkx1byn7K5GKeUGYjxslEVqaqrJzMy0uww15xH46T/WiJweV9ldjVLqJERkufN86G/olbGqdhc9B3H94It79WIqpZo4DXpVu4BmMPJd62Kq6TdB2UG7K1JKnSINenV8Ye3gmrehcIvVsvewbj6lVN1o0KsTS0yH8/8E6z6HH1+zuxql1CnQoFcnN+B+SLnMmtJ4+/d2V6OUqicNenVyInDFaxCeYI2vP7jH7oqUUvWgQa/qJriVdTFV2UHrNoR6MZVSTYYGvaq7Nt3h8n/Bju+tG4wrpZoEDXpVP72uhX53WrNcrv3M7mqUUnWgQa/q76Lnj15Mlb/R7mqUUiehQa/qL6AZXPsOBATDtJugrNjuipRSJ6BBr05Nq/bOi6k2w0y9mEopT6ZBr05d0iA4/ymrr/7H1+2uRil1HBr06vQMeMC6mGrek7D9B7urUUrVQoNenZ7qi6lad3ReTPWr3RUppWrQoFenL7gVjHofSotghl5MpZSn0aBX7tHmDBj+L9i+RC+mUsrDaNAr9+k1Evre7ryY6nO7q1FKOWnQK/e6+C/QPhW+uAfyN9ldjVIKDXrlbgFB1p2pAoJg2o16MZVSHkCDXrlfqziXi6nu04uplLJZgN0FKC+VNBiGPgH/exZaRkNsT2vKhICgoz8Dmx/7PKDGcxG7/wqlvIIGvWo4Ax6EPavhp/+c2v7+QRAYXOMAUfN5zQPGcQ4agc2hQ39oHe/ev1GpJkCDXjUcPz+4ZjJc8iJUlkJlGVQedv4sPbqswnWZ67rSY5fX3O5Q4fG3M1W/rad5BNz8ObQ9
s/E/C6VspEGvGpYIhEQ3/vtWVR57ADi4Bz6+Bd69HG78FOJSG78mpWyiJ2OVd/IPgKAQaBlpzbQZ1wfGfmW16t+7QuflUT5Fg175jtbxVtiHtYUProKsBXZXpFSj0KBXviWsLYyZDeGJMGUkbJ5nd0VKNTgNeuV7QmJgzJcQkwIfjYYNs+2uSKkGpUGvfFOLCLh5pjUCZ/rNsOZTuytSqsFo0Cvf1by1Ndwyrh98chusmmp3RUo1CA165duCQuHGGZCYDp/dBcvfsbsipdxOg16pZi1h9DRIvhBm3Q9L37S7IqXcqk5BLyLDRGSjiGwRkUdrWT9GRPJFZKXzcbvLultEZLPzcYs7i1fKbQKDYdQH1v1vv5oAS16xuyKl3OakV8aKiD8wEbgQyAGWichMY8y6GptOM8bcW2PfCOBPQCpggOXOffe5pXql3CkgCK59Bz77P5j3lHVV7aBH7K5KqdNWlykQ+gFbjDFZACIyFRgB1Az62lwMzDPG7HXuOw8YBnx0auUq1cD8A+Gq/1oTqs1/3po+YeiTOpOmatLq0nXTHtjp8jzHuaymq0XkFxGZISLVUwTWaV8RuVNEMkUkMz8/v46lK9VA/PxhxEToMwYW/QPmPq5z6qsmzV0nY2cBCcaYXsA84N367GyMedMYk2qMSY2OtmECLKVq8vODy16Gc+6CHyfC7IfB4bC7KqVOSV2CPhdwncQ7zrnsCGNMoTGmzPl0EtCnrvsq5bFEYNjfYMD9kPkWzLoPHLVMf6yUh6tL0C8DkkUkUUSaAdcBM103EJG2Lk+HA+udv88FLhKRcBEJBy5yLlOqaRCBC56BQY/Czx9YJ2qrKu2uSql6OenJWGNMpYjcixXQ/sDbxpi1IvIskGmMmQmMF5HhQCWwFxjj3HeviPwZ62AB8Gz1iVmlmgwRGPKYNSrnf89Yo3GufgsCmtldmVJ1IsbDTjKlpqaazMxMu8tQqnY/vAZzH4Muw+Dad63x90p5ABFZboyp9Y46emWsUvXRfxz87h+w6WuYOhrKD9ldkVInpUGvVH31vd0afrl1Pnw4EsqK7a5IqRPSoFfqVPS+0bqwavv31t2qSovsrkip49KgV+pU9boWrp0MuSvgvRFwSMcZKM+kQa/U6eg+wpoM7de18O5wKCmwuyKlfkODXqnT1XUYjJ4KhVtg8qVwcI/dFSl1DA16pdyh8/nWDUyKcmDyJdZPpTyEBr1S7pIwEG76zOq+mXwJ7Mu2uyKlAA16pdyrwzlw8xdQesDqxinYYndFSmnQK+V27c+GMV9aUyW8cynkbbC7IuXjNOiVagixPWHMbECssN+z2u6KlA/ToFeqocSkwNg5EBAM71wGucvtrkj5KA16pRpSZCcr7INbwXtXwI4f7a5I+SANeqUaWngCjP0KWkbD+1dBxouwc5nOa68aTV1uDq6UOl2t2lst++m3wHfPAc9Bs1Do2B8S0iAx3erX9/O3u1LlhTTolWosobFw21wozofsRdZjWwZs/sZaH9zKCv2ENEhMg+hu1r1rlTpNGvRKNbaQaOhxlfUAOLALshdbob8tAzZ8aS1vEWVdhJWYBomDILKzdbcrpepJg14pu4W1g14jrQfA/h2wzdnaz14E6z63lofEOkM/3Wr1hydo8Ks60VsJKuXJjIG9WUdDf9siKMmz1rXqYAV/dVdPqzh7a1W2OtGtBLVFr5QnE7GGaEZ2gtSxVvDnbzzav79xDqycYm0bkXT0xG5CGoS2sbd25TG0Ra9UU+ZwQN7ao10925dA2QFrXVRXK/SrW/0tIuytVTWoE7XoNeiV8iaOKti96mhXz/YfoKLEWtem59E+/o7nWaN8lNfQoFfKV1VVWLc6zM6wWv07l0JlKYgftDsbkgZbj/h+EBBkb62+rKoCtnxr3Wi+17Wn9BIa9EopS0Up5GZaLf6sBZCTCaYKAltYrfykIVbwtzlDR/Q0hj2rYeVHsHo6lORDbC+4a9EpvZQGvVKqdqUHrDH8
WQsgaz4UbLKWt4w+2tpPGmJd2avcozgfVn8Mqz60gt4v0Lod5Vk3QOcLwD/wlF5WR90opWoXHAYpl1oPgKJcZ+g7H6s/tpZHJkMnZ2s/YaD279dXZTlsngsrP7SuhHZUQrvecMmL0POaBj9Rri16pVTtjIG8dVbgb51vjeipOATiD+37WKHfaQi0T4WAZjYX64GMgd0rrXBfPQMO77Uueus1Es66HmK6ufXttOtGKXX6Kssh56ejwb9rBRgHBLa0WvlJg63gj07x7f79g3vgl+lWwOevB/8gSPmdFe5JQ8C/YTpSNOiVUu53eL+zf3++Ff6FzvvjhrQ52refNBjC2tpWYqOpKLUuXlv1kTV6xjggrq8V7mdcCc3DG7wE7aNXSrlf89bQ7TLrAbB/59G+/S3/g1+mWcujU44Gf8IACAq1p153M8YatbTqQ1jzCZQWQVh7GPCAFfBRyXZXeIS26JVS7ld9xe5WZ2t/+/dQeRj8Aqw+/eoTu+37nPIoE9sU5cIvU61hkYWbIaA5dLsczhptzTJq0z0FtOtGKWWvitIa/fs/A8YKydYdoHW89bNVjZ8hbTxjTv7yQ7BhtjWvUNYCwECH/lbLvfsV1uglm2nQK6U8y6G91hQNO3+C/dutqZn377RGprjyb2Z1hxw5EDgPCtUHgrB2DfeNwBjrHr+rPoQ1n0H5Qev9zxoNZ15nTSLnQU67j15EhgGvAP7AJGPM346z3dXADKCvMSZTRBKA9cBG5yY/GmPuql/5Simv0yICuo+wHq7KiqEoxwr+Imf4F+20nm/+For3HLu9+EFou2PD3/X3VnEQ2Lx+te3fAaumWidW92ZZo4q6j7Ba7x0HeMY3jHo6adCLiD8wEbgQyAGWichMY8y6GtuFAvcDS2u8xFZjzFluqlcp5c2CQiAmxXrUprLM5UCw89gDwY4frZOipurYfVrG1DgQVHcNOZcFh1kHmPUzrSGR2c4pCBLSIH0CdBtu1dWE1aVF3w/YYozJAhCRqcAIYF2N7f4M/D9gglsrVEqpagFBR+fnr01VJRzcXeNAsMN6vmc1bPwKqsqO3Se4NVSVWxeDhSfCkMeh1ygI79jwf08jqUvQtwd2ujzPAc5x3UBEzgbijTGzRaRm0CeKyM/AAeAJY8xvZuwRkTuBOwE6dOhQj/KVUsqFf4CzPz++9vUOh3WHriMHAOc3AgR6XA0dzvXKi71Oexy9iPgBLwFjalm9G+hgjCkUkT7A5yJyhjHmgOtGxpg3gTfBOhl7ujUppVSt/PwgNNZ6xPe1u5pGU5ezCrmA6+ExzrmsWijQA1ggItnAucBMEUk1xpQZYwoBjDHLga1AF3cUrpRSqm7qEvTLgGQRSRSRZsB1wMzqlcaYImNMlDEmwRiTAPwIDHeOuol2nsxFRJKAZCDL7X+FUkqp4zpp140xplJE7gXmYg2vfNsYs1ZEngUyjTEzT7B7OvCsiFQADuAuY8zeE2yvlFLKzfSCKaWU8gInumCq6Y38V0opVS8a9Eop5eU06JVSystp0CullJfzuJOxIpIPbD+Nl4gCCtxUTlOnn8Wx9PM4ln4eR3nDZ9HRGBNd2wqPC/rTJSKZxzvz7Gv0sziWfh7H0s/jKG//LLTrRimlvJwGvVJKeTlvDPo37S7Ag+hncSz9PI6ln8dRXv1ZeF0fvVJKqWN5Y4teKaWUCw16pZTycl4T9CIyTEQ2isgWEXnU7nrsJCLxIjJfRNaJyFoRud/umuwmIv4i8rOIfGl3LXYTkdYiMkNENojIehHpb3dNdhKRB53/TtaIyEciEmx3Te7mFUHvcgPzS4DuwGgR6W5vVbaqBB42xnTHuhHMPT7+eYB14/r1dhfhIV4BvjbGpABn4sOfi4i0B8YDqcaYHlhTsV9nb1Xu5xVBj8sNzI0x5UD1Dcx9kjFmtzFmhfP3g1j/kNvbW5V9RCQO+B0wye5a7CYirbDuE/EWgDGm3Biz396qbBcANBeRAKAFsMvmetzO
W4K+thuY+2ywuRKRBKA3sNTeSmz1MvAI1s1vfF0ikA9MdnZlTRKRlnYXZRdjTC7wd2AH1j2ui4wx39hblft5S9CrWohICPAJ8EDNG7L7ChG5DMhz3rNYWa3Xs4HXjTG9gRLAZ89piUg41rf/RKAd0FJEbrS3KvfzlqA/2Q3MfY6IBGKF/BRjzKd212OjAcBw543rpwJDReQDe0uyVQ6QY4yp/oY3Ayv4fdUFwDZjTL4xpgL4FDjP5prczluC/oQ3MPc1IiJYfbDrjTEv2V2PnYwxjxlj4pw3rr8O+M4Y43UttroyxuwBdopIV+ei84F1NpZktx3AuSLSwvnv5ny88OT0SW8O3hQc7wbmNpdlpwHATcBqEVnpXPZHY8wcG2tSnuM+YIqzUZQFjLW5HtsYY5aKyAxgBdZotZ/xwukQdAoEpZTyct7SdaOUUuo4NOiVUsrLadArpZSX06BXSikvp0GvlFJeToNeKaW8nAa9Ukp5uf8P55TflB+UlTEAAAAASUVORK5CYII=\n",
411 | "text/plain": [
412 | ""
413 | ]
414 | },
415 | "metadata": {
416 | "needs_background": "light",
417 | "tags": []
418 | },
419 | "output_type": "display_data"
420 | }
421 | ],
422 | "source": [
423 | "pd.DataFrame(h1.history)[['loss','val_loss']].plot(title=\"With mixup\")"
424 | ]
425 | },
426 | {
427 | "cell_type": "code",
428 | "execution_count": 10,
429 | "metadata": {
430 | "colab": {
431 | "base_uri": "https://localhost:8080/",
432 | "height": 476
433 | },
434 | "colab_type": "code",
435 | "id": "Iiv7ahP8WeAF",
436 | "outputId": "0ad04311-b497-4830-dd50-a832daf583ac"
437 | },
438 | "outputs": [
439 | {
440 | "name": "stdout",
441 | "output_type": "stream",
442 | "text": [
443 | "Build model...\n",
444 | "Model: \"sequential_1\"\n",
445 | "_________________________________________________________________\n",
446 | "Layer (type) Output Shape Param # \n",
447 | "=================================================================\n",
448 | "embedding_1 (Embedding) (None, 400, 50) 250000 \n",
449 | "_________________________________________________________________\n",
450 | "dropout_2 (Dropout) (None, 400, 50) 0 \n",
451 | "_________________________________________________________________\n",
452 | "conv1d_1 (Conv1D) (None, 398, 250) 37750 \n",
453 | "_________________________________________________________________\n",
454 | "global_max_pooling1d_1 (Glob (None, 250) 0 \n",
455 | "_________________________________________________________________\n",
456 | "dense_2 (Dense) (None, 250) 62750 \n",
457 | "_________________________________________________________________\n",
458 | "dropout_3 (Dropout) (None, 250) 0 \n",
459 | "_________________________________________________________________\n",
460 | "activation_2 (Activation) (None, 250) 0 \n",
461 | "_________________________________________________________________\n",
462 | "dense_3 (Dense) (None, 1) 251 \n",
463 | "_________________________________________________________________\n",
464 | "activation_3 (Activation) (None, 1) 0 \n",
465 | "=================================================================\n",
466 | "Total params: 350,751\n",
467 | "Trainable params: 350,751\n",
468 | "Non-trainable params: 0\n",
469 | "_________________________________________________________________\n"
470 | ]
471 | }
472 | ],
473 | "source": [
474 | "print('Build model...')\n",
475 | "model2 = Sequential()\n",
476 | "\n",
477 | "# we start off with an efficient embedding layer which maps\n",
478 | "# our vocab indices into embedding_dims dimensions\n",
479 | "model2.add(Embedding(max_features,\n",
480 | " embedding_dims,\n",
481 | " input_length=maxlen))\n",
482 | "model2.add(Dropout(0.2))\n",
483 | "\n",
484 | "# we add a Convolution1D, which will learn filters\n",
485 | "# word group filters of size filter_length:\n",
486 | "model2.add(Conv1D(filters,\n",
487 | " kernel_size,\n",
488 | " padding='valid',\n",
489 | " activation='relu',\n",
490 | " strides=1))\n",
491 | "# we use max pooling:\n",
492 | "model2.add(GlobalMaxPooling1D())\n",
493 | "\n",
494 | "# We add a vanilla hidden layer:\n",
495 | "model2.add(Dense(hidden_dims))\n",
496 | "model2.add(Dropout(0.2))\n",
497 | "model2.add(Activation('relu'))\n",
498 | "\n",
499 | "# We project onto a single unit output layer, and squash it with a sigmoid:\n",
500 | "model2.add(Dense(1))\n",
501 | "model2.add(Activation('sigmoid'))\n",
502 | "\n",
503 | "model2.compile(loss='binary_crossentropy',\n",
504 | " optimizer='adam',\n",
505 | " metrics=['accuracy'])\n",
506 | "model2.summary()"
507 | ]
508 | },
509 | {
510 | "cell_type": "code",
511 | "execution_count": 11,
512 | "metadata": {
513 | "colab": {
514 | "base_uri": "https://localhost:8080/",
515 | "height": 357
516 | },
517 | "colab_type": "code",
518 | "id": "ygNHmhGMWeAI",
519 | "outputId": "1592613d-52d2-409b-e210-cceddb7f5bbd"
520 | },
521 | "outputs": [
522 | {
523 | "name": "stdout",
524 | "output_type": "stream",
525 | "text": [
526 | "Epoch 1/10\n",
527 | "782/782 [==============================] - 8s 10ms/step - loss: 0.4057 - accuracy: 0.7964 - val_loss: 0.2819 - val_accuracy: 0.8825\n",
528 | "Epoch 2/10\n",
529 | "782/782 [==============================] - 8s 10ms/step - loss: 0.2260 - accuracy: 0.9100 - val_loss: 0.2540 - val_accuracy: 0.8957\n",
530 | "Epoch 3/10\n",
531 | "782/782 [==============================] - 8s 10ms/step - loss: 0.1579 - accuracy: 0.9409 - val_loss: 0.2806 - val_accuracy: 0.8874\n",
532 | "Epoch 4/10\n",
533 | "782/782 [==============================] - 8s 10ms/step - loss: 0.1056 - accuracy: 0.9625 - val_loss: 0.3103 - val_accuracy: 0.8897\n",
534 | "Epoch 5/10\n",
535 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0732 - accuracy: 0.9730 - val_loss: 0.3593 - val_accuracy: 0.8838\n",
536 | "Epoch 6/10\n",
537 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0539 - accuracy: 0.9808 - val_loss: 0.3938 - val_accuracy: 0.8884\n",
538 | "Epoch 7/10\n",
539 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0419 - accuracy: 0.9854 - val_loss: 0.4444 - val_accuracy: 0.8817\n",
540 | "Epoch 8/10\n",
541 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0340 - accuracy: 0.9876 - val_loss: 0.4842 - val_accuracy: 0.8870\n",
542 | "Epoch 9/10\n",
543 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0388 - accuracy: 0.9857 - val_loss: 0.4686 - val_accuracy: 0.8863\n",
544 | "Epoch 10/10\n",
545 | "782/782 [==============================] - 8s 10ms/step - loss: 0.0314 - accuracy: 0.9887 - val_loss: 0.6685 - val_accuracy: 0.8559\n"
546 | ]
547 | }
548 | ],
549 | "source": [
550 | "h2 = model2.fit(x_train, y_train,\n",
551 | " batch_size=batch_size,\n",
552 | " epochs=epochs,\n",
553 | " validation_data=(x_test, y_test))"
554 | ]
555 | },
556 | {
557 | "cell_type": "code",
558 | "execution_count": 12,
559 | "metadata": {
560 | "colab": {
561 | "base_uri": "https://localhost:8080/",
562 | "height": 298
563 | },
564 | "colab_type": "code",
565 | "id": "DzJEhaPrWeAM",
566 | "outputId": "aec6c655-c5f8-434b-bb16-d1e1056adc03"
567 | },
568 | "outputs": [
569 | {
570 | "data": {
571 | "text/plain": [
572 | ""
573 | ]
574 | },
575 | "execution_count": 12,
576 | "metadata": {
577 | "tags": []
578 | },
579 | "output_type": "execute_result"
580 | },
581 | {
582 | "data": {
583 | "image/png": "iVBORw0KGgoAAAANSUhEUgAAAXQAAAEICAYAAABPgw/pAAAABHNCSVQICAgIfAhkiAAAAAlwSFlzAAALEgAACxIB0t1+/AAAADh0RVh0U29mdHdhcmUAbWF0cGxvdGxpYiB2ZXJzaW9uMy4yLjEsIGh0dHA6Ly9tYXRwbG90bGliLm9yZy+j8jraAAAgAElEQVR4nO3deXxU5b3H8c8vk8m+sIYEEvYlbEU0LLIKiOIG1mrBrdWr0uu+9GptXapo21t7a9GKtRSt1qVI1SoqSpUdQSTsWwiQsCQsSYAkLNnz3D/OAENMwiTM5MxMfu/XKy/mLJzzm1G+8+Q553mOGGNQSikV+ELsLkAppZR3aKArpVSQ0EBXSqkgoYGulFJBQgNdKaWChAa6UkoFCQ105XdEZKSIbK9ne2cRMSIS2pR1Nca53otS3qSBrnxORH4pIl/UWLejjnVTjDHLjDG93NbvFpFLm6jWN0XkeW8dr+Z7UcqXNNBVU1gKDBMRB4CIJAFOYGCNdd1d+yqlGkEDXTWF1VgBfoFreSSwCNheY90uY8x+EblERHIARORtoCPwqYgcF5HH3I57s4jsFZECEXni1EoRCReR6SKy3/UzXUTCXdtuE5Hl7sW5um+6i8hU4GbgMde5Pq3tzbj2v8f1G8UxEXlORLqJyAoRKRaROSIS5trX/b10E5EjInKha7m9iOSLyCWu5bN+ExGRZ0TkHdfrU91MU13v6YCI/E/D/jOoYKeBrnzOGFMOrAJGuVaNApYBy2us+17r3BhzK7AXuMYYE2OMecFt8wigFzAOeFpEervWPwEMxfqyGAAMBp70oM6ZwLvAC65zXVPP7pcDF7nO8xgwE7gFSAH6ATfWcvxdwC+Ad0QkCvg78JYxZvG5anMzBugBXAb8oqm6olRg0EBXTWUJZ8J7JFagL6uxbkkDj/msMabEGLMB2IAV3mC1sqcZY/KMMfnAs8Ct51N8LV4wxhQbY7YAm4H/GGOyjDFFwBfAwNr+kjHmb8BOrC+4JKwvn4Z41hhzwhizCesL4XtfHKr50kBXTWUpMEJEWgFtjTE7gBVYfeutsFq1De0/P+j2+iQQ43rdHtjjtm2Pa503HXJ7XVLLcgx1+xvW+/2zMaasgefd5/baF+9LBTANdNVUVgLxwF3ANwDGmGJgv2vdfmNMdh1/t6FTgu4HOrktd3StAzgBRJ3aICKJ53muBhGRGGA68DrwjOvL7JSzagNq1gZWl84p7u9LKQ101TSMMSVAOvAIVlfLKctd6+prnR8CujbgdP8EnhSRtiLSBngaeMe1bQPQV0QuEJEI4JnzPFdDvQSkG2PuBD4HXnPbth6YIiJOEUkDrq/l7z8lIlEi0he4HXjfh7WqAKOBrprSEiABK8RPWeZaV1+g/w4roAs9vLPjeawvj43AJmCtax3GmExgGvA1sKNGLWC1nPu4zvWxB+fymIhMAiYAd7tWPQJcKCI3u5afAroBR7H6/d+r5TBLsPrgFwD/Z4z5jzdrVIFN9AEXSvk/EekMZANOY0ylvdUof6UtdKWUChIeBbqITBCR7SKyU0Qer2X7n0RkvesnU0QKvV+qUkqp+pyzy8U1NDsTGA/kYI36u9EYs7WO/e8HBhpj/svLtSqllKqHJy30wcBO16CJcmA2MKme/W/EustAKaVUE/Jk+tEOnD2YIQcYUtuOItIJ6AIsrGP7VGAqQHR09EWpqakNKlYppZq7NWvWFBhj2ta2zdvzSU8BPjDGVNW20TVXxkyAtLQ0k56e7uXTK6VUcBORPXVt86TLJZezR6clu9bVZgra3aKUUrbwJNBXAz1EpItrStApwNyaO4lIKtASa4i3UkqpJnbOQHcNYrgPmA9sA+YYY7aIyDQRmei26xRgttGRSkopZQuP+tCNMfOAeTXWPV1j+ZnzLaaiooKcnBxKS0vP91BBLSIiguTkZJxOp92lKKX8iF89ZDcnJ4fY2Fg6
d+6MiNhdjl8yxnD48GFycnLo0qWL3eUopfyIXw39Ly0tpXXr1hrm9RARWrdurb/FKKW+x68CHdAw94B+Rkqp2vhdoCulVNCqrob5T0Behk8Or4FeQ0xMfU8OU0qp87D137DyFdi/1ieH10BXSqmmUFkGXz8LCX3hB5N9cgoN9DoYY3j00Ufp168f/fv35/33rSd9HThwgFGjRnHBBRfQr18/li1bRlVVFbfddtvpff/0pz/ZXL1Syu+kvwGFe2D8NAhx+OQUfnXbortnP93C1v3FXj1mn/Zx/Pqavh7t+9FHH7F+/Xo2bNhAQUEBgwYNYtSoUbz33ntcfvnlPPHEE1RVVXHy5EnWr19Pbm4umzdvBqCwUKeDV0q5KS2CJS9Al1HQfZzPTqMt9DosX76cG2+8EYfDQbt27Rg9ejSrV69m0KBB/P3vf+eZZ55h06ZNxMbG0rVrV7Kysrj//vv58ssviYuLs7t8pZQ/WT4dSo5YrXMf3qXmty10T1vSTW3UqFEsXbqUzz//nNtuu41HHnmEn/zkJ2zYsIH58+fz2muvMWfOHN544w27S1VK+YOiXPj2Veh/A7Qf6NNTaQu9DiNHjuT999+nqqqK/Px8li5dyuDBg9mzZw/t2rXjrrvu4s4772Tt2rUUFBRQXV3Nj370I55//nnWrvXNFWylVABa/Fsw1TD2SZ+fym9b6Hb74Q9/yMqVKxkwYAAiwgsvvEBiYiJvvfUWf/jDH3A6ncTExPCPf/yD3Nxcbr/9dqqrqwH43e9+Z3P1Sim/cGgrrH8PhtwNLTv7/HTnfKaor9T2gItt27bRu3dvW+oJNPpZKRUA3v0x7P0WHlwPUa28ckgRWWOMSattm3a5KKWUL2Qvgx3zYeTDXgvzc9FAV0opb6uuhq+egrgOMOS/m+y02oeulFLetvXfsH8dTHoVnJFNdlptoSullDdVlsOCadYQ/wFTmvTU2kJXSilvSn8Dju6Gmz/w2RD/umgLXSmlvKW0CJaeGuJ/aZOfXgNdKaW85ZuX4ORhnw/xr4sG+nmob+703bt3069fvyasRillq+L9sPJV6He9z4f410UDXSmlvGHRb6G6EsY9ZVsJHl0UFZEJwEuAA5hljPnfWvb5MfAMYIANxpibzquyLx6Hg5vO6xDfk9gfrvhe6ac9/vjjpKSkcO+99wLwzDPPEBoayqJFizh69CgVFRU8//zzTJo0qUGnLS0t5e677yY9PZ3Q0FBefPFFxowZw5YtW7j99tspLy+nurqaDz/8kPbt2/PjH/+YnJwcqqqqeOqpp5g82TeT4SulvCRvG6x/17rnvAmG+NflnIEuIg5gBjAeyAFWi8hcY8xWt316AL8EhhtjjopIgq8K9qXJkyfz0EMPnQ70OXPmMH/+fB544AHi4uIoKChg6NChTJw4sUEPap4xYwYiwqZNm8jIyOCyyy4jMzOT1157jQcffJCbb76Z8vJyqqqqmDdvHu3bt+fzzz8HoKioyCfvVSnlRV8/A2GxMOpRW8vwpIU+GNhpjMkCEJHZwCRgq9s+dwEzjDFHAYwxeeddWT0taV8ZOHAgeXl57N+/n/z8fFq2bEliYiIPP/wwS5cuJSQkhNzcXA4dOkRiYqLHx12+fDn3338/AKmpqXTq1InMzEwuvvhifvOb35CTk8N1111Hjx496N+/Pz//+c/5xS9+wdVXX83IkSN99XaVUt6wezlkfgnjft1kQ/zr4kkfegdgn9tyjmudu55ATxH5RkS+dXXRBKQbbriBDz74gPfff5/Jkyfz7rvvkp+fz5o1a1i/fj3t2rWjtLTUK+e66aabmDt3LpGRkVx55ZUsXLiQnj17snbtWvr378+TTz7JtGnTvHIupZQPGAP/cQ3xH3q33dV4bWBRKNADuARIBpaKSH9jzFnPYhORqcBUgI4dO3rp1N41efJk7rrrLgoKCliyZAlz5swhISEBp9PJokWL2LNnT4OPOXLkSN59913Gjh1LZmYme/fu
pVevXmRlZdG1a1ceeOAB9u7dy8aNG0lNTaVVq1bccssttGjRglmzZvngXSqlvGLLv2H/Wpg0o0mH+NfFk0DPBVLclpNd69zlAKuMMRVAtohkYgX8avedjDEzgZlgTZ/b2KJ9qW/fvhw7dowOHTqQlJTEzTffzDXXXEP//v1JS0sjNTW1wce85557uPvuu+nfvz+hoaG8+eabhIeHM2fOHN5++22cTieJiYn86le/YvXq1Tz66KOEhITgdDr5y1/+4oN3qZQ6b6eH+PeBATfaXQ3gwXzoIhIKZALjsIJ8NXCTMWaL2z4TgBuNMT8VkTbAOuACY8zhuo6r86GfH/2slLLZqr/CF4/BTf+Cnpc12WnPaz50Y0wlcB8wH9gGzDHGbBGRaSIy0bXbfOCwiGwFFgGP1hfmSikV0EqLYcnvofNI6DHe7mpO86gP3RgzD5hXY93Tbq8N8Ijrp1nZtGkTt95661nrwsPDWbVqlU0VKaV8zuYh/nXxu9kWjTENusfbbv3792f9+vVNek67HhuolAKKD8DKGdDvR9DhQrurOYtfDf2PiIjg8OHDGlj1MMZw+PBhIiIi7C5FqeZpsWuI/1j7hvjXxa9a6MnJyeTk5JCfn293KX4tIiKC5ORku8tQqvnJy4B178Dgn0GrLnZX8z1+FehOp5MuXfzvQ1JKKcA1xD/G9iH+dfGrLhellPJbu7+BzC9gxEMQ3druamqlga6UUudiDHz1FMS2hyH2D/Gvi191uSillF/a+jHkrrGG+IdF2V1NnbSFrpRS9fHDIf510Ra6UkrVZ82bcCTLGuIf4rC7mnppC10pperip0P866KBrpRSdVnxMpwsgPHP+tUQ/7pooCulVG1ODfHvex10uMjuajyiga6UUrVZ/DuoqoBx/jfEvy4a6EopVVNeBqx7GwbdCa262l2NxzTQlVKqpgXP+vUQ/7pooCullLs9K2D7PL8e4l8XDXSllDrFGPiP/w/xr4sOLFJKqVO2fgK56TDxFb8e4l8XbaErpRRYd7QseBba9oYLbrK7mkbRFrpSSoHbEP85fj/Evy7aQldKqbJjsPh/XUP8L7O7mkbTQFdKqW8Ca4h/XTwKdBGZICLbRWSniDxey/bbRCRfRNa7fu70fqlKKeUDxw7CylcCaoh/Xc7Zhy4iDmAGMB7IAVaLyFxjzNYau75vjLnPBzUqpZTvBOAQ/7p40kIfDOw0xmQZY8qB2cAk35allFJNIH87rP0HDLojoIb418WTQO8A7HNbznGtq+lHIrJRRD4QkRSvVKeUUr709bPgjA64If518dZF0U+BzsaYHwBfAW/VtpOITBWRdBFJz8/P99KplVKqEfashO2fu4b4t7G7Gq/wJNBzAfcWd7Jr3WnGmMPGmDLX4iyg1isLxpiZxpg0Y0xa27ZtG1OvUkqdP2Pgq6cgNgmG3mN3NV7jSaCvBnqISBcRCQOmAHPddxCRJLfFicA275WolFJetm0u5KyGMb8KyCH+dTnnXS7GmEoRuQ+YDziAN4wxW0RkGpBujJkLPCAiE4FK4Ahwmw9rVkoFkrwM6x7vVl0hJhFCbB7+UlVh9Z237Q0DAnOIf108GvpvjJkHzKux7mm3178Efund0pRSAevYIdj0L9g4Gw5uOrM+NNIK9lZdoHU3aNXNWm7drenCfs2bcGQX3Pg+OIJr9pPgejdKKfuUn7TmEd/wT9i1EEw1tB8IE34PbXta86QczrL+LMiEHf+BqvIzf/9U2Lfu6gp9t7CPTfLOCM5TQ/w7jYCel5//8fyMBrpSqvGqq2HPctgwG7bOhfJjEJcMwx+CAVOgba8z+3YbW+PvVkFRjtVadg/7/O2QOf/cYd/aFfgNCfsVf3YN8Z8W0EP866KBrpRquPztVohvnAPFORAWC30mwYDJVuvXk66TEAe07GT9eBT2u6zzbv8SqivO7OuMgpZdXGHv1qqvGfbHDlqB3veHkBzYQ/zrooGulPLM8XzY/KHVL75/HUgI
dBtnTWjV60rv3i1yzrDf5wr6XXAku/6wP9Vnf+Kwa4j/0wQrDXSlVN0qSq1+8Y3vw46vwFRB4g/g8t9Cv+shtl3T1xTigJadrR9Pwz4vAwr3wMX3BMUQ/7pooCulzlZdDXtXWi3xLZ9AWZHVdTHsPvjBFGjXx+4K61Zf2BsTlP3m7jTQlVKWgp1WiG98Hwr3WnOc9JkIP5gMXUYF7FN8TgvyMAcNdKWat5NHrH7xDbOthyNLCHS9BMY8Cb2vhrBouytUDaCBrlRzU1lm3Ra4YbZ1L3h1BST0hfHPQf8bIC7p3MdQfkkDXanmwBjY953VpbL5IygthJh2MORn1v3iif3trlB5gQa6UsHsSJZ1r/iG2XA02xqg0/tqK8S7XBJ0Q9+bO/2vqVSwMQayl8Dy6ZC1CBDoMhJGPwa9r4HwWLsrVD6iga5UsKiugq2fwDcvwYH1VpfK2CdhwI0Qn2x3daoJaKArFegqSmD9e9aw9qPZ1vD3a16y7hl3RthdnWpCGuhKBaqSo7D6dVj1GpzIhw4XWZNOpV4V+PeMq0bRQFcq0BTlwrevWvN6lx+H7pdasxt2HtEsBs+oummgKxUo8jJgxcvWXSumGvpdB8Mf1FsO1Wka6Er5u73fWnesZH5h3XaY9l9w8b3WTIRKudFAV8ofVVfDjvlWkO/7FiJbwujHYfBUiG5td3XKT2mgK+VPKsutZ3GueBnyMyC+I1zxAgy8RedVUeekga6UPyg7Bmvesi52FudCu35w3d+sp+s4nHZXpwKEBrpSdjqeZ912uHoWlBZB55FwzcvQfZzesaIazKNAF5EJwEuAA5hljPnfOvb7EfABMMgYk+61KmsoOllBfJS2WlQAO5JlDQRa9671MOTeV1u3Hian2V2ZCmDnDHQRcQAzgPFADrBaROYaY7bW2C8WeBBY5YtCT5m1LItXF+9iwSOjaRkd5stTKeV9+9dZFzq3zYWQUGtY/rD7oU0PuytTQcCDR3MzGNhpjMkyxpQDs4FJtez3HPB7oNSL9X3PiB5tKCqp4IX5Gb48jVLeYwzsWghvTYSZl1ivhz0AD22CiS9rmCuv8aTLpQOwz205BxjivoOIXAikGGM+F5FHvVjf96QmxnH7sM7MWp7NDWkpXNixpS9Pp1TjVVXC1o+tybIOboSYRLj0WUi7HSLi7a5OBSFPWuj1EpEQ4EXg5x7sO1VE0kUkPT8/v9HnfGh8T9rFhfPUx5upqjaNPo5SPlFSCN/9DV65CD68w5o8a+Kf4aGNMOIhDXPlM5600HOBFLflZNe6U2KBfsBisa7KJwJzRWRizQujxpiZwEyAtLS0RidxTHgoT13dh/veW8c73+7hp8M6N/ZQSnnHsYOQ8TlkfAbZS6G6EjqkwWXPQ6+rIOS8205KnZMngb4a6CEiXbCCfApw06mNxpgioM2pZRFZDPyPL+9yAbiqfxKzu+/j/+Zv54r+iSTE6jShqokdyYJtn1khvu87wECrrjD0HugzyZr9UG89VE3onIFujKkUkfuA+Vi3Lb5hjNkiItOAdGPMXF8XWRsRYdqkvkyYvozfzcvgT5MvsKMM1ZwYA4c2WyG+7VPI22KtT+wPl/zSehpQQm8NcWUbj+5DN8bMA+bVWPd0Hftecv5leaZr2ximjurKK4t2MnlQCkO76hwXysuqq6zWd4YrxAv3AAIdL4bLf2vNPd6ys91VKgUEwUjRe8d059/rcnnq483Me3AkTof2VarzVFlu9YNnfAoZ8+BEHoQ4oeslMPIR6HUlxCTYXaVS3xPwgR4Z5uCZiX256x/pvLE8m5+N7mZ3SSoQlR2HnV9brfAd/4GyYnBGQ4/xVldKj8sgIs7uKpWqV8AHOsD4Pu24tHcCLy3YwcQL2pMUH2l3SSoQnDhszTG+7TNrsE9VGUS1hj4TIfUaq0Wuz+RUASQoAh3g19f05dIXl/DcZ1t59eaL7C5H+auiHOv2wm2fwp4VYKogLtka7JN6tdU37giafxaq
mQma/3NTWkVx35ju/PGrTJZk5jO6Z1u7S1L+Ij/T6g/f9qk1lwpAm14w4mFrUqykC/TOFBUUgibQAaaO7spH63L59Seb+fKhUUQ49cnnzZIxsH/tmXvECzKt9R0ugnG/tvrEdf4UFYSCKtDDQx1Mm9SXW1//jplLs3hgnP6jbVYKdsDq162WeHEOiAM6D4dBd1m3F8Z3sLtCpXwqqAIdYGSPtlzVP4kZi3Zy7QUd6Ng6yu6SlC8ZA7uXw8pXIPNLcIRbD4cY+wT0nABRreyuUKkmE3SBDvDU1X1YvD2PZz7dwus/TUO0fzT4VFXAlo9h5Z/hwAbr7pTRj8OgOyFGr5+o5ikoAz0xPoKHLu3Jb+Zt46uth7isb6LdJSlvKS2ynr256q9Wt0rrHnD1dBgwBZx6u6pq3oIy0AFuG96Zf63Zx7OfbmVEjzZEhQXtW20eCvfCt6/B2n9A+THr2ZtX/dEa8KMzGSoFeGE+dH/ldITw3KR+5BaW8MrCnXaXoxordy188F/w0gXWw5R7TYCpi+G2z6zXGuZKnRbUzdYhXVtz3YUd+NuyLK67MJnuCTF2l6Q8UV1tXeBc+Qrs+QbC4+Die2DIf0N8st3VKeW3gjrQAX55RW++2nqIpz/ZzLt3DtELpP6s/CRs+Cd8+yoc3gnxKXDZb+DCn+g8Kkp5IOgDvW1sOI9e3ounP9nCpxsPMHFAe7tLUjUdz7Me2bZ6FpQcgfYD4UevQ59rdRi+Ug3QLP613DykE3PS9/H8Z1sZ06stsRFOu0tSAHkZVrfKxjlQVQ69roCL74NOw3QovlKN0CyuKDlChOev7U/+8TKmf73D7nKaN2MgazG8cz28OgQ2/QsG3gz3pcON/7RGdmqYK9UozaKFDnBBSgumDOrImyt2c/1FyfRO0j7ZJlVZDls+slrkBzdBdFsY8wSk3QHR+qQppbyhWbTQT3ns8l7ERzp56uPNVFcbu8tpHkoKYfl0eGkA/Ptn1gjPiX+GhzbD6Mc0zJXyombTQgdoGR3G4xNSeezDjXy4Nocb0lLsLil4Hd1tDQRa9zaUH4cuo2Hiy9BtnN47rpSPNKtAB7j+omRmr97L777IYHyfdrSICrO7pOCSkw4r/gzb5oKEQL/r4eJ7IekHdlemVNBrdk2lkBDhuWv7UXiynD/M3253OcGhtAg2fwSvXw6zxsGuRTDsfnhwI1z3Vw1zpZqIRy10EZkAvAQ4gFnGmP+tsf2/gXuBKuA4MNUYs9XLtXpN3/bx/HRYZ95csZsfp6UwIKWF3SUFlqpK6wESuxZaPznp1qPcWnSCCb+HgbdAuI7KVaqpiTH1XxwUEQeQCYwHcoDVwI3ugS0iccaYYtfricA9xpgJ9R03LS3NpKenn2f5jVdcWsG4Py4hMS6Cj+8djiNEb5Wr15HsMwGevQzKigCBDhdCt7HQdQykDNGBQEr5mIisMcak1bbNk399g4Gdxpgs18FmA5OA04F+KsxdogG/v4UkLsLJk1f15sHZ63nvu73cOrST3SX5l9IiyF7qCvFFcDTbWh+fAn2vhW5jrAud+gAJpfyGJ4HeAdjntpwDDKm5k4jcCzwChAFjvVKdj00c0J7Z3+3jD19mcEW/RNrEhNtdkn2qKiF3DWQtOrsbJSzGmqp26D1WS7x1Nx34o5Sf8trvx8aYGcAMEbkJeBL4ac19RGQqMBWgY8eO3jp1o4kIz13blyteWsbv5mXwxx8PsLukplVfN8rIR6wATx4EDp0qQalA4Emg5wLuN2wnu9bVZTbwl9o2GGNmAjPB6kP3sEaf6p4Qyx0juvLakl1MGZzCoM5B3IVQUgi7l50J8aO7rfWnu1HGQpdR2o2iVIDyJNBXAz1EpAtWkE8BbnLfQUR6GGNOTZJyFRBQE6Y8MK47c9fn8tTHm/ns/hGEOoLkbs5T3SinAjw3HUy11Y3SZRQMvVe7UZQKIucMdGNMpYjcB8zHum3xDWPMFhGZBqQbY+YC94nIpUAFcJRa
ulv8WVRYKE9f05f/fmcNb67YzZ0ju9pdUuMdyTpzITN7KZQVWwN82g+EkT/XbhSlgtg5b1v0lUbftrhnBWTOt1qYHYdCWLRX6jHGcPubq1mdfYQFP7+ExPgIrxzX5+rsRulo3Ymi3ShKBZXzvW3Rv+xfb83Y9810CHFCcpoVWJ1HWi1PZ+OCWER4dmJfxv9pKc9/vpVXbrrQy4V7yYkC67Fse1ZYfx7cDJgz3SgX32eFeKuu2o2iVDMTeC10gLLjsO9bq0shexkcWG/1DYdGQMpgK9i6jLa6GRrYtTD960ymf72Dd+4YwogebRpXnzcV7z8T3ru/gQLXdAWhkdZ77TQcOo+wXms3ilJBr74WemAGek0lhbB3pSvgl8Khzdb6sBjoeDF0GWmFfOIPIMRR76FKK6q4fPpSHCJ88dBIwkPr39+rjIHCPVaA7/7GCvFTA3rCYq0upk7DrABPugBCdWIxpZqb4A/0mk4ctvqVdy+zAr4g01ofEQ+dRrha8COhbe9ap3JdtD2P2/++mkcv78W9Y7r7pkawAvzwTti93NUKXwHFOda2yJZW67vTMOunXX8dVq+UCrI+dE9Et7buq+57rbVcfMAKzewlVshv/9xaH9XGCvbOI60uGtfte2N6JTChbyJ/XriDiQPak9Iqyjt1VVdD3lZXeLtC/ES+q+YE6/FrnR6ygrxtqs4brpRqkOBsoZ/L0T2u1rurBX9sv7U+NsnVeh/FoVaDuGRWNsO7t2HWT2v9Mjy3qko4uNHtIuYKKC20tsWnuLXAh+u94EopjzS/LpeGMMa6dzt7yZmAP1kAQHFEB7443oO+w6+m3/CrIS6p/mNVllvTyp66gLlvlfW0HoBW3c6Ed+fh0ML+qQ+UUoFHA70hjIG8bbB7GdVZSzixfTGxnLC2te5xpv+980hwRlmjL/essLp0clZDZam1b9veri6UYdBx2Lm/DJRSygMa6Odh5Y48nn9jDo/1PMTosAwrvE+1ukNCobrSGomZ2P9MF0rHYfrwY6WUTzS/i6JedHGPBLoPGM5dmw4y/+HH6dIyzBrclL3ECvaOw6DjEOsOGqWUspEGugeeuLI3C7fl8fQnm/nHfw1GUgZByiC7y1JKqbPofXEeSIiL4JHLehgNMIgAABBVSURBVLJsRwFfbD5odzlKKVUrDXQP3Tq0E32S4pj26VaOl1XaXY5SSn2PBrqHQh0hPHdtPw4Wl/LygoCa7l0p1UxooDfARZ1aMjkthTeWZ7P94DG7y1FKqbNooDfQL65IJSYilKc+2Yxdt3wqpVRtNNAbqFV0GI9dnsp32Uf497r6Hq2qlFJNSwO9EaYMSuGClBY8/tEmXl6wg7LKKrtLUkopDfTGCAkR/vaTNMb3aceLX2Vy5UvLWJV12O6ylFLNnAZ6I7WNDWfGTRfy99sHUVZZzeSZ3/LYBxs4eqLc7tKUUs2UBvp5GtMrga8eHs3PRnflw7W5jHtxCR+tzdELpkqpJqeB7gWRYQ5+eUVvPrt/BB1bRfHInA3cPGsV2QUn7C5NKdWMeBToIjJBRLaLyE4RebyW7Y+IyFYR2SgiC0Skk/dL9X+9k+L46O5hPH9tPzblFnH59KV60VQp1WTOGegi4gBmAFcAfYAbRaRPjd3WAWnGmB8AHwAveLvQQBESItwytBMLHhmtF02VUk3Kkxb6YGCnMSbLGFMOzAYmue9gjFlkjDnpWvwWSPZumYEnIS5CL5oqpZqUJ4HeAdjntpzjWleXO4AvzqeoYKIXTZVSTcWrF0VF5BYgDfhDHduniki6iKTn5+d789R+TS+aKqWagieBngukuC0nu9adRUQuBZ4AJhpjymo7kDFmpjEmzRiT1rZt28bUG9BOXTR97tp+bMrRi6ZKKe/yJNBXAz1EpIuIhAFTgLnuO4jIQOCvWGGe5/0yg0dIiHDr0E4s+LleNFVKedc5A90YUwncB8wHtgFzjDFbRGSaiEx07fYHIAb4l4isF5G5dRxOuehF
U6WUt4ldF+fS0tJMenq6Lef2NyXlVUxfkMmsZdnERzp58qre/HBgB0TE7tKUUn5GRNYYY9Jq26YjRf1AXRdNs/KP212aUiqAaKD7kZoXTSe8tEwvmiqlPKaB7mf0oqlSqrE00P3U6Yumtw2itEIvmiqlzk0D3c+NSU3gq0dG6UhTpdQ5aaAHgKiwUL1oqpQ6Jw30AKIXTZVS9dFADzC1XTQd+39L+MfK3ZRWaLAr1ZzpwKIAtzQzn+lfZ7J2byFtYsK5a2QXbh7aiZjwULtLU0r5QH0DizTQg4AxhpVZh3l10S6W7ywgPtLJ7cM7c9uwzrSICrO7PKWUF2mgNyPr9h5lxqJdfL3tENFhDm4Z2ok7RnYhITbC7tKUUl6ggd4MbTtQzKuLd/H5xv04HSFMHpTC1FFdSW4ZZXdpSqnzoIHejGUXnOC1xbv4aF0OxsC1Aztw9yXd6NY2xu7SlFKNoIGu2F9YwsylWfzzu72UV1VzZf8k7r2kO33ax9ldmlKqATTQ1WkFx8t4fXk2b6/cw/GySsamJnDvmO5c1Kml3aUppTygga6+p+hkBW+t3M0b32RTeLKCi7u25r6x3RnWrbXOw66UH9NAV3U6UVbJP7/by8ylWeQdK+OClBbcO6Y7l/ZO0GBXyg9poKtzKq2o4oM1Oby2ZBc5R0tITYzlnjHduap/Eo4QDXal/IUGuvJYRVU1c9fv59XFO9mVf4IubaK5e3Q3rh3YgbBQnSlCKbtpoKsGq642zN9ykFcW7WTL/mLax0cwdVRXpgzuSITTYXd5SjVbGuiq0YwxLM7MZ8bCnaTvOUqbmDDuGNGVW4Z2JDbCaXd5SjU7GujKK1ZlHeaVRTtZtqOAuIhQbhvWmduHd6FltM4Xo1RTqS/QPeoUFZEJIrJdRHaKyOO1bB8lImtFpFJErj/fgpV/GtK1NW/fMYRP7h3O0K6teXnhTob/fiG/+Xwrh4pL7S5PqWbvnC10EXEAmcB4IAdYDdxojNnqtk9nIA74H2CuMeaDc51YW+iBb/vBY/xl8U7mbthPaEgIY1MTGNs7gTG9EmgbG253eUoFpfpa6J5Mmj0Y2GmMyXIdbDYwCTgd6MaY3a5t1eddrQoYvRJjmT5lIA+P78kby7OZv+UQX245CMCAlBaMS01gXO8E+iTF6T3tSjUBTwK9A7DPbTkHGOKbclQg6tQ6mmcn9eOZiX3ZeqCYhdvyWJCRx5++zuTFrzJJjItgbO8ExqUmMKxbGyLD9C4ZpXyhSR9rIyJTgakAHTt2bMpTqyYgIvRtH0/f9vHcP64H+cfKWLw9jwXb8vhkXS7vrdpLeGgIw7u3sbpnUhNo3yLS7rKVChqeBHoukOK2nOxa12DGmJnATLD60BtzDBU42saGc0NaCjekpVBWWcV32UdYsC2PBRmHWJiRB0CfpDjG9bbCfUByC0J0VKpSjebJRdFQrIui47CCfDVwkzFmSy37vgl8phdFVX2MMezKP+4K9zzW7DlKVbWhTUwYl/SyumZG9Gij97krVYvzvg9dRK4EpgMO4A1jzG9EZBqQboyZKyKDgH8DLYFS4KAxpm99x9RAV6cUnixnSWY+CzPyWLw9n6KSCpwOYUiX1ox1XVjt1Dra7jKV8gs6sEgFjMqqatbsOcrCDKv1vjPvOADd2kYzrnc7xqUmcFGnloQ6dF4Z1TxpoKuAtefwCRZm5LEwI49vsw5TUWWIiwi1umZ6JzC6Z1taROlIVdV8aKCroHC8rJLlO/JZsC2PRdvzKDheTohAWqdWp2+L7J4Qo/e8q6Cmga6CTnW1YUNOodU1sy2PrQeKAUhuGcmAlBaktoslNSmO1MRYOrSI1LtnVNDQQFdB70BRCQsz8liamc+2A8fYe+Tk6W3RYQ56JcbSKzGO3kmx9GoXS2piHPFReheNCjwa6KrZOV5WSeahY2w/eIyMA8VkHDxGxsFjFJVUnN4nKT6CXolWuKcmxtIrMZZubWP0QR7Kr53vXC5KBZyY
8FAu7NiSCzu2PL3OGMOh4jIyDloBv90V8t/szKKiymrYhIYI3drGWEGfFEuqK/CT4iO0b175PQ101WyICInxESTGR3BJr4TT6yuqqskuOMG2A8VsdwX9mj1Hmbth/+l9YiNCT4d7r8RYeifF0rNdrA5+Un5FA101e05HCD3bWQHtrqikgsxDx1yt+WIyDhzj43W5HCurPL1PhxaRVr+8W9dNlzbRep+8soUGulJ1iI90MqhzKwZ1bnV6nTGG/UWlZ/XLbz9YzOLt+VRWW902YY4QuraNpkOLSBLjI0iKjyAxPpL2rt8OkuIjdcZJ5RMa6Eo1gIjQoUUkHVpEMq53u9Pryyqr2JV3gu2HrJb8zrzjHCgqZd2+Qo6cKP/eceIjnSS5hX2SK+zbx5/5EogO13+eqmH0/xilvCA81EGf9nH0aR8HA8/eVlpRxcGiUg4UlXKwuIQDRaUcKDyzvCm3iILj3w/92IhQV+ifCfyay9qHr9xpoCvlYxFOB53bRNO5Td0TjJVVVnGoqIwDRSUcLC51hX6JK/RL2XqgmILjZdS8yzgmPNQt6L/f2k+IDScu0olDB1Y1CxroSvmB8FAHHVtH0bF1VJ37lFdWc6i49HTgHyxya+0Xl5J5KJ+8Y98PfRGri6dFpJP4qDBaRlmvW0SF0SLKSUvXny2iwmgRaS3HRzmJiwgN+Fs1jTGUVVZTUl7FyYoqwkNDiI904gzSi9Ya6EoFiLDQEFJaRZHSqu7Qr6iqJv+Y1dI/UFRKXnEZhSUVFJ0s5+jJCgpLKjhyopys/BMcPVnOsdLKOo/lCBHri8D1BXAq6Fu6gr9F9JkvAOsLwfpSiA5zNPiLoPx06FZyoqzKel1eyckK6/WJskpKKqo4We76KTuz7WR55Zn15VWUnLVcSXUtYyejwxzERzqJc72/+Ein672GnVkf6b7e+jM2wr9/29FAVyqIOB0htG8R6fGj/SqrqikqsYK+8GQ5hScrrOB3vS4ssb4Iik5WcLC4lIyDxyg8Wc6J8qp6ahDiI0+1/p3ER4bhdMjpgLVCt+qs5craUrcekU4HUWEOIsMcRIeFEhlmLbeMchIZFkqU00FUuLUuKiz09P7lVdUUnqyw3rPrz+KSCrILTlBUYi2XVtT9rHsRiA0PJT7KSYvIsNOBH+/+peD2RXBqfWO/6BpKA12pZizUEULrmHBax4Q36O+VV7q+CE6WU1hSwdET5d/7UigqKefoiQpyjp6kqtqcDuDEuIjTARzlCuPoMIcVxK71kU4H0eFngjrK6dov3EFEqMOnk62VVlRR7Ap367cbt9euL4DCk+WnvwD2F5W41lXU+8UUGiKnW/4Pje/JxAHtvV67BrpSqsHCQkNoGxtO29iGfREEggingwing4S4iAb9PWMMJ8urzmr9Wz9nwv/U+pY+mhhOA10ppbxARIgODyU6PNTjLi9vC85LvUop1QxpoCulVJDQQFdKqSChga6UUkHCo0AXkQkisl1EdorI47VsDxeR913bV4lIZ28XqpRSqn7nDHQRcQAzgCuAPsCNItKnxm53AEeNMd2BPwG/93ahSiml6udJC30wsNMYk2WMKQdmA5Nq7DMJeMv1+gNgnAT6JBBKKRVgPAn0DsA+t+Uc17pa9zHGVAJFQGtvFKiUUsozTTqwSESmAlNdi8dFZHsjD9UGKPBOVUFBP4+z6edxhn4WZwuGz6NTXRs8CfRcIMVtOdm1rrZ9ckQkFIgHDtc8kDFmJjDTg3PWS0TSjTFp53ucYKGfx9n08zhDP4uzBfvn4UmXy2qgh4h0EZEwYAowt8Y+c4Gful5fDyw0puaszEoppXzpnC10Y0yliNwHzAccwBvGmC0iMg1IN8bMBV4H3haRncARrNBXSinVhDzqQzfGzAPm1Vj3tNvrUuAG75ZWr/Putgky+nmcTT+PM/SzOFtQfx6iPSNKKRUcdOi/UkoFCQ10pZQKEgEX6OeaV6a5EJEUEVkkIltFZIuIPGh3Tf5A
RBwisk5EPrO7FruJSAsR+UBEMkRkm4hcbHdNdhGRh13/TjaLyD9FpGGPIwoQARXoHs4r01xUAj83xvQBhgL3NuPPwt2DwDa7i/ATLwFfGmNSgQE0089FRDoADwBpxph+WHfrBeWdeAEV6Hg2r0yzYIw5YIxZ63p9DOsfa80pGZoVEUkGrgJm2V2L3UQkHhiFdUsxxphyY0yhvVXZKhSIdA18jAL221yPTwRaoHsyr0yz45queCCwyt5KbDcdeAyotrsQP9AFyAf+7uqCmiUi0XYXZQdjTC7wf8Be4ABQZIz5j71V+UagBbqqQURigA+Bh4wxxXbXYxcRuRrIM8assbsWPxEKXAj8xRgzEDgBNMtrTiLSEus3+S5AeyBaRG6xtyrfCLRA92RemWZDRJxYYf6uMeYju+ux2XBgoojsxuqKGysi79hbkq1ygBxjzKnf2j7ACvjm6FIg2xiTb4ypAD4Chtlck08EWqB7Mq9Ms+Cab/51YJsx5kW767GbMeaXxphkY0xnrP8vFhpjgrIV5gljzEFgn4j0cq0aB2y1sSQ77QWGikiU69/NOIL0AnGTTp97vuqaV8bmsuwyHLgV2CQi613rfuWapkEpgPuBd12NnyzgdpvrsYUxZpWIfACsxbo7bB1BOgWADv1XSqkgEWhdLkoppeqgga6UUkFCA10ppYKEBrpSSgUJDXSllAoSGuhKKRUkNNCVUipI/D8sPu6lM8OD1gAAAABJRU5ErkJggg==\n",
584 | "text/plain": [
585 | ""
586 | ]
587 | },
588 | "metadata": {
589 | "needs_background": "light",
590 | "tags": []
591 | },
592 | "output_type": "display_data"
593 | }
594 | ],
595 | "source": [
596 | "pd.DataFrame(h2.history)[['loss','val_loss']].plot(title=\"Without mixup\")"
597 | ]
598 | },
599 | {
600 | "cell_type": "markdown",
601 | "metadata": {
602 | "colab_type": "text",
603 | "id": "M2HDERJbGr2a"
604 | },
605 | "source": [
606 | "# Comparison\n",
    "See that the loss curve with mixup does not overfit."
608 | ]
609 | },
610 | {
611 | "cell_type": "markdown",
612 | "metadata": {
613 | "colab": {},
614 | "colab_type": "code",
615 | "id": "hqteWafKRdF1"
616 | },
617 | "source": [
618 | "## Cite the paper\n",
619 | "```\n",
620 | "@article{marivate2019improving,\n",
621 | " title={Improving short text classification through global augmentation methods},\n",
622 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n",
623 | " journal={arXiv preprint arXiv:1907.03752},\n",
624 | " year={2019}\n",
625 | "}```\n",
626 | "\n",
627 | "https://arxiv.org/abs/1907.03752"
628 | ]
629 | },
630 | {
631 | "cell_type": "code",
632 | "execution_count": null,
633 | "metadata": {},
634 | "outputs": [],
635 | "source": []
636 | }
637 | ],
638 | "metadata": {
639 | "accelerator": "GPU",
640 | "colab": {
641 | "collapsed_sections": [],
642 | "name": "mixup_example_using_IMDB_sentiment.ipynb",
643 | "provenance": []
644 | },
645 | "kernelspec": {
646 | "display_name": "Python 3",
647 | "language": "python",
648 | "name": "python3"
649 | },
650 | "language_info": {
651 | "codemirror_mode": {
652 | "name": "ipython",
653 | "version": 3
654 | },
655 | "file_extension": ".py",
656 | "mimetype": "text/x-python",
657 | "name": "python",
658 | "nbconvert_exporter": "python",
659 | "pygments_lexer": "ipython3",
660 | "version": "3.7.7"
661 | }
662 | },
663 | "nbformat": 4,
664 | "nbformat_minor": 4
665 | }
666 |
--------------------------------------------------------------------------------
/examples/word2vec_example.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "markdown",
5 | "metadata": {
6 | "colab_type": "text",
7 | "id": "JHDJLKDuJkcB"
8 | },
9 | "source": [
10 | "# Example for using word2vec"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 1,
16 | "metadata": {
17 | "colab": {},
18 | "colab_type": "code",
19 | "id": "9m8ChZsdAx41"
20 | },
21 | "outputs": [],
22 | "source": [
23 | "# Import libraries\n",
24 | "try:\n",
25 | " import textaugment, gensim\n",
26 | "except ModuleNotFoundError:\n",
27 | " !pip -q install textaugment gensim\n",
28 | " import textaugment, gensim"
29 | ]
30 | },
31 | {
32 | "cell_type": "code",
33 | "execution_count": 4,
34 | "metadata": {
35 | "colab": {
36 | "base_uri": "https://localhost:8080/",
37 | "height": 153
38 | },
39 | "colab_type": "code",
40 | "id": "ux6Bc4QSrYA8",
41 | "outputId": "9f2b8af1-3b22-455c-dd85-d1ac173a5317"
42 | },
43 | "outputs": [
44 | {
45 | "name": "stdout",
46 | "output_type": "stream",
47 | "text": [
48 | "[nltk_data] Downloading package wordnet to /root/nltk_data...\n",
49 | "[nltk_data] Unzipping corpora/wordnet.zip.\n",
50 | "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
51 | "[nltk_data] Unzipping tokenizers/punkt.zip.\n",
52 | "[nltk_data] Downloading package averaged_perceptron_tagger to\n",
53 | "[nltk_data] /root/nltk_data...\n",
54 | "[nltk_data] Unzipping taggers/averaged_perceptron_tagger.zip.\n"
55 | ]
56 | },
57 | {
58 | "data": {
59 | "text/plain": [
60 | "True"
61 | ]
62 | },
63 | "execution_count": 4,
64 | "metadata": {
65 | "tags": []
66 | },
67 | "output_type": "execute_result"
68 | }
69 | ],
70 | "source": [
    "# Import NLTK and download data\n",
72 | "import nltk\n",
73 | "nltk.download(['wordnet','punkt','averaged_perceptron_tagger'])"
74 | ]
75 | },
76 | {
77 | "cell_type": "markdown",
78 | "metadata": {
79 | "colab_type": "text",
80 | "id": "8AUt-F5MtiuI"
81 | },
82 | "source": [
83 | "## Load Google Word2vec embeddings"
84 | ]
85 | },
86 | {
87 | "cell_type": "code",
88 | "execution_count": 5,
89 | "metadata": {
90 | "colab": {
91 | "base_uri": "https://localhost:8080/",
92 | "height": 204
93 | },
94 | "colab_type": "code",
95 | "id": "1xq4dJtSr4RM",
96 | "outputId": "1ff32743-04a9-4b8a-eda3-8dcf55e711ca"
97 | },
98 | "outputs": [
99 | {
100 | "name": "stdout",
101 | "output_type": "stream",
102 | "text": [
103 | "--2020-05-23 18:06:47-- https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\n",
104 | "Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.178.197\n",
105 | "Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.178.197|:443... connected.\n",
106 | "HTTP request sent, awaiting response... 200 OK\n",
107 | "Length: 1647046227 (1.5G) [application/x-gzip]\n",
108 | "Saving to: ‘GoogleNews-vectors-negative300.bin.gz’\n",
109 | "\n",
110 | "GoogleNews-vectors- 100%[===================>] 1.53G 36.2MB/s in 44s \n",
111 | "\n",
112 | "2020-05-23 18:07:31 (35.7 MB/s) - ‘GoogleNews-vectors-negative300.bin.gz’ saved [1647046227/1647046227]\n",
113 | "\n"
114 | ]
115 | }
116 | ],
117 | "source": [
118 | "# Download Google Word2vec embeddings\n",
119 | "!wget \"https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz\""
120 | ]
121 | },
122 | {
123 | "cell_type": "code",
124 | "execution_count": 8,
125 | "metadata": {
126 | "colab": {
127 | "base_uri": "https://localhost:8080/",
128 | "height": 71
129 | },
130 | "colab_type": "code",
131 | "id": "q2wxTNhwrjK-",
132 | "outputId": "e30ff6b7-96a3-4d59-c486-65def436cbd8"
133 | },
134 | "outputs": [
135 | {
136 | "name": "stderr",
137 | "output_type": "stream",
138 | "text": [
139 | "/usr/local/lib/python3.6/dist-packages/smart_open/smart_open_lib.py:253: UserWarning: This function is deprecated, use smart_open.open instead. See the migration notes for details: https://github.com/RaRe-Technologies/smart_open/blob/master/README.rst#migrating-to-the-new-open-function\n",
140 | " 'See the migration notes for details: %s' % _MIGRATION_NOTES_URL\n"
141 | ]
142 | }
143 | ],
144 | "source": [
145 | "model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin.gz', binary=True)"
146 | ]
147 | },
148 | {
149 | "cell_type": "code",
150 | "execution_count": 13,
151 | "metadata": {
152 | "colab": {
153 | "base_uri": "https://localhost:8080/",
154 | "height": 71
155 | },
156 | "colab_type": "code",
157 | "id": "3uHnRL77uATl",
158 | "outputId": "de09c7ff-47bc-4e21-d2eb-89fcadb4d2bd"
159 | },
160 | "outputs": [
161 | {
162 | "name": "stderr",
163 | "output_type": "stream",
164 | "text": [
165 | "/usr/local/lib/python3.6/dist-packages/gensim/matutils.py:737: FutureWarning: Conversion of the second argument of issubdtype from `int` to `np.signedinteger` is deprecated. In future, it will be treated as `np.int64 == np.dtype(int).type`.\n",
166 | " if np.issubdtype(vec.dtype, np.int):\n"
167 | ]
168 | }
169 | ],
170 | "source": [
171 | "from textaugment import Word2vec\n",
172 | "t = Word2vec(model=model)\n",
173 | "output = t.augment('The stories are good', top_n=10)"
174 | ]
175 | },
176 | {
177 | "cell_type": "code",
178 | "execution_count": 14,
179 | "metadata": {
180 | "colab": {
181 | "base_uri": "https://localhost:8080/",
182 | "height": 34
183 | },
184 | "colab_type": "code",
185 | "id": "BhVYt8V3uAwk",
186 | "outputId": "7c36d302-db66-4837-ff6b-ea1793a088d9"
187 | },
188 | "outputs": [
189 | {
190 | "name": "stdout",
191 | "output_type": "stream",
192 | "text": [
193 | "the stories are excellent\n"
194 | ]
195 | }
196 | ],
197 | "source": [
198 | "print(output)"
199 | ]
200 | },
201 | {
202 | "cell_type": "markdown",
203 | "metadata": {
204 | "colab": {},
205 | "colab_type": "code",
206 | "id": "IWoNJrZfy94n"
207 | },
208 | "source": [
209 | "## Cite the paper\n",
210 | "```\n",
211 | "@article{marivate2019improving,\n",
212 | " title={Improving short text classification through global augmentation methods},\n",
213 | " author={Marivate, Vukosi and Sefara, Tshephisho},\n",
214 | " journal={arXiv preprint arXiv:1907.03752},\n",
215 | " year={2019}\n",
216 | "}```\n",
217 | "\n",
218 | "https://arxiv.org/abs/1907.03752"
219 | ]
220 | },
221 | {
222 | "cell_type": "code",
223 | "execution_count": null,
224 | "metadata": {},
225 | "outputs": [],
226 | "source": []
227 | }
228 | ],
229 | "metadata": {
230 | "accelerator": "GPU",
231 | "colab": {
232 | "collapsed_sections": [],
233 | "name": "word2vec example.ipynb",
234 | "provenance": []
235 | },
236 | "kernelspec": {
237 | "display_name": "Python 3",
238 | "language": "python",
239 | "name": "python3"
240 | },
241 | "language_info": {
242 | "codemirror_mode": {
243 | "name": "ipython",
244 | "version": 3
245 | },
246 | "file_extension": ".py",
247 | "mimetype": "text/x-python",
248 | "name": "python",
249 | "nbconvert_exporter": "python",
250 | "pygments_lexer": "ipython3",
251 | "version": "3.7.7"
252 | }
253 | },
254 | "nbformat": 4,
255 | "nbformat_minor": 4
256 | }
257 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | gensim>=4.0
2 | googletrans>=2
3 | nltk
4 | numpy
5 | textblob
6 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | import setuptools
4 | import re
5 |
6 |
def find_version(fname):
    """Return the ``__version__`` string declared in the file named *fname*.

    The file is scanned line by line for an assignment of the form
    ``__version__ = '<version>'`` (single or double quotes accepted).

    Raises:
        RuntimeError: If no version assignment is found.
    """
    reg = re.compile(r'__version__ = [\'"]([^\'"]*)[\'"]')
    with open(fname, 'r') as fp:
        for line in fp:
            m = reg.match(line)
            if m:
                return m.group(1)
    # Include the file name so the failure is actionable.
    raise RuntimeError('Cannot find version information in {!r}'.format(fname))
22 |
23 |
24 | __version__ = find_version('textaugment/__init__.py')
25 |
26 |
def read(fname):
    """Return the entire text content of the file *fname*."""
    with open(fname, "r") as fh:
        return fh.read()
31 |
32 |
# Register the package with setuptools; the long description is pulled
# straight from README.md so PyPI mirrors the repository front page.
setuptools.setup(
    name='textaugment',
    version=__version__,  # parsed above from textaugment/__init__.py
    packages=setuptools.find_packages(exclude=('test*', )),  # ship everything except the test suite
    author='Joseph Sefara',
    author_email='sefaratj@gmail.com',
    license='MIT',
    keywords=['text augmentation', 'python', 'natural language processing', 'nlp'],
    url='https://github.com/dsfsi/textaugment',
    description='A library for augmenting text for natural language processing applications.',
    long_description=read("README.md"),
    long_description_content_type="text/markdown",
    # NOTE: keep this list in sync with requirements.txt.
    install_requires=['nltk', 'gensim>=4.0', 'textblob', 'numpy', 'googletrans>=2'],
    classifiers=[
        "Intended Audience :: Developers",
        "Natural Language :: English",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: Implementation :: PyPy",
        "Topic :: Text Processing :: Linguistic",
    ]
)
56 |
--------------------------------------------------------------------------------
/tests/test_translate.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import sys
3 | from textaugment.translate import Translate
4 | from textaugment import translate
5 |
6 |
class InputTestCase(unittest.TestCase):
    """Validate Translate's constructor and augment() input checking."""

    def setUp(self):
        # A correctly configured translator used for augment() input checks.
        self.t = Translate(src="en", to="es")

    def test_geometric(self):
        # Constructing without required parameters must fail.
        with self.assertRaises(ValueError, msg="Parameters must be set"):
            Translate()

        # Invalid language codes/types must be rejected.
        with self.assertRaises(KeyError, msg="Value of parameters must be correct"):
            Translate(to=7, src="hello")  # Test parameter, type

        # augment() accepts strings only.
        with self.assertRaises(TypeError, msg="Only strings are allowed"):
            self.t.augment(45)

    def test_translate(self):
        # The module-level LANGUAGES mapping must be present and non-empty.
        self.assertTrue(translate.LANGUAGES, msg="LANGUAGES mapping exists")
21 |
22 | def test_translate(self):
23 | self.assertTrue(translate.LANGUAGES, msg="Files exists")
24 |
25 |
class OutputTestCase(unittest.TestCase):
    """Check that round-trip translation of simple text is lossless."""

    def setUp(self):
        self.translator = Translate(src="en", to="es")
        self.sentence = "He walks"

    def test_augment(self):
        # Back-translating a short, unambiguous sentence should return it verbatim.
        self.assertEqual(self.translator.augment(self.sentence), self.sentence)
        # Purely numeric strings should pass through unchanged.
        self.assertEqual(self.translator.augment("4"), "4")
36 |
37 |
class PlatformTestCase(unittest.TestCase):
    """Guard against running the suite under Python 2."""

    def test_platform(self):
        major = sys.version_info[0]
        self.assertEqual(major, 3, msg="Must be using Python 3")
42 |
43 |
if __name__ == '__main__':
    # Allow running this test module directly: `python tests/test_translate.py`.
    unittest.main()
46 |
47 |
--------------------------------------------------------------------------------
/tests/test_word2vec.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import sys
3 | from textaugment.word2vec import Word2vec
4 |
5 |
class InputTestCase(unittest.TestCase):
    """Validate Word2vec's constructor and augment() input checking."""

    def setUp(self):
        import os  # local import so this fix is self-contained
        # The default model path is developer-specific; allow overriding it
        # via an environment variable so the suite can run on other machines.
        self.path = os.environ.get(
            "TEXTAUGMENT_W2V_MODEL", "/home/tjs/dev/papu/models/gensim_cbow_sepedi"
        )
        self.wrongpath = self.path + "-wrong"
        self.w = Word2vec(model=self.path)

    def test_augment(self):
        with self.assertRaises(TypeError, msg="Value for p should be float"):
            Word2vec(model=self.path, p="foo")

        with self.assertRaises(TypeError, msg="Value for runs should be integer"):
            Word2vec(model=self.path, runs="foo")

        with self.assertRaises(FileNotFoundError, msg="The model is not found"):
            Word2vec(model=self.wrongpath)

        with self.assertRaises(TypeError, msg="Input should not be lists"):
            self.w.augment(["hello"])

        with self.assertRaises(TypeError, msg="Input should not be numbers"):
            self.w.augment(45)
28 |
29 |
class OutputTestCase(unittest.TestCase):
    """Check the output type of Word2vec.augment."""

    def setUp(self):
        self.path = "/home/tjs/dev/papu/models/gensim_cbow_sepedi"
        self.w = Word2vec(model=self.path)
        self.data = "We are testing"

    def test_augment(self):
        """augment() returns a string and passes numeric tokens through."""
        augmented = self.w.augment(self.data)
        self.assertIsInstance(augmented, str, msg="Input must be a string")

        self.assertEqual(self.w.augment("4"), "4", msg="Input should not be numbers")
40 |
41 |
class PlatformTestCase(unittest.TestCase):
    """Verify the interpreter major version."""

    def test_platform(self):
        # Word2vec augmentation is Python 3 only.
        major, *_rest = sys.version_info
        self.assertEqual(major, 3, msg="Must be using Python 3")
46 |
47 |
# Allow running this test module directly (``python test_word2vec.py``).
if __name__ == '__main__':
    unittest.main()
50 |
51 |
--------------------------------------------------------------------------------
/tests/test_wordnet.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import sys
3 | import numpy as np
4 | from textaugment.wordnet import Wordnet
5 |
6 |
class InputTestCase(unittest.TestCase):
    """Exercise Wordnet argument validation."""

    def setUp(self):
        self.data = ["I", "am", "testing"]
        self.p = 0.8
        self.w = Wordnet(p=self.p)

    def test_geometric(self):
        """geometric() accepts exactly one keyword argument: data."""
        for bad_kwargs in (dict(p=self.p, data=self.data), dict()):
            with self.assertRaises(TypeError, msg="Receives one parameter"):
                self.w.geometric(**bad_kwargs)

        with self.assertRaises(IndexError, msg="Data must be set using; data='data string'"):
            self.w.geometric(data=0)

    def test_augment(self):
        """augment() only accepts strings."""
        with self.assertRaises(TypeError, msg="Expect string not list"):
            self.w.augment(self.data)

        with self.assertRaises(TypeError, msg="Expect string not integer"):
            self.w.augment(data=0)
29 |
30 |
class OutputTestCase(unittest.TestCase):
    """Check the types returned by Wordnet.augment/geometric."""

    def setUp(self):
        self.p = 0.8
        self.data = ["I", "am", "testing"]
        self.data2 = "известен още с псевдонимите"
        self.w = Wordnet(p=self.p)

    def test_augment(self):
        """augment() returns a plain string."""
        sentence = " ".join(self.data)
        self.assertIsInstance(self.w.augment(sentence), str)

    def test_geometric(self):
        """geometric() returns a numpy array of selected items."""
        selected = self.w.geometric(data=self.data)
        self.assertIsInstance(selected, np.ndarray)
44 |
45 |
class PlatformTestCase(unittest.TestCase):
    """Confirm the suite runs on Python 3."""

    def test_platform(self):
        # Wordnet augmentation relies on Python 3 semantics.
        version_major = sys.version_info[0]
        self.assertEqual(version_major, 3, msg="Must be using Python 3")
50 |
51 |
# Allow running this test module directly (``python test_wordnet.py``).
if __name__ == '__main__':
    unittest.main()
54 |
--------------------------------------------------------------------------------
/textaugment/__init__.py:
--------------------------------------------------------------------------------
import os
from .translate import Translate
from .word2vec import Word2vec
from .word2vec import Fasttext
from .wordnet import Wordnet
from .eda import EDA
from .aeda import AEDA
from .mixup import MIXUP
from .constants import LANGUAGES

# Package name (kept for backwards compatibility with older imports).
name = "textaugment"

__version__ = '2.0.0'
__licence__ = 'MIT'
__author__ = 'Joseph Sefara'
__url__ = 'https://github.com/dsfsi/textaugment/'

# Absolute directory of the installed package; useful for locating resources.
PACKAGE_DIR = os.path.dirname(os.path.abspath(__file__))

# Public API. Fix: 'Fasttext' is imported above but was missing from
# __all__, so ``from textaugment import *`` silently dropped it.
__all__ = [
    'Translate',
    'Word2vec',
    'Fasttext',
    'Wordnet',
    'EDA',
    'AEDA',
    'MIXUP',
    'LANGUAGES'
]
29 |
--------------------------------------------------------------------------------
/textaugment/aeda.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # TextAugment: AEDA
3 | #
4 | # Copyright (C) 2023
5 | # Author: Juhwan Choi
6 | #
7 | # URL:
8 | # For license information, see LICENSE
9 | #
10 | """
11 | This module is an implementation of the original AEDA algorithm (2021) [1].
12 | """
13 | import random
14 |
15 |
class AEDA:
    """
    This class is an implementation of the original AEDA algorithm (2021) [1].

    [1] Karimi et al., 2021, November. AEDA: An Easier Data Augmentation Technique for Text Classification.
    In Findings of the Association for Computational Linguistics: EMNLP 2021 (pp. 2748-2754).
    https://aclanthology.org/2021.findings-emnlp.234.pdf

    Example usage: ::
        >>> from textaugment import AEDA
        >>> t = AEDA()
        >>> t.punct_insertion("John is going to town")
        ! John is going to town
    """

    # Punctuation marks from the AEDA paper, used when none are supplied.
    DEFAULT_PUNCTUATIONS = ['.', ';', '?', ':', '!', ',']

    @staticmethod
    def validate(**kwargs):
        """Validate input data.

        :raises TypeError: if 'sentence' is not a non-empty string.
        """
        if 'sentence' in kwargs:
            # Check the type first: calling .strip() on a non-string would
            # otherwise raise AttributeError instead of the intended TypeError.
            if not isinstance(kwargs['sentence'], str) or len(kwargs['sentence'].strip()) == 0:
                raise TypeError("sentence must be a valid sentence")

    def __init__(self, punctuations=None, random_state=1):
        """A method to initialize parameters

        :type punctuations: list
        :param punctuations: (optional) Punctuations to be inserted
        :type random_state: int
        :param random_state: (optional) Seed

        :rtype: None
        :return: Constructor does not return.

        :raises TypeError: if random_state is not an int.
        """
        # Avoid the shared-mutable-default pitfall: build a fresh list per
        # instance instead of using a list literal as the default argument.
        self.punctuations = list(self.DEFAULT_PUNCTUATIONS) if punctuations is None else punctuations
        self.random_state = random_state
        if isinstance(self.random_state, int):
            random.seed(self.random_state)
        else:
            raise TypeError("random_state must have type int")

    def punct_insertion(self, sentence: str):
        """Insert random punctuations to the sentence

        :type sentence: str
        :param sentence: Sentence

        :rtype: str
        :return: Augmented sentence
        """
        self.validate(sentence=sentence)

        words = sentence.strip().split(' ')
        len_sentence = len(words)
        # Number of punctuation marks to insert: between 1 and 1/3 of the
        # sentence length. max(1, ...) keeps randint's bounds valid for
        # sentences shorter than 3 words (len_sentence // 3 == 0 used to
        # make randint(1, 0) raise ValueError).
        num_punctuations = random.randint(1, max(1, len_sentence // 3))
        augmented_sentence = words.copy()

        # Insert random punctuations in random positions
        for _ in range(num_punctuations):
            punct = random.choice(self.punctuations)  # Punctuation mark to insert
            pos = random.randint(0, len(augmented_sentence) - 1)  # Insertion position
            augmented_sentence = augmented_sentence[:pos] + [punct] + augmented_sentence[pos:]

        return ' '.join(augmented_sentence)
82 |
--------------------------------------------------------------------------------
/textaugment/constants.py:
--------------------------------------------------------------------------------
# Language codes supported for translation, mapped to the lowercase English
# language name. Mirrors the code table used by the translation backends
# (note the legacy codes: 'iw' vs 'he' for Hebrew, 'tl' vs 'fil' for Filipino).
LANGUAGES = {
    'af': 'afrikaans',
    'sq': 'albanian',
    'am': 'amharic',
    'ar': 'arabic',
    'hy': 'armenian',
    'az': 'azerbaijani',
    'eu': 'basque',
    'be': 'belarusian',
    'bn': 'bengali',
    'bs': 'bosnian',
    'bg': 'bulgarian',
    'ca': 'catalan',
    'ceb': 'cebuano',
    'ny': 'chichewa',
    'zh-cn': 'chinese (simplified)',
    'zh-tw': 'chinese (traditional)',
    'co': 'corsican',
    'hr': 'croatian',
    'cs': 'czech',
    'da': 'danish',
    'nl': 'dutch',
    'en': 'english',
    'eo': 'esperanto',
    'et': 'estonian',
    'tl': 'filipino',
    'fi': 'finnish',
    'fr': 'french',
    'fy': 'frisian',
    'gl': 'galician',
    'ka': 'georgian',
    'de': 'german',
    'el': 'greek',
    'gu': 'gujarati',
    'ht': 'haitian creole',
    'ha': 'hausa',
    'haw': 'hawaiian',
    'iw': 'hebrew',
    'hi': 'hindi',
    'hmn': 'hmong',
    'hu': 'hungarian',
    'is': 'icelandic',
    'ig': 'igbo',
    'id': 'indonesian',
    'ga': 'irish',
    'it': 'italian',
    'ja': 'japanese',
    'jw': 'javanese',
    'kn': 'kannada',
    'kk': 'kazakh',
    'km': 'khmer',
    'ko': 'korean',
    'ku': 'kurdish (kurmanji)',
    'ky': 'kyrgyz',
    'lo': 'lao',
    'la': 'latin',
    'lv': 'latvian',
    'lt': 'lithuanian',
    'lb': 'luxembourgish',
    'mk': 'macedonian',
    'mg': 'malagasy',
    'ms': 'malay',
    'ml': 'malayalam',
    'mt': 'maltese',
    'mi': 'maori',
    'mr': 'marathi',
    'mn': 'mongolian',
    'my': 'myanmar (burmese)',
    'ne': 'nepali',
    'no': 'norwegian',
    'ps': 'pashto',
    'fa': 'persian',
    'pl': 'polish',
    'pt': 'portuguese',
    'pa': 'punjabi',
    'ro': 'romanian',
    'ru': 'russian',
    'sm': 'samoan',
    'gd': 'scots gaelic',
    'sr': 'serbian',
    'st': 'sesotho',
    'sn': 'shona',
    'sd': 'sindhi',
    'si': 'sinhala',
    'sk': 'slovak',
    'sl': 'slovenian',
    'so': 'somali',
    'es': 'spanish',
    'su': 'sundanese',
    'sw': 'swahili',
    'sv': 'swedish',
    'tg': 'tajik',
    'ta': 'tamil',
    'te': 'telugu',
    'th': 'thai',
    'tr': 'turkish',
    'uk': 'ukrainian',
    'ur': 'urdu',
    'uz': 'uzbek',
    'vi': 'vietnamese',
    'cy': 'welsh',
    'xh': 'xhosa',
    'yi': 'yiddish',
    'yo': 'yoruba',
    'zu': 'zulu',
    # NOTE(review): the two entries below duplicate 'tl' (filipino) and
    # 'iw' (hebrew) with capitalized names. Kept as-is for backward
    # compatibility with callers that pass 'fil' or 'he'.
    'fil': 'Filipino',
    'he': 'Hebrew'
}

# Reverse lookup: language name -> code. Names are case-sensitive, so both
# 'filipino' -> 'tl' and 'Filipino' -> 'fil' exist.
LANGCODES = dict(map(reversed, LANGUAGES.items()))
111 |
--------------------------------------------------------------------------------
/textaugment/eda.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # TextAugment: EDA
3 | #
4 | # Copyright (C) 2018-2023
5 | # Author: Joseph Sefara
6 | #
7 | # URL:
8 | # For license information, see LICENSE
9 | #
10 | """
11 | This module is an implementation of the original EDA algorithm (2019) [1].
12 | """
13 | import nltk
14 | from nltk.corpus import wordnet, stopwords
15 | import random
16 |
17 |
class EDA:
    """
    This class is an implementation of the original EDA algorithm (2019) [1].

    [1] Wei, J. and Zou, K., 2019, November. EDA: Easy Data Augmentation Techniques for Boosting Performance on
    Text Classification Tasks. In Proceedings of the 2019 Conference on Empirical Methods in Natural Language Processing
    and the 9th International Joint Conference on Natural Language Processing (EMNLP-IJCNLP) (pp. 6383-6389).
    https://www.aclweb.org/anthology/D19-1670.pdf

    Example usage: ::
        >>> from textaugment import EDA
        >>> t = EDA()
        >>> t.synonym_replacement("John is going to town",top_n=3)
        John is give out to town
        >>> t.random_deletion("John is going to town", p=0.2)
        is going to town
        >>> t.random_swap("John is going to town")
        John town going to is
        >>> t.random_insertion("John is going to town")
        John is going to make up town
    """

    @staticmethod
    def _get_synonyms(word):
        """Return a shuffled list of WordNet synonyms of ``word``.

        Lemma names are lower-cased, "_"/"-" become spaces and characters
        outside [a-z ] are dropped; the word itself is excluded.
        """
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                synonym = lemma.name().replace("_", " ").replace("-", " ").lower()
                synonym = "".join([char for char in synonym if char in ' qwertyuiopasdfghjklzxcvbnm'])
                synonyms.add(synonym)
        if word in synonyms:
            synonyms.remove(word)
        # Sort before shuffling so results are reproducible under a fixed seed.
        synonyms = sorted(list(synonyms))
        random.shuffle(synonyms)
        return synonyms

    @staticmethod
    def swap_word(new_words):
        """Swap two distinct random positions in ``new_words`` (in place).

        Gives up and returns the list unchanged after a few index collisions,
        e.g. a one-word sentence can never yield a second distinct index.
        """
        random_idx_1 = random.randint(0, len(new_words) - 1)
        random_idx_2 = random_idx_1
        counter = 0
        while random_idx_2 == random_idx_1:
            random_idx_2 = random.randint(0, len(new_words) - 1)
            counter += 1
            if counter > 3:
                return new_words
        new_words[random_idx_1], new_words[random_idx_2] = new_words[random_idx_2], new_words[random_idx_1]
        return new_words

    @staticmethod
    def validate(**kwargs):
        """Validate input data.

        :raises TypeError: if p is outside [0, 1], sentence is not a
            non-empty string, or n is not an integer.
        """
        if 'p' in kwargs:
            if kwargs['p'] > 1 or kwargs['p'] < 0:
                raise TypeError("p must be a fraction between 0 and 1")
        if 'sentence' in kwargs:
            # Check the type before calling .strip(): a non-string used to
            # surface as AttributeError instead of the intended TypeError.
            if not isinstance(kwargs['sentence'], str) or len(kwargs['sentence'].strip()) == 0:
                raise TypeError("sentence must be a valid sentence")
        if 'n' in kwargs:
            if not isinstance(kwargs['n'], int):
                raise TypeError("n must be a valid integer")

    def __init__(self, stop_words=None, random_state=1):
        """A method to initialize parameters

        :type random_state: int
        :param random_state: (optional) Seed
        :type stop_words: list
        :param stop_words: (optional) List of stopwords

        :rtype: None
        :return: Constructor does not return.

        :raises TypeError: if random_state is not an int.
        """
        # Default to NLTK's English stopword list (requires the 'stopwords'
        # corpus to be downloaded) unless the caller supplies their own.
        self.stopwords = stopwords.words('english') if stop_words is None else stop_words
        self.sentence = None
        self.p = None
        self.n = None
        self.random_state = random_state
        if isinstance(self.random_state, int):
            random.seed(self.random_state)
        else:
            raise TypeError("random_state must have type int")

    def add_word(self, new_words):
        """Insert a synonym of a random non-stopword token into ``new_words``."""
        synonyms = list()
        counter = 0
        while len(synonyms) < 1:
            random_word_list = list([word for word in new_words if word not in self.stopwords])
            if not random_word_list:
                # Every token is a stopword: randint(0, -1) would crash below,
                # and there is nothing sensible to insert anyway.
                return new_words
            random_word = random_word_list[random.randint(0, len(random_word_list) - 1)]
            synonyms = self._get_synonyms(random_word)
            counter += 1
            if counter >= 10:
                return new_words  # See Issue 14 for details
        random_synonym = synonyms[0]  # _get_synonyms shuffles, so this is a random pick
        random_idx = random.randint(0, len(new_words) - 1)
        new_words.insert(random_idx, random_synonym)
        return new_words

    def synonym_replacement(self, sentence: str, n: int = 1, top_n: int = None):
        """Replace n words in the sentence with synonyms from wordnet

        :type sentence: str
        :param sentence: Sentence
        :type n: int
        :param n: Number of repetitions to replace
        :type top_n: int
        :param top_n: top_n of synonyms to randomly choose from

        :rtype: str
        :return: Augmented sentence
        """
        self.validate(sentence=sentence, n=n)
        self.n = n
        self.sentence = sentence
        words = sentence.split()
        new_words = words.copy()
        # Deduplicate and sort candidates, then shuffle for a seeded order.
        random_word_list = sorted(set([word for word in words if word not in self.stopwords]))
        random.shuffle(random_word_list)
        replaced = 0
        for random_word in random_word_list:
            synonyms = self._get_synonyms(random_word)
            if len(synonyms) > 0:
                synonyms = synonyms[:top_n if top_n else len(synonyms)]  # use top n or all synonyms
                synonym = random.choice(synonyms)
                # Replace every occurrence of the chosen word.
                new_words = [synonym if word == random_word else word for word in new_words]
                replaced += 1
            if replaced >= self.n:
                break
        sentence = ' '.join(new_words)

        return sentence

    def random_deletion(self, sentence: str, p: float = 0.1):
        """Randomly delete words from the sentence with probability p

        :type sentence: str
        :param sentence: Sentence
        :type p: float
        :param p: Probability between 0 and 1

        :rtype: str
        :return: Augmented sentence
        """
        self.validate(sentence=sentence, p=p)
        self.p = p
        self.sentence = sentence
        words = sentence.split()
        if len(words) == 1:
            return words[0]
        new_words = list()
        for word in words:
            r = random.uniform(0, 1)
            if r > self.p:
                new_words.append(word)
        # if all words are deleted, just return a random word
        if len(new_words) == 0:
            return random.choice(words)

        return " ".join(new_words)

    def random_swap(self, sentence: str, n: int = 1):
        """Randomly swap two words in the sentence n times

        :type sentence: str
        :param sentence: Sentence
        :type n: int
        :param n: Number of repetitions to swap

        :rtype: str
        :return: Augmented sentence
        """
        self.validate(sentence=sentence, n=n)
        self.n = n
        self.sentence = sentence
        words = sentence.split()
        new_words = words.copy()
        for _ in range(self.n):
            new_words = self.swap_word(new_words)
        return " ".join(new_words)

    def random_insertion(self, sentence: str, n: int = 1):
        """Randomly insert n words into the sentence

        :type sentence: str
        :param sentence: Sentence
        :type n: int
        :param n: Number of words to insert

        :rtype: str
        :return: Augmented sentence
        """
        self.validate(sentence=sentence, n=n)
        self.n = n
        self.sentence = sentence
        words = sentence.split()
        new_words = words.copy()
        for _ in range(self.n):
            new_words = self.add_word(new_words)
        return " ".join(new_words)
278 |
--------------------------------------------------------------------------------
/textaugment/mixup.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # TextAugment: mixup
3 | #
4 | # Copyright (C) 2018-2023
5 | # Authors: Joseph Sefara, Vukosi Marivate
6 | #
7 | # URL:
8 | # For license information, see LICENSE
9 | import numpy as np
10 | import random
11 |
12 |
class MIXUP:
    """
    This class implements the mixup algorithm [1] for natural language processing.

    [1] Zhang, Hongyi, Moustapha Cisse, Yann N. Dauphin, and David Lopez-Paz. "mixup: Beyond empirical risk
    minimization." in International Conference on Learning Representations (2018).
    https://openreview.net/forum?id=r1Ddp1-Rb
    """

    @staticmethod
    def validate(**kwargs):
        """Validate input data types.

        NOTE: assignments into ``kwargs`` here are local to this call; list
        inputs are converted to arrays by ``flow``, not by this method.

        :raises TypeError: on the first argument of an unexpected type.
        """
        if 'data' in kwargs:
            if isinstance(kwargs['data'], list):
                kwargs['data'] = np.array(kwargs['data'])
            if not isinstance(kwargs['data'], np.ndarray):
                raise TypeError("data must be numpy array. Found " + str(type(kwargs['data'])))
        if 'labels' in kwargs:
            if isinstance(kwargs['labels'], (list, type(None))):
                kwargs['labels'] = np.array(kwargs['labels'])
            if not isinstance(kwargs['labels'], np.ndarray):
                raise TypeError("labels must be numpy array. Found " + str(type(kwargs['labels'])))
        if 'batch_size' in kwargs:
            if not isinstance(kwargs['batch_size'], int):
                raise TypeError("batch_size must be a valid integer. Found " + str(type(kwargs['batch_size'])))
        if 'shuffle' in kwargs:
            if not isinstance(kwargs['shuffle'], bool):
                raise TypeError("shuffle must be a boolean. Found " + str(type(kwargs['shuffle'])))
        if 'runs' in kwargs:
            if not isinstance(kwargs['runs'], int):
                raise TypeError("runs must be a valid integer. Found " + str(type(kwargs['runs'])))

    def __init__(self, random_state=1, runs=1):
        """Seed both ``random`` and ``numpy.random`` for reproducibility.

        :type random_state: int
        :param random_state: (optional) Seed
        :type runs: int
        :param runs: (optional) Number of mixup passes over each batch

        :raises TypeError: if random_state is not an int.
        """
        self.random_state = random_state
        self.runs = runs
        if isinstance(self.random_state, int):
            random.seed(self.random_state)
            np.random.seed(self.random_state)
        else:
            raise TypeError("random_state must have type int")

    def mixup_data(self, x, y=None, alpha=0.2):
        """This method performs mixup. If runs = 1 it just does 1 mixup with whole batch, any n of runs
        creates many mixup matches.

        :type x: Numpy array
        :param x: Data array
        :type y: Numpy array
        :param y: (optional) labels
        :type alpha: float
        :param alpha: alpha

        :rtype: array or tuple
        :return: Mixed inputs (and mixed targets when y is given), one batch
            per run concatenated along axis 0.
        """
        if self.runs is None:
            self.runs = 1
        output_x = []
        output_y = []
        batch_size = x.shape[0]
        for i in range(self.runs):
            # Per-sample mixing coefficients drawn from Beta(alpha, alpha).
            lam_vector = np.random.beta(alpha, alpha, batch_size)
            index = np.random.permutation(batch_size)
            mixed_x = (x.T * lam_vector).T + (x[index, :].T * (1.0 - lam_vector)).T
            output_x.append(mixed_x)
            if y is not None:
                mixed_y = (y.T * lam_vector).T + (y[index].T * (1.0 - lam_vector)).T
                output_y.append(mixed_y)
        # Bug fix: when y was None the method used to return inside the loop
        # after the first iteration, silently ignoring runs > 1.
        if y is None:
            return np.concatenate(output_x, axis=0)
        return np.concatenate(output_x, axis=0), np.concatenate(output_y, axis=0)

    def flow(self, data, labels=None, batch_size=32, shuffle=True, runs=1):
        """This function implements the batch iterator and specifically calls mixup

        :param data: Input data. Numpy ndarray or list of lists.
        :param labels: Labels. Numpy ndarray or list of lists.
        :param batch_size: Int (default: 32).
        :param shuffle: Boolean (default: True).
        :param runs: Int (default: 1). Number of augmentations

        :rtype: tuple
        :return: (generator yielding mixed batches, number of batches per epoch)."""

        self.validate(data=data, labels=labels, batch_size=batch_size, shuffle=shuffle, runs=runs)

        self.runs = runs

        # Bug fix: validate() only converts lists inside its own kwargs, so
        # convert here too -- the fancy indexing below requires ndarrays.
        data = np.asarray(data)
        if labels is not None:
            labels = np.asarray(labels)

        num_batches_per_epoch = int((len(data) - 1) / batch_size) + 1

        def data_generator():
            data_size = len(data)
            while True:
                # Shuffle the data at each epoch
                if shuffle:
                    shuffle_indices = np.random.permutation(np.arange(data_size))
                    shuffled_data = data[shuffle_indices]
                    if labels is not None:
                        shuffled_labels = labels[shuffle_indices]
                else:
                    shuffled_data = data
                    if labels is not None:
                        shuffled_labels = labels
                for batch_num in range(num_batches_per_epoch):
                    start_index = batch_num * batch_size
                    end_index = min((batch_num + 1) * batch_size, data_size)
                    X = shuffled_data[start_index: end_index]
                    if labels is None:
                        X = self.mixup_data(X, y=None)
                        yield X
                    else:
                        y = shuffled_labels[start_index: end_index]
                        X, y = self.mixup_data(X, y)
                        yield X, y

        return data_generator(), num_batches_per_epoch
129 |
--------------------------------------------------------------------------------
/textaugment/translate.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # WordNet-based data augmentation
3 | #
4 | # Copyright (C) 2020
5 | # Author: Joseph Sefara
6 | # URL:
7 | # For license information, see LICENSE
8 |
9 | from .constants import LANGUAGES
10 | from textblob import TextBlob
11 | from textblob.translate import NotTranslated
12 | from googletrans import Translator
13 |
14 |
class Translate:
    """
    A set of functions used to augment data.
    Supported languages are:
    Language Name	Code
    Afrikaans	af
    Albanian	sq
    Arabic	ar
    Azerbaijani	az
    Basque	eu
    Bengali	bn
    Belarusian	be
    Bulgarian	bg
    Catalan	ca
    Chinese Simplified	zh-CN
    Chinese Traditional	zh-TW
    Croatian	hr
    Czech	cs
    Danish	da
    Dutch	nl
    English	en
    Esperanto	eo
    Estonian	et
    Filipino	tl
    Finnish	fi
    French	fr
    Galician	gl
    Georgian	ka
    German	de
    Greek	el
    Gujarati	gu
    Haitian Creole	ht
    Hebrew	iw
    Hindi	hi
    Hungarian	hu
    Icelandic	is
    Indonesian	id
    Irish	ga
    Italian	it
    Japanese	ja
    Kannada	kn
    Korean	ko
    Latin	la
    Latvian	lv
    Lithuanian	lt
    Macedonian	mk
    Malay	ms
    Maltese	mt
    Norwegian	no
    Persian	fa
    Polish	pl
    Portuguese	pt
    Romanian	ro
    Russian	ru
    Serbian	sr
    Slovak	sk
    Slovenian	sl
    Spanish	es
    Swahili	sw
    Swedish	sv
    Tamil	ta
    Telugu	te
    Thai	th
    Turkish	tr
    Ukrainian	uk
    Urdu	ur
    Vietnamese	vi
    Welsh	cy
    Yiddish	yi

    Example usage: ::
        >>> from textaugment import Translate
        >>> t = Translate(src="en",to="es")
        >>> t.augment('I love school')
        i adore school
    """

    def __init__(self, **kwargs):

        """
        A method to initialize parameters

        :type src: str
        :param src: Source language of the text
        :type to: str
        :param to: Destination language to translate to. The language should be a family of the source language for
        better results. The text will then be translated back to the source language.
        :rtype: None
        :return: Constructor does not return.

        :raises ValueError: if 'to' or 'src' is missing.
        :raises KeyError: if either language code is unknown.
        """
        try:
            if "to" not in kwargs:
                raise ValueError("'to' missing")
            elif "src" not in kwargs:
                raise ValueError("'src' missing")
            # Fix: error messages previously misspelled "surpported".
            if kwargs['to'] not in LANGUAGES:
                raise KeyError("Value of to is not supported. See help(Translate)")
            if kwargs['src'] not in LANGUAGES:
                raise KeyError("Value of src is not supported. See help(Translate)")
        except (ValueError, KeyError):
            print("The values of the keys 'to' and 'src' are required. E.g Translate(src='en', to='es')")
            raise
        else:
            self.to = kwargs['to']
            self.src = kwargs['src']

    def augment(self, data):
        """
        A method to paraphrase a sentence.

        :type data: str
        :param data: sentence used for data augmentation
        :rtype: str
        :return: The augmented data

        :raises TypeError: if data is not a string.
        """
        if not isinstance(data, str):
            raise TypeError("DataType must be a string")
        data = TextBlob(data.lower())
        try:
            # Round trip: src -> to -> src; the back-translation paraphrases.
            data = data.translate(from_lang=self.src, to=self.to)
            data = data.translate(from_lang=self.to, to=self.src)
        except NotTranslated:
            try:  # Switch to googletrans to do translation.
                translator = Translator()
                data = translator.translate(data, dest=self.to, src=self.src).text
                data = translator.translate(data, dest=self.src, src=self.to).text
            except Exception:
                print("Error Not translated.\n")
                raise

        return str(data).lower()
148 |
--------------------------------------------------------------------------------
/textaugment/word2vec.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # Word2vec-based data augmentation
3 | #
4 | # Copyright (C) 2023
5 | # Author: Joseph Sefara
6 | # URL:
7 | # For license information, see LICENSE
8 |
9 | import gensim
10 | import numpy as np
11 | import random
12 |
13 |
14 | class Word2vec:
15 | """
16 | A set of functions used to augment data.
17 |
18 | Typical usage: ::
19 | >>> from textaugment import Word2vec
20 | >>> t = Word2vec(model='path/to/gensim/model'or 'gensim model itself')
21 | >>> t.augment('I love school', top_n=10)
22 | i adore school
23 | """
24 |
25 | def __init__(self, **kwargs):
26 | """
27 | A method to initialize a model on a given path.
28 | :type random_state: int, float, str, bytes, bytearray
29 | :param random_state: seed
30 | :type model: str or gensim.models.word2vec.Word2Vec or gensim.models.fasttext.FastText
31 | :param model: The path to the model or the model itself.
32 | :type runs: int, optional
33 | :param runs: The number of times to augment a sentence. By default is 1.
34 | :type v: bool or optional
35 | :param v: Replace all the words if true. If false randomly replace words.
36 | Used in a Paper (https://www.cs.cmu.edu/~diyiy/docs/emnlp_wang_2015.pdf)
37 | :type p: float, optional
38 | :param p: The probability of success of an individual trial. (0.1
>> from textaugment import Fasttext
162 | >>> t = Fasttext('path/to/gensim/model'or 'gensim model itself')
163 | >>> t.augment('I love school', top_n=10)
164 | i adore school
165 | """
166 | pass
167 |
--------------------------------------------------------------------------------
/textaugment/wordnet.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # WordNet-based data augmentation
3 | #
4 | # Copyright (C) 2023
5 | # Author: Joseph Sefara
6 | # URL:
7 | # For license information, see LICENSE
8 |
9 | import numpy as np
10 | import nltk
11 | from itertools import chain
12 | from nltk.corpus import wordnet
13 |
14 |
15 | class Wordnet:
16 | """
17 | A set of functions used to augment data.
18 |
19 | Typical usage: ::
20 | >>> import nltk
21 | >>> nltk.download('punkt')
22 | >>> nltk.download('wordnet')
23 | >>> nltk.download('averaged_perceptron_tagger')
24 | >>> from textaugment import Wordnet
25 | >>> t = Wordnet(v=True,n=True,p=0.5)
26 | >>> t.augment('I love school')
27 | i adore school
28 | """
29 |
30 | def __init__(self, **kwargs):
31 | """
32 | A method to initialize parameters
33 |
34 | :type random_state: int
35 | :param random_state: seed
36 | :type v: bool
37 | :param v: Verb, default is True
38 | :type n: bool
39 | :param n: Noun
40 | :type runs: int
41 | :param runs: Number of repetition on single text
42 | :type p: float, optional
43 | :param p: The probability of success of an individual trial. (0.1
= 1: # There are synonyms
126 | for word in words:
127 | synonyms1 = wordnet.synsets(word[1], wordnet.VERB, lang=lang) # Return verbs only
128 | synonyms = list(set(chain.from_iterable([syn.lemma_names(lang=lang) for syn in synonyms1])))
129 | synonyms_ = [] # Synonyms with no underscores goes here
130 | for w in synonyms:
131 | if '_' not in w:
132 | synonyms_.append(w) # Remove words with underscores
133 | if len(synonyms_) >= 1:
134 | synonyms_ = synonyms_[:top_n if top_n else len(synonyms_)] # use top n or all synonyms
135 | synonym = self.geometric(data=synonyms_).tolist()
136 | if synonym: # There is a synonym
137 | data[int(word[0])] = synonym[0].lower() # Take the first success
138 |
139 | if self.n:
140 | for loop in range(self.runs):
141 | words = [[i, x] for i, x, y in data_tokens if y[0] == 'N']
142 | words = [i for i in self.geometric(data=words)] # List of selected words
143 | if len(words) >= 1: # There are synonyms
144 | for word in words:
145 | synonyms1 = wordnet.synsets(word[1], wordnet.NOUN, lang=lang) # Return nouns only
146 | synonyms = list(set(chain.from_iterable([syn.lemma_names(lang=lang) for syn in synonyms1])))
147 | synonyms_ = [] # Synonyms with no underscores goes here
148 | for w in synonyms:
149 | if '_' not in w:
150 | synonyms_.append(w) # Remove words with underscores
151 | if len(synonyms_) >= 1:
152 | synonyms_ = synonyms_[:top_n if top_n else len(synonyms_)] # use top n or all synonyms
153 | synonym = self.geometric(data=synonyms_).tolist()
154 | if synonym: # There is a synonym
155 | data[int(word[0])] = synonym[0].lower() # Take the first success
156 |
157 | return " ".join(data)
158 |
159 | def augment(self, data, lang="eng", top_n=10):
160 | """
161 | Data augmentation for text. Generate new dataset based on verb/nouns synonyms.
162 |
163 | :type data: str
164 | :param data: sentence used for data augmentation
165 | :rtype: str
166 | :return: The augmented data
167 | :type lang: str
168 | :param lang: choose lang
169 | :type top_n: int
170 | :param top_n: top_n of synonyms to randomly choose from
171 |
172 | :rtype: str
173 | :return: The augmented data
174 | """
175 | # Error handling
176 | if type(data) is not str:
177 | raise TypeError("Only strings are supported")
178 | if type(lang) is not str:
179 | raise TypeError("Only strings are supported")
180 | if type(top_n) is not int:
181 | raise TypeError("Only integers are supported")
182 |
183 | data = self.replace(data, lang, top_n)
184 | return data
185 |
--------------------------------------------------------------------------------