├── .gitignore
├── LICENSE
├── README.md
└── data_augmentation
    ├── ppdb-xl.txt
    ├── word_replacment.py
    └── words_shuffling.py

--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
*$py.class

# C extensions
*.so

# Distribution / packaging
.Python
build/
develop-eggs/
dist/
downloads/
eggs/
.eggs/
lib/
lib64/
parts/
sdist/
var/
wheels/
*.egg-info/
.installed.cfg
*.egg
MANIFEST

# PyInstaller
# Usually these files are written by a python script from a template
# before PyInstaller builds the exe, so as to inject date/other infos into it.
*.manifest
*.spec

# Installer logs
pip-log.txt
pip-delete-this-directory.txt

# Unit test / coverage reports
htmlcov/
.tox/
.coverage
.coverage.*
.cache
nosetests.xml
coverage.xml
*.cover
.hypothesis/
.pytest_cache/

# Translations
*.mo
*.pot

# Django stuff:
*.log
local_settings.py
db.sqlite3

# Flask stuff:
instance/
.webassets-cache

# Scrapy stuff:
.scrapy

# Sphinx documentation
docs/_build/

# PyBuilder
target/

# Jupyter Notebook
.ipynb_checkpoints

# pyenv
.python-version

# celery beat schedule file
celerybeat-schedule

# SageMath parsed files
*.sage.py

# Environments
.env
.venv
env/
venv/
ENV/
env.bak/
venv.bak/

# Spyder project settings
.spyderproject
.spyproject

# Rope project settings
.ropeproject

# mkdocs documentation
/site

# mypy
.mypy_cache/

--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
MIT License

Copyright (c) 2019 Opla.ai

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.

--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# Text augmentation for Machine Learning tasks:
# How to grow your text dataset for classification?

This is the implementation of some of the data augmentation techniques we explained in our blog article:
https://medium.com/opla/text-augmentation-for-machine-learning-tasks-how-to-grow-your-text-dataset-for-classification-38a9a207f88d

Text augmentation is a technique used when the available training data is too small to achieve accurate performance on machine learning tasks. The goal is to extend the text data by creating new, similar content.

words_shuffling.py disrupts the word order of a sentence to create a new sentence.

word_replacment.py replaces some words of each sentence with a similar word, a synonym in this case.

Synonyms are listed in ppdb-xl.txt, extracted from http://paraphrase.org/#/ .

Requirements:
- NLTK

--------------------------------------------------------------------------------
/data_augmentation/word_replacment.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright (c) 2015-present, CWB SAS

This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.

__author__ = "Maali Mnasri"
__copyright__ = "Copyright (c) 2015-present, CWB SAS - All Rights Reserved"
"""

from nltk import word_tokenize
from nltk.corpus import stopwords

stoplist = stopwords.words('english')


def get_synonyms_lexicon(path):
    """Load the synonyms lexicon: each line holds a word followed by its synonyms."""
    synonyms_lexicon = {}
    with open(path) as f:
        for line in f:
            entry = line.strip().split(' ')
            # The first token is the word, the remaining tokens are its synonyms.
            synonyms_lexicon[entry[0]] = entry[1:]
    return synonyms_lexicon


def synonym_replacement(sentence, synonyms_lexicon):
    """Replace each non-stopword found in the lexicon with its first synonym."""
    words = word_tokenize(sentence)
    n_sentence = sentence
    for w in words:
        if w not in stoplist and w in synonyms_lexicon:
            n_sentence = n_sentence.replace(w, synonyms_lexicon[w][0])  # we replace with the first synonym
    return n_sentence


if __name__ == '__main__':
    text = 'Many customers initiated a return process of the product as it was not suitable for use. ' \
           'It was conditioned in a very thin box which caused scratches on the main screen. ' \
           'The involved businesses positively answered their clients who were fully refunded.'
    # Split into sentences and drop the empty string left after the final period.
    sentences = [s.strip() for s in text.split('.') if s.strip()]
    print(sentences)
    synonyms_lexicon = get_synonyms_lexicon('./ppdb-xl.txt')
    for sentence in sentences:
        new_sentence = synonym_replacement(sentence, synonyms_lexicon)
        print('%s' % sentence)
        print('%s' % new_sentence)
        print()

--------------------------------------------------------------------------------
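A note on the script above: synonym_replacement uses str.replace, which operates on raw substrings, so swapping a short word such as "use" can also rewrite words that merely contain it (for instance "caused"). Below is a minimal token-level sketch that avoids this, assuming the same lexicon format; the helper name replace_tokens is ours, not part of the repo.

# Hypothetical helper (not in this repo): a token-level variant of
# synonym_replacement. It rebuilds the sentence from tokens, so only whole
# words are swapped and substrings inside longer words are left alone.
from nltk import word_tokenize
from nltk.corpus import stopwords


def replace_tokens(sentence, synonyms_lexicon):
    stoplist = stopwords.words('english')
    tokens = word_tokenize(sentence)
    new_tokens = [synonyms_lexicon[t][0]
                  if t not in stoplist and t in synonyms_lexicon
                  else t
                  for t in tokens]
    return ' '.join(new_tokens)

Joining on spaces is a crude detokenization (punctuation gets padded with spaces), but for augmentation purposes the output only needs to preserve the words, not exact spacing.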
--------------------------------------------------------------------------------
/data_augmentation/words_shuffling.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
Copyright (c) 2015-present, CWB SAS

This source code is licensed under the MIT license found in the
LICENSE file in the root directory of this source tree.

__author__ = "Maali Mnasri"
__copyright__ = "Copyright (c) 2015-present, CWB SAS - All Rights Reserved"
"""

import random

from nltk import word_tokenize


def augment(sentence, n):
    """Generate up to n new sentences by shuffling the words of the input sentence."""
    new_sentences = []
    words = word_tokenize(sentence)
    for i in range(n):
        random.shuffle(words)
        new_sentences.append(' '.join(words))
    # Shuffling can produce the same ordering twice, so keep each variant only once.
    new_sentences = list(set(new_sentences))
    return new_sentences


if __name__ == '__main__':
    nsentences = augment("my new year resolution is to perfect many things as the main solution", 10)
    for s in nsentences:
        print(s)

--------------------------------------------------------------------------------
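A setup note for both scripts: NLTK's tokenizer models and stopword list ship separately from the library itself, so word_tokenize and stopwords.words('english') fail until the corresponding data packages are installed. A one-time setup sketch, using NLTK's standard resource names:

# One-time download of the NLTK data both scripts depend on.
import nltk

nltk.download('punkt')      # tokenizer models required by word_tokenize
nltk.download('stopwords')  # stopword lists required by nltk.corpus.stopwords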