├── test
├── __init__.py
├── res
│ └── audio
│ │ ├── Pink_noise.ogg
│ │ └── Yamaha-V50-Rock-Beat-120bpm.wav
├── augmenter
│ ├── word
│ │ ├── test_split.py
│ │ ├── test_antonym.py
│ │ ├── test_spelling.py
│ │ └── test_random_word.py
│ ├── audio
│ │ ├── test_shift.py
│ │ ├── test_audio.py
│ │ ├── test_vtlp.py
│ │ ├── test_crop.py
│ │ ├── test_speed.py
│ │ ├── test_loudness.py
│ │ ├── test_pitch.py
│ │ ├── test_mask.py
│ │ └── test_noise.py
│ ├── spectrogram
│ │ ├── test_spectrogram.py
│ │ ├── test_time_masking.py
│ │ └── test_frequency_masking.py
│ ├── char
│ │ ├── test_ocr.py
│ │ └── test_keyboard.py
│ ├── sentence
│ │ └── test_sentence.py
│ └── test_augmenter.py
├── model
│ ├── word
│ │ └── test_word_embs_model.py
│ └── char
│ │ └── test_keyboard_model.py
├── run_test.py
├── profiling
│ └── sentence
│ │ └── test_context_word_embs_sentence_profiling.py
└── flow
│ ├── test_sometimes.py
│ └── test_sequential.py
├── nlpaug
├── augmenter
│ ├── __init__.py
│ ├── augment.py
│ ├── sentence
│ │ ├── __init__.py
│ │ └── sentence_augmenter.py
│ ├── char
│ │ ├── __init__.py
│ │ └── char_augmenter.py
│ ├── spectrogram
│ │ ├── __init__.py
│ │ ├── time_warping.py
│ │ ├── spectrogram_augmenter.py
│ │ ├── time_masking.py
│ │ └── frequency_masking.py
│ ├── word
│ │ ├── __init__.py
│ │ └── split.py
│ └── audio
│ │ ├── __init__.py
│ │ ├── audio_augmenter.py
│ │ ├── shift.py
│ │ ├── speed.py
│ │ ├── loudness.py
│ │ ├── noise.py
│ │ ├── vtlp.py
│ │ ├── pitch.py
│ │ ├── crop.py
│ │ └── mask.py
├── model
│ ├── __init__.py
│ ├── char
│ │ ├── char.py
│ │ ├── __init__.py
│ │ ├── ocr.py
│ │ └── keyboard.py
│ ├── spectrogram
│ │ ├── spectrogram.py
│ │ ├── __init__.py
│ │ ├── time_masking.py
│ │ ├── frequency_masking.py
│ │ └── time_warping.py
│ ├── word_stats
│ │ ├── __init__.py
│ │ └── word_statistics.py
│ ├── word_embs
│ │ ├── __init__.py
│ │ ├── fasttext.py
│ │ ├── word_embeddings.py
│ │ ├── glove.py
│ │ └── word2vec.py
│ ├── word_dict
│ │ ├── __init__.py
│ │ ├── word_dictionary.py
│ │ ├── wordnet.py
│ │ ├── spelling.py
│ │ └── ppdb.py
│ ├── lang_models
│ │ ├── __init__.py
│ │ ├── gpt2.py
│ │ ├── distilbert.py
│ │ ├── bert.py
│ │ └── roberta.py
│ └── audio
│ │ ├── __init__.py
│ │ ├── shift.py
│ │ ├── crop.py
│ │ ├── loudness.py
│ │ ├── mask.py
│ │ ├── speed.py
│ │ ├── pitch.py
│ │ ├── audio.py
│ │ ├── noise.py
│ │ └── vtlp.py
├── util
│ ├── file
│ │ └── __init__.py
│ ├── math
│ │ ├── __init__.py
│ │ └── normalization.py
│ ├── decorator
│ │ ├── __init__.py
│ │ └── deprecation.py
│ ├── audio
│ │ ├── __init__.py
│ │ ├── loader.py
│ │ └── visualizer.py
│ ├── text
│ │ ├── __init__.py
│ │ ├── tokenizer.py
│ │ └── part_of_speech.py
│ ├── exception
│ │ ├── __init__.py
│ │ ├── exception_info.py
│ │ └── warning.py
│ ├── selection
│ │ ├── __init__.py
│ │ └── randomness.py
│ ├── doc
│ │ ├── __init__.py
│ │ ├── token.py
│ │ ├── change_log.py
│ │ └── doc.py
│ ├── __init__.py
│ ├── method.py
│ └── action.py
├── __init__.py
├── flow
│ ├── __init__.py
│ ├── sequential.py
│ └── sometimes.py
├── .gitignore
└── res
│ └── char
│ └── keyboard
│ ├── en.json
│ └── th.json
├── MANIFEST.in
├── .readthedocs.yml
├── .gitattributes
├── res
├── logo_small.png
├── audio_example.png
└── textual_example.png
├── docs
├── util
│ ├── util.rst
│ └── download.rst
├── flow
│ ├── flow.rst
│ ├── sequential.rst
│ └── sometimes.rst
├── augmenter
│ ├── sentence
│ │ ├── sentence.rst
│ │ └── context_word_embs_sentence.rst
│ ├── char
│ │ ├── char.rst
│ │ ├── keyboard.rst
│ │ ├── ocr.rst
│ │ └── random.rst
│ ├── spectrogram
│ │ ├── spectrogram.rst
│ │ ├── time_masking.rst
│ │ └── frequency_masking.rst
│ ├── audio
│ │ ├── vtlp.rst
│ │ ├── corp.rst
│ │ ├── mask.rst
│ │ ├── noise.rst
│ │ ├── pitch.rst
│ │ ├── shift.rst
│ │ ├── speed.rst
│ │ ├── loudness.rst
│ │ └── audio.rst
│ ├── word
│ │ ├── split.rst
│ │ ├── random.rst
│ │ ├── tfidf.rst
│ │ ├── antonym.rst
│ │ ├── synonym.rst
│ │ ├── spelling.rst
│ │ ├── word_embs.rst
│ │ ├── context_word_embs.rst
│ │ └── word.rst
│ └── augmenter.rst
├── index.rst
├── Makefile
├── example
│ └── example.rst
├── make.bat
└── overview
│ └── overview.rst
├── .codacy.yml
├── requirements.txt
├── .travis.yml
├── script.txt
├── setup.py
├── LICENSE
├── codecov.yml
├── .gitignore
├── model
└── char
│ └── keyboard
│ ├── en.json
│ └── th.json
└── example
└── tfidf-train_model.ipynb
/test/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/nlpaug/model/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/MANIFEST.in:
--------------------------------------------------------------------------------
1 | recursive-include nlpaug/res *.json
--------------------------------------------------------------------------------
/.readthedocs.yml:
--------------------------------------------------------------------------------
1 | python:
2 | version: 3.6
3 |
--------------------------------------------------------------------------------
/nlpaug/util/file/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.file.download import *
2 |
--------------------------------------------------------------------------------
/nlpaug/util/math/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.math.normalization import *
2 |
--------------------------------------------------------------------------------
/.gitattributes:
--------------------------------------------------------------------------------
1 | # Auto detect text files and perform LF normalization
2 | * text=auto
3 |
--------------------------------------------------------------------------------
/nlpaug/util/decorator/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.decorator.deprecation import *
2 |
--------------------------------------------------------------------------------
/res/logo_small.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/nlpaug/master/res/logo_small.png
--------------------------------------------------------------------------------
/res/audio_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/nlpaug/master/res/audio_example.png
--------------------------------------------------------------------------------
/docs/util/util.rst:
--------------------------------------------------------------------------------
1 | Util
2 | ====
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 |
7 | ./download
8 |
--------------------------------------------------------------------------------
/res/textual_example.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/nlpaug/master/res/textual_example.png
--------------------------------------------------------------------------------
/test/res/audio/Pink_noise.ogg:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/nlpaug/master/test/res/audio/Pink_noise.ogg
--------------------------------------------------------------------------------
/.codacy.yml:
--------------------------------------------------------------------------------
1 | exclude_paths:
2 | - test/*
3 | - README.md
4 | - CHANGE.md
5 | - SOURCE.md
6 | - docs/conf.py
--------------------------------------------------------------------------------
/nlpaug/model/char/char.py:
--------------------------------------------------------------------------------
class Character:
    """Base class for character-level models.

    :param cache: when True, subclasses may memoize computed lookups.
    """

    def __init__(self, cache=True):
        # Subclasses consult this flag before caching results.
        self.cache = cache
4 |
--------------------------------------------------------------------------------
/docs/flow/flow.rst:
--------------------------------------------------------------------------------
1 | Flow
2 | ====
3 |
4 | .. toctree::
5 | :maxdepth: 3
6 |
7 | ./sequential
8 | ./sometimes
9 |
--------------------------------------------------------------------------------
/nlpaug/util/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.audio.loader import *
2 | from nlpaug.util.audio.visualizer import *
3 |
--------------------------------------------------------------------------------
/nlpaug/util/text/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.text.tokenizer import *
2 | from nlpaug.util.text.part_of_speech import *
3 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | matplotlib>=2.2.2
2 | numpy>=1.16.2
3 | setuptools>=39.1.0
4 | python-dotenv>=0.10.1
5 | requests>=2.22.0
6 |
--------------------------------------------------------------------------------
/nlpaug/model/spectrogram/spectrogram.py:
--------------------------------------------------------------------------------
class Spectrogram:
    """Abstract base for spectrogram masking models."""

    def mask(self, data):
        """Apply masking to *data*; concrete subclasses must override."""
        raise NotImplementedError
4 |
--------------------------------------------------------------------------------
/nlpaug/util/exception/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.exception.exception_info import *
2 | from nlpaug.util.exception.warning import *
3 |
--------------------------------------------------------------------------------
/nlpaug/util/selection/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.selection.filtering import *
2 | from nlpaug.util.selection.randomness import *
3 |
--------------------------------------------------------------------------------
/test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/binhetech/nlpaug/master/test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav
--------------------------------------------------------------------------------
/nlpaug/util/doc/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.doc.doc import *
2 | from nlpaug.util.doc.change_log import *
3 | from nlpaug.util.doc.token import *
4 |
--------------------------------------------------------------------------------
/docs/augmenter/sentence/sentence.rst:
--------------------------------------------------------------------------------
1 | Sentence Augmenter
2 | ==================
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | ./context_word_embs_sentence
8 |
--------------------------------------------------------------------------------
/docs/augmenter/char/char.rst:
--------------------------------------------------------------------------------
1 | Character Augmenter
2 | ===================
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | ./keyboard
8 | ./ocr
9 | ./random
--------------------------------------------------------------------------------
/nlpaug/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.base_augmenter import *
3 |
4 | __all__ = ['base_augmenter']
5 |
6 | __version__ = '0.0.14'
7 |
--------------------------------------------------------------------------------
/docs/augmenter/spectrogram/spectrogram.rst:
--------------------------------------------------------------------------------
1 | Spectrogram Augmenter
2 | =====================
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | ./frequency_masking
8 | ./time_masking
--------------------------------------------------------------------------------
/nlpaug/flow/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.flow.pipeline import *
3 | from nlpaug.flow.sequential import *
4 | from nlpaug.flow.sometimes import *
5 |
--------------------------------------------------------------------------------
/nlpaug/model/word_stats/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.model.word_stats.word_statistics import *
3 | from nlpaug.model.word_stats.tfidf import *
4 |
--------------------------------------------------------------------------------
/docs/flow/sequential.rst:
--------------------------------------------------------------------------------
1 | nlpaug.flow\.sequential
2 | ==========================================
3 |
4 | .. automodule:: nlpaug.flow.sequential
5 | :members:
6 | :show-inheritance:
7 |
--------------------------------------------------------------------------------
/docs/flow/sometimes.rst:
--------------------------------------------------------------------------------
1 | nlpaug.flow\.sometimes
2 | ==========================================
3 |
4 | .. automodule:: nlpaug.flow.sometimes
5 | :members:
6 | :show-inheritance:
7 |
--------------------------------------------------------------------------------
/docs/util/download.rst:
--------------------------------------------------------------------------------
1 | nlpaug.util.file\.download
2 | ==========================================
3 |
4 | .. automodule:: nlpaug.util.file.download
5 | :members:
6 | :show-inheritance:
7 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/augment.py:
--------------------------------------------------------------------------------
1 |
2 |
class Augment:
    """Record of one augmentation: the token at *pos* changed from *original* to *new*."""

    def __init__(self, pos, original, new):
        # Pure data holder; no validation is performed.
        self.new = new
        self.original = original
        self.pos = pos
8 |
--------------------------------------------------------------------------------
/nlpaug/model/char/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.model.char.char import *
3 | from nlpaug.model.char.keyboard import *
4 | from nlpaug.model.char.ocr import *
5 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/sentence/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.augmenter.sentence.sentence_augmenter import *
3 | from nlpaug.augmenter.sentence.context_word_embs_sentence import *
4 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/vtlp.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.vtlp
2 | ============================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.vtlp
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/word/split.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.split
2 | ============================
3 |
4 | .. automodule:: nlpaug.augmenter.word.split
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/word/random.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.random
2 | ================================
3 |
4 | .. automodule:: nlpaug.augmenter.word.random
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/word/tfidf.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.tfidf
2 | ================================
3 |
4 | .. automodule:: nlpaug.augmenter.word.tfidf
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/char/keyboard.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.char\.keyboard
2 | ===============================
3 |
4 | .. automodule:: nlpaug.augmenter.char.keyboard
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/char/ocr.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.char\.ocr
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.char.ocr
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/word/antonym.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.antonym
2 | ==============================
3 |
4 | .. automodule:: nlpaug.augmenter.word.antonym
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/word/synonym.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.synonym
2 | ==============================
3 |
4 | .. automodule:: nlpaug.augmenter.word.synonym
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/augmenter.rst:
--------------------------------------------------------------------------------
1 | Augmenter
2 | =========
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | ./audio/audio
8 | ./char/char
9 | ./sentence/sentence
10 | ./spectrogram/spectrogram
11 | ./word/word
--------------------------------------------------------------------------------
/docs/augmenter/word/spelling.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.spelling
2 | ================================
3 |
4 | .. automodule:: nlpaug.augmenter.word.spelling
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/word/word_embs.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.word_embs
2 | ================================
3 |
4 | .. automodule:: nlpaug.augmenter.word.word_embs
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/corp.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.crop
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.crop
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/mask.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.mask
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.mask
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/noise.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.noise
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.noise
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/pitch.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.pitch
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.pitch
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/shift.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.shift
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.shift
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/speed.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.speed
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.speed
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/char/random.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.char\.random
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.char.random
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/nlpaug/model/spectrogram/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.model.spectrogram.spectrogram import *
3 | from nlpaug.model.spectrogram.frequency_masking import *
4 | from nlpaug.model.spectrogram.time_masking import *
--------------------------------------------------------------------------------
/docs/augmenter/audio/loudness.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.audio\.loudness
2 | ==============================================
3 |
4 | .. automodule:: nlpaug.augmenter.audio.loudness
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/audio/audio.rst:
--------------------------------------------------------------------------------
1 | Audio Augmenter
2 | ===============
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | ./corp
8 | ./loudness
9 | ./mask
10 | ./noise
11 | ./pitch
12 | ./shift
13 | ./speed
14 | ./vtlp
--------------------------------------------------------------------------------
/docs/augmenter/word/context_word_embs.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.word\.context_word_embs
2 | ========================================
3 |
4 | .. automodule:: nlpaug.augmenter.word.context_word_embs
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/char/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.augmenter.char.char_augmenter import *
3 | from nlpaug.augmenter.char.ocr import *
4 | from nlpaug.augmenter.char.random import *
5 | from nlpaug.augmenter.char.keyboard import *
6 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/spectrogram/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.augmenter.spectrogram.spectrogram_augmenter import *
3 | from nlpaug.augmenter.spectrogram.frequency_masking import *
4 | from nlpaug.augmenter.spectrogram.time_masking import *
--------------------------------------------------------------------------------
/docs/augmenter/spectrogram/time_masking.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.spectrogram\.time_masking
2 | ==========================================
3 |
4 | .. automodule:: nlpaug.augmenter.spectrogram.time_masking
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/nlpaug/model/word_embs/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.model.word_embs.word_embeddings import *
3 | from nlpaug.model.word_embs.glove import *
4 | from nlpaug.model.word_embs.word2vec import *
5 | from nlpaug.model.word_embs.fasttext import *
--------------------------------------------------------------------------------
/docs/augmenter/word/word.rst:
--------------------------------------------------------------------------------
1 | Word Augmenter
2 | ==============
3 |
4 | .. toctree::
5 | :maxdepth: 6
6 |
7 | ./antonym
8 | ./context_word_embs
9 | ./random
10 | ./spelling
11 | ./split
12 | ./synonym
13 | ./tfidf
14 | ./word_embs
15 |
--------------------------------------------------------------------------------
/nlpaug/model/word_dict/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.model.word_dict.word_dictionary import *
3 | from nlpaug.model.word_dict.spelling import *
4 | from nlpaug.model.word_dict.wordnet import *
5 | from nlpaug.model.word_dict.ppdb import *
6 |
--------------------------------------------------------------------------------
/docs/augmenter/spectrogram/frequency_masking.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.spectrogram\.frequency_masking
2 | ===============================================
3 |
4 | .. automodule:: nlpaug.augmenter.spectrogram.frequency_masking
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/docs/augmenter/sentence/context_word_embs_sentence.rst:
--------------------------------------------------------------------------------
1 | nlpaug.augmenter.sentence\.context_word_embs_sentence
2 | =====================================================
3 |
4 | .. automodule:: nlpaug.augmenter.sentence.context_word_embs_sentence
5 | :members:
6 | :inherited-members:
7 | :show-inheritance:
8 |
--------------------------------------------------------------------------------
/.travis.yml:
--------------------------------------------------------------------------------
1 | language: python
2 | python:
3 | - "3.6"
4 |
5 | install:
6 | - pip install -r requirements.txt
7 | - pip install coverage
8 | - pip install codecov
9 | - pip install .
10 | script:
11 | - python test/run_test.py
12 | - coverage run test/run_test.py
13 |
14 | after_success:
15 | - codecov
--------------------------------------------------------------------------------
/nlpaug/util/__init__.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.action import *
2 | from nlpaug.util.doc import *
3 | from nlpaug.util.method import *
4 | from nlpaug.util.exception import *
5 | from nlpaug.util.math import *
6 | from nlpaug.util.text import *
7 | from nlpaug.util.audio import *
8 |
9 | from nlpaug.util.file import *
10 | from nlpaug.util.decorator import *
11 |
--------------------------------------------------------------------------------
/nlpaug/util/method.py:
--------------------------------------------------------------------------------
class Method:
    """Labels for the augmentation method categories supported by nlpaug."""

    CHAR = 'char'
    WORD = 'word'
    SENTENCE = 'sentence'
    SPECTROGRAM = 'spectrogram'
    AUDIO = 'audio'

    FLOW = 'flow'

    @staticmethod
    def getall():
        """Return every method label, in the library's canonical order."""
        ordering = ('CHAR', 'WORD', 'SENTENCE', 'AUDIO', 'SPECTROGRAM', 'FLOW')
        return [getattr(Method, attr) for attr in ordering]
13 |
14 |
--------------------------------------------------------------------------------
/nlpaug/model/lang_models/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.model.lang_models.language_models import *
3 | from nlpaug.model.lang_models.bert import *
4 | from nlpaug.model.lang_models.xlnet import *
5 | from nlpaug.model.lang_models.gpt2 import *
6 | from nlpaug.model.lang_models.distilbert import *
7 | from nlpaug.model.lang_models.roberta import *
8 |
--------------------------------------------------------------------------------
/nlpaug/util/selection/randomness.py:
--------------------------------------------------------------------------------
1 | try:
2 | import torch
3 | except ImportError:
4 | # No installation required if not using this function
5 | pass
6 | import numpy as np
7 | import random
8 |
9 |
class Randomness:
    """Helper that seeds every supported random number generator in one call."""

    @staticmethod
    def seed(seed):
        """Seed python's ``random``, numpy, and (when installed) torch RNGs.

        :param seed: integer seed forwarded to each generator.
        """
        random.seed(seed)
        np.random.seed(seed)
        try:
            # torch is an optional dependency (its module-level import is
            # wrapped in try/except ImportError), so the name may be unbound
            # here. Guard against NameError instead of crashing when the
            # caller has no torch installed.
            torch.manual_seed(seed)
            torch.cuda.manual_seed(seed)
        except NameError:
            pass
17 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/__init__.py:
--------------------------------------------------------------------------------
1 | from __future__ import absolute_import
2 | from nlpaug.model.audio.audio import *
3 | from nlpaug.model.audio.noise import *
4 | from nlpaug.model.audio.shift import *
5 | from nlpaug.model.audio.speed import *
6 | from nlpaug.model.audio.pitch import *
7 | from nlpaug.model.audio.loudness import *
8 | from nlpaug.model.audio.crop import *
9 | from nlpaug.model.audio.mask import *
10 | from nlpaug.model.audio.vtlp import *
11 |
--------------------------------------------------------------------------------
/nlpaug/util/exception/exception_info.py:
--------------------------------------------------------------------------------
1 |
class ExceptionInfo:
    """Describes one reported issue: its name, type label, code and message."""

    def __init__(self, name, exp_type, code, msg):
        self.name = name
        self.exp_type = exp_type
        self.code = code
        self.msg = msg

    def output(self):
        """Print the issue as a single formatted line."""
        print('[{}] Name:{}, Code:{}, Message:{}'.format(self.exp_type, self.name, self.code, self.msg))


class ExceptionType:
    """Known issue type labels."""

    WARNING = 'Warning'
--------------------------------------------------------------------------------
/nlpaug/util/action.py:
--------------------------------------------------------------------------------
class Action:
    """Labels for augmentation actions and for flow orchestration strategies."""

    INSERT = 'insert'
    SUBSTITUTE = 'substitute'
    DELETE = 'delete'
    SWAP = 'swap'
    SPLIT = 'split'
    ALIGN = 'align'

    SEQUENTIAL = 'sequential'
    SOMETIMES = 'sometimes'

    @staticmethod
    def getall():
        """Return every action label (original ordering preserved)."""
        attrs = ('INSERT', 'SUBSTITUTE', 'SWAP', 'DELETE', 'SPLIT',
                 'SEQUENTIAL', 'SOMETIMES', 'ALIGN')
        return [getattr(Action, a) for a in attrs]
--------------------------------------------------------------------------------
/nlpaug/util/text/tokenizer.py:
--------------------------------------------------------------------------------
1 | import re
2 |
3 | ADDING_SPACE_AROUND_PUNCTUATION_REGEX = re.compile(r'(?=1.1.0
[NOTE: the body of tokenizer.py is garbled in this dump — the regex literal above is truncated and file lines 4–23 are missing; consult the upstream repository for the full content.]
24 |
25 |
--------------------------------------------------------------------------------
/nlpaug/util/exception/warning.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.exception.exception_info import ExceptionInfo, ExceptionType
2 |
3 |
class WarningException(ExceptionInfo):
    """ExceptionInfo specialized to warning-severity issues."""

    def __init__(self, name, code, msg):
        # Severity is fixed to WARNING; everything else is caller-supplied.
        super().__init__(name=name, exp_type=ExceptionType.WARNING, code=code, msg=msg)
7 |
8 |
class WarningName:
    # Descriptive names passed as `name` when constructing WarningException.
    INPUT_VALIDATION_WARNING = 'Input validation issue'
    OUT_OF_VOCABULARY = 'Out of vocabulary issue'
12 |
13 |
class WarningCode:
    # Stable identifiers passed as `code` when constructing WarningException.
    WARNING_CODE_001 = 'W001'
    WARNING_CODE_002 = 'W002'
17 |
18 |
class WarningMessage:
    # Reusable message texts passed as `msg` when constructing WarningException.
    LENGTH_IS_ZERO = 'Length of input is 0'
    NO_WORD = 'No other word except stop words and OOV. Returning input data without augmentation'

    # Format placeholders: deprecated feature name, release version, replacement name.
    DEPRECATED = 'Warning: {} will be removed after {} release. Change to use {}'
24 |
--------------------------------------------------------------------------------
/docs/example/example.rst:
--------------------------------------------------------------------------------
1 | Example
2 | =======
3 |
4 | The following examples show standard use cases for each augmenter.
5 |
6 | - `Audio augmenters`_
7 | - `Textual augmenters`_
8 | - `Spectrogram augmenters`_
9 | - `Custom augmenter`_
10 | - `TF-IDF model training`_
11 | - `Flow`_
12 |
13 | .. _Audio augmenters: https://github.com/makcedward/nlpaug/blob/master/example/audio_augmenter.ipynb
14 | .. _Textual augmenters: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb
15 | .. _Spectrogram augmenters: https://github.com/makcedward/nlpaug/blob/master/example/spectrogram_augmenter.ipynb
16 | .. _Custom augmenter: https://github.com/makcedward/nlpaug/blob/master/example/custom_augmenter.ipynb
17 | .. _TF-IDF model training: https://github.com/makcedward/nlpaug/blob/master/example/tfidf-train_model.ipynb
18 | .. _Flow: https://github.com/makcedward/nlpaug/blob/master/example/flow.ipynb
--------------------------------------------------------------------------------
/nlpaug/flow/sequential.py:
--------------------------------------------------------------------------------
1 | """
2 | Flow that apply augmentation sequentially.
3 | """
4 |
5 | from nlpaug.util import Action
6 | from nlpaug.flow import Pipeline
7 |
8 |
class Sequential(Pipeline):
    """
    Flow that applies its augmenters one after another.

    :param list flow: list of flow or augmenter
    :param str name: Name of this augmenter

    >>> import nlpaug.flow as naf
    >>> import nlpaug.augmenter.char as nac
    >>> import nlpaug.augmenter.word as naw
    >>> flow = naf.Sequential([nac.RandomCharAug(), naw.RandomWordAug()])
    """

    def __init__(self, flow=None, name='Sequential_Pipeline', include_detail=False, verbose=0):
        super().__init__(name=name, action=Action.SEQUENTIAL, flow=flow,
                         include_detail=include_detail, verbose=verbose)

    def draw(self):
        # A sequential pipeline always executes every augmenter.
        return True
28 |
--------------------------------------------------------------------------------
/docs/make.bat:
--------------------------------------------------------------------------------
@ECHO OFF

pushd %~dp0

REM Command file for Sphinx documentation

REM Default to the sphinx-build found on PATH unless the caller overrides it.
if "%SPHINXBUILD%" == "" (
	set SPHINXBUILD=sphinx-build
)
set SOURCEDIR=.
set BUILDDIR=../build
set SPHINXPROJ=nlpaug

REM No build target supplied: fall through to the Sphinx help listing.
if "%1" == "" goto help

REM Probe that sphinx-build is runnable before delegating to it.
%SPHINXBUILD% >NUL 2>NUL
if errorlevel 9009 (
	echo.
	echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
	echo.installed, then set the SPHINXBUILD environment variable to point
	echo.to the full path of the 'sphinx-build' executable. Alternatively you
	echo.may add the Sphinx directory to PATH.
	echo.
	echo.If you don't have Sphinx installed, grab it from
	echo.http://sphinx-doc.org/
	exit /b 1
)

REM Delegate the requested target (html, latex, ...) to Sphinx.
%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%
goto end

:help
%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS%

:end
popd
37 |
--------------------------------------------------------------------------------
/setup.py:
--------------------------------------------------------------------------------
from setuptools import setup, find_packages
import sys

# Packaging script for nlpaug. Python 2 is unsupported.
if sys.version_info < (3,):
    sys.exit("Sorry, Python3 is required.")

# Reuse the README as the PyPI long description.
with open("README.md", encoding="utf8") as f:
    readme = f.read()

setup(
    name="nlpaug",
    version="0.0.14",
    author="Edward Ma",
    author_email="makcedward@gmail.com",
    url="https://github.com/makcedward/nlpaug",
    license="MIT",
    description="Natural language processing augmentation library for deep neural networks",
    long_description=readme,
    long_description_content_type="text/markdown",
    # Bug fix: `exclude` expects an iterable of package-name patterns. Passing
    # the string "test" iterated it character-by-character ('t', 'e', 's', 't')
    # and therefore excluded nothing, shipping the test package in the dist.
    packages=find_packages(exclude=["test", "test.*"]),
    include_package_data=True,
    keywords=[
        "deep learning", "neural network", "machine learning",
        "nlp", "natural language processing", "text", "audio", "spectrogram",
        "augmentation", "adversarial attack", "ai", "ml"]
)
27 |
--------------------------------------------------------------------------------
/test/augmenter/word/test_antonym.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | import nlpaug.augmenter.word as naw
6 |
7 |
class TestAntonym(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Load environment variables (e.g. TEST_DIR) from the repo-root .env file.
        root_env = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(root_env)

        cls.augs = [
            naw.AntonymAug()
        ]

    def test_substitute(self):
        # Antonym substitution must alter text that contains replaceable words.
        inputs = [
            'Good bad'
        ]

        for augmenter in self.augs:
            for original in inputs:
                result = augmenter.augment(original)
                self.assertNotEqual(original, result)

    def test_skip_punctuation(self):
        # Pure punctuation has no antonyms, so the input passes through untouched.
        original = '. . . . ! ? # @'

        for augmenter in self.augs:
            self.assertEqual(original, augmenter.augment(original))
35 |
--------------------------------------------------------------------------------
/nlpaug/model/spectrogram/time_masking.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from nlpaug.model.spectrogram import Spectrogram
4 |
5 |
class TimeMasking(Spectrogram):
    """Mask a random block of consecutive time steps of a spectrogram."""

    def __init__(self, mask_factor):
        super(TimeMasking, self).__init__()

        # Upper bound (exclusive) on the width of the masked time block.
        self.mask_factor = mask_factor

    def mask(self, data):
        """
        From: https://arxiv.org/pdf/1904.08779.pdf,
        Time masking is applied so that t consecutive time steps
        [t0, t0 + t) are masked, where t is first chosen from a
        uniform distribution from 0 to the time mask parameter
        T, and t0 is chosen from [0, tau - t).
        :return:
        """

        # Guard: np.random.randint raises ValueError on an empty range, and
        # data.shape[1] raises IndexError for non-2-D input. Return an
        # untouched copy when there is nothing that can be masked.
        if data.ndim < 2 or data.shape[1] == 0:
            return data.copy()

        time_range = data.shape[1]
        # Clamp so that t < time_range, keeping [0, time_range - t) non-empty.
        # (Previously t could reach mask_factor - 1 >= time_range and crash.)
        self.t = np.random.randint(min(self.mask_factor, time_range))
        self.t0 = np.random.randint(time_range - self.t)

        augmented_mel_spectrogram = data.copy()
        augmented_mel_spectrogram[:, self.t0:self.t0 + self.t] = 0
        return augmented_mel_spectrogram
29 |
--------------------------------------------------------------------------------
/test/model/word/test_word_embs_model.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | import nlpaug.model.word_embs as nmw
6 |
7 |
class TestWordEmbsModel(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Pull TEST_DIR and friends from the repository-level .env file.
        dotenv_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_path)

    def test_bogus_fasttext_loading(self):
        vec_file = os.path.join(os.environ.get("TEST_DIR"), 'res', 'text', 'bogus_fasttext.vec')
        expected_vector = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]

        model = nmw.Fasttext()
        model.read(vec_file)

        # Every embedding in the fixture file carries the same vector.
        for token in model.w2v:
            self.assertSequenceEqual(list(model.w2v[token]), expected_vector)

        self.assertSequenceEqual(["test1", "test2", "test_3", "test 4", "test -> 5"], model.get_vocab())

        self.assertEqual(len(model.normalized_vectors), 5)
28 |
--------------------------------------------------------------------------------
/docs/overview/overview.rst:
--------------------------------------------------------------------------------
1 | Overview
2 | ========
3 |
4 | This Python library helps you augment NLP data for your machine learning projects. Visit this introduction to learn about Data Augmentation in NLP. Augmenter is the basic element of augmentation, while Flow is a pipeline that orchestrates multiple augmenters together.
5 |
6 | - `Data Augmentation library for Text`_
7 | - `Data Augmentation library for Speech Recognition`_
8 | - `Data Augmentation library for Audio`_
9 | - `Does your NLP model able to prevent adversarial attack?`_
10 |
11 | .. _Data Augmentation library for Text: https://towardsdatascience.com/data-augmentation-library-for-text-9661736b13ff
12 | .. _Data Augmentation library for Speech Recognition: https://towardsdatascience.com/data-augmentation-for-speech-recognition-e7c607482e78
13 | .. _Data Augmentation library for Audio: https://towardsdatascience.com/data-augmentation-for-audio-76912b01fdf6
14 | .. _Does your NLP model able to prevent adversarial attack?: https://medium.com/hackernoon/does-your-nlp-model-able-to-prevent-adversarial-attack-45b5ab75129c
--------------------------------------------------------------------------------
/test/augmenter/audio/test_shift.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.audio as naa
7 | from nlpaug.util import AudioLoader
8 |
9 |
class TestShift(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_substitute(self):
        audio, sampling_rate = AudioLoader.load_audio(self.sample_wav_file)

        aug = naa.ShiftAug(sampling_rate, duration=0.5)
        augmented_audio = aug.augment(audio)

        self.assertFalse(np.array_equal(audio, augmented_audio))
        # Bug fix: was assertTrue(len(audio), len(augmented_audio)), which
        # treated the second argument as a failure message and never compared
        # the lengths at all. Shifting must preserve the sample count.
        self.assertEqual(len(audio), len(augmented_audio))
29 |
--------------------------------------------------------------------------------
/nlpaug/flow/sometimes.py:
--------------------------------------------------------------------------------
1 | """
2 | Flow that apply augmentation randomly.
3 | """
4 |
5 | from nlpaug.util import Action
6 | from nlpaug.flow import Pipeline
7 |
8 |
class Sometimes(Pipeline):
    """
    Flow that applies its augmenters randomly.

    :param list flow: list of flow or augmenter
    :param str name: Name of this augmenter

    >>> import nlpaug.flow as naf
    >>> import nlpaug.augmenter.char as nac
    >>> import nlpaug.augmenter.word as naw
    >>> flow = naf.Sometimes([nac.RandomCharAug(), naw.RandomWordAug()])
    """

    # TODO: deprecated pipeline_p, use aug_p
    def __init__(self, flow=None, name='Sometimes_Pipeline', pipeline_p=0.2, aug_p=1, include_detail=False,
                 verbose=0):
        super().__init__(name=name, action=Action.SOMETIMES,
                         flow=flow, aug_p=aug_p, include_detail=include_detail, verbose=verbose)

        # Probability that the pipeline executes on a given call.
        self.pipeline_p = pipeline_p

    def draw(self):
        # Execute only when the random draw falls below pipeline_p.
        return self.pipeline_p > self.prob()
32 |
--------------------------------------------------------------------------------
/nlpaug/model/spectrogram/frequency_masking.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from nlpaug.model.spectrogram import Spectrogram
4 |
5 |
class FrequencyMasking(Spectrogram):
    """Mask a random band of consecutive mel-frequency channels of a spectrogram."""

    def __init__(self, mask_factor):
        super(FrequencyMasking, self).__init__()

        # Upper bound (exclusive) on the number of masked frequency channels.
        self.mask_factor = mask_factor

    def mask(self, data):
        """
        From: https://arxiv.org/pdf/1904.08779.pdf,
        Frequency masking is applied so that f consecutive mel
        frequency channels [f0, f0 + f) are masked, where f is
        first chosen from a uniform distribution from 0 to the
        frequency mask parameter F, and f0 is chosen from
        [0, v - f). v is the number of mel frequency channels.
        :return:
        """
        v = data.shape[0]
        # Guard: np.random.randint raises ValueError on an empty range, so
        # return an untouched copy when there is nothing to mask.
        if v == 0:
            return data.copy()

        # Clamp so that f < v, keeping [0, v - f) non-empty.
        # (Previously f could reach mask_factor - 1 >= v and crash.)
        self.f = np.random.randint(min(self.mask_factor, v))
        self.f0 = np.random.randint(v - self.f)

        augmented_mel_spectrogram = data.copy()
        augmented_mel_spectrogram[self.f0:self.f0 + self.f, :] = 0
        return augmented_mel_spectrogram
29 |
--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
1 | MIT License
2 |
3 | Copyright (c) 2019 Edward Ma
4 |
5 | Permission is hereby granted, free of charge, to any person obtaining a copy
6 | of this software and associated documentation files (the "Software"), to deal
7 | in the Software without restriction, including without limitation the rights
8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 |
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 |
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
--------------------------------------------------------------------------------
/test/augmenter/audio/test_audio.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | import nlpaug.augmenter.audio as naa
6 | from nlpaug.util import AudioLoader
7 |
8 |
class TestAudio(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Resolve the repo-root .env so TEST_DIR is available below.
        dotenv_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_file)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_multi_thread(self):
        data, sampling_rate = AudioLoader.load_audio(self.sample_wav_file)
        expected_n = 3
        augmenters = [
            naa.CropAug(sampling_rate=sampling_rate),
            naa.PitchAug(sampling_rate=sampling_rate)
        ]

        # Single- and multi-threaded runs must both return exactly n results.
        for num_thread in (1, 3):
            for augmenter in augmenters:
                outputs = augmenter.augment(data, n=expected_n, num_thread=num_thread)
                self.assertEqual(expected_n, len(outputs))
32 |
--------------------------------------------------------------------------------
/nlpaug/model/char/ocr.py:
--------------------------------------------------------------------------------
1 | from nlpaug.model.char import Character
2 |
3 |
class Ocr(Character):
    """Model that maps a character to the characters OCR commonly confuses it with."""

    def __init__(self, cache=True):
        super().__init__(cache)

        self.model = self.get_model()

    def predict(self, data):
        # Return the list of characters confusable with `data`.
        return self.model[data]

    # TODO: Read from file
    @classmethod
    def get_model(cls):
        mapping = {
            '0': ['8', '9', 'o', 'O', 'D'],
            '1': ['4', '7', 'l', 'I'],
            '2': ['z', 'Z'],
            '5': ['8'],
            '6': ['b'],
            '8': ['s', 'S', '@', '&'],
            '9': ['g'],
            'o': ['u'],
            'r': ['k'],
            'C': ['G'],
            'O': ['D', 'U'],
            'E': ['B']
        }

        # Copy each list so that building the reverse entries below cannot
        # mutate `mapping` through shared references (the original aliased
        # the lists and appended to `mapping` while iterating it).
        result = {k: list(v) for k, v in mapping.items()}

        # Make the confusion relation symmetric: if k can be misread as v,
        # record that v can also be misread as k.
        for k, confusions in mapping.items():
            for v in confusions:
                if v not in result:
                    result[v] = []

                if k not in result[v]:
                    result[v].append(k)

        return result
45 |
--------------------------------------------------------------------------------
/nlpaug/util/doc/token.py:
--------------------------------------------------------------------------------
class Token:
    """Record of a token and how it was changed during augmentation."""

    def __init__(self, token, start_pos=-1, action='', change_seq=0):
        self._token = token              # surface form of the token
        self._start_pos = start_pos      # character offset in the text; -1 if unknown
        self._action = action            # augmentation action applied
        self._change_seq = change_seq    # order of this change within the augmentation

    @property
    def token(self):
        return self._token

    @token.setter
    def token(self, v):
        self._token = v

    @property
    def start_pos(self):
        return self._start_pos

    @start_pos.setter
    def start_pos(self, v):
        self._start_pos = v

    @property
    def action(self):
        return self._action

    @action.setter
    def action(self, v):
        self._action = v

    @property
    def change_seq(self):
        return self._change_seq

    @change_seq.setter
    def change_seq(self, v):
        self._change_seq = v

    def to_dict(self):
        """Serialize to a plain dict (keys: token, action, start_pos, change_seq)."""
        return dict(
            token=self.token,
            action=self.action,
            start_pos=self.start_pos,
            change_seq=self.change_seq,
        )
47 |
--------------------------------------------------------------------------------
/test/augmenter/spectrogram/test_spectrogram.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | from nlpaug.util import AudioLoader
6 | import nlpaug.augmenter.spectrogram as nas
7 |
8 |
class TestFrequencyMasking(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        dotenv_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_file)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_multi_thread(self):
        spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128)
        expected_n = 3
        augmenters = [
            nas.FrequencyMaskingAug(mask_factor=80),
            nas.TimeMaskingAug(mask_factor=80)
        ]

        # Both augmenters must honour n regardless of thread count.
        for num_thread in (1, 3):
            for augmenter in augmenters:
                outputs = augmenter.augment(spectrogram, n=expected_n, num_thread=num_thread)
                self.assertEqual(expected_n, len(outputs))
32 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/sentence/sentence_augmenter.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util import Method
2 | from nlpaug import Augmenter
3 |
4 |
class SentenceAugmenter(Augmenter):
    """Base class for sentence-level augmenters."""

    # Characters treated as sentence terminators.
    SENTENCE_SEPARATOR = '.!?'

    def __init__(self, action, name='Sentence_Aug', stopwords=None, tokenizer=None, reverse_tokenizer=None,
                 device='cuda', include_detail=False, verbose=0):
        super().__init__(
            name=name, method=Method.SENTENCE, action=action, aug_min=None, aug_max=None, device=device,
            verbose=verbose, include_detail=include_detail)
        # Fall back to plain whitespace (de)tokenization when none is supplied.
        self.tokenizer = tokenizer or self._tokenizer
        self.reverse_tokenizer = reverse_tokenizer or self._reverse_tokenizer
        self.stopwords = stopwords

    @classmethod
    def _tokenizer(cls, text):
        return text.split(' ')

    @classmethod
    def _reverse_tokenizer(cls, tokens):
        return ' '.join(tokens)

    @classmethod
    def clean(cls, data):
        # Trim surrounding whitespace only.
        return data.strip()

    @classmethod
    def is_duplicate(cls, dataset, data):
        # True when `data` already appears in `dataset`.
        return any(d == data for d in dataset)
35 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/spectrogram/time_masking.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply time based masking to spectrogram input.
3 | """
4 |
5 | from nlpaug.augmenter.spectrogram import SpectrogramAugmenter
6 | from nlpaug.util import Action
7 | import nlpaug.model.spectrogram as nms
8 |
9 |
class TimeMaskingAug(SpectrogramAugmenter):
    # https://arxiv.org/pdf/1904.08779.pdf
    """
    Augmenter that mask spectrogram based on time by random values.

    :param int mask_factor: Value between 0 and mask_factor will be picked randomly.
        Mask range will be between [0, tau - mask_factor) while tau is time range of input.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.spectrogram as nas
    >>> aug = nas.TimeMaskingAug(mask_factor=80)
    """

    def __init__(self, mask_factor, name='TimeMasking_Aug', verbose=0):
        super(TimeMaskingAug, self).__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        self.model = self.get_model(mask_factor)

    def substitute(self, data):
        # Delegate the actual masking to the underlying TimeMasking model.
        return self.model.mask(data)

    @classmethod
    def get_model(cls, mask_factor):
        # Build the time-masking model used by substitute().
        return nms.TimeMasking(mask_factor)
35 |
--------------------------------------------------------------------------------
/test/augmenter/spectrogram/test_time_masking.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 | import numpy as np
5 |
6 | from nlpaug.util import AudioLoader
7 | from nlpaug.augmenter.spectrogram import TimeMaskingAug
8 |
9 |
class TestTimeMasking(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        dotenv_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_file)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.num_of_freq_channel = 128

    def test_substitute(self):
        mask_factor = 80

        spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=self.num_of_freq_channel)
        aug = TimeMaskingAug(mask_factor=mask_factor)

        augmented = aug.augment(spectrogram)

        # The original column at t0 keeps its energy; the augmented one is zeroed.
        self.assertEqual(len(spectrogram[:, aug.model.t0]), np.count_nonzero(spectrogram[:, aug.model.t0]))
        self.assertEqual(0, np.count_nonzero(augmented[:, aug.model.t0]))
32 |
--------------------------------------------------------------------------------
/nlpaug/util/text/part_of_speech.py:
--------------------------------------------------------------------------------
class PartOfSpeech:
    """Mappings between WordNet/PPDB part-of-speech tags and Penn Treebank constituents."""

    NOUN = 'noun'
    VERB = 'verb'
    ADJECTIVE = 'adjective'
    ADVERB = 'adverb'

    # POS tag -> constituent labels.
    pos2con = {
        'n': [
            'NN', 'NNS', 'NNP', 'NNPS',  # from WordNet
            'NP'  # from PPDB
        ],
        'v': [
            'VB', 'VBD', 'VBG', 'VBN', 'VBZ',  # from WordNet
            'VBP'  # from PPDB
        ],
        'a': ['JJ', 'JJR', 'JJS', 'IN'],
        's': ['JJ', 'JJR', 'JJS', 'IN'],  # Adjective Satellite
        'r': ['RB', 'RBR', 'RBS'],  # Adverb
    }

    # Inverted mapping (constituent -> POS tags) and the flat constituent list,
    # derived once at class-creation time.
    con2pos = {}
    poses = []
    for key, values in pos2con.items():
        poses.extend(values)
        for value in values:
            if value not in con2pos:
                con2pos[value] = []
            con2pos[value].append(key)
    # Bug fix: class-body loop variables would otherwise linger as class
    # attributes (PartOfSpeech.key / .values / .value).
    del key, values, value

    @staticmethod
    def pos2constituent(pos):
        """Return the constituent labels for `pos`, or [] when unknown."""
        if pos in PartOfSpeech.pos2con:
            return PartOfSpeech.pos2con[pos]
        return []

    @staticmethod
    def constituent2pos(con):
        """Return the POS tags for constituent `con`, or [] when unknown."""
        if con in PartOfSpeech.con2pos:
            return PartOfSpeech.con2pos[con]
        return []

    @staticmethod
    def get_pos():
        """Return every known constituent label."""
        return PartOfSpeech.poses
45 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/spectrogram/frequency_masking.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply frequency based masking to spectrogram input.
3 | """
4 |
5 | from nlpaug.augmenter.spectrogram import SpectrogramAugmenter
6 | from nlpaug.util import Action
7 | import nlpaug.model.spectrogram as nms
8 |
9 |
class FrequencyMaskingAug(SpectrogramAugmenter):
    # https://arxiv.org/pdf/1904.08779.pdf
    """
    Augmenter that mask spectrogram based on frequency by random values.

    :param int mask_factor: Value between 0 and mask_factor will be picked randomly.
        Mask range will be between [0, v - mask_factor) while v is the number of mel frequency channels.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.spectrogram as nas
    >>> aug = nas.FrequencyMaskingAug(mask_factor=80)
    """

    def __init__(self, mask_factor, name='FrequencyMasking_Aug', verbose=0):
        super(FrequencyMaskingAug, self).__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        self.model = self.get_model(mask_factor)

    def substitute(self, data):
        # Delegate the actual masking to the underlying FrequencyMasking model.
        return self.model.mask(data)

    @classmethod
    def get_model(cls, mask_factor):
        # Build the frequency-masking model used by substitute().
        return nms.FrequencyMasking(mask_factor)
35 |
--------------------------------------------------------------------------------
/codecov.yml:
--------------------------------------------------------------------------------
1 | #see https://github.com/codecov/support/wiki/Codecov-Yaml
2 | codecov:
3 | notify:
4 | require_ci_to_pass: yes
5 |
6 | coverage:
7 | precision: 2 # 2 = xx.xx%, 0 = xx%
8 | round: nearest # how coverage is rounded: down/up/nearest
9 | range: 10...90 # custom range of coverage colors from red -> yellow -> green
10 | status:
11 | # https://codecov.readme.io/v1.0/docs/commit-status
12 | project:
13 | default:
14 | against: auto
15 | target: 40% # specify the target coverage for each commit status
16 | threshold: 20% # allow this little decrease on project
17 | # https://github.com/codecov/support/wiki/Filtering-Branches
18 | # branches: master
19 | if_ci_failed: error
20 | # https://github.com/codecov/support/wiki/Patch-Status
21 | patch:
22 | default:
23 | against: parent
24 | target: 30% # specify the target "X%" coverage to hit
25 | # threshold: 50% # allow this much decrease on patch
26 | changes: false
27 |
28 | parsers:
29 | gcov:
30 | branch_detection:
31 | conditional: true
32 | loop: true
33 | macro: false
34 | method: false
35 | javascript:
36 | enable_partials: false
37 |
38 | comment:
39 | layout: header, diff
40 | require_changes: false
41 | behavior: default # update if exists else create new
42 |   branches: "*"  # bare `*` is invalid YAML (parsed as an unnamed alias); quote it
--------------------------------------------------------------------------------
/test/augmenter/char/test_ocr.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | from nlpaug.augmenter.char import OcrAug
4 |
5 |
class TestOcr(unittest.TestCase):
    def test_ocr_single_word(self):
        words = ['Zoology', 'roku123456']
        aug = OcrAug()
        for word in words:
            # Every word here contains at least one OCR-confusable character.
            self.assertNotEqual(word, aug.augment(word))

        self.assertTrue(len(words) > 0)

    def test_ocr_single_word_nonexist_char(self):
        words = ['AAAAA', 'KKKKK']
        aug = OcrAug()
        for word in words:
            # No character has an OCR mapping, so the input is returned unchanged.
            self.assertEqual(word, aug.augment(word))

        self.assertTrue(len(words) > 0)

    def test_ocr_multi_words(self):
        sentences = ['The quick brown fox jumps over the lazy dog']
        aug = OcrAug()

        for sentence in sentences:
            # A word without any mapped character may be drawn, so retry a few times.
            changed = False
            for _ in range(10):
                if sentence != aug.augment(sentence):
                    changed = True
                    break

            self.assertTrue(changed)

        self.assertTrue(len(sentences) > 0)
42 |
--------------------------------------------------------------------------------
/test/augmenter/spectrogram/test_frequency_masking.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | from nlpaug.util import AudioLoader
7 | from nlpaug.augmenter.spectrogram import FrequencyMaskingAug
8 |
9 |
class TestFrequencyMasking(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        dotenv_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_file)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_empty_input(self):
        empty = np.array([])
        aug = FrequencyMaskingAug(mask_factor=80)
        # An empty spectrogram must pass through unchanged.
        self.assertTrue(np.array_equal(np.array([]), aug.augment(empty)))

    def test_substitute(self):
        mask_factor = 80

        spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128)
        aug = FrequencyMaskingAug(mask_factor=mask_factor)

        augmented = aug.augment(spectrogram)

        # Row f0 was fully populated before masking and fully zeroed after.
        self.assertEqual(len(spectrogram[aug.model.f0]), np.count_nonzero(spectrogram[aug.model.f0]))
        self.assertEqual(0, np.count_nonzero(augmented[aug.model.f0]))
38 |
--------------------------------------------------------------------------------
/test/augmenter/sentence/test_sentence.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | import nlpaug.augmenter.sentence as nas
6 | from nlpaug.util import Action, Doc
7 |
8 |
class TestSentence(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        dotenv_file = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_file)

        cls.model_paths = [
            'xlnet-base-cased',
            'gpt2',
            'distilgpt2'
        ]

        cls.text = 'The quick brown fox jumps over the lazy dog.'

    def test_augment_detail(self):
        for model_path in self.model_paths:
            aug = nas.ContextualWordEmbsForSentenceAug(model_path=model_path, include_detail=True)

            augmented_text, details = aug.augment(self.text)

            self.assertNotEqual(self.text, augmented_text)
            self.assertGreater(len(details), 0)
            for detail in details:
                # Generated tokens carry no original position but a valid new one.
                self.assertIn(detail['orig_token'], self.text)
                self.assertEqual(-1, detail['orig_start_pos'])
                self.assertGreater(detail['new_start_pos'], -1)
                self.assertGreater(detail['change_seq'], 0)
                self.assertIn(detail['action'], Action.getall())
40 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/shift.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply shifting operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action, WarningMessage
8 |
9 |
class ShiftAug(AudioAugmenter):
    """
    :param int sampling_rate: Sampling rate of input audio.
    :param float duration: Max shifting segment (in second)
    :param str direction: Shifting segment to left, right or one of them. Value can be 'left', 'right' or 'random'
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.ShiftAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, duration=3, direction='random',
                 shift_max=3, shift_direction='both',
                 name='Shift_Aug', verbose=0):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        # Deprecated arguments override the new-style ones whenever a caller
        # passes a non-default value; each use emits a deprecation notice.
        if shift_direction != 'both':
            print(WarningMessage.DEPRECATED.format('shift_direction', '0.0.12', 'direction'))
            direction = shift_direction
        if shift_max != 3:
            print(WarningMessage.DEPRECATED.format('shift_max', '0.0.12', 'duration'))
            duration = shift_max

        self.model = self.get_model(sampling_rate, duration, direction)

    @classmethod
    def get_model(cls, sampling_rate, duration, direction):
        # Factory hook kept separate so subclasses/tests can substitute a model.
        model = nma.Shift(sampling_rate, duration, direction)
        return model
39 |
--------------------------------------------------------------------------------
/nlpaug/model/word_dict/wordnet.py:
--------------------------------------------------------------------------------
1 | try:
2 | import nltk
3 | from nltk.corpus import wordnet
4 | except ImportError:
5 | # No installation required if not using this function
6 | pass
7 |
8 | from nlpaug.model.word_dict import WordDictionary
9 |
10 |
class WordNet(WordDictionary):
    """Dictionary model backed by NLTK's WordNet corpus.

    :param str lang: WordNet language code (e.g. 'eng').
    :param bool is_synonym: If True, `predict` returns synonyms; otherwise antonyms.
    """

    def __init__(self, lang, is_synonym=True):
        super().__init__(cache=True)

        self.lang = lang
        self.is_synonym = is_synonym

        try:
            wordnet
        except NameError:
            raise ImportError('Missed nltk library. Install it via `pip install nltk`')
        try:
            # Check whether wordnet package is downloaded
            wordnet.synsets('computer')
            # Check whether POS package is downloaded.
            # pos_tag expects a list of tokens; passing a raw string would
            # tag each individual character instead of the word.
            nltk.pos_tag(['computer'])
        except LookupError:
            # NLTK raises LookupError (not ImportError) when a corpus or
            # tagger resource has not been downloaded yet.
            nltk.download('wordnet')
            nltk.download('averaged_perceptron_tagger')

        self.model = self.read()

    def read(self):
        # The WordNet corpus reader itself acts as the underlying "model".
        return wordnet

    def predict(self, word, pos=None):
        """Return synonyms (or antonyms when is_synonym=False) of `word`,
        optionally filtered by WordNet POS tag."""
        results = []
        for synonym in self.model.synsets(word, pos=pos, lang=self.lang):
            for lemma in synonym.lemmas(lang=self.lang):
                if self.is_synonym:
                    results.append(lemma.name())
                else:
                    for antonym in lemma.antonyms():
                        results.append(antonym.name())
        return results

    @classmethod
    def pos_tag(cls, tokens):
        # `tokens` must already be a list of strings.
        return nltk.pos_tag(tokens)
50 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/shift.py:
--------------------------------------------------------------------------------
1 | # Reference: https://www.kaggle.com/CVxTz/audio-data-augmentation
2 | import numpy as np
3 |
4 | from nlpaug.model.audio import Audio
5 |
6 |
class Shift(Audio):
    def __init__(self, sampling_rate, duration=3,
                 direction='random', stateless=True):
        """
        :param int sampling_rate: Sampling rate of input audio.
        :param float duration: Max shifting segment (in second)
        :param str direction: Shifting segment to left, right or one of them. Value can be 'left', 'right' or 'random'
        """

        super().__init__(duration=duration, sampling_rate=sampling_rate, stateless=stateless)
        # TODO: remove `both` after 0.0.12
        if direction in ['left', 'right', 'random', 'both']:
            self.direction = direction
        else:
            # Error message now lists the actually-accepted value 'random'
            # (previously it advertised the deprecated 'both').
            raise ValueError(
                'shift_direction should be either left, right or random while {} is passed.'.format(direction))

    def manipulate(self, data):
        """Roll `data` by `duration` seconds and silence the wrapped-around part."""
        aug_shift = int(self.sampling_rate * self.duration)
        if self.direction == 'right':
            aug_shift = -aug_shift
        elif self.direction in ('random', 'both'):
            # Deprecated 'both' keeps its historical meaning: shift left or
            # right with equal probability (previously it silently fell
            # through to an always-left shift).
            if np.random.randint(0, 2) == 1:
                aug_shift = -aug_shift

        augmented_data = np.roll(data, aug_shift)

        # Set to silence for heading/ tailing. A zero shift must be a no-op:
        # the old `else` branch executed `augmented_data[0:] = 0`, silencing
        # the whole signal when int(sampling_rate * duration) rounded to 0.
        if aug_shift > 0:
            augmented_data[:aug_shift] = 0
        elif aug_shift < 0:
            augmented_data[aug_shift:] = 0
        return augmented_data
41 |
--------------------------------------------------------------------------------
/nlpaug/model/spectrogram/time_warping.py:
--------------------------------------------------------------------------------
1 | # import numpy as np
2 | #
3 | # from nlpaug.model import Spectrogram
4 | #
5 | #
6 | # class TimeWarping(Spectrogram):
7 | # def __init__(self, time_warp):
8 | # super(TimeWarping, self).__init__()
9 | #
10 | # self.time_warp = time_warp
11 | #
12 | # # TODO
13 | # def mask(self, mel_spectrogram):
14 | # """
15 | # From: https://arxiv.org/pdf/1904.08779.pdf,
16 | # Time warping is applied via the function
17 | # sparse image warp of tensorflow. Given
18 | # a log mel spectrogram with t time steps, we view it
19 | # as an image where the time axis is horizontal and the
20 | # frequency axis is vertical. A random point along the
21 | # horizontal line passing through the center of the image
22 | # within the time steps (W, t - W) is to be warped
23 | # either to the left or right by a distance w chosen from a
24 | # uniform distribution from 0 to the time warp parameter
25 | # W along that line.
26 | # :return:
27 | # """
28 | #
29 | # time_range = mel_spectrogram.shape[1]
30 | # self.w = np.random.randint(self.time_warp)
31 | #
32 | # center_point = np.random.randint(self.time_warp, time_range-self.time_warp)
33 | # distance = np.random.randint(-self.w, self.w)
34 | #
35 | # # self.w0 = np.random.randint(time_range - self.t)
36 | # #
37 | # # augmented_mel_spectrogram = mel_spectrogram.copy()
38 | # # augmented_mel_spectrogram[:, self.time_warp:self.time_range-self.time_warp] = 0
39 | # # return augmented_mel_spectrogram
40 | # return mel_spectrogram
41 |
--------------------------------------------------------------------------------
/nlpaug/model/word_embs/fasttext.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | from nlpaug.model.word_embs import WordEmbeddings
3 |
4 |
class Fasttext(WordEmbeddings):
    # https://arxiv.org/pdf/1712.09405.pdf,
    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

    def read(self, file_path, max_num_vector=None):
        """Load embeddings from a fastText .vec text file.

        :param str file_path: Path to the .vec file. The first line is a
            header of "<vocab_size> <emb_size>".
        :param max_num_vector: Currently unused; kept for interface
            compatibility with the other embedding readers.
        """
        vectors = []
        with open(file_path, 'r', encoding='utf-8') as f:
            header = f.readline()
            self.vocab_size, self.emb_size = map(int, header.split())

            for line in f:
                tokens = line.rstrip().split(' ')
                # The last `emb_size` tokens form the vector; everything in
                # front of them is the word. Slicing by token position is
                # robust even when the word's text coincides with vector
                # text (the previous `line.find(' '.join(values))` approach
                # could match the wrong offset in that case).
                word = ' '.join(tokens[:-self.emb_size])
                values = np.array([float(val) for val in tokens[-self.emb_size:]])

                vectors.append(values)
                self.i2w[len(self.i2w)] = word
                self.w2i[word] = len(self.w2i)
                self.w2v[word] = values

        vectors = np.asarray(vectors)
        if not self.skip_check:
            # All lookup tables must stay in sync with the vector matrix.
            if len(vectors) != len(self.i2w):
                raise AssertionError('Vector Size:{}, Index2Word Size:{}'.format(len(vectors), len(self.i2w)))
            if len(self.i2w) != len(self.w2i):
                raise AssertionError('Index2Word Size:{}, Word2Index Size:{}'.format(len(self.i2w), len(self.w2i)))
            if len(self.w2i) != len(self.w2v):
                raise AssertionError('Word2Index Size:{}, Word2Vector Size:{}'.format(len(self.w2i), len(self.w2v)))

        self.normalized_vectors = self._normalize(vectors)
38 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/crop.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from nlpaug.model.audio import Audio
4 |
5 |
class Crop(Audio):
    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=0.1, duration=None, stateless=True):
        """
        :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
        :param tuple zone: Zone eligible for augmentation. The default (0.2, 0.8) leaves the
            first 20% and last 20% of the audio untouched.
        :param float coverage: Portion (0..1) of the zone to augment. E.g. for a 60-second
            clip with zone (0.2, 0.8) and coverage 0.7, (0.8-0.2)*0.7*60 = 42 seconds are
            affected.
        :param int duration: Length of the augmented segment in seconds. When provided,
            `coverage` is ignored.
        """
        super().__init__(zone=zone, coverage=coverage, duration=duration, sampling_rate=sampling_rate,
                         stateless=stateless)

    def manipulate(self, data):
        """Delete a segment of `data` chosen by duration (if set) or coverage."""
        # An explicit duration takes precedence over coverage.
        if self.duration is not None:
            start_pos, end_pos = self.get_augment_range_by_duration(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)

        if not self.stateless:
            self.start_pos, self.end_pos = start_pos, end_pos

        # Crop: the selected segment is removed entirely.
        return np.delete(data, np.s_[start_pos:end_pos])
34 | return augmented_data
35 |
--------------------------------------------------------------------------------
/nlpaug/model/word_dict/spelling.py:
--------------------------------------------------------------------------------
1 | """
2 | Source data:
3 | English Neutral Rewriting: https://github.com/ybisk/charNMT-noise/blob/master/noise/en.natural
4 | """
5 | from nlpaug.model.word_dict import WordDictionary
6 |
7 |
class Spelling(WordDictionary):
    """Dictionary of common misspellings.

    :param str dict_path: Path to a space-separated mapping file; the first token
        of each line is the correct word, the rest are known misspellings.
    :param bool include_reverse: Also map each misspelling back to its key.
    :param bool cache: Passed through to WordDictionary.
    """

    def __init__(self, dict_path, include_reverse=True, cache=True):
        super().__init__(cache)

        self.dict_path = dict_path
        self.include_reverse = include_reverse

        self._init()

    def _init(self):
        self.dict = {}
        self.read(self.dict_path)

    def read(self, model_path):
        with open(model_path, 'r', encoding="utf-8") as f:
            # Stream lines lazily instead of materializing readlines().
            for line in f:
                tokens = line.split(' ')
                # The last token carries the newline separator
                tokens[-1] = tokens[-1].replace('\n', '')

                key, values = tokens[0], tokens[1:]

                self.dict.setdefault(key, [])
                self.dict[key].extend(values)
                # Remove duplicate mapping
                self.dict[key] = list(set(self.dict[key]))
                # Build reverse mapping
                if self.include_reverse:
                    for value in values:
                        self.dict.setdefault(value, [])
                        if key not in self.dict[value]:
                            self.dict[value].append(key)

    def predict(self, data):
        # Unknown words yield None rather than raising KeyError.
        return self.dict.get(data)
50 |
--------------------------------------------------------------------------------
/nlpaug/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 |
28 | # PyInstaller
29 | # Usually these files are written by a python script from a template
30 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
31 | *.manifest
32 | *.spec
33 |
34 | # Installer logs
35 | pip-log.txt
36 | pip-delete-this-directory.txt
37 |
38 | # Unit test / coverage reports
39 | htmlcov/
40 | .tox/
41 | .coverage
42 | .coverage.*
43 | .cache
44 | nosetests.xml
45 | coverage.xml
46 | *.cover
47 | .hypothesis/
48 | .pytest_cache/
49 |
50 | # Translations
51 | *.mo
52 | *.pot
53 |
54 | # Django stuff:
55 | *.log
56 | local_settings.py
57 | db.sqlite3
58 |
59 | # Flask stuff:
60 | instance/
61 | .webassets-cache
62 |
63 | # Scrapy stuff:
64 | .scrapy
65 |
66 | # Sphinx documentation
67 | docs/_build/
68 |
69 | # PyBuilder
70 | target/
71 |
72 | # Jupyter Notebook
73 | .ipynb_checkpoints
74 |
75 | # pyenv
76 | .python-version
77 |
78 | # celery beat schedule file
79 | celerybeat-schedule
80 |
81 | # SageMath parsed files
82 | *.sage.py
83 |
84 | # Environments
85 | .env
86 | .venv
87 | env/
88 | venv/
89 | ENV/
90 | env.bak/
91 | venv.bak/
92 |
93 | # Spyder project settings
94 | .spyderproject
95 | .spyproject
96 |
97 | # Rope project settings
98 | .ropeproject
99 |
100 | # mkdocs documentation
101 | /site
102 |
103 | # mypy
104 | .mypy_cache/
--------------------------------------------------------------------------------
/nlpaug/model/audio/loudness.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from nlpaug.model.audio import Audio
4 |
5 |
class Loudness(Audio):
    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), stateless=True):
        """
        :param tuple zone: Zone eligible for augmentation. The default (0.2, 0.8)
            leaves the first 20% and last 20% of the audio untouched.
        :param float coverage: Portion (0..1) of the zone to augment. E.g. for a
            60-second clip with zone (0.2, 0.8) and coverage 0.7,
            (0.8-0.2)*0.7*60 = 42 seconds are affected.
        :param factor: Gain range the augmented value is drawn from; values
            between 0 and 1 reduce the volume.
        """
        super().__init__(zone=zone, coverage=coverage, factor=factor, stateless=stateless)

    def get_loudness_level(self):
        # Draw a gain uniformly from the configured factor range.
        low, high = self.factor
        return np.random.uniform(low, high)

    def manipulate(self, data):
        """Scale the selected segment of `data` by a random gain."""
        gain = self.get_loudness_level()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)
        scaled_segment = data[start_pos:end_pos] * gain

        if not self.stateless:
            self.aug_factor = gain
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.aug_data = scaled_segment

        return np.concatenate((data[:start_pos], scaled_segment, data[end_pos:]), axis=0)
36 |
--------------------------------------------------------------------------------
/nlpaug/util/audio/visualizer.py:
--------------------------------------------------------------------------------
1 | try:
2 | import librosa
3 | import librosa.display
4 | except ImportError:
5 | # No installation required if not using this function
6 | pass
7 | import matplotlib.pyplot as plt
8 | import numpy as np
9 |
10 |
class AudioVisualizer:
    """Matplotlib/librosa helpers for visualizing audio signals."""

    @staticmethod
    def wave(title, audio, sample_rate):
        """Plot the time-domain waveform of `audio`."""
        plt.figure(figsize=(8, 4))
        librosa.display.waveplot(audio, sr=sample_rate)
        plt.title(title)
        plt.tight_layout()
        plt.show()

    @staticmethod
    def freq_power(title, audio, sample_rate, aug_audio=None):
        """Plot the power spectrum (dB) of `audio`, optionally overlaying `aug_audio`."""
        audio_fft = np.fft.rfft(audio)
        audio_fft /= len(audio_fft)

        # rfftfreq gives the correct bin frequencies (up to Nyquist). The old
        # formula divided by len(rfft) instead of the signal length, which
        # stretched the axis to ~2x Nyquist.
        freq_bins = np.fft.rfftfreq(len(audio), d=1.0 / sample_rate)
        # Take the magnitude before log10: rfft output is complex, and log10
        # of complex/negative values yields complex or NaN results.
        plt.plot(freq_bins / 1000, 10 * np.log10(np.abs(audio_fft)), color='#FF0000', linewidth=0.02)

        if aug_audio is not None:
            aug_audio_fft = np.fft.rfft(aug_audio)
            aug_audio_fft /= len(aug_audio_fft)

            aug_freq_bins = np.fft.rfftfreq(len(aug_audio), d=1.0 / sample_rate)
            plt.plot(aug_freq_bins / 1000, 10 * np.log10(np.abs(aug_audio_fft)), color='#000000', linewidth=0.02)

        plt.title(title)
        plt.xlabel('Frequency (k Hz)')
        plt.ylabel('Power (dB)')
        plt.tight_layout()
        plt.show()

    @staticmethod
    def spectrogram(title, spectrogram):
        """Plot a mel spectrogram on a dB color scale."""
        plt.figure(figsize=(8, 4))
        librosa.display.specshow(
            librosa.power_to_db(spectrogram, ref=np.max), y_axis='mel', fmax=8000, x_axis='time')
        plt.colorbar(format='%+10.0f dB')
        plt.title(title)
        plt.tight_layout()
        plt.show()
50 |
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | # Byte-compiled / optimized / DLL files
2 | __pycache__/
3 | *.py[cod]
4 | *$py.class
5 |
6 | # C extensions
7 | *.so
8 |
9 | # Distribution / packaging
10 | .Python
11 | build/
12 | develop-eggs/
13 | dist/
14 | downloads/
15 | eggs/
16 | .eggs/
17 | lib/
18 | lib64/
19 | parts/
20 | sdist/
21 | var/
22 | wheels/
23 | *.egg-info/
24 | .installed.cfg
25 | *.egg
26 | MANIFEST
27 | *.zip
28 | .DS_Store
29 |
30 | # PyInstaller
31 | # Usually these files are written by a python script from a template
32 | # before PyInstaller builds the exe, so as to inject date/other infos into it.
33 | *.manifest
34 | *.spec
35 |
36 | # Installer logs
37 | pip-log.txt
38 | pip-delete-this-directory.txt
39 |
40 | # Unit test / coverage reports
41 | htmlcov/
42 | .tox/
43 | .coverage
44 | .coverage.*
45 | .cache
46 | nosetests.xml
47 | coverage.xml
48 | *.cover
49 | .hypothesis/
50 | .pytest_cache/
51 |
52 | # Translations
53 | *.mo
54 | *.pot
55 |
56 | # Django stuff:
57 | *.log
58 | local_settings.py
59 | db.sqlite3
60 |
61 | # Flask stuff:
62 | instance/
63 | .webassets-cache
64 |
65 | # Scrapy stuff:
66 | .scrapy
67 |
68 | # Sphinx documentation
69 | docs/_build/
70 |
71 | # PyBuilder
72 | target/
73 |
74 | # Jupyter Notebook
75 | .ipynb_checkpoints
76 |
77 | # pyenv
78 | .python-version
79 |
80 | # celery beat schedule file
81 | celerybeat-schedule
82 |
83 | # SageMath parsed files
84 | *.sage.py
85 |
86 | # Environments
87 | .env
88 | .venv
89 | env/
90 | venv/
91 | ENV/
92 | env.bak/
93 | venv.bak/
94 |
95 | # Spyder project settings
96 | .spyderproject
97 | .spyproject
98 |
99 | # Rope project settings
100 | .ropeproject
101 |
102 | # mkdocs documentation
103 | /site
104 |
105 | # mypy
106 | .mypy_cache/
107 |
108 | # IDE
109 | .idea/
110 |
111 | # model
112 | *.txt
113 | *.bin
114 | *.vec
--------------------------------------------------------------------------------
/model/char/keyboard/en.json:
--------------------------------------------------------------------------------
1 | {
2 | "1": ["!", "2", "@", "q", "w"],
3 | "2": ["@", "1", "!", "3", "#", "q", "w", "e"],
4 | "3": ["#", "2", "@", "4", "$", "w", "e"],
5 | "4": ["$", "3", "#", "5", "%", "e", "r"],
6 | "5": ["%", "4", "$", "6", "^", "r", "t", "y"],
7 | "6": ["^", "5", "%", "7", "&", "t", "y", "u"],
8 | "7": ["&", "6", "^", "8", "*", "y", "u", "i"],
9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"],
10 | "9": ["(", "8", "*", "0", ")", "i", "o", "p"],
11 | "q": ["1", "!", "2", "@", "w", "a", "s"],
12 | "w": ["1", "!", "2", "@", "3", "#", "q", "e", "a", "s", "d"],
13 | "e": ["2", "@", "3", "#", "4", "$", "w", "r", "s", "d", "f"],
14 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"],
15 | "t": ["4", "$", "5", "%", "6", "^", "r", "y", "f", "g", "h"],
16 | "y": ["5", "%", "6", "^", "7", "&", "t", "u", "g", "h", "j"],
17 | "u": ["6", "^", "7", "&", "8", "*", " t", "i", "h", "j", "k"],
18 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"],
19 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"],
20 | "p": ["9", "(", "0", ")", "o", "l"],
21 | "a": ["q", "w", "a", "s", "z", "x"],
22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"],
23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"],
24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"],
25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"],
26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"],
27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"],
28 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"],
29 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"],
30 | "z": ["a", "s", "x"],
31 | "x": ["a", "s", "d", "z", "c"],
32 | "c": ["s", "d", "f", "x", "v"],
33 | "v": ["d", "f", "g", "c", "b"],
34 | "b": ["f", "g", "h", "v", "n"],
35 | "n": ["g", "h", "j", "b", "m"],
36 | "m": ["h", "j", "k", "n", ",", "<"]
37 | }
--------------------------------------------------------------------------------
/test/augmenter/audio/test_vtlp.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | import nlpaug.augmenter.audio as naa
6 | from nlpaug.util import AudioLoader
7 |
8 |
class TestVtlp(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def _augment_once(self, **aug_kwargs):
        """Build a stateful VtlpAug, run it once, and return it for inspection."""
        aug = naa.VtlpAug(sampling_rate=self.sampling_rate, **aug_kwargs)
        aug.model.stateless = False
        aug.augment(self.audio)
        return aug

    def test_substitute(self):
        # VTLP shortens the signal, so the augmented output must be smaller.
        for _ in range(10):
            aug = naa.VtlpAug(sampling_rate=self.sampling_rate)
            aug.model.stateless = False
            augmented_audio = aug.augment(self.audio)
            self.assertGreater(len(self.audio), len(augmented_audio))

    def test_coverage(self):
        for _ in range(10):
            aug = self._augment_once(zone=(0.3, 0.7), coverage=0.1)
            self.assertGreater(
                len(self.audio[aug.model.start_pos:aug.model.end_pos]),
                len(aug.model.aug_data))

    def test_zone(self):
        for _ in range(10):
            aug = self._augment_once(zone=(0, 1), coverage=1.)
            self.assertGreater(
                len(self.audio[aug.model.start_pos:aug.model.end_pos]),
                len(aug.model.aug_data))
49 |
--------------------------------------------------------------------------------
/nlpaug/util/doc/change_log.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.doc.token import Token
2 |
3 |
class ChangeLog:
    """Records the edit history of a single token during augmentation."""

    def __init__(self, orig_token):
        self.orig_token = orig_token
        self.change_logs = []
        # Seed the log with the original token; 'original' does not count as
        # a change (short-circuit in add() before _is_changed exists).
        self.add(orig_token.token, 'original', orig_token.change_seq)
        self._is_changed = False

    def add(self, token, action, change_seq):
        """Append a change record; any non-'original' action marks the log changed."""
        if action != 'original' and not self._is_changed:
            self._is_changed = True
        self.change_logs.append(Token(token=token, action=action, change_seq=change_seq))

    def update(self, idx, token=None, action=None, change_seq=None):
        """Overwrite fields of record `idx`; falsy arguments leave the field untouched."""
        self._is_changed = True

        entry = self.change_logs[idx]
        if token:
            entry.token = token
        if action:
            entry.action = action
        if change_seq:
            entry.change_seq = change_seq

    def size(self):
        # The first record is the original token, so it is excluded.
        return len(self.change_logs) - 1

    def is_changed(self):
        return self._is_changed

    def get_latest_token(self):
        return self.change_logs[-1]

    def update_last_token(self, start_pos):
        self.change_logs[-1].start_pos = start_pos

    def to_changed_dict(self):
        """Summarize original vs. latest state as a flat dict."""
        latest = self.get_latest_token()
        return {
            'orig_token': self.orig_token.token,
            'orig_start_pos': self.orig_token.start_pos,
            'new_token': latest.token,
            'new_start_pos': latest.start_pos,
            'change_seq': latest.change_seq,
            'action': latest.action
        }

    def to_dict(self):
        return {
            'orig_token': self.orig_token.to_dict(),
            'change_logs': [entry.to_dict() for entry in self.change_logs]
        }
54 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/speed.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply speed adjustment operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action, WarningMessage
8 |
9 |
class SpeedAug(AudioAugmenter):
    """
    :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any
        augmentation will be applied in first 20% and last 20% of whole audio.
    :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment
        operation will be applied to target audio segment. For example, the audio duration is 60 seconds while
        zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be
        augmented.
    :param tuple factor: Range of applying speed adjustment operation. Default value is (0.5, 2).
        Factor for time stretch. Audio will be slowing down if value is between 0 and 1.
    :param tuple speed_range: Deprecated. Use `factor` instead.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.SpeedAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., duration=None,
                 factor=(0.5, 2),
                 speed_range=(0.5, 2), name='Speed_Aug', verbose=0):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        # Deprecated `speed_range` overrides `factor` when a caller passes a
        # non-default value.
        if speed_range != (0.5, 2):
            print(WarningMessage.DEPRECATED.format('speed_range', '0.0.12', 'factor'))
            factor = speed_range

        self.model = self.get_model(zone, coverage, duration, factor)

    @classmethod
    def get_model(cls, zone, coverage, duration, factor):
        return nma.Speed(zone, coverage, duration, factor)
42 |
--------------------------------------------------------------------------------
/test/run_test.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import sys
3 | import logging
4 |
5 |
if __name__ == '__main__':
    # Make the local package importable when launching from the repo root.
    sys.path.append('../nlpaug')

    # disable transformer's info logging
    for file_name in ['tokenization_utils', 'file_utils', 'modeling_utils', 'modeling_xlnet',
                      'configuration_utils']:
        logging.getLogger('transformers.' + file_name).setLevel(logging.ERROR)

    test_dirs = [
        'test/augmenter/char/',
        'test/augmenter/word/',
        'test/augmenter/sentence/',
        'test/augmenter/audio/',
        'test/augmenter/spectrogram/',
        'test/model/char/',
        'test/model/word/',
        'test/util/selection/',
        'test/flow/',
        'test/profiling/sentence/',
    ]
    runner = unittest.TextTestRunner()
    # One loader serves every directory; the original rebuilt a fresh
    # TestLoader on each iteration for no benefit.
    loader = unittest.TestLoader()

    for test_dir in test_dirs:
        suite = loader.discover(test_dir)
        runner.run(suite)

    # Single-suite examples kept for local debugging:
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.sentence.test_context_word_embs_sentence')
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_context_word_embs')
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_word_embs')
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_random_word')
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.char.test_random_char')
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_word')
    # suite = unittest.TestLoader().loadTestsFromName('util.selection.test_filtering')
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.audio.test_noise')
    # suite = unittest.TestLoader().loadTestsFromName('augmenter.test_augmenter')
    # suite = unittest.TestLoader().loadTestsFromName('model.word.test_word_embs_model')
    # runner.run(suite)
44 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/loudness.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply adjusting loudness operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action, WarningMessage
8 |
9 |
class LoudnessAug(AudioAugmenter):
    """
    :param tuple zone: Zone eligible for augmentation. The default (0.2, 0.8) leaves the
        first 20% and last 20% of the audio untouched.
    :param float coverage: Portion (0..1) of the zone to augment. E.g. for a 60-second
        clip with zone (0.2, 0.8) and coverage 0.7, (0.8-0.2)*0.7*60 = 42 seconds are
        affected.
    :param tuple factor: Gain range the augmented value is drawn from; values between
        0 and 1 reduce the volume.
    :param tuple loudness_factor: Deprecated. Use `factor` instead.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.LoudnessAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1.,
                 factor=(0.5, 2), loudness_factor=(0.5, 2), name='Loudness_Aug', verbose=0):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        # Deprecated `loudness_factor` overrides `factor` when explicitly set.
        if loudness_factor != (0.5, 2):
            print(WarningMessage.DEPRECATED.format('loudness_factor', '0.0.12', 'factor'))
            factor = loudness_factor

        self.model = self.get_model(zone, coverage, factor)

    @classmethod
    def get_model(cls, zone, coverage, factor):
        # Factory hook for the underlying loudness model.
        return nma.Loudness(zone=zone, coverage=coverage, factor=factor)
41 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/noise.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply noise injection operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action, WarningMessage
8 |
9 |
class NoiseAug(AudioAugmenter):
    """
    :param tuple zone: Zone eligible for augmentation. The default (0.2, 0.8) leaves the
        first 20% and last 20% of the audio untouched.
    :param float coverage: Portion (0..1) of the zone to augment. E.g. for a 60-second
        clip with zone (0.2, 0.8) and coverage 0.7, (0.8-0.2)*0.7*60 = 42 seconds are
        affected.
    :param str color: Colors of noise. Supported 'white', 'pink', 'red', 'brown', 'brownian', 'blue', 'azure',
        'violet', 'purple' and 'random'. If 'random' is used, noise color will be picked randomly in each augment.
    :param list noises: Background noises for noise injection. You can provide more than one background noise and
        noise will be picked randomly. Expected format is list of numpy array. If this value is provided. `color`
        value will be ignored
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.NoiseAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1.,
                 color='white', noises=None, name='Noise_Aug', verbose=0):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        self.model = self.get_model(zone, coverage, color, noises)

    @classmethod
    def get_model(cls, zone, coverage, color, noises):
        # Factory hook for the underlying noise model.
        return nma.Noise(zone=zone, coverage=coverage, color=color, noises=noises)
38 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/vtlp.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply vocal tract length perturbation (VTLP) operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action
8 |
9 |
class VtlpAug(AudioAugmenter):
    # https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf
    """
    :param int sampling_rate: Sampling rate of the input audio.
    :param tuple zone: Fraction range of the audio eligible for augmentation; with the default
        (0.2, 0.8) the first and last 20% of the clip are never touched.
    :param float coverage: Portion (0 to 1) of the zone to augment. E.g. for a 60 second clip with
        zone=(0.2, 0.8) and coverage=0.7, a span of (0.8-0.2)*0.7*60 = 42 seconds is augmented.
    :param int duration: Duration of augmentation in seconds; presumably overrides `coverage` when
        set (see nma.Vtlp) -- TODO confirm.
    :param tuple factor: Warp factor range. Default value is (0.9, 1.1).
    :param int fhi: Boundary frequency. Default value is 4800.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.VtlpAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=0.1, duration=None, fhi=4800,
                 factor=(0.9, 1.1), name='Vtlp_Aug', verbose=0):
        super().__init__(action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)
        self.model = self.get_model(sampling_rate, zone, coverage, duration, factor, fhi)

    @classmethod
    def get_model(cls, sampling_rate, zone, coverage, duration, factor, fhi):
        # Factory for the underlying VTLP model.
        return nma.Vtlp(sampling_rate=sampling_rate, zone=zone, coverage=coverage,
                        duration=duration, factor=factor, fhi=fhi)
39 |
--------------------------------------------------------------------------------
/nlpaug/model/word_embs/word_embeddings.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | import nlpaug.util.math.normalization as normalization
4 |
5 |
class WordEmbeddings:
    """
    Base class for word embedding models (e.g. GloVe, word2vec, fasttext).

    Holds the vocabulary lookup tables and provides nearest-neighbour style
    prediction over the normalized embedding matrix. Subclasses implement
    `read`, `similar` and `download`.

    :param int top_k: Maximum number of candidate words returned by `predict`.
    :param bool skip_check: When False, loaders validate that the lookup tables
        stay in sync after reading a model file.
    """

    def __init__(self, top_k=100, skip_check=True):
        self.top_k = top_k
        self.skip_check = skip_check
        self.emb_size = 0
        self.vocab_size = 0
        self.embs = {}
        self.w2v = {}  # word -> vector
        self.i2w = {}  # index -> word
        self.w2i = {}  # word -> index
        self.vectors = []
        self.normalized_vectors = None

    def read(self, file_path, max_num_vector):
        raise NotImplementedError

    def similar(self, word):
        raise NotImplementedError

    def download(self, model_path):
        raise NotImplementedError

    def word2idx(self, word):
        return self.w2i[word]

    def word2vector(self, word):
        return self.w2v[word]

    def idx2word(self, idx):
        return self.i2w[idx]

    def get_vectors(self):
        return self.normalized_vectors

    def get_vocab(self):
        return list(self.w2v.keys())

    @classmethod
    def _normalize(cls, vectors, norm='l2'):
        """Normalize `vectors` with the given norm ('l2', 'l1' or 'standard')."""
        if norm == 'l2':
            return normalization.l2_norm(vectors)
        elif norm == 'l1':
            return normalization.l1_norm(vectors)
        elif norm == 'standard':
            return normalization.standard_norm(vectors)
        # Fix: an unknown norm previously fell through and silently returned None.
        raise ValueError('Unsupported norm: {}'.format(norm))

    def predict(self, word, n=1):
        """
        Return up to `top_k` words most similar to `word` by dot product over the
        normalized vectors, excluding the source word itself (case-insensitively).
        `n` is currently unused and kept for interface compatibility.
        """
        source_id = self.word2idx(word)
        source_vector = self.word2vector(word)
        scores = np.dot(self.normalized_vectors, source_vector)  # TODO: very slow.
        # Fix: clamp the partition index so vocabularies smaller than top_k + 2
        # no longer make np.argpartition raise (kth must be < len(scores)).
        k = min(self.top_k + 2, len(scores) - 1)
        if k > 0:
            target_ids = np.argpartition(-scores, k)[:k]  # TODO: slow.
        else:
            target_ids = np.arange(len(scores))
        # Drop the source word (by index and by case-insensitive spelling).
        target_words = [self.idx2word(idx) for idx in target_ids
                        if idx != source_id and self.idx2word(idx).lower() != word.lower()]
        return target_words[:self.top_k]
60 |
--------------------------------------------------------------------------------
/nlpaug/res/char/keyboard/en.json:
--------------------------------------------------------------------------------
1 | {
2 | "1": ["!", "2", "@", "q", "w"],
3 | "2": ["@", "1", "!", "3", "#", "q", "w", "e"],
4 | "3": ["#", "2", "@", "4", "$", "w", "e"],
5 | "4": ["$", "3", "#", "5", "%", "e", "r"],
6 | "5": ["%", "4", "$", "6", "^", "r", "t", "y"],
7 | "6": ["^", "5", "%", "7", "&", "t", "y", "u"],
8 | "7": ["&", "6", "^", "8", "*", "y", "u", "i"],
9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"],
10 | "9": ["(", "8", "*", "0", ")", "i", "o", "p"],
11 | "!": ["@", "q"],
12 | "@": ["!", "#", "q", "w"],
13 | "#": ["@", "$", "w", "e"],
14 | "$": ["#", "%", "e", "r"],
15 | "%": "$",
16 | "q": ["1", "!", "2", "@", "w", "a", "s"],
17 | "w": ["1", "!", "2", "@", "3", "#", "q", "e", "a", "s", "d"],
18 | "e": ["2", "@", "3", "#", "4", "$", "w", "r", "s", "d", "f"],
19 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"],
20 | "t": ["4", "$", "5", "%", "6", "^", "r", "y", "f", "g", "h"],
21 | "y": ["5", "%", "6", "^", "7", "&", "t", "u", "g", "h", "j"],
22 | "u": ["6", "^", "7", "&", "8", "*", " t", "i", "h", "j", "k"],
23 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"],
24 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"],
25 | "p": ["9", "(", "0", ")", "o", "l"],
26 | "a": ["q", "w", "a", "s", "z", "x"],
27 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"],
28 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"],
29 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"],
30 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"],
31 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"],
32 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"],
33 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"],
34 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"],
35 | "z": ["a", "s", "x"],
36 | "x": ["a", "s", "d", "z", "c"],
37 | "c": ["s", "d", "f", "x", "v"],
38 | "v": ["d", "f", "g", "c", "b"],
39 | "b": ["f", "g", "h", "v", "n"],
40 | "n": ["g", "h", "j", "b", "m"],
41 | "m": ["h", "j", "k", "n", ",", "<"]
42 | }
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/pitch.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply pitch adjustment operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action, WarningMessage
8 |
9 |
class PitchAug(AudioAugmenter):
    """
    :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
    :param tuple zone: Fraction range of the audio eligible for augmentation; with the default
        (0.2, 0.8) the first and last 20% of the clip are never touched.
    :param float coverage: Portion (0 to 1) of the zone to augment. E.g. for a 60 second clip with
        zone=(0.2, 0.8) and coverage=0.7, a span of (0.8-0.2)*0.7*60 = 42 seconds is augmented.
    :param int duration: Duration of augmentation in seconds. Defaults to None; when provided,
        `coverage` is ignored.
    :param tuple pitch_range: Deprecated. Use `factor` instead.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.PitchAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=1., duration=None,
                 factor=(-10, 10), pitch_range=(-10, 10), name='Pitch_Aug', verbose=0):
        super().__init__(action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        # Honour the deprecated parameter while it is still accepted.
        if pitch_range != (-10, 10):
            print(WarningMessage.DEPRECATED.format('pitch_range', '0.0.12', 'factor'))
            factor = pitch_range

        self.model = self.get_model(sampling_rate, zone, coverage, duration, factor)

    @classmethod
    def get_model(cls, sampling_rate, zone, coverage, duration, factor):
        # Factory for the underlying pitch model.
        return nma.Pitch(sampling_rate, zone, coverage, duration, factor)
42 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/mask.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from nlpaug.model.audio import Audio
4 |
5 |
class Mask(Audio):
    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=1., duration=None,
                 mask_with_noise=True, stateless=True):
        """
        :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
        :param tuple zone: Fraction range of the audio eligible for augmentation; with the default
            (0.2, 0.8) the first and last 20% of the clip are never touched.
        :param float coverage: Portion (0 to 1) of the zone to augment. E.g. for a 60 second clip
            with zone=(0.2, 0.8) and coverage=0.7, a span of (0.8-0.2)*0.7*60 = 42 seconds is
            augmented.
        :param float duration: Duration of augmentation in seconds. Defaults to None; when
            provided, `coverage` is ignored.
        :param bool mask_with_noise: When True the masked region is filled with Gaussian noise,
            otherwise with zeros.
        """
        super().__init__(zone=zone, coverage=coverage, duration=duration, sampling_rate=sampling_rate,
                         stateless=stateless)
        self.mask_with_noise = mask_with_noise

    def manipulate(self, data):
        """Return a copy of `data` with the selected segment masked out."""
        start_pos, end_pos = self.get_augment_range_by_coverage(data)
        segment_size = end_pos - start_pos

        if self.mask_with_noise:
            fill = np.random.randn(segment_size)
        else:
            fill = np.zeros(segment_size)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_data = start_pos, end_pos, fill

        masked = data.copy()
        masked[start_pos:end_pos] = fill
        return masked
45 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/crop.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply cropping operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action, WarningMessage
8 |
9 |
class CropAug(AudioAugmenter):
    """
    :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
    :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no
        augmentation will be applied in the first 20% and last 20% of the whole audio.
    :param float coverage: Portion of augmentation. Value should be between 0 and 1. For example, the
        audio duration is 60 seconds while zone and coverage are (0.2, 0.8) and 0.7 respectively.
        42 seconds ((0.8-0.2)*0.7*60) of audio will be cropped.
    :param int duration: Duration of augmentation (in second). Default value is None. If value is
        provided, `coverage` value will be ignored.
    :param tuple crop_range: Deprecated. Use `zone` instead.
    :param int crop_factor: Deprecated.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.CropAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=0.1, duration=None,
                 crop_range=(0.2, 0.8), crop_factor=2, name='Crop_Aug', verbose=0):
        super().__init__(
            action=Action.DELETE, name=name, device='cpu', verbose=verbose)

        # Fix: handle deprecated parameters BEFORE building the model. Previously the model was
        # created first and `crop_range` was never mapped onto `zone`, so the deprecated parameter
        # was warned about but silently ignored (cf. MaskAug, which does map it).
        if crop_range != (0.2, 0.8):
            print(WarningMessage.DEPRECATED.format('crop_range', '0.0.12', 'zone'))
            zone = crop_range
        if crop_factor != 2:
            print(WarningMessage.DEPRECATED.format('crop_factor', '0.0.12', 'temperature'))

        self.model = self.get_model(sampling_rate, zone, coverage, duration)

    def delete(self, data):
        # CropAug uses the DELETE action: the model removes the selected segment.
        return self.model.manipulate(data)

    @classmethod
    def get_model(cls, sampling_rate, zone, coverage, duration):
        return nma.Crop(sampling_rate, zone=zone, coverage=coverage, duration=duration)
44 |
--------------------------------------------------------------------------------
/nlpaug/model/word_embs/glove.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from nlpaug.model.word_embs import WordEmbeddings
4 |
# Download URLs for the published pre-trained GloVe archives (Stanford NLP).
pre_trained_model_url = {
    'glove_6b': 'http://nlp.stanford.edu/data/glove.6B.zip',
    'glove_42b_300d': 'http://nlp.stanford.edu/data/glove.42B.300d.zip',
    'glove_840b_300d': 'http://nlp.stanford.edu/data/glove.840B.300d.zip',
    'glove_twitter_27b': 'http://nlp.stanford.edu/data/glove.twitter.27B.zip',
}
11 |
12 |
class GloVe(WordEmbeddings):
    # https://nlp.stanford.edu/pubs/glove.pdf
    """Loader for GloVe plain-text embedding files."""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

    def read(self, file_path, max_num_vector=None):
        """
        Parse a GloVe text file and populate the lookup tables.

        :param str file_path: Path to the GloVe .txt embedding file.
        :param int max_num_vector: Optional cap on the number of vectors loaded.
            Fix: previously accepted but silently ignored; now honored for
            consistency with Word2vec.read.
        """
        vectors = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Stop once the requested number of vectors has been loaded.
                if max_num_vector is not None and len(vectors) >= max_num_vector:
                    break

                tokens = line.split()
                # Published GloVe dimensions are multiples of 25 (25/50/100/200/300), so the
                # remainder is the number of tokens belonging to the word itself. Handles words
                # containing spaces (e.g. ". . ." in glove.840B.300d).
                token_len = len(tokens) % 25

                values = np.array([float(val) for val in tokens[token_len:]])

                # Recover the raw word from the line so trailing spaces are preserved
                # (e.g. "pp." and "pp. " both exist in glove.840B.300d).
                word = line[:line.find(str(values[0])) - 1]

                # Skip words containing the unicode replacement character.
                if '�' in word:
                    continue

                vectors.append(values)
                self.i2w[len(self.i2w)] = word
                self.w2i[word] = len(self.w2i)
                self.w2v[word] = values

        vectors = np.asarray(vectors)
        if not self.skip_check:
            # Sanity check: all lookup tables must stay in sync.
            if len(vectors) != len(self.i2w):
                raise AssertionError('Vector Size:{}, Index2Word Size:{}'.format(len(vectors), len(self.i2w)))
            if len(self.i2w) != len(self.w2i):
                raise AssertionError('Index2Word Size:{}, Word2Index Size:{}'.format(len(self.i2w), len(self.w2i)))
            if len(self.w2i) != len(self.w2v):
                raise AssertionError('Word2Index Size:{}, Word2Vector Size:{}'.format(len(self.w2i), len(self.w2v)))

        self.normalized_vectors = self._normalize(vectors)
50 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/speed.py:
--------------------------------------------------------------------------------
1 | # Reference: https://www.kaggle.com/CVxTz/audio-data-augmentation
2 |
3 | try:
4 | import librosa
5 | except ImportError:
6 | # No installation required if not using this function
7 | pass
8 | import numpy as np
9 |
10 | from nlpaug.model.audio import Audio
11 |
12 |
class Speed(Audio):
    """
    :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no
        augmentation will be applied in the first 20% and last 20% of the whole audio.
    :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is
        assigned, augment operation will be applied to the whole target audio segment. For example,
        the audio duration is 60 seconds while zone and coverage are (0.2, 0.8) and 0.7 respectively.
        42 seconds ((0.8-0.2)*0.7*60) of audio will be augmented.
    :param tuple factor: Range from which the time-stretch rate is drawn (in steps of 0.1).
        Audio is slowed down for rates between 0 and 1 and sped up for rates above 1.
        Non-positive candidates are discarded because time stretching requires a strictly
        positive rate.
    """
    def __init__(self, zone=(0.2, 0.8), coverage=1., duration=None,
                 factor=(-10, 10), stateless=False):
        super().__init__(zone=zone, coverage=coverage, duration=duration,
                         factor=factor, stateless=stateless)
        try:
            librosa
        except NameError:
            raise ImportError('Missed librosa library. Install it via `pip install librosa`')

    def get_speed_level(self):
        """Draw a random stretch rate from `factor`, excluding non-positive rates and the no-op 1.0."""
        speeds = [round(i, 1) for i in np.arange(self.factor[0], self.factor[1], 0.1)]
        # Fix: librosa.effects.time_stretch requires rate > 0; previously only 1.0 was excluded,
        # so negative/zero candidates from the default factor range could crash the stretch call.
        speeds = [s for s in speeds if s > 0 and s != 1.0]
        return speeds[np.random.randint(len(speeds))]

    def manipulate(self, data):
        """Time-stretch the covered segment and stitch it back between the untouched ends."""
        speed = self.get_speed_level()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        aug_data = librosa.effects.time_stretch(data[start_pos:end_pos], speed)

        if not self.stateless:
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.aug_data = aug_data
            self.aug_factor = speed

        return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0)
51 |
--------------------------------------------------------------------------------
/test/augmenter/audio/test_crop.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.audio as naa
7 | from nlpaug.util import AudioLoader
8 |
9 |
class TestCrop(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        dotenv_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # Cropping an empty signal must be a no-op.
        empty = np.array([])
        aug = naa.CropAug(sampling_rate=self.sampling_rate)
        self.assertTrue(np.array_equal(empty, aug.augment(empty)))

    def test_substitute(self):
        aug = naa.CropAug(sampling_rate=self.sampling_rate)
        self.assertNotEqual(len(self.audio), len(aug.augment(self.audio)))

    def test_coverage(self):
        coverage = 0.1
        aug = naa.CropAug(sampling_rate=self.sampling_rate, coverage=coverage)
        augmented = aug.augment(self.audio)
        expected_crop_size = len(self.audio) * (aug.model.zone[1] - aug.model.zone[0]) * coverage
        # Allow one sample of rounding slack either way.
        self.assertTrue(-1 <= len(self.audio) - len(augmented) - expected_crop_size <= 1)

    def test_duration(self):
        duration = 1
        original_size = len(self.audio)

        for _ in range(10):
            aug = naa.CropAug(sampling_rate=self.sampling_rate, duration=duration)
            aug.model.stateless = False
            augmented = aug.augment(self.audio)

            self.assertGreater(original_size, len(augmented))
            self.assertEqual(
                len(self.audio[aug.model.start_pos:aug.model.end_pos]),
                self.sampling_rate * duration)
57 |
--------------------------------------------------------------------------------
/test/augmenter/audio/test_speed.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | import nlpaug.augmenter.audio as naa
6 | from nlpaug.util import AudioLoader
7 |
8 |
class TestSpeed(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        dotenv_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def _assert_segment_resized(self, aug):
        """Slowing down (< 1) lengthens the augmented segment; speeding up shortens it."""
        segment = self.audio[aug.model.start_pos:aug.model.end_pos]
        if aug.model.aug_factor < 1:
            self.assertGreater(len(aug.model.aug_data), len(segment))
        else:
            self.assertLess(len(aug.model.aug_data), len(segment))

    def test_substitute(self):
        for _ in range(10):
            aug = naa.SpeedAug()
            aug.model.stateless = False
            augmented_audio = aug.augment(self.audio)

            if aug.model.aug_factor < 1:
                self.assertGreater(len(augmented_audio), len(self.audio))
            else:
                self.assertLess(len(augmented_audio), len(self.audio))

    def test_coverage(self):
        for _ in range(10):
            aug = naa.SpeedAug(zone=(0.3, 0.7), coverage=0.1)
            aug.model.stateless = False
            aug.augment(self.audio)
            self._assert_segment_resized(aug)

    def test_zone(self):
        for _ in range(10):
            aug = naa.SpeedAug(zone=(0, 1), coverage=1.)
            aug.model.stateless = False
            aug.augment(self.audio)
            self._assert_segment_resized(aug)
59 |
--------------------------------------------------------------------------------
/test/augmenter/word/test_spelling.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | from dotenv import load_dotenv
4 |
5 | import nlpaug
6 | import nlpaug.augmenter.word as naw
7 |
8 |
class TestSpelling(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        dotenv_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(dotenv_path)

        cls.model_dir = os.path.join(nlpaug.__path__[0], 'res', 'word', 'spelling')

    def test_oov(self):
        # A nonsense token has no entry in the spelling dictionary, so it stays unchanged.
        text = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa'
        aug = naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt'))
        self.assertEqual(text, aug.augment(text))

    def test_substitute(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]
        aug = naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt'))

        for text in texts:
            self.assertLess(0, len(text))
            self.assertNotEqual(text, aug.augment(text))

        self.assertLess(0, len(texts))

    def test_substitute_stopwords(self):
        texts = [
            'The quick brown fox jumps over the lazy dog'
        ]
        stopwords = [t.lower() for t in texts[0].split(' ')[:3]]
        aug_n = 3

        aug = naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt'), stopwords=stopwords)

        for text in texts:
            self.assertLess(0, len(text))
            augmented_text = aug.augment(text)

            augmented_cnt = 0
            for token, augmented_token in zip(aug.tokenizer(text), aug.tokenizer(augmented_text)):
                # Stopwords longer than aug_n must come through unmodified.
                if token.lower() in stopwords and len(token) > aug_n:
                    self.assertEqual(token.lower(), augmented_token)
                else:
                    augmented_cnt += 1

            self.assertGreater(augmented_cnt, 0)

        self.assertLess(0, len(texts))
69 |
--------------------------------------------------------------------------------
/test/model/char/test_keyboard_model.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import re
3 |
4 | import nlpaug.model.char as nmc
5 |
6 |
class TestKeyboard(unittest.TestCase):
    def _verify_mapping(self, model, pattern, expect_match):
        """Check every key and value of the keyboard mapping against `pattern`."""
        mapping = model.model
        for key, values in mapping.items():
            if expect_match:
                self.assertTrue(re.match(pattern, key))
            else:
                self.assertFalse(re.match(pattern, key))
            self.assertGreater(len(values), 0)
            for value in values:
                if expect_match:
                    self.assertTrue(re.match(pattern, value))
                else:
                    self.assertFalse(re.match(pattern, value))
        self.assertGreater(len(mapping), 0)

    def test_lower_case_only(self):
        model = nmc.Keyboard(special_char=False, numeric=False, upper_case=False)
        self._verify_mapping(model, "^[a-z]*$", True)

    def test_special_char_lower_case(self):
        model = nmc.Keyboard(special_char=True, numeric=False, upper_case=False)
        self._verify_mapping(model, "^[0-9]*$", False)

    def test_numeric_lower_case(self):
        model = nmc.Keyboard(special_char=False, numeric=True, upper_case=False)
        self._verify_mapping(model, "^[a-z0-9]*$", True)

    def test_upper_lower_case(self):
        model = nmc.Keyboard(special_char=False, numeric=False, upper_case=True)
        self._verify_mapping(model, "^[a-zA-Z]*$", True)

    def test_special_char_numeric_lower_case(self):
        model = nmc.Keyboard(special_char=True, numeric=True, upper_case=True)
        mapping = model.model
        for values in mapping.values():
            self.assertGreater(len(values), 0)
        self.assertGreater(len(mapping), 0)
54 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/audio/mask.py:
--------------------------------------------------------------------------------
1 | """
2 | Augmenter that apply mask operation to audio.
3 | """
4 |
5 | from nlpaug.augmenter.audio import AudioAugmenter
6 | import nlpaug.model.audio as nma
7 | from nlpaug.util import Action, WarningMessage
8 |
9 |
class MaskAug(AudioAugmenter):
    """
    :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
    :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no
        augmentation will be applied in the first 20% and last 20% of the whole audio.
    :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is
        assigned, augment operation will be applied to the whole target audio segment. For example,
        the audio duration is 60 seconds while zone and coverage are (0.2, 0.8) and 0.7 respectively.
        42 seconds ((0.8-0.2)*0.7*60) of audio will be augmented.
    :param float duration: Duration of augmentation (in second). Default value is None. If value is
        provided, `coverage` value will be ignored.
    :param tuple mask_range: Deprecated. Use `zone` instead.
    :param int mask_factor: Deprecated. Use `duration` instead.
    :param bool mask_with_noise: If it is True, the target area will be replaced by noise.
        Otherwise, it will be replaced by 0.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.MaskAug(sampling_rate=44010)
    """

    # Fix: `duration` previously defaulted to the tuple (0.2, 0.8), contradicting the documented
    # default of None (a duration is a scalar number of seconds, cf. `duration = mask_factor`).
    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=1.,
                 duration=None,
                 mask_range=(0.2, 0.8), mask_factor=2, mask_with_noise=True,
                 name='Mask_Aug', verbose=0):
        super().__init__(
            action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose)

        # Honour the deprecated parameters while they are still accepted.
        if mask_range != (0.2, 0.8):
            print(WarningMessage.DEPRECATED.format('mask_range', '0.0.12', 'zone'))
            zone = mask_range
        if mask_factor != 2:
            print(WarningMessage.DEPRECATED.format('mask_factor', '0.0.12', 'duration'))
            duration = mask_factor

        self.model = self.get_model(sampling_rate, zone, coverage, duration, mask_with_noise)

    @classmethod
    def get_model(cls, sampling_rate, zone, coverage, duration, mask_with_noise):
        return nma.Mask(sampling_rate, zone, coverage, duration, mask_with_noise)
48 |
--------------------------------------------------------------------------------
/nlpaug/model/word_embs/word2vec.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 | from nlpaug.model.word_embs import WordEmbeddings
4 |
5 |
class Word2vec(WordEmbeddings):
    # https://arxiv.org/pdf/1301.3781.pdf
    """Loader for word2vec binary-format embedding files."""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

    def read(self, file_path, max_num_vector=None):
        """
        Parse a word2vec binary file and populate the lookup tables.

        :param str file_path: Path to the .bin embedding file.
        :param int max_num_vector: Optional cap on the number of vectors loaded.
        """
        with open(file_path, 'rb') as f:
            header = f.readline()
            self.vocab_size, self.emb_size = map(int, header.split())
            if max_num_vector is not None:
                self.vocab_size = min(max_num_vector, self.vocab_size)

            vectors = np.zeros((self.vocab_size, self.emb_size), dtype=np.float32)
            binary_len = np.dtype(np.float32).itemsize * self.emb_size

            for row in range(self.vocab_size):
                word = []
                while True:
                    ch = f.read(1)
                    if ch == b' ':
                        word = ''.join(word)
                        break
                    # Fix: skip record-separator newlines. The comparison must use the bytes
                    # literal b'\n' -- the previous str comparison (ch != '\n') was always True,
                    # so newline bytes leaked into the decoded words.
                    if ch != b'\n':
                        word.append(ch.decode('cp437'))

                value = None  # ensure the name is bound for the error message below
                try:
                    value = f.read(binary_len)
                    values = np.frombuffer(value, dtype=np.float32)
                    vectors[len(self.i2w)] = values
                    self.i2w[len(self.i2w)] = word
                    self.w2i[word] = len(self.w2i)
                    self.w2v[word] = values
                except Exception as e:
                    if not self.skip_check:
                        raise ValueError('Unable to parse row {} ({})'.format(row, value)) from e

        vectors = np.asarray(vectors)
        if not self.skip_check:
            # Sanity check: all lookup tables must stay in sync.
            if len(vectors) != len(self.i2w):
                raise AssertionError('Vector Size:{}, Index2Word Size:{}'.format(len(vectors), len(self.i2w)))
            if len(self.i2w) != len(self.w2i):
                raise AssertionError('Index2Word Size:{}, Word2Index Size:{}'.format(len(self.i2w), len(self.w2i)))
            if len(self.w2i) != len(self.w2v):
                raise AssertionError('Word2Index Size:{}, Word2Vector Size:{}'.format(len(self.w2i), len(self.w2v)))

        self.normalized_vectors = self._normalize(vectors)
53 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/pitch.py:
--------------------------------------------------------------------------------
1 | # Reference: https://www.kaggle.com/CVxTz/audio-data-augmentation
2 |
3 | try:
4 | import librosa
5 | except ImportError:
6 | # No installation required if not using this function
7 | pass
8 | import numpy as np
9 |
10 | from nlpaug.model.audio import Audio
11 |
12 |
class Pitch(Audio):
    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=1., duration=None,
                 factor=(-10, 10), stateless=False):
        """
        :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
        :param tuple zone: Fraction range of the audio eligible for augmentation; with the default
            (0.2, 0.8) the first and last 20% of the clip are never touched.
        :param float coverage: Portion (0 to 1) of the zone to augment. E.g. for a 60 second clip
            with zone=(0.2, 0.8) and coverage=0.7, a span of (0.8-0.2)*0.7*60 = 42 seconds is
            augmented.
        :param int duration: Duration of augmentation in seconds. Defaults to None; when provided,
            `coverage` is ignored.
        :param tuple factor: Integer range from which the pitch-shift step count is drawn.
        """
        super().__init__(zone=zone, coverage=coverage, duration=duration, sampling_rate=sampling_rate,
                         factor=factor, stateless=stateless)
        try:
            librosa
        except NameError:
            raise ImportError('Missed librosa library. Install it via `pip install librosa`')

    def get_pitch_level(self):
        # Random integer number of steps within the configured range.
        return np.random.randint(self.factor[0], self.factor[1])

    def manipulate(self, data):
        """Pitch-shift the covered segment and return the full signal with it spliced back in."""
        n_step = self.get_pitch_level()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        shifted = librosa.effects.pitch_shift(data[start_pos:end_pos], self.sampling_rate, n_step)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_data = start_pos, end_pos, shifted

        result = data.copy()
        result[start_pos:end_pos] = shifted
        return result
55 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/audio.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 |
3 |
class Audio:
    """Base class for audio augmentation models.

    Stores the shared configuration (zone, coverage, duration, ...) and
    provides helpers to pick which slice of the waveform gets augmented.
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=None, duration=None,
                 sampling_rate=None, stateless=True):
        self.zone = zone
        self.coverage = coverage
        self.factor = factor
        self.duration = duration
        self.sampling_rate = sampling_rate
        self.stateless = stateless

        # Filled in by subclasses' manipulate() when stateless is False.
        self.start_pos = None
        self.end_pos = None
        self.aug_data = None
        self.aug_factor = None

    @classmethod
    def pad(cls, data, noise):
        """Zero-pad `noise` on both sides so it sits at a random offset within `data`'s length."""
        gap = len(data) - len(noise)
        start_pos = 0 if gap == 0 else np.random.randint(0, gap)

        leading = np.array([0] * start_pos)
        trailing = np.array([0] * (gap - start_pos))
        return np.append(np.append(leading, noise), trailing)

    def get_augmentation_segment_size(self, data):
        """Return the number of samples targeted for augmentation."""
        zone_fraction = self.zone[1] - self.zone[0]
        return int(len(data) * zone_fraction * self.coverage)

    def get_augment_range_by_coverage(self, data):
        """Pick a random (start, end) segment covering `coverage` of the configured zone."""
        zone_start = int(len(data) * self.zone[0])
        zone_end = int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(zone_size * self.coverage)
        last_start = zone_start + int(zone_size * (1 - self.coverage))

        if zone_start == last_start:
            # Full coverage: the whole zone is augmented.
            return zone_start, zone_end

        start_pos = np.random.randint(zone_start, last_start)
        return start_pos, start_pos + target_size

    def get_augment_range_by_duration(self, data):
        """Pick a random (start, end) segment of `duration` seconds inside the configured zone."""
        zone_start = int(len(data) * self.zone[0])
        zone_end = int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(self.sampling_rate * self.duration)

        if target_size >= zone_size:
            # Requested duration does not fit; fall back to the whole zone.
            return zone_start, zone_end

        start_pos = np.random.randint(zone_start, zone_start + zone_size - target_size)
        return start_pos, start_pos + target_size

    def manipulate(self, data):
        raise NotImplementedError
67 |
--------------------------------------------------------------------------------
/test/augmenter/char/test_keyboard.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import re
3 | import json
4 | import os
5 |
6 | import nlpaug.augmenter.char as nac
7 |
8 |
class TestKeyboard(unittest.TestCase):
    """Tests for nac.KeyboardAug (keyboard-distance character substitution)."""

    def test_single_word(self):
        texts = ['Zoology', 'roku123456']
        aug = nac.KeyboardAug()
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertNotEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)

    def test_multi_words(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = nac.KeyboardAug()
        for text in texts:
            augmented_text = aug.augment(text)
            self.assertNotEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)

    def test_no_special_character(self):
        # Repeated runs to reduce the chance of a lucky pass with random substitutions.
        text = 'qwertyuioplmnbvcxza'
        for i in range(10):
            aug = nac.KeyboardAug(include_special_char=False)
            augmented_text = aug.augment(text)
            self.assertTrue(re.match("^[a-zA-Z0-9]*$", augmented_text))

    def test_lang_th(self):
        text = 'ฤฤฤฤ ฤฏณ'
        aug = nac.KeyboardAug(lang='th')
        augmented_text = aug.augment(text)
        self.assertNotEqual(text, augmented_text)

    def test_non_support_lang(self):
        # Fix: use assertRaises instead of the try/except + assertTrue(False) anti-pattern,
        # which reported a generic failure and could mask unrelated exceptions.
        with self.assertRaises(ValueError):
            nac.KeyboardAug(lang='non_exist')

    def test_custom_model(self):
        custom_model = {
            'a': '1',
            'b': '2',
        }

        custom_model_file_path = 'char_keyboard_custom_model.json'

        with open(custom_model_file_path, 'w') as outfile:
            json.dump(custom_model, outfile)

        try:
            text = 'ababab'
            aug = nac.KeyboardAug(model_path=custom_model_file_path)
            augmented_text = aug.augment(text)

            self.assertTrue('1' in augmented_text or '2' in augmented_text)
        finally:
            # Fix: clean up the temp model file even when the assertion above fails.
            if os.path.exists(custom_model_file_path):
                os.remove(custom_model_file_path)

    def test_load_custom_model_fail(self):
        # A missing model file should be rejected with ValueError.
        with self.assertRaises(ValueError):
            nac.KeyboardAug(model_path='test_load_custom_model_fail.json')
74 |
--------------------------------------------------------------------------------
/nlpaug/util/decorator/deprecation.py:
--------------------------------------------------------------------------------
1 | import functools
2 | import warnings
3 |
4 |
def deprecated(deprecate_from, deprecate_to, msg):
    """Decorator factory that marks a class or function as deprecated.

    :param deprecate_from: version in which the deprecation begins
    :param deprecate_to: version in which the target will be removed
    :param msg: extra guidance appended to the warning text
    """
    def decorator(obj):
        # Classes and plain functions need different wrapping strategies.
        if isinstance(obj, type):
            return _decorate_class(obj, deprecate_from, deprecate_to, msg)
        # # TODO:
        # elif isinstance(obj, property):
        #     return _decorate_prop(obj, msg)
        return _decorate_func(obj, deprecate_from, deprecate_to, msg)
    return decorator
15 |
16 |
def _decorate_class(cls, deprecate_from, deprecate_to, msg):
    """Wrap `cls` so that every instantiation emits a DeprecationWarning."""
    warning_text = ('Class {name} is deprecated from {deprecate_from} version.'
                    ' It will be removed from {deprecate_to} version. {msg}')

    @functools.wraps(cls)
    def wrapped(*args, **kwargs):
        # Temporarily force the warning to display even if DeprecationWarning is filtered.
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(
            warning_text.format(
                name=cls.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg),
            category=DeprecationWarning
        )
        warnings.simplefilter('default', DeprecationWarning)
        return cls(*args, **kwargs)

    return wrapped
33 |
34 |
def _decorate_func(func, deprecate_from, deprecate_to, msg):
    """Wrap `func` so that every call emits a DeprecationWarning."""
    warning_text = ('Function {name} is deprecated from {deprecate_from} version.'
                    ' It will be removed from {deprecate_to} version. {msg}')

    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        # Temporarily force the warning to display even if DeprecationWarning is filtered.
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(
            warning_text.format(
                name=func.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg),
            category=DeprecationWarning
        )
        warnings.simplefilter('default', DeprecationWarning)
        return func(*args, **kwargs)

    return wrapped
51 |
52 |
def _decorate_prop(prop, msg):
    """Wrap a property so that reading it emits a DeprecationWarning.

    Not wired up yet (see the TODO in `deprecated`); kept for future use.
    """
    msg_template = 'Property {name} is deprecated. {msg}'

    # Fix: wrap the underlying getter. `property` objects expose no __name__
    # and reject attribute assignment, so the original
    # `@functools.wraps(prop)` over `@property` raised AttributeError, and
    # `prop.__name__` in the message would have failed as well.
    @functools.wraps(prop.fget)
    def wrapped(*args, **kwargs):
        warnings.simplefilter('always', DeprecationWarning)
        warnings.warn(
            msg_template.format(name=prop.fget.__name__, msg=msg), category=DeprecationWarning
        )
        warnings.simplefilter('default', DeprecationWarning)
        return prop.fget(*args, **kwargs)

    return property(wrapped)
66 |
--------------------------------------------------------------------------------
/test/profiling/sentence/test_context_word_embs_sentence_profiling.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import time
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.sentence as nas
7 |
8 |
class TestContextualWordEmbsAugProfiling(unittest.TestCase):
    """Profiling test: the external-memory optimized path should be faster."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.text = 'The quick brown fox jumps over the lazy dog.'

    def _profile(self, aug, epoch):
        # Run epoch batches of epoch augmentations each, timing every batch.
        durations = []
        for _ in range(epoch):
            batch_start = time.monotonic()
            for _ in range(epoch):
                aug.augment(self.text)
            batch_end = time.monotonic()
            durations.append(round(batch_end - batch_start, 2))

        total = sum(durations)
        average = round(total / len(durations), 2)
        return total, average

    def test_optimize(self):
        model_paths = ['gpt2', 'distilgpt2']
        device = 'cpu'
        enable_optimize = {'external_memory': 1024, 'return_proba': True}
        disable_optimize = {'external_memory': 0, 'return_proba': True}
        epoch = 10

        for model_path in model_paths:
            # Optimized run
            aug = nas.ContextualWordEmbsForSentenceAug(
                model_path=model_path, device=device, optimize=enable_optimize, force_reload=True)
            optimized_total_duration, optimized_average_duration = self._profile(aug, epoch)

            # Non-optimized run on the same augmenter instance
            aug.model.optimize = disable_optimize
            no_optimized_total_duration, no_optimized_average_duration = self._profile(aug, epoch)

            print('Model:{}, Optimized: {}({}), No Optimized: {}({})'.format(
                model_path, optimized_total_duration, optimized_average_duration,
                no_optimized_total_duration, no_optimized_average_duration
            ))

            self.assertGreater(no_optimized_total_duration, optimized_total_duration)
            self.assertGreater(no_optimized_average_duration, optimized_average_duration)
60 |
--------------------------------------------------------------------------------
/test/augmenter/audio/test_loudness.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.audio as naa
7 | from nlpaug.util import AudioLoader
8 |
9 |
class TestLoudness(unittest.TestCase):
    """Tests for naa.LoudnessAug (volume-scaling augmentation)."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        audio = np.array([])
        aug = naa.LoudnessAug()
        augmented_audio = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_audio))

    def test_substitute(self):
        aug = naa.LoudnessAug()
        augmented_audio = aug.augment(self.audio)

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))
        self.assertTrue(self.sampling_rate > 0)

    def _assert_segment_augmented(self, aug, augmented_audio, zone, coverage):
        # The output must equal the original outside [start_pos, end_pos) with
        # the stored aug_data spliced in.
        reconstruct_augmented_audio = np.concatenate(
            (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]), axis=0)
        self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio))

        # Fix: the original `assertTrue(len(...), expected)` treated the expected
        # size as the failure *message* and never compared anything. Compare for
        # real, allowing one sample of drift from integer truncation in the
        # segment-size arithmetic.
        expected_size = int(len(self.audio) * (zone[1] - zone[0]) * coverage)
        self.assertAlmostEqual(len(aug.model.aug_data), expected_size, delta=1)

    def test_coverage(self):
        zone = (0.3, 0.7)
        coverage = 0.1

        aug = naa.LoudnessAug(zone=zone, coverage=coverage)
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        self._assert_segment_augmented(aug, augmented_audio, zone, coverage)

    def test_zone(self):
        zone = (0, 1)
        coverage = 1

        aug = naa.LoudnessAug(zone=zone, coverage=coverage)
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        self._assert_segment_augmented(aug, augmented_audio, zone, coverage)
64 |
--------------------------------------------------------------------------------
/nlpaug/model/lang_models/gpt2.py:
--------------------------------------------------------------------------------
1 | try:
2 | import torch
3 | from transformers import GPT2Tokenizer, GPT2LMHeadModel
4 | # from transformers import AutoModel, AutoTokenizer # Thrown error when using nucleus sampling
5 | except ImportError:
6 | # No installation required if not using this function
7 | pass
8 |
9 | from nlpaug.model.lang_models import LanguageModels
10 |
11 |
class Gpt2(LanguageModels):
    # https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf
    # BPE marker GPT-2 uses on tokens that begin a new word.
    SUBWORD_PREFIX = 'Ġ'

    def __init__(self, model_path='gpt2', temperature=1.0, top_k=None, top_p=None, device=None, optimize=None):
        """Causal-LM wrapper around HuggingFace GPT-2 for next-token prediction.

        :param model_path: HuggingFace model name or local checkpoint path.
        :param temperature: Softmax temperature applied before sampling.
        :param top_k: Keep only the k highest-probability tokens (None disables).
        :param top_p: Nucleus-sampling threshold (None disables).
        :param device: Torch device; resolution is delegated to the parent class.
        :param optimize: Dict of optimization flags; `external_memory` enables
            reuse of the model's past activations between calls.
        """
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p, optimize=optimize)
        self.model_path = model_path

        # self.tokenizer = AutoTokenizer.from_pretrained(model_path)
        # self.model = AutoModel.from_pretrained(model_path)
        self.tokenizer = GPT2Tokenizer.from_pretrained(model_path)
        self.model = GPT2LMHeadModel.from_pretrained(model_path)

        self.model.to(self.device)
        # Inference only: eval() disables dropout.
        self.model.eval()

    def id2token(self, _id):
        """Decode a single token id to its surface string (whitespace stripped)."""
        return self.tokenizer.decode(_id, clean_up_tokenization_spaces=True).strip()

    def predict(self, text, target_word=None, n=1, external_memory=None):
        """Predict candidate next tokens for `text`.

        Returns `(candidates,)`, or `(candidates, external_memory)` when the
        external-memory optimization is enabled. `candidates` is None when
        filtering leaves nothing to pick from.
        """
        # Convert feature
        input_idxes = self.tokenizer.encode(text)
        # if self.optimize['external_memory']:
        #     input_idxes = input_idxes[-1:]
        # NOTE(review): `.repeat(1, 1)` is a no-op on a (1, seq) tensor — presumably a
        # leftover from batching experiments; confirm before removing.
        input_idxes = torch.tensor(input_idxes, device=self.device).unsqueeze(0).repeat(1, 1)

        # Prediction
        with torch.no_grad():
            # NOTE(review): `past=` is the pre-4.x transformers kwarg (later renamed
            # `past_key_values`) — confirm against the pinned transformers version.
            outputs = self.model(input_ids=input_idxes, past=external_memory)
            target_token_logits = outputs[0][0][-1]  # GPT2 only predict last token

        # Selection: apply temperature / top-k / top-p controls, then sample.
        seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p}
        target_token_logits = self.control_randomness(target_token_logits, seed)
        target_token_logits, target_token_idxes = self.filtering(target_token_logits, seed)
        if len(target_token_idxes) != 0:
            results = self.pick(target_token_logits, target_token_idxes, target_word=target_word, n=n)
        else:
            results = None

        results = (results,)
        # With external memory on, also hand back the past activations so the
        # caller can feed them into the next predict() call.
        if self.optimize['external_memory']:
            external_memory = outputs[1]
            results += (external_memory,)

        return results
58 |
--------------------------------------------------------------------------------
/nlpaug/util/doc/doc.py:
--------------------------------------------------------------------------------
1 | from nlpaug.util.doc.token import Token
2 | from nlpaug.util.doc.change_log import ChangeLog
3 |
4 |
class Doc:
    """Holds a tokenized document plus a per-token change log for augmentation."""

    def __init__(self, doc='', tokens=None):
        self.doc = doc
        if tokens is not None and len(tokens) > 0:
            self.tokens = self.token2obj(tokens)
        else:
            self.tokens = []
        self.changed_cnt = 0  # number of change-log entries recorded so far

    def token2obj(self, tokens):
        """Wrap raw token strings into ChangeLog objects carrying absolute start positions."""
        change_logs = []
        cursor = 0
        for raw_token in tokens:
            # Search from the cursor so repeated tokens resolve to successive occurrences.
            token_obj = Token(token=raw_token, start_pos=cursor + self.doc[cursor:].find(raw_token))
            change_logs.append(ChangeLog(orig_token=token_obj))
            cursor += len(token_obj.token) + 1  # TODO: the +1 separator is for textual input only
        return change_logs

    def add_token(self, idx, token, action, change_seq):
        """Insert a brand-new token (no original position) at `idx`."""
        new_token = Token(token=token, start_pos=-1, action=action, change_seq=change_seq)
        self.tokens.insert(idx, ChangeLog(orig_token=new_token))

    def add_change_log(self, idx, new_token, action, change_seq):
        """Record a new change for the token at `idx`."""
        self.changed_cnt += 1
        self.tokens[idx].add(new_token, action=action, change_seq=change_seq)

    def update_change_log(self, token_idx, change_idx=None, token=None, action=None, change_seq=None):
        """Update an existing change entry; defaults to the entry at the log's size."""
        if change_idx is None:
            change_idx = self.tokens[token_idx].size()
        self.tokens[token_idx].update(change_idx, token=token, action=action, change_seq=change_seq)

    def get_token(self, idx):
        return self.tokens[idx]

    def get_original_tokens(self):
        """Return the unmodified token strings."""
        return [change_log.orig_token.token for change_log in self.tokens]

    def get_augmented_tokens(self):
        """Return the latest token strings, skipping tokens emptied by augmentation."""
        latest_tokens = (change_log.get_latest_token().token for change_log in self.tokens)
        return [token for token in latest_tokens if len(token) > 0]

    def size(self):
        return len(self.tokens)

    def changed_count(self):
        return self.changed_cnt

    def get_change_logs(self, start_pos=0):
        """Refresh final token positions, then return changed entries ordered by change sequence."""
        cursor = start_pos
        for change_log in self.tokens:
            change_log.update_last_token(cursor)

            cursor += len(change_log.get_latest_token().token)
            if len(change_log.get_latest_token().token) > 0:
                cursor += 1  # TODO: separator spacing is for textual input only

        changed = [change_log for change_log in self.tokens if change_log.is_changed()]
        changed.sort(key=lambda entry: entry.get_latest_token().change_seq)
        return [entry.to_changed_dict() for entry in changed]
67 |
--------------------------------------------------------------------------------
/test/augmenter/audio/test_pitch.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.audio as naa
7 | from nlpaug.util import AudioLoader
8 |
9 |
class TestPitch(unittest.TestCase):
    """Tests for naa.PitchAug (pitch-shift augmentation)."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_substitute(self):
        aug = naa.PitchAug(sampling_rate=self.sampling_rate)
        augmented_audio = aug.augment(self.audio)

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def _assert_segment_augmented(self, aug, augmented_audio, zone, coverage, decimal):
        # The output must equal the original outside [start_pos, end_pos) with the
        # stored aug_data spliced in; rounding tolerates float32 casting noise.
        reconstruct_augmented_audio = np.concatenate(
            (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:])
            , axis=0).astype(np.float32)

        augmented_audio = np.round(augmented_audio, decimals=decimal)
        reconstruct_augmented_audio = np.round(reconstruct_augmented_audio, decimals=decimal)

        self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio))

        # Fix: the original `assertTrue(len(...), expected)` treated the expected
        # size as a message and never compared; compare for real (delta covers
        # integer truncation in the segment-size arithmetic).
        expected_size = int(len(self.audio) * (zone[1] - zone[0]) * coverage)
        self.assertAlmostEqual(len(aug.model.aug_data), expected_size, delta=1)

    def test_coverage(self):
        zone = (0.3, 0.7)
        coverage = 0.1
        decimal = 8

        # Fix: this is TestPitch but the original constructed naa.MaskAug here
        # (copy-paste from the mask tests); exercise PitchAug instead.
        aug = naa.PitchAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage)
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        self._assert_segment_augmented(aug, augmented_audio, zone, coverage, decimal)

    def test_zone(self):
        zone = (0, 1)
        coverage = 1.
        decimal = 8

        # Fix: same MaskAug -> PitchAug copy-paste correction as test_coverage.
        aug = naa.PitchAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage)
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        self._assert_segment_augmented(aug, augmented_audio, zone, coverage, decimal)
--------------------------------------------------------------------------------
/test/flow/test_sometimes.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import nlpaug.augmenter.char as nac
4 | import nlpaug.flow as naf
5 | from nlpaug.util import Action
6 |
7 |
class TestSometimes(unittest.TestCase):
    """Tests for the Sometimes flow, which runs its pipeline probabilistically."""

    def test_dry_run(self):
        flow = naf.Sometimes()
        results = flow.augment([])
        self.assertEqual(0, len(results))

    def test_single_action(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#'
        ]

        # The pipeline may skip augmentation by chance, so retry up to 5 times.
        changed = False
        for _ in range(5):
            flow = naf.Sometimes([nac.RandomCharAug(action=Action.INSERT)], pipeline_p=0.6)
            for text in texts:
                if text != flow.augment(text):
                    changed = True
                self.assertLess(0, len(text))

            if changed:
                break

        self.assertTrue(changed)
        self.assertLess(0, len(texts))

    def test_multiple_actions(self):
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sometimes([nac.RandomCharAug(action=Action.INSERT),
                           nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)],
                          pipeline_p=0.8),
            naf.Sometimes(
                [nac.OcrAug(), nac.KeyboardAug(aug_char_min=1),
                 nac.RandomCharAug(action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6),
                 nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)],
                pipeline_p=0.6)
        ]

        # Same retry logic per flow: at least one augmentation must change the input.
        for flow in flows:
            changed = False
            for _ in range(5):
                for text in texts:
                    self.assertLess(0, len(text))
                    if text != flow.augment(text):
                        changed = True

                if changed:
                    break

            self.assertTrue(changed)

        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
74 |
75 |
--------------------------------------------------------------------------------
/nlpaug/model/lang_models/distilbert.py:
--------------------------------------------------------------------------------
1 | try:
2 | import torch
3 | from transformers import DistilBertTokenizer, DistilBertForMaskedLM
4 | # from transformers import AutoModel, AutoTokenizer
5 | except ImportError:
6 | # No installation required if not using this function
7 | pass
8 |
9 | from nlpaug.model.lang_models import LanguageModels
10 | from nlpaug.util.selection.filtering import *
11 |
12 |
class DistilBert(LanguageModels):
    # https://arxiv.org/pdf/1910.01108.pdf
    START_TOKEN = '[CLS]'
    SEPARATOR_TOKEN = '[SEP]'
    MASK_TOKEN = '[MASK]'
    SUBWORD_PREFIX = '##'

    def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
        """Masked-LM wrapper around HuggingFace DistilBERT for [MASK] filling."""
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = DistilBertTokenizer.from_pretrained(model_path)
        self.model = DistilBertForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()  # inference only; disables dropout

    def id2token(self, _id):
        """Convert a single integer token id back to its string token."""
        return self.tokenizer.convert_ids_to_tokens([_id])[0]

    def is_skip_candidate(self, candidate):
        # Word-piece continuations are not valid whole-word substitutions.
        return candidate[:2] == self.SUBWORD_PREFIX

    def predict(self, text, target_word=None, n=1):
        """Return up to `n` candidate tokens for the [MASK] position in `text`."""
        # Build the token sequence with special tokens and locate the mask slot.
        token_seq = self.tokenizer.tokenize(text)
        token_seq.insert(0, self.START_TOKEN)
        token_seq.append(self.SEPARATOR_TOKEN)
        mask_pos = token_seq.index(self.MASK_TOKEN)

        input_ids = self.tokenizer.convert_tokens_to_ids(token_seq)
        attention_mask = [1] * len(input_ids)  # 1: real token, 0: padding token

        # Convert to tensors on the target device.
        input_ids = torch.tensor([input_ids]).to(self.device)
        attention_mask = torch.tensor([attention_mask]).to(self.device)

        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)
            logits = outputs[0][0][mask_pos]

        # Apply temperature / top-k / top-p controls, then sample candidates.
        seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p}
        logits = self.control_randomness(logits, seed)
        logits, candidate_idxes = self.filtering(logits, seed)

        picked = None
        if len(candidate_idxes) != 0:
            picked = self.pick(logits, candidate_idxes, target_word=target_word, n=n)

        return (picked,)
71 |
--------------------------------------------------------------------------------
/nlpaug/model/lang_models/bert.py:
--------------------------------------------------------------------------------
1 | try:
2 | import torch
3 | from transformers import BertTokenizer, BertForMaskedLM
4 | # from transformers import AutoModel, AutoTokenizer
5 | except ImportError:
6 | # No installation required if not using this function
7 | pass
8 |
9 | from nlpaug.model.lang_models import LanguageModels
10 | from nlpaug.util.selection.filtering import *
11 |
12 |
class Bert(LanguageModels):
    # https://arxiv.org/pdf/1810.04805.pdf
    START_TOKEN = '[CLS]'
    SEPARATOR_TOKEN = '[SEP]'
    MASK_TOKEN = '[MASK]'
    SUBWORD_PREFIX = '##'

    def __init__(self, model_path='bert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'):
        """Masked-LM wrapper around HuggingFace BERT for [MASK] filling."""
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        self.tokenizer = BertTokenizer.from_pretrained(model_path)
        self.model = BertForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()  # inference only; disables dropout

    def id2token(self, _id):
        """Convert a single integer token id back to its string token."""
        return self.tokenizer.convert_ids_to_tokens([_id])[0]

    def is_skip_candidate(self, candidate):
        # Word-piece continuations are not valid whole-word substitutions.
        return candidate[:2] == self.SUBWORD_PREFIX

    def predict(self, text, target_word=None, n=1):
        """Return up to `n` candidate tokens for the [MASK] position in `text`."""
        # Build the token sequence with special tokens and locate the mask slot.
        token_seq = self.tokenizer.tokenize(text)
        token_seq.insert(0, self.START_TOKEN)
        token_seq.append(self.SEPARATOR_TOKEN)
        mask_pos = token_seq.index(self.MASK_TOKEN)

        input_ids = self.tokenizer.convert_tokens_to_ids(token_seq)
        segment_ids = [0] * len(input_ids)  # single-sentence input: everything is segment 0
        attention_mask = [1] * len(input_ids)  # 1: real token, 0: padding token

        # Convert to tensors on the target device.
        input_ids = torch.tensor([input_ids]).to(self.device)
        segment_ids = torch.tensor([segment_ids]).to(self.device)
        attention_mask = torch.tensor([attention_mask]).to(self.device)

        # Forward pass without gradient tracking.
        with torch.no_grad():
            outputs = self.model(input_ids=input_ids, token_type_ids=segment_ids, attention_mask=attention_mask)
            logits = outputs[0][0][mask_pos]

        # Apply temperature / top-k / top-p controls, then sample candidates.
        seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p}
        logits = self.control_randomness(logits, seed)
        logits, candidate_idxes = self.filtering(logits, seed)

        picked = None
        if len(candidate_idxes) != 0:
            picked = self.pick(logits, candidate_idxes, target_word=target_word, n=n)

        return (picked,)
73 |
--------------------------------------------------------------------------------
/test/augmenter/audio/test_mask.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.audio as naa
7 | from nlpaug.util import AudioLoader
8 |
9 |
class TestMask(unittest.TestCase):
    """Tests for naa.MaskAug (segment masking augmentation)."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        audio = np.array([])
        aug = naa.MaskAug(sampling_rate=44100)
        augmented_audio = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_audio))

    def test_with_noise(self):
        aug = naa.MaskAug(sampling_rate=self.sampling_rate, mask_with_noise=True)
        augmented_audio = aug.augment(self.audio)

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def test_without_noise(self):
        aug = naa.MaskAug(sampling_rate=self.sampling_rate, mask_with_noise=False)
        augmented_audio = aug.augment(self.audio)

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def _assert_segment_augmented(self, aug, augmented_audio, zone, coverage):
        # The output must equal the original outside [start_pos, end_pos) with
        # the stored aug_data spliced in.
        reconstruct_augmented_audio = np.concatenate(
            (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]), axis=0)
        self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio))

        # Fix: the original `assertTrue(len(...), expected)` treated the expected
        # size as a message and never compared; compare for real (delta covers
        # integer truncation in the segment-size arithmetic).
        expected_size = int(len(self.audio) * (zone[1] - zone[0]) * coverage)
        self.assertAlmostEqual(len(aug.model.aug_data), expected_size, delta=1)

    def test_coverage(self):
        zone = (0.3, 0.7)
        coverage = 0.1

        aug = naa.MaskAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage, mask_with_noise=False)
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        self._assert_segment_augmented(aug, augmented_audio, zone, coverage)

    def test_zone(self):
        zone = (0, 1)
        coverage = 1.

        aug = naa.MaskAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage, mask_with_noise=False)
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        self._assert_segment_augmented(aug, augmented_audio, zone, coverage)
--------------------------------------------------------------------------------
/example/tfidf-train_model.ipynb:
--------------------------------------------------------------------------------
1 | {
2 | "cells": [
3 | {
4 | "cell_type": "code",
5 | "execution_count": 1,
6 | "metadata": {},
7 | "outputs": [],
8 | "source": [
9 | "import os\n",
10 | "os.environ[\"MODEL_DIR\"] = '../model'"
11 | ]
12 | },
13 | {
14 | "cell_type": "code",
15 | "execution_count": 2,
16 | "metadata": {},
17 | "outputs": [
18 | {
19 | "name": "stdout",
20 | "output_type": "stream",
21 | "text": [
22 | "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n",
23 | "--------------------\n",
24 | "Original Input:The quick brown fox jumps over the lazy dog\n",
25 | "Agumented Output:The quick U_RF F9F9F9F9F3W2TM jumps over the lazy dog\n",
26 | "--------------------\n",
27 | "Original Input:asdasd test apple dog asd asd\n",
28 | "Augmented Output:asdasd test apple dog asd 5hd\n"
29 | ]
30 | }
31 | ],
32 | "source": [
33 | "import sklearn.datasets\n",
34 | "import re\n",
35 | "\n",
36 | "import nlpaug.augmenter.word as naw\n",
37 | "import nlpaug.model.word_stats as nmw\n",
38 | "\n",
39 | "def _tokenizer(text, token_pattern=r\"(?u)\\b\\w\\w+\\b\"):\n",
40 | " token_pattern = re.compile(token_pattern)\n",
41 | " return token_pattern.findall(text)\n",
42 | "\n",
43 | "# Load sample data\n",
44 | "train_data = sklearn.datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\n",
45 | "train_x = train_data.data\n",
46 | "\n",
47 | "# Tokenize input\n",
48 | "train_x_tokens = [_tokenizer(x) for x in train_x]\n",
49 | "\n",
50 | "# Train TF-IDF model\n",
51 | "tfidf_model = nmw.TfIdf()\n",
52 | "tfidf_model.train(train_x_tokens)\n",
53 | "tfidf_model.save('.')\n",
54 | "\n",
55 | "# Load TF-IDF augmenter\n",
56 | "aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)\n",
57 | "\n",
58 | "texts = [\n",
59 | " 'The quick brown fox jumps over the lazy dog',\n",
60 | " 'asdasd test apple dog asd asd'\n",
61 | "]\n",
62 | "\n",
63 | "for text in texts:\n",
64 | " augmented_text = aug.augment(text)\n",
65 | " \n",
66 | " print('-'*20)\n",
67 | " print('Original Input:{}'.format(text))\n",
68 | " print('Augmented Output:{}'.format(augmented_text))"
69 | ]
70 | },
71 | {
72 | "cell_type": "code",
73 | "execution_count": null,
74 | "metadata": {},
75 | "outputs": [],
76 | "source": []
77 | }
78 | ],
79 | "metadata": {
80 | "kernelspec": {
81 | "display_name": "Python 3",
82 | "language": "python",
83 | "name": "python3"
84 | },
85 | "language_info": {
86 | "codemirror_mode": {
87 | "name": "ipython",
88 | "version": 3
89 | },
90 | "file_extension": ".py",
91 | "mimetype": "text/x-python",
92 | "name": "python",
93 | "nbconvert_exporter": "python",
94 | "pygments_lexer": "ipython3",
95 | "version": "3.6.4"
96 | }
97 | },
98 | "nbformat": 4,
99 | "nbformat_minor": 2
100 | }
101 |
--------------------------------------------------------------------------------
/nlpaug/model/lang_models/roberta.py:
--------------------------------------------------------------------------------
1 | try:
2 | import torch
3 | from transformers import RobertaTokenizer, RobertaForMaskedLM
4 | # from transformers import AutoModel, AutoTokenizer # Thrown error when using nucleus sampling
5 | except ImportError:
6 | # No installation required if not using this function
7 | pass
8 |
9 | from nlpaug.model.lang_models import LanguageModels
10 | from nlpaug.util.selection.filtering import *
11 |
12 |
class Roberta(LanguageModels):
    # RoBERTa: https://arxiv.org/pdf/1907.11692.pdf
    # Bug fix: these special-token constants were empty strings in the original
    # (the angle-bracket tokens were stripped), which makes
    # `tokens.index(self.MASK_TOKEN)` fail. Restored to RoBERTa's documented
    # special tokens.
    START_TOKEN = '<s>'
    SEPARATOR_TOKEN = '</s>'
    MASK_TOKEN = '<mask>'
    SUBWORD_PREFIX = 'Ġ'  # BPE marker: token starts a new word

    def __init__(self, model_path='roberta-base', temperature=1.0, top_k=None, top_p=None, device='cuda'):
        """
        :param str model_path: Hugging Face model name or local checkpoint path.
        :param float temperature: Softmax temperature applied before sampling.
        :param int top_k: Keep only the top-k logits (None disables top-k filtering).
        :param float top_p: Nucleus (top-p) filtering threshold (None disables it).
        :param str device: Torch device the model runs on, e.g. 'cuda' or 'cpu'.
        """
        super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p)
        self.model_path = model_path

        # AutoTokenizer/AutoModel raised errors when using nucleus sampling, so
        # the concrete Roberta classes are used instead.
        self.tokenizer = RobertaTokenizer.from_pretrained(model_path)
        self.model = RobertaForMaskedLM.from_pretrained(model_path)

        self.model.to(self.device)
        self.model.eval()  # inference only: disables dropout

    def id2token(self, _id):
        """Convert a single integer token id back to its token string."""
        return self.tokenizer.convert_ids_to_tokens([_id])[0]

    def is_skip_candidate(self, candidate):
        # No candidate is filtered out for RoBERTa.
        return False

    def predict(self, text, target_word=None, n=1):
        """Return up to `n` candidate substitutions for the masked position in `text`.

        :param str text: Input text containing exactly one MASK_TOKEN occurrence.
        :param str target_word: Original word; passed through to `pick` for filtering.
        :param int n: Number of candidates to draw.
        :return: Tuple containing a list of (token, score) pairs, or (None,) when
            filtering leaves no candidates.
        """
        # Prepare inputs: wrap the tokenized text in start/separator special tokens.
        tokens = self.tokenizer.tokenize(text)

        tokens.insert(0, self.START_TOKEN)
        tokens.append(self.SEPARATOR_TOKEN)
        target_pos = tokens.index(self.MASK_TOKEN)

        token_inputs = self.tokenizer.convert_tokens_to_ids(tokens)
        segment_inputs = [0] * len(token_inputs)
        mask_inputs = [1] * len(token_inputs)  # 1: real token, 0: padding token

        # Convert to tensors on the target device.
        token_inputs = torch.tensor([token_inputs]).to(self.device)
        segment_inputs = torch.tensor([segment_inputs]).to(self.device)
        mask_inputs = torch.tensor([mask_inputs]).to(self.device)

        # Prediction
        with torch.no_grad():
            outputs = self.model(input_ids=token_inputs, token_type_ids=segment_inputs, attention_mask=mask_inputs)
        target_token_logits = outputs[0][0][target_pos]

        # Selection: temper the logits, then filter by top-k/top-p before picking.
        seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p}
        target_token_logits = self.control_randomness(target_token_logits, seed)
        target_token_logits, target_token_idxes = self.filtering(target_token_logits, seed)
        if len(target_token_idxes) != 0:
            results = self.pick(target_token_logits, target_token_idxes, target_word=target_word, n=n)
            # Strip the subword prefix; map a bare separator token to '.'.
            results = [(r[0].replace(self.SUBWORD_PREFIX, ''), r[1]) if r[0] != self.SEPARATOR_TOKEN
                       else ('.', r[1]) for r in results]
        else:
            results = None

        results = (results,)

        return results
--------------------------------------------------------------------------------
/test/augmenter/test_augmenter.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.char as nac
7 | import nlpaug.augmenter.word as naw
8 | import nlpaug.augmenter.sentence as nas
9 | import nlpaug.augmenter.audio as naa
10 | from nlpaug.util.audio import AudioLoader
11 |
12 |
class TestWordNet(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Load the .env config, the shared sample audio, and the augmenter pools."""
        config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '.env'))
        load_dotenv(config_path)

        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

        cls.textual_augs = [
            nac.RandomCharAug(),
            naw.ContextualWordEmbsAug(),
            nas.ContextualWordEmbsForSentenceAug()
        ]

        cls.audio_augs = [
            naa.CropAug(sampling_rate=cls.sampling_rate),
            naa.SpeedAug(),
        ]

    def test_textual_augmenter_n_output(self):
        """Each textual augmenter should emit several outputs, all differing from the input."""
        source = 'The quick brown fox jumps over the lazy dog'
        n = 3
        for augmenter in self.textual_augs:
            outputs = augmenter.augment(source, n=n)
            self.assertGreater(len(outputs), 1)
            for candidate in outputs:
                self.assertNotEqual(candidate, source)

    def test_textual_augmenter_n_output_thread(self):
        """Multi-threaded augments() should also emit several distinct outputs."""
        source = 'The quick brown fox jumps over the lazy dog'
        n = 3
        for augmenter in self.textual_augs:
            outputs = augmenter.augments([source] * 2, n=n, num_thread=n)
            self.assertGreater(len(outputs), 1)
            for candidate in outputs:
                self.assertNotEqual(candidate, source)

    def test_multiprocess_gpu(self):
        """Multi-threaded augmentation on a GPU device should return distinct outputs."""
        source = 'The quick brown fox jumps over the lazy dog'
        n = 3
        augmenter = naw.ContextualWordEmbsAug(force_reload=True, device='cuda')

        outputs = augmenter.augment(source, n=n, num_thread=n)
        self.assertGreater(len(outputs), 1)
        for candidate in outputs:
            self.assertNotEqual(candidate, source)

    def test_audio_augmenter_n_output(self):
        """Each audio augmenter should emit several outputs, all differing from the input."""
        n = 3
        for augmenter in self.audio_augs:
            outputs = augmenter.augment(self.audio, n=n)
            self.assertGreater(len(outputs), 1)
            for candidate in outputs:
                self.assertFalse(np.array_equal(candidate, self.audio))

    def test_audio_augmenter_n_output_thread(self):
        """Multi-threaded audio augments() should emit outputs differing from the input."""
        n = 3
        for augmenter in self.audio_augs:
            outputs = augmenter.augments([self.audio] * 2, n=n, num_thread=n)
            self.assertGreater(len(outputs), 1)
            for candidate in outputs:
                self.assertFalse(np.array_equal(candidate, self.audio))
79 |
--------------------------------------------------------------------------------
/test/augmenter/word/test_random_word.py:
--------------------------------------------------------------------------------
1 | import unittest
2 |
3 | import nlpaug.augmenter.word as naw
4 |
5 |
class TestRandom(unittest.TestCase):
    def test_swap(self):
        """Repeated swapping must permute tokens without changing their frequencies."""
        aug = naw.RandomWordAug(action="swap")

        for text in ['The quick brown fox jumps over the lazy dog']:
            tokens = text.lower().split(' ')
            expected_freq = {tok: tokens.count(tok) for tok in tokens}

            augmented_text = text
            # https://github.com/makcedward/nlpaug/issues/77
            for _ in range(10):
                augmented_text = aug.augment(augmented_text)

            swapped_tokens = augmented_text.lower().split(' ')
            actual_freq = {tok: swapped_tokens.count(tok) for tok in tokens}

            # Swapping reorders tokens; every original token keeps its count.
            for token, freq in expected_freq.items():
                self.assertIn(token, actual_freq)
                self.assertEqual(actual_freq[token], freq)

            self.assertNotEqual(text, augmented_text)

    def test_substitute_without_target_word(self):
        """Default substitution replaces words with the '_' placeholder."""
        aug = naw.RandomWordAug(action='substitute')

        for text in ['The quick brown fox jumps over the lazy dog']:
            augmented_text = aug.augment(text)

            self.assertIn('_', augmented_text)
            self.assertNotEqual(text, augmented_text)

    def test_substitute_with_target_word(self):
        """Substitution with target_words must use one of the supplied replacements."""
        target_words = ['$', '#', '^^^']
        aug = naw.RandomWordAug(action='substitute', target_words=target_words)

        for text in ['The quick brown fox jumps over the lazy dog']:
            augmented_text = aug.augment(text)

            self.assertTrue(any(word in augmented_text for word in target_words))
            self.assertNotEqual(text, augmented_text)

    def test_delete(self):
        """Default action (delete) must change the input text."""
        aug = naw.RandomWordAug()

        for text in ['The quick brown fox jumps over the lazy dog']:
            self.assertNotEqual(text, aug.augment(text))

    # https://github.com/makcedward/nlpaug/issues/76
    def test_swap_one_token(self):
        """Swapping a single-token input is a no-op."""
        aug = naw.RandomWordAug(action='swap')

        for text in ['The']:
            self.assertEqual(text, aug.augment(text))

    # https://github.com/makcedward/nlpaug/issues/76
    def test_delete_one_token(self):
        """Deleting from a single-token input is a no-op."""
        aug = naw.RandomWordAug(action='delete')

        for text in ['The']:
            self.assertEqual(text, aug.augment(text))
98 |
--------------------------------------------------------------------------------
/nlpaug/model/char/keyboard.py:
--------------------------------------------------------------------------------
1 | import re
2 | import os
3 | import json
4 |
5 | from nlpaug.model.char import Character
6 |
7 |
class Keyboard(Character):
    """Character model mapping each key to its distance-1 keyboard neighbours."""

    def __init__(self, special_char=True, numeric=True, upper_case=True, cache=True, lang="en", model_path=None):
        super().__init__(cache)

        self.model_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), '..', '..', 'res', 'char', 'keyboard')

        self.special_char = special_char
        self.numeric = numeric
        self.upper_case = upper_case
        self.lang = lang
        self.model_path = model_path
        self.model = self.get_model(
            model_path=model_path,
            model_dir=self.model_dir, special_char=special_char, numeric=numeric, upper_case=upper_case, lang=lang)

    def predict(self, data):
        """Return the list of neighbouring characters for the key `data`."""
        return self.model[data]

    # TODO: Extending to 2 keyboard distance
    @classmethod
    def get_model(cls, model_path, model_dir, special_char=True, numeric=True, upper_case=True, lang="en"):
        """Load a keyboard-neighbour JSON mapping and filter it by character class.

        When a custom `model_path` is supplied, `lang` is ignored.
        """
        if model_path is None:
            if lang not in ['en', 'th']:
                raise ValueError('Only support en and th now. You may provide the keyboard mapping '
                                 'such that we can support "{}"'.format(lang))
            model_path = os.path.join(model_dir, lang + '.json')

        if not os.path.exists(model_path):
            raise ValueError('The model_path does not exist. Please check "{}"'.format(model_path))

        with open(model_path, encoding="utf8") as f:
            mapping = json.load(f)

        candidates = {}
        for key, neighbours in mapping.items():
            # Drop numeric keys when numeric support is disabled.
            if not numeric and re.match("^[0-9]*$", key):
                continue
            # Drop special-character keys when special_char support is disabled.
            if not special_char and not re.match("^[a-z0-9]*$", key):
                continue

            candidates[key] = []
            candidates[key.upper()] = []

            for neighbour in neighbours:
                # Apply the same numeric / special-character filters to values.
                if not numeric and re.match("^[0-9]*$", neighbour):
                    continue
                if not special_char and not re.match("^[a-z0-9]*$", neighbour):
                    continue

                candidates[key].append(neighbour)

                if upper_case:
                    candidates[key].append(neighbour.upper())
                    candidates[key.upper()].append(neighbour)
                    candidates[key.upper()].append(neighbour.upper())

        cleaned = {}
        for key, neighbours in candidates.items():
            # Drop keys whose neighbour list ended up empty after filtering.
            if len(neighbours) == 0:
                continue

            # Remove self-mappings, then de-duplicate with a stable sorted order.
            cleaned[key] = sorted(set(n for n in neighbours if n != key))

        return cleaned
86 |
--------------------------------------------------------------------------------
/nlpaug/augmenter/word/split.py:
--------------------------------------------------------------------------------
1 | # Source: https://arxiv.org/pdf/1812.05271v1.pdf
2 |
3 | """
4 | Augmenter that apply word splitting operation to textual input.
5 | """
6 |
7 | from nlpaug.augmenter.word import WordAugmenter
8 | from nlpaug.util import Action, Doc
9 |
10 |
class SplitAug(WordAugmenter):
    """
    Augmenter that apply word splitting for augmentation.

    :param float aug_p: Percentage of word will be augmented.
    :param int aug_min: Minimum number of word will be augmented.
    :param int aug_max: Maximum number of word will be augmented. If None is passed, number of augmentation is
        calculated via aug_p. If calculated result from aug_p is smaller than aug_max, will use calculated result from
        aug_p. Otherwise, using aug_max.
    :param int min_char: If word less than this value, do not draw word for augmentation
    :param list stopwords: List of words which will be skipped from augment operation.
    :param str stopwords_regex: Regular expression for matching words which will be skipped from augment operation.
    :param func tokenizer: Customize tokenization process
    :param func reverse_tokenizer: Customize reverse of tokenization process
    :param bool include_detail: Change detail will be returned if it is True.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.word as naw
    >>> aug = naw.SplitAug()
    """

    def __init__(self, name='Split_Aug', aug_min=1, aug_max=10, aug_p=0.3, min_char=4, stopwords=None,
                 tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, include_detail=False, verbose=0):
        super().__init__(
            action=Action.SPLIT, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, stopwords=stopwords,
            tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, device='cpu', verbose=verbose,
            stopwords_regex=stopwords_regex, include_detail=include_detail)

        self.min_char = min_char

    def skip_aug(self, token_idxes, tokens):
        """Keep only indexes of tokens long enough (>= min_char) to be split."""
        return [idx for idx in token_idxes if len(tokens[idx]) >= self.min_char]

    def split(self, data):
        """Split randomly-chosen tokens into two tokens at a random position."""
        change_seq = 0
        doc = Doc(data, self.tokenizer(data))

        aug_idxes = self._get_aug_idxes(doc.get_original_tokens())

        # Bug fix: guard against None/empty BEFORE sorting. The original called
        # aug_idxes.sort(...) first, which raises AttributeError when
        # _get_aug_idxes returns None (no token drawn).
        if aug_idxes is None or len(aug_idxes) == 0:
            if self.include_detail:
                return data, []
            return data

        # Process from the back so earlier indexes stay valid as tokens are inserted.
        aug_idxes.sort(reverse=True)

        for aug_idx in aug_idxes:
            target_token = doc.get_token(aug_idx).get_latest_token().token
            separate_pos = self.sample(len(target_token), 1)
            prev_token = target_token[:separate_pos]
            next_token = target_token[separate_pos:]

            change_seq += 1
            # Record the tail as a change to the existing token, then insert the
            # head as a new token in front of it.
            doc.add_change_log(aug_idx, new_token=next_token, action=Action.SPLIT,
                               change_seq=self.parent_change_seq + change_seq)
            doc.add_token(aug_idx, token=prev_token, action=Action.SPLIT,
                          change_seq=self.parent_change_seq + change_seq)

        if self.include_detail:
            return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs()
        else:
            return self.reverse_tokenizer(doc.get_augmented_tokens())
76 |
--------------------------------------------------------------------------------
/test/augmenter/audio/test_noise.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 | from dotenv import load_dotenv
5 |
6 | import nlpaug.augmenter.audio as naa
7 | from nlpaug.util import AudioLoader
8 |
9 |
class TestNoise(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        """Load the sample beat and a pink-noise clip used as background noise."""
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        # https://en.wikipedia.org/wiki/Colors_of_noise
        cls.noise_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Pink_noise.ogg'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)
        cls.noise, cls.noise_sampling_rate = AudioLoader.load_audio(cls.noise_wav_file)

    def test_empty_input(self):
        """Empty audio passes through unchanged."""
        audio = np.array([])
        aug = naa.NoiseAug()
        augmented_audio = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_audio))

    def test_substitute(self):
        """Default (white) noise injection changes samples but keeps the length."""
        aug = naa.NoiseAug()
        augmented_audio = aug.augment(self.audio)

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        # Bug fix: was assertTrue(a, b), which treats `b` as a message and always
        # passes. Noise injection is additive, so the length must be preserved.
        self.assertEqual(len(self.audio), len(augmented_audio))
        self.assertGreater(self.sampling_rate, 0)

    def test_color_noise(self):
        """Every supported noise color changes samples and preserves the length."""
        colors = naa.NoiseAug().model.COLOR_NOISES

        for color in colors:
            aug = naa.NoiseAug(color=color)
            augmented_audio = aug.augment(self.audio)

            self.assertFalse(np.array_equal(self.audio, augmented_audio))
            # Bug fix: was assertTrue(a, b) — see test_substitute.
            self.assertEqual(len(self.audio), len(augmented_audio))
            self.assertGreater(self.sampling_rate, 0)

    def test_background_noise(self):
        """Background noise works whether the noise clip is longer or shorter."""
        # noise > audio
        aug = naa.NoiseAug(noises=[self.noise])
        augmented_audio = aug.augment(self.audio)
        self.assertTrue(augmented_audio is not None)

        # audio > noise
        aug = naa.NoiseAug(noises=[self.audio])
        augmented_audio = aug.augment(self.noise)
        self.assertTrue(augmented_audio is not None)

    def test_coverage(self):
        """The injected segment size follows zone * coverage (within rounding)."""
        zone = (0.3, 0.7)
        coverage = 0.1
        expected_aug_data_size = int(len(self.audio) * (zone[1] - zone[0]) * coverage)

        # background noise
        aug = naa.NoiseAug(zone=zone, noises=[self.noise], coverage=coverage)
        aug.model.stateless = False
        aug.augment(self.audio)

        self.assertTrue(-1 <= len(aug.model.aug_data) - expected_aug_data_size <= 1)

        # colored noise
        aug = naa.NoiseAug(zone=zone, color='pink', coverage=coverage)
        aug.model.stateless = False
        aug.augment(self.audio)

        self.assertTrue(-1 <= len(aug.model.aug_data) - expected_aug_data_size <= 1)

    def test_zone(self):
        """Full zone and full coverage inject noise over (almost) the whole clip."""
        zone = (0, 1)
        coverage = 1.
        expected_aug_data_size = int(len(self.audio) * (zone[1] - zone[0]) * coverage)

        # background noise
        aug = naa.NoiseAug(zone=zone, noises=[self.noise], coverage=coverage)
        aug.model.stateless = False
        aug.augment(self.audio)

        self.assertTrue(-1 <= len(aug.model.aug_data) - expected_aug_data_size <= 1)

        # colored noise
        aug = naa.NoiseAug(zone=zone, color='pink', coverage=coverage)
        aug.model.stateless = False
        aug.augment(self.audio)

        self.assertTrue(-1 <= len(aug.model.aug_data) - expected_aug_data_size <= 1)
--------------------------------------------------------------------------------
/nlpaug/model/word_dict/ppdb.py:
--------------------------------------------------------------------------------
1 | try:
2 | import nltk
3 | from nltk.corpus import wordnet
4 | except ImportError:
5 | # No installation required if not using this function
6 | pass
7 |
8 | from nlpaug.util import PartOfSpeech
9 | from nlpaug.model.word_dict import WordDictionary
10 |
11 |
class Ppdb(WordDictionary):
    """Paraphrase dictionary backed by a PPDB flat file.

    Download: http://paraphrase.org/#/download. Parses the file once at
    construction into ``self.dict``: phrase -> part-of-speech -> paraphrase records.
    """

    def __init__(self, dict_path):
        # :param str dict_path: Path to the downloaded PPDB file to parse.
        super().__init__(cache=True)

        self.dict_path = dict_path
        self.lang = 'eng' # TODO: support other languages

        self.score_threshold = self.get_default_score_thresholds() # TODO: support other filtering
        self.is_synonym = True # TODO: antonyms

        # pos_tag() relies on nltk; fail fast if the optional import at module
        # top was skipped.
        try:
            wordnet
        except NameError:
            raise ImportError('Missed nltk library. Install it via `pip install nltk`')

        self._init()

    def _init(self):
        # Build the in-memory lookup table from the PPDB file.
        self.dict = {}
        self.read(self.dict_path)

    @classmethod
    def get_default_score_thresholds(cls):
        # Minimum scores per scoring scheme; entries below the threshold are dropped.
        return {
            'AGigaSim': 0.6
        }

    def read(self, model_path):
        """Parse a PPDB file into ``self.dict``.

        Lines are '|||'-separated: constituent ||| phrase ||| paraphrase
        [||| features ||| alignment ||| entailment].
        """
        with open(model_path, 'rb') as f:
            for line in f:
                line = line.decode('utf-8')

                # Skip lines containing what look like escaped/mojibake byte
                # sequences — presumably encoding artifacts; TODO confirm.
                if '\\ x' in line or 'xc3' in line:
                    continue

                fields = line.split('|||')
                # fields[0] is a bracketed constituent tag, e.g. "[NN]"; strip brackets.
                constituents = fields[0].strip()[1:-1].split('/')
                phrase = fields[1].strip()
                paraphrase = fields[2].strip()

                # filter multiple words
                if len(phrase.split()) != len(paraphrase.split()):
                    continue

                scores = []

                # 6 fields indicates the richer PPDB v2.0 format with features.
                if len(fields) == 6:
                    # filter equivalence word ( for PPDB v2.0 only.)
                    # entailment = fields[5].strip()
                    # if entailment == 'Equivalence' and self.is_synonym:
                    #     continue

                    features = fields[3].strip().split()
                    features = [feature for feature in features for s in self.score_threshold if
                                s in feature] # filter by scheme

                    # Keep only (scheme, score) pairs above the configured threshold.
                    for feature in features:
                        scheme, score = feature.split('=')
                        if scheme in self.score_threshold and float(score) > self.score_threshold[scheme]:
                            scores.append((scheme, score))

                # # filter by feature/ score
                # if len(scores) == 0:
                #     continue

                if phrase not in self.dict:
                    self.dict[phrase] = {}

                # Expand each constituent tag into its possible POS tags.
                part_of_speeches = [pos for con in constituents for pos in PartOfSpeech.constituent2pos(con)]

                for pos in part_of_speeches:
                    if pos not in self.dict[phrase]:
                        self.dict[phrase][pos] = []

                    self.dict[phrase][pos].append({
                        'phrase': phrase,
                        'part_of_speech': pos,
                        'synonym': paraphrase,
                        'scores': scores
                    })

    def predict(self, word, pos=None):
        """Return paraphrase candidates for `word`.

        When `pos` is None, candidates across all recorded POS tags are returned;
        otherwise only candidates stored under that POS. Unknown words yield [].
        """
        if pos is None:
            candidates = []
            if word not in self.dict:
                return candidates

            for pos in self.dict[word]:
                for candidate in self.dict[word][pos]:
                    candidates.append(candidate['synonym'])

            return candidates

        if word in self.dict and pos in self.dict[word]:
            return [candidate['synonym'] for candidate in self.dict[word][pos]]

        return []

    def pos_tag(self, tokens):
        """POS-tag `tokens` with nltk's default tagger."""
        return nltk.pos_tag(tokens)
113 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/noise.py:
--------------------------------------------------------------------------------
1 | import random
2 | import numpy as np
3 | import math
4 |
5 | from nlpaug.model.audio import Audio
6 |
7 |
class Noise(Audio):
    COLOR_NOISES = ['white', 'pink', 'red', 'brown', 'brownian', 'blue', 'azure', 'violet', 'purple']

    def __init__(self, zone=(0.2, 0.8), coverage=1.,
                 color='white', noises=None, stateless=True):
        """Noise-injection model: mixes colored or background noise into audio.

        :param tuple zone: Fraction range of the clip eligible for augmentation.
            The default (0.2, 0.8) leaves the first and last 20% untouched.
        :param float coverage: Portion (0..1) of the zone that is augmented.
            E.g. for a 60s clip with zone (0.2, 0.8) and coverage 0.7,
            (0.8-0.2)*0.7*60 = 42 seconds are augmented.
        :param str color: Noise color: one of COLOR_NOISES or 'random' (a color
            is then drawn on every augment call).
        :param list noises: Optional background noises (list of numpy arrays);
            one is picked randomly per call. When provided, `color` is ignored.
        """
        super().__init__(zone=zone, coverage=coverage, stateless=stateless)

        self.color = color
        self.noises = noises

    def validate(self):
        """Reject unsupported noise colors early."""
        supported = self.COLOR_NOISES + ['random']
        if self.color not in supported:
            raise ValueError('Only support {} while `{}` is passed'.format(supported, self.color))

    def color_noise(self, segment_size):
        """Generate `segment_size` samples of colored noise.

        Shapes white Gaussian noise in the frequency domain
        (https://en.wikipedia.org/wiki/Colors_of_noise) and inverse-FFTs it back.
        Returns (noise, spectral_shape).
        """
        is_odd = segment_size % 2
        n_bins = segment_size // 2 + 1 + is_odd
        spectrum = np.random.randn(n_bins)
        shape = np.linspace(1, n_bins, n_bins)

        if self.color == 'random':
            color = np.random.choice(self.COLOR_NOISES)
        else:
            color = self.color

        if color != 'white':
            if color == 'pink':
                shape = shape ** (-1)   # ~ 1/f
            elif color in ('red', 'brown', 'brownian'):
                shape = shape ** (-2)   # ~ 1/f^2
            elif color in ('violet', 'purple'):
                shape = shape ** 2      # ~ f^2
            # 'blue'/'azure' keep the linear ramp (~ f)
            spectrum = spectrum * shape

        if is_odd:
            spectrum = spectrum[:-1]

        return np.fft.irfft(spectrum), shape

    def background_noise(self, segment_size):
        """Pick one configured background noise and fit it to `segment_size` samples."""
        # https://arxiv.org/pdf/1608.04363.pdf
        source = random.sample(self.noises, 1)[0]

        if len(source) >= segment_size:
            return source[:segment_size]

        # Tile the shorter noise until it covers the segment, then trim.
        segment = source.copy()
        for _ in range(math.ceil(segment_size / len(source)) - 1):
            segment = np.append(segment, source)
        return segment[:segment_size]

    def manipulate(self, data):
        """Add colored or background noise to `data` within the configured zone."""
        segment_size = self.get_augmentation_segment_size(data)

        if self.noises is None:
            noise, spectral_shape = self.color_noise(segment_size)
            if not self.stateless:
                self.aug_factor = spectral_shape
        else:
            noise = self.background_noise(segment_size)

        if not self.stateless:
            self.aug_data = noise

        noise = self.pad(data, noise)
        return (data + noise).astype(type(data[0]))
98 |
--------------------------------------------------------------------------------
/test/flow/test_sequential.py:
--------------------------------------------------------------------------------
1 | import unittest
2 | import os
3 | import numpy as np
4 |
5 | import nlpaug.augmenter.char as nac
6 | import nlpaug.augmenter.word as naw
7 | import nlpaug.augmenter.spectrogram as nas
8 | import nlpaug.augmenter.audio as naa
9 | import nlpaug.flow as naf
10 | from nlpaug.util import Action, AudioLoader
11 |
12 |
class TestSequential(unittest.TestCase):
    def test_dry_run(self):
        """An empty pipeline on empty input returns an empty result."""
        flow = naf.Sequential()
        results = flow.augment([])
        self.assertEqual(0, len(results))

    def test_single_action(self):
        """A one-augmenter pipeline must change every input text."""
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#'
        ]

        flow = naf.Sequential([nac.RandomCharAug(action=Action.INSERT, min_char=1)])

        for text in texts:
            augmented_text = flow.augment(text)

            self.assertNotEqual(text, augmented_text)
            self.assertLess(0, len(text))

        self.assertLess(0, len(texts))

    def test_multiple_actions(self):
        """Chained augmenters must change every input text."""
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sequential([nac.RandomCharAug(action=Action.INSERT),
                            naw.RandomWordAug()]),
            naf.Sequential([nac.OcrAug(), nac.KeyboardAug(aug_char_min=1),
                            nac.RandomCharAug(action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6)])
        ]

        for flow in flows:
            for text in texts:
                augmented_text = flow.augment(text)

                self.assertNotEqual(text, augmented_text)
                self.assertLess(0, len(text))

            self.assertLess(0, len(texts))

        self.assertLess(0, len(flows))

    def test_spectrogram(self):
        """Each masking augmenter in the pipeline must zero out its own band."""
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

        mel_spectrogram = AudioLoader.load_mel_spectrogram(sample_wav_file, n_mels=128)

        flow = naf.Sequential([
            nas.FrequencyMaskingAug(mask_factor=50),
            nas.TimeMaskingAug(mask_factor=20),
            nas.TimeMaskingAug(mask_factor=30)])

        augmented_mel_spectrogram = flow.augment(mel_spectrogram)

        for aug in flow:
            if aug.name == 'FrequencyMasking_Aug':
                # Masked frequency row: originally dense, fully zeroed afterwards.
                self.assertEqual(len(mel_spectrogram[aug.model.f0]), np.count_nonzero(mel_spectrogram[aug.model.f0]))
                self.assertEqual(0, np.count_nonzero(augmented_mel_spectrogram[aug.model.f0]))
            elif aug.name == 'TimeMasking_Aug':
                # Masked time column: originally dense, fully zeroed afterwards.
                self.assertEqual(len(mel_spectrogram[:, aug.model.t0]),
                                 np.count_nonzero(mel_spectrogram[:, aug.model.t0]))
                self.assertEqual(0, np.count_nonzero(augmented_mel_spectrogram[:, aug.model.t0]))
            else:
                # Improvement: fail with a diagnostic instead of assertFalse(True).
                self.fail('Unexpected augmenter in flow: {}'.format(aug.name))

        self.assertTrue(len(flow) > 0)

    def test_audio(self):
        """A chained audio pipeline must change the signal and produce output."""
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

        audio, sampling_rate = AudioLoader.load_audio(sample_wav_file)

        flow = naf.Sequential([
            naa.NoiseAug(),
            naa.PitchAug(sampling_rate=sampling_rate, factor=(0.2, 1.5)),
            naa.ShiftAug(sampling_rate=sampling_rate, duration=2),
            naa.SpeedAug(factor=(1.5, 3))
        ])

        augmented_audio = flow.augment(audio)

        self.assertFalse(np.array_equal(audio, augmented_audio))
        # Bug fix: was assertTrue(len(a), len(b)), which treats the second argument
        # as a message and always passes. Equality cannot hold here because SpeedAug
        # changes the length, so assert the output is non-empty instead.
        self.assertGreater(len(augmented_audio), 0)
107 |
--------------------------------------------------------------------------------
/nlpaug/model/audio/vtlp.py:
--------------------------------------------------------------------------------
1 | import numpy as np
2 | import librosa
3 | from nlpaug.model.audio import Audio
4 |
5 |
class Vtlp(Audio):
    """Vocal Tract Length Perturbation (VTLP) model.

    Warps the frequency axis of a segment's STFT by a random factor, simulating
    speakers with different vocal tract lengths.
    Reference: https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf
    """

    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=0.1, duration=None, factor=(0.9, 1.1), fhi=4800,
                 stateless=True):
        """
        :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
        :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any
            augmentation will be applied in first 20% and last 20% of whole audio.
        :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment
            operation will be applied to target audio segment. For example, the audio duration is 60 seconds while
            zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be
            augmented.
        :param int duration: Duration of augmentation (in second). Default value is None. If value is provided,
            `coverage` value will be ignored.
        :param tuple factor: Range the random warping factor is drawn from.
        :param int fhi: Boundary frequency. Default value is 4800.
        :param bool stateless: If False, the sampled augmentation parameters (start/end position,
            factor, augmented segment) are stored on the instance after each call.
        """
        super().__init__(zone=zone, coverage=coverage, duration=duration, sampling_rate=sampling_rate,
                         stateless=stateless, factor=factor)
        self.fhi = fhi

    @classmethod
    def get_scale_factors(cls, freq_dim, sampling_rate, fhi=4800, alpha=0.9):
        """Compute the piecewise-linear warped frequency for each of `freq_dim` bins.

        Frequencies up to the boundary are scaled by `alpha`; above it, a linear
        segment maps the remainder onto [scale, sampling_rate / 2].
        """
        factors = []
        freqs = np.linspace(0, 1, freq_dim)

        scale = fhi * min(alpha, 1)
        f_boundary = scale / alpha
        half_sr = sampling_rate / 2

        for f in freqs:
            f *= sampling_rate
            if f <= f_boundary:
                factors.append(f * alpha)
            else:
                warp_freq = half_sr - (half_sr - scale) / (half_sr - scale / alpha) * (half_sr - f)
                factors.append(warp_freq)

        return np.array(factors)

    # Adapted from https://github.com/YerevaNN/Spoken-language-identification/blob/master/augment_data.py#L26
    def _manipulate(self, audio, sampling_rate, factor):
        """Warp the frequency axis of `audio`'s STFT by `factor` and resynthesize."""
        stft = librosa.core.stft(audio)
        # Fix: librosa.stft returns shape (freq_bins, time_frames). The original code
        # unpacked this as (time_dim, freq_dim) and therefore warped along the *time*
        # axis, which is not VTLP. Warp along axis 0 (frequency) instead.
        freq_dim, time_dim = stft.shape
        data_type = type(stft[0][0])

        # Fix: honor the instance's boundary frequency instead of always using the
        # hard-coded default of get_scale_factors.
        factors = self.get_scale_factors(freq_dim, sampling_rate, fhi=self.fhi, alpha=factor)
        # Normalize so the largest warped bin maps onto the last output bin.
        factors *= (freq_dim - 1) / max(factors)
        new_stft = np.zeros([freq_dim, time_dim], dtype=data_type)

        for i in range(freq_dim):
            # first and last freq bins are copied through unchanged
            if i == 0 or i + 1 >= freq_dim:
                new_stft[i, :] += stft[i, :]
            else:
                # Distribute bin i's energy between the two nearest warped bins,
                # weighted by the fractional part of the warped position.
                warp_up = factors[i] - np.floor(factors[i])
                warp_down = 1 - warp_up
                pos = int(np.floor(factors[i]))

                new_stft[pos, :] += warp_down * stft[i, :]
                new_stft[pos + 1, :] += warp_up * stft[i, :]

        return librosa.core.istft(new_stft)

    def get_warping_level(self):
        """Draw a random warping factor from the configured range."""
        return np.random.uniform(self.factor[0], self.factor[1])

    def manipulate(self, data):
        """Apply VTLP to the configured zone/duration of `data` and return the full signal."""
        if self.duration is None:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_duration(data)

        factor = self.get_warping_level()
        aug_data = self._manipulate(data[start_pos:end_pos], sampling_rate=self.sampling_rate, factor=factor)

        if not self.stateless:
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.aug_factor = factor
            self.aug_data = aug_data

        # istft may return a slightly different segment length, so concatenate rather
        # than assign in place; cast back to the input's element type.
        return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0).astype(type(data[0]))
97 |
--------------------------------------------------------------------------------
/model/char/keyboard/th.json:
--------------------------------------------------------------------------------
1 | {
2 | "ๅ": ["/", "ๆ", "ไ"],
3 | "/": ["ๅ", "_", "ๆ", "ไ", "ำ"],
4 | "_": ["/", "ภ", "ไ", "ำ", "พ"],
5 | "ภ": ["_", "ถ", "ำ", "พ", "ะ"],
6 | "ถ": ["ภ", "ุ", "พ", "ะ", "ั"],
7 | "ุ": ["ถ", "ึ", "ะ", "ั", "ี"],
8 | "ึ": ["ุ", "ค", "ั", "ี", "ร"],
9 | "ค": ["ึ", "ต", "ี", "ร", "น"],
10 | "ต": ["ค", "จ", "ร", "น", "ย"],
11 | "จ": ["ต", "ข", "น", "ย", "บ"],
12 | "ข": ["จ", "ช", "ย", "บ", "ล"],
13 | "ช": ["ข", "บ", "ล"],
14 |
15 | "ๆ": ["ๅ", "/", "ไ", "ฟ", "ห"],
16 | "ไ": ["ๅ", "/", "_", "ๆ", "ำ", "ฟ", "ห", "ก"],
17 | "ำ": ["/", "_", "ภ", "ไ", "พ", "ห", "ก", "ด"],
18 | "พ": ["_", "ภ", "ถ", "ำ", "ะ", "ก", "ด", "เ"],
19 | "ะ": ["ภ", "ถ", "ุ", "พ", "ั", "ด", "เ", "้"],
20 | "ั": ["ถ", "ุ", "ึ", "ะ", "ี", "เ", "้", "่"],
21 | "ี": ["ุ", "ึ", "ค", "ั", "ร", "้", "่", "า"],
22 | "ร": ["ึ", "ค", "ต", "ี", "น", "่", "า", "ส"],
23 | "น": ["ค", "ต", "จ", "ร", "ย", "า", "ส", "ว"],
24 | "ย": ["ต", "จ", "ข", "น", "บ", "ส", "ว", "ง"],
25 | "บ": ["จ", "ข", "ช", "ย", "ล", "ว", "ง", "ฃ"],
26 | "ล": ["ข", "ช", "บ", "ง", "ฃ"],
27 |
28 | "ฟ": ["ๆ", "ไ", "ห", "ผ"],
29 | "ห": ["ๆ", "ไ", "ำ", "ฟ", "ก", "ผ", "ป"],
30 | "ก": ["ไ", "ำ", "พ", "ห", "ด", "ผ", "ป", "แ"],
31 | "ด": ["ำ", "พ", "ะ", "ก", "เ", "ป", "แ", "อ"],
32 | "เ": ["พ", "ะ", "ั", "ด", "้", "แ", "อ", "ิ"],
33 | "้": ["ะ", "ั", "ี", "เ", "่", "อ", "ิ", "ื"],
34 | "่": ["ั", "ี", "ร", "้", "า", "ิ", "ื", "ท"],
35 | "า": ["ี", "ร", "น", "่", "ส", "ื", "ท", "ม"],
36 | "ส": ["ร", "น", "ย", "า", "ว", "ท", "ม", "ใ"],
37 | "ว": ["น", "ย", "บ", "ส", "ง", "ม", "ใ", "ฝ"],
38 | "ง": ["ย", "บ", "ล", "ว", "ฃ", "ใ", "ฝ"],
39 | "ฃ": ["บ", "ล", "ง", "ฝ"],
40 |
41 | "ผ": ["ฟ", "ห", "ก", "ป"],
42 | "ป": ["ห", "ก", "ด", "ผ", "แ"],
43 | "แ": ["ก", "ด", "เ", "ป", "อ"],
44 | "อ": ["ด", "เ", "้", "แ", "ิ"],
45 | "ิ": ["เ", "้", "่", "อ", "ื"],
46 | "ื": ["้", "่", "า", "ิ", "ท"],
47 | "ท": ["่", "า", "ส", "ื", "ม"],
48 | "ม": ["า", "ส", "ว", "ท", "ใ"],
49 | "ใ": ["ส", "ว", "ง", "ม", "ฝ"],
50 | "ฝ": ["ว", "ง", "ฃ", "ใ"],
51 |
52 | "+": ["๑", "๐", "\""],
53 | "๑": ["+", "๒", "๐", "\"", "ฎ"],
54 | "๒": ["๑", "๓", "\"", "ฎ", "ฑ"],
55 | "๓": ["๒", "๔", "ฎ", "ฑ", "ธ"],
56 | "๔": ["๓", "ู", "ฑ", "ธ", "ํ"],
57 | "ู": ["๔", "฿", "ธ", "ํ", "๊"],
58 | "฿": ["ู", "๕", "ํ", "๊", "ณ"],
59 | "๕": ["฿", "๖", "๊", "ณ", "ฯ"],
60 | "๖": ["๕", "๗", "ณ", "ฯ", "ญ"],
61 | "๗": ["๖", "๘", "ฯ", "ญ", "ฐ"],
62 | "๘": ["๗", "๙", "ญ", "ฐ", ","],
63 | "๙": ["๘", "ฐ", ","],
64 |
65 | "๐": ["+", "๑", "\"", "ฤ", "ฆ"],
66 | "\"": ["+", "๑", "๒", "๐", "ฎ", "ฤ", "ฆ", "ฏ"],
67 | "ฎ": ["๑", "๒", "๓", "\"", "ฑ", "ฆ", "ฏ", "โ"],
68 | "ฑ": ["๒", "๓", "๔", "ฎ", "ธ", "ฏ", "โ", "ฌ"],
69 | "ธ": ["๓", "๔", "ู", "ฑ", "ํ", "โ", "ฌ", "็"],
70 | "ํ": ["๔", "ู", "฿", "ธ", "๊", "ฌ", "็", "๋"],
71 | "๊": ["ู", "฿", "๕", "ํ", "ณ", "็", "๋", "ษ"],
72 | "ณ": ["฿", "๕", "๖", "๊", "ฯ", "๋", "ษ", "ศ"],
73 | "ฯ": ["๕", "๖", "๗", "ณ", "ญ", "ษ", "ศ", "ซ"],
74 | "ญ": ["๖", "๗", "๘", "ฯ", "ฐ", "ศ", "ซ", "."],
75 | "ฐ": ["๗", "๘", "๙", "ญ", ",", "ซ", ".", "ฅ"],
76 | ",": ["๘", "๙", "ฐ", ".", "ฅ"],
77 |
78 | "ฤ": ["๐", "\"", "ฆ", "("],
79 | "ฆ": ["๐", "\"", "ฎ", "ฤ", "ฏ", "(", ")"],
80 | "ฏ": ["\"", "ฎ", "ฑ", "ฆ", "โ", "(", ")", "ฉ"],
81 | "โ": ["ฎ", "ฑ", "ธ", "ฏ", "ฌ", ")", "ฉ", "ฮ"],
82 | "ฌ": ["ฑ", "ธ", "ํ", "โ", "็", "ฉ", "ฮ", "ฺ"],
83 | "็": ["ธ", "ํ", "๊", "ฌ", "๋", "ฮ", "ฺ", "์"],
84 | "๋": ["ํ", "๊", "ณ", "็", "ษ", "ฺ", "์", "?"],
85 | "ษ": ["๊", "ณ", "ฯ", "๋", "ศ", "์", "?", "ฒ"],
86 | "ศ": ["ณ", "ฯ", "ญ", "ษ", "ซ", "?", "ฒ", "ฬ"],
87 | "ซ": ["ฯ", "ญ", "ฐ", "ศ", ".", "ฒ", "ฬ", "ฦ"],
88 | ".": ["ญ", "ฐ", ",", "ซ", "ฅ", "ฬ", "ฦ"],
89 | "ฅ": ["ฐ", ",", ".", "ฦ"],
90 |
91 | "(": ["ฤ", "ฆ", "ฏ", ")"],
92 | ")": ["ฆ", "ฏ", "โ", "(", "ฉ"],
93 | "ฉ": ["ฏ", "โ", "ฌ", ")", "ฮ"],
94 | "ฮ": ["โ", "ฌ", "็", "ฉ", "ฺ"],
95 | "ฺ": ["ฌ", "็", "๋", "ฮ", "์"],
96 | "์": ["็", "๋", "ษ", "ฺ", "?"],
97 | "?": ["๋", "ษ", "ศ", "์", "ฒ"],
98 | "ฒ": ["ษ", "ศ", "ซ", "?", "ฬ"],
99 | "ฬ": ["ศ", "ซ", ".", "ฒ", "ฦ"],
100 | "ฦ": ["ซ", ".", "ฅ", "ฬ"]
101 | }
--------------------------------------------------------------------------------
/nlpaug/res/char/keyboard/th.json:
--------------------------------------------------------------------------------
1 | {
2 | "ๅ": ["/", "ๆ", "ไ"],
3 | "/": ["ๅ", "_", "ๆ", "ไ", "ำ"],
4 | "_": ["/", "ภ", "ไ", "ำ", "พ"],
5 | "ภ": ["_", "ถ", "ำ", "พ", "ะ"],
6 | "ถ": ["ภ", "ุ", "พ", "ะ", "ั"],
7 | "ุ": ["ถ", "ึ", "ะ", "ั", "ี"],
8 | "ึ": ["ุ", "ค", "ั", "ี", "ร"],
9 | "ค": ["ึ", "ต", "ี", "ร", "น"],
10 | "ต": ["ค", "จ", "ร", "น", "ย"],
11 | "จ": ["ต", "ข", "น", "ย", "บ"],
12 | "ข": ["จ", "ช", "ย", "บ", "ล"],
13 | "ช": ["ข", "บ", "ล"],
14 |
15 | "ๆ": ["ๅ", "/", "ไ", "ฟ", "ห"],
16 | "ไ": ["ๅ", "/", "_", "ๆ", "ำ", "ฟ", "ห", "ก"],
17 | "ำ": ["/", "_", "ภ", "ไ", "พ", "ห", "ก", "ด"],
18 | "พ": ["_", "ภ", "ถ", "ำ", "ะ", "ก", "ด", "เ"],
19 | "ะ": ["ภ", "ถ", "ุ", "พ", "ั", "ด", "เ", "้"],
20 | "ั": ["ถ", "ุ", "ึ", "ะ", "ี", "เ", "้", "่"],
21 | "ี": ["ุ", "ึ", "ค", "ั", "ร", "้", "่", "า"],
22 | "ร": ["ึ", "ค", "ต", "ี", "น", "่", "า", "ส"],
23 | "น": ["ค", "ต", "จ", "ร", "ย", "า", "ส", "ว"],
24 | "ย": ["ต", "จ", "ข", "น", "บ", "ส", "ว", "ง"],
25 | "บ": ["จ", "ข", "ช", "ย", "ล", "ว", "ง", "ฃ"],
26 | "ล": ["ข", "ช", "บ", "ง", "ฃ"],
27 |
28 | "ฟ": ["ๆ", "ไ", "ห", "ผ"],
29 | "ห": ["ๆ", "ไ", "ำ", "ฟ", "ก", "ผ", "ป"],
30 | "ก": ["ไ", "ำ", "พ", "ห", "ด", "ผ", "ป", "แ"],
31 | "ด": ["ำ", "พ", "ะ", "ก", "เ", "ป", "แ", "อ"],
32 | "เ": ["พ", "ะ", "ั", "ด", "้", "แ", "อ", "ิ"],
33 | "้": ["ะ", "ั", "ี", "เ", "่", "อ", "ิ", "ื"],
34 | "่": ["ั", "ี", "ร", "้", "า", "ิ", "ื", "ท"],
35 | "า": ["ี", "ร", "น", "่", "ส", "ื", "ท", "ม"],
36 | "ส": ["ร", "น", "ย", "า", "ว", "ท", "ม", "ใ"],
37 | "ว": ["น", "ย", "บ", "ส", "ง", "ม", "ใ", "ฝ"],
38 | "ง": ["ย", "บ", "ล", "ว", "ฃ", "ใ", "ฝ"],
39 | "ฃ": ["บ", "ล", "ง", "ฝ"],
40 |
41 | "ผ": ["ฟ", "ห", "ก", "ป"],
42 | "ป": ["ห", "ก", "ด", "ผ", "แ"],
43 | "แ": ["ก", "ด", "เ", "ป", "อ"],
44 | "อ": ["ด", "เ", "้", "แ", "ิ"],
45 | "ิ": ["เ", "้", "่", "อ", "ื"],
46 | "ื": ["้", "่", "า", "ิ", "ท"],
47 | "ท": ["่", "า", "ส", "ื", "ม"],
48 | "ม": ["า", "ส", "ว", "ท", "ใ"],
49 | "ใ": ["ส", "ว", "ง", "ม", "ฝ"],
50 | "ฝ": ["ว", "ง", "ฃ", "ใ"],
51 |
52 | "+": ["๑", "๐", "\""],
53 | "๑": ["+", "๒", "๐", "\"", "ฎ"],
54 | "๒": ["๑", "๓", "\"", "ฎ", "ฑ"],
55 | "๓": ["๒", "๔", "ฎ", "ฑ", "ธ"],
56 | "๔": ["๓", "ู", "ฑ", "ธ", "ํ"],
57 | "ู": ["๔", "฿", "ธ", "ํ", "๊"],
58 | "฿": ["ู", "๕", "ํ", "๊", "ณ"],
59 | "๕": ["฿", "๖", "๊", "ณ", "ฯ"],
60 | "๖": ["๕", "๗", "ณ", "ฯ", "ญ"],
61 | "๗": ["๖", "๘", "ฯ", "ญ", "ฐ"],
62 | "๘": ["๗", "๙", "ญ", "ฐ", ","],
63 | "๙": ["๘", "ฐ", ","],
64 |
65 | "๐": ["+", "๑", "\"", "ฤ", "ฆ"],
66 | "\"": ["+", "๑", "๒", "๐", "ฎ", "ฤ", "ฆ", "ฏ"],
67 | "ฎ": ["๑", "๒", "๓", "\"", "ฑ", "ฆ", "ฏ", "โ"],
68 | "ฑ": ["๒", "๓", "๔", "ฎ", "ธ", "ฏ", "โ", "ฌ"],
69 | "ธ": ["๓", "๔", "ู", "ฑ", "ํ", "โ", "ฌ", "็"],
70 | "ํ": ["๔", "ู", "฿", "ธ", "๊", "ฌ", "็", "๋"],
71 | "๊": ["ู", "฿", "๕", "ํ", "ณ", "็", "๋", "ษ"],
72 | "ณ": ["฿", "๕", "๖", "๊", "ฯ", "๋", "ษ", "ศ"],
73 | "ฯ": ["๕", "๖", "๗", "ณ", "ญ", "ษ", "ศ", "ซ"],
74 | "ญ": ["๖", "๗", "๘", "ฯ", "ฐ", "ศ", "ซ", "."],
75 | "ฐ": ["๗", "๘", "๙", "ญ", ",", "ซ", ".", "ฅ"],
76 | ",": ["๘", "๙", "ฐ", ".", "ฅ"],
77 |
78 | "ฤ": ["๐", "\"", "ฆ", "("],
79 | "ฆ": ["๐", "\"", "ฎ", "ฤ", "ฏ", "(", ")"],
80 | "ฏ": ["\"", "ฎ", "ฑ", "ฆ", "โ", "(", ")", "ฉ"],
81 | "โ": ["ฎ", "ฑ", "ธ", "ฏ", "ฌ", ")", "ฉ", "ฮ"],
82 | "ฌ": ["ฑ", "ธ", "ํ", "โ", "็", "ฉ", "ฮ", "ฺ"],
83 | "็": ["ธ", "ํ", "๊", "ฌ", "๋", "ฮ", "ฺ", "์"],
84 | "๋": ["ํ", "๊", "ณ", "็", "ษ", "ฺ", "์", "?"],
85 | "ษ": ["๊", "ณ", "ฯ", "๋", "ศ", "์", "?", "ฒ"],
86 | "ศ": ["ณ", "ฯ", "ญ", "ษ", "ซ", "?", "ฒ", "ฬ"],
87 | "ซ": ["ฯ", "ญ", "ฐ", "ศ", ".", "ฒ", "ฬ", "ฦ"],
88 | ".": ["ญ", "ฐ", ",", "ซ", "ฅ", "ฬ", "ฦ"],
89 | "ฅ": ["ฐ", ",", ".", "ฦ"],
90 |
91 | "(": ["ฤ", "ฆ", "ฏ", ")"],
92 | ")": ["ฆ", "ฏ", "โ", "(", "ฉ"],
93 | "ฉ": ["ฏ", "โ", "ฌ", ")", "ฮ"],
94 | "ฮ": ["โ", "ฌ", "็", "ฉ", "ฺ"],
95 | "ฺ": ["ฌ", "็", "๋", "ฮ", "์"],
96 | "์": ["็", "๋", "ษ", "ฺ", "?"],
97 | "?": ["๋", "ษ", "ศ", "์", "ฒ"],
98 | "ฒ": ["ษ", "ศ", "ซ", "?", "ฬ"],
99 | "ฬ": ["ศ", "ซ", ".", "ฒ", "ฦ"],
100 | "ฦ": ["ซ", ".", "ฅ", "ฬ"]
101 | }
--------------------------------------------------------------------------------
/nlpaug/augmenter/char/char_augmenter.py:
--------------------------------------------------------------------------------
1 | import string
2 | import re
3 |
4 | from nlpaug.util import Method
5 | from nlpaug import Augmenter
6 | from nlpaug.util import WarningException, WarningName, WarningCode, WarningMessage
7 |
8 |
class CharAugmenter(Augmenter):
    """Base class for character-level augmenters.

    Provides tokenization, punctuation/stopword skipping, and sampling of which
    words and characters to augment. Subclasses implement the actual character
    substitution.
    """

    # Splits on every non-word character while keeping the separator as a token.
    TOKENIZER_REGEX = re.compile(r'(\W)')

    def __init__(self, action, name='Char_Aug', min_char=2, aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
                 aug_word_min=1, aug_word_max=10, aug_word_p=0.3, tokenizer=None, reverse_tokenizer=None,
                 stopwords=None, device='cpu', verbose=0, stopwords_regex=None, include_special_char=True,
                 include_detail=False):
        """
        :param str action: Augment action passed to the base Augmenter.
        :param str name: Name of this augmenter.
        :param int min_char: Words shorter than this are never augmented.
        :param int aug_char_min: Minimum number of characters to augment per word.
        :param int aug_char_max: Maximum number of characters to augment per word.
        :param float aug_char_p: Percentage of characters to augment per word.
        :param int aug_word_min: Minimum number of words to augment.
        :param int aug_word_max: Maximum number of words to augment.
        :param float aug_word_p: Percentage of words to augment.
        :param func tokenizer: Custom tokenizer; defaults to regex-based `_tokenizer`.
        :param func reverse_tokenizer: Custom detokenizer; defaults to space join.
        :param list stopwords: Words that will never be augmented.
        :param str device: Device hint forwarded to the base Augmenter.
        :param int verbose: Verbosity level for warnings.
        :param str stopwords_regex: Regex pattern; matching words are never augmented.
        :param bool include_special_char: If False, punctuation tokens are skipped.
        :param bool include_detail: Forwarded to the base Augmenter.
        """
        super().__init__(
            name=name, method=Method.CHAR, action=action, aug_min=None, aug_max=None, device=device, verbose=verbose,
            include_detail=include_detail)
        self.aug_p = None  # word/char percentages are carried separately below
        self.aug_char_min = aug_char_min
        self.aug_char_max = aug_char_max
        self.aug_char_p = aug_char_p
        self.aug_word_min = aug_word_min
        self.aug_word_max = aug_word_max
        self.aug_word_p = aug_word_p
        self.min_char = min_char

        self.tokenizer = tokenizer or self._tokenizer
        self.reverse_tokenizer = reverse_tokenizer or self._reverse_tokenizer
        self.stopwords = stopwords
        self.stopwords_regex = re.compile(stopwords_regex) if stopwords_regex is not None else stopwords_regex
        self.include_special_char = include_special_char

    @classmethod
    def _tokenizer(cls, text):
        """Split `text` into word and separator tokens, dropping pure whitespace."""
        tokens = cls.TOKENIZER_REGEX.split(text)
        return [t for t in tokens if len(t.strip()) > 0]

    @classmethod
    def token2char(cls, word):
        """Return the characters of `word` as a list."""
        return list(word)

    @classmethod
    def _reverse_tokenizer(cls, tokens):
        """Join tokens back into a single space-separated string."""
        return ' '.join(tokens)

    @classmethod
    def clean(cls, data):
        """Strip leading/trailing whitespace from `data`."""
        return data.strip()

    @classmethod
    def is_duplicate(cls, dataset, data):
        """Return True if `data` already appears in `dataset`."""
        # Idiom fix: any() replaces the manual loop-and-return.
        return any(d == data for d in dataset)

    def skip_aug(self, token_idxes, tokens):
        """Hook for subclasses to filter augmentable indexes; default keeps all."""
        return token_idxes

    def pre_skip_aug(self, tokens, tuple_idx=None):
        """Return indexes of tokens eligible for augmentation.

        Skips punctuation (unless `include_special_char`), stopwords, and tokens
        matching `stopwords_regex`. When `tuple_idx` is given, tokens are tuples
        and the element at that index is inspected.
        """
        results = []
        for token_idx, token in enumerate(tokens):
            _token = token[tuple_idx] if tuple_idx is not None else token
            # skip punctuation
            if _token in string.punctuation and not self.include_special_char:
                continue
            # TODO: cannot skip word that were split by tokenizer
            # skip stopwords by list
            if self.stopwords is not None and _token in self.stopwords:
                continue

            # skip stopwords by regex; padded variants are tried because the
            # pattern may anchor on surrounding spaces
            # https://github.com/makcedward/nlpaug/issues/81
            if self.stopwords_regex is not None and (
                    self.stopwords_regex.match(_token) or self.stopwords_regex.match(' ' + _token + ' ') or
                    self.stopwords_regex.match(' ' + _token) or self.stopwords_regex.match(_token + ' ')):
                continue

            results.append(token_idx)

        return results

    def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
        """Sample indexes of tokens (words or characters) to augment.

        Returns None when nothing qualifies (word too short, or no eligible index).
        """
        if mode == Method.CHAR:
            # If word is too short, do not augment it.
            if len(tokens) < self.min_char:
                return None

        aug_cnt = self._generate_aug_cnt(len(tokens), aug_min, aug_max, aug_p)

        if mode == Method.WORD:
            idxes = self.pre_skip_aug(tokens)
        elif mode == Method.CHAR:
            idxes = self.skip_aug(list(range(len(tokens))), tokens)
        else:
            # Fix: an unexpected mode previously raised NameError on the undefined
            # `idxes`; funnel it into the empty-result path instead.
            idxes = []

        if len(idxes) == 0:
            if self.verbose > 0:
                exception = WarningException(name=WarningName.OUT_OF_VOCABULARY,
                                             code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD)
                exception.output()
            return None
        # Never request more samples than there are candidates.
        aug_cnt = min(aug_cnt, len(idxes))
        return self.sample(idxes, aug_cnt)
112 |
--------------------------------------------------------------------------------