├── test ├── __init__.py ├── res │ └── audio │ │ ├── Pink_noise.ogg │ │ └── Yamaha-V50-Rock-Beat-120bpm.wav ├── augmenter │ ├── word │ │ ├── test_split.py │ │ ├── test_antonym.py │ │ ├── test_spelling.py │ │ └── test_random_word.py │ ├── audio │ │ ├── test_shift.py │ │ ├── test_audio.py │ │ ├── test_vtlp.py │ │ ├── test_crop.py │ │ ├── test_speed.py │ │ ├── test_loudness.py │ │ ├── test_pitch.py │ │ ├── test_mask.py │ │ └── test_noise.py │ ├── spectrogram │ │ ├── test_spectrogram.py │ │ ├── test_time_masking.py │ │ └── test_frequency_masking.py │ ├── char │ │ ├── test_ocr.py │ │ └── test_keyboard.py │ ├── sentence │ │ └── test_sentence.py │ └── test_augmenter.py ├── model │ ├── word │ │ └── test_word_embs_model.py │ └── char │ │ └── test_keyboard_model.py ├── run_test.py ├── profiling │ └── sentence │ │ └── test_context_word_embs_sentence_profiling.py └── flow │ ├── test_sometimes.py │ └── test_sequential.py ├── nlpaug ├── augmenter │ ├── __init__.py │ ├── augment.py │ ├── sentence │ │ ├── __init__.py │ │ └── sentence_augmenter.py │ ├── char │ │ ├── __init__.py │ │ └── char_augmenter.py │ ├── spectrogram │ │ ├── __init__.py │ │ ├── time_warping.py │ │ ├── spectrogram_augmenter.py │ │ ├── time_masking.py │ │ └── frequency_masking.py │ ├── word │ │ ├── __init__.py │ │ └── split.py │ └── audio │ │ ├── __init__.py │ │ ├── audio_augmenter.py │ │ ├── shift.py │ │ ├── speed.py │ │ ├── loudness.py │ │ ├── noise.py │ │ ├── vtlp.py │ │ ├── pitch.py │ │ ├── crop.py │ │ └── mask.py ├── model │ ├── __init__.py │ ├── char │ │ ├── char.py │ │ ├── __init__.py │ │ ├── ocr.py │ │ └── keyboard.py │ ├── spectrogram │ │ ├── spectrogram.py │ │ ├── __init__.py │ │ ├── time_masking.py │ │ ├── frequency_masking.py │ │ └── time_warping.py │ ├── word_stats │ │ ├── __init__.py │ │ └── word_statistics.py │ ├── word_embs │ │ ├── __init__.py │ │ ├── fasttext.py │ │ ├── word_embeddings.py │ │ ├── glove.py │ │ └── word2vec.py │ ├── word_dict │ │ ├── __init__.py │ │ ├── word_dictionary.py │ │ ├── 
wordnet.py │ │ ├── spelling.py │ │ └── ppdb.py │ ├── lang_models │ │ ├── __init__.py │ │ ├── gpt2.py │ │ ├── distilbert.py │ │ ├── bert.py │ │ └── roberta.py │ └── audio │ │ ├── __init__.py │ │ ├── shift.py │ │ ├── crop.py │ │ ├── loudness.py │ │ ├── mask.py │ │ ├── speed.py │ │ ├── pitch.py │ │ ├── audio.py │ │ ├── noise.py │ │ └── vtlp.py ├── util │ ├── file │ │ └── __init__.py │ ├── math │ │ ├── __init__.py │ │ └── normalization.py │ ├── decorator │ │ ├── __init__.py │ │ └── deprecation.py │ ├── audio │ │ ├── __init__.py │ │ ├── loader.py │ │ └── visualizer.py │ ├── text │ │ ├── __init__.py │ │ ├── tokenizer.py │ │ └── part_of_speech.py │ ├── exception │ │ ├── __init__.py │ │ ├── exception_info.py │ │ └── warning.py │ ├── selection │ │ ├── __init__.py │ │ └── randomness.py │ ├── doc │ │ ├── __init__.py │ │ ├── token.py │ │ ├── change_log.py │ │ └── doc.py │ ├── __init__.py │ ├── method.py │ └── action.py ├── __init__.py ├── flow │ ├── __init__.py │ ├── sequential.py │ └── sometimes.py ├── .gitignore └── res │ └── char │ └── keyboard │ ├── en.json │ └── th.json ├── MANIFEST.in ├── .readthedocs.yml ├── .gitattributes ├── res ├── logo_small.png ├── audio_example.png └── textual_example.png ├── docs ├── util │ ├── util.rst │ └── download.rst ├── flow │ ├── flow.rst │ ├── sequential.rst │ └── sometimes.rst ├── augmenter │ ├── sentence │ │ ├── sentence.rst │ │ └── context_word_embs_sentence.rst │ ├── char │ │ ├── char.rst │ │ ├── keyboard.rst │ │ ├── ocr.rst │ │ └── random.rst │ ├── spectrogram │ │ ├── spectrogram.rst │ │ ├── time_masking.rst │ │ └── frequency_masking.rst │ ├── audio │ │ ├── vtlp.rst │ │ ├── corp.rst │ │ ├── mask.rst │ │ ├── noise.rst │ │ ├── pitch.rst │ │ ├── shift.rst │ │ ├── speed.rst │ │ ├── loudness.rst │ │ └── audio.rst │ ├── word │ │ ├── split.rst │ │ ├── random.rst │ │ ├── tfidf.rst │ │ ├── antonym.rst │ │ ├── synonym.rst │ │ ├── spelling.rst │ │ ├── word_embs.rst │ │ ├── context_word_embs.rst │ │ └── word.rst │ └── augmenter.rst ├── index.rst 
├── Makefile ├── example │ └── example.rst ├── make.bat └── overview │ └── overview.rst ├── .codacy.yml ├── requirements.txt ├── .travis.yml ├── script.txt ├── setup.py ├── LICENSE ├── codecov.yml ├── .gitignore ├── model └── char │ └── keyboard │ ├── en.json │ └── th.json └── example └── tfidf-train_model.ipynb /test/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nlpaug/augmenter/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /nlpaug/model/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include nlpaug/res *.json -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | python: 2 | version: 3.6 3 | -------------------------------------------------------------------------------- /nlpaug/util/file/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.file.download import * 2 | -------------------------------------------------------------------------------- /nlpaug/util/math/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.math.normalization import * 2 | -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | 
-------------------------------------------------------------------------------- /nlpaug/util/decorator/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.decorator.deprecation import * 2 | -------------------------------------------------------------------------------- /res/logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/nlpaug/master/res/logo_small.png -------------------------------------------------------------------------------- /res/audio_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/nlpaug/master/res/audio_example.png -------------------------------------------------------------------------------- /docs/util/util.rst: -------------------------------------------------------------------------------- 1 | Util 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | ./download 8 | -------------------------------------------------------------------------------- /res/textual_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/nlpaug/master/res/textual_example.png -------------------------------------------------------------------------------- /test/res/audio/Pink_noise.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/nlpaug/master/test/res/audio/Pink_noise.ogg -------------------------------------------------------------------------------- /.codacy.yml: -------------------------------------------------------------------------------- 1 | exclude_paths: 2 | - test/* 3 | - README.md 4 | - CHANGE.md 5 | - SOURCE.md 6 | - docs/conf.py -------------------------------------------------------------------------------- /nlpaug/model/char/char.py: 
-------------------------------------------------------------------------------- 1 | class Character: 2 | def __init__(self, cache=True): 3 | self.cache = cache 4 | -------------------------------------------------------------------------------- /docs/flow/flow.rst: -------------------------------------------------------------------------------- 1 | Flow 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | ./sequential 8 | ./sometimes 9 | -------------------------------------------------------------------------------- /nlpaug/util/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.audio.loader import * 2 | from nlpaug.util.audio.visualizer import * 3 | -------------------------------------------------------------------------------- /nlpaug/util/text/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.text.tokenizer import * 2 | from nlpaug.util.text.part_of_speech import * 3 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | matplotlib>=2.2.2 2 | numpy>=1.16.2 3 | setuptools>=39.1.0 4 | python-dotenv>=0.10.1 5 | requests>=2.22.0 6 | -------------------------------------------------------------------------------- /nlpaug/model/spectrogram/spectrogram.py: -------------------------------------------------------------------------------- 1 | class Spectrogram: 2 | def mask(self, data): 3 | raise NotImplementedError 4 | -------------------------------------------------------------------------------- /nlpaug/util/exception/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.exception.exception_info import * 2 | from nlpaug.util.exception.warning import * 3 | -------------------------------------------------------------------------------- 
/nlpaug/util/selection/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.selection.filtering import * 2 | from nlpaug.util.selection.randomness import * 3 | -------------------------------------------------------------------------------- /test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/binhetech/nlpaug/master/test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav -------------------------------------------------------------------------------- /nlpaug/util/doc/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.doc import * 2 | from nlpaug.util.doc.change_log import * 3 | from nlpaug.util.doc.token import * 4 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/sentence.rst: -------------------------------------------------------------------------------- 1 | Sentence Augmenter 2 | ================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./context_word_embs_sentence 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/char.rst: -------------------------------------------------------------------------------- 1 | Character Augmenter 2 | =================== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 6 6 | 7 | ./keyboard 8 | ./ocr 9 | ./random -------------------------------------------------------------------------------- /nlpaug/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.base_augmenter import * 3 | 4 | __all__ = ['base_augmenter'] 5 | 6 | __version__ = '0.0.14' 7 | -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/spectrogram.rst: -------------------------------------------------------------------------------- 1 | Spectrogram Augmenter 2 | ===================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./frequency_masking 8 | ./time_masking -------------------------------------------------------------------------------- /nlpaug/flow/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.flow.pipeline import * 3 | from nlpaug.flow.sequential import * 4 | from nlpaug.flow.sometimes import * 5 | -------------------------------------------------------------------------------- /nlpaug/model/word_stats/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.word_stats.word_statistics import * 3 | from nlpaug.model.word_stats.tfidf import * 4 | -------------------------------------------------------------------------------- /docs/flow/sequential.rst: -------------------------------------------------------------------------------- 1 | nlpaug.flow\.sequential 2 | ========================================== 3 | 4 | .. 
automodule:: nlpaug.flow.sequential 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/flow/sometimes.rst: -------------------------------------------------------------------------------- 1 | nlpaug.flow\.sometimes 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.flow.sometimes 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/util/download.rst: -------------------------------------------------------------------------------- 1 | nlpaug.util.file\.download 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.util.file.download 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /nlpaug/augmenter/augment.py: -------------------------------------------------------------------------------- 1 | 2 | 3 | class Augment: 4 | def __init__(self, pos, original, new): 5 | self.pos = pos 6 | self.original = original 7 | self.new = new 8 | -------------------------------------------------------------------------------- /nlpaug/model/char/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.char.char import * 3 | from nlpaug.model.char.keyboard import * 4 | from nlpaug.model.char.ocr import * 5 | -------------------------------------------------------------------------------- /nlpaug/augmenter/sentence/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.sentence.sentence_augmenter import * 3 | from nlpaug.augmenter.sentence.context_word_embs_sentence import * 4 | -------------------------------------------------------------------------------- /docs/augmenter/audio/vtlp.rst: 
-------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.vtlp 2 | ============================ 3 | 4 | .. automodule:: nlpaug.augmenter.audio.vtlp 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/split.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.split 2 | ============================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.split 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.random 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/tfidf.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.tfidf 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.tfidf 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/keyboard.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.keyboard 2 | =============================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.char.keyboard 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/ocr.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.ocr 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.char.ocr 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/antonym.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.antonym 2 | ============================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.antonym 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/synonym.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.synonym 2 | ============================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.synonym 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/augmenter.rst: -------------------------------------------------------------------------------- 1 | Augmenter 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./audio/audio 8 | ./char/char 9 | ./sentence/sentence 10 | ./spectrogram/spectrogram 11 | ./word/word -------------------------------------------------------------------------------- /docs/augmenter/word/spelling.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.spelling 2 | ================================ 3 | 4 | .. 
automodule:: nlpaug.augmenter.word.spelling 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/word_embs.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.word_embs 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.word_embs 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/corp.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.crop 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.crop 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/mask.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.mask 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.mask 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/noise.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.noise 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.noise 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/pitch.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.pitch 2 | ============================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.audio.pitch 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/shift.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.shift 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.shift 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/speed.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.speed 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.speed 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.random 2 | ============================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.char.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /nlpaug/model/spectrogram/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.spectrogram.spectrogram import * 3 | from nlpaug.model.spectrogram.frequency_masking import * 4 | from nlpaug.model.spectrogram.time_masking import * -------------------------------------------------------------------------------- /docs/augmenter/audio/loudness.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.loudness 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.loudness 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/audio.rst: -------------------------------------------------------------------------------- 1 | Audio Augmenter 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./corp 8 | ./loudness 9 | ./mask 10 | ./noise 11 | ./pitch 12 | ./shift 13 | ./speed 14 | ./vtlp -------------------------------------------------------------------------------- /docs/augmenter/word/context_word_embs.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.context_word_embs 2 | ======================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.word.context_word_embs 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /nlpaug/augmenter/char/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.char.char_augmenter import * 3 | from nlpaug.augmenter.char.ocr import * 4 | from nlpaug.augmenter.char.random import * 5 | from nlpaug.augmenter.char.keyboard import * 6 | -------------------------------------------------------------------------------- /nlpaug/augmenter/spectrogram/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.spectrogram.spectrogram_augmenter import * 3 | from nlpaug.augmenter.spectrogram.frequency_masking import * 4 | from nlpaug.augmenter.spectrogram.time_masking import * -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/time_masking.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.spectrogram\.time_masking 2 | ========================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.spectrogram.time_masking 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /nlpaug/model/word_embs/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.word_embs.word_embeddings import * 3 | from nlpaug.model.word_embs.glove import * 4 | from nlpaug.model.word_embs.word2vec import * 5 | from nlpaug.model.word_embs.fasttext import * -------------------------------------------------------------------------------- /docs/augmenter/word/word.rst: -------------------------------------------------------------------------------- 1 | Word Augmenter 2 | ============== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./antonym 8 | ./context_word_embs 9 | ./random 10 | ./spelling 11 | ./split 12 | ./synonym 13 | ./tfidf 14 | ./word_embs 15 | -------------------------------------------------------------------------------- /nlpaug/model/word_dict/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.word_dict.word_dictionary import * 3 | from nlpaug.model.word_dict.spelling import * 4 | from nlpaug.model.word_dict.wordnet import * 5 | from nlpaug.model.word_dict.ppdb import * 6 | -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/frequency_masking.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.spectrogram\.frequency_masking 2 | =============================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.spectrogram.frequency_masking 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/context_word_embs_sentence.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.context_word_embs_sentence 2 | ===================================================== 3 | 4 | .. automodule:: nlpaug.augmenter.sentence.context_word_embs_sentence 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | install: 6 | - pip install -r requirements.txt 7 | - pip install coverage 8 | - pip install codecov 9 | - pip install . 10 | script: 11 | - python test/run_test.py 12 | - coverage run test/run_test.py 13 | 14 | after_success: 15 | - codecov -------------------------------------------------------------------------------- /nlpaug/util/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.action import * 2 | from nlpaug.util.doc import * 3 | from nlpaug.util.method import * 4 | from nlpaug.util.exception import * 5 | from nlpaug.util.math import * 6 | from nlpaug.util.text import * 7 | from nlpaug.util.audio import * 8 | 9 | from nlpaug.util.file import * 10 | from nlpaug.util.decorator import * 11 | -------------------------------------------------------------------------------- /nlpaug/util/method.py: -------------------------------------------------------------------------------- 1 | class Method: 2 | CHAR = 'char' 3 | WORD = 'word' 4 | SENTENCE = 'sentence' 5 | SPECTROGRAM = 'spectrogram' 6 | AUDIO = 'audio' 7 | 8 | FLOW = 'flow' 9 | 10 | @staticmethod 11 | def getall(): 12 | return 
[Method.CHAR, Method.WORD, Method.SENTENCE, Method.AUDIO, Method.SPECTROGRAM, Method.FLOW] 13 | 14 | -------------------------------------------------------------------------------- /nlpaug/model/lang_models/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.lang_models.language_models import * 3 | from nlpaug.model.lang_models.bert import * 4 | from nlpaug.model.lang_models.xlnet import * 5 | from nlpaug.model.lang_models.gpt2 import * 6 | from nlpaug.model.lang_models.distilbert import * 7 | from nlpaug.model.lang_models.roberta import * 8 | -------------------------------------------------------------------------------- /nlpaug/util/selection/randomness.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | except ImportError: 4 | # No installation required if not using this function 5 | pass 6 | import numpy as np 7 | import random 8 | 9 | 10 | class Randomness: 11 | @staticmethod 12 | def seed(seed): 13 | random.seed(seed) 14 | np.random.seed(seed) 15 | torch.manual_seed(seed) 16 | torch.cuda.manual_seed(seed) 17 | -------------------------------------------------------------------------------- /nlpaug/model/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.audio.audio import * 3 | from nlpaug.model.audio.noise import * 4 | from nlpaug.model.audio.shift import * 5 | from nlpaug.model.audio.speed import * 6 | from nlpaug.model.audio.pitch import * 7 | from nlpaug.model.audio.loudness import * 8 | from nlpaug.model.audio.crop import * 9 | from nlpaug.model.audio.mask import * 10 | from nlpaug.model.audio.vtlp import * 11 | -------------------------------------------------------------------------------- /nlpaug/util/exception/exception_info.py: 
-------------------------------------------------------------------------------- 1 | 2 | class ExceptionInfo: 3 | def __init__(self, name, exp_type, code, msg): 4 | self.name = name 5 | self.exp_type = exp_type 6 | self.code = code 7 | self.msg = msg 8 | 9 | def output(self): 10 | msg = '[{}] Name:{}, Code:{}, Message:{}'.format(self.exp_type, self.name, self.code, self.msg) 11 | print(msg) 12 | 13 | 14 | class ExceptionType: 15 | WARNING = 'Warning' -------------------------------------------------------------------------------- /nlpaug/util/action.py: -------------------------------------------------------------------------------- 1 | class Action: 2 | INSERT = 'insert' 3 | SUBSTITUTE = 'substitute' 4 | DELETE = 'delete' 5 | SWAP = 'swap' 6 | SPLIT = 'split' 7 | ALIGN = 'align' 8 | 9 | SEQUENTIAL = 'sequential' 10 | SOMETIMES = 'sometimes' 11 | 12 | @staticmethod 13 | def getall(): 14 | return [Action.INSERT, Action.SUBSTITUTE, Action.SWAP, Action.DELETE, Action.SPLIT, 15 | Action.SEQUENTIAL, Action.SOMETIMES, Action.ALIGN] -------------------------------------------------------------------------------- /nlpaug/util/text/tokenizer.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | ADDING_SPACE_AROUND_PUNCTUATION_REGEX = re.compile(r'(?=1.1.0 24 | 25 | -------------------------------------------------------------------------------- /nlpaug/util/exception/warning.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.exception.exception_info import ExceptionInfo, ExceptionType 2 | 3 | 4 | class WarningException(ExceptionInfo): 5 | def __init__(self, name, code, msg): 6 | super(WarningException, self).__init__(name=name, exp_type=ExceptionType.WARNING, code=code, msg=msg) 7 | 8 | 9 | class WarningName: 10 | INPUT_VALIDATION_WARNING = 'Input validation issue' 11 | OUT_OF_VOCABULARY = 'Out of vocabulary issue' 12 | 13 | 14 | class WarningCode: 15 | 
class Sequential(Pipeline):
    """
    Flow that applies every augmenter one after another, in the given order.

    :param list flow: list of flow or augmenter
    :param str name: Name of this augmenter

    >>> import nlpaug.flow as naf
    >>> import nlpaug.augmenter.char as nac
    >>> import nlpaug.augmenter.word as naw
    >>> flow = naf.Sequential([nac.RandomCharAug(), naw.RandomWordAug()])
    """

    def __init__(self, flow=None, name='Sequential_Pipeline', include_detail=False, verbose=0):
        super().__init__(name=name, action=Action.SEQUENTIAL, flow=flow,
                         include_detail=include_detail, verbose=verbose)

    def draw(self):
        # A sequential pipeline always executes every member augmenter.
        return True
from setuptools import setup, find_packages
import sys

# nlpaug only supports Python 3.
if sys.version_info < (3,):
    sys.exit("Sorry, Python3 is required.")

# The long description shown on PyPI comes straight from the README.
with open("README.md", encoding="utf8") as f:
    readme = f.read()

setup(
    name="nlpaug",
    version="0.0.14",
    author="Edward Ma",
    author_email="makcedward@gmail.com",
    url="https://github.com/makcedward/nlpaug",
    license="MIT",
    description="Natural language processing augmentation library for deep neural networks",
    long_description=readme,
    long_description_content_type="text/markdown",
    # BUG FIX: `exclude` must be an iterable of glob patterns. Passing the
    # string "test" iterated over its characters ('t', 'e', 's', 't'), so the
    # test packages were never excluded and shipped inside the distribution.
    packages=find_packages(exclude=("test", "test.*")),
    include_package_data=True,
    keywords=[
        "deep learning", "neural network", "machine learning",
        "nlp", "natural language processing", "text", "audio", "spectrogram",
        "augmentation", "adversarial attack", "ai", "ml"]
)
class TimeMasking(Spectrogram):
    """Mask a block of consecutive time steps of a mel spectrogram.

    From https://arxiv.org/pdf/1904.08779.pdf: time masking is applied so that
    t consecutive time steps [t0, t0 + t) are masked, where t is first chosen
    from a uniform distribution from 0 to the time mask parameter T, and t0 is
    chosen from [0, tau - t).
    """

    def __init__(self, mask_factor):
        """
        :param int mask_factor: Exclusive upper bound of the mask width.
        """
        super(TimeMasking, self).__init__()

        self.mask_factor = mask_factor

    def mask(self, data):
        """Return a copy of ``data`` with one random time segment zeroed.

        :param numpy.ndarray data: Spectrogram shaped (frequency, time) —
            presumably; TODO confirm axis order against the loader.
        :return: Augmented copy; the input array is left untouched.
        """
        # Nothing to mask for empty input; also guards the shape[1] access.
        if data.size == 0:
            return data.copy()

        time_range = data.shape[1]
        # BUG FIX: cap the drawn width by the actual time range. Previously a
        # draw of t >= time_range made np.random.randint(time_range - self.t)
        # raise ValueError (empty/negative range).
        self.t = np.random.randint(min(self.mask_factor, time_range))
        self.t0 = np.random.randint(time_range - self.t)

        augmented_mel_spectrogram = data.copy()
        augmented_mel_spectrogram[:, self.t0:self.t0 + self.t] = 0
        return augmented_mel_spectrogram
class TestShift(unittest.TestCase):
    """Tests for the audio ShiftAug augmenter."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_substitute(self):
        audio, sampling_rate = AudioLoader.load_audio(self.sample_wav_file)

        aug = naa.ShiftAug(sampling_rate, duration=0.5)
        augmented_audio = aug.augment(audio)

        self.assertFalse(np.array_equal(audio, augmented_audio))
        # BUG FIX: the original `assertTrue(len(audio), len(augmented_audio))`
        # only checked that len(audio) is truthy and used the second argument
        # as the failure message. Shifting must preserve the sample count.
        self.assertEqual(len(audio), len(augmented_audio))
class FrequencyMasking(Spectrogram):
    """Mask a block of consecutive mel frequency channels of a spectrogram.

    From https://arxiv.org/pdf/1904.08779.pdf: frequency masking is applied so
    that f consecutive mel frequency channels [f0, f0 + f) are masked, where f
    is first chosen from a uniform distribution from 0 to the frequency mask
    parameter F, and f0 is chosen from [0, v - f). v is the number of mel
    frequency channels.
    """

    def __init__(self, mask_factor):
        """
        :param int mask_factor: Exclusive upper bound of the mask width.
        """
        super(FrequencyMasking, self).__init__()

        self.mask_factor = mask_factor

    def mask(self, data):
        """Return a copy of ``data`` with one random frequency band zeroed.

        :param numpy.ndarray data: Spectrogram shaped (frequency, time) —
            presumably; TODO confirm axis order against the loader.
        :return: Augmented copy; the input array is left untouched.
        """
        # Nothing to mask for empty input; also guards the shape[0] access.
        if data.size == 0:
            return data.copy()

        v = data.shape[0]
        # BUG FIX: cap the drawn width by the channel count. Previously a draw
        # of f >= v made np.random.randint(v - self.f) raise ValueError
        # (empty/negative range). Mirrors the same guard in TimeMasking.
        self.f = np.random.randint(min(self.mask_factor, v))
        self.f0 = np.random.randint(v - self.f)

        augmented_mel_spectrogram = data.copy()
        augmented_mel_spectrogram[self.f0:self.f0 + self.f, :] = 0
        return augmented_mel_spectrogram
class TestAudio(unittest.TestCase):
    """Smoke tests shared across audio augmenters (multi-threaded execution)."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # Sample clip: https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_multi_thread(self):
        audio, sampling_rate = AudioLoader.load_audio(self.sample_wav_file)
        expected_size = 3
        augmenters = [
            naa.CropAug(sampling_rate=sampling_rate),
            naa.PitchAug(sampling_rate=sampling_rate),
        ]

        # Each augmenter must yield `n` outputs whether run on one thread or several.
        for num_thread in [1, 3]:
            for augmenter in augmenters:
                outputs = augmenter.augment(audio, n=expected_size, num_thread=num_thread)
                self.assertEqual(expected_size, len(outputs))
class Token:
    """Lightweight record describing a token and how augmentation changed it."""

    def __init__(self, token, start_pos=-1, action='', change_seq=0):
        self._token = token
        self._start_pos = start_pos
        self._action = action
        self._change_seq = change_seq

    @property
    def token(self):
        return self._token

    @token.setter
    def token(self, value):
        self._token = value

    @property
    def start_pos(self):
        return self._start_pos

    @start_pos.setter
    def start_pos(self, value):
        self._start_pos = value

    @property
    def action(self):
        return self._action

    @action.setter
    def action(self, value):
        self._action = value

    @property
    def change_seq(self):
        return self._change_seq

    @change_seq.setter
    def change_seq(self, value):
        self._change_seq = value

    def to_dict(self):
        """Serialize this token to a plain dict."""
        return {
            'token': self._token,
            'action': self._action,
            'start_pos': self._start_pos,
            'change_seq': self._change_seq,
        }
class SentenceAugmenter(Augmenter):
    """Base class shared by all sentence-level augmenters."""

    # Characters treated as sentence terminators.
    SENTENCE_SEPARATOR = '.!?'

    def __init__(self, action, name='Sentence_Aug', stopwords=None, tokenizer=None, reverse_tokenizer=None,
                 device='cuda', include_detail=False, verbose=0):
        super().__init__(
            name=name, method=Method.SENTENCE, action=action, aug_min=None, aug_max=None, device=device,
            verbose=verbose, include_detail=include_detail)
        # Fall back to the naive whitespace (de)tokenizers when none supplied.
        self.tokenizer = tokenizer or self._tokenizer
        self.reverse_tokenizer = reverse_tokenizer or self._reverse_tokenizer
        self.stopwords = stopwords

    @classmethod
    def _tokenizer(cls, text):
        # Default tokenization: split on single spaces.
        return text.split(' ')

    @classmethod
    def _reverse_tokenizer(cls, tokens):
        # Default detokenization: rejoin with single spaces.
        return ' '.join(tokens)

    @classmethod
    def clean(cls, data):
        return data.strip()

    @classmethod
    def is_duplicate(cls, dataset, data):
        """Return True when `data` already appears in `dataset`."""
        return any(existing == data for existing in dataset)
class TestTimeMasking(unittest.TestCase):
    """Tests for TimeMaskingAug on a real audio-derived mel spectrogram."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # Sample clip: https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.num_of_freq_channel = 128

    def test_substitute(self):
        mask_factor = 80

        mel_spectrogram = AudioLoader.load_mel_spectrogram(
            self.sample_wav_file, n_mels=self.num_of_freq_channel)
        aug = TimeMaskingAug(mask_factor=mask_factor)

        augmented = aug.augment(mel_spectrogram)

        # The source column at t0 stays fully non-zero ...
        self.assertEqual(len(mel_spectrogram[:, aug.model.t0]),
                         np.count_nonzero(mel_spectrogram[:, aug.model.t0]))
        # ... while the same column in the augmented copy is fully zeroed.
        self.assertEqual(0, np.count_nonzero(augmented[:, aug.model.t0]))
class PartOfSpeech:
    """Maps between coarse POS tags ('n', 'v', ...) and treebank constituents."""

    NOUN = 'noun'
    VERB = 'verb'
    ADJECTIVE = 'adjective'
    ADVERB = 'adverb'

    pos2con = {
        'n': [
            'NN', 'NNS', 'NNP', 'NNPS',  # from WordNet
            'NP'  # from PPDB
        ],
        'v': [
            'VB', 'VBD', 'VBG', 'VBN', 'VBZ',  # from WordNet
            'VBP'  # from PPDB
        ],
        'a': ['JJ', 'JJR', 'JJS', 'IN'],
        's': ['JJ', 'JJR', 'JJS', 'IN'],  # Adjective Satellite
        'r': ['RB', 'RBR', 'RBS'],  # Adverb
    }

    # Build the reverse lookup (constituent -> coarse tags) once at class
    # definition time. `poses` keeps duplicates on purpose (e.g. 'JJ' is
    # listed for both 'a' and 's'), mirroring the forward table exactly.
    con2pos = {}
    poses = []
    for key, values in pos2con.items():
        poses.extend(values)
        for value in values:
            if value not in con2pos:
                con2pos[value] = []
            con2pos[value].append(key)

    @staticmethod
    def pos2constituent(pos):
        """Return constituents for coarse tag `pos`, or [] when unknown."""
        return PartOfSpeech.pos2con.get(pos, [])

    @staticmethod
    def constituent2pos(con):
        """Return coarse tags for constituent `con`, or [] when unknown."""
        return PartOfSpeech.con2pos.get(con, [])

    @staticmethod
    def get_pos():
        """Return every known constituent tag (may contain duplicates)."""
        return PartOfSpeech.poses
16 | Mask range will be between [0, v - master_factor) while v is the number of mel frequency channels. 17 | :param str name: Name of this augmenter 18 | 19 | >>> import nlpaug.augmenter.spectogram as nas 20 | >>> aug = nas.FrequencyMaskingAug(mask_factor=80) 21 | """ 22 | 23 | def __init__(self, mask_factor, name='FrequencyMasking_Aug', verbose=0): 24 | super(FrequencyMaskingAug, self).__init__( 25 | action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose) 26 | 27 | self.model = self.get_model(mask_factor) 28 | 29 | def substitute(self, data): 30 | return self.model.mask(data) 31 | 32 | @classmethod 33 | def get_model(cls, mask_factor): 34 | return nms.FrequencyMasking(mask_factor) 35 | -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | #see https://github.com/codecov/support/wiki/Codecov-Yaml 2 | codecov: 3 | notify: 4 | require_ci_to_pass: yes 5 | 6 | coverage: 7 | precision: 2 # 2 = xx.xx%, 0 = xx% 8 | round: nearest # how coverage is rounded: down/up/nearest 9 | range: 10...90 # custom range of coverage colors from red -> yellow -> green 10 | status: 11 | # https://codecov.readme.io/v1.0/docs/commit-status 12 | project: 13 | default: 14 | against: auto 15 | target: 40% # specify the target coverage for each commit status 16 | threshold: 20% # allow this little decrease on project 17 | # https://github.com/codecov/support/wiki/Filtering-Branches 18 | # branches: master 19 | if_ci_failed: error 20 | # https://github.com/codecov/support/wiki/Patch-Status 21 | patch: 22 | default: 23 | against: parent 24 | target: 30% # specify the target "X%" coverage to hit 25 | # threshold: 50% # allow this much decrease on patch 26 | changes: false 27 | 28 | parsers: 29 | gcov: 30 | branch_detection: 31 | conditional: true 32 | loop: true 33 | macro: false 34 | method: false 35 | javascript: 36 | enable_partials: false 37 | 38 | comment: 
class TestOcr(unittest.TestCase):
    """Tests for the OCR-error character augmenter."""

    def test_ocr_single_word(self):
        texts = ['Zoology', 'roku123456']
        aug = OcrAug()
        for text in texts:
            # Both words contain OCR-confusable characters, so output differs.
            self.assertNotEqual(text, aug.augment(text))

        self.assertTrue(len(texts) > 0)

    def test_ocr_single_word_nonexist_char(self):
        texts = ['AAAAA', 'KKKKK']
        aug = OcrAug()
        for text in texts:
            # No character here has an OCR mapping; input passes through.
            self.assertEqual(text, aug.augment(text))

        self.assertTrue(len(texts) > 0)

    def test_ocr_multi_words(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = OcrAug()

        for text in texts:
            # Since a non-mappable word may be drawn, retry several times
            # until at least one augmentation changes the text.
            is_augmented = any(aug.augment(text) != text for _ in range(10))
            self.assertTrue(is_augmented)

        self.assertTrue(len(texts) > 0)
class TestSentence(unittest.TestCase):
    """Tests for contextual-word-embedding sentence augmenters."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)

        cls.model_paths = [
            'xlnet-base-cased',
            'gpt2',
            'distilgpt2'
        ]

        cls.text = 'The quick brown fox jumps over the lazy dog.'

    def test_augment_detail(self):
        for model_path in self.model_paths:
            aug = nas.ContextualWordEmbsForSentenceAug(model_path=model_path, include_detail=True)

            augmented_text, augment_details = aug.augment(self.text)

            self.assertNotEqual(self.text, augmented_text)
            self.assertGreater(len(augment_details), 0)
            for detail in augment_details:
                self.assertIn(detail['orig_token'], self.text)
                # Inserted tokens carry the -1 sentinel as their original
                # position while their new position is a real offset.
                self.assertEqual(-1, detail['orig_start_pos'])
                self.assertGreater(detail['new_start_pos'], -1)
                self.assertGreater(detail['change_seq'], 0)
                self.assertIn(detail['action'], Action.getall())

            self.assertNotEqual(self.text, augmented_text)
class WordNet(WordDictionary):
    """Synonym/antonym dictionary backed by NLTK's WordNet corpus."""

    def __init__(self, lang, is_synonym=True):
        """
        :param str lang: WordNet language code (e.g. 'eng').
        :param bool is_synonym: Return synonyms when True, antonyms otherwise.
        :raises ImportError: When the optional nltk dependency is missing.
        """
        super().__init__(cache=True)

        self.lang = lang
        self.is_synonym = is_synonym

        try:
            wordnet
        except NameError:
            raise ImportError('Missed nltk library. Install it via `pip install nltk`')

        try:
            # Check whether wordnet corpus is downloaded
            wordnet.synsets('computer')
            # Check whether POS tagger model is downloaded. Note: pos_tag
            # expects a list of tokens; passing a bare string tags characters.
            nltk.pos_tag(['computer'])
        except LookupError:
            # BUG FIX: missing NLTK resources raise LookupError, not
            # ImportError, so the original except clause never triggered
            # these downloads.
            nltk.download('wordnet')
            nltk.download('averaged_perceptron_tagger')

        self.model = self.read()

    def read(self):
        # The wordnet corpus reader itself acts as the model.
        return wordnet

    def predict(self, word, pos=None):
        """Return synonyms (or antonyms) of `word`, optionally filtered by POS.

        :param str word: Word to look up.
        :param pos: Optional WordNet POS filter.
        :return: List of lemma names (may contain duplicates).
        """
        results = []
        for synset in self.model.synsets(word, pos=pos, lang=self.lang):
            for lemma in synset.lemmas(lang=self.lang):
                if self.is_synonym:
                    results.append(lemma.name())
                else:
                    for antonym in lemma.antonyms():
                        results.append(antonym.name())
        return results

    @classmethod
    def pos_tag(cls, tokens):
        return nltk.pos_tag(tokens)
Value can be 'left', 'right' or 'random' 14 | """ 15 | 16 | super().__init__(duration=duration, sampling_rate=sampling_rate, stateless=stateless) 17 | # TODO: remove `both` after 0.0.12 18 | if direction in ['left', 'right', 'random', 'both']: 19 | self.direction = direction 20 | else: 21 | raise ValueError( 22 | 'shift_direction should be either left, right or both while {} is passed.'.format(direction)) 23 | 24 | def manipulate(self, data): 25 | aug_shift = int(self.sampling_rate * self.duration) 26 | if self.direction == 'right': 27 | aug_shift = -aug_shift 28 | elif self.direction == 'random': 29 | direction = np.random.randint(0, 2) 30 | if direction == 1: 31 | aug_shift = -aug_shift 32 | 33 | augmented_data = np.roll(data, aug_shift) 34 | 35 | # Set to silence for heading/ tailing 36 | if aug_shift > 0: 37 | augmented_data[:aug_shift] = 0 38 | else: 39 | augmented_data[aug_shift:] = 0 40 | return augmented_data 41 | -------------------------------------------------------------------------------- /nlpaug/model/spectrogram/time_warping.py: -------------------------------------------------------------------------------- 1 | # import numpy as np 2 | # 3 | # from nlpaug.model import Spectrogram 4 | # 5 | # 6 | # class TimeWarping(Spectrogram): 7 | # def __init__(self, time_warp): 8 | # super(TimeWarping, self).__init__() 9 | # 10 | # self.time_warp = time_warp 11 | # 12 | # # TODO 13 | # def mask(self, mel_spectrogram): 14 | # """ 15 | # From: https://arxiv.org/pdf/1904.08779.pdf, 16 | # Time warping is applied via the function 17 | # sparse image warp of tensorflow. Given 18 | # a log mel spectrogram with t time steps, we view it 19 | # as an image where the time axis is horizontal and the 20 | # frequency axis is vertical. 
class Fasttext(WordEmbeddings):
    """Loader for fastText text-format (.vec) word vectors.

    https://arxiv.org/pdf/1712.09405.pdf
    """

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

    def read(self, file_path, max_num_vector=None):
        """Parse a .vec file and populate index/word/vector lookup tables.

        The first line holds "<vocab_size> <emb_size>"; every following line
        is a word followed by `emb_size` float components.
        """
        rows = []
        with open(file_path, 'r', encoding='utf-8') as f:
            header = f.readline()
            self.vocab_size, self.emb_size = [int(v) for v in header.split()]

            for line in f:
                tokens = line.split()
                # The trailing emb_size tokens are the vector; the word is
                # whatever text precedes them (it may itself contain spaces).
                raw_values = tokens[-self.emb_size:]
                value_pos = line.find(' '.join(raw_values))
                word = line[:value_pos - 1]
                vector = np.array([float(v) for v in raw_values])

                rows.append(vector)
                self.i2w[len(self.i2w)] = word
                self.w2i[word] = len(self.w2i)
                self.w2v[word] = vector

        rows = np.asarray(rows)
        if not self.skip_check:
            # Sanity-check that all lookup tables stayed in sync while loading.
            if len(rows) != len(self.i2w):
                raise AssertionError('Vector Size:{}, Index2Word Size:{}'.format(len(rows), len(self.i2w)))
            if len(self.i2w) != len(self.w2i):
                raise AssertionError('Index2Word Size:{}, Word2Index Size:{}'.format(len(self.i2w), len(self.w2i)))
            if len(self.w2i) != len(self.w2v):
                raise AssertionError('Word2Index Size:{}, Word2Vector Size:{}'.format(len(self.w2i), len(self.w2v)))

        self.normalized_vectors = self._normalize(rows)
19 | """ 20 | super().__init__(zone=zone, coverage=coverage, duration=duration, sampling_rate=sampling_rate, 21 | stateless=stateless) 22 | 23 | def manipulate(self, data): 24 | if self.duration is None: 25 | start_pos, end_pos = self.get_augment_range_by_coverage(data) 26 | else: 27 | start_pos, end_pos = self.get_augment_range_by_duration(data) 28 | 29 | if not self.stateless: 30 | self.start_pos = start_pos 31 | self.end_pos = end_pos 32 | 33 | augmented_data = np.delete(data, np.s_[start_pos:end_pos]) 34 | return augmented_data 35 | -------------------------------------------------------------------------------- /nlpaug/model/word_dict/spelling.py: -------------------------------------------------------------------------------- 1 | """ 2 | Source data: 3 | English Neutral Rewriting: https://github.com/ybisk/charNMT-noise/blob/master/noise/en.natural 4 | """ 5 | from nlpaug.model.word_dict import WordDictionary 6 | 7 | 8 | class Spelling(WordDictionary): 9 | def __init__(self, dict_path, include_reverse=True, cache=True): 10 | super().__init__(cache) 11 | 12 | self.dict_path = dict_path 13 | self.include_reverse = include_reverse 14 | 15 | self._init() 16 | 17 | def _init(self): 18 | self.dict = {} 19 | self.read(self.dict_path) 20 | 21 | def read(self, model_path): 22 | with open(model_path, 'r', encoding="utf-8") as f: 23 | for line in f.readlines(): 24 | tokens = line.split(' ') 25 | # Last token include newline separator 26 | tokens[-1] = tokens[-1].replace('\n', '') 27 | 28 | key = tokens[0] 29 | values = tokens[1:] 30 | 31 | if key not in self.dict: 32 | self.dict[key] = [] 33 | 34 | self.dict[key].extend(values) 35 | # Remove duplicate mapping 36 | self.dict[key] = list(set(self.dict[key])) 37 | # Build reverse mapping 38 | if self.include_reverse: 39 | for value in values: 40 | if value not in self.dict: 41 | self.dict[value] = [] 42 | if key not in self.dict[value]: 43 | self.dict[value].append(key) 44 | 45 | def predict(self, data): 46 | if data not in 
self.dict: 47 | return None 48 | 49 | return self.dict[data] 50 | -------------------------------------------------------------------------------- /nlpaug/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 
class Loudness(Audio):
    """Model that scales the volume of a randomly chosen audio segment.

    :param tuple zone: Zone eligible for augmentation; default (0.2, 0.8) leaves
        the first and last 20% of the audio untouched.
    :param float coverage: Portion of the zone to augment, between 0 and 1.
    :param tuple factor: Range the gain is drawn from; values between 0 and 1
        reduce the volume.
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), stateless=True):
        super().__init__(zone=zone, coverage=coverage, factor=factor, stateless=stateless)

    def get_loudness_level(self):
        # Draw a gain uniformly from the configured factor range.
        low, high = self.factor
        return np.random.uniform(low, high)

    def manipulate(self, data):
        gain = self.get_loudness_level()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)
        scaled_segment = data[start_pos:end_pos] * gain

        if not self.stateless:
            # Record the applied gain and range for inspection by callers/tests.
            self.aug_factor = gain
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.aug_data = scaled_segment

        # Splice the scaled segment back between the untouched head and tail.
        return np.concatenate((data[:start_pos], scaled_segment, data[end_pos:]), axis=0)
plt.title(title) 36 | plt.xlabel('Frequency (k Hz)') 37 | plt.ylabel('Power (dB)') 38 | plt.tight_layout() 39 | plt.show() 40 | 41 | @staticmethod 42 | def spectrogram(title, spectrogram): 43 | plt.figure(figsize=(8, 4)) 44 | librosa.display.specshow( 45 | librosa.power_to_db(spectrogram, ref=np.max), y_axis='mel', fmax=8000, x_axis='time') 46 | plt.colorbar(format='%+10.0f dB') 47 | plt.title(title) 48 | plt.tight_layout() 49 | plt.show() 50 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | *.zip 28 | .DS_Store 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | 77 | # pyenv 78 | .python-version 79 | 80 | # celery beat schedule file 81 | celerybeat-schedule 82 | 83 | # SageMath parsed files 84 | *.sage.py 85 | 86 | # Environments 87 | .env 88 | .venv 89 | env/ 90 | venv/ 91 | ENV/ 92 | env.bak/ 93 | venv.bak/ 94 | 95 | # Spyder project settings 96 | .spyderproject 97 | .spyproject 98 | 99 | # Rope project settings 100 | .ropeproject 101 | 102 | # mkdocs documentation 103 | /site 104 | 105 | # mypy 106 | .mypy_cache/ 107 | 108 | # IDE 109 | .idea/ 110 | 111 | # model 112 | *.txt 113 | *.bin 114 | *.vec -------------------------------------------------------------------------------- /model/char/keyboard/en.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "@", "q", "w"], 3 | "2": ["@", "1", "!", "3", "#", "q", "w", "e"], 4 | "3": ["#", "2", "@", "4", "$", "w", "e"], 5 | "4": ["$", "3", "#", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "^", "r", "t", "y"], 7 | "6": ["^", "5", "%", "7", "&", "t", "y", "u"], 8 | "7": ["&", "6", "^", "8", "*", "y", "u", "i"], 9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"], 10 | "9": ["(", "8", "*", "0", ")", "i", "o", "p"], 11 | "q": ["1", "!", "2", "@", "w", "a", "s"], 12 | "w": ["1", "!", "2", "@", "3", "#", "q", "e", "a", "s", "d"], 13 | "e": ["2", "@", "3", "#", 
"4", "$", "w", "r", "s", "d", "f"], 14 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 15 | "t": ["4", "$", "5", "%", "6", "^", "r", "y", "f", "g", "h"], 16 | "y": ["5", "%", "6", "^", "7", "&", "t", "u", "g", "h", "j"], 17 | "u": ["6", "^", "7", "&", "8", "*", " t", "i", "h", "j", "k"], 18 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 19 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"], 20 | "p": ["9", "(", "0", ")", "o", "l"], 21 | "a": ["q", "w", "a", "s", "z", "x"], 22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"], 29 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"], 30 | "z": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "z", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", "<"] 37 | } -------------------------------------------------------------------------------- /test/augmenter/audio/test_vtlp.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | import nlpaug.augmenter.audio as naa 6 | from nlpaug.util import AudioLoader 7 | 8 | 9 | class TestVtlp(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 16 | cls.sample_wav_file = os.path.join( 17 | os.environ.get("TEST_DIR"), 'res', 'audio', 
'Yamaha-V50-Rock-Beat-120bpm.wav' 18 | ) 19 | cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file) 20 | 21 | def test_substitute(self): 22 | for _ in range(10): 23 | aug = naa.VtlpAug(sampling_rate=self.sampling_rate) 24 | aug.model.stateless = False 25 | augmented_audio = aug.augment(self.audio) 26 | self.assertGreater(len(self.audio), len(augmented_audio)) 27 | 28 | def test_coverage(self): 29 | zone = (0.3, 0.7) 30 | coverage = 0.1 31 | 32 | for _ in range(10): 33 | aug = naa.VtlpAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage) 34 | aug.model.stateless = False 35 | aug.augment(self.audio) 36 | 37 | self.assertGreater(len(self.audio[aug.model.start_pos:aug.model.end_pos]), len(aug.model.aug_data)) 38 | 39 | def test_zone(self): 40 | zone = (0, 1) 41 | coverage = 1. 42 | 43 | for _ in range(10): 44 | aug = naa.VtlpAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage) 45 | aug.model.stateless = False 46 | aug.augment(self.audio) 47 | 48 | self.assertGreater(len(self.audio[aug.model.start_pos:aug.model.end_pos]), len(aug.model.aug_data)) 49 | -------------------------------------------------------------------------------- /nlpaug/util/doc/change_log.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.token import Token 2 | 3 | 4 | class ChangeLog: 5 | def __init__(self, orig_token): 6 | self.orig_token = orig_token 7 | self.change_logs = [] 8 | self.add(orig_token.token, 'original', orig_token.change_seq) 9 | self._is_changed = False 10 | 11 | def add(self, token, action, change_seq): 12 | if action != 'original' and not self._is_changed: 13 | self._is_changed = True 14 | self.change_logs.append(Token(token=token, action=action, change_seq=change_seq)) 15 | 16 | def update(self, idx, token=None, action=None, change_seq=None): 17 | if not self._is_changed: 18 | self._is_changed = True 19 | 20 | if token: 21 | self.change_logs[idx].token = token 22 | if 
action: 23 | self.change_logs[idx].action = action 24 | if change_seq: 25 | self.change_logs[idx].change_seq = change_seq 26 | 27 | def size(self): 28 | return len(self.change_logs) - 1 29 | 30 | def is_changed(self): 31 | return self._is_changed 32 | 33 | def get_latest_token(self): 34 | return self.change_logs[-1] 35 | 36 | def update_last_token(self, start_pos): 37 | self.change_logs[-1].start_pos = start_pos 38 | 39 | def to_changed_dict(self): 40 | return { 41 | 'orig_token': self.orig_token.token, 42 | 'orig_start_pos': self.orig_token.start_pos, 43 | 'new_token': self.get_latest_token().token, 44 | 'new_start_pos': self.get_latest_token().start_pos, 45 | 'change_seq': self.get_latest_token().change_seq, 46 | 'action': self.get_latest_token().action 47 | } 48 | 49 | def to_dict(self): 50 | return { 51 | 'orig_token': self.orig_token.to_dict(), 52 | 'change_logs': [t.to_dict() for t in self.change_logs] 53 | } 54 | -------------------------------------------------------------------------------- /nlpaug/augmenter/audio/speed.py: -------------------------------------------------------------------------------- 1 | """ 2 | Augmenter that apply speed adjustment operation to audio. 3 | """ 4 | 5 | from nlpaug.augmenter.audio import AudioAugmenter 6 | import nlpaug.model.audio as nma 7 | from nlpaug.util import Action, WarningMessage 8 | 9 | 10 | class SpeedAug(AudioAugmenter): 11 | """ 12 | :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any 13 | augmentation will be applied in first 20% and last 20% of whole audio. 14 | :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment 15 | operation will be applied to target audio segment. For example, the audio duration is 60 seconds while 16 | zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be 17 | augmented. 
18 | :param int factor: Range of applying speed adjustment operation. Default value is (0.5, 2) 19 | Factor for time stretch. Audio will be slowing down if value is between 0 and 1. 20 | :param tuple speed_range: Deprecated. Use `factor` indeed 21 | :param str name: Name of this augmenter 22 | 23 | >>> import nlpaug.augmenter.audio as naa 24 | >>> aug = naa.ShiftAug() 25 | """ 26 | 27 | def __init__(self, zone=(0.2, 0.8), coverage=1., duration=None, 28 | factor=(0.5, 2), 29 | speed_range=(0.5, 2), name='Speed_Aug', verbose=0): 30 | super().__init__( 31 | action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose) 32 | 33 | if speed_range != (0.5, 2): 34 | print(WarningMessage.DEPRECATED.format('speed_range', '0.0.12', 'factor')) 35 | factor = speed_range 36 | 37 | self.model = self.get_model(zone, coverage, duration, factor) 38 | 39 | @classmethod 40 | def get_model(cls, zone, coverage, duration, factor): 41 | return nma.Speed(zone, coverage, duration, factor) 42 | -------------------------------------------------------------------------------- /test/run_test.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import sys 3 | import logging 4 | 5 | 6 | if __name__ == '__main__': 7 | sys.path.append('../nlpaug') 8 | 9 | # disable transformer's info logging 10 | for file_name in ['tokenization_utils', 'file_utils', 'modeling_utils', 'modeling_xlnet', 11 | 'configuration_utils']: 12 | logging.getLogger('transformers.' 
+ file_name).setLevel(logging.ERROR) 13 | 14 | test_dirs = [ 15 | 'test/augmenter/char/', 16 | 'test/augmenter/word/', 17 | 'test/augmenter/sentence/', 18 | 'test/augmenter/audio/', 19 | 'test/augmenter/spectrogram/', 20 | 'test/model/char/', 21 | 'test/model/word/', 22 | 'test/util/selection/', 23 | 'test/flow/', 24 | 'test/profiling/sentence/', 25 | ] 26 | runner = unittest.TextTestRunner() 27 | 28 | for test_dir in test_dirs: 29 | loader = unittest.TestLoader() 30 | suite = loader.discover(test_dir) 31 | runner.run(suite) 32 | 33 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.sentence.test_context_word_embs_sentence') 34 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_context_word_embs') 35 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_word_embs') 36 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_random_word') 37 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.char.test_random_char') 38 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.word.test_word') 39 | # suite = unittest.TestLoader().loadTestsFromName('util.selection.test_filtering') 40 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.audio.test_noise') 41 | # suite = unittest.TestLoader().loadTestsFromName('augmenter.test_augmenter') 42 | # suite = unittest.TestLoader().loadTestsFromName('model.word.test_word_embs_model') 43 | # runner.run(suite) 44 | -------------------------------------------------------------------------------- /nlpaug/augmenter/audio/loudness.py: -------------------------------------------------------------------------------- 1 | """ 2 | Augmenter that apply adjusting loudness operation to audio. 
3 | """ 4 | 5 | from nlpaug.augmenter.audio import AudioAugmenter 6 | import nlpaug.model.audio as nma 7 | from nlpaug.util import Action, WarningMessage 8 | 9 | 10 | class LoudnessAug(AudioAugmenter): 11 | """ 12 | :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any 13 | augmentation will be applied in first 20% and last 20% of whole audio. 14 | :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment 15 | operation will be applied to target audio segment. For example, the audio duration is 60 seconds while 16 | zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be 17 | augmented. 18 | :param tuple factor: Input data volume will be increased (decreased). Augmented value will be picked 19 | within the range of this tuple value. Volume will be reduced if value is between 0 and 1. 20 | :param tuple loudness_factor: Deprecated. Use `factor` indeed. 
21 | :param str name: Name of this augmenter 22 | 23 | >>> import nlpaug.augmenter.audio as naa 24 | >>> aug = naa.LoudnessAug() 25 | """ 26 | 27 | def __init__(self, zone=(0.2, 0.8), coverage=1., 28 | factor=(0.5, 2), loudness_factor=(0.5, 2), name='Loudness_Aug', verbose=0): 29 | super().__init__( 30 | action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose) 31 | 32 | if loudness_factor != (0.5, 2): 33 | print(WarningMessage.DEPRECATED.format('loudness_factor', '0.0.12', 'factor')) 34 | factor = loudness_factor 35 | 36 | self.model = self.get_model(zone, coverage, factor) 37 | 38 | @classmethod 39 | def get_model(cls, zone, coverage, factor): 40 | return nma.Loudness(zone=zone, coverage=coverage, factor=factor) 41 | -------------------------------------------------------------------------------- /nlpaug/augmenter/audio/noise.py: -------------------------------------------------------------------------------- 1 | """ 2 | Augmenter that apply noise injection operation to audio. 3 | """ 4 | 5 | from nlpaug.augmenter.audio import AudioAugmenter 6 | import nlpaug.model.audio as nma 7 | from nlpaug.util import Action, WarningMessage 8 | 9 | 10 | class NoiseAug(AudioAugmenter): 11 | """ 12 | :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any 13 | augmentation will be applied in first 20% and last 20% of whole audio. 14 | :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment 15 | operation will be applied to target audio segment. For example, the audio duration is 60 seconds while 16 | zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be 17 | augmented. 18 | :param str color: Colors of noise. Supported 'white', 'pink', 'red', 'brown', 'brownian', 'blue', 'azure', 19 | 'violet', 'purple' and 'random'. If 'random' is used, noise color will be picked randomly in each augment. 
20 | :param list noises: Background noises for noise injection. You can provide more than one background noise and 21 | noise will be picked randomly. Expected format is list of numpy array. If this value is provided. `color` 22 | value will be ignored 23 | :param str name: Name of this augmenter 24 | 25 | >>> import nlpaug.augmenter.audio as naa 26 | >>> aug = naa.NoiseAug() 27 | """ 28 | def __init__(self, zone=(0.2, 0.8), coverage=1., 29 | color='white', noises=None, name='Noise_Aug', verbose=0): 30 | super().__init__( 31 | action=Action.SUBSTITUTE, name=name, device='cpu', verbose=verbose) 32 | 33 | self.model = self.get_model(zone, coverage, color, noises) 34 | 35 | @classmethod 36 | def get_model(cls, zone, coverage, color, noises): 37 | return nma.Noise(zone=zone, coverage=coverage, color=color, noises=noises) 38 | -------------------------------------------------------------------------------- /nlpaug/augmenter/audio/vtlp.py: -------------------------------------------------------------------------------- 1 | """ 2 | Augmenter that apply vocal tract length perturbation (VTLP) operation to audio. 3 | """ 4 | 5 | from nlpaug.augmenter.audio import AudioAugmenter 6 | import nlpaug.model.audio as nma 7 | from nlpaug.util import Action 8 | 9 | 10 | class VtlpAug(AudioAugmenter): 11 | # https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf 12 | """ 13 | :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any 14 | augmentation will be applied in first 20% and last 20% of whole audio. 15 | :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment 16 | operation will be applied to target audio segment. For example, the audio duration is 60 seconds while 17 | zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be 18 | augmented. 19 | :param int factor: Range of applying speed adjustment operation. 
class WordEmbeddings:
    """Abstract base class for static word-embedding models (word2vec, GloVe, fasttext).

    Subclasses implement :meth:`read` to populate the vocabulary maps and the
    normalized vector matrix; :meth:`predict` then retrieves the most similar
    words by dot product against the (assumed L2-normalized) vectors.

    :param int top_k: Number of similar words returned by :meth:`predict`.
    :param bool skip_check: If False, subclasses validate internal consistency
        (vector/vocab sizes) after loading.
    """

    def __init__(self, top_k=100, skip_check=True):
        self.top_k = top_k
        self.skip_check = skip_check
        self.emb_size = 0
        self.vocab_size = 0
        self.embs = {}
        self.w2v = {}  # word -> raw vector
        self.i2w = {}  # row index -> word
        self.w2i = {}  # word -> row index
        self.vectors = []
        # Row-normalized matrix aligned with i2w; filled by subclasses' read().
        self.normalized_vectors = None

    def read(self, file_path, max_num_vector):
        """Load vectors from ``file_path`` (subclass responsibility)."""
        raise NotImplementedError

    def similar(self, word):
        raise NotImplementedError

    def download(self, model_path):
        raise NotImplementedError

    def word2idx(self, word):
        """Return the row index of ``word``. Raises KeyError if unknown."""
        return self.w2i[word]

    def word2vector(self, word):
        """Return the raw (un-normalized) vector of ``word``."""
        return self.w2v[word]

    def idx2word(self, idx):
        return self.i2w[idx]

    def get_vectors(self):
        return self.normalized_vectors

    def get_vocab(self):
        return [word for word in self.w2v]

    @classmethod
    def _normalize(cls, vectors, norm='l2'):
        # Returns None for an unrecognized norm, matching historical behavior.
        if norm == 'l2':
            return normalization.l2_norm(vectors)
        elif norm == 'l1':
            return normalization.l1_norm(vectors)
        elif norm == 'standard':
            return normalization.standard_norm(vectors)

    def predict(self, word, n=1):
        """Return up to ``top_k`` most similar words, ranked by similarity.

        Fixes over the previous version:
        - ``np.argpartition`` leaves the selected candidates in undefined order,
          so the result was not actually ranked; candidates are now sorted by
          descending score.
        - The partition index is clamped so small vocabularies no longer crash.

        :param str word: Query word (must be in the vocabulary).
        :param int n: Unused; kept for interface compatibility.
        """
        source_id = self.word2idx(word)
        source_vector = self.word2vector(word)
        scores = np.dot(self.normalized_vectors, source_vector)  # TODO: very slow.
        # Fetch top_k+2 candidates: the source word itself and a case-variant
        # may be filtered out below.
        kth = min(self.top_k + 2, len(scores) - 1)
        target_ids = np.argpartition(-scores, kth)[:self.top_k + 2]  # TODO: slow.
        # argpartition's output is unordered: rank candidates by similarity.
        target_ids = target_ids[np.argsort(-scores[target_ids])]
        target_words = [self.idx2word(idx) for idx in target_ids if idx != source_id and self.idx2word(idx).lower() !=
                        word.lower()]  # filter out same word
        return target_words[:self.top_k]
"7", "&", "8", "*", " t", "i", "h", "j", "k"], 23 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 24 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"], 25 | "p": ["9", "(", "0", ")", "o", "l"], 26 | "a": ["q", "w", "a", "s", "z", "x"], 27 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 28 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 29 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 30 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 31 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 32 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"], 33 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"], 34 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"], 35 | "z": ["a", "s", "x"], 36 | "x": ["a", "s", "d", "z", "c"], 37 | "c": ["s", "d", "f", "x", "v"], 38 | "v": ["d", "f", "g", "c", "b"], 39 | "b": ["f", "g", "h", "v", "n"], 40 | "n": ["g", "h", "j", "b", "m"], 41 | "m": ["h", "j", "k", "n", ",", "<"] 42 | } -------------------------------------------------------------------------------- /nlpaug/augmenter/audio/pitch.py: -------------------------------------------------------------------------------- 1 | """ 2 | Augmenter that apply pitch adjustment operation to audio. 3 | """ 4 | 5 | from nlpaug.augmenter.audio import AudioAugmenter 6 | import nlpaug.model.audio as nma 7 | from nlpaug.util import Action, WarningMessage 8 | 9 | 10 | class PitchAug(AudioAugmenter): 11 | """ 12 | :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided. 13 | :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any 14 | augmentation will be applied in first 20% and last 20% of whole audio. 15 | :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment 16 | operation will be applied to target audio segment. 
class Mask(Audio):
    """Replace a randomly sampled segment of the audio with noise or silence."""

    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=1., duration=None,
                 mask_with_noise=True, stateless=True):
        """
        :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
        :param tuple zone: Fraction of the clip eligible for augmentation. The default (0.2, 0.8)
            leaves the first and last 20% of the audio untouched.
        :param float coverage: Portion of the zone to augment, between 0 and 1. With zone
            (0.2, 0.8) and coverage 0.7 on a 60s clip, (0.8-0.2)*0.7*60 = 25.2s is masked.
        :param float duration: Duration of augmentation (in seconds). If provided, `coverage`
            is ignored.
        :param bool mask_with_noise: When True the target segment becomes Gaussian noise;
            otherwise it becomes zeros (silence).
        """
        super().__init__(zone=zone, coverage=coverage, duration=duration, sampling_rate=sampling_rate,
                         stateless=stateless)
        self.mask_with_noise = mask_with_noise

    def manipulate(self, data):
        """Return a copy of *data* with the sampled segment masked."""
        start_pos, end_pos = self.get_augment_range_by_coverage(data)
        segment_len = end_pos - start_pos

        # Build the replacement content once, per configuration.
        if self.mask_with_noise:
            replacement = np.random.randn(segment_len)
        else:
            replacement = np.zeros(segment_len)

        if not self.stateless:
            # Record what was done so callers/tests can inspect the operation.
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.aug_data = replacement

        masked = data.copy()
        masked[start_pos:end_pos] = replacement
        return masked
class GloVe(WordEmbeddings):
    # https://nlp.stanford.edu/pubs/glove.pdf
    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

    def read(self, file_path, max_num_vector=None):
        """Load GloVe vectors from a text file (one "word float float ..." line each).

        :param str file_path: Path to the GloVe .txt file.
        :param int max_num_vector: Maximum number of vectors to load. Fix: this
            parameter was previously accepted but silently ignored (Word2vec
            honors it); it is now enforced.
        """
        vectors = []
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                # Fix: stop once the requested number of vectors is loaded.
                if max_num_vector is not None and len(vectors) >= max_num_vector:
                    break

                tokens = line.split()
                # GloVe dimensions are multiples of 25 (25/50/100/200/300), so the
                # remainder is the count of tokens belonging to the word itself.
                # Handles words containing spaces (e.g. ". . ." in glove.840B.300d).
                token_len = len(tokens) % 25

                values = np.array([float(val) for val in tokens[token_len:]])

                # Recover the raw word from the line so trailing spaces survive
                # (e.g. both "pp." and "pp. " exist in glove.840B.300d).
                # NOTE(review): relies on str(values[0]) matching the file's textual
                # float representation — fragile; confirm against target files.
                word = line[:line.find(str(values[0])) - 1]

                # Skip mis-encoded entries containing the replacement character.
                if '�' in word:
                    continue

                vectors.append(values)
                self.i2w[len(self.i2w)] = word
                self.w2i[word] = len(self.w2i)
                self.w2v[word] = values

        vectors = np.asarray(vectors)
        if not self.skip_check:
            if len(vectors) != len(self.i2w):
                raise AssertionError('Vector Size:{}, Index2Word Size:{}'.format(len(vectors), len(self.i2w)))
            if len(self.i2w) != len(self.w2i):
                raise AssertionError('Index2Word Size:{}, Word2Index Size:{}'.format(len(self.i2w), len(self.w2i)))
            if len(self.w2i) != len(self.w2v):
                raise AssertionError('Word2Index Size:{}, Word2Vector Size:{}'.format(len(self.w2i), len(self.w2v)))

        self.normalized_vectors = self._normalize(vectors)
class Speed(Audio):
    """
    :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any
        augmentation will be applied in first 20% and last 20% of whole audio.
    :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment
        operation will be applied to target audio segment. For example, the audio duration is 60 seconds while
        zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be
        augmented.
    :param tuple factor: Range from which the time-stretch rate is sampled in 0.1 steps.
        Default value is (-10, 10). Audio is slowed down for rates between 0 and 1 and
        sped up for rates above 1. Non-positive candidates and the no-op 1.0 are skipped
        because librosa requires a strictly positive rate.
        (Fix: the docstring previously claimed the default was (0.5, 2).)
    """
    def __init__(self, zone=(0.2, 0.8), coverage=1., duration=None,
                 factor=(-10, 10), stateless=False):
        super().__init__(zone=zone, coverage=coverage, duration=duration,
                         factor=factor, stateless=stateless)
        # librosa is imported at module level inside try/except; fail loudly here
        # instead of at first use if it is missing.
        try:
            librosa
        except NameError:
            raise ImportError('Missed librosa library. Install it via `pip install librosa`')

    def get_speed_level(self):
        """Sample a stretch rate from `factor` in 0.1 steps, excluding invalid values."""
        candidates = [round(v, 1) for v in np.arange(self.factor[0], self.factor[1], 0.1)]
        # Fix: drop non-positive rates (librosa.effects.time_stretch requires rate > 0)
        # in addition to the no-op 1.0; previously a range spanning zero could sample
        # a negative rate and crash inside librosa.
        candidates = [c for c in candidates if c > 0 and c != 1.0]
        if not candidates:
            # Fix: previously an empty list produced an opaque randint error.
            raise ValueError('No valid speed factor can be sampled from range {}'.format(self.factor))
        return candidates[np.random.randint(len(candidates))]

    def manipulate(self, data):
        """Time-stretch the sampled segment and splice it back into the clip."""
        speed = self.get_speed_level()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        aug_data = librosa.effects.time_stretch(data[start_pos:end_pos], speed)

        if not self.stateless:
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.aug_data = aug_data
            self.aug_factor = speed

        # Output length differs from input: the segment shrinks (rate > 1) or grows (rate < 1).
        return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0)
class TestSpeed(unittest.TestCase):
    """Integration tests for naa.SpeedAug against a real WAV fixture."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env file to obtain TEST_DIR.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_substitute(self):
        """A stretch rate < 1 slows audio down (longer output); > 1 shortens it."""
        for _ in range(10):
            aug = naa.SpeedAug()
            # Make the model record start_pos/end_pos/aug_factor for the assertions.
            aug.model.stateless = False
            augmented_audio = aug.augment(self.audio)

            if aug.model.aug_factor < 1:
                self.assertGreater(len(augmented_audio), len(self.audio))
            else:
                self.assertLess(len(augmented_audio), len(self.audio))

    def test_coverage(self):
        """Only the covered slice of the zone is stretched."""
        zone = (0.3, 0.7)
        coverage = 0.1

        for _ in range(10):
            aug = naa.SpeedAug(zone=zone, coverage=coverage)
            aug.model.stateless = False
            aug.augment(self.audio)

            # Compare the stretched segment against the original slice it replaced.
            if aug.model.aug_factor < 1:
                self.assertGreater(len(aug.model.aug_data), len(self.audio[aug.model.start_pos:aug.model.end_pos]))
            else:
                self.assertLess(len(aug.model.aug_data), len(self.audio[aug.model.start_pos:aug.model.end_pos]))

    def test_zone(self):
        """With zone=(0, 1) and full coverage the entire clip is stretched."""
        zone = (0, 1)
        coverage = 1.

        for _ in range(10):
            aug = naa.SpeedAug(zone=zone, coverage=coverage)
            aug.model.stateless = False
            aug.augment(self.audio)

            if aug.model.aug_factor < 1:
                self.assertGreater(len(aug.model.aug_data), len(self.audio[aug.model.start_pos:aug.model.end_pos]))
            else:
                self.assertLess(len(aug.model.aug_data), len(self.audio[aug.model.start_pos:aug.model.end_pos]))
class TestKeyboard(unittest.TestCase):
    """Checks that nmc.Keyboard builds its adjacency map correctly for each
    character-class toggle (special chars / digits / upper case)."""

    def test_lower_case_only(self):
        model = nmc.Keyboard(special_char=False, numeric=False, upper_case=False)
        mapping = model.model
        for key, values in mapping.items():
            # Only lower-case letters may appear as keys or neighbors.
            self.assertTrue(re.match("^[a-z]*$", key))
            self.assertGreater(len(values), 0)
            for value in values:
                self.assertTrue(re.match("^[a-z]*$", value))
        self.assertGreater(len(mapping), 0)

    def test_special_char_lower_case(self):
        model = nmc.Keyboard(special_char=True, numeric=False, upper_case=False)
        mapping = model.model
        for key, values in mapping.items():
            # Digits must be filtered out; special characters are allowed.
            self.assertFalse(re.match("^[0-9]*$", key))
            self.assertGreater(len(values), 0)
            for value in values:
                self.assertFalse(re.match("^[0-9]*$", value))
        self.assertGreater(len(mapping), 0)

    def test_numeric_lower_case(self):
        model = nmc.Keyboard(special_char=False, numeric=True, upper_case=False)
        mapping = model.model
        for key, values in mapping.items():
            self.assertTrue(re.match("^[a-z0-9]*$", key))
            self.assertGreater(len(values), 0)
            for value in values:
                self.assertTrue(re.match("^[a-z0-9]*$", value))
        self.assertGreater(len(mapping), 0)

    def test_upper_lower_case(self):
        model = nmc.Keyboard(special_char=False, numeric=False, upper_case=True)
        mapping = model.model
        for key, values in mapping.items():
            self.assertTrue(re.match("^[a-zA-Z]*$", key))
            self.assertGreater(len(values), 0)
            for value in values:
                self.assertTrue(re.match("^[a-zA-Z]*$", value))
        self.assertGreater(len(mapping), 0)

    def test_special_char_numeric_lower_case(self):
        # NOTE(review): the name says "lower_case" but upper_case=True is passed;
        # presumably this should be named ..._upper_case — confirm intent.
        # With all classes enabled there is nothing to assert about the character
        # set, so only non-emptiness is checked.
        model = nmc.Keyboard(special_char=True, numeric=True, upper_case=True)
        mapping = model.model
        for key, values in mapping.items():
            self.assertGreater(len(values), 0)
        self.assertGreater(len(mapping), 0)
3 | """ 4 | 5 | from nlpaug.augmenter.audio import AudioAugmenter 6 | import nlpaug.model.audio as nma 7 | from nlpaug.util import Action, WarningMessage 8 | 9 | 10 | class MaskAug(AudioAugmenter): 11 | """ 12 | :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided. 13 | :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any 14 | augmentation will be applied in first 20% and last 20% of whole audio. 15 | :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment 16 | operation will be applied to target audio segment. For example, the audio duration is 60 seconds while 17 | zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be 18 | augmented. 19 | :param float duration: Duration of augmentation (in second). Default value is None. If value is provided. `coverage` 20 | value will be ignored. 21 | :param bool mask_with_noise: If it is True, targeting area will be replaced by noise. Otherwise, it will be 22 | replaced by 0. 
class Word2vec(WordEmbeddings):
    # https://arxiv.org/pdf/1301.3781.pdf
    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

    def read(self, file_path, max_num_vector=None):
        """Load vectors from a binary word2vec file.

        Layout: an ASCII header line "<vocab_size> <emb_size>", then for each
        entry the word bytes, a single space, and emb_size little-endian
        float32 values; entries after the first are preceded by a newline.

        :param str file_path: Path of the binary model file.
        :param int max_num_vector: Cap on the number of vectors to load.
        """
        with open(file_path, 'rb') as f:
            header = f.readline()
            self.vocab_size, self.emb_size = map(int, header.split())
            if max_num_vector is not None:
                self.vocab_size = min(max_num_vector, self.vocab_size)

            vectors = np.zeros((self.vocab_size, self.emb_size), dtype=np.float32)
            binary_len = np.dtype(np.float32).itemsize * self.emb_size

            for row in range(self.vocab_size):
                word = []
                while True:
                    ch = f.read(1)
                    if not ch:
                        # Fix: EOF previously caused an infinite loop appending ''.
                        raise ValueError('Unexpected end of file while reading word of row {}'.format(row))
                    if ch == b' ':
                        word = ''.join(word)
                        break
                    # Fix: compare bytes with bytes. The original wrote `ch != '\n'`
                    # (bytes vs str), which is always True in Python 3, so the newline
                    # separating entries leaked into every word after the first.
                    if ch != b'\n':
                        word.append(ch.decode('cp437'))

                value = None  # defined up-front so the error message below is safe
                try:
                    value = f.read(binary_len)
                    values = np.frombuffer(value, dtype=np.float32)
                    vectors[len(self.i2w)] = values
                    self.i2w[len(self.i2w)] = word
                    self.w2i[word] = len(self.w2i)
                    self.w2v[word] = values
                except Exception:
                    # Best-effort mode (skip_check=True) silently skips malformed rows.
                    if not self.skip_check:
                        raise ValueError('Unable to parse row {} ({})'.format(row, value))

        vectors = np.asarray(vectors)
        if not self.skip_check:
            if len(vectors) != len(self.i2w):
                raise AssertionError('Vector Size:{}, Index2Word Size:{}'.format(len(vectors), len(self.i2w)))
            if len(self.i2w) != len(self.w2i):
                raise AssertionError('Index2Word Size:{}, Word2Index Size:{}'.format(len(self.i2w), len(self.w2i)))
            if len(self.w2i) != len(self.w2v):
                raise AssertionError('Word2Index Size:{}, Word2Vector Size:{}'.format(len(self.w2i), len(self.w2v)))

        self.normalized_vectors = self._normalize(vectors)
class Audio:
    """Base class for audio augmentation models.

    Holds the shared configuration (zone / coverage / duration / factor) and
    the helpers that sample which segment of a clip gets augmented.

    :param tuple zone: Fraction of the clip eligible for augmentation; the
        default (0.2, 0.8) leaves the first and last 20% untouched.
    :param float coverage: Portion (0..1) of the zone to augment.
    :param factor: Model-specific augmentation strength/range (meaning depends
        on the subclass).
    :param duration: Duration of augmentation in seconds; used together with
        `sampling_rate` by :meth:`get_augment_range_by_duration`.
    :param int sampling_rate: Sampling rate of input audio; mandatory when
        `duration` is provided.
    :param bool stateless: When False, subclasses record start_pos/end_pos/
        aug_data/aug_factor for later inspection.
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=None, duration=None,
                 sampling_rate=None, stateless=True):
        self.zone = zone
        self.coverage = coverage
        self.factor = factor
        self.duration = duration
        self.sampling_rate = sampling_rate
        self.stateless = stateless

        # Populated by subclasses when stateless is False.
        self.start_pos = None
        self.end_pos = None
        self.aug_data = None
        self.aug_factor = None

    @classmethod
    def pad(cls, data, noise):
        """Embed `noise` at a random offset inside a zero signal of len(data).

        :raises ValueError: if `noise` is longer than `data` (fix: previously
            this fell through to np.random.randint with a negative bound and
            failed with an opaque error).
        """
        gap = len(data) - len(noise)
        if gap < 0:
            raise ValueError(
                'Noise length ({}) exceeds data length ({})'.format(len(noise), len(data)))
        if gap == 0:
            start_pos = 0
        else:
            # NOTE: randint's upper bound is exclusive, so the noise is never
            # placed flush against the end; kept as-is to preserve behavior.
            start_pos = np.random.randint(0, gap)

        prefix_padding = np.array([0] * start_pos)
        suffix_padding = np.array([0] * (gap - start_pos))
        return np.append(np.append(prefix_padding, noise), suffix_padding)

    def get_augmentation_segment_size(self, data):
        """Number of samples covered by zone * coverage."""
        return int(len(data) * (self.zone[1] - self.zone[0]) * self.coverage)

    def get_augment_range_by_coverage(self, data):
        """Sample a [start_pos, end_pos) window covering `coverage` of the zone."""
        zone_start, zone_end = int(len(data) * self.zone[0]), int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(zone_size * self.coverage)
        # Latest offset at which a window of target_size still fits in the zone.
        last_start = zone_start + int(zone_size * (1 - self.coverage))

        if zone_start == last_start:
            # Coverage rounds to the whole zone: use the zone verbatim.
            start_pos = zone_start
            end_pos = zone_end
        else:
            start_pos = np.random.randint(zone_start, last_start)
            end_pos = start_pos + target_size

        return start_pos, end_pos

    def get_augment_range_by_duration(self, data):
        """Sample a [start_pos, end_pos) window of `duration` seconds inside the zone.

        If the requested duration does not fit in the zone, the whole zone is
        returned instead.
        """
        zone_start, zone_end = int(len(data) * self.zone[0]), int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(self.sampling_rate * self.duration)

        if target_size >= zone_size:
            start_pos = zone_start
            end_pos = zone_end
        else:
            last_start = zone_start + zone_size - target_size
            start_pos = np.random.randint(zone_start, last_start)
            end_pos = start_pos + target_size

        return start_pos, end_pos

    def manipulate(self, data):
        """Apply the augmentation to `data` (subclass responsibility)."""
        raise NotImplementedError
-------------------------------------------------------------------------------- /test/augmenter/char/test_keyboard.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import re 3 | import json 4 | import os 5 | 6 | import nlpaug.augmenter.char as nac 7 | 8 | 9 | class TestKeyboard(unittest.TestCase): 10 | def test_single_word(self): 11 | texts = ['Zoology', 'roku123456'] 12 | aug = nac.KeyboardAug() 13 | for text in texts: 14 | augmented_text = aug.augment(text) 15 | self.assertNotEqual(text, augmented_text) 16 | 17 | self.assertTrue(len(texts) > 0) 18 | 19 | def test_multi_words(self): 20 | texts = ['The quick brown fox jumps over the lazy dog'] 21 | aug = nac.KeyboardAug() 22 | for text in texts: 23 | augmented_text = aug.augment(text) 24 | self.assertNotEqual(text, augmented_text) 25 | 26 | self.assertTrue(len(texts) > 0) 27 | 28 | def test_no_special_character(self): 29 | text = 'qwertyuioplmnbvcxza' 30 | for i in range(10): 31 | aug = nac.KeyboardAug(include_special_char=False) 32 | augmented_text = aug.augment(text) 33 | self.assertTrue(re.match("^[a-zA-Z0-9]*$", augmented_text)) 34 | 35 | def test_lang_th(self): 36 | text = 'ฤฤฤฤ ฤฏณ' 37 | aug = nac.KeyboardAug(lang='th') 38 | augmented_text = aug.augment(text) 39 | self.assertNotEqual(text, augmented_text) 40 | 41 | def test_non_support_lang(self): 42 | try: 43 | nac.KeyboardAug(lang='non_exist') 44 | self.assertTrue(False) 45 | except ValueError: 46 | self.assertTrue(True) 47 | 48 | def test_custom_model(self): 49 | custom_model = { 50 | 'a': '1', 51 | 'b': '2', 52 | } 53 | 54 | custom_model_file_path = 'char_keyboard_custom_model.json' 55 | 56 | with open(custom_model_file_path, 'w') as outfile: 57 | json.dump(custom_model, outfile) 58 | 59 | text = 'ababab' 60 | aug = nac.KeyboardAug(model_path=custom_model_file_path) 61 | augmented_text = aug.augment(text) 62 | 63 | self.assertTrue('1' in augmented_text or '2' in augmented_text) 64 | 65 | if 
os.path.exists(custom_model_file_path): 66 | os.remove(custom_model_file_path) 67 | 68 | def test_load_custom_model_fail(self): 69 | try: 70 | aug = nac.KeyboardAug(model_path='test_load_custom_model_fail.json') 71 | self.assertTrue(False) 72 | except ValueError: 73 | self.assertTrue(True) 74 | -------------------------------------------------------------------------------- /nlpaug/util/decorator/deprecation.py: -------------------------------------------------------------------------------- 1 | import functools 2 | import warnings 3 | 4 | 5 | def deprecated(deprecate_from, deprecate_to, msg): 6 | def decorator(obj): 7 | if isinstance(obj, type): 8 | return _decorate_class(obj, deprecate_from, deprecate_to, msg) 9 | # # TODO: 10 | # elif isinstance(obj, property): 11 | # return _decorate_prop(obj, msg) 12 | else: 13 | return _decorate_func(obj, deprecate_from, deprecate_to, msg) 14 | return decorator 15 | 16 | 17 | def _decorate_class(cls, deprecate_from, deprecate_to, msg): 18 | msg_template = 'Class {name} is deprecated from {deprecate_from} version.' 19 | msg_template += ' It will be removed from {deprecate_to} version. {msg}' 20 | 21 | @functools.wraps(cls) 22 | def wrapped(*args, **kwargs): 23 | warnings.simplefilter('always', DeprecationWarning) 24 | warnings.warn( 25 | msg_template.format( 26 | name=cls.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg), 27 | category=DeprecationWarning 28 | ) 29 | warnings.simplefilter('default', DeprecationWarning) 30 | return cls(*args, **kwargs) 31 | 32 | return wrapped 33 | 34 | 35 | def _decorate_func(func, deprecate_from, deprecate_to, msg): 36 | msg_template = 'Function {name} is deprecated from {deprecate_from} version.' 37 | msg_template += ' It will be removed from {deprecate_to} version. 
{msg}' 38 | 39 | @functools.wraps(func) 40 | def wrapped(*args, **kwargs): 41 | warnings.simplefilter('always', DeprecationWarning) 42 | warnings.warn( 43 | msg_template.format( 44 | name=func.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg), 45 | category=DeprecationWarning 46 | ) 47 | warnings.simplefilter('default', DeprecationWarning) 48 | return func(*args, **kwargs) 49 | 50 | return wrapped 51 | 52 | 53 | def _decorate_prop(prop, msg): 54 | @functools.wraps(prop) 55 | @property 56 | def wrapped(*args, **kwargs): 57 | msg_template = 'Property {name} is deprecated. {msg}' 58 | warnings.simplefilter('always', DeprecationWarning) 59 | warnings.warn( 60 | msg_template.format(name=prop.__name__, msg=msg), category=DeprecationWarning 61 | ) 62 | warnings.simplefilter('default', DeprecationWarning) 63 | return prop.fget(*args, **kwargs) 64 | 65 | return wrapped 66 | -------------------------------------------------------------------------------- /test/profiling/sentence/test_context_word_embs_sentence_profiling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import time 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.sentence as nas 7 | 8 | 9 | class TestContextualWordEmbsAugProfiling(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | cls.text = 'The quick brown fox jumps over the lazy dog.' 
17 | 18 | def test_optimize(self): 19 | model_paths = ['gpt2', 'distilgpt2'] 20 | device = 'cpu' 21 | enable_optimize = {'external_memory': 1024, 'return_proba': True} 22 | disable_optimize = {'external_memory': 0, 'return_proba': True} 23 | epoch = 10 24 | 25 | for model_path in model_paths: 26 | # Optimized 27 | durations = [] 28 | aug = nas.ContextualWordEmbsForSentenceAug( 29 | model_path=model_path, device=device, optimize=enable_optimize, force_reload=True) 30 | for i in range(epoch): 31 | start_dt = time.monotonic() 32 | for j in range(epoch): 33 | aug.augment(self.text) 34 | end_dt = time.monotonic() 35 | durations.append(round(end_dt-start_dt, 2)) 36 | 37 | optimized_total_duration = sum(durations) 38 | optimized_average_duration = round(optimized_total_duration/len(durations), 2) 39 | 40 | # No optimized 41 | durations = [] 42 | aug.model.optimize = disable_optimize 43 | for _ in range(epoch): 44 | start_dt = time.monotonic() 45 | for _ in range(epoch): 46 | aug.augment(self.text) 47 | end_dt = time.monotonic() 48 | durations.append(round(end_dt - start_dt, 2)) 49 | 50 | no_optimized_total_duration = sum(durations) 51 | no_optimized_average_duration = round(no_optimized_total_duration / len(durations), 2) 52 | 53 | print('Model:{}, Optimized: {}({}), No Optimized: {}({})'.format( 54 | model_path, optimized_total_duration, optimized_average_duration, 55 | no_optimized_total_duration, no_optimized_average_duration 56 | )) 57 | 58 | self.assertGreater(no_optimized_total_duration, optimized_total_duration) 59 | self.assertGreater(no_optimized_average_duration, optimized_average_duration) 60 | -------------------------------------------------------------------------------- /test/augmenter/audio/test_loudness.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.audio as naa 7 | from nlpaug.util import 
AudioLoader


class TestLoudness(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        # Resolve the repo-root .env so TEST_DIR is available below.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # An empty signal must pass through unchanged.
        audio = np.array([])
        aug = naa.LoudnessAug()
        augmented_audio = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_audio))

    def test_substitute(self):
        aug = naa.LoudnessAug()
        augmented_audio = aug.augment(self.audio)

        # Loudness changes sample values but never the signal length.
        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))
        self.assertTrue(self.sampling_rate > 0)

    def test_coverage(self):
        zone = (0.3, 0.7)
        coverage = 0.1

        aug = naa.LoudnessAug(zone=zone, coverage=coverage)
        # stateless=False makes the model record start_pos/end_pos/aug_data.
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        # Rebuild the output from the untouched prefix/suffix plus the
        # recorded augmented segment; it must match the augmenter's output.
        reconstruct_augmented_audio = np.concatenate(
            (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]), axis=0)

        self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio))
        # NOTE(review): assertTrue with two positional args treats the second
        # as the failure message, so this never fails; assertEqual was
        # probably intended — confirm before changing.
        self.assertTrue(len(aug.model.aug_data), int(len(self.audio) * (zone[1] - zone[0]) * coverage))

    def test_zone(self):
        zone = (0, 1)
        coverage = 1

        aug = naa.LoudnessAug(zone=zone, coverage=coverage)
        aug.model.stateless = False
        augmented_audio = aug.augment(self.audio)

        reconstruct_augmented_audio = np.concatenate(
            (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]), axis=0)

        
self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio)) 63 | self.assertTrue(len(aug.model.aug_data), int(len(self.audio) * (zone[1] - zone[0]) * coverage)) 64 | -------------------------------------------------------------------------------- /nlpaug/model/lang_models/gpt2.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | from transformers import GPT2Tokenizer, GPT2LMHeadModel 4 | # from transformers import AutoModel, AutoTokenizer # Thrown error when using nucleus sampling 5 | except ImportError: 6 | # No installation required if not using this function 7 | pass 8 | 9 | from nlpaug.model.lang_models import LanguageModels 10 | 11 | 12 | class Gpt2(LanguageModels): 13 | # https://d4mucfpksywv.cloudfront.net/better-language-models/language_models_are_unsupervised_multitask_learners.pdf 14 | SUBWORD_PREFIX = 'Ġ' 15 | 16 | def __init__(self, model_path='gpt2', temperature=1.0, top_k=None, top_p=None, device=None, optimize=None): 17 | super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p, optimize=optimize) 18 | self.model_path = model_path 19 | 20 | # self.tokenizer = AutoTokenizer.from_pretrained(model_path) 21 | # self.model = AutoModel.from_pretrained(model_path) 22 | self.tokenizer = GPT2Tokenizer.from_pretrained(model_path) 23 | self.model = GPT2LMHeadModel.from_pretrained(model_path) 24 | 25 | self.model.to(self.device) 26 | self.model.eval() 27 | 28 | def id2token(self, _id): 29 | return self.tokenizer.decode(_id, clean_up_tokenization_spaces=True).strip() 30 | 31 | def predict(self, text, target_word=None, n=1, external_memory=None): 32 | # Convert feature 33 | input_idxes = self.tokenizer.encode(text) 34 | # if self.optimize['external_memory']: 35 | # input_idxes = input_idxes[-1:] 36 | input_idxes = torch.tensor(input_idxes, device=self.device).unsqueeze(0).repeat(1, 1) 37 | 38 | # Prediction 39 | with torch.no_grad(): 40 | outputs = 
self.model(input_ids=input_idxes, past=external_memory) 41 | target_token_logits = outputs[0][0][-1] # GPT2 only predict last token 42 | 43 | # Selection 44 | seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p} 45 | target_token_logits = self.control_randomness(target_token_logits, seed) 46 | target_token_logits, target_token_idxes = self.filtering(target_token_logits, seed) 47 | if len(target_token_idxes) != 0: 48 | results = self.pick(target_token_logits, target_token_idxes, target_word=target_word, n=n) 49 | else: 50 | results = None 51 | 52 | results = (results,) 53 | if self.optimize['external_memory']: 54 | external_memory = outputs[1] 55 | results += (external_memory,) 56 | 57 | return results 58 | -------------------------------------------------------------------------------- /nlpaug/util/doc/doc.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.token import Token 2 | from nlpaug.util.doc.change_log import ChangeLog 3 | 4 | 5 | class Doc: 6 | def __init__(self, doc='', tokens=None): 7 | self.doc = doc 8 | if tokens is not None and len(tokens) > 0: 9 | self.tokens = self.token2obj(tokens) 10 | else: 11 | self.tokens = [] 12 | self.changed_cnt = 0 13 | 14 | def token2obj(self, tokens): 15 | objs = [] 16 | start_pos = 0 17 | for t in tokens: 18 | token_obj = Token(token=t, start_pos=start_pos+self.doc[start_pos:].find(t)) 19 | change_log = ChangeLog(orig_token=token_obj) 20 | objs.append(change_log) 21 | 22 | start_pos += len(token_obj.token) 23 | start_pos += 1 # TODO: for textual only 24 | 25 | return objs 26 | 27 | def add_token(self, idx, token, action, change_seq): 28 | token_obj = Token(token=token, start_pos=-1, action=action, change_seq=change_seq) 29 | change_log = ChangeLog(orig_token=token_obj) 30 | self.tokens.insert(idx, change_log) 31 | 32 | def add_change_log(self, idx, new_token, action, change_seq): 33 | self.changed_cnt += 1 34 | 
self.tokens[idx].add(new_token, action=action, change_seq=change_seq) 35 | 36 | def update_change_log(self, token_idx, change_idx=None, token=None, action=None, change_seq=None): 37 | change_idx = self.tokens[token_idx].size() if change_idx is None else change_idx 38 | self.tokens[token_idx].update(change_idx, token=token, action=action, change_seq=change_seq) 39 | 40 | def get_token(self, idx): 41 | return self.tokens[idx] 42 | 43 | def get_original_tokens(self): 44 | return [t.orig_token.token for t in self.tokens] 45 | 46 | def get_augmented_tokens(self): 47 | return [t.get_latest_token().token for t in self.tokens if len(t.get_latest_token().token) > 0] 48 | 49 | def size(self): 50 | return len(self.tokens) 51 | 52 | def changed_count(self): 53 | return self.changed_cnt 54 | 55 | def get_change_logs(self, start_pos=0): 56 | for i, t in enumerate(self.tokens): 57 | self.tokens[i].update_last_token(start_pos) 58 | 59 | start_pos += len(t.get_latest_token().token) 60 | if len(t.get_latest_token().token) > 0: 61 | # TODO: for textual only 62 | start_pos += 1 63 | 64 | change_logs = [t for t in self.tokens if t.is_changed()] 65 | change_logs.sort(key=lambda x: x.get_latest_token().change_seq) 66 | return [c.to_changed_dict() for c in change_logs] 67 | -------------------------------------------------------------------------------- /test/augmenter/audio/test_pitch.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.audio as naa 7 | from nlpaug.util import AudioLoader 8 | 9 | 10 | class TestPitch(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | env_config_path = os.path.abspath(os.path.join( 14 | os.path.dirname(__file__), '..', '..', '..', '.env')) 15 | load_dotenv(env_config_path) 16 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 17 | cls.sample_wav_file = os.path.join( 18 | 
os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 19 | ) 20 | cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file) 21 | 22 | def test_substitute(self): 23 | aug = naa.PitchAug(sampling_rate=self.sampling_rate) 24 | augmented_audio = aug.augment(self.audio) 25 | 26 | self.assertFalse(np.array_equal(self.audio, augmented_audio)) 27 | self.assertEqual(len(self.audio), len(augmented_audio)) 28 | 29 | def test_coverage(self): 30 | zone = (0.3, 0.7) 31 | coverage = 0.1 32 | decimal = 8 33 | 34 | aug = naa.MaskAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage) 35 | aug.model.stateless = False 36 | augmented_audio = aug.augment(self.audio) 37 | reconstruct_augmented_audio = np.concatenate( 38 | (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]) 39 | , axis=0).astype(np.float32) 40 | 41 | augmented_audio = np.round(augmented_audio, decimals=decimal) 42 | reconstruct_augmented_audio = np.round(reconstruct_augmented_audio, decimals=decimal) 43 | 44 | self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio)) 45 | self.assertTrue(len(aug.model.aug_data), int(len(self.audio) * (zone[1] - zone[0]) * coverage)) 46 | 47 | def test_zone(self): 48 | zone = (0, 1) 49 | coverage = 1. 
50 | decimal = 8 51 | 52 | aug = naa.MaskAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage) 53 | aug.model.stateless = False 54 | augmented_audio = aug.augment(self.audio) 55 | reconstruct_augmented_audio = np.concatenate( 56 | (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]) 57 | , axis=0).astype(np.float32) 58 | 59 | augmented_audio = np.round(augmented_audio, decimals=decimal) 60 | reconstruct_augmented_audio = np.round(reconstruct_augmented_audio, decimals=decimal) 61 | 62 | self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio)) 63 | self.assertTrue(len(aug.model.aug_data), int(len(self.audio) * (zone[1] - zone[0]) * coverage)) -------------------------------------------------------------------------------- /test/flow/test_sometimes.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import nlpaug.augmenter.char as nac 4 | import nlpaug.flow as naf 5 | from nlpaug.util import Action 6 | 7 | 8 | class TestSometimes(unittest.TestCase): 9 | def test_dry_run(self): 10 | seq = naf.Sometimes() 11 | results = seq.augment([]) 12 | self.assertEqual(0, len(results)) 13 | 14 | def test_single_action(self): 15 | texts = [ 16 | 'The quick brown fox jumps over the lazy dog', 17 | 'Zology raku123456 fasdasd asd4123414 1234584 s@#' 18 | ] 19 | 20 | # Since prob may be low and causing do not perform data augmentation. 
Retry 5 times 21 | at_least_one_not_equal = False 22 | for _ in range(0, 5): 23 | flow = naf.Sometimes([nac.RandomCharAug(action=Action.INSERT)], pipeline_p=0.6) 24 | for text in texts: 25 | augmented_text = flow.augment(text) 26 | 27 | if text != augmented_text: 28 | at_least_one_not_equal = True 29 | 30 | self.assertLess(0, len(text)) 31 | 32 | if at_least_one_not_equal: 33 | break 34 | 35 | self.assertTrue(at_least_one_not_equal) 36 | self.assertLess(0, len(texts)) 37 | 38 | def test_multiple_actions(self): 39 | texts = [ 40 | 'The quick brown fox jumps over the lazy dog', 41 | 'Zology raku123456 fasdasd asd4123414 1234584' 42 | ] 43 | 44 | flows = [ 45 | naf.Sometimes([nac.RandomCharAug(action=Action.INSERT), 46 | nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)], 47 | pipeline_p=0.8), 48 | naf.Sometimes( 49 | [nac.OcrAug(), nac.KeyboardAug(aug_char_min=1), 50 | nac.RandomCharAug(action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6), 51 | nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)], 52 | pipeline_p=0.6) 53 | ] 54 | 55 | # Since prob may be low and causing do not perform data augmentation. 
Retry 5 times 56 | for flow in flows: 57 | at_least_one_not_equal = False 58 | for _ in range(0, 5): 59 | for text in texts: 60 | self.assertLess(0, len(text)) 61 | augmented_text = flow.augment(text) 62 | 63 | if text != augmented_text: 64 | at_least_one_not_equal = True 65 | 66 | self.assertLess(0, len(text)) 67 | 68 | if at_least_one_not_equal: 69 | break 70 | 71 | self.assertTrue(at_least_one_not_equal) 72 | self.assertLess(0, len(flows)) 73 | self.assertLess(0, len(texts)) 74 | 75 | -------------------------------------------------------------------------------- /nlpaug/model/lang_models/distilbert.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | from transformers import DistilBertTokenizer, DistilBertForMaskedLM 4 | # from transformers import AutoModel, AutoTokenizer 5 | except ImportError: 6 | # No installation required if not using this function 7 | pass 8 | 9 | from nlpaug.model.lang_models import LanguageModels 10 | from nlpaug.util.selection.filtering import * 11 | 12 | 13 | class DistilBert(LanguageModels): 14 | # https://arxiv.org/pdf/1910.01108.pdf 15 | START_TOKEN = '[CLS]' 16 | SEPARATOR_TOKEN = '[SEP]' 17 | MASK_TOKEN = '[MASK]' 18 | SUBWORD_PREFIX = '##' 19 | 20 | def __init__(self, model_path='distilbert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'): 21 | super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p) 22 | self.model_path = model_path 23 | 24 | # self.tokenizer = AutoTokenizer.from_pretrained(model_path) 25 | # self.model = AutoModel.from_pretrained(model_path) 26 | self.tokenizer = DistilBertTokenizer.from_pretrained(model_path) 27 | self.model = DistilBertForMaskedLM.from_pretrained(model_path) 28 | 29 | self.model.to(self.device) 30 | self.model.eval() 31 | 32 | def id2token(self, _id): 33 | # id: integer format 34 | return self.tokenizer.convert_ids_to_tokens([_id])[0] 35 | 36 | def is_skip_candidate(self, candidate): 37 | 
return candidate[:2] == self.SUBWORD_PREFIX 38 | 39 | def predict(self, text, target_word=None, n=1): 40 | # Prepare inputs 41 | tokens = self.tokenizer.tokenize(text) 42 | 43 | tokens.insert(0, self.START_TOKEN) 44 | tokens.append(self.SEPARATOR_TOKEN) 45 | target_pos = tokens.index(self.MASK_TOKEN) 46 | 47 | token_inputs = self.tokenizer.convert_tokens_to_ids(tokens) 48 | mask_inputs = [1] * len(token_inputs) # 1: real token, 0: padding token 49 | 50 | # Convert to feature 51 | token_inputs = torch.tensor([token_inputs]).to(self.device) 52 | mask_inputs = torch.tensor([mask_inputs]).to(self.device) 53 | 54 | # Prediction 55 | with torch.no_grad(): 56 | outputs = self.model(input_ids=token_inputs, attention_mask=mask_inputs) 57 | target_token_logits = outputs[0][0][target_pos] 58 | 59 | # Selection 60 | seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p} 61 | target_token_logits = self.control_randomness(target_token_logits, seed) 62 | target_token_logits, target_token_idxes = self.filtering(target_token_logits, seed) 63 | if len(target_token_idxes) != 0: 64 | results = self.pick(target_token_logits, target_token_idxes, target_word=target_word, n=n) 65 | else: 66 | results = None 67 | 68 | results = (results,) 69 | 70 | return results 71 | -------------------------------------------------------------------------------- /nlpaug/model/lang_models/bert.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | from transformers import BertTokenizer, BertForMaskedLM 4 | # from transformers import AutoModel, AutoTokenizer 5 | except ImportError: 6 | # No installation required if not using this function 7 | pass 8 | 9 | from nlpaug.model.lang_models import LanguageModels 10 | from nlpaug.util.selection.filtering import * 11 | 12 | 13 | class Bert(LanguageModels): 14 | # https://arxiv.org/pdf/1810.04805.pdf 15 | START_TOKEN = '[CLS]' 16 | SEPARATOR_TOKEN = '[SEP]' 17 | MASK_TOKEN = 
'[MASK]' 18 | SUBWORD_PREFIX = '##' 19 | 20 | def __init__(self, model_path='bert-base-uncased', temperature=1.0, top_k=None, top_p=None, device='cuda'): 21 | super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p) 22 | self.model_path = model_path 23 | 24 | # self.tokenizer = AutoTokenizer.from_pretrained(model_path) 25 | # self.model = AutoModel.from_pretrained(model_path) 26 | self.tokenizer = BertTokenizer.from_pretrained(model_path) 27 | self.model = BertForMaskedLM.from_pretrained(model_path) 28 | 29 | self.model.to(self.device) 30 | self.model.eval() 31 | 32 | def id2token(self, _id): 33 | # id: integer format 34 | return self.tokenizer.convert_ids_to_tokens([_id])[0] 35 | 36 | def is_skip_candidate(self, candidate): 37 | return candidate[:2] == self.SUBWORD_PREFIX 38 | 39 | def predict(self, text, target_word=None, n=1): 40 | # Prepare inputs 41 | tokens = self.tokenizer.tokenize(text) 42 | 43 | tokens.insert(0, self.START_TOKEN) 44 | tokens.append(self.SEPARATOR_TOKEN) 45 | target_pos = tokens.index(self.MASK_TOKEN) 46 | 47 | token_inputs = self.tokenizer.convert_tokens_to_ids(tokens) 48 | segment_inputs = [0] * len(token_inputs) 49 | mask_inputs = [1] * len(token_inputs) # 1: real token, 0: padding token 50 | 51 | # Convert to feature 52 | token_inputs = torch.tensor([token_inputs]).to(self.device) 53 | segment_inputs = torch.tensor([segment_inputs]).to(self.device) 54 | mask_inputs = torch.tensor([mask_inputs]).to(self.device) 55 | 56 | # Prediction 57 | with torch.no_grad(): 58 | outputs = self.model(input_ids=token_inputs, token_type_ids=segment_inputs, attention_mask=mask_inputs) 59 | target_token_logits = outputs[0][0][target_pos] 60 | 61 | # Selection 62 | seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p} 63 | target_token_logits = self.control_randomness(target_token_logits, seed) 64 | target_token_logits, target_token_idxes = self.filtering(target_token_logits, seed) 65 | if 
len(target_token_idxes) != 0: 66 | results = self.pick(target_token_logits, target_token_idxes, target_word=target_word, n=n) 67 | else: 68 | results = None 69 | 70 | results = (results,) 71 | 72 | return results 73 | -------------------------------------------------------------------------------- /test/augmenter/audio/test_mask.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.audio as naa 7 | from nlpaug.util import AudioLoader 8 | 9 | 10 | class TestMask(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | env_config_path = os.path.abspath(os.path.join( 14 | os.path.dirname(__file__), '..', '..', '..', '.env')) 15 | load_dotenv(env_config_path) 16 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 17 | cls.sample_wav_file = os.path.join( 18 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 19 | ) 20 | cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file) 21 | 22 | def test_empty_input(self): 23 | audio = np.array([]) 24 | aug = naa.MaskAug(sampling_rate=44100) 25 | augmented_audio = aug.augment(audio) 26 | 27 | self.assertTrue(np.array_equal(audio, augmented_audio)) 28 | 29 | def test_with_noise(self): 30 | aug = naa.MaskAug(sampling_rate=self.sampling_rate, mask_with_noise=True) 31 | augmented_audio = aug.augment(self.audio) 32 | 33 | self.assertFalse(np.array_equal(self.audio, augmented_audio)) 34 | self.assertEqual(len(self.audio), len(augmented_audio)) 35 | 36 | def test_without_noise(self): 37 | aug = naa.MaskAug(sampling_rate=self.sampling_rate, mask_with_noise=False) 38 | augmented_audio = aug.augment(self.audio) 39 | 40 | self.assertFalse(np.array_equal(self.audio, augmented_audio)) 41 | self.assertEqual(len(self.audio), len(augmented_audio)) 42 | 43 | def test_coverage(self): 44 | zone = (0.3, 0.7) 45 | coverage = 0.1 46 
| 47 | aug = naa.MaskAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage, mask_with_noise=False) 48 | aug.model.stateless = False 49 | augmented_audio = aug.augment(self.audio) 50 | 51 | reconstruct_augmented_audio = np.concatenate( 52 | (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]), axis=0) 53 | 54 | self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio)) 55 | self.assertTrue(len(aug.model.aug_data), int(len(self.audio) * (zone[1] - zone[0]) * coverage)) 56 | 57 | def test_zone(self): 58 | zone = (0, 1) 59 | coverage = 1. 60 | 61 | aug = naa.MaskAug(sampling_rate=self.sampling_rate, zone=zone, coverage=coverage, mask_with_noise=False) 62 | aug.model.stateless = False 63 | augmented_audio = aug.augment(self.audio) 64 | 65 | reconstruct_augmented_audio = np.concatenate( 66 | (self.audio[:aug.model.start_pos], aug.model.aug_data, self.audio[aug.model.end_pos:]), axis=0) 67 | 68 | self.assertTrue(np.array_equal(augmented_audio, reconstruct_augmented_audio)) 69 | self.assertTrue(len(aug.model.aug_data), int(len(self.audio) * (zone[1] - zone[0]) * coverage)) -------------------------------------------------------------------------------- /example/tfidf-train_model.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": {}, 7 | "outputs": [], 8 | "source": [ 9 | "import os\n", 10 | "os.environ[\"MODEL_DIR\"] = '../model'" 11 | ] 12 | }, 13 | { 14 | "cell_type": "code", 15 | "execution_count": 2, 16 | "metadata": {}, 17 | "outputs": [ 18 | { 19 | "name": "stdout", 20 | "output_type": "stream", 21 | "text": [ 22 | "Better speed can be achieved with apex installed from https://www.github.com/nvidia/apex.\n", 23 | "--------------------\n", 24 | "Original Input:The quick brown fox jumps over the lazy dog\n", 25 | "Agumented Output:The quick U_RF F9F9F9F9F3W2TM jumps over the 
lazy dog\n", 26 | "--------------------\n", 27 | "Original Input:asdasd test apple dog asd asd\n", 28 | "Agumented Output:asdasd test apple dog asd 5hd\n" 29 | ] 30 | } 31 | ], 32 | "source": [ 33 | "import sklearn.datasets\n", 34 | "import re\n", 35 | "\n", 36 | "import nlpaug.augmenter.word as naw\n", 37 | "import nlpaug.model.word_stats as nmw\n", 38 | "\n", 39 | "def _tokenizer(text, token_pattern=r\"(?u)\\b\\w\\w+\\b\"):\n", 40 | " token_pattern = re.compile(token_pattern)\n", 41 | " return token_pattern.findall(text)\n", 42 | "\n", 43 | "# Load sample data\n", 44 | "train_data = sklearn.datasets.fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))\n", 45 | "train_x = train_data.data\n", 46 | "\n", 47 | "# Tokenize input\n", 48 | "train_x_tokens = [_tokenizer(x) for x in train_x]\n", 49 | "\n", 50 | "# Train TF-IDF model\n", 51 | "tfidf_model = nmw.TfIdf()\n", 52 | "tfidf_model.train(train_x_tokens)\n", 53 | "tfidf_model.save('.')\n", 54 | "\n", 55 | "# Load TF-IDF augmenter\n", 56 | "aug = naw.TfIdfAug(model_path='.', tokenizer=_tokenizer)\n", 57 | "\n", 58 | "texts = [\n", 59 | " 'The quick brown fox jumps over the lazy dog',\n", 60 | " 'asdasd test apple dog asd asd'\n", 61 | "]\n", 62 | "\n", 63 | "for text in texts:\n", 64 | " augmented_text = aug.augment(text)\n", 65 | " \n", 66 | " print('-'*20)\n", 67 | " print('Original Input:{}'.format(text))\n", 68 | " print('Agumented Output:{}'.format(augmented_text))" 69 | ] 70 | }, 71 | { 72 | "cell_type": "code", 73 | "execution_count": null, 74 | "metadata": {}, 75 | "outputs": [], 76 | "source": [] 77 | } 78 | ], 79 | "metadata": { 80 | "kernelspec": { 81 | "display_name": "Python 3", 82 | "language": "python", 83 | "name": "python3" 84 | }, 85 | "language_info": { 86 | "codemirror_mode": { 87 | "name": "ipython", 88 | "version": 3 89 | }, 90 | "file_extension": ".py", 91 | "mimetype": "text/x-python", 92 | "name": "python", 93 | "nbconvert_exporter": "python", 94 | "pygments_lexer": 
"ipython3", 95 | "version": "3.6.4" 96 | } 97 | }, 98 | "nbformat": 4, 99 | "nbformat_minor": 2 100 | } 101 | -------------------------------------------------------------------------------- /nlpaug/model/lang_models/roberta.py: -------------------------------------------------------------------------------- 1 | try: 2 | import torch 3 | from transformers import RobertaTokenizer, RobertaForMaskedLM 4 | # from transformers import AutoModel, AutoTokenizer # Thrown error when using nucleus sampling 5 | except ImportError: 6 | # No installation required if not using this function 7 | pass 8 | 9 | from nlpaug.model.lang_models import LanguageModels 10 | from nlpaug.util.selection.filtering import * 11 | 12 | 13 | class Roberta(LanguageModels): 14 | # https://arxiv.org/pdf/1810.04805.pdf 15 | START_TOKEN = '' 16 | SEPARATOR_TOKEN = '' 17 | MASK_TOKEN = '' 18 | SUBWORD_PREFIX = 'Ġ' 19 | 20 | def __init__(self, model_path='roberta-base', temperature=1.0, top_k=None, top_p=None, device='cuda'): 21 | super().__init__(device, temperature=temperature, top_k=top_k, top_p=top_p) 22 | self.model_path = model_path 23 | 24 | # self.tokenizer = AutoTokenizer.from_pretrained(model_path) 25 | # self.model = AutoModel.from_pretrained(model_path) 26 | self.tokenizer = RobertaTokenizer.from_pretrained(model_path) 27 | self.model = RobertaForMaskedLM.from_pretrained(model_path) 28 | 29 | self.model.to(self.device) 30 | self.model.eval() 31 | 32 | def id2token(self, _id): 33 | # id: integer format 34 | return self.tokenizer.convert_ids_to_tokens([_id])[0] 35 | 36 | def is_skip_candidate(self, candidate): 37 | return False 38 | 39 | def predict(self, text, target_word=None, n=1): 40 | # Prepare inputs 41 | tokens = self.tokenizer.tokenize(text) 42 | 43 | tokens.insert(0, self.START_TOKEN) 44 | tokens.append(self.SEPARATOR_TOKEN) 45 | target_pos = tokens.index(self.MASK_TOKEN) 46 | 47 | token_inputs = self.tokenizer.convert_tokens_to_ids(tokens) 48 | segment_inputs = [0] * len(token_inputs) 
49 | mask_inputs = [1] * len(token_inputs) # 1: real token, 0: padding token 50 | 51 | # Convert to feature 52 | token_inputs = torch.tensor([token_inputs]).to(self.device) 53 | segment_inputs = torch.tensor([segment_inputs]).to(self.device) 54 | mask_inputs = torch.tensor([mask_inputs]).to(self.device) 55 | 56 | # Prediction 57 | with torch.no_grad(): 58 | outputs = self.model(input_ids=token_inputs, token_type_ids=segment_inputs, attention_mask=mask_inputs) 59 | target_token_logits = outputs[0][0][target_pos] 60 | 61 | # Selection 62 | seed = {'temperature': self.temperature, 'top_k': self.top_k, 'top_p': self.top_p} 63 | target_token_logits = self.control_randomness(target_token_logits, seed) 64 | target_token_logits, target_token_idxes = self.filtering(target_token_logits, seed) 65 | if len(target_token_idxes) != 0: 66 | results = self.pick(target_token_logits, target_token_idxes, target_word=target_word, n=n) 67 | # Replace '' and 'Ġ' as . and empty string 68 | results = [(r[0].replace('Ġ', ''), r[1]) if r[0] != self.SEPARATOR_TOKEN else ('.', r[1]) for r in results] 69 | else: 70 | results = None 71 | 72 | results = (results,) 73 | 74 | return results 75 | -------------------------------------------------------------------------------- /test/augmenter/test_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.char as nac 7 | import nlpaug.augmenter.word as naw 8 | import nlpaug.augmenter.sentence as nas 9 | import nlpaug.augmenter.audio as naa 10 | from nlpaug.util.audio import AudioLoader 11 | 12 | 13 | class TestWordNet(unittest.TestCase): 14 | @classmethod 15 | def setUpClass(cls): 16 | env_config_path = os.path.abspath(os.path.join( 17 | os.path.dirname(__file__), '..', '..', '.env')) 18 | load_dotenv(env_config_path) 19 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 20 | 
cls.sample_wav_file = os.path.join( 21 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 22 | ) 23 | cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file) 24 | 25 | cls.textual_augs = [ 26 | nac.RandomCharAug(), 27 | naw.ContextualWordEmbsAug(), 28 | nas.ContextualWordEmbsForSentenceAug() 29 | ] 30 | 31 | cls.audio_augs = [ 32 | naa.CropAug(sampling_rate=cls.sampling_rate), 33 | naa.SpeedAug(), 34 | ] 35 | 36 | def test_textual_augmenter_n_output(self): 37 | text = 'The quick brown fox jumps over the lazy dog' 38 | n = 3 39 | for aug in self.textual_augs: 40 | augmented_texts = aug.augment(text, n=n) 41 | self.assertGreater(len(augmented_texts), 1) 42 | for augmented_text in augmented_texts: 43 | self.assertNotEqual(augmented_text, text) 44 | 45 | def test_textual_augmenter_n_output_thread(self): 46 | text = 'The quick brown fox jumps over the lazy dog' 47 | n = 3 48 | for aug in self.textual_augs: 49 | augmented_texts = aug.augments([text]*2, n=n, num_thread=n) 50 | self.assertGreater(len(augmented_texts), 1) 51 | for augmented_text in augmented_texts: 52 | self.assertNotEqual(augmented_text, text) 53 | 54 | def test_multiprocess_gpu(self): 55 | text = 'The quick brown fox jumps over the lazy dog' 56 | n = 3 57 | aug = naw.ContextualWordEmbsAug(force_reload=True, device='cuda') 58 | 59 | augmented_texts = aug.augment(text, n=n, num_thread=n) 60 | self.assertGreater(len(augmented_texts), 1) 61 | for augmented_text in augmented_texts: 62 | self.assertNotEqual(augmented_text, text) 63 | 64 | def test_audio_augmenter_n_output(self): 65 | n = 3 66 | for aug in self.audio_augs: 67 | augmented_audios = aug.augment(self.audio, n=n) 68 | self.assertGreater(len(augmented_audios), 1) 69 | for augmented_audio in augmented_audios: 70 | self.assertFalse(np.array_equal(augmented_audio, self.audio)) 71 | 72 | def test_audio_augmenter_n_output_thread(self): 73 | n = 3 74 | for aug in self.audio_augs: 75 | augmented_audios = 
aug.augments([self.audio]*2, n=n, num_thread=n) 76 | self.assertGreater(len(augmented_audios), 1) 77 | for augmented_audio in augmented_audios: 78 | self.assertFalse(np.array_equal(augmented_audio, self.audio)) 79 | -------------------------------------------------------------------------------- /test/augmenter/word/test_random_word.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import nlpaug.augmenter.word as naw 4 | 5 | 6 | class TestRandom(unittest.TestCase): 7 | def test_swap(self): 8 | texts = [ 9 | 'The quick brown fox jumps over the lazy dog' 10 | ] 11 | aug = naw.RandomWordAug(action="swap") 12 | 13 | for text in texts: 14 | tokens = text.lower().split(' ') 15 | orig_token_freq = {} 16 | for w in tokens: 17 | orig_token_freq[w] = tokens.count(w) 18 | 19 | augmented_text = text 20 | 21 | # https://github.com/makcedward/nlpaug/issues/77 22 | for i in range(10): 23 | augmented_text = aug.augment(augmented_text) 24 | 25 | aug_tokens = augmented_text.lower().split(' ') 26 | aug_token_freq = {} 27 | for w in tokens: 28 | aug_token_freq[w] = aug_tokens.count(w) 29 | 30 | for orig_token, orig_freq in orig_token_freq.items(): 31 | self.assertTrue(orig_token in aug_token_freq) 32 | self.assertTrue(aug_token_freq[orig_token] == orig_freq) 33 | 34 | self.assertNotEqual(text, augmented_text) 35 | 36 | def test_substitute_without_target_word(self): 37 | texts = [ 38 | 'The quick brown fox jumps over the lazy dog' 39 | ] 40 | aug = naw.RandomWordAug(action='substitute') 41 | 42 | for text in texts: 43 | augmented_text = aug.augment(text) 44 | 45 | self.assertIn('_', augmented_text) 46 | self.assertNotEqual(text, augmented_text) 47 | 48 | def test_substitute_with_target_word(self): 49 | texts = [ 50 | 'The quick brown fox jumps over the lazy dog' 51 | ] 52 | target_words = ['$', '#', '^^^'] 53 | aug = naw.RandomWordAug(action='substitute', target_words=target_words) 54 | 55 | for text in texts: 56 | 
import re
import os
import json

from nlpaug.model.char import Character


class Keyboard(Character):
    """QWERTY-style keyboard-distance model.

    Maps each character to its physical neighbours on the keyboard so an
    augmenter can simulate typos. Mappings are loaded from a bundled JSON
    file per language (or a custom file via ``model_path``).
    """

    def __init__(self, special_char=True, numeric=True, upper_case=True, cache=True, lang="en", model_path=None):
        super().__init__(cache)

        # Bundled keyboard layouts live under nlpaug/res/char/keyboard.
        self.model_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)), '..', '..', 'res', 'char', 'keyboard')

        self.special_char = special_char
        self.numeric = numeric
        self.upper_case = upper_case
        self.lang = lang
        self.model_path = model_path
        self.model = self.get_model(
            model_path=model_path,
            model_dir=self.model_dir, special_char=special_char, numeric=numeric, upper_case=upper_case, lang=lang)

    def predict(self, data):
        # Return the list of neighbour characters for `data`.
        return self.model[data]

    # TODO: Extending to 2 keyboard distance
    @classmethod
    def get_model(cls, model_path, model_dir, special_char=True, numeric=True, upper_case=True, lang="en"):
        """Build the character -> neighbours mapping from a JSON layout file."""
        # If loading customize model, 'lang' parameter will be ignored.
        if model_path is None:
            if lang not in ['en', 'th']:
                raise ValueError('Only support en and th now. You may provide the keyboard mapping '
                                 'such that we can support "{}"'.format(lang))

            model_path = os.path.join(model_dir, lang + '.json')

        if not os.path.exists(model_path):
            raise ValueError('The model_path does not exist. Please check "{}"'.format(model_path))

        with open(model_path, encoding="utf8") as f:
            mapping = json.load(f)

        def is_numeric(ch):
            # Purely-numeric strings (matches the original "^[0-9]*$" test).
            return re.match("^[0-9]*$", ch) is not None

        def is_special(ch):
            # Anything outside lowercase alphanumerics counts as "special".
            return re.match("^[a-z0-9]*$", ch) is None

        candidates = {}

        for key, neighbours in mapping.items():
            # Skip records if key is numeric while include_numeric is false
            if not numeric and is_numeric(key):
                continue
            # Skip record if key is a special character while special_char is false
            if not special_char and is_special(key):
                continue

            candidates[key] = []
            candidates[key.upper()] = []

            for neighbour in neighbours:
                # Apply the same numeric / special-character filters to values.
                if not numeric and is_numeric(neighbour):
                    continue
                if not special_char and is_special(neighbour):
                    continue

                candidates[key].append(neighbour)

                if upper_case:
                    # Mirror the mapping for uppercase keys and values.
                    candidates[key].append(neighbour.upper())
                    candidates[key.upper()].append(neighbour)
                    candidates[key.upper()].append(neighbour.upper())

        cleaned = {}
        for key, neighbours in candidates.items():
            # Drop keys whose every neighbour was filtered out.
            if len(neighbours) == 0:
                continue

            # Remove self-mappings and duplicates; keep deterministic order.
            cleaned[key] = sorted({n for n in neighbours if n != key})

        return cleaned
26 | :param str name: Name of this augmenter 27 | 28 | >>> import nlpaug.augmenter.word as naw 29 | >>> aug = naw.SplitAug() 30 | """ 31 | 32 | def __init__(self, name='Split_Aug', aug_min=1, aug_max=10, aug_p=0.3, min_char=4, stopwords=None, 33 | tokenizer=None, reverse_tokenizer=None, stopwords_regex=None, include_detail=False, verbose=0): 34 | super().__init__( 35 | action=Action.SPLIT, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, stopwords=stopwords, 36 | tokenizer=tokenizer, reverse_tokenizer=reverse_tokenizer, device='cpu', verbose=verbose, 37 | stopwords_regex=stopwords_regex, include_detail=include_detail) 38 | 39 | self.min_char = min_char 40 | 41 | def skip_aug(self, token_idxes, tokens): 42 | results = [] 43 | for token_idx in token_idxes: 44 | if len(tokens[token_idx]) >= self.min_char: 45 | results.append(token_idx) 46 | return results 47 | 48 | def split(self, data): 49 | change_seq = 0 50 | doc = Doc(data, self.tokenizer(data)) 51 | 52 | aug_idxes = self._get_aug_idxes(doc.get_original_tokens()) 53 | aug_idxes.sort(reverse=True) 54 | 55 | if aug_idxes is None or len(aug_idxes) == 0: 56 | if self.include_detail: 57 | return data, [] 58 | return data 59 | 60 | for aug_idx in aug_idxes: 61 | target_token = doc.get_token(aug_idx).get_latest_token().token 62 | separate_pos = self.sample(len(target_token), 1) 63 | prev_token = target_token[:separate_pos] 64 | next_token = target_token[separate_pos:] 65 | 66 | change_seq += 1 67 | doc.add_change_log(aug_idx, new_token=next_token, action=Action.SPLIT, 68 | change_seq=self.parent_change_seq + change_seq) 69 | doc.add_token(aug_idx, token=prev_token, action=Action.SPLIT, 70 | change_seq=self.parent_change_seq + change_seq) 71 | 72 | if self.include_detail: 73 | return self.reverse_tokenizer(doc.get_augmented_tokens()), doc.get_change_logs() 74 | else: 75 | return self.reverse_tokenizer(doc.get_augmented_tokens()) 76 | -------------------------------------------------------------------------------- 
import unittest
import os
import numpy as np
from dotenv import load_dotenv

import nlpaug.augmenter.audio as naa
from nlpaug.util import AudioLoader


class TestNoise(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        # https://en.wikipedia.org/wiki/Colors_of_noise
        cls.noise_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Pink_noise.ogg'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)
        cls.noise, cls.noise_sampling_rate = AudioLoader.load_audio(cls.noise_wav_file)

    def _assert_noise_applied(self, augmented_audio):
        # The augmented signal must differ from the source.
        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        # NOTE(review): assertTrue(a, b) treats `b` as a message and only checks
        # truthiness of `a` — kept as-is to preserve original behavior, but this
        # never compares the two lengths.
        self.assertTrue(len(self.audio), len(augmented_audio))
        self.assertTrue(self.sampling_rate > 0)

    def test_empty_input(self):
        # An empty signal must pass through unchanged.
        empty_audio = np.array([])
        augmented_audio = naa.NoiseAug().augment(empty_audio)

        self.assertTrue(np.array_equal(empty_audio, augmented_audio))

    def test_substitute(self):
        augmented_audio = naa.NoiseAug().augment(self.audio)
        self._assert_noise_applied(augmented_audio)

    def test_color_noise(self):
        # Every supported noise color should produce a modified signal.
        for color in naa.NoiseAug().model.COLOR_NOISES:
            augmented_audio = naa.NoiseAug(color=color).augment(self.audio)
            self._assert_noise_applied(augmented_audio)

    def test_background_noise(self):
        # noise > audio
        augmented_audio = naa.NoiseAug(noises=[self.noise]).augment(self.audio)
        self.assertIsNotNone(augmented_audio)

        # audio > noise
        augmented_audio = naa.NoiseAug(noises=[self.audio]).augment(self.noise)
        self.assertIsNotNone(augmented_audio)

    def _assert_aug_data_size(self, zone, coverage):
        # The injected segment length should match zone span * coverage,
        # within one sample of rounding error.
        expected_aug_data_size = int(len(self.audio) * (zone[1] - zone[0]) * coverage)

        # background noise
        aug = naa.NoiseAug(zone=zone, noises=[self.noise], coverage=coverage)
        aug.model.stateless = False
        aug.augment(self.audio)
        self.assertTrue(-1 <= len(aug.model.aug_data) - expected_aug_data_size <= 1)

        # colored noise
        aug = naa.NoiseAug(zone=zone, color='pink', coverage=coverage)
        aug.model.stateless = False
        aug.augment(self.audio)
        self.assertTrue(-1 <= len(aug.model.aug_data) - expected_aug_data_size <= 1)

    def test_coverage(self):
        self._assert_aug_data_size(zone=(0.3, 0.7), coverage=0.1)

    def test_zone(self):
        self._assert_aug_data_size(zone=(0, 1), coverage=1.)
    def read(self, model_path):
        """Load PPDB paraphrase records from ``model_path`` into ``self.dict``.

        Produces ``{phrase: {pos_tag: [record, ...]}}`` where each record holds
        the paraphrase and any similarity scores passing ``self.score_threshold``.
        Fields in a PPDB line are separated by '|||':
        constituent ||| phrase ||| paraphrase ||| features ||| ... (6 fields in v2.0).
        """
        with open(model_path, 'rb') as f:
            for line in f:
                line = line.decode('utf-8')

                # Skip lines containing mis-encoded byte sequences.
                # NOTE(review): the literal '\\ x' (with an embedded space) looks
                # garbled — presumably '\\x' was intended; confirm against the
                # original data file before changing.
                if '\\ x' in line or 'xc3' in line:
                    continue

                fields = line.split('|||')
                # fields[0] is like '[NN/NNS]': strip brackets, split constituents.
                constituents = fields[0].strip()[1:-1].split('/')
                phrase = fields[1].strip()
                paraphrase = fields[2].strip()

                # filter multiple words: keep only 1:1 token-count paraphrases
                if len(phrase.split()) != len(paraphrase.split()):
                    continue

                scores = []

                if len(fields) == 6:
                    # filter equivalence word ( for PPDB v2.0 only.)
                    # entailment = fields[5].strip()
                    # if entailment == 'Equivalence' and self.is_synonym:
                    #     continue

                    # Keep only features whose scheme appears in the threshold map.
                    features = fields[3].strip().split()
                    features = [feature for feature in features for s in self.score_threshold if
                                s in feature]  # filter by scheme

                    # Collect (scheme, score) pairs that clear their threshold.
                    for feature in features:
                        scheme, score = feature.split('=')
                        if scheme in self.score_threshold and float(score) > self.score_threshold[scheme]:
                            scores.append((scheme, score))

                    # # filter by feature/ score
                    # if len(scores) == 0:
                    #     continue

                if phrase not in self.dict:
                    self.dict[phrase] = {}

                # Expand constituent tags into concrete part-of-speech tags.
                part_of_speeches = [pos for con in constituents for pos in PartOfSpeech.constituent2pos(con)]

                for pos in part_of_speeches:
                    if pos not in self.dict[phrase]:
                        self.dict[phrase][pos] = []

                    self.dict[phrase][pos].append({
                        'phrase': phrase,
                        'part_of_speech': pos,
                        'synonym': paraphrase,
                        'scores': scores
                    })
import random
import numpy as np
import math

from nlpaug.model.audio import Audio


class Noise(Audio):
    # Supported synthetic noise colors; see https://en.wikipedia.org/wiki/Colors_of_noise
    COLOR_NOISES = ['white', 'pink', 'red', 'brown', 'brownian', 'blue', 'azure', 'violet', 'purple']

    def __init__(self, zone=(0.2, 0.8), coverage=1.,
                 color='white', noises=None, stateless=True):
        """
        :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any
            augmentation
            will be applied in first 20% and last 20% of whole audio.
        :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment
            operation will be applied to target audio segment. For example, the audio duration is 60 seconds while
            zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be
            augmented.
        :param str color: Colors of noise. Supported 'white', 'pink', 'red', 'brown', 'brownian', 'blue', 'azure',
            'violet', 'purple' and 'random'. If 'random' is used, noise color will be picked randomly in each augment.
        :param list noises: Background noises for noise injection. You can provide more than one background noise and
            noise will be picked randomly. Expected format is list of numpy array. If this value is provided. `color`
            value will be ignored
        """
        super().__init__(zone=zone, coverage=coverage, stateless=stateless)

        self.color = color
        self.noises = noises

    def validate(self):
        # Reject unknown color names early ('random' is valid on top of the list).
        if self.color not in self.COLOR_NOISES + ['random']:
            raise ValueError('Only support {} while `{}` is passed'.format(self.COLOR_NOISES + ['random'], self.color))

    def color_noise(self, segment_size):
        """Generate `segment_size` samples of colored noise.

        Shapes white noise in the frequency domain by a power of frequency:
        pink ~ 1/f, red/brown ~ 1/f^2, blue ~ f, violet ~ f^2.
        Returns (noise samples, frequency envelope used).
        """
        # https://en.wikipedia.org/wiki/Colors_of_noise
        uneven = segment_size % 2
        fft_size = segment_size // 2 + 1 + uneven  # rfft bin count (padded when odd)
        noise_fft = np.random.randn(fft_size)
        color_noise = np.linspace(1, fft_size, fft_size)  # proportional to frequency

        if self.color == 'random':
            color = np.random.choice(self.COLOR_NOISES)
        else:
            color = self.color
        if color == 'white':
            pass  # no color noise: flat spectrum, skip the shaping below
        else:
            if color == 'pink':
                color_noise = color_noise ** (-1)  # 1/f
            elif color in ['red', 'brown', 'brownian']:
                color_noise = color_noise ** (-2)  # 1/f^2
            elif color in ['blue', 'azure']:
                pass  # f (linspace already rises linearly with frequency)
            elif color in ['violet', 'purple']:
                color_noise = color_noise ** 2  # f^2

            noise_fft = noise_fft * color_noise

        if uneven:
            noise_fft = noise_fft[:-1]

        # Back to time domain.
        noise = np.fft.irfft(noise_fft)
        return noise, color_noise

    def background_noise(self, segment_size):
        """Pick one recorded noise and tile/trim it to `segment_size` samples."""
        # https://arxiv.org/pdf/1608.04363.pdf
        noise = random.sample(self.noises, 1)[0]

        # Get noise segment
        if len(noise) >= segment_size:
            noise_segment = noise[:segment_size]
        else:
            # Repeat the clip until it covers the segment, then trim.
            noise_segment = noise.copy()
            for _ in range(math.ceil(segment_size / len(noise)) - 1):
                noise_segment = np.append(noise_segment, noise)
            noise_segment = noise_segment[:segment_size]

        return noise_segment

    def manipulate(self, data):
        """Mix generated or recorded noise into the configured zone of `data`."""
        aug_segment_size = self.get_augmentation_segment_size(data)
        if self.noises is None:
            # Synthetic colored noise; remember the frequency envelope when stateful.
            noise, color = self.color_noise(aug_segment_size)

            if not self.stateless:
                self.aug_factor = color
        else:
            noise = self.background_noise(aug_segment_size)

        # Keep the raw (pre-padding) noise for inspection when stateful.
        if not self.stateless:
            self.aug_data = noise

        # Zero-pad the noise to full length, aligned to the augment zone.
        noise = self.pad(data, noise)

        # Preserve the input sample dtype on output.
        return (data + noise).astype(type(data[0]))
    def test_spectrogram(self):
        """Frequency/time masking in a Sequential flow zeroes the masked rows/columns."""
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

        mel_spectrogram = AudioLoader.load_mel_spectrogram(sample_wav_file, n_mels=128)

        flow = naf.Sequential([
            nas.FrequencyMaskingAug(mask_factor=50),
            nas.TimeMaskingAug(mask_factor=20),
            nas.TimeMaskingAug(mask_factor=30)])

        augmented_mel_spectrogram = flow.augment(mel_spectrogram)

        for aug in flow:
            if aug.name == 'FrequencyMasking_Aug':
                # Row f0 was fully non-zero before, fully zeroed after masking.
                self.assertEqual(len(mel_spectrogram[aug.model.f0]), np.count_nonzero(mel_spectrogram[aug.model.f0]))
                self.assertEqual(0, np.count_nonzero(augmented_mel_spectrogram[aug.model.f0]))
            elif aug.name == 'TimeMasking_Aug':
                # Column t0 was fully non-zero before, fully zeroed after masking.
                self.assertEqual(len(mel_spectrogram[:, aug.model.t0]),
                                 np.count_nonzero(mel_spectrogram[:, aug.model.t0]))
                self.assertEqual(0, np.count_nonzero(augmented_mel_spectrogram[:, aug.model.t0]))
            else:
                # Unexpected flow
                self.assertFalse(True)

        self.assertTrue(len(flow) > 0)
import numpy as np
import librosa
from nlpaug.model.audio import Audio


class Vtlp(Audio):
    # Vocal Tract Length Perturbation:
    # https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf
    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=0.1, duration=None, factor=(0.9, 1.1), fhi=4800,
                 stateless=True):
        """
        :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
        :param tuple zone: Assign a zone for augmentation. Default value is (0.2, 0.8) which means that no any
            augmentation
            will be applied in first 20% and last 20% of whole audio.
        :param float coverage: Portion of augmentation. Value should be between 0 and 1. If `1` is assigned, augment
            operation will be applied to target audio segment. For example, the audio duration is 60 seconds while
            zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 seconds ((0.8-0.2)*0.7*60) audio will be
            augmented.
        :param int duration: Duration of augmentation (in second). Default value is None. If value is provided.
            `coverage` value will be ignored.
        :param int fhi: Boundary frequency. Default value is 4800.
        :param tuple factor: Warping factor
        """
        super().__init__(zone=zone, coverage=coverage, duration=duration, sampling_rate=sampling_rate,
                         stateless=stateless, factor=factor)
        self.fhi = fhi

    @classmethod
    def get_scale_factors(cls, freq_dim, sampling_rate, fhi=4800, alpha=0.9):
        """Piecewise-linear frequency warp: below the boundary frequency scale
        by ``alpha``; above it, map linearly so the Nyquist frequency is fixed."""
        factors = []
        freqs = np.linspace(0, 1, freq_dim)  # normalized bin positions

        scale = fhi * min(alpha, 1)
        f_boundary = scale / alpha
        half_sr = sampling_rate / 2  # Nyquist frequency

        for f in freqs:
            f *= sampling_rate
            if f <= f_boundary:
                factors.append(f * alpha)
            else:
                # Linear segment from the boundary up to Nyquist (fixed point).
                warp_freq = half_sr - (half_sr - scale) / (half_sr - scale / alpha) * (half_sr - f)
                factors.append(warp_freq)

        return np.array(factors)

    # https://github.com/YerevaNN/Spoken-language-identification/blob/master/augment_data.py#L26
    def _manipulate(self, audio, sampling_rate, factor):
        """Warp the STFT of `audio` by `factor` and resynthesize via ISTFT."""
        stft = librosa.core.stft(audio)
        # NOTE(review): librosa.stft returns shape (1 + n_fft//2, n_frames),
        # i.e. frequency first — these variable names look swapped, and the
        # warp below runs along axis 1. Confirm the intended axis before
        # changing anything; behavior is preserved here as-is.
        time_dim, freq_dim = stft.shape
        data_type = type(stft[0][0])

        factors = self.get_scale_factors(freq_dim, sampling_rate, alpha=factor)
        # Normalize warped positions into the valid bin index range.
        factors *= (freq_dim - 1) / max(factors)
        new_stft = np.zeros([time_dim, freq_dim], dtype=data_type)

        for i in range(freq_dim):
            # first and last freq
            if i == 0 or i + 1 >= freq_dim:
                new_stft[:, i] += stft[:, i]
            else:
                # Linear interpolation: split each bin's energy between the
                # two nearest warped bin positions.
                warp_up = factors[i] - np.floor(factors[i])
                warp_down = 1 - warp_up
                pos = int(np.floor(factors[i]))

                new_stft[:, pos] += warp_down * stft[:, i]
                new_stft[:, pos+1] += warp_up * stft[:, i]

        return librosa.core.istft(new_stft)

    def get_warping_level(self):
        # Draw a warp factor uniformly from the configured (low, high) range.
        return np.random.uniform(self.factor[0], self.factor[1])

    def manipulate(self, data):
        """Warp a segment of `data` (chosen by coverage or duration) and stitch it back."""
        if self.duration is None:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_duration(data)

        factor = self.get_warping_level()
        aug_data = self._manipulate(data[start_pos:end_pos], sampling_rate=self.sampling_rate, factor=factor)

        # Record augmentation details when the model keeps state.
        if not self.stateless:
            self.start_pos = start_pos
            self.end_pos = end_pos
            self.aug_factor = factor
            self.aug_data = aug_data

        # Untouched head + warped segment + untouched tail, original dtype.
        return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0).astype(type(data[0]))

        # if start_pos > 0:
        #     aug_data = np.concatenate((data[:start_pos], aug_data), axis=0)
        # if end_pos < len(data):
        #     aug_data = np.concatenate((aug_data, data[end_pos:]), axis=0)
        #
        # return aug_data.astype(type(data[0]))
"แ", "อ"], 32 | "เ": ["พ", "ะ", "ั", "ด", "้", "แ", "อ", "ิ"], 33 | "้": ["ะ", "ั", "ี", "เ", "่", "อ", "ิ", "ื"], 34 | "่": ["ั", "ี", "ร", "้", "า", "ิ", "ื", "ท"], 35 | "า": ["ี", "ร", "น", "่", "ส", "ื", "ท", "ม"], 36 | "ส": ["ร", "น", "ย", "า", "ว", "ท", "ม", "ใ"], 37 | "ว": ["น", "ย", "บ", "ส", "ง", "ม", "ใ", "ฝ"], 38 | "ง": ["ย", "บ", "ล", "ว", "ฃ", "ใ", "ฝ"], 39 | "ฃ": ["บ", "ล", "ง", "ฝ"], 40 | 41 | "ผ": ["ฟ", "ห", "ก", "ป"], 42 | "ป": ["ห", "ก", "ด", "ผ", "แ"], 43 | "แ": ["ก", "ด", "เ", "ป", "อ"], 44 | "อ": ["ด", "เ", "้", "แ", "ิ"], 45 | "ิ": ["เ", "้", "่", "อ", "ื"], 46 | "ื": ["้", "่", "า", "ิ", "ท"], 47 | "ท": ["่", "า", "ส", "ื", "ม"], 48 | "ม": ["า", "ส", "ว", "ท", "ใ"], 49 | "ใ": ["ส", "ว", "ง", "ม", "ฝ"], 50 | "ฝ": ["ว", "ง", "ฃ", "ใ"], 51 | 52 | "+": ["๑", "๐", "\""], 53 | "๑": ["+", "๒", "๐", "\"", "ฎ"], 54 | "๒": ["๑", "๓", "\"", "ฎ", "ฑ"], 55 | "๓": ["๒", "๔", "ฎ", "ฑ", "ธ"], 56 | "๔": ["๓", "ู", "ฑ", "ธ", "ํ"], 57 | "ู": ["๔", "฿", "ธ", "ํ", "๊"], 58 | "฿": ["ู", "๕", "ํ", "๊", "ณ"], 59 | "๕": ["฿", "๖", "๊", "ณ", "ฯ"], 60 | "๖": ["๕", "๗", "ณ", "ฯ", "ญ"], 61 | "๗": ["๖", "๘", "ฯ", "ญ", "ฐ"], 62 | "๘": ["๗", "๙", "ญ", "ฐ", ","], 63 | "๙": ["๘", "ฐ", ","], 64 | 65 | "๐": ["+", "๑", "\"", "ฤ", "ฆ"], 66 | "\"": ["+", "๑", "๒", "๐", "ฎ", "ฤ", "ฆ", "ฏ"], 67 | "ฎ": ["๑", "๒", "๓", "\"", "ฑ", "ฆ", "ฏ", "โ"], 68 | "ฑ": ["๒", "๓", "๔", "ฎ", "ธ", "ฏ", "โ", "ฌ"], 69 | "ธ": ["๓", "๔", "ู", "ฑ", "ํ", "โ", "ฌ", "็"], 70 | "ํ": ["๔", "ู", "฿", "ธ", "๊", "ฌ", "็", "๋"], 71 | "๊": ["ู", "฿", "๕", "ํ", "ณ", "็", "๋", "ษ"], 72 | "ณ": ["฿", "๕", "๖", "๊", "ฯ", "๋", "ษ", "ศ"], 73 | "ฯ": ["๕", "๖", "๗", "ณ", "ญ", "ษ", "ศ", "ซ"], 74 | "ญ": ["๖", "๗", "๘", "ฯ", "ฐ", "ศ", "ซ", "."], 75 | "ฐ": ["๗", "๘", "๙", "ญ", ",", "ซ", ".", "ฅ"], 76 | ",": ["๘", "๙", "ฐ", ".", "ฅ"], 77 | 78 | "ฤ": ["๐", "\"", "ฆ", "("], 79 | "ฆ": ["๐", "\"", "ฎ", "ฤ", "ฏ", "(", ")"], 80 | "ฏ": ["\"", "ฎ", "ฑ", "ฆ", "โ", "(", ")", "ฉ"], 81 | "โ": ["ฎ", "ฑ", "ธ", "ฏ", "ฌ", ")", "ฉ", "ฮ"], 82 | 
"ฌ": ["ฑ", "ธ", "ํ", "โ", "็", "ฉ", "ฮ", "ฺ"], 83 | "็": ["ธ", "ํ", "๊", "ฌ", "๋", "ฮ", "ฺ", "์"], 84 | "๋": ["ํ", "๊", "ณ", "็", "ษ", "ฺ", "์", "?"], 85 | "ษ": ["๊", "ณ", "ฯ", "๋", "ศ", "์", "?", "ฒ"], 86 | "ศ": ["ณ", "ฯ", "ญ", "ษ", "ซ", "?", "ฒ", "ฬ"], 87 | "ซ": ["ฯ", "ญ", "ฐ", "ศ", ".", "ฒ", "ฬ", "ฦ"], 88 | ".": ["ญ", "ฐ", ",", "ซ", "ฅ", "ฬ", "ฦ"], 89 | "ฅ": ["ฐ", ",", ".", "ฦ"], 90 | 91 | "(": ["ฤ", "ฆ", "ฏ", ")"], 92 | ")": ["ฆ", "ฏ", "โ", "(", "ฉ"], 93 | "ฉ": ["ฏ", "โ", "ฌ", ")", "ฮ"], 94 | "ฮ": ["โ", "ฌ", "็", "ฉ", "ฺ"], 95 | "ฺ": ["ฌ", "็", "๋", "ฮ", "์"], 96 | "์": ["็", "๋", "ษ", "ฺ", "?"], 97 | "?": ["๋", "ษ", "ศ", "์", "ฒ"], 98 | "ฒ": ["ษ", "ศ", "ซ", "?", "ฬ"], 99 | "ฬ": ["ศ", "ซ", ".", "ฒ", "ฦ"], 100 | "ฦ": ["ซ", ".", "ฅ", "ฬ"] 101 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/th.json: -------------------------------------------------------------------------------- 1 | { 2 | "ๅ": ["/", "ๆ", "ไ"], 3 | "/": ["ๅ", "_", "ๆ", "ไ", "ำ"], 4 | "_": ["/", "ภ", "ไ", "ำ", "พ"], 5 | "ภ": ["_", "ถ", "ำ", "พ", "ะ"], 6 | "ถ": ["ภ", "ุ", "พ", "ะ", "ั"], 7 | "ุ": ["ถ", "ึ", "ะ", "ั", "ี"], 8 | "ึ": ["ุ", "ค", "ั", "ี", "ร"], 9 | "ค": ["ึ", "ต", "ี", "ร", "น"], 10 | "ต": ["ค", "จ", "ร", "น", "ย"], 11 | "จ": ["ต", "ข", "น", "ย", "บ"], 12 | "ข": ["จ", "ช", "ย", "บ", "ล"], 13 | "ช": ["ข", "บ", "ล"], 14 | 15 | "ๆ": ["ๅ", "/", "ไ", "ฟ", "ห"], 16 | "ไ": ["ๅ", "/", "_", "ๆ", "ำ", "ฟ", "ห", "ก"], 17 | "ำ": ["/", "_", "ภ", "ไ", "พ", "ห", "ก", "ด"], 18 | "พ": ["_", "ภ", "ถ", "ำ", "ะ", "ก", "ด", "เ"], 19 | "ะ": ["ภ", "ถ", "ุ", "พ", "ั", "ด", "เ", "้"], 20 | "ั": ["ถ", "ุ", "ึ", "ะ", "ี", "เ", "้", "่"], 21 | "ี": ["ุ", "ึ", "ค", "ั", "ร", "้", "่", "า"], 22 | "ร": ["ึ", "ค", "ต", "ี", "น", "่", "า", "ส"], 23 | "น": ["ค", "ต", "จ", "ร", "ย", "า", "ส", "ว"], 24 | "ย": ["ต", "จ", "ข", "น", "บ", "ส", "ว", "ง"], 25 | "บ": ["จ", "ข", "ช", "ย", "ล", "ว", "ง", "ฃ"], 26 | "ล": ["ข", "ช", "บ", "ง", "ฃ"], 27 | 28 | "ฟ": ["ๆ", 
"ไ", "ห", "ผ"], 29 | "ห": ["ๆ", "ไ", "ำ", "ฟ", "ก", "ผ", "ป"], 30 | "ก": ["ไ", "ำ", "พ", "ห", "ด", "ผ", "ป", "แ"], 31 | "ด": ["ำ", "พ", "ะ", "ก", "เ", "ป", "แ", "อ"], 32 | "เ": ["พ", "ะ", "ั", "ด", "้", "แ", "อ", "ิ"], 33 | "้": ["ะ", "ั", "ี", "เ", "่", "อ", "ิ", "ื"], 34 | "่": ["ั", "ี", "ร", "้", "า", "ิ", "ื", "ท"], 35 | "า": ["ี", "ร", "น", "่", "ส", "ื", "ท", "ม"], 36 | "ส": ["ร", "น", "ย", "า", "ว", "ท", "ม", "ใ"], 37 | "ว": ["น", "ย", "บ", "ส", "ง", "ม", "ใ", "ฝ"], 38 | "ง": ["ย", "บ", "ล", "ว", "ฃ", "ใ", "ฝ"], 39 | "ฃ": ["บ", "ล", "ง", "ฝ"], 40 | 41 | "ผ": ["ฟ", "ห", "ก", "ป"], 42 | "ป": ["ห", "ก", "ด", "ผ", "แ"], 43 | "แ": ["ก", "ด", "เ", "ป", "อ"], 44 | "อ": ["ด", "เ", "้", "แ", "ิ"], 45 | "ิ": ["เ", "้", "่", "อ", "ื"], 46 | "ื": ["้", "่", "า", "ิ", "ท"], 47 | "ท": ["่", "า", "ส", "ื", "ม"], 48 | "ม": ["า", "ส", "ว", "ท", "ใ"], 49 | "ใ": ["ส", "ว", "ง", "ม", "ฝ"], 50 | "ฝ": ["ว", "ง", "ฃ", "ใ"], 51 | 52 | "+": ["๑", "๐", "\""], 53 | "๑": ["+", "๒", "๐", "\"", "ฎ"], 54 | "๒": ["๑", "๓", "\"", "ฎ", "ฑ"], 55 | "๓": ["๒", "๔", "ฎ", "ฑ", "ธ"], 56 | "๔": ["๓", "ู", "ฑ", "ธ", "ํ"], 57 | "ู": ["๔", "฿", "ธ", "ํ", "๊"], 58 | "฿": ["ู", "๕", "ํ", "๊", "ณ"], 59 | "๕": ["฿", "๖", "๊", "ณ", "ฯ"], 60 | "๖": ["๕", "๗", "ณ", "ฯ", "ญ"], 61 | "๗": ["๖", "๘", "ฯ", "ญ", "ฐ"], 62 | "๘": ["๗", "๙", "ญ", "ฐ", ","], 63 | "๙": ["๘", "ฐ", ","], 64 | 65 | "๐": ["+", "๑", "\"", "ฤ", "ฆ"], 66 | "\"": ["+", "๑", "๒", "๐", "ฎ", "ฤ", "ฆ", "ฏ"], 67 | "ฎ": ["๑", "๒", "๓", "\"", "ฑ", "ฆ", "ฏ", "โ"], 68 | "ฑ": ["๒", "๓", "๔", "ฎ", "ธ", "ฏ", "โ", "ฌ"], 69 | "ธ": ["๓", "๔", "ู", "ฑ", "ํ", "โ", "ฌ", "็"], 70 | "ํ": ["๔", "ู", "฿", "ธ", "๊", "ฌ", "็", "๋"], 71 | "๊": ["ู", "฿", "๕", "ํ", "ณ", "็", "๋", "ษ"], 72 | "ณ": ["฿", "๕", "๖", "๊", "ฯ", "๋", "ษ", "ศ"], 73 | "ฯ": ["๕", "๖", "๗", "ณ", "ญ", "ษ", "ศ", "ซ"], 74 | "ญ": ["๖", "๗", "๘", "ฯ", "ฐ", "ศ", "ซ", "."], 75 | "ฐ": ["๗", "๘", "๙", "ญ", ",", "ซ", ".", "ฅ"], 76 | ",": ["๘", "๙", "ฐ", ".", "ฅ"], 77 | 78 | "ฤ": ["๐", "\"", "ฆ", "("], 79 | 
class CharAugmenter(Augmenter):
    """Base class for character-level augmenters.

    Provides tokenization, stopword/punctuation filtering and the sampling of
    which words and characters to augment; concrete augmenters (e.g. OCR,
    keyboard) implement the actual substitution logic on top of this.
    """

    # Splits on any non-word character while keeping the separator as a token.
    TOKENIZER_REGEX = re.compile(r'(\W)')

    def __init__(self, action, name='Char_Aug', min_char=2, aug_char_min=1, aug_char_max=10, aug_char_p=0.3,
                 aug_word_min=1, aug_word_max=10, aug_word_p=0.3, tokenizer=None, reverse_tokenizer=None,
                 stopwords=None, device='cpu', verbose=0, stopwords_regex=None, include_special_char=True,
                 include_detail=False):
        """
        :param action: Augmentation action passed through to the base Augmenter.
        :param name: Name of this augmenter instance.
        :param min_char: Words shorter than this many characters are skipped.
        :param aug_char_min: Minimum number of characters augmented per word.
        :param aug_char_max: Maximum number of characters augmented per word.
        :param aug_char_p: Ratio of characters augmented within a chosen word.
        :param aug_word_min: Minimum number of words chosen for augmentation.
        :param aug_word_max: Maximum number of words chosen for augmentation.
        :param aug_word_p: Ratio of words chosen for augmentation.
        :param tokenizer: Optional custom tokenizer; defaults to a regex split.
        :param reverse_tokenizer: Optional custom detokenizer; defaults to
            joining tokens with single spaces.
        :param stopwords: Iterable of words that must never be augmented.
        :param stopwords_regex: Regex pattern (string) matching words to skip.
        :param include_special_char: When False, punctuation tokens are skipped.
        :param include_detail: Whether to return change-log details.
        """
        super().__init__(
            name=name, method=Method.CHAR, action=action, aug_min=None, aug_max=None, device=device, verbose=verbose,
            include_detail=include_detail)
        # Word/char sampling ratios are tracked separately; generic aug_p unused.
        self.aug_p = None
        self.aug_char_min = aug_char_min
        self.aug_char_max = aug_char_max
        self.aug_char_p = aug_char_p
        self.aug_word_min = aug_word_min
        self.aug_word_max = aug_word_max
        self.aug_word_p = aug_word_p
        self.min_char = min_char

        self.tokenizer = tokenizer or self._tokenizer
        self.reverse_tokenizer = reverse_tokenizer or self._reverse_tokenizer
        self.stopwords = stopwords
        # Compile the stopword pattern once up front; keep None when absent.
        self.stopwords_regex = re.compile(stopwords_regex) if stopwords_regex is not None else stopwords_regex
        self.include_special_char = include_special_char

    @classmethod
    def _tokenizer(cls, text):
        """Split *text* into word and separator tokens, dropping pure whitespace."""
        tokens = cls.TOKENIZER_REGEX.split(text)
        return [t for t in tokens if len(t.strip()) > 0]

    @classmethod
    def token2char(cls, word):
        """Return *word* as a list of its individual characters."""
        return list(word)

    @classmethod
    def _reverse_tokenizer(cls, tokens):
        """Join *tokens* back into a single space-separated string."""
        return ' '.join(tokens)

    @classmethod
    def clean(cls, data):
        """Strip leading and trailing whitespace from *data*."""
        return data.strip()

    @classmethod
    def is_duplicate(cls, dataset, data):
        """Return True when *data* already exists in *dataset*."""
        # Membership test uses == per element, identical to a manual loop.
        return data in dataset

    def skip_aug(self, token_idxes, tokens):
        """Hook for subclasses to filter candidate indexes; default keeps all."""
        return token_idxes

    def pre_skip_aug(self, tokens, tuple_idx=None):
        """Return the indexes of tokens that are eligible for augmentation.

        Skips punctuation (unless ``include_special_char`` is set) and
        stopwords, matched both against the stopword list and the stopword
        regex. When *tuple_idx* is given, each token is a tuple and the text
        is taken from that position.
        """
        results = []
        for token_idx, token in enumerate(tokens):
            _token = token[tuple_idx] if tuple_idx is not None else token
            # skip punctuation
            if _token in string.punctuation and not self.include_special_char:
                continue
            # TODO: cannot skip word that were split by tokenizer
            # skip stopwords by list
            if self.stopwords is not None and _token in self.stopwords:
                continue

            # skip stopwords by regex: try the token with every combination of
            # surrounding spaces so boundary-anchored patterns still match
            if self.stopwords_regex is not None and (
                    self.stopwords_regex.match(_token) or self.stopwords_regex.match(' '+_token+' ') or
                    self.stopwords_regex.match(' '+_token) or self.stopwords_regex.match(_token+' ')):
                continue

            results.append(token_idx)

        return results

    def _get_aug_idxes(self, tokens, aug_min, aug_max, aug_p, mode):
        """Sample the positions (word or character indexes) to augment.

        Returns None when the word is too short (char mode) or when no index
        survives the skip filters; otherwise a list of sampled indexes.

        :raises ValueError: if *mode* is neither Method.WORD nor Method.CHAR.
        """
        if mode == Method.CHAR:
            # If word is too short, do not augment it.
            if len(tokens) < self.min_char:
                return None

        aug_cnt = self._generate_aug_cnt(len(tokens), aug_min, aug_max, aug_p)

        if mode == Method.WORD:
            idxes = self.pre_skip_aug(tokens)
        elif mode == Method.CHAR:
            idxes = list(range(len(tokens)))
            idxes = self.skip_aug(idxes, tokens)
        else:
            # Previously an unknown mode crashed later with UnboundLocalError.
            raise ValueError('mode must be Method.WORD or Method.CHAR, got {}'.format(mode))

        if len(idxes) == 0:
            if self.verbose > 0:
                exception = WarningException(name=WarningName.OUT_OF_VOCABULARY,
                                             code=WarningCode.WARNING_CODE_002, msg=WarningMessage.NO_WORD)
                exception.output()
            return None
        # Never request more samples than there are candidates.
        aug_cnt = min(aug_cnt, len(idxes))
        return self.sample(idxes, aug_cnt)