├── .codacy.yml ├── .gitattributes ├── .github └── FUNDING.yml ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CHANGE.md ├── CITED.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── SOURCE.md ├── codecov.yml ├── conda.sh ├── docs ├── Makefile ├── augmenter │ ├── audio │ │ ├── audio.rst │ │ ├── crop.rst │ │ ├── loudness.rst │ │ ├── mask.rst │ │ ├── noise.rst │ │ ├── normalization.rst │ │ ├── pitch.rst │ │ ├── shift.rst │ │ ├── speed.rst │ │ └── vtlp.rst │ ├── augmenter.rst │ ├── char │ │ ├── char.rst │ │ ├── keyboard.rst │ │ ├── ocr.rst │ │ └── random.rst │ ├── sentence │ │ ├── abst_summ.rst │ │ ├── context_word_embs_sentence.rst │ │ ├── lambada.rst │ │ ├── random.rst │ │ └── sentence.rst │ ├── spectrogram │ │ ├── frequency_masking.rst │ │ ├── spectrogram.rst │ │ └── time_masking.rst │ └── word │ │ ├── antonym.rst │ │ ├── back_translation.rst │ │ ├── context_word_embs.rst │ │ ├── random.rst │ │ ├── reserved.rst │ │ ├── spelling.rst │ │ ├── split.rst │ │ ├── synonym.rst │ │ ├── tfidf.rst │ │ ├── word.rst │ │ └── word_embs.rst ├── conf.py ├── example │ └── example.rst ├── flow │ ├── flow.rst │ ├── sequential.rst │ └── sometimes.rst ├── index.rst ├── make.bat ├── overview │ └── overview.rst └── util │ ├── download.rst │ └── util.rst ├── example ├── audio_augmenter.ipynb ├── change_log.ipynb ├── custom_augmenter.ipynb ├── flow.ipynb ├── lambada-train_model.ipynb ├── quick_example.ipynb ├── spectrogram_augmenter.ipynb ├── textual_augmenter.ipynb ├── textual_language_augmenter.ipynb └── tfidf-train_model.ipynb ├── meta.yaml ├── nlpaug ├── .gitignore ├── __init__.py ├── augmenter │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── audio_augmenter.py │ │ ├── crop.py │ │ ├── inversion.py │ │ ├── loudness.py │ │ ├── mask.py │ │ ├── noise.py │ │ ├── normalization.py │ │ ├── pitch.py │ │ ├── shift.py │ │ ├── speed.py │ │ └── vtlp.py │ ├── augment.py │ ├── char │ │ ├── __init__.py │ │ ├── char_augmenter.py │ │ ├── keyboard.py │ │ ├── ocr.py │ │ └── random.py │ ├── 
sentence │ │ ├── __init__.py │ │ ├── abst_summ.py │ │ ├── context_word_embs_sentence.py │ │ ├── lambada.py │ │ ├── random.py │ │ └── sentence_augmenter.py │ ├── spectrogram │ │ ├── __init__.py │ │ ├── frequency_masking.py │ │ ├── loudness.py │ │ ├── spectrogram_augmenter.py │ │ ├── time_masking.py │ │ └── time_warping.py │ └── word │ │ ├── __init__.py │ │ ├── antonym.py │ │ ├── back_translation.py │ │ ├── context_word_embs.py │ │ ├── random.py │ │ ├── reserved.py │ │ ├── spelling.py │ │ ├── split.py │ │ ├── synonym.py │ │ ├── tfidf.py │ │ ├── word_augmenter.py │ │ └── word_embs.py ├── base_augmenter.py ├── flow │ ├── __init__.py │ ├── pipeline.py │ ├── sequential.py │ └── sometimes.py ├── model │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── crop.py │ │ ├── inversion.py │ │ ├── loudness.py │ │ ├── mask.py │ │ ├── noise.py │ │ ├── normalization.py │ │ ├── pitch.py │ │ ├── shift.py │ │ ├── speed.py │ │ └── vtlp.py │ ├── base_model.py │ ├── char │ │ ├── __init__.py │ │ ├── char.py │ │ ├── keyboard.py │ │ └── ocr.py │ ├── lang_models │ │ ├── __init__.py │ │ ├── bart.py │ │ ├── bert.py │ │ ├── distilbert.py │ │ ├── fairseq.py │ │ ├── fill_mask_transformers.py │ │ ├── gpt2.py │ │ ├── lambada.py │ │ ├── language_models.py │ │ ├── machine_translation_transformers.py │ │ ├── roberta.py │ │ ├── summarization_transformers.py │ │ ├── t5.py │ │ ├── text_generation_transformers.py │ │ └── xlnet.py │ ├── spectrogram │ │ ├── __init__.py │ │ ├── frequency_masking.py │ │ ├── loudness.py │ │ ├── spectrogram.py │ │ ├── time_masking.py │ │ └── time_warping.py │ ├── word_dict │ │ ├── __init__.py │ │ ├── ppdb.py │ │ ├── spelling.py │ │ ├── word_dictionary.py │ │ └── wordnet.py │ ├── word_embs │ │ ├── __init__.py │ │ ├── fasttext.py │ │ ├── glove.py │ │ ├── word2vec.py │ │ └── word_embeddings.py │ ├── word_rule │ │ ├── __init__.py │ │ ├── shuffle.py │ │ └── word_rule.py │ └── word_stats │ │ ├── __init__.py │ │ ├── tfidf.py │ │ └── word_statistics.py ├── res │ 
├── char │ │ ├── keyboard │ │ │ ├── de.json │ │ │ ├── en.json │ │ │ ├── es.json │ │ │ ├── fr.json │ │ │ ├── he.json │ │ │ ├── it.json │ │ │ ├── nl.json │ │ │ ├── pl.json │ │ │ ├── th.json │ │ │ ├── tr.json │ │ │ └── uk.json │ │ └── ocr │ │ │ └── en.json │ └── word │ │ └── spelling │ │ └── spelling_en.txt └── util │ ├── __init__.py │ ├── action.py │ ├── audio │ ├── __init__.py │ ├── loader.py │ └── visualizer.py │ ├── decorator │ ├── __init__.py │ └── deprecation.py │ ├── doc │ ├── __init__.py │ ├── change_log.py │ ├── doc.py │ └── token.py │ ├── exception │ ├── __init__.py │ ├── exception_info.py │ └── warning.py │ ├── file │ ├── __init__.py │ ├── download.py │ ├── library.py │ └── read.py │ ├── lib_ver.py │ ├── logger │ ├── __init__.py │ └── logger.py │ ├── math │ ├── __init__.py │ └── normalization.py │ ├── method.py │ ├── selection │ ├── __init__.py │ ├── filtering.py │ └── randomness.py │ └── text │ ├── __init__.py │ ├── part_of_speech.py │ └── tokenizer.py ├── pypi.sh ├── requirements.txt ├── requirements_dev.txt ├── res ├── audio_example.png ├── lambada_algo.png ├── logo_small.png └── textual_example.png ├── script.txt ├── scripts ├── lambada │ ├── data_processing.py │ ├── run_clm.py │ └── train_cls.py ├── run_lambada.py └── train_lambada.sh ├── setup.py └── test ├── __init__.py ├── augmenter ├── __init__.py ├── audio │ ├── __init__.py │ ├── test_audio.py │ ├── test_crop.py │ ├── test_inversion.py │ ├── test_loudness.py │ ├── test_mask.py │ ├── test_noise.py │ ├── test_normalization.py │ ├── test_pitch.py │ ├── test_shift.py │ ├── test_speed.py │ └── test_vtlp.py ├── char │ ├── __init__.py │ ├── test_char.py │ ├── test_keyboard.py │ ├── test_ocr.py │ └── test_random_char.py ├── sentence │ ├── __init__.py │ ├── test_abst_summ.py │ ├── test_context_word_embs_sentence.py │ ├── test_lambada.py │ ├── test_random.py │ └── test_sentence.py ├── spectrogram │ ├── __init__.py │ ├── test_frequency_masking.py │ ├── test_loudness_spec.py │ ├── test_spectrogram.py │ └── 
test_time_masking.py ├── test_audio_augmenter.py ├── test_base_augmenter.py ├── test_text_augmenter.py └── word │ ├── __init__.py │ ├── test_antonym.py │ ├── test_back_translation.py │ ├── test_context_word_embs.py │ ├── test_random_word.py │ ├── test_reserved.py │ ├── test_spelling.py │ ├── test_split.py │ ├── test_synonym.py │ ├── test_tfidf.py │ ├── test_word.py │ └── test_word_embs.py ├── flow ├── __init__.py ├── test_flow.py ├── test_sequential.py └── test_sometimes.py ├── model ├── __init__.py ├── char │ ├── __init__.py │ └── test_keyboard_model.py └── word │ ├── __init__.py │ └── test_word_embs_model.py ├── profiler.py ├── profiling ├── __init__.py ├── sentence │ ├── __init__.py │ └── test_context_word_embs_sentence_profiling.py └── word │ └── profile_context_word_embs.py ├── res ├── audio │ ├── Pink_noise.ogg │ └── Yamaha-V50-Rock-Beat-120bpm.wav ├── common │ └── sample.json └── text │ └── classification.csv ├── run_profile_context_word_embs.sh ├── run_profile_import.sh ├── run_test.py └── util ├── __init__.py ├── selection ├── __init__.py └── test_filtering.py └── text └── test_tokenizer.py /.codacy.yml: -------------------------------------------------------------------------------- 1 | exclude_paths: 2 | - test/* 3 | - README.md 4 | - CHANGE.md 5 | - SOURCE.md 6 | - docs/conf.py -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [makcedward] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi 
username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | *.zip 28 | .DS_Store 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | notebook/ 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # IDE 110 | .idea/ 111 | 112 | # model 113 | model/* 114 | *.txt 115 | *.bin 116 | *.vec 117 | *.zip 118 | *.gz -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | python: 2 | version: 3.8 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | install: 6 | - pip install -r requirements.txt 7 | - pip install coverage 8 | - pip install codecov 9 | - pip install . 
10 | 11 | script: 12 | - python test/run_test.py 13 | 14 | after_success: 15 | - codecov -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Edward Ma 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include nlpaug/res *.json 2 | recursive-include nlpaug/res *.txt -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | #see https://github.com/codecov/support/wiki/Codecov-Yaml 2 | codecov: 3 | notify: 4 | require_ci_to_pass: yes 5 | 6 | coverage: 7 | precision: 2 # 2 = xx.xx%, 0 = xx% 8 | round: nearest # how coverage is rounded: down/up/nearest 9 | range: 10...90 # custom range of coverage colors from red -> yellow -> green 10 | status: 11 | # https://codecov.readme.io/v1.0/docs/commit-status 12 | project: 13 | default: 14 | against: auto 15 | target: 40% # specify the target coverage for each commit status 16 | threshold: 20% # allow this little decrease on project 17 | # https://github.com/codecov/support/wiki/Filtering-Branches 18 | # branches: master 19 | if_ci_failed: error 20 | # https://github.com/codecov/support/wiki/Patch-Status 21 | patch: 22 | default: 23 | against: parent 24 | target: 30% # specify the target "X%" coverage to hit 25 | # threshold: 50% # allow this much decrease on patch 26 | changes: false 27 | 28 | parsers: 29 | gcov: 30 | branch_detection: 31 | conditional: true 32 | loop: true 33 | macro: false 34 | method: false 35 | javascript: 36 | enable_partials: false 37 | 38 | comment: 39 | layout: header, diff 40 | require_changes: false 41 | behavior: default # update if exists else create new 42 | branches: * -------------------------------------------------------------------------------- /conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | build_name='.' 
4 | pkg_name='nlpaug' 5 | py_vers=(3.5 3.6 3.7 3.8 3.9) 6 | pkg_ver='1.1.10' 7 | conda_dir="/home/edward/anaconda3/conda-bld" 8 | 9 | echo "Building conda package ..." 10 | for i in "${py_vers[@]}" 11 | do 12 | conda-build --python $i $build_name 13 | done 14 | 15 | echo "Converting package to other platforms" 16 | platforms=(osx-64 linux-32 win-32 win-64) 17 | find "$conda_dir"/linux-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 | while read file 18 | do 19 | for platform in "${platforms[@]}" 20 | do 21 | conda convert --platform $platform $file -o "$conda_dir" 22 | done 23 | done 24 | 25 | echo "Upload to Anaconda" 26 | for platform in "${platforms[@]}" 27 | do 28 | find "$conda_dir"/"$platform"/"$pkg_name"*"$pkg_ver"*.tar.bz2 | while read file 29 | do 30 | anaconda upload --force $file 31 | done 32 | done 33 | 34 | 35 | anaconda upload --force "$conda_dir"/linux-32/"$pkg_name"*"$pkg_ver"*.tar.bz2 36 | anaconda upload --force "$conda_dir"/linux-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 37 | anaconda upload --force "$conda_dir"/win-32/"$pkg_name"*"$pkg_ver"*.tar.bz2 38 | anaconda upload --force "$conda_dir"/win-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 39 | anaconda upload --force "$conda_dir"/osx-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 40 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = nlpaug 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/augmenter/audio/audio.rst: -------------------------------------------------------------------------------- 1 | Audio Augmenter 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./crop 8 | ./loudness 9 | ./mask 10 | ./noise 11 | ./normalization 12 | ./pitch 13 | ./shift 14 | ./speed 15 | ./vtlp -------------------------------------------------------------------------------- /docs/augmenter/audio/crop.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.crop 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.crop 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/loudness.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.loudness 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.loudness 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/mask.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.mask 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.mask 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/noise.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.noise 2 | ============================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.audio.noise 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/normalization.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.normalization 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.normalization 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/pitch.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.pitch 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.pitch 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/shift.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.shift 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.shift 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/speed.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.speed 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.speed 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/vtlp.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.vtlp 2 | ============================ 3 | 4 | .. 
automodule:: nlpaug.augmenter.audio.vtlp 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/augmenter.rst: -------------------------------------------------------------------------------- 1 | Augmenter 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./audio/audio 8 | ./char/char 9 | ./sentence/sentence 10 | ./spectrogram/spectrogram 11 | ./word/word -------------------------------------------------------------------------------- /docs/augmenter/char/char.rst: -------------------------------------------------------------------------------- 1 | Character Augmenter 2 | =================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./keyboard 8 | ./ocr 9 | ./random -------------------------------------------------------------------------------- /docs/augmenter/char/keyboard.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.keyboard 2 | =============================== 3 | 4 | .. automodule:: nlpaug.augmenter.char.keyboard 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/ocr.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.ocr 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.char.ocr 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.random 2 | ============================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.char.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/abst_summ.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.abst_summ 2 | ===================================================== 3 | 4 | .. automodule:: nlpaug.augmenter.sentence.abst_summ 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/context_word_embs_sentence.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.context_word_embs_sentence 2 | ===================================================== 3 | 4 | .. automodule:: nlpaug.augmenter.sentence.context_word_embs_sentence 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/lambada.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.lambada 2 | ===================================================== 3 | 4 | .. automodule:: nlpaug.augmenter.sentence.lambada 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.random 2 | ===================================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.sentence.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/sentence.rst: -------------------------------------------------------------------------------- 1 | Sentence Augmenter 2 | ================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./abst_summ 8 | ./context_word_embs_sentence 9 | ./lambada 10 | ./random -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/frequency_masking.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.spectrogram\.frequency_masking 2 | =============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.spectrogram.frequency_masking 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/spectrogram.rst: -------------------------------------------------------------------------------- 1 | Spectrogram Augmenter 2 | ===================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./frequency_masking 8 | ./time_masking -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/time_masking.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.spectrogram\.time_masking 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.augmenter.spectrogram.time_masking 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/antonym.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.antonym 2 | ============================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.word.antonym 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/back_translation.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.back_translation 2 | ======================================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.back_translation 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/context_word_embs.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.context_word_embs 2 | ======================================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.context_word_embs 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.random 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/reserved.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.reserved 2 | ================================ 3 | 4 | .. 
automodule:: nlpaug.augmenter.word.reserved 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/spelling.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.spelling 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.spelling 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/split.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.split 2 | ============================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.split 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/synonym.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.synonym 2 | ============================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.synonym 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/tfidf.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.tfidf 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.tfidf 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/word.rst: -------------------------------------------------------------------------------- 1 | Word Augmenter 2 | ============== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 6 6 | 7 | ./antonym 8 | ./back_translation 9 | ./context_word_embs 10 | ./random 11 | ./reserved 12 | ./spelling 13 | ./split 14 | ./synonym 15 | ./tfidf 16 | ./word_embs -------------------------------------------------------------------------------- /docs/augmenter/word/word_embs.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.word_embs 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.word_embs 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/example/example.rst: -------------------------------------------------------------------------------- 1 | Example 2 | ======= 3 | 4 | The following examples show a standard use case for augmenter. 5 | 6 | - `Audio augmenters`_ 7 | - `Textual augmenters`_ 8 | - `Spectrogram augmenters`_ 9 | - `Custom augmenter`_ 10 | - `TF-IDF model training`_ 11 | - `Flow`_ 12 | 13 | .. _Audio augmenters: https://github.com/makcedward/nlpaug/blob/master/example/audio_augmenter.ipynb 14 | .. _Textual augmenters: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb 15 | .. _Spectrogram augmenters: https://github.com/makcedward/nlpaug/blob/master/example/spectrogram_augmenter.ipynb 16 | .. _Custom augmenter: https://github.com/makcedward/nlpaug/blob/master/example/custom_augmenter.ipynb 17 | .. _TF-IDF model training: https://github.com/makcedward/nlpaug/blob/master/example/tfidf-train_model.ipynb 18 | .. _Flow: https://github.com/makcedward/nlpaug/blob/master/example/flow.ipynb -------------------------------------------------------------------------------- /docs/flow/flow.rst: -------------------------------------------------------------------------------- 1 | Flow 2 | ==== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 3 6 | 7 | ./sequential 8 | ./sometimes 9 | -------------------------------------------------------------------------------- /docs/flow/sequential.rst: -------------------------------------------------------------------------------- 1 | nlpaug.flow\.sequential 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.flow.sequential 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/flow/sometimes.rst: -------------------------------------------------------------------------------- 1 | nlpaug.flow\.sometimes 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.flow.sometimes 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | nlpaug 2 | ====== 3 | 4 | `nlpgaug` is a library for textual augmentation in machine learning experiments. 5 | The goal is improving deep learning model performance by generating textual data. 6 | It also able to generate adversarial examples to prevent adversarial attacks. 7 | 8 | .. toctree:: 9 | :maxdepth: 3 10 | :caption: Contents: 11 | 12 | ./overview/overview 13 | ./example/example 14 | ./augmenter/augmenter 15 | ./flow/flow 16 | ./util/util 17 | 18 | See :ref:`modindex` for API. 19 | 20 | Indices and tables 21 | ================== 22 | 23 | * :ref:`genindex` 24 | * :ref:`modindex` 25 | * :ref:`search` -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 
11 | set BUILDDIR=../build 12 | set SPHINXPROJ=nlpaug 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/overview/overview.rst: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | This python library helps you with augmenting nlp for your machine learning projects. Visit this introduction to understand about Data Augmentation in NLP. Augmenter is the basic element of augmentation while Flow is a pipeline to orchestra multi augmenter together. 5 | 6 | - `Data Augmentation library for Text`_ 7 | - `Data Augmentation library for Speech Recognition`_ 8 | - `Data Augmentation library for Audio`_ 9 | - `Does your NLP model able to prevent adversarial attack?`_ 10 | 11 | .. _Data Augmentation library for Text: https://towardsdatascience.com/data-augmentation-library-for-text-9661736b13ff 12 | .. _Data Augmentation library for Speech Recognition: https://towardsdatascience.com/data-augmentation-for-speech-recognition-e7c607482e78 13 | .. _Data Augmentation library for Audio: https://towardsdatascience.com/data-augmentation-for-audio-76912b01fdf6 14 | .. 
_Does your NLP model able to prevent adversarial attack?: https://medium.com/hackernoon/does-your-nlp-model-able-to-prevent-adversarial-attack-45b5ab75129c -------------------------------------------------------------------------------- /docs/util/download.rst: -------------------------------------------------------------------------------- 1 | nlpaug.util.file\.download 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.util.file.download 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/util/util.rst: -------------------------------------------------------------------------------- 1 | Util 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | ./download 8 | -------------------------------------------------------------------------------- /meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "nlpaug" %} 2 | {% set version = "1.1.9" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | requirements: 9 | host: 10 | - pip 11 | - python 12 | run: 13 | - python 14 | 15 | about: 16 | home: "https://github.com/makcedward/nlpaug" 17 | license: MIT 18 | summary: "Natural language processing augmentation library for deep neural networks." 
19 | dev_url: "https://nlpaug.readthedocs.io/" 20 | -------------------------------------------------------------------------------- /nlpaug/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 
-------------------------------------------------------------------------------- /nlpaug/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.base_augmenter import * 3 | 4 | __all__ = ['base_augmenter'] 5 | 6 | __version__ = '1.1.11' 7 | __description__ = 'Natural language processing augmentation library for deep neural networks.' 8 | __url__ = 'https://github.com/makcedward/nlpaug' 9 | __author__ = 'Edward Ma' 10 | __author_email__ = 'makcedward@gmail.com' -------------------------------------------------------------------------------- /nlpaug/augmenter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/nlpaug/augmenter/__init__.py -------------------------------------------------------------------------------- /nlpaug/augmenter/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.audio.audio_augmenter import AudioAugmenter 3 | from nlpaug.augmenter.audio.noise import NoiseAug 4 | from nlpaug.augmenter.audio.shift import ShiftAug 5 | from nlpaug.augmenter.audio.speed import SpeedAug 6 | from nlpaug.augmenter.audio.pitch import PitchAug 7 | from nlpaug.augmenter.audio.loudness import LoudnessAug 8 | from nlpaug.augmenter.audio.crop import CropAug 9 | from nlpaug.augmenter.audio.mask import MaskAug 10 | from nlpaug.augmenter.audio.vtlp import VtlpAug 11 | from nlpaug.augmenter.audio.normalization import NormalizeAug 12 | from nlpaug.augmenter.audio.inversion import PolarityInverseAug 13 | 14 | __all__ = ['audio_augmenter', 'noise', 'shift', 'speed', 'pitch', 'loudness', 'crop', 'mask', 'vtlp', 15 | 'normalization', 'inversion'] 16 | -------------------------------------------------------------------------------- 
class AudioAugmenter(Augmenter):
    """Base class for all audio augmenters.

    Holds the parameters shared by the audio augmentation operations (target
    zone, coverage, factor range, duration) and provides helpers to sample a
    random augmentation factor and to compute the segment of the signal that
    will be augmented.

    :param str action: Augmenter action (e.g. substitute, delete).
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
    :param float coverage: Portion (0-1) of the zone that will be augmented.
    :param tuple factor: (low, high) range from which the augmentation factor is sampled.
    :param float duration: Length (in seconds) of the segment to augment.
    :param str name: Name of this augmenter.
    """

    def __init__(self, action, zone=None, coverage=None, factor=None, duration=None, name='Audio_Aug',
                 device='cpu', verbose=0, stateless=True):
        super(AudioAugmenter, self).__init__(
            name=name, method=Method.AUDIO, action=action, aug_min=None, aug_max=None, device=device,
            verbose=verbose)

        self.zone = zone
        self.coverage = coverage
        self.factor = factor
        self.duration = duration
        self.stateless = stateless

    @classmethod
    def clean(cls, data):
        # Audio input needs no cleaning before augmentation.
        return data

    @classmethod
    def is_duplicate(cls, dataset, data):
        # Two augmented signals are duplicates when they are element-wise equal.
        for d in dataset:
            if np.array_equal(d, data):
                return True
        return False

    def get_random_factor(self, low=None, high=None, dtype='float'):
        """Sample a random factor in [low, high), falling back to ``self.factor``.

        Bug fix: the previous implementation used ``low if low else ...`` /
        ``high if high else ...``, which silently ignored an explicit bound of
        0 (falsy); compare against ``None`` instead. The redundant duplicated
        fallback ``return`` is also removed: any non-'int' dtype samples a
        uniform float.
        """
        lower_bound = low if low is not None else self.factor[0]
        upper_bound = high if high is not None else self.factor[1]
        if dtype == 'int':
            return np.random.randint(lower_bound, upper_bound)
        return np.random.uniform(lower_bound, upper_bound)

    def get_augmentation_segment_size(self, data):
        # Number of samples to augment: zone width * coverage of the whole signal.
        return int(len(data) * (self.zone[1] - self.zone[0]) * self.coverage)

    def get_augment_range_by_coverage(self, data):
        """Pick a random (start, end) segment covering ``coverage`` of the zone."""
        zone_start, zone_end = int(len(data) * self.zone[0]), int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(zone_size * self.coverage)
        last_start = zone_start + int(zone_size * (1 - self.coverage))

        if zone_start == last_start:
            # Coverage (effectively) fills the zone: use the whole zone.
            start_pos = zone_start
            end_pos = zone_end
        else:
            start_pos = np.random.randint(zone_start, last_start)
            end_pos = start_pos + target_size

        return start_pos, end_pos

    def get_augment_range_by_duration(self, data):
        """Pick a random (start, end) segment of ``duration`` seconds within the zone.

        NOTE: relies on ``self.sampling_rate`` being set by the subclass before
        this helper is called.
        """
        zone_start, zone_end = int(len(data) * self.zone[0]), int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(self.sampling_rate * self.duration)

        if target_size >= zone_size:
            # Requested duration exceeds the zone: clamp to the whole zone.
            start_pos = zone_start
            end_pos = zone_end
        else:
            last_start = zone_start + zone_size - target_size
            start_pos = np.random.randint(zone_start, last_start)
            end_pos = start_pos + target_size

        return start_pos, end_pos
class CropAug(AudioAugmenter):
    """
    Augmenter that deletes a randomly selected segment of the audio.

    :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 25.2-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int duration: Duration of augmentation (in second). Default is None.
        When provided, `coverage` is ignored.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.CropAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=0.1, duration=None, name='Crop_Aug',
                 verbose=0, stateless=True):
        super().__init__(
            action=Action.DELETE, zone=zone, coverage=coverage, duration=duration, name=name,
            device='cpu', verbose=verbose, stateless=stateless)
        self.sampling_rate = sampling_rate
        self.model = nma.Crop()

    def delete(self, data):
        # Duration-based selection takes precedence over coverage when given.
        if self.duration is not None:
            start_pos, end_pos = self.get_augment_range_by_duration(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)

        if not self.stateless:
            self.start_pos, self.end_pos = start_pos, end_pos

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos)
class PolarityInverseAug(AudioAugmenter):
    """
    Augmenter that flips the sign (polarity) of a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 25.2-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.PolarityInverseAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=0.3, name='PolarityInverse_Aug', verbose=0, stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, name=name, device='cpu', verbose=verbose,
            stateless=stateless)
        self.model = nma.PolarityInversion()

    def substitute(self, data):
        segment = self.get_augment_range_by_coverage(data)
        if not self.stateless:
            # Record the chosen segment for callers that inspect augmenter state.
            self.start_pos, self.end_pos = segment
        return self.model.manipulate(data, start_pos=segment[0], end_pos=segment[1])
class LoudnessAug(AudioAugmenter):
    """
    Augmenter that scales the volume of a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param tuple factor: Range from which the gain is sampled; values between 0 and 1
        reduce the volume.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.LoudnessAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), name='Loudness_Aug', verbose=0,
                 stateless=True):
        super().__init__(action=Action.SUBSTITUTE, name=name, zone=zone, coverage=coverage,
                         factor=factor, device='cpu', verbose=verbose, stateless=stateless)
        self.model = nma.Loudness()

    def substitute(self, data):
        # Sample the gain first, then the segment, preserving the RNG call order.
        gain = self.get_random_factor()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, gain

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos, loudness_level=gain)
class MaskAug(AudioAugmenter):
    """
    Augmenter that replaces a randomly chosen audio segment with noise or silence.

    :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int duration: Duration of augmentation (in second). Default is None.
        When provided, `coverage` is ignored.
    :param bool mask_with_noise: If True, the target area is replaced by noise;
        otherwise it is replaced by 0.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.MaskAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=1., duration=None,
                 mask_with_noise=True, name='Mask_Aug', verbose=0, stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, duration=duration,
            name=name, device='cpu', verbose=verbose, stateless=stateless)

        # Bug fix: sampling_rate was accepted but never stored, so the documented
        # duration-based behavior could not work (get_augment_range_by_duration
        # reads self.sampling_rate).
        self.sampling_rate = sampling_rate
        self.mask_with_noise = mask_with_noise
        self.model = nma.Mask()

    def substitute(self, data):
        # Bug fix: honor `duration` as documented ("coverage value will be
        # ignored"); previously the coverage path was always taken, unlike the
        # sibling CropAug/VtlpAug augmenters.
        if self.duration is None:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_duration(data)

        if not self.stateless:
            self.start_pos, self.end_pos = start_pos, end_pos

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos,
                                     mask_with_noise=self.mask_with_noise)
class NoiseAug(AudioAugmenter):
    """
    Augmenter that injects noise into a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param str color: Color of generated noise. Supported: 'white', 'pink', 'red',
        'brown', 'brownian', 'blue', 'azure', 'violet', 'purple' and 'random'.
        With 'random', a noise color is picked in each augment.
    :param list noises: Optional background noises (list of numpy arrays); one is
        picked randomly per augment. When provided, `color` is ignored.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.NoiseAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., color='white', noises=None, name='Noise_Aug',
                 verbose=0, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, name=name,
                         device='cpu', verbose=verbose, stateless=stateless)
        self.color = color
        self.noises = noises
        self.model = nma.Noise()
        # Fail fast on unsupported noise colors.
        self.model.validate(color)

    def substitute(self, data):
        start_pos, end_pos = self.get_augment_range_by_coverage(data)
        segment_size = end_pos - start_pos

        noise, picked_color = self.model.get_noise_and_color(segment_size, self.noises, self.color)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, picked_color

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos, noise=noise)
class NormalizeAug(AudioAugmenter):
    """
    Augmenter that normalizes a randomly chosen audio segment.

    :param str method: One of 'minmax', 'max', 'standard' or 'random'. For
        'minmax', data is shifted by the minimum and divided by the (max - min)
        range. For 'max', data is divided by the maximum only. For 'standard',
        data is shifted by the mean and divided by the standard deviation. With
        'random', a method is picked in each augment.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 25.2-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.NormalizeAug()
    """

    def __init__(self, method='max', zone=(0.2, 0.8), coverage=0.3, name='Normalize_Aug', verbose=0,
                 stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, name=name, device='cpu',
            verbose=verbose, stateless=stateless)
        self.model = nma.Normalization()
        self.method = method
        # Reject unsupported methods at construction time.
        self.validate()

    def random_method(self):
        # Pick one supported normalization method uniformly at random.
        return self.sample(self.model.get_support_methods(), 1)[0]

    def substitute(self, data):
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        method = self.method if self.method != 'random' else self.random_method()

        if not self.stateless:
            self.start_pos, self.end_pos, self.run_method = start_pos, end_pos, method

        return self.model.manipulate(data, method=method, start_pos=start_pos, end_pos=end_pos)

    def validate(self):
        supported = ['random'] + self.model.get_support_methods()
        if self.method not in supported:
            raise ValueError('{} does not support yet. You may pick one of {}'.format(
                self.method, supported))
        return True
class PitchAug(AudioAugmenter):
    """
    Augmenter that shifts the pitch of a randomly chosen audio segment.

    :param int sampling_rate: Sampling rate of input audio.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int duration: Duration of augmentation (in second). Default is None.
        When provided, `coverage` is ignored.
    :param tuple factor: Range from which the pitch adjustment is sampled. Pitch
        is reduced if the value is between 0 and 1.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.PitchAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=1., duration=None,
                 factor=(-10, 10), name='Pitch_Aug', verbose=0, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=factor,
                         duration=duration, name=name, device='cpu', verbose=verbose, stateless=stateless)
        self.sampling_rate = sampling_rate
        self.model = nma.Pitch()

    def substitute(self, data):
        pitch_level = self.get_random_factor()

        # Bug fix: honor `duration` as documented ("coverage value will be
        # ignored"); previously the coverage path was always taken even though
        # `duration` was accepted and forwarded to the base class, unlike the
        # sibling CropAug/VtlpAug augmenters.
        if self.duration is None:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_duration(data)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, pitch_level

        return self.model.manipulate(data, start_pos, end_pos, pitch_level, self.sampling_rate)
class ShiftAug(AudioAugmenter):
    """
    Augmenter that shifts the audio forward or backward in time.

    :param int sampling_rate: Sampling rate of input audio.
    :param float duration: Max shifting segment (in second)
    :param str direction: Shifting segment to left, right or one of them. Value can be 'left', 'right' or 'random'
    :param str name: Name of this augmenter

    NOTE(review): both `direction` and `shift_direction` are accepted;
    validation runs on `shift_direction` while the shifting logic reads
    `direction` — confirm whether one of them is a deprecated alias.

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.ShiftAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, duration=3, direction='random', shift_direction='random',
            name='Shift_Aug', verbose=0, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, name=name, duration=duration, device='cpu', verbose=verbose,
            stateless=stateless)

        self.sampling_rate = sampling_rate
        self.direction = direction
        self.shift_direction = shift_direction
        self.model = nma.Shift()

        # Validation is applied to shift_direction, not direction (see class note).
        self.model.validate(shift_direction)

    def _get_aug_shift(self):
        # Shift size in samples: duration (seconds) * sampling rate.
        aug_shift = int(self.sampling_rate * self.duration)
        if self.direction == 'right':
            # Negative shift moves the signal to the right.
            return -aug_shift
        elif self.direction == 'random':
            # NOTE(review): self.sample(4)-1 presumably yields values in
            # {-1, 0, 1, 2}, so `direction == 1` holds in only ~1 of 4 draws —
            # i.e. right-shift is picked ~25% of the time rather than 50%.
            # Confirm against the base Augmenter.sample() semantics whether
            # this bias is intended.
            direction = self.sample(4)-1
            if direction == 1:
                return -aug_shift

        # 'left' (and the remaining random draws) shift to the left.
        return aug_shift

    def substitute(self, data):
        aug_shift = self._get_aug_shift()

        if not self.stateless:
            # Record the applied shift for callers that inspect augmenter state.
            self.aug_factor = aug_shift

        return self.model.manipulate(data, aug_shift)
class SpeedAug(AudioAugmenter):
    """
    Augmenter that changes the speed of a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param tuple factor: Range from which the speed ratio is sampled; the audio
        is slowed down if the value is between 0 and 1.
    :param tuple speed_range: Deprecated. Use `factor` instead.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.SpeedAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), name='Speed_Aug', verbose=0,
                 stateless=True):
        super().__init__(action=Action.SUBSTITUTE, name=name, zone=zone, coverage=coverage,
                         factor=factor, device='cpu', verbose=verbose, stateless=stateless)
        self.model = nma.Speed()

    def substitute(self, data):
        # Sample the speed ratio first, then the segment, preserving RNG call order.
        ratio = self.get_random_factor()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, ratio

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos, speed=ratio)
class VtlpAug(AudioAugmenter):
    # Reference: https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf
    """
    Augmenter that applies vocal tract length perturbation (VTLP) to a
    randomly chosen audio segment.

    :param int sampling_rate: Sampling rate of input audio.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int fhi: Boundary frequency. Default value is 4800.
    :param tuple factor: Range from which the warp factor is sampled.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.VtlpAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=0.1, fhi=4800, factor=(0.9, 1.1),
                 name='Vtlp_Aug', verbose=0, stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=factor, name=name,
            device='cpu', verbose=verbose, stateless=stateless)
        self.sampling_rate = sampling_rate
        self.fhi = fhi
        self.model = nma.Vtlp()

    def substitute(self, data):
        # Duration-based selection takes precedence over coverage when given
        # (duration defaults to None via the base class).
        if self.duration is not None:
            start_pos, end_pos = self.get_augment_range_by_duration(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)

        warp = self.get_random_factor()

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, warp

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos,
                                     sampling_rate=self.sampling_rate, warp_factor=warp)
class Augment:
    """Lightweight record of a single augmentation.

    Captures the position of the change within the input, the original value
    at that position, and the value it was replaced with.
    """

    def __init__(self, pos, original, new):
        # Position of the change within the input sequence.
        self.pos = pos
        # Value before augmentation.
        self.original = original
        # Value after augmentation.
        self.new = new
nlpaug.augmenter.char.ocr import * 4 | from nlpaug.augmenter.char.random import * 5 | from nlpaug.augmenter.char.keyboard import * 6 | -------------------------------------------------------------------------------- /nlpaug/augmenter/sentence/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.sentence.sentence_augmenter import * 3 | from nlpaug.augmenter.sentence.context_word_embs_sentence import * 4 | from nlpaug.augmenter.sentence.abst_summ import * 5 | from nlpaug.augmenter.sentence.lambada import * 6 | from nlpaug.augmenter.sentence.random import * -------------------------------------------------------------------------------- /nlpaug/augmenter/sentence/random.py: -------------------------------------------------------------------------------- 1 | """ 2 | Augmenter that apply operation (sentence level) to textual input based on abstractive summarization. 3 | """ 4 | 5 | import os 6 | 7 | 8 | from nlpaug.augmenter.sentence import SentenceAugmenter 9 | import nlpaug.model.word_rule as nmr 10 | from nlpaug.util import Action, Doc 11 | 12 | 13 | class RandomSentAug(SentenceAugmenter): 14 | 15 | """ 16 | Augmenter that apply randomly behavior for augmentation. 17 | 18 | :param str mode: Shuffle sentence to left, right, neighbor or random position. For `left`, target sentence 19 | will be swapped with left sentnece. For `right`, target sentence will be swapped with right sentnece. 20 | For `neighbor`, target sentence will be swapped with left or right sentnece radomly. For `random`, 21 | target sentence will be swapped with any sentnece randomly. 22 | :param float aug_p: Percentage of sentence will be augmented. 23 | :param int aug_min: Minimum number of sentence will be augmented. 24 | :param int aug_max: Maximum number of sentence will be augmented. If None is passed, number of augmentation is 25 | calculated via aup_p. 
If calculated result from aug_p is smaller than aug_max, will use calculated result from 26 | aug_p. Otherwise, using aug_max. 27 | :param func tokenizer: Customize tokenization process 28 | :param str name: Name of this augmenter 29 | 30 | >>> import nlpaug.augmenter.sentence as nas 31 | >>> aug = nas.RandomSentAug() 32 | """ 33 | 34 | def __init__(self, mode='neighbor', action=Action.SWAP, name='RandomSent_Aug', aug_min=1, aug_max=10, aug_p=0.3, 35 | tokenizer=None, verbose=0): 36 | super().__init__( 37 | action=action, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, verbose=verbose) 38 | 39 | self.model = nmr.Shuffle(mode=mode, model_type='sentence', tokenizer=tokenizer) 40 | 41 | def pre_skip_aug(self, data): 42 | return list(range(len(data))) 43 | 44 | # https://arxiv.org/abs/1910.13461 45 | def swap(self, data): 46 | if not data: 47 | return data 48 | 49 | if isinstance(data, list): 50 | all_data = data 51 | else: 52 | if data.strip() == '': 53 | return data 54 | all_data = [data] 55 | 56 | for i, d in enumerate(all_data): 57 | sentences = self.model.tokenize(d) 58 | aug_idxes = self._get_random_aug_idxes(sentences) 59 | for aug_idx in aug_idxes: 60 | sentences = self.model.predict(sentences, aug_idx) 61 | all_data[i] = ' '.join(sentences) 62 | 63 | # TODO: always return array 64 | if isinstance(data, list): 65 | return all_data 66 | else: 67 | return all_data[0] 68 | 69 | -------------------------------------------------------------------------------- /nlpaug/augmenter/sentence/sentence_augmenter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from nlpaug.util import Method 4 | from nlpaug.util.text.tokenizer import Tokenizer 5 | from nlpaug import Augmenter 6 | from typing import Iterable 7 | 8 | 9 | class SentenceAugmenter(Augmenter): 10 | def __init__(self, action, name='Sentence_Aug', stopwords=None, tokenizer=None, reverse_tokenizer=None, 11 | device='cuda', aug_min=None, aug_max=None, 
aug_p=None, include_detail=False, verbose=0): 12 | super().__init__( 13 | name=name, method=Method.SENTENCE, action=action, aug_min=aug_min, aug_max=aug_max, aug_p=aug_p, 14 | device=device, verbose=verbose, include_detail=include_detail) 15 | self.tokenizer = tokenizer or Tokenizer.tokenizer 16 | self.reverse_tokenizer = reverse_tokenizer or Tokenizer.reverse_tokenizer 17 | self.stopwords = stopwords 18 | 19 | @classmethod 20 | def clean(cls, data): 21 | if isinstance(data, str): 22 | return data.strip() 23 | if isinstance(data, Iterable): 24 | return [d.strip() for d in data] 25 | return str(data).strip() 26 | 27 | @classmethod 28 | def is_duplicate(cls, dataset, data): 29 | for d in dataset: 30 | if d == data: 31 | return True 32 | return False 33 | -------------------------------------------------------------------------------- /nlpaug/augmenter/spectrogram/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.spectrogram.spectrogram_augmenter import * 3 | from nlpaug.augmenter.spectrogram.frequency_masking import * 4 | from nlpaug.augmenter.spectrogram.time_masking import * 5 | from nlpaug.augmenter.spectrogram.loudness import * -------------------------------------------------------------------------------- /nlpaug/augmenter/spectrogram/loudness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nlpaug.augmenter.spectrogram import SpectrogramAugmenter 4 | from nlpaug.util import Action 5 | import nlpaug.model.spectrogram as nms 6 | 7 | 8 | class LoudnessAug(SpectrogramAugmenter): 9 | """ 10 | Augmenter that change loudness on mel spectrogram by random values. 11 | 12 | :param tuple zone: Default value is (0.2, 0.8). Assign a zone for augmentation. By default, no any augmentation 13 | will be applied in first 20% and last 20% of whole audio. 
14 | :param float coverage: Default value is 1 and value should be between 0 and 1. Portion of augmentation. 15 | If `1` is assigned, augment operation will be applied to target audio segment. For example, the audio 16 | duration is 60 seconds while zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 17 | seconds ((0.8-0.2)*0.7*60) audio will be augmented. 18 | :param tuple factor: Default value is (0.5, 2). Volume change value will be picked within the range of this 19 | tuple value. Volume will be reduced if value is between 0 and 1. Otherwise, volume will be increased. 20 | :param str name: Name of this augmenter 21 | """ 22 | def __init__(self, name='Loudness_Aug', zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), verbose=0, 23 | silence=False, stateless=True): 24 | super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=factor, 25 | verbose=verbose, name=name, silence=silence, stateless=stateless) 26 | 27 | self.model = nms.Loudness() 28 | 29 | def substitute(self, data): 30 | # https://arxiv.org/pdf/2001.01401.pdf 31 | 32 | loudness_level = self.get_random_factor() 33 | time_start, time_end = self.get_augment_range_by_coverage(data) 34 | 35 | if not self.stateless: 36 | self.time_start, self.time_end, self.loudness_level = time_start, time_end, loudness_level 37 | 38 | return self.model.manipulate(data, loudness_level=loudness_level, time_start=time_start, time_end=time_end) 39 | -------------------------------------------------------------------------------- /nlpaug/augmenter/spectrogram/spectrogram_augmenter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nlpaug.util import Method 4 | from nlpaug import Augmenter 5 | 6 | 7 | class SpectrogramAugmenter(Augmenter): 8 | def __init__(self, action, zone=None, coverage=None, factor=None, name='Spectrogram_Aug', device='cpu', 9 | verbose=0, stateless=True, silence=False): 10 | super().__init__(name=name, 
method=Method.SPECTROGRAM, action=action, aug_min=None, 11 | aug_max=None, device=device, verbose=verbose) 12 | 13 | self.zone = zone 14 | self.coverage = coverage 15 | self.factor = factor 16 | self.stateless = stateless 17 | self.silence = silence 18 | 19 | if self.zone[0] < 0: 20 | raise ValueError('Lower bound of zone is smaller than {}.'.format(0) + 21 | ' It should be larger than {}'.format(0)) 22 | 23 | if self.zone[1] > 1: 24 | raise ValueError('Upper bound of zone is larger than {}.'.format(1) + 25 | ' It should be smaller than {}'.format(1)) 26 | 27 | if self.coverage < 0 or self.coverage > 1: 28 | raise ValueError('Coverage value should be between than 0 and 1 while ' + 29 | 'input value is {}'.format(self.coverage)) 30 | 31 | @classmethod 32 | def clean(cls, data): 33 | return data 34 | 35 | @classmethod 36 | def is_duplicate(cls, dataset, data): 37 | for d in dataset: 38 | if np.array_equal(d, data): 39 | return True 40 | return False 41 | 42 | def get_random_factor(self, low=None, high=None, dtype='float'): 43 | lower_bound = self.factor[0] if low is None else low 44 | upper_bound = self.factor[1] if high is None else high 45 | if dtype == 'int': 46 | return np.random.randint(lower_bound, upper_bound) 47 | elif dtype == 'float': 48 | return np.random.uniform(lower_bound, upper_bound) 49 | else: 50 | return np.random.uniform(lower_bound, upper_bound) 51 | 52 | def get_augment_range_by_coverage(self, data): 53 | zone_start, zone_end = int(data.shape[1] * self.zone[0]), int(data.shape[1] * self.zone[1]) 54 | zone_size = zone_end - zone_start 55 | 56 | target_size = int(zone_size * self.coverage) 57 | last_start = zone_start + int(zone_size * (1 - self.coverage)) 58 | 59 | if zone_start == last_start: 60 | start_pos = zone_start 61 | end_pos = zone_end 62 | else: 63 | start_pos = np.random.randint(zone_start, last_start) 64 | end_pos = start_pos + target_size 65 | 66 | return start_pos, end_pos 67 | 
class TimeMaskingAug(SpectrogramAugmenter):
    """
    Augmenter that masks a block of consecutive time steps of a mel spectrogram
    with random values.

    :param tuple zone: Default value is (0.2, 0.8). Assign a zone for augmentation. By default, no
        augmentation will be applied in first 20% and last 20% of whole audio.
    :param float coverage: Default value is 1 and value should be between 0 and 1. Portion of augmentation.
        If `1` is assigned, augment operation will be applied to target audio segment. For example, the
        audio duration is 60 seconds while zone and coverage are (0.2, 0.8) and 0.7 respectively. 42
        seconds ((0.8-0.2)*0.7*60) audio will be chosen for augmentation.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.spectrogram as nas
    >>> aug = nas.TimeMaskingAug()
    """

    def __init__(self, name='TimeMasking_Aug', zone=(0.2, 0.8), coverage=1., verbose=0,
                 silence=False, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=(1, 1),
                         verbose=verbose, name=name, silence=silence, stateless=stateless)

        self.model = nms.TimeMasking()

    def substitute(self, data):
        # From https://arxiv.org/pdf/1904.08779.pdf: t consecutive time steps
        # [t0, t0 + t) are masked, where t is drawn from a uniform distribution
        # and t0 from the configured zone.
        tau = data.shape[1]
        t0, time_end = self.get_augment_range_by_coverage(data)
        t = self.get_random_factor(high=time_end, dtype='int')

        if not self.stateless:
            self.tau, self.t, self.t0 = tau, t, t0

        return self.model.manipulate(data, t=t, t0=t0)
class Sequential(Pipeline):
    """
    Flow that applies every augmenter in `flow` one after another.

    :param list flow: list of flow or augmenter
    :param str name: Name of this augmenter

    >>> import nlpaug.flow as naf
    >>> import nlpaug.augmenter.char as nac
    >>> import nlpaug.augmenter.word as naw
    >>> flow = naf.Sequential([nac.RandomCharAug(), naw.RandomWordAug()])
    """

    def __init__(self, flow=None, name='Sequential_Pipeline', verbose=0):
        super().__init__(name=name, action=Action.SEQUENTIAL, flow=flow,
                         include_detail=False, verbose=verbose)

    def draw(self):
        # A sequential pipeline always executes every step.
        return True
15 | :param str name: Name of this augmenter 16 | 17 | >>> import nlpaug.flow as naf 18 | >>> import nlpaug.augmenter.char as nac 19 | >>> import nlpaug.augmenter.word as naw 20 | >>> flow = naf.Sometimes([nac.RandomCharAug(), naw.RandomWordAug()]) 21 | """ 22 | 23 | def __init__(self, flow=None, name='Sometimes_Pipeline', aug_p=0.8, verbose=0): 24 | Pipeline.__init__(self, name=name, action=Action.SOMETIMES, 25 | flow=flow, aug_p=aug_p, include_detail=False, verbose=verbose) 26 | 27 | def draw(self): 28 | return self.aug_p > self.prob() 29 | -------------------------------------------------------------------------------- /nlpaug/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/nlpaug/model/__init__.py -------------------------------------------------------------------------------- /nlpaug/model/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.audio.audio import * 3 | from nlpaug.model.audio.noise import * 4 | from nlpaug.model.audio.shift import * 5 | from nlpaug.model.audio.speed import * 6 | from nlpaug.model.audio.pitch import * 7 | from nlpaug.model.audio.loudness import * 8 | from nlpaug.model.audio.crop import * 9 | from nlpaug.model.audio.mask import * 10 | from nlpaug.model.audio.vtlp import * 11 | from nlpaug.model.audio.normalization import * 12 | from nlpaug.model.audio.inversion import * 13 | -------------------------------------------------------------------------------- /nlpaug/model/audio/audio.py: -------------------------------------------------------------------------------- 1 | class Audio: 2 | def manipulate(self, data): 3 | raise NotImplementedError 4 | -------------------------------------------------------------------------------- /nlpaug/model/audio/crop.py: 
class Mask(Audio):
    """Replace a segment of the audio signal with noise or silence."""

    def manipulate(self, data, start_pos, end_pos, mask_with_noise):
        segment_len = end_pos - start_pos
        if mask_with_noise:
            replacement = np.random.randn(segment_len)
        else:
            replacement = np.zeros(segment_len)

        augmented = data.copy()
        augmented[start_pos:end_pos] = replacement

        return augmented
class Normalization(Audio):
    """Normalize a segment of an audio signal and return a modified copy."""

    def manipulate(self, data, method, start_pos, end_pos):
        """Return a copy of `data` with [start_pos:end_pos] normalized.

        :param str method: One of 'minmax', 'max' or 'standard'.
        :raises ValueError: If `method` is unsupported (previously this path
            crashed later with a confusing UnboundLocalError).
        """
        if method not in self.get_support_methods():
            raise ValueError('method should be one of {} while {} is passed.'.format(
                self.get_support_methods(), method))

        aug_data = data.copy()
        if method == 'minmax':
            new_data = self._min_max(aug_data[start_pos:end_pos])
        elif method == 'max':
            new_data = self._max(aug_data[start_pos:end_pos])
        elif method == 'standard':
            new_data = self._standard(aug_data[start_pos:end_pos])

        aug_data[start_pos:end_pos] = new_data

        return aug_data

    def get_support_methods(self):
        # Methods accepted by manipulate().
        return ['minmax', 'max', 'standard']

    def _standard(self, data):
        # Z-score normalization.
        return (data - np.mean(data)) / np.std(data)

    def _max(self, data):
        # Scale by the maximum absolute amplitude.
        return data / np.amax(np.abs(data))

    def _min_max(self, data):
        # Rescale relative to the minimum absolute amplitude.
        lower = np.amin(np.abs(data))
        return (data - lower) / (np.amax(np.abs(data)) - lower)
# Reference: https://www.kaggle.com/CVxTz/audio-data-augmentation
class Shift(Audio):
    """Time-shift an audio signal, silencing the wrapped-around samples."""

    def validate(self, direction):
        # Message fixed: accepted values are left/right/random ('both' was never valid).
        if direction not in ['left', 'right', 'random']:
            raise ValueError(
                'shift_direction should be either left, right or random while {} is passed.'.format(direction))

    def manipulate(self, data, shift):
        """Roll `data` by `shift` samples and zero the vacated head/tail."""
        aug_data = np.roll(data.copy(), shift)
        # Set to silence for heading/ tailing. A zero shift must be a no-op;
        # previously `aug_data[0:] = 0` silenced the entire signal.
        if shift > 0:
            aug_data[:shift] = 0
        elif shift < 0:
            aug_data[shift:] = 0
        return aug_data
Install it via `pip install librosa`') 20 | 21 | def manipulate(self, data, start_pos, end_pos, speed): 22 | aug_data = librosa.effects.time_stretch(y=data[start_pos:end_pos], rate=speed) 23 | return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0) 24 | -------------------------------------------------------------------------------- /nlpaug/model/audio/vtlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | try: 3 | import librosa 4 | except ImportError: 5 | # No installation required if not using this function 6 | pass 7 | 8 | from nlpaug.model.audio import Audio 9 | 10 | 11 | class Vtlp(Audio): 12 | # https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf 13 | def __init__(self): 14 | super().__init__() 15 | 16 | try: 17 | import librosa 18 | except ModuleNotFoundError: 19 | raise ModuleNotFoundError('Missed librosa library. Install import librosa by `pip install librosa`') 20 | 21 | # http://www.cs.toronto.edu/~hinton/absps/perturb.pdf 22 | @classmethod 23 | def get_scale_factors(cls, freq_dim, sampling_rate, fhi=4800, alpha=0.9): 24 | factors = [] 25 | freqs = np.linspace(0, 1, freq_dim) 26 | 27 | scale = fhi * min(alpha, 1) 28 | f_boundary = scale / alpha 29 | half_sr = sampling_rate / 2 30 | 31 | for f in freqs: 32 | f *= sampling_rate 33 | if f <= f_boundary: 34 | factors.append(f * alpha) 35 | else: 36 | warp_freq = half_sr - (half_sr - scale) / (half_sr - scale / alpha) * (half_sr - f) 37 | factors.append(warp_freq) 38 | 39 | return np.array(factors) 40 | 41 | # https://github.com/YerevaNN/Spoken-language-identification/blob/master/augment_data.py#L26 42 | def _manipulate(self, audio, sampling_rate, factor): 43 | stft = librosa.core.stft(audio) 44 | freq_dim, time_dim = stft.shape 45 | data_type = type(stft[0][0]) 46 | 47 | factors = self.get_scale_factors(freq_dim, sampling_rate, alpha=factor) 48 | factors *= (freq_dim - 1) / max(factors) 49 | new_stft 
class Model:
    @classmethod
    def sample(cls, x, num=None):
        """Randomly sample from `x`.

        :param x: A list (sampled without replacement) or an int upper bound
            (random integers drawn from [0, x)).
        :param num: Number of samples; None yields a single scalar.
        :raises ValueError: If `x` is neither a list nor an int (previously
            this silently returned None).
        """
        if isinstance(x, list):
            return np.random.choice(x, size=num, replace=False)
        if isinstance(x, int):
            return np.random.randint(0, x, size=num)
        raise ValueError('x should be a list or an int while {} is passed.'.format(type(x).__name__))
    def __init__(self, special_char=True, numeric=True, upper_case=True, cache=True, lang="en", model_path=None):
        """Keyboard-distance character model.

        :param bool special_char: Include special characters in the mapping.
        :param bool numeric: Include digits in the mapping.
        :param bool upper_case: Include upper-case variants in the mapping.
        :param bool cache: Passed through to the base Character model.
        :param str lang: Keyboard layout language code (e.g. "en").
        :param str model_path: Path to the JSON keyboard-mapping file; must exist.
        """
        super().__init__(cache)

        self.special_char = special_char
        self.numeric = numeric
        self.upper_case = upper_case
        self.lang = lang
        self.model_path = model_path
        # Build the character-substitution mapping eagerly from the JSON layout file.
        self.model = self.get_model(model_path=model_path, special_char=special_char, numeric=numeric,
                                    upper_case=upper_case, lang=lang)
class Ocr(Character):
    """Character model mapping characters to visually-confusable OCR errors."""

    def __init__(self, model, cache=True):
        """
        :param dict model: Mapping from a character to the list of characters
            OCR engines commonly confuse it with.
        :param bool cache: Passed through to the base Character model.
        """
        super().__init__(cache)

        self.model = self.generate_mapping(model)

    def generate_mapping(self, mapping):
        """Return `mapping` extended with the reverse direction of every entry."""
        result = {}

        for k in mapping:
            result[k] = mapping[k]

        # Reverse mapping: if 'o' can be misread as '0', then '0' can be misread as 'o'.
        for k in mapping:
            for v in mapping[k]:
                if v not in result:
                    result[v] = []

                if k not in result[v]:
                    result[v].append(k)
        return result

    def predict(self, data):
        """Return the substitution candidates for a character (KeyError if unknown)."""
        return self.model[data]
class XSumTransformers(LanguageModels):
    """Abstractive summarization model backed by a HuggingFace `summarization` pipeline.

    :param model_name: model name or path understood by `transformers.pipeline` (default "t5-base")
    :param tokenizer_name: tokenizer name; falls back to `model_name` when None
    :param min_length: minimum length of the generated summary
    :param max_length: maximum length of the generated summary
    :param temperature: sampling temperature forwarded to the pipeline
    :param top_k: top-k sampling parameter
    :param top_p: nucleus sampling parameter
    :param batch_size: number of texts summarized per pipeline call
    :param device: 'cuda', 'cpu' or a device id understood by `convert_device`
    :param silence: suppress transformers' expected weight-initialization warnings
    """

    def __init__(self, model_name="t5-base", tokenizer_name=None, min_length=10, max_length=20,
                 temperature=1.0, top_k=50, top_p=0.9, batch_size=32, device='cuda', silence=True):
        super().__init__(device, model_type=None, silence=silence)
        try:
            from transformers import pipeline
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed transformers library. Install transfomers by `pip install transformers`')

        self.model_name = model_name
        self.tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
        self.min_length = min_length
        self.max_length = max_length
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.batch_size = batch_size

        device = self.convert_device(device)

        if silence:
            # Transformers throws an expected warning regarding weight initialization;
            # temporarily raise the log level while the pipeline is built.
            logger = logging.getLogger('transformers.modeling_utils')
            orig_log_level = logger.getEffectiveLevel()
            logger.setLevel(logging.ERROR)
            self.model = pipeline("summarization", model=self.model_name, tokenizer=self.tokenizer_name,
                                  device=device, framework="pt")
            # BUG FIX: `orig_log_level` was previously referenced on the restore line even
            # when silence=False (where it is never assigned), raising NameError; the
            # restore now happens only on the silenced path, mirroring TextGenTransformers.
            logger.setLevel(orig_log_level)
        else:
            self.model = pipeline("summarization", model=self.model_name, tokenizer=self.tokenizer_name,
                                  device=device, framework="pt")

    def get_device(self):
        """Return the device the underlying pipeline model lives on (e.g. 'cpu', 'cuda:0')."""
        return str(self.model.device)

    def predict(self, texts, target_words=None, n=1):
        """Summarize `texts` in batches of `self.batch_size`; return a list of summary strings.

        `target_words` and `n` are accepted for interface compatibility and ignored.
        """
        results = []
        with torch.no_grad():
            for i in range(0, len(texts), self.batch_size):
                predict_result = self.model(texts[i:i+self.batch_size],
                                            min_length=self.min_length,
                                            max_length=self.max_length,
                                            temperature=self.temperature,
                                            top_k=self.top_k,
                                            top_p=self.top_p,
                                            num_workers=1)
                # Pipelines return a list for batched input, a single mapping otherwise.
                if isinstance(predict_result, list):
                    results.extend(predict_result)
                else:
                    results.append(predict_result)
        results = [r['summary_text'] for r in results]

        return results
class TextGenTransformers(LanguageModels):
    """Text generation model backed by a HuggingFace `text-generation` pipeline (e.g. GPT-2).

    :param model_path: model name or path understood by `transformers.pipeline`
    :param device: 'cuda', 'cpu' or a device id understood by `convert_device`
    :param min_length: minimum length of generated text
    :param max_length: maximum length of generated text
    :param batch_size: number of prompts generated per pipeline call
    :param temperature: sampling temperature
    :param top_k: top-k sampling parameter
    :param top_p: nucleus sampling parameter
    :param silence: suppress transformers' expected weight-initialization warnings
    """

    def __init__(self, model_path='gpt2', device='cuda', min_length=100, max_length=300,
                 batch_size=32, temperature=1.0, top_k=50, top_p=0.9, silence=True):
        super().__init__(device, model_type=None, silence=silence)
        try:
            from transformers import pipeline
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed transformers library. Install transfomers by `pip install transformers`')

        self.min_length = min_length
        self.max_length = max_length
        self.batch_size = batch_size
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.model_path = model_path
        self.device = self.convert_device(device)

        if silence:
            # Transformers throws an expected warning regarding weight initialization;
            # temporarily raise the log level while the pipeline is built.
            logger = logging.getLogger('transformers.modeling_utils')
            orig_log_level = logger.getEffectiveLevel()
            logger.setLevel(logging.ERROR)
            self.model = pipeline("text-generation", model=model_path, device=self.device)
            logger.setLevel(orig_log_level)
        else:
            self.model = pipeline("text-generation", model=model_path, device=self.device)

    def to(self, device):
        """Move the underlying model to `device`."""
        self.model.model.to(device)

    def get_device(self):
        """Return the device the underlying pipeline model lives on (e.g. 'cpu', 'cuda:0')."""
        return str(self.model.device)

    def predict(self, texts, target_words=None, n=1):
        """Generate one continuation per input text; return a list of generated strings.

        `target_words` and `n` are accepted for interface compatibility and ignored.
        """
        results = []
        with torch.no_grad():
            for i in range(0, len(texts), self.batch_size):
                predict_result = self.model(
                    texts[i:i+self.batch_size],
                    pad_token_id=50256,  # GPT-2 EOS token doubles as the pad token
                    min_length=self.min_length,
                    max_length=self.max_length,
                    temperature=self.temperature,
                    top_k=self.top_k,
                    top_p=self.top_p,
                    do_sample=True,
                    num_return_sequences=1,
                    num_workers=1
                )
                # BUG FIX: previously only list results were collected and anything else
                # was dropped silently (no `else` branch), so single/non-list pipeline
                # outputs vanished. Handle both nested-list and flat/dict results,
                # consistent with XSumTransformers.predict.
                if isinstance(predict_result, list):
                    for item in predict_result:
                        if isinstance(item, list):
                            results.extend(item)
                        else:
                            results.append(item)
                else:
                    results.append(predict_result)

        return [r['generated_text'] for r in results]
class Spectrogram:
    """Base interface for spectrogram manipulation models."""

    def manipulate(self, data):
        raise NotImplementedError


class FrequencyMasking(Spectrogram):
    """Mask a band of consecutive mel-frequency channels.

    https://arxiv.org/pdf/1904.08779.pdf, https://arxiv.org/pdf/2001.01401.pdf
    Frequency masking is applied so that f consecutive mel frequency channels
    [f0, f0 + f) are masked, where f is first chosen from a uniform distribution
    from 0 to the frequency mask parameter F, and f0 is chosen from [0, v - f),
    v being the number of mel frequency channels.
    """

    def __init__(self):
        super().__init__()

    def manipulate(self, data, f, f0, time_start, time_end):
        """Return a copy of `data` with channels [f0, f0+f) zeroed over [time_start, time_end)."""
        masked = data.copy()
        masked[f0:f0 + f, time_start:time_end] = 0
        return masked


class Loudness(Spectrogram):
    """Scale the loudness of a time segment of a spectrogram.

    https://arxiv.org/pdf/2001.01401.pdf
    """

    def __init__(self):
        super().__init__()

    def manipulate(self, data, loudness_level, time_start, time_end):
        """Return a copy of `data` with columns [time_start, time_end) scaled by loudness_level * 1000."""
        scaled = data.copy()
        scaled[:, time_start:time_end] = scaled[:, time_start:time_end] * loudness_level * 1000
        return scaled
class TimeMasking(Spectrogram):
    """Mask a run of consecutive time steps in a spectrogram.

    From: https://arxiv.org/pdf/1904.08779.pdf,
    Time masking is applied so that t consecutive time steps [t0, t0 + t) are
    masked, where t is first chosen from a uniform distribution from 0 to the
    time mask parameter T, and t0 is chosen from [0, tau - t).
    """

    def __init__(self):
        super().__init__()

    def manipulate(self, data, t, t0):
        """Return a copy of `data` with time columns [t0, t0+t) zeroed across all channels."""
        masked = data.copy()
        masked[:, t0:t0 + t] = 0
        return masked
class Spelling(WordDictionary):
    """Spelling-mistake dictionary loaded from a space-separated mapping file.

    Source data:
    English Neutral Rewriting: https://github.com/ybisk/charNMT-noise/blob/master/noise/en.natural
    """

    def __init__(self, dict_path, include_reverse=True, cache=True):
        super().__init__(cache)

        self.dict_path = dict_path
        self.include_reverse = include_reverse

        self._init()

    def _init(self):
        # word -> list of misspelling candidates (plus the reverse direction when enabled)
        self.dict = {}
        self.read(self.dict_path)

    def read(self, model_path):
        """Populate `self.dict` from `model_path`; each line is `word candidate [candidate ...]`."""
        with open(model_path, 'r', encoding="utf-8") as f:
            for raw_line in f:
                parts = raw_line.split(' ')
                # The last token still carries the newline separator.
                parts[-1] = parts[-1].replace('\n', '')

                word, candidates = parts[0], parts[1:]

                entry = self.dict.setdefault(word, [])
                entry.extend(candidates)
                # Remove duplicate mappings.
                self.dict[word] = list(set(self.dict[word]))

                if self.include_reverse:
                    # Also index each candidate back to its source word.
                    for candidate in candidates:
                        reverse_entry = self.dict.setdefault(candidate, [])
                        if word not in reverse_entry:
                            reverse_entry.append(word)

    def predict(self, data):
        """Return the candidate replacements for `data`, or None when it is unknown."""
        return self.dict.get(data)
class WordDictionary:
    """Abstract base class for dictionary-backed word models (e.g. spelling, WordNet, PPDB)."""

    def __init__(self, cache=True):
        # cache: whether downstream augmenters may cache this model's predictions.
        self.cache = cache

    # pylint: disable=R0201
    def train(self, data):
        """Build the dictionary from raw data; must be overridden by subclasses."""
        raise NotImplementedError

    # pylint: disable=R0201
    def predict(self, data):
        """Return candidate substitutions for `data`; must be overridden by subclasses."""
        raise NotImplementedError

    # pylint: disable=R0201
    def save(self, model_path):
        """Persist the dictionary to `model_path`; must be overridden by subclasses."""
        raise NotImplementedError

    # pylint: disable=R0201
    def read(self, model_path):
        """Load the dictionary from `model_path`; must be overridden by subclasses."""
        raise NotImplementedError
class WordNet(WordDictionary):
    """WordNet-backed dictionary returning synonyms or antonyms for a word."""

    def __init__(self, lang, is_synonym=True):
        super().__init__(cache=True)

        self.lang = lang          # WordNet language code (e.g. 'eng')
        self.is_synonym = is_synonym  # True -> synonyms, False -> antonyms

        try:
            import nltk
            from nltk.corpus import wordnet
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed nltk library. Install nltk by `pip install nltk`')

        self.model = self.read()

    def read(self):
        """Return the wordnet corpus, downloading the required data on first use."""
        try:
            wordnet.synsets('testing')
        except LookupError:
            nltk.download('wordnet')
            nltk.download('omw-1.4')
        return wordnet

    def predict(self, word, pos=None):
        """Return synonym (or antonym) lemma names for `word`, optionally filtered by `pos`."""
        candidates = []
        for synset in self.model.synsets(word, pos=pos, lang=self.lang):
            for lemma in synset.lemmas(lang=self.lang):
                if self.is_synonym:
                    candidates.append(lemma.name())
                else:
                    candidates.extend(antonym.name() for antonym in lemma.antonyms())
        return candidates

    @classmethod
    def pos_tag(cls, tokens):
        """POS-tag `tokens`, downloading the tagger data on first use."""
        try:
            return nltk.pos_tag(tokens)
        except LookupError:
            nltk.download('averaged_perceptron_tagger')
            return nltk.pos_tag(tokens)
class Fasttext(WordEmbeddings):
    """fastText embedding wrapper. https://arxiv.org/pdf/1712.09405.pdf"""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

        try:
            from gensim.models import KeyedVectors
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed gensim library. Install transfomers by `pip install gensim`')

        self.model = None
        self.words = []

    def read(self, file_path, max_num_vector=None):
        """Load fastText vectors stored in word2vec text format."""
        self.model = KeyedVectors.load_word2vec_format(file_path, limit=max_num_vector)
        super()._read()


# Download locations for the published pre-trained GloVe archives.
pre_trained_model_url = {
    'glove_6b': 'http://nlp.stanford.edu/data/glove.6B.zip',
    'glove_42b_300d': 'http://nlp.stanford.edu/data/glove.42B.300d.zip',
    'glove_840b_300d': 'http://nlp.stanford.edu/data/glove.840B.300d.zip',
    'glove_twitter_27b': 'http://nlp.stanford.edu/data/glove.twitter.27B.zip',
}


class GloVe(WordEmbeddings):
    """GloVe embedding wrapper. https://nlp.stanford.edu/pubs/glove.pdf"""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

        try:
            from gensim.models import KeyedVectors
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed gensim library. Install transfomers by `pip install gensim`')

        self.model = None
        self.words = []

    def read(self, file_path, max_num_vector=None):
        """Load GloVe vectors (plain text, no header line)."""
        self.model = KeyedVectors.load_word2vec_format(file_path, binary=False, no_header=True, limit=max_num_vector)
        super()._read()


class Word2vec(WordEmbeddings):
    """word2vec embedding wrapper. https://arxiv.org/pdf/1301.3781.pdf"""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

        try:
            from gensim.models import KeyedVectors
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed gensim library. Install transfomers by `pip install gensim`')

        self.model = None
        self.words = []

    def read(self, file_path, max_num_vector=None):
        """Load word2vec vectors stored in binary format."""
        self.model = KeyedVectors.load_word2vec_format(file_path, binary=True, limit=max_num_vector)
        super()._read()
class WordEmbeddings:
    """Base class for gensim KeyedVectors-backed word embedding models.

    :param top_k: number of similar words returned by `predict`
    :param skip_check: subclass flag; not used directly in this base class
    """

    def __init__(self, top_k=100, skip_check=True):
        self.top_k = top_k
        self.skip_check = skip_check
        self.emb_size = 0    # embedding dimensionality, filled in by _read()
        self.vocab_size = 0  # number of words in the loaded vocabulary
        self.words = []

    def read(self, file_path, max_num_vector):
        """Load the vectors from disk; must be overridden by subclasses."""
        raise NotImplementedError

    def _read(self):
        """Populate vocabulary metadata from the loaded gensim model (`self.model`)."""
        self.words = list(self.model.index_to_key)
        # BUG FIX: emb_size previously stored the first word's embedding *vector*
        # (self.model[...]), inconsistent with its integer default of 0; it now
        # stores the embedding dimensionality.
        self.emb_size = len(self.model[self.model.index_to_key[0]])
        self.vocab_size = len(self.words)

    def download(self, model_path):
        """Fetch pre-trained vectors; must be overridden by subclasses that support it."""
        raise NotImplementedError

    def get_vocab(self):
        """Return the list of known words."""
        return self.words

    @classmethod
    def _normalize(cls, vectors, norm='l2'):
        """Normalize `vectors` with the chosen scheme ('l2', 'l1' or 'standard')."""
        if norm == 'l2':
            return normalization.l2_norm(vectors)
        elif norm == 'l1':
            return normalization.l1_norm(vectors)
        elif norm == 'standard':
            return normalization.standard_norm(vectors)

    def predict(self, word, n=1):
        """Return up to `top_k` most similar words, excluding `word` itself (case-insensitive)."""
        # Fetch one extra candidate since the query word itself may be in the result.
        result = self.model.most_similar(word, topn=self.top_k + 1)
        result = [w for w, s in result if w.lower() != word.lower()]
        return result[:self.top_k]
class Shuffle(WordRule):
    """Shuffle units within a document; currently only sentence-level shuffling is supported.

    :param model_type: unit granularity; must be one of `TYPES` (currently 'sentence')
    :param mode: 'neighbor' (random left/right), 'left', 'right' or 'random'
    :param tokenizer: custom tokenizer callable; defaults to nltk's sent_tokenize
    """

    TYPES = ['sentence']

    def __init__(self, model_type, mode='neighbor', tokenizer=None):
        super().__init__(cache=True)

        self.model_type = model_type  # sentence, word or character (only sentence implemented)
        self.mode = mode

        if tokenizer:
            self.tokenizer = tokenizer
        else:
            if self.model_type == 'sentence':
                try:
                    from nltk.tokenize import sent_tokenize
                except ModuleNotFoundError:
                    raise ModuleNotFoundError('Missed nltk library. Install transfomers by `pip install nltk`')
                self.tokenizer = sent_tokenize

    def tokenize(self, data):
        """Split `data` into units with the configured tokenizer."""
        return self.tokenizer(data)

    def predict(self, data, idx):
        """Shuffle the unit at position `idx` within `data` according to `self.mode`."""
        if self.model_type == 'sentence':
            return self._predict_sentence(data, idx)

        # BUG FIX: the error was previously *returned* (`return Exception(...)`) instead
        # of raised, so callers silently received an Exception object as the result.
        raise ValueError('{} is unexpected model_type. Possible value is {}'.format(
            self.model_type, self.TYPES))

    def _predict_sentence(self, sentences, idx):
        """Swap sentence `idx` with a neighbor (wrapping at the ends) or a random partner, in place."""
        last_idx = len(sentences) - 1
        direction = ''
        if self.mode == 'neighbor':
            # Pick left or right uniformly at random.
            if self.sample(2) == 0:
                direction = 'left'
            else:
                direction = 'right'
        if self.mode == 'left' or direction == 'left':
            if idx == 0:
                # Wrap around: swap the first sentence with the last one.
                sentences[0], sentences[last_idx] = sentences[last_idx], sentences[0]
            else:
                sentences[idx], sentences[idx-1] = sentences[idx-1], sentences[idx]
        elif self.mode == 'right' or direction == 'right':
            if idx == last_idx:
                sentences[0], sentences[idx] = sentences[idx], sentences[0]
            else:
                sentences[idx], sentences[idx+1] = sentences[idx+1], sentences[idx]
        elif self.mode == 'random':
            # Swap with the first sampled index that differs from `idx`.
            idxes = self.sample(list(range(len(sentences))), num=2)
            for _id in idxes:
                if _id != idx:
                    sentences[_id], sentences[idx] = sentences[idx], sentences[_id]
                    break
        return sentences
class WordRule(Model):
    """Abstract base class for rule-based word/sentence manipulation models."""

    def __init__(self, cache=True):
        # cache: whether downstream augmenters may cache this model's predictions.
        self.cache = cache

    # pylint: disable=R0201
    def predict(self, data):
        """Apply the rule to `data`; must be overridden by subclasses."""
        raise NotImplementedError
class WordStatistics:
    """Abstract base class for word-statistics models (e.g. TF-IDF)."""

    def __init__(self, cache=True):
        # Whether downstream augmenters may cache this model's output.
        self.cache = cache

    def train(self, data):
        """Fit the statistics model on `data`; must be overridden by subclasses."""
        raise NotImplementedError

    def predict(self, data, top_k):
        """Return the `top_k` scored candidates for `data`; must be overridden by subclasses."""
        raise NotImplementedError

    def save(self, model_path):
        """Persist the model to `model_path`; must be overridden by subclasses."""
        raise NotImplementedError

    def read(self, model_path):
        """Load the model from `model_path`; must be overridden by subclasses."""
        raise NotImplementedError

    @classmethod
    def choice(cls, x, p, size=1):
        """Sample `size` indices of `x` according to the probability weights `p`."""
        return np.random.choice(len(x), size, p=p)
"n"], 26 | "h": ["t", "z", "u", "g", "j", "b", "n", "m"], 27 | "j": ["z", "u", "i", "h", "k", "n", "m", ",", ";"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", ";", ".", ":"], 29 | "l": ["i", "o", "p", "k", "ö", "Ö", ",", ";", ".", ":", "-", "_"], 30 | "y": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "y", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", ";"], 37 | "!": ["\"", "q"], 38 | "\"": ["!", "§", "q", "w"], 39 | "§": ["\"", "$", "w", "e"], 40 | "$": ["§", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/en.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "@", "q", "w"], 3 | "2": ["@", "1", "!", "3", "#", "q", "w", "e"], 4 | "3": ["#", "2", "@", "4", "$", "w", "e"], 5 | "4": ["$", "3", "#", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "^", "r", "t", "y"], 7 | "6": ["^", "5", "%", "7", "&", "t", "y", "u"], 8 | "7": ["&", "6", "^", "8", "*", "y", "u", "i"], 9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"], 10 | "9": ["(", "8", "*", "0", ")", "i", "o", "p"], 11 | "!": ["@", "q"], 12 | "@": ["!", "#", "q", "w"], 13 | "#": ["@", "$", "w", "e"], 14 | "$": ["#", "%", "e", "r"], 15 | "%": "$", 16 | "q": ["1", "!", "2", "@", "w", "a", "s"], 17 | "w": ["1", "!", "2", "@", "3", "#", "q", "e", "a", "s", "d"], 18 | "e": ["2", "@", "3", "#", "4", "$", "w", "r", "s", "d", "f"], 19 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 20 | "t": ["4", "$", "5", "%", "6", "^", "r", "y", "f", "g", "h"], 21 | "y": ["5", "%", "6", "^", "7", "&", "t", "u", "g", "h", "j"], 22 | "u": ["6", "^", "7", "&", "8", "*", " t", "i", "h", "j", "k"], 23 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 24 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"], 25 
| "p": ["9", "(", "0", ")", "o", "l"], 26 | "a": ["q", "w", "a", "s", "z", "x"], 27 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 28 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 29 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 30 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 31 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 32 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"], 33 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"], 34 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"], 35 | "z": ["a", "s", "x"], 36 | "x": ["a", "s", "d", "z", "c"], 37 | "c": ["s", "d", "f", "x", "v"], 38 | "v": ["d", "f", "g", "c", "b"], 39 | "b": ["f", "g", "h", "v", "n"], 40 | "n": ["g", "h", "j", "b", "m"], 41 | "m": ["h", "j", "k", "n", ",", "<"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/es.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["¡", "2", "!", "q", "w"], 3 | "2": ["!", "1", "¡", "3", "#", "q", "w", "e"], 4 | "3": ["#", "2", "!", "4", "$", "w", "e"], 5 | "4": ["$", "3", "#", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "/", "r", "t", "y"], 7 | "6": ["/", "5", "%", "7", "&", "t", "y", "u"], 8 | "7": ["&", "6", "/", "8", "*", "y", "u", "i"], 9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"], 10 | "9": ["(", "8", "*", "0", ")", "i", "o", "p"], 11 | "q": ["1", "¡", "2", "!", "w", "a", "s"], 12 | "w": ["1", "¡", "2", "!", "3", "#", "q", "e", "a", "s", "d"], 13 | "e": ["2", "!", "3", "#", "4", "$", "w", "r", "s", "d", "f"], 14 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 15 | "t": ["4", "$", "5", "%", "6", "/", "r", "y", "f", "g", "h"], 16 | "y": ["5", "%", "6", "/", "7", "&", "t", "u", "g", "h", "j"], 17 | "u": ["6", "/", "7", "&", "8", "*", "i", "h", "j", "k"], 18 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 19 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", 
"l"], 20 | "p": ["9", "(", "0", ")", "o", "l"], 21 | "a": ["q", "w", "a", "s", "z", "x"], 22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "¿"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", "¿", ".", "?"], 29 | "l": ["i", "o", "p", "k", "ñ", "Ñ", ",", "¿", ".", "?", "ç", "Ç"], 30 | "z": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "z", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", "¿"], 37 | "¡": ["!", "q"], 38 | "!": ["¡", "#", "q", "w"], 39 | "#": ["!", "$", "w", "e"], 40 | "$": ["#", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/fr.json: -------------------------------------------------------------------------------- 1 | { 2 | "&": ["1", "é", "2", "a", "z"], 3 | "é": ["2", "&", "1", "\"", "3", "a", "z", "e"], 4 | "\"": ["3", "é", "2", "'", "4", "z", "e"], 5 | "'": ["4", "\"", "3", "(", "5", "e", "r"], 6 | "(": ["5", "'", "4", "§", "6", "r", "t", "y"], 7 | "§": ["6", "(", "5", "è", "7", "t", "y", "u"], 8 | "è": ["7", "§", "6", "!", "8", "y", "u", "i"], 9 | "!": ["8", "è", "7", "ç", "9", "u", "i", "o"], 10 | "ç": ["9", "!", "8", "à", "0", "i", "o", "p"], 11 | "a": ["&", "1", "é", "2", "z", "q", "s"], 12 | "z": ["&", "1", "é", "2", "\"", "3", "a", "e", "q", "s", "d"], 13 | "e": ["é", "2", "\"", "3", "'", "4", "z", "r", "s", "d", "f"], 14 | "r": ["\"", "3", "'", "4", "(", "5", "e", "t", "d", "f", "g"], 15 | "t": ["'", "4", "(", "5", "§", "6", "r", "y", "f", "g", "h"], 16 | "y": ["(", "5", "§", "6", "è", "7", "t", "u", "g", "h", "j"], 17 | "u": ["§", "6", "è", "7", "!", 
"8", "i", "h", "j", "k"], 18 | "i": ["è", "7", "!", "8", "ç", "9", "u", "o", "j", "k", "l"], 19 | "o": ["!", "8", "ç", "9", "à", "0", "i", "p", "k", "l"], 20 | "p": ["ç", "9", "à", "0", "o", "l"], 21 | "q": ["a", "z", "q", "s", "w", "x"], 22 | "s": ["a", "z", "e", "q", "d", "w", "x", "c"], 23 | "d": ["z", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", ","], 27 | "j": ["y", "u", "i", "h", "k", "n", ",", ";", "."], 28 | "k": ["u", "i", "o", "j", "l", ",", ";", ".", ":", "/"], 29 | "l": ["i", "o", "p", "k", "m", "M", ";", ".", ":", "/", "=", "+"], 30 | "w": ["q", "s", "x"], 31 | "x": ["q", "s", "d", "w", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", ","], 36 | ",": ["h", "j", "k", "n", ";", "."], 37 | "1": ["2", "a"], 38 | "2": ["1", "3", "a", "z"], 39 | "3": ["2", "4", "z", "e"], 40 | "4": ["3", "5", "e", "r"], 41 | "5": ["4"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/he.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "@", "/", "׳"], 3 | "2": ["@", "1", "!", "3", "#", "/", "׳", "ק"], 4 | "3": ["#", "2", "@", "4", "$", "׳", "ק"], 5 | "4": ["$", "3", "#", "5", "%", "ק", "ר"], 6 | "5": ["%", "4", "$", "6", "^", "ר", "א", "ט"], 7 | "6": ["^", "5", "%", "7", "₪", "א", "ט", "ו"], 8 | "7": ["₪", "6", "^", "8", "*", "ט", "ו", "ן"], 9 | "8": ["*", "7", "₪", "9", ")", "ו", "ן", "ם"], 10 | "9": [")", "8", "*", "0", "(", "ן", "ם", "פ"], 11 | "/": ["1", "!", "2", "@", "׳", "ש", "ד"], 12 | "׳": ["1", "!", "2", "@", "3", "#", "/", "ק", "ש", "ד", "ג"], 13 | "ק": ["2", "@", "3", "#", "4", "$", "׳", "ר", "ד", "ג", "כ"], 14 | "ר": ["3", "#", "4", "$", "5", "%", "ק", "א", "ג", "כ", "ע"], 15 | "א": ["4", "$", "5", 
"%", "6", "^", "ר", "ט", "כ", "ע", "י"], 16 | "ט": ["5", "%", "6", "^", "7", "₪", "א", "ו", "ע", "י", "ח"], 17 | "ו": ["6", "^", "7", "₪", "8", "*", "ן", "י", "ח", "ל"], 18 | "ן": ["7", "₪", "8", "*", "9", ")", "ו", "ם", "ח", "ל", "ך"], 19 | "ם": ["8", "*", "9", ")", "0", "(", "ן", "פ", "ל", "ך"], 20 | "פ": ["9", ")", "0", "(", "ם", "ך"], 21 | "ש": ["/", "׳", "ש", "ד", "ז", "ס"], 22 | "ד": ["/", "׳", "ק", "ש", "ג", "ז", "ס", "ב"], 23 | "ג": ["׳", "ק", "ר", "ד", "כ", "ס", "ב", "ה"], 24 | "כ": ["ק", "ר", "א", "ג", "ע", "ב", "ה", "נ"], 25 | "ע": ["ר", "א", "ט", "כ", "י", "ה", "נ", "מ"], 26 | "י": ["א", "ט", "ו", "ע", "ח", "נ", "מ", "צ"], 27 | "ח": ["ט", "ו", "ן", "י", "ל", "מ", "צ", "ת", ">"], 28 | "ל": ["ו", "ן", "ם", "ח", "ך", "צ", "ת", ">", "ץ", "<"], 29 | "ך": ["ן", "ם", "פ", "ל", "ף", ":", "ת", ">", "ץ", "<", ".", "?"], 30 | "ז": ["ש", "ד", "ס"], 31 | "ס": ["ש", "ד", "ג", "ז", "ב"], 32 | "ב": ["ד", "ג", "כ", "ס", "ה"], 33 | "ה": ["ג", "כ", "ע", "ב", "נ"], 34 | "נ": ["כ", "ע", "י", "ה", "מ"], 35 | "מ": ["ע", "י", "ח", "נ", "צ"], 36 | "צ": ["י", "ח", "ל", "מ", "ת", ">"], 37 | "!": ["@", "/"], 38 | "@": ["!", "#", "/", "׳"], 39 | "#": ["@", "$", "׳", "ק"], 40 | "$": ["#", "%", "ק", "ר"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/it.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "\"", "q", "w"], 3 | "2": ["\"", "1", "!", "3", "£", "q", "w", "e"], 4 | "3": ["£", "2", "\"", "4", "$", "w", "e"], 5 | "4": ["$", "3", "£", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "&", "r", "t", "y"], 7 | "6": ["&", "5", "%", "7", "/", "t", "y", "u"], 8 | "7": ["/", "6", "&", "8", "(", "y", "u", "i"], 9 | "8": ["(", "7", "/", "9", ")", "u", "i", "o"], 10 | "9": [")", "8", "(", "0", "=", "i", "o", "p"], 11 | "q": ["1", "!", "2", "\"", "w", "a", "s"], 12 | "w": ["1", "!", "2", "\"", "3", "£", "q", "e", "a", "s", "d"], 13 | "e": 
["2", "\"", "3", "£", "4", "$", "w", "r", "s", "d", "f"], 14 | "r": ["3", "£", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 15 | "t": ["4", "$", "5", "%", "6", "&", "r", "y", "f", "g", "h"], 16 | "y": ["5", "%", "6", "&", "7", "/", "t", "u", "g", "h", "j"], 17 | "u": ["6", "&", "7", "/", "8", "(", "i", "h", "j", "k"], 18 | "i": ["7", "/", "8", "(", "9", ")", "u", "o", "j", "k", "l"], 19 | "o": ["8", "(", "9", ")", "0", "=", "i", "p", "k", "l"], 20 | "p": ["9", ")", "0", "=", "o", "l"], 21 | "a": ["q", "w", "a", "s", "z", "x"], 22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", ";"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", ";", ".", ":"], 29 | "l": ["i", "o", "p", "k", "ò", "ç", ",", ";", ".", ":", "-", "_"], 30 | "z": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "z", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", ";"], 37 | "!": ["\"", "q"], 38 | "\"": ["!", "£", "q", "w"], 39 | "£": ["\"", "$", "w", "e"], 40 | "$": ["£", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/nl.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "@", "q", "w"], 3 | "2": ["@", "1", "!", "3", "#", "q", "w", "e"], 4 | "3": ["#", "2", "@", "4", "$", "w", "e"], 5 | "4": ["$", "3", "#", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "^", "r", "t", "y"], 7 | "6": ["^", "5", "%", "7", "&", "t", "y", "u"], 8 | "7": ["&", "6", "^", "8", "*", "y", "u", "i"], 9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"], 10 | "9": ["(", "8", "*", "0", 
")", "i", "o", "p"], 11 | "q": ["1", "!", "2", "@", "w", "a", "s"], 12 | "w": ["1", "!", "2", "@", "3", "#", "q", "e", "a", "s", "d"], 13 | "e": ["2", "@", "3", "#", "4", "$", "w", "r", "s", "d", "f"], 14 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 15 | "t": ["4", "$", "5", "%", "6", "^", "r", "y", "f", "g", "h"], 16 | "y": ["5", "%", "6", "^", "7", "&", "t", "u", "g", "h", "j"], 17 | "u": ["6", "^", "7", "&", "8", "*", "i", "h", "j", "k"], 18 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 19 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"], 20 | "p": ["9", "(", "0", ")", "o", "l"], 21 | "a": ["q", "w", "a", "s", "z", "x"], 22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"], 29 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"], 30 | "z": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "z", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", "<"], 37 | "!": ["@", "q"], 38 | "@": ["!", "#", "q", "w"], 39 | "#": ["@", "$", "w", "e"], 40 | "$": ["#", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/pl.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["§", "2", "%", "q", "w"], 3 | "2": ["%", "1", "§", "3", "!", "q", "w", "e"], 4 | "3": ["!", "2", "%", "4", "?", "w", "e"], 5 | "4": ["?", "3", "!", "5", "+", "e", "r"], 6 | "5": ["+", "4", "?", "6", "=", "r", "t", "z"], 7 | "6": ["=", "5", "+", "7", ":", "t", 
"z", "u"], 8 | "7": [":", "6", "=", "8", "_", "z", "u", "i"], 9 | "8": ["_", "7", ":", "9", "/", "u", "i", "o"], 10 | "9": ["/", "8", "_", "0", "\"", "i", "o", "p"], 11 | "q": ["1", "§", "2", "%", "w", "a", "s"], 12 | "w": ["1", "§", "2", "%", "3", "!", "q", "e", "a", "s", "d"], 13 | "e": ["2", "%", "3", "!", "4", "?", "w", "r", "s", "d", "f"], 14 | "r": ["3", "!", "4", "?", "5", "+", "e", "t", "d", "f", "g"], 15 | "t": ["4", "?", "5", "+", "6", "=", "r", "z", "f", "g", "h"], 16 | "z": ["5", "+", "6", "=", "7", ":", "t", "u", "g", "h", "j"], 17 | "u": ["6", "=", "7", ":", "8", "_", "i", "h", "j", "k"], 18 | "i": ["7", ":", "8", "_", "9", "/", "u", "o", "j", "k", "l"], 19 | "o": ["8", "_", "9", "/", "0", "\"", "i", "p", "k", "l"], 20 | "p": ["9", "/", "0", "\"", "o", "l"], 21 | "a": ["q", "w", "a", "s", "y", "x"], 22 | "s": ["q", "w", "e", "a", "d", "y", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "z", "f", "h", "v", "b", "n"], 26 | "h": ["t", "z", "u", "g", "j", "b", "n", "m"], 27 | "j": ["z", "u", "i", "h", "k", "n", "m", ".", "ś"], 28 | "k": ["u", "i", "o", "j", "l", "m", ".", "ś", ",", "ń"], 29 | "l": ["i", "o", "p", "k", "ł", "Ł", ".", "ś", ",", "ń", "-", "ć"], 30 | "y": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "y", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ".", "ś"], 37 | "§": ["%", "q"], 38 | "%": ["§", "!", "q", "w"], 39 | "!": ["%", "?", "w", "e"], 40 | "?": ["!", "+", "e", "r"], 41 | "+": ["?"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/tr.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["q", "w", "2", "'"], 3 | "2": ["1", "q", "w", "e", "3", "!", "^"], 4 | "3": ["2", "w", "e", "r", "4", "^", "%"], 5 
| "4": ["3", "e", "r", "t", "5", "^", "%"], 6 | "5": ["4", "r", "t", "y", "6", "+", "&"], 7 | "6": ["5", "t", "y", "u", "7", "%", "/"], 8 | "7": ["6", "y", "u", "8", "&", "(", ")"], 9 | "8": ["7", "u", "ı", "9", "/", ")"], 10 | "9": ["8", "ı", "o", "0", "(", "ı", "o", "0"], 11 | "q": ["1", "2", "w", "a", "s", "!", "'"], 12 | "w": ["1", "2", "3", "q", "e", "a", "s", "d", "!", "'", "^"], 13 | "e": ["3", "4", "w", "r", "s", "d", "f", "^", "+"], 14 | "r": ["4", "5", "e", "t", "d", "f", "g", "+", "%"], 15 | "t": ["5", "6", "r", "y", "f", "g", "h", "%", "&"], 16 | "y": ["6", "7", "t", "u", "g", "h", "j", "&", "/"], 17 | "u": ["7", "8", "y", "ı", "h", "j", "k", "/", "("], 18 | "ı": ["8", "9", "u", "o", "j", "k", "l", "(", ")"], 19 | "o": ["9", "0", "ı", "p", "k", "l", "ş", ")", "="], 20 | "p": ["0", "*", "o", "ğ", "l", "ş", "i", "=", "?"], 21 | "ğ": ["*", "-", "p", "ü", "ş", "i", ",", "=", "?", "_", ";"], 22 | "a": ["q", "w", "s", "x", "z", "<", ">"], 23 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 24 | "d": ["w", "e", "r", "s", "f", "x", "c"], 25 | "f": ["r", "t", "d", "g", "c", "v"], 26 | "g": ["r", "t", "y", "f", "h", "v", "b"], 27 | "h": ["y", "u", "g", "j", "b", "n"], 28 | "j": ["u", "ı", "h", "k", "n", "m"], 29 | "k": ["ı", "o", "j", "l", "m", "ö"], 30 | "l": ["o", "p", "k", "ş", "ö", "ç"], 31 | "ş": ["p", "ğ", "l", "i", "ç", ".", ":"], 32 | "i": ["ğ", "ü", "ş", ",", ".", ";"], 33 | "z": ["a", "s", "x", "<", ">"], 34 | "x": ["s", "d", "z", "c"], 35 | "c": ["d", "f", "x", "v"], 36 | "v": ["f", "g", "c", "b"], 37 | "b": ["g", "h", "v", "n"], 38 | "n": ["h", "j", "b", "m"], 39 | "m": ["j", "k", "n", "ö"], 40 | "ö": ["k", "l", "m", "ç"], 41 | "ç": ["l", "ş", "ö", ".", ":"] 42 | } 43 | -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/uk.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "\"", "й", "ц"], 3 | "2": ["\"", "1", "!", "3", "№", "й", "ц", 
"у"], 4 | "3": ["№", "2", "\"", "4", ";", "ц", "у"], 5 | "4": [";", "3", "№", "5", "%", "у", "к"], 6 | "5": ["%", "4", ";", "6", ":", "к", "е", "н"], 7 | "6": [":", "5", "%", "7", "?", "е", "н", "г"], 8 | "7": ["?", "6", ":", "8", "*", "н", "г", "ш"], 9 | "8": ["*", "7", "?", "9", "(", "г", "ш", "щ"], 10 | "9": ["(", "8", "*", "0", ")", "ш", "щ", "з"], 11 | "й": ["1", "!", "2", "\"", "ц", "ф", "і"], 12 | "ц": ["1", "!", "2", "\"", "3", "№", "й", "у", "ф", "і", "в"], 13 | "у": ["2", "\"", "3", "№", "4", ";", "ц", "к", "і", "в", "а"], 14 | "к": ["3", "№", "4", ";", "5", "%", "у", "е", "в", "а", "п"], 15 | "е": ["4", ";", "5", "%", "6", ":", "к", "н", "а", "п", "р"], 16 | "н": ["5", "%", "6", ":", "7", "?", "е", "г", "п", "р", "о"], 17 | "г": ["6", ":", "7", "?", "8", "*", "ш", "р", "о", "л"], 18 | "ш": ["7", "?", "8", "*", "9", "(", "г", "щ", "о", "л", "д"], 19 | "щ": ["8", "*", "9", "(", "0", ")", "ш", "з", "л", "д"], 20 | "з": ["9", "(", "0", ")", "щ", "д"], 21 | "ф": ["й", "ц", "ф", "і", "я", "ч"], 22 | "і": ["й", "ц", "у", "ф", "в", "я", "ч", "с"], 23 | "в": ["ц", "у", "к", "і", "а", "ч", "с", "м"], 24 | "а": ["у", "к", "е", "в", "п", "с", "м", "и"], 25 | "п": ["к", "е", "н", "а", "р", "м", "и", "т"], 26 | "р": ["е", "н", "г", "п", "о", "и", "т", "ь"], 27 | "о": ["н", "г", "ш", "р", "л", "т", "ь", "б", "Б"], 28 | "л": ["г", "ш", "щ", "о", "д", "ь", "б", "Б", "ю", "Ю"], 29 | "д": ["ш", "щ", "з", "л", "ж", "Ж", "б", "Б", "ю", "Ю", ".", ","], 30 | "я": ["ф", "і", "ч"], 31 | "ч": ["ф", "і", "в", "я", "с"], 32 | "с": ["і", "в", "а", "ч", "м"], 33 | "м": ["в", "а", "п", "с", "и"], 34 | "и": ["а", "п", "р", "м", "т"], 35 | "т": ["п", "р", "о", "и", "ь"], 36 | "ь": ["р", "о", "л", "т", "б", "Б"], 37 | "!": ["\"", "й"], 38 | "\"": ["!", "№", "й", "ц"], 39 | "№": ["\"", ";", "ц", "у"], 40 | ";": ["№", "%", "у", "к"], 41 | "%": [";"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/ocr/en.json: 
class Action:
    """Enumerates the augmentation action labels used across the library."""

    # Token/element-level operations
    INSERT = 'insert'
    SUBSTITUTE = 'substitute'
    DELETE = 'delete'
    SWAP = 'swap'
    SPLIT = 'split'
    ALIGN = 'align'
    CROP = 'crop'

    # Flow (pipeline) operations
    SEQUENTIAL = 'sequential'
    SOMETIMES = 'sometimes'

    @staticmethod
    def getall():
        """Return every known action label (order preserved for compatibility)."""
        actions = [
            Action.INSERT,
            Action.SUBSTITUTE,
            Action.SWAP,
            Action.DELETE,
            Action.SPLIT,
            Action.CROP,
            Action.SEQUENTIAL,
            Action.SOMETIMES,
            Action.ALIGN,
        ]
        return actions
class AudioLoader:
    """Thin wrapper around librosa for reading audio files and mel spectrograms."""

    @staticmethod
    def load_audio(file_path):
        """Read *file_path* and return ``(samples, sampling_rate)`` via librosa."""
        try:
            import librosa
        except ModuleNotFoundError:
            # librosa is an optional dependency; surface an actionable message.
            raise ModuleNotFoundError('Missed librosa library. Install import librosa by `pip install librosa`')

        return librosa.load(file_path)

    @staticmethod
    def load_mel_spectrogram(file_path, n_mels=128, fmax=8000):
        """Read *file_path* and return its mel-scaled spectrogram.

        :param str file_path: Path of the audio file.
        :param int n_mels: Number of mel bands.
        :param int fmax: Highest frequency (in Hz) considered.
        """
        try:
            import librosa
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed librosa library. Install import librosa by `pip install librosa`')

        samples, rate = AudioLoader.load_audio(file_path)
        return librosa.feature.melspectrogram(y=samples, sr=rate, n_mels=n_mels, fmax=fmax)
def deprecated(deprecate_from, deprecate_to, msg):
    """Decorator factory that marks a class or function as deprecated.

    :param deprecate_from: Version in which the API became deprecated.
    :param deprecate_to: Version in which the API will be removed.
    :param msg: Extra guidance appended to the warning (e.g. replacement API).
    """
    def decorator(obj):
        # Classes need a different wrapper than plain functions.
        if isinstance(obj, type):
            return _decorate_class(obj, deprecate_from, deprecate_to, msg)
        # TODO: property support (_decorate_prop) is not wired up yet.
        return _decorate_func(obj, deprecate_from, deprecate_to, msg)
    return decorator
{msg}' 20 | 21 | @functools.wraps(cls) 22 | def wrapped(*args, **kwargs): 23 | warnings.simplefilter('always', DeprecationWarning) 24 | warnings.warn( 25 | msg_template.format( 26 | name=cls.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg), 27 | category=DeprecationWarning 28 | ) 29 | warnings.simplefilter('default', DeprecationWarning) 30 | return cls(*args, **kwargs) 31 | 32 | return wrapped 33 | 34 | 35 | def _decorate_func(func, deprecate_from, deprecate_to, msg): 36 | msg_template = 'Function {name} is deprecated from {deprecate_from} version.' 37 | msg_template += ' It will be removed from {deprecate_to} version. {msg}' 38 | 39 | @functools.wraps(func) 40 | def wrapped(*args, **kwargs): 41 | warnings.simplefilter('always', DeprecationWarning) 42 | warnings.warn( 43 | msg_template.format( 44 | name=func.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg), 45 | category=DeprecationWarning 46 | ) 47 | warnings.simplefilter('default', DeprecationWarning) 48 | return func(*args, **kwargs) 49 | 50 | return wrapped 51 | 52 | 53 | def _decorate_prop(prop, msg): 54 | @functools.wraps(prop) 55 | @property 56 | def wrapped(*args, **kwargs): 57 | msg_template = 'Property {name} is deprecated. 
{msg}' 58 | warnings.simplefilter('always', DeprecationWarning) 59 | warnings.warn( 60 | msg_template.format(name=prop.__name__, msg=msg), category=DeprecationWarning 61 | ) 62 | warnings.simplefilter('default', DeprecationWarning) 63 | return prop.fget(*args, **kwargs) 64 | 65 | return wrapped 66 | -------------------------------------------------------------------------------- /nlpaug/util/doc/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.doc import * 2 | from nlpaug.util.doc.change_log import * 3 | from nlpaug.util.doc.token import * 4 | -------------------------------------------------------------------------------- /nlpaug/util/doc/change_log.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.token import Token 2 | 3 | 4 | class ChangeLog: 5 | def __init__(self, orig_token): 6 | self.orig_token = orig_token 7 | self.change_logs = [] 8 | self.add(orig_token.token, 'original', orig_token.change_seq) 9 | self._is_changed = False 10 | 11 | def add(self, token, action, change_seq): 12 | if action != 'original' and not self._is_changed: 13 | self._is_changed = True 14 | self.change_logs.append(Token(token=token, action=action, change_seq=change_seq)) 15 | 16 | def update(self, idx, token=None, action=None, change_seq=None): 17 | if not self._is_changed: 18 | self._is_changed = True 19 | 20 | if token: 21 | self.change_logs[idx].token = token 22 | if action: 23 | self.change_logs[idx].action = action 24 | if change_seq: 25 | self.change_logs[idx].change_seq = change_seq 26 | 27 | def size(self): 28 | return len(self.change_logs) - 1 29 | 30 | def is_changed(self): 31 | return self._is_changed 32 | 33 | def get_latest_token(self): 34 | return self.change_logs[-1] 35 | 36 | def update_last_token(self, start_pos): 37 | self.change_logs[-1].start_pos = start_pos 38 | 39 | def to_changed_dict(self): 40 | return { 41 | 'orig_token': 
self.orig_token.token, 42 | 'orig_start_pos': self.orig_token.start_pos, 43 | 'new_token': self.get_latest_token().token, 44 | 'new_start_pos': self.get_latest_token().start_pos, 45 | 'change_seq': self.get_latest_token().change_seq, 46 | 'action': self.get_latest_token().action 47 | } 48 | 49 | def to_dict(self): 50 | return { 51 | 'orig_token': self.orig_token.to_dict(), 52 | 'change_logs': [t.to_dict() for t in self.change_logs] 53 | } 54 | -------------------------------------------------------------------------------- /nlpaug/util/doc/doc.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.token import Token 2 | from nlpaug.util.doc.change_log import ChangeLog 3 | 4 | 5 | class Doc: 6 | def __init__(self, doc='', tokens=None): 7 | self.doc = doc 8 | if tokens is not None and len(tokens) > 0: 9 | self.tokens = self.token2obj(tokens) 10 | else: 11 | self.tokens = [] 12 | self.changed_cnt = 0 13 | 14 | def token2obj(self, tokens): 15 | objs = [] 16 | start_pos = 0 17 | for t in tokens: 18 | token_obj = Token(token=t, start_pos=start_pos+self.doc[start_pos:].find(t)) 19 | change_log = ChangeLog(orig_token=token_obj) 20 | objs.append(change_log) 21 | 22 | start_pos += len(token_obj.token) 23 | start_pos += 1 # TODO: for textual only 24 | 25 | return objs 26 | 27 | def add_token(self, idx, token, action, change_seq): 28 | token_obj = Token(token=token, start_pos=-1, action=action, change_seq=change_seq) 29 | change_log = ChangeLog(orig_token=token_obj) 30 | self.tokens.insert(idx, change_log) 31 | 32 | def add_change_log(self, idx, new_token, action, change_seq): 33 | self.changed_cnt += 1 34 | self.tokens[idx].add(new_token, action=action, change_seq=change_seq) 35 | 36 | def update_change_log(self, token_idx, change_idx=None, token=None, action=None, change_seq=None): 37 | change_idx = self.tokens[token_idx].size() if change_idx is None else change_idx 38 | self.tokens[token_idx].update(change_idx, 
token=token, action=action, change_seq=change_seq) 39 | 40 | def get_token(self, idx): 41 | return self.tokens[idx] 42 | 43 | def get_original_tokens(self): 44 | return [t.orig_token.token for t in self.tokens] 45 | 46 | def get_augmented_tokens(self): 47 | return [t.get_latest_token().token for t in self.tokens if len(t.get_latest_token().token) > 0] 48 | 49 | def size(self): 50 | return len(self.tokens) 51 | 52 | def changed_count(self): 53 | return self.changed_cnt 54 | 55 | def get_change_logs(self, start_pos=0): 56 | for i, t in enumerate(self.tokens): 57 | self.tokens[i].update_last_token(start_pos) 58 | 59 | start_pos += len(t.get_latest_token().token) 60 | if len(t.get_latest_token().token) > 0: 61 | # TODO: for textual only 62 | start_pos += 1 63 | 64 | change_logs = [t for t in self.tokens if t.is_changed()] 65 | change_logs.sort(key=lambda x: x.get_latest_token().change_seq) 66 | return [c.to_changed_dict() for c in change_logs] 67 | -------------------------------------------------------------------------------- /nlpaug/util/doc/token.py: -------------------------------------------------------------------------------- 1 | class Token: 2 | def __init__(self, token, start_pos=-1, action='', change_seq=0): 3 | self._token = token 4 | self._start_pos = start_pos 5 | self._action = action 6 | self._change_seq = change_seq 7 | 8 | @property 9 | def start_pos(self): 10 | return self._start_pos 11 | 12 | @start_pos.setter 13 | def start_pos(self, v): 14 | self._start_pos = v 15 | 16 | @property 17 | def token(self): 18 | return self._token 19 | 20 | @token.setter 21 | def token(self, v): 22 | self._token = v 23 | 24 | @property 25 | def action(self): 26 | return self._action 27 | 28 | @action.setter 29 | def action(self, v): 30 | self._action = v 31 | 32 | @property 33 | def change_seq(self): 34 | return self._change_seq 35 | 36 | @change_seq.setter 37 | def change_seq(self, v): 38 | self._change_seq = v 39 | 40 | def to_dict(self): 41 | return { 42 | 'token': 
class ExceptionInfo:
    """Record describing a library warning/exception occurrence."""

    def __init__(self, name, exp_type, code, msg):
        self.name = name          # human-readable issue name
        self.exp_type = exp_type  # category, e.g. ExceptionType.WARNING
        self.code = code          # short identifier such as 'W001'
        self.msg = msg            # detailed message

    def output(self):
        """Print the record in the canonical '[type] Name:..., Code:..., Message:...' form."""
        formatted = '[{}] Name:{}, Code:{}, Message:{}'.format(
            self.exp_type, self.name, self.code, self.msg)
        print(formatted)
class LibraryUtil:
    """
    Helper for retrieving files bundled with the installed nlpaug package.

    >>> from nlpaug.util.file.library import LibraryUtil
    """

    @staticmethod
    def get_res_dir():
        """
        Return the absolute path of nlpaug's bundled ``res`` directory.

        >>> LibraryUtil.get_res_dir()

        """
        package_dir = os.path.dirname(nlpaug.__file__)
        return os.path.join(package_dir, 'res')
def _optional_lib_ver(module_name):
    """Return *module_name*'s ``__version__`` if it is importable, else None."""
    try:
        module = __import__(module_name)
        return module.__version__
    # Narrow catch: a bare `except:` here would also swallow KeyboardInterrupt
    # and SystemExit. Missing package -> ImportError; package without a
    # __version__ attribute -> AttributeError.
    except (ImportError, AttributeError):
        return None


def get_lib_ver():
    """Collect version strings of Python, nlpaug and its (optional) dependencies.

    :return: dict mapping library name to version string. Optional libraries
        (transformers, torch, fairseq, nltk) appear only when installed.
    """
    lib_ver = {
        'python': python_version(),
        'nlpaug': nlpaug.__version__,
        'numpy': np.__version__,
    }

    # Optional dependencies: record a version only when the import succeeds.
    for name in ('transformers', 'torch', 'fairseq', 'nltk'):
        version = _optional_lib_ver(name)
        if version is not None:
            lib_ver[name] = version

    return lib_ver
def standard_norm(data):
    """Z-score each row: subtract the row mean, divide by the row std (ddof=1).

    NaNs produced by zero-variance rows are mapped to 0.
    """
    row_means = data.mean(axis=1)
    row_stds = data.std(axis=1, ddof=1)
    scaled = (data - row_means[:, np.newaxis]) / row_stds[:, np.newaxis]
    return np.nan_to_num(scaled)


def l1_norm(data):
    """Scale each row so its entries sum to 1 (NaNs from all-zero rows become 0)."""
    row_sums = np.array([row.sum(axis=0) for row in data])
    return np.nan_to_num(data / row_sums[:, np.newaxis])


def l2_norm(data):
    """Scale each row to unit Euclidean length (NaNs from all-zero rows become 0)."""
    row_norms = np.array([np.sqrt((row * row).sum(axis=0)) for row in data])
    return np.nan_to_num(data / row_norms[:, np.newaxis])
class Randomness:
    """Utility to seed every random number generator nlpaug may rely on."""

    @staticmethod
    def seed(seed):
        """Seed Python's ``random``, numpy and (when installed) torch RNGs.

        :param int seed: Seed value applied to every generator.
        """
        random.seed(seed)
        np.random.seed(seed)
        # torch may be absent (the module-level import is guarded) and
        # the CUDA calls can fail on CPU-only builds, hence the guard.
        # ``except Exception`` rather than a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            torch.manual_seed(seed)
            torch.cuda.manual_seed(seed)
            # Bug fix: previously seeded all CUDA devices with the
            # hard-coded constant 2021 instead of the caller's seed.
            torch.cuda.manual_seed_all(seed)
        except Exception:
            pass
class Tokenizer:
    """Regex-based word tokenizer and its inverse (detokenizer)."""

    @staticmethod
    def tokenizer(text):
        # Split on the module-level pattern and drop whitespace-only pieces.
        pieces = TOKENIZER_REGEX.split(text)
        result = []
        for piece in pieces:
            if piece.strip():
                result.append(piece)
        return result

    @staticmethod
    def reverse_tokenizer(tokens):
        # Join with spaces, then undo the spacing around punctuation and
        # brackets via the module-level detokenizer substitution rules.
        joined = ' '.join(tokens)
        for pattern, replacement in DETOKENIZER_REGEXS:
            joined = pattern.sub(replacement, joined)
        return joined.strip()
https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/audio_example.png -------------------------------------------------------------------------------- /res/lambada_algo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/lambada_algo.png -------------------------------------------------------------------------------- /res/logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/logo_small.png -------------------------------------------------------------------------------- /res/textual_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/textual_example.png -------------------------------------------------------------------------------- /script.txt: -------------------------------------------------------------------------------- 1 | # Generate requirements.txt 2 | pipreqs . 
def prepare_mlm_data(labels, texts, output_file_path, sep_token):
    """Write one ``<label> <sep_token> <text>`` line per record to mlm_data.txt.

    :param list labels: Class labels, written first on each line.
    :param list texts: Input texts, paired positionally with labels.
    :param str output_file_path: Directory that receives ``mlm_data.txt``.
    :param str sep_token: Separator token placed between label and text.
    """
    out_path = os.path.join(output_file_path, 'mlm_data.txt')
    with open(out_path, 'w') as f:
        f.writelines(
            ' '.join([str(label), sep_token, text]) + '\n'
            for label, text in zip(labels, texts)
        )

def main(args):
    # Read the classification CSV and emit LAMBADA-style generation data.
    data = pd.read_csv(args.data_path)
    prepare_mlm_data(data['label'].tolist(), data['text'].tolist(), args.output_dir, '[SEP]')
import sys

# Guard: the package is Python 3 only (also enforced via python_requires
# in the setup() call below).
if sys.version_info < (3,):
    sys.exit("Sorry, Python3 is required.")

# The PyPI long description is the project README, rendered as Markdown.
with open("README.md", encoding="utf8") as f:
    readme = f.read()

# Runtime dependencies are maintained in requirements.txt, one per line.
with open('requirements.txt') as f:
    install_reqs = f.read().splitlines()

setup(
    name="nlpaug",
    version="1.1.11",
    author="Edward Ma",
    author_email="makcedward@gmail.com",
    url="https://github.com/makcedward/nlpaug",
    license="MIT",
    description="Natural language processing augmentation library for deep neural networks",
    long_description=readme,
    long_description_content_type="text/markdown",
    packages=find_packages(exclude="test"),  # ship library code, not tests
    include_package_data=True,
    install_requires=install_reqs,
    keywords=[
        "deep learning", "neural network", "machine learning",
        "nlp", "natural language processing", "text", "audio", "spectrogram",
        "augmentation", "adversarial attack", "ai", "ml"],
    python_requires=">=3.7"
)
class TestAudio(unittest.TestCase):
    """Shared smoke tests that exercise several audio augmenters at once."""

    @classmethod
    def setUpClass(cls):
        # Load the repo-level .env so os.environ["TEST_DIR"] resolves to
        # the test resource directory.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_multi_thread(self):
        # augment(..., n=n) must return exactly n outputs for any thread count.
        n = 3
        augs = [
            naa.CropAug(sampling_rate=self.sampling_rate),
            naa.PitchAug(sampling_rate=self.sampling_rate)
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(self.audio, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)

    def test_coverage_and_zone(self):
        # zone bounds the region eligible for augmentation; coverage is the
        # fraction of that zone actually augmented. stateless=False keeps
        # start_pos/end_pos on the augmenter for inspection after augment().
        params = [
            ((0.3, 0.7), 1),
            ((0, 1), 1)
        ]

        for zone, coverage in params:
            augs = [
                naa.LoudnessAug(zone=zone, coverage=coverage, stateless=False),
                naa.MaskAug(zone=zone, coverage=coverage, stateless=False),
                naa.NoiseAug(zone=zone, coverage=coverage, stateless=False),
                naa.PitchAug(zone=zone, coverage=coverage, stateless=False, sampling_rate=self.sampling_rate),
                naa.SpeedAug(zone=zone, coverage=coverage, stateless=False),
                naa.VtlpAug(zone=zone, coverage=coverage, stateless=False, sampling_rate=self.sampling_rate),
                naa.NormalizeAug(zone=zone, coverage=coverage, stateless=False),
                naa.PolarityInverseAug(zone=zone, coverage=coverage, stateless=False)
            ]

            for aug in augs:
                aug_data = aug.augment(self.audio)
                aug_audio = aug_data[0]
                # NOTE(review): assertTrue(x, y) treats y as a failure
                # message, not a comparison -- assertEqual was likely
                # intended here. Left unchanged to preserve behavior.
                self.assertTrue(len(aug_audio[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))
class TestInversion(unittest.TestCase):
    """Tests for naa.PolarityInverseAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # An empty signal must be passed through untouched.
        empty = np.array([])
        result = naa.PolarityInverseAug().augment(empty)
        self.assertTrue(np.array_equal(empty, result))

    def test_inverse(self):
        # Polarity inversion changes sample values but never the length.
        result = naa.PolarityInverseAug().augment(self.audio)
        inverted = result[0]
        self.assertFalse(np.array_equal(self.audio, inverted))
        self.assertEqual(len(self.audio), len(inverted))
class TestLoudness(unittest.TestCase):
    """Tests for naa.LoudnessAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # An empty signal must be returned unchanged.
        empty = np.array([])
        result = naa.LoudnessAug().augment(empty)
        self.assertTrue(np.array_equal(empty, result))

    def test_substitute(self):
        # A loudness change alters sample values without changing length.
        result = naa.LoudnessAug().augment(self.audio)
        adjusted = result[0]
        self.assertFalse(np.array_equal(self.audio, adjusted))
        self.assertEqual(len(self.audio), len(adjusted))
        self.assertTrue(self.sampling_rate > 0)
class TestNoise(unittest.TestCase):
    """Tests for naa.NoiseAug (colored and background noise injection)."""

    @classmethod
    def setUpClass(cls):
        # Load the repo-level .env so TEST_DIR resolves to test resources.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        # https://en.wikipedia.org/wiki/Colors_of_noise
        cls.noise_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Pink_noise.ogg'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)
        cls.noise, cls.noise_sampling_rate = AudioLoader.load_audio(cls.noise_wav_file)

    def test_empty_input(self):
        # An empty signal must be passed through untouched.
        audio = np.array([])
        aug = naa.NoiseAug()
        augmented_data = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_data))

    def test_substitute(self):
        # Default (random) noise changes the signal content.
        aug = naa.NoiseAug()
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        # NOTE(review): assertTrue with two arguments treats the second as a
        # failure message, not a comparison; assertEqual was likely intended.
        self.assertTrue(len(self.audio), len(augmented_audio))
        self.assertTrue(self.sampling_rate > 0)

    def test_color_noise(self):
        # Every supported noise color must produce a modified signal.
        colors = naa.NoiseAug().model.COLOR_NOISES

        for color in colors:
            aug = naa.NoiseAug(color=color)
            augmented_data = aug.augment(self.audio)
            augmented_audio = augmented_data[0]

            self.assertFalse(np.array_equal(self.audio, augmented_audio))
            self.assertTrue(len(self.audio), len(augmented_audio))
            self.assertTrue(self.sampling_rate > 0)

    def test_background_noise(self):
        # Background noise longer than the audio: noise > audio
        aug = naa.NoiseAug(noises=[self.noise])
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]
        self.assertTrue(augmented_audio is not None)

        # Background noise shorter than (equal to) the audio: audio > noise
        aug = naa.NoiseAug(noises=[self.audio])
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]
        self.assertTrue(augmented_audio is not None)
class TestNormalization(unittest.TestCase):
    """Tests for naa.NormalizeAug and its normalization methods."""

    @classmethod
    def setUpClass(cls):
        # Load the repo-level .env so TEST_DIR resolves to test resources.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # An empty signal must be passed through untouched.
        audio = np.array([])
        aug = naa.NormalizeAug()
        augmented_data = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_data))

    def test_non_exist_method(self):
        # Unknown method names must be rejected at construction time.
        with self.assertRaises(ValueError) as error:
            aug = naa.NormalizeAug(method='test1234')
        self.assertTrue('does not support yet. You may pick one' in str(error.exception))

    def test_minmax(self):
        # Min-max scaling changes sample values but not the length.
        aug = naa.NormalizeAug(method='minmax')
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def test_max(self):
        # Max scaling changes sample values but not the length.
        aug = naa.NormalizeAug(method='max')
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def test_standard(self):
        # Standard (z-score) scaling changes values but not the length.
        aug = naa.NormalizeAug(method='standard')
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def test_random_method(self):
        # method='random' picks one of the supported methods; stateless=False
        # exposes the chosen method via aug.run_method for verification.
        aug = naa.NormalizeAug(method='random', stateless=False)
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertTrue(aug.run_method in aug.model.get_support_methods())

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))
class TestShift(unittest.TestCase):
    """Tests for naa.ShiftAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_substitute(self):
        # Reload the fixture so the comparison uses a pristine copy.
        reference, rate = AudioLoader.load_audio(self.sample_wav_file)

        shifted = naa.ShiftAug(rate, duration=0.5).augment(self.audio)[0]

        # Shifting moves samples around but the content must change.
        self.assertFalse(np.array_equal(reference, shifted))
        self.assertTrue(len(reference), len(shifted))
class TestSpeed(unittest.TestCase):
    """Tests for naa.SpeedAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_substitute(self):
        # Repeat to hit both speed-up and slow-down factors; stateless=False
        # exposes the drawn factor via aug.aug_factor.
        for _ in range(10):
            aug = naa.SpeedAug(stateless=False)
            resampled = aug.augment(self.audio)[0]

            if aug.aug_factor < 1:
                # Slower playback stretches the signal.
                self.assertGreater(len(resampled), len(self.audio))
            else:
                # Faster playback shortens it.
                self.assertLess(len(resampled), len(self.audio))
class TestOcr(unittest.TestCase):
    """Tests for OcrAug character-level substitution."""

    def test_ocr_single_word(self):
        # Words made of characters with OCR mappings should be changed.
        texts = ['Zoology', 'roku123456']
        aug = OcrAug()
        for text in texts:
            augmented_data = aug.augment(text)
            augmented_text = augmented_data[0]
            self.assertNotEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)

    def test_ocr_single_word_nonexist_char(self):
        # Characters with no OCR mapping must pass through unchanged.
        texts = ['AAAAA', 'KKKKK']
        aug = OcrAug()
        for text in texts:
            augmented_data = aug.augment(text)
            augmented_text = augmented_data[0]
            self.assertEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)

    def test_ocr_multi_words(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = OcrAug()

        for text in texts:
            # Since non-exist mapping word may be drawn, try several times
            is_augmented = False
            for _ in range(10):
                augmented_data = aug.augment(text)
                augmented_text = augmented_data[0]
                is_equal = text == augmented_text
                if not is_equal:
                    is_augmented = True
                    break

            self.assertTrue(is_augmented)

        self.assertTrue(len(texts) > 0)

    def test_ocr_model_from_dict(self):
        # A plain dict can be supplied directly as the substitution model.
        mapping = {'0': ['2']}
        aug = OcrAug(dict_of_path=mapping)
        augmented_data = aug.augment('0000000')
        augmented_text = augmented_data[0]
        self.assertIn('2', augmented_text)

    def test_ocr_model_from_json(self):
        # The substitution model can also be loaded from a JSON file path.
        sample_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'res', 'common', 'sample.json'))
        aug = OcrAug(dict_of_path=sample_path)
        augmented_data = aug.augment('0000000')
        augmented_text = augmented_data[0]
        self.assertIn('3', augmented_text)

        # A non-existent path must raise with a descriptive message.
        with self.assertRaises(Exception) as error:
            sample_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'res', 'common', 'non_exist.json'))
            aug = OcrAug(dict_of_path=sample_path)
        self.assertIn('The dict_of_path does not exist', str(error.exception))
11 | 12 | def test_mode(self): 13 | for mode in ['left', 'right', 'neighbor', 'random']: 14 | aug = nas.RandomSentAug(mode='left') 15 | aug_data = aug.augment(self.data) 16 | self.assertNotEqual(self.data, aug_data[0]) 17 | self.assertEqual(4, len(aug.model.tokenize(aug_data[0]))) 18 | -------------------------------------------------------------------------------- /test/augmenter/sentence/test_sentence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | import nlpaug.augmenter.sentence as nas 6 | from nlpaug.util import Action, Doc 7 | 8 | 9 | class TestSentence(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | cls.model_paths = [ 17 | 'xlnet-base-cased', 18 | 'gpt2', 19 | 'distilgpt2' 20 | ] 21 | 22 | cls.text = 'The quick brown fox jumps over the lazy dog.' 
class TestFrequencyMasking(unittest.TestCase):
    """Tests for FrequencyMaskingAug on mel spectrograms."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_empty_input(self):
        """An empty spectrogram passes through untouched."""
        empty = np.array([])
        aug = nas.FrequencyMaskingAug()
        result = aug.augment(empty)
        self.assertTrue(np.array_equal(np.array([]), result))

    def test_no_change_source(self):
        """The augmented copy must differ somewhere from the source."""
        spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128)
        aug = nas.FrequencyMaskingAug()
        augmented = aug.augment(spectrogram)[0]

        self.assertFalse((spectrogram == augmented).all())

    def test_substitute(self):
        """Band f0 is zeroed in [time_start, time_end) and intact outside it."""
        spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128)
        aug = nas.FrequencyMaskingAug(stateless=False)

        augmented = aug.augment(spectrogram)[0]

        # The source row f0 has no zeros, so any zeros below come from masking.
        self.assertEqual(len(spectrogram[aug.f0]), np.count_nonzero(spectrogram[aug.f0]))
        # Inside the masked window: all zero.
        self.assertEqual(0, np.count_nonzero(augmented[aug.f0][aug.time_start:aug.time_end]))
        # Outside the window: no zeros introduced on either side.
        self.assertEqual(0, len(np.where(augmented[aug.f0][:aug.time_start] == 0)[0]))
        self.assertEqual(0, len(np.where(augmented[aug.f0][aug.time_end:] == 0)[0]))
aug.time_end:] 42 | self.assertTrue(comparison.all()) 43 | -------------------------------------------------------------------------------- /test/augmenter/spectrogram/test_spectrogram.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | from nlpaug.util import AudioLoader 6 | import nlpaug.augmenter.spectrogram as nas 7 | 8 | 9 | class TestFrequencyMasking(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 16 | cls.sample_wav_file = os.path.join( 17 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 18 | ) 19 | 20 | def test_multi_thread(self): 21 | mel_spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128) 22 | n = 3 23 | augs = [ 24 | nas.FrequencyMaskingAug(), 25 | nas.TimeMaskingAug() 26 | ] 27 | 28 | for num_thread in [1, 3]: 29 | for aug in augs: 30 | augmented_data = aug.augment(mel_spectrogram, n=n, num_thread=num_thread) 31 | self.assertEqual(len(augmented_data), n) 32 | 33 | def test_zone_parameter(self): 34 | aug = nas.LoudnessAug(zone=(0, 1)) 35 | aug = nas.LoudnessAug(zone=(0.5, 0.7)) 36 | aug = nas.LoudnessAug(zone=(0.6, 1)) 37 | 38 | with self.assertRaises(ValueError) as context: 39 | aug = nas.LoudnessAug(zone=(-1, 1)) 40 | self.assertTrue('Lower bound of zone is smaller than' in str(context.exception)) 41 | 42 | with self.assertRaises(ValueError) as context: 43 | aug = nas.LoudnessAug(zone=(0, 1.2)) 44 | self.assertTrue('Upper bound of zone is larger than' in str(context.exception)) 45 | 46 | def test_coverage_parameter(self): 47 | aug = nas.LoudnessAug(coverage=0) 48 | aug = nas.LoudnessAug(coverage=0.5) 49 | aug = nas.LoudnessAug(coverage=1) 50 | 51 | with 
self.assertRaises(ValueError) as context: 52 | aug = nas.LoudnessAug(coverage=-1) 53 | self.assertTrue('Coverage value should be between than 0 and 1 while' in str(context.exception)) 54 | 55 | with self.assertRaises(ValueError) as context: 56 | aug = nas.LoudnessAug(coverage=1.1) 57 | self.assertTrue('Coverage value should be between than 0 and 1 while' in str(context.exception)) 58 | -------------------------------------------------------------------------------- /test/augmenter/spectrogram/test_time_masking.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | import numpy as np 5 | 6 | from nlpaug.util import AudioLoader 7 | import nlpaug.augmenter.spectrogram as nas 8 | 9 | 10 | class TestTimeMasking(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | env_config_path = os.path.abspath(os.path.join( 14 | os.path.dirname(__file__), '..', '..', '..', '.env')) 15 | load_dotenv(env_config_path) 16 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 17 | cls.sample_wav_file = os.path.join( 18 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 19 | ) 20 | cls.num_of_freq_channel = 128 21 | 22 | def test_no_change_source(self): 23 | data = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128) 24 | aug = nas.TimeMaskingAug() 25 | aug_data = aug.augment(data) 26 | 27 | comparison = data == aug_data 28 | self.assertFalse(comparison.all()) 29 | 30 | def test_substitute(self): 31 | data = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=self.num_of_freq_channel) 32 | aug = nas.TimeMaskingAug(stateless=False) 33 | 34 | aug_data = aug.augment(data) 35 | aug_audio = aug_data[0] 36 | 37 | self.assertEqual(len(data[:, aug.t0]), np.count_nonzero(data[:, aug.t0])) 38 | self.assertEqual(0, np.count_nonzero(aug_audio[:, aug.t0])) 39 | 
-------------------------------------------------------------------------------- /test/augmenter/test_audio_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.audio as naa 7 | from nlpaug.util.audio import AudioLoader 8 | 9 | 10 | class TestAudioAugmenter(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | env_config_path = os.path.abspath(os.path.join( 14 | os.path.dirname(__file__), '..', '..', '.env')) 15 | load_dotenv(env_config_path) 16 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 17 | cls.sample_wav_file = os.path.join( 18 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 19 | ) 20 | cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file) 21 | 22 | cls.audio_augs = [ 23 | naa.CropAug(sampling_rate=cls.sampling_rate), 24 | naa.SpeedAug(), 25 | ] 26 | 27 | def test_augmenter_n_output(self): 28 | n = 3 29 | for aug in self.audio_augs: 30 | augmented_audios = aug.augment(self.audio, n=n) 31 | self.assertEqual(len(augmented_audios), n) 32 | for augmented_audio in augmented_audios: 33 | self.assertFalse(np.array_equal(augmented_audio, self.audio)) 34 | 35 | data = [self.audio, self.audio, self.audio] 36 | for aug in self.audio_augs: 37 | augmented_audios = aug.augment(data, n=1) 38 | self.assertEqual(len(augmented_audios), len(data)) 39 | for d, augmented_audio in zip(data, augmented_audios): 40 | self.assertFalse(np.array_equal(augmented_audio, d)) 41 | 42 | def test_augmenter_n_output_thread(self): 43 | n = 3 44 | for aug in self.audio_augs: 45 | augmented_audios = aug.augment([self.audio]*2, n=n, num_thread=n) 46 | self.assertGreater(len(augmented_audios), 1) 47 | for augmented_audio in augmented_audios: 48 | self.assertFalse(np.array_equal(augmented_audio, self.audio)) 49 | 
-------------------------------------------------------------------------------- /test/augmenter/test_base_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | from nlpaug import Augmenter 7 | 8 | 9 | class TestBaseAugmenter(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | cls.aug = Augmenter(name='base', method='flow', action='insert', 17 | aug_min=1, aug_max=10, aug_p=0.5) 18 | 19 | def test_generate_aug_cnt(self): 20 | self.assertEqual(0, self.aug.generate_aug_cnt(0)) 21 | self.assertEqual(1, self.aug.generate_aug_cnt(1)) 22 | self.assertGreater(self.aug.generate_aug_cnt(10), 1) 23 | -------------------------------------------------------------------------------- /test/augmenter/test_text_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import torch 4 | import numpy as np 5 | from dotenv import load_dotenv 6 | 7 | import nlpaug.augmenter.char as nac 8 | import nlpaug.augmenter.word as naw 9 | import nlpaug.augmenter.sentence as nas 10 | 11 | 12 | class TestTextAugmenter(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | env_config_path = os.path.abspath(os.path.join( 16 | os.path.dirname(__file__), '..', '..', '.env')) 17 | load_dotenv(env_config_path) 18 | 19 | cls.augs = [ 20 | nac.RandomCharAug(), 21 | naw.ContextualWordEmbsAug(), 22 | nas.ContextualWordEmbsForSentenceAug() 23 | ] 24 | 25 | def test_augmenter_n_output(self): 26 | text = 'The quick brown fox jumps over the lazy dog' 27 | n = 3 28 | for aug in self.augs: 29 | augmented_texts = aug.augment(text, n=n) 30 | self.assertGreater(len(augmented_texts), 1) 31 | for augmented_text in augmented_texts: 32 
| self.assertNotEqual(augmented_text, text) 33 | 34 | for aug in self.augs: 35 | augmented_texts = aug.augment([text]*2, n=1, num_thread=1) 36 | self.assertGreater(len(augmented_texts), 1) 37 | for augmented_text in augmented_texts: 38 | self.assertNotEqual(augmented_text, text) 39 | 40 | def test_augmenter_n_output_thread(self): 41 | text = 'The quick brown fox jumps over the lazy dog' 42 | n = 3 43 | for aug in self.augs: 44 | augmented_texts = aug.augment([text]*2, n=n, num_thread=n) 45 | self.assertGreater(len(augmented_texts), 1) 46 | for augmented_text in augmented_texts: 47 | self.assertNotEqual(augmented_text, text) 48 | 49 | def test_multiprocess_gpu(self): 50 | text = 'The quick brown fox jumps over the lazy dog' 51 | n = 3 52 | if torch.cuda.is_available(): 53 | aug = naw.ContextualWordEmbsAug(force_reload=True, device='cuda') 54 | 55 | augmented_texts = aug.augment(text, n=n, num_thread=n) 56 | self.assertGreater(len(augmented_texts), 1) 57 | for augmented_text in augmented_texts: 58 | self.assertNotEqual(augmented_text, text) 59 | 60 | self.assertTrue(True) 61 | 62 | def test_get_aug_range_idxes(self): 63 | aug = naw.RandomWordAug() 64 | self.assertTrue(len(aug._get_aug_range_idxes([])) == 0) -------------------------------------------------------------------------------- /test/augmenter/word/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/augmenter/word/__init__.py -------------------------------------------------------------------------------- /test/augmenter/word/test_antonym.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | import nlpaug.augmenter.word as naw 6 | 7 | 8 | class TestAntonym(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | env_config_path = 
os.path.abspath(os.path.join( 12 | os.path.dirname(__file__), '..', '..', '..', '.env')) 13 | load_dotenv(env_config_path) 14 | 15 | cls.augs = [ 16 | naw.AntonymAug() 17 | ] 18 | 19 | def test_substitute(self): 20 | texts = [ 21 | 'Older people feel more youthful when they also feel in control.', 22 | 'Good bad', 23 | 'Heart patients may benefit more from exercise than healthy people.', 24 | 'Beer first or wine, either way might not be fine.' 25 | ] 26 | 27 | for aug in self.augs: 28 | for text in texts: 29 | for _ in range(5): 30 | augmented_data = aug.augment(text) 31 | augmented_text = augmented_data[0] 32 | self.assertNotEqual(text, augmented_text) 33 | 34 | def test_unable_to_substitute(self): 35 | texts = [ 36 | 'Insomnia, sleep apnea diagnoses up sharply in U.S. Army.' 37 | ] 38 | 39 | for aug in self.augs: 40 | for text in texts: 41 | augmented_data = aug.augment(text) 42 | augmented_text = augmented_data[0] 43 | self.assertEqual(text, augmented_text) 44 | 45 | def test_skip_punctuation(self): 46 | text = '. . . . ! ? 
# @' 47 | 48 | for aug in self.augs: 49 | augmented_data = aug.augment(text) 50 | augmented_text = augmented_data[0] 51 | self.assertEqual(text, augmented_text) 52 | -------------------------------------------------------------------------------- /test/augmenter/word/test_spelling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | import nlpaug.augmenter.word as naw 6 | 7 | 8 | class TestSpelling(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | env_config_path = os.path.abspath(os.path.join( 12 | os.path.dirname(__file__), '..', '..', '..', '.env')) 13 | load_dotenv(env_config_path) 14 | 15 | cls.model_dir = os.path.join(os.environ.get("PACKAGE_DIR"), 'res', 'word', 'spelling') 16 | 17 | def test_read_default_dict(self): 18 | text = 'abcdef' 19 | 20 | aug = naw.SpellingAug() 21 | self.assertTrue(aug.model.dict_path) 22 | aug.augment(text) 23 | self.assertTrue(True) 24 | 25 | def test_oov(self): 26 | text = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' 27 | 28 | aug = naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt')) 29 | augmented_data = aug.augment(text) 30 | augmented_text = augmented_data[0] 31 | 32 | self.assertEqual(text, augmented_text) 33 | 34 | def test_substitute(self): 35 | texts = [ 36 | 'The quick brown fox jumps over the lazy dog' 37 | ] 38 | 39 | aug = naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt')) 40 | 41 | for text in texts: 42 | self.assertLess(0, len(text)) 43 | augmented_data = aug.augment(text) 44 | augmented_text = augmented_data[0] 45 | 46 | self.assertNotEqual(text, augmented_text) 47 | 48 | self.assertLess(0, len(texts)) 49 | 50 | def test_substitute_stopwords(self): 51 | texts = [ 52 | 'The quick brown fox jumps over the lazy dog' 53 | ] 54 | 55 | stopwords = [t.lower() for t in texts[0].split(' ')[:3]] 56 | aug_n = 3 57 | 58 | aug = 
naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt'), stopwords=stopwords) 59 | 60 | for text in texts: 61 | self.assertLess(0, len(text)) 62 | augmented_data = aug.augment(text) 63 | augmented_text = augmented_data[0] 64 | 65 | augmented_tokens = aug.tokenizer(augmented_text) 66 | tokens = aug.tokenizer(text) 67 | 68 | augmented_cnt = 0 69 | 70 | for token, augmented_token in zip(tokens, augmented_tokens): 71 | if token.lower() in stopwords and len(token) > aug_n: 72 | self.assertEqual(token.lower(), augmented_token) 73 | else: 74 | augmented_cnt += 1 75 | 76 | self.assertGreater(augmented_cnt, 0) 77 | 78 | self.assertLess(0, len(texts)) 79 | -------------------------------------------------------------------------------- /test/augmenter/word/test_split.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import nlpaug.augmenter.word as naw 4 | 5 | 6 | class TestSplit(unittest.TestCase): 7 | def test_split(self): 8 | texts = [ 9 | 'The quick brown fox jumps over the lazy dog' 10 | ] 11 | aug = naw.SplitAug() 12 | 13 | for text in texts: 14 | augmented_data = aug.augment(text) 15 | augmented_text = augmented_data[0] 16 | 17 | self.assertLess(len(text), len(augmented_text)) 18 | 19 | def test_split_min_char(self): 20 | texts = [ 21 | 'quick brown' 22 | ] 23 | aug = naw.SplitAug(min_char=6) 24 | 25 | for text in texts: 26 | augmented_data = aug.augment(text) 27 | augmented_text = augmented_data[0] 28 | 29 | self.assertEqual(text, augmented_text) 30 | -------------------------------------------------------------------------------- /test/flow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/flow/__init__.py -------------------------------------------------------------------------------- /test/flow/test_sometimes.py: 
class TestSometimes(unittest.TestCase):
    """Tests for the Sometimes flow, which applies augmenters probabilistically."""

    def test_dry_run(self):
        """Augmenting an empty list returns an empty list."""
        seq = naf.Sometimes()
        results = seq.augment([])
        self.assertEqual(0, len(results))

    def test_single_action(self):
        """A one-augmenter Sometimes flow changes at least one text.

        Bug fix: ``flow.augment`` returns a list of augmented texts, so the
        old ``text != augmented_text`` compared a str against a list and was
        always True, making the test vacuous.  Compare against the first
        element instead, consistent with the rest of the test suite.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#'
        ]

        # Since prob may be low and causing do-nothing runs, retry 5 times.
        at_least_one_not_equal = False
        for _ in range(0, 5):
            flow = naf.Sometimes([nac.RandomCharAug(action=Action.INSERT)], aug_p=0.6)
            for text in texts:
                augmented_text = flow.augment(text)[0]

                if text != augmented_text:
                    at_least_one_not_equal = True

                self.assertLess(0, len(text))

            if at_least_one_not_equal:
                break

        self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(texts))

    def test_multiple_actions(self):
        """Multi-augmenter Sometimes flows change at least one text each.

        Same fix as ``test_single_action``: compare against the first element
        of the returned list rather than the list itself.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sometimes([nac.RandomCharAug(action=Action.INSERT),
                           nac.RandomCharAug(action=Action.INSERT),
                           nac.RandomCharAug(action=Action.DELETE)],
                          aug_p=0.8),
            naf.Sometimes(
                [nac.OcrAug(), nac.KeyboardAug(aug_char_min=1),
                 nac.RandomCharAug(action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6),
                 nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)],
                aug_p=0.6)
        ]

        # Since prob may be low and causing do-nothing runs, retry 5 times.
        for flow in flows:
            at_least_one_not_equal = False
            for _ in range(0, 5):
                for text in texts:
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)[0]

                    if text != augmented_text:
                        at_least_one_not_equal = True

                    self.assertLess(0, len(text))

                if at_least_one_not_equal:
                    break

            self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
test_bogus_fasttext_loading(self): 16 | test_file = os.path.join(os.environ.get("PACKAGE_DIR"), 'res', 'text', 'bogus_fasttext.vec') 17 | 18 | # Change to not supporting incorrect format file after switching to use gensim package 19 | with self.assertRaises(Exception) as error: 20 | fasttext = nmw.Fasttext() 21 | fasttext.read(test_file) 22 | self.assertIn('cannot copy sequence with size 11 to array axis with dimension 10', str(error.exception)) 23 | 24 | # for word in fasttext.get_vocab(): 25 | # self.assertSequenceEqual(list(fasttext.model[word]), expected_vector) 26 | 27 | # self.assertSequenceEqual(["test1", "test2", "test_3", "test 4", "test -> 5"], fasttext.get_vocab()) 28 | 29 | # self.assertEqual(len(fasttext.get_vocab()), 5) 30 | -------------------------------------------------------------------------------- /test/profiler.py: -------------------------------------------------------------------------------- 1 | import nlpaug, transformers, torch, fairseq, nltk 2 | from platform import python_version 3 | import nlpaug.augmenter.audio as naa 4 | import nlpaug.augmenter.char as nac 5 | import nlpaug.augmenter.word as naw 6 | import nlpaug.augmenter.sentence as nas 7 | 8 | from pyinstrument import Profiler 9 | 10 | profiler = Profiler() 11 | 12 | def main(): 13 | model_paths = [ 14 | # 'distilbert-base-uncased', 15 | 'bert-base-uncased', 16 | # 'bert-base-cased', 17 | # 'xlnet-base-cased', 18 | # 'roberta-base', 19 | # 'distilroberta-base' 20 | ] 21 | for model_path in model_paths: 22 | print('-----------------:', model_path) 23 | aug = naw.ContextualWordEmbsAug(model_path=model_path) 24 | text = 'The quick brown fox jumps over the lazaaaaaaaaay dog' 25 | augmented_text = aug.augment([text]*2) 26 | # print(augmented_text) 27 | 28 | 29 | if __name__ == '__main__': 30 | print('python_version:{}'.format(python_version())) 31 | print('nlpaug:{}'.format(nlpaug.__version__)) 32 | print('transformers:{}'.format(transformers.__version__)) 33 | 
print('torch:{}'.format(torch.__version__)) 34 | print('fairseq:{}'.format(fairseq.__version__)) 35 | print('nltk:{}'.format(nltk.__version__)) 36 | 37 | # yappi.set_clock_type("cpu") # Use set_clock_type("wall") for wall time 38 | # yappi.start() 39 | profiler.start() 40 | main() 41 | profiler.stop() 42 | print(profiler.output_text(unicode=True, color=True)) 43 | # yappi.get_func_stats().print_all() 44 | # yappi.get_thread_stats().print_all() -------------------------------------------------------------------------------- /test/profiling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/profiling/__init__.py -------------------------------------------------------------------------------- /test/profiling/sentence/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/profiling/sentence/__init__.py -------------------------------------------------------------------------------- /test/profiling/sentence/test_context_word_embs_sentence_profiling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import time 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.sentence as nas 7 | 8 | 9 | class TestContextualWordEmbsAugProfiling(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | cls.text = 'The quick brown fox jumps over the lazy dog.' 
17 | 18 | def test_optimize(self): 19 | model_paths = ['gpt2', 'distilgpt2'] 20 | device = 'cpu' 21 | enable_optimize = {'external_memory': 1024, 'return_proba': True} 22 | disable_optimize = {'external_memory': 0, 'return_proba': True} 23 | epoch = 10 24 | 25 | for model_path in model_paths: 26 | # Optimized 27 | durations = [] 28 | aug = nas.ContextualWordEmbsForSentenceAug( 29 | model_path=model_path, device=device, optimize=enable_optimize, force_reload=True) 30 | for i in range(epoch): 31 | start_dt = time.monotonic() 32 | for j in range(epoch): 33 | aug.augment(self.text) 34 | end_dt = time.monotonic() 35 | durations.append(round(end_dt-start_dt, 2)) 36 | 37 | optimized_total_duration = sum(durations) 38 | optimized_average_duration = round(optimized_total_duration/len(durations), 2) 39 | 40 | # No optimized 41 | durations = [] 42 | aug.model.optimize = disable_optimize 43 | for _ in range(epoch): 44 | start_dt = time.monotonic() 45 | for _ in range(epoch): 46 | aug.augment(self.text) 47 | end_dt = time.monotonic() 48 | durations.append(round(end_dt - start_dt, 2)) 49 | 50 | no_optimized_total_duration = sum(durations) 51 | no_optimized_average_duration = round(no_optimized_total_duration / len(durations), 2) 52 | 53 | print('Model:{}, Optimized: {}({}), No Optimized: {}({})'.format( 54 | model_path, optimized_total_duration, optimized_average_duration, 55 | no_optimized_total_duration, no_optimized_average_duration 56 | )) 57 | 58 | self.assertGreater(no_optimized_total_duration, optimized_total_duration) 59 | self.assertGreater(no_optimized_average_duration, optimized_average_duration) 60 | -------------------------------------------------------------------------------- /test/profiling/word/profile_context_word_embs.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | def run_core(): 4 | print(datetime.datetime.now(), 'before import') 5 | import nlpaug.augmenter.word as naw 6 | 7 | 
print(datetime.datetime.now(), 'before init') 8 | aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', model_type="bert", use_custom_api=True) 9 | text = 'The quick brown fox jumps over the lazy dog.' 10 | print(datetime.datetime.now(), 'before augment') 11 | aug.augment([text] * 2) 12 | print(datetime.datetime.now(), 'done') 13 | 14 | if __name__ == '__main__': 15 | run_core() 16 | -------------------------------------------------------------------------------- /test/res/audio/Pink_noise.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/res/audio/Pink_noise.ogg -------------------------------------------------------------------------------- /test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav -------------------------------------------------------------------------------- /test/res/common/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": [ 3 | "3" 4 | ] 5 | } -------------------------------------------------------------------------------- /test/run_profile_context_word_embs.sh: -------------------------------------------------------------------------------- 1 | py-spy record -o profile.svg -- python ./test/profiling/word/profile_context_word_embs.py -------------------------------------------------------------------------------- /test/run_profile_import.sh: -------------------------------------------------------------------------------- 1 | python -X importtime -c 'import nlpaug' 2> nlpaug-imports.log -------------------------------------------------------------------------------- /test/util/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/util/__init__.py -------------------------------------------------------------------------------- /test/util/selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/util/selection/__init__.py -------------------------------------------------------------------------------- /test/util/text/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | from nlpaug.util.text.tokenizer import Tokenizer 7 | 8 | 9 | class TestTokenizer(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | def test_tokenizer(self): 17 | text = 'The quick brown fox jumps over the lazy dog?' 18 | 19 | tokens = Tokenizer.tokenizer(text) 20 | expected_tokens = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '?'] 21 | self.assertEqual(tokens, expected_tokens) 22 | 23 | 24 | def test_reverse_tokenizer(self): 25 | text = 'The quick (brown) [fox] {jumps} over the lazy dog?' 26 | 27 | tokens = Tokenizer.tokenizer(text) 28 | self.assertEqual(text, Tokenizer.reverse_tokenizer(tokens)) --------------------------------------------------------------------------------