├── .codacy.yml ├── .gitattributes ├── .github └── FUNDING.yml ├── .gitignore ├── .readthedocs.yml ├── .travis.yml ├── CHANGE.md ├── CITED.md ├── LICENSE ├── MANIFEST.in ├── README.md ├── SOURCE.md ├── codecov.yml ├── conda.sh ├── docs ├── Makefile ├── augmenter │ ├── audio │ │ ├── audio.rst │ │ ├── crop.rst │ │ ├── loudness.rst │ │ ├── mask.rst │ │ ├── noise.rst │ │ ├── normalization.rst │ │ ├── pitch.rst │ │ ├── shift.rst │ │ ├── speed.rst │ │ └── vtlp.rst │ ├── augmenter.rst │ ├── char │ │ ├── char.rst │ │ ├── keyboard.rst │ │ ├── ocr.rst │ │ └── random.rst │ ├── sentence │ │ ├── abst_summ.rst │ │ ├── context_word_embs_sentence.rst │ │ ├── lambada.rst │ │ ├── random.rst │ │ └── sentence.rst │ ├── spectrogram │ │ ├── frequency_masking.rst │ │ ├── spectrogram.rst │ │ └── time_masking.rst │ └── word │ │ ├── antonym.rst │ │ ├── back_translation.rst │ │ ├── context_word_embs.rst │ │ ├── random.rst │ │ ├── reserved.rst │ │ ├── spelling.rst │ │ ├── split.rst │ │ ├── synonym.rst │ │ ├── tfidf.rst │ │ ├── word.rst │ │ └── word_embs.rst ├── conf.py ├── example │ └── example.rst ├── flow │ ├── flow.rst │ ├── sequential.rst │ └── sometimes.rst ├── index.rst ├── make.bat ├── overview │ └── overview.rst └── util │ ├── download.rst │ └── util.rst ├── example ├── audio_augmenter.ipynb ├── change_log.ipynb ├── custom_augmenter.ipynb ├── flow.ipynb ├── lambada-train_model.ipynb ├── quick_example.ipynb ├── spectrogram_augmenter.ipynb ├── textual_augmenter.ipynb ├── textual_language_augmenter.ipynb └── tfidf-train_model.ipynb ├── meta.yaml ├── nlpaug ├── .gitignore ├── __init__.py ├── augmenter │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── audio_augmenter.py │ │ ├── crop.py │ │ ├── inversion.py │ │ ├── loudness.py │ │ ├── mask.py │ │ ├── noise.py │ │ ├── normalization.py │ │ ├── pitch.py │ │ ├── shift.py │ │ ├── speed.py │ │ └── vtlp.py │ ├── augment.py │ ├── char │ │ ├── __init__.py │ │ ├── char_augmenter.py │ │ ├── keyboard.py │ │ ├── ocr.py │ │ └── random.py │ ├── 
sentence │ │ ├── __init__.py │ │ ├── abst_summ.py │ │ ├── context_word_embs_sentence.py │ │ ├── lambada.py │ │ ├── random.py │ │ └── sentence_augmenter.py │ ├── spectrogram │ │ ├── __init__.py │ │ ├── frequency_masking.py │ │ ├── loudness.py │ │ ├── spectrogram_augmenter.py │ │ ├── time_masking.py │ │ └── time_warping.py │ └── word │ │ ├── __init__.py │ │ ├── antonym.py │ │ ├── back_translation.py │ │ ├── context_word_embs.py │ │ ├── random.py │ │ ├── reserved.py │ │ ├── spelling.py │ │ ├── split.py │ │ ├── synonym.py │ │ ├── tfidf.py │ │ ├── word_augmenter.py │ │ └── word_embs.py ├── base_augmenter.py ├── flow │ ├── __init__.py │ ├── pipeline.py │ ├── sequential.py │ └── sometimes.py ├── model │ ├── __init__.py │ ├── audio │ │ ├── __init__.py │ │ ├── audio.py │ │ ├── crop.py │ │ ├── inversion.py │ │ ├── loudness.py │ │ ├── mask.py │ │ ├── noise.py │ │ ├── normalization.py │ │ ├── pitch.py │ │ ├── shift.py │ │ ├── speed.py │ │ └── vtlp.py │ ├── base_model.py │ ├── char │ │ ├── __init__.py │ │ ├── char.py │ │ ├── keyboard.py │ │ └── ocr.py │ ├── lang_models │ │ ├── __init__.py │ │ ├── bart.py │ │ ├── bert.py │ │ ├── distilbert.py │ │ ├── fairseq.py │ │ ├── fill_mask_transformers.py │ │ ├── gpt2.py │ │ ├── lambada.py │ │ ├── language_models.py │ │ ├── machine_translation_transformers.py │ │ ├── roberta.py │ │ ├── summarization_transformers.py │ │ ├── t5.py │ │ ├── text_generation_transformers.py │ │ └── xlnet.py │ ├── spectrogram │ │ ├── __init__.py │ │ ├── frequency_masking.py │ │ ├── loudness.py │ │ ├── spectrogram.py │ │ ├── time_masking.py │ │ └── time_warping.py │ ├── word_dict │ │ ├── __init__.py │ │ ├── ppdb.py │ │ ├── spelling.py │ │ ├── word_dictionary.py │ │ └── wordnet.py │ ├── word_embs │ │ ├── __init__.py │ │ ├── fasttext.py │ │ ├── glove.py │ │ ├── word2vec.py │ │ └── word_embeddings.py │ ├── word_rule │ │ ├── __init__.py │ │ ├── shuffle.py │ │ └── word_rule.py │ └── word_stats │ │ ├── __init__.py │ │ ├── tfidf.py │ │ └── word_statistics.py ├── res │ 
├── char │ │ ├── keyboard │ │ │ ├── de.json │ │ │ ├── en.json │ │ │ ├── es.json │ │ │ ├── fr.json │ │ │ ├── he.json │ │ │ ├── it.json │ │ │ ├── nl.json │ │ │ ├── pl.json │ │ │ ├── th.json │ │ │ ├── tr.json │ │ │ └── uk.json │ │ └── ocr │ │ │ └── en.json │ └── word │ │ └── spelling │ │ └── spelling_en.txt └── util │ ├── __init__.py │ ├── action.py │ ├── audio │ ├── __init__.py │ ├── loader.py │ └── visualizer.py │ ├── decorator │ ├── __init__.py │ └── deprecation.py │ ├── doc │ ├── __init__.py │ ├── change_log.py │ ├── doc.py │ └── token.py │ ├── exception │ ├── __init__.py │ ├── exception_info.py │ └── warning.py │ ├── file │ ├── __init__.py │ ├── download.py │ ├── library.py │ └── read.py │ ├── lib_ver.py │ ├── logger │ ├── __init__.py │ └── logger.py │ ├── math │ ├── __init__.py │ └── normalization.py │ ├── method.py │ ├── selection │ ├── __init__.py │ ├── filtering.py │ └── randomness.py │ └── text │ ├── __init__.py │ ├── part_of_speech.py │ └── tokenizer.py ├── pypi.sh ├── requirements.txt ├── requirements_dev.txt ├── res ├── audio_example.png ├── lambada_algo.png ├── logo_small.png └── textual_example.png ├── script.txt ├── scripts ├── lambada │ ├── data_processing.py │ ├── run_clm.py │ └── train_cls.py ├── run_lambada.py └── train_lambada.sh ├── setup.py └── test ├── __init__.py ├── augmenter ├── __init__.py ├── audio │ ├── __init__.py │ ├── test_audio.py │ ├── test_crop.py │ ├── test_inversion.py │ ├── test_loudness.py │ ├── test_mask.py │ ├── test_noise.py │ ├── test_normalization.py │ ├── test_pitch.py │ ├── test_shift.py │ ├── test_speed.py │ └── test_vtlp.py ├── char │ ├── __init__.py │ ├── test_char.py │ ├── test_keyboard.py │ ├── test_ocr.py │ └── test_random_char.py ├── sentence │ ├── __init__.py │ ├── test_abst_summ.py │ ├── test_context_word_embs_sentence.py │ ├── test_lambada.py │ ├── test_random.py │ └── test_sentence.py ├── spectrogram │ ├── __init__.py │ ├── test_frequency_masking.py │ ├── test_loudness_spec.py │ ├── test_spectrogram.py │ └── 
test_time_masking.py ├── test_audio_augmenter.py ├── test_base_augmenter.py ├── test_text_augmenter.py └── word │ ├── __init__.py │ ├── test_antonym.py │ ├── test_back_translation.py │ ├── test_context_word_embs.py │ ├── test_random_word.py │ ├── test_reserved.py │ ├── test_spelling.py │ ├── test_split.py │ ├── test_synonym.py │ ├── test_tfidf.py │ ├── test_word.py │ └── test_word_embs.py ├── flow ├── __init__.py ├── test_flow.py ├── test_sequential.py └── test_sometimes.py ├── model ├── __init__.py ├── char │ ├── __init__.py │ └── test_keyboard_model.py └── word │ ├── __init__.py │ └── test_word_embs_model.py ├── profiler.py ├── profiling ├── __init__.py ├── sentence │ ├── __init__.py │ └── test_context_word_embs_sentence_profiling.py └── word │ └── profile_context_word_embs.py ├── res ├── audio │ ├── Pink_noise.ogg │ └── Yamaha-V50-Rock-Beat-120bpm.wav ├── common │ └── sample.json └── text │ └── classification.csv ├── run_profile_context_word_embs.sh ├── run_profile_import.sh ├── run_test.py └── util ├── __init__.py ├── selection ├── __init__.py └── test_filtering.py └── text └── test_tokenizer.py /.codacy.yml: -------------------------------------------------------------------------------- 1 | exclude_paths: 2 | - test/* 3 | - README.md 4 | - CHANGE.md 5 | - SOURCE.md 6 | - docs/conf.py -------------------------------------------------------------------------------- /.gitattributes: -------------------------------------------------------------------------------- 1 | # Auto detect text files and perform LF normalization 2 | * text=auto 3 | -------------------------------------------------------------------------------- /.github/FUNDING.yml: -------------------------------------------------------------------------------- 1 | # These are supported funding model platforms 2 | 3 | github: [makcedward] 4 | patreon: # Replace with a single Patreon username 5 | open_collective: # Replace with a single Open Collective username 6 | ko_fi: # Replace with a single Ko-fi 
username 7 | tidelift: # Replace with a single Tidelift platform-name/package-name e.g., npm/babel 8 | community_bridge: # Replace with a single Community Bridge project-name e.g., cloud-foundry 9 | liberapay: # Replace with a single Liberapay username 10 | issuehunt: # Replace with a single IssueHunt username 11 | otechie: # Replace with a single Otechie username 12 | custom: # Replace with up to 4 custom sponsorship URLs e.g., ['link1', 'link2'] 13 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | *.zip 28 | .DS_Store 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 
33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | .hypothesis/ 50 | .pytest_cache/ 51 | 52 | # Translations 53 | *.mo 54 | *.pot 55 | 56 | # Django stuff: 57 | *.log 58 | local_settings.py 59 | db.sqlite3 60 | 61 | # Flask stuff: 62 | instance/ 63 | .webassets-cache 64 | 65 | # Scrapy stuff: 66 | .scrapy 67 | 68 | # Sphinx documentation 69 | docs/_build/ 70 | 71 | # PyBuilder 72 | target/ 73 | 74 | # Jupyter Notebook 75 | .ipynb_checkpoints 76 | notebook/ 77 | 78 | # pyenv 79 | .python-version 80 | 81 | # celery beat schedule file 82 | celerybeat-schedule 83 | 84 | # SageMath parsed files 85 | *.sage.py 86 | 87 | # Environments 88 | .env 89 | .venv 90 | env/ 91 | venv/ 92 | ENV/ 93 | env.bak/ 94 | venv.bak/ 95 | 96 | # Spyder project settings 97 | .spyderproject 98 | .spyproject 99 | 100 | # Rope project settings 101 | .ropeproject 102 | 103 | # mkdocs documentation 104 | /site 105 | 106 | # mypy 107 | .mypy_cache/ 108 | 109 | # IDE 110 | .idea/ 111 | 112 | # model 113 | model/* 114 | *.txt 115 | *.bin 116 | *.vec 117 | *.zip 118 | *.gz -------------------------------------------------------------------------------- /.readthedocs.yml: -------------------------------------------------------------------------------- 1 | python: 2 | version: 3.8 3 | -------------------------------------------------------------------------------- /.travis.yml: -------------------------------------------------------------------------------- 1 | language: python 2 | python: 3 | - "3.6" 4 | 5 | install: 6 | - pip install -r requirements.txt 7 | - pip install coverage 8 | - pip install codecov 9 | - pip install . 
10 | 11 | script: 12 | - python test/run_test.py 13 | 14 | after_success: 15 | - codecov -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2019 Edward Ma 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 
-------------------------------------------------------------------------------- /MANIFEST.in: -------------------------------------------------------------------------------- 1 | recursive-include nlpaug/res *.json 2 | recursive-include nlpaug/res *.txt -------------------------------------------------------------------------------- /codecov.yml: -------------------------------------------------------------------------------- 1 | #see https://github.com/codecov/support/wiki/Codecov-Yaml 2 | codecov: 3 | notify: 4 | require_ci_to_pass: yes 5 | 6 | coverage: 7 | precision: 2 # 2 = xx.xx%, 0 = xx% 8 | round: nearest # how coverage is rounded: down/up/nearest 9 | range: 10...90 # custom range of coverage colors from red -> yellow -> green 10 | status: 11 | # https://codecov.readme.io/v1.0/docs/commit-status 12 | project: 13 | default: 14 | against: auto 15 | target: 40% # specify the target coverage for each commit status 16 | threshold: 20% # allow this little decrease on project 17 | # https://github.com/codecov/support/wiki/Filtering-Branches 18 | # branches: master 19 | if_ci_failed: error 20 | # https://github.com/codecov/support/wiki/Patch-Status 21 | patch: 22 | default: 23 | against: parent 24 | target: 30% # specify the target "X%" coverage to hit 25 | # threshold: 50% # allow this much decrease on patch 26 | changes: false 27 | 28 | parsers: 29 | gcov: 30 | branch_detection: 31 | conditional: true 32 | loop: true 33 | macro: false 34 | method: false 35 | javascript: 36 | enable_partials: false 37 | 38 | comment: 39 | layout: header, diff 40 | require_changes: false 41 | behavior: default # update if exists else create new 42 | branches: * -------------------------------------------------------------------------------- /conda.sh: -------------------------------------------------------------------------------- 1 | #!/bin/bash 2 | 3 | build_name='.' 
4 | pkg_name='nlpaug' 5 | py_vers=(3.5 3.6 3.7 3.8 3.9) 6 | pkg_ver='1.1.10' 7 | conda_dir="/home/edward/anaconda3/conda-bld" 8 | 9 | echo "Building conda package ..." 10 | for i in "${py_vers[@]}" 11 | do 12 | conda-build --python $i $build_name 13 | done 14 | 15 | echo "Converting package to other platforms" 16 | platforms=(osx-64 linux-32 win-32 win-64) 17 | find "$conda_dir"/linux-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 | while read file 18 | do 19 | for platform in "${platforms[@]}" 20 | do 21 | conda convert --platform $platform $file -o "$conda_dir" 22 | done 23 | done 24 | 25 | echo "Upload to Anaconda" 26 | for platform in "${platforms[@]}" 27 | do 28 | find "$conda_dir"/"$platform"/"$pkg_name"*"$pkg_ver"*.tar.bz2 | while read file 29 | do 30 | anaconda upload --force $file 31 | done 32 | done 33 | 34 | 35 | anaconda upload --force "$conda_dir"/linux-32/"$pkg_name"*"$pkg_ver"*.tar.bz2 36 | anaconda upload --force "$conda_dir"/linux-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 37 | anaconda upload --force "$conda_dir"/win-32/"$pkg_name"*"$pkg_ver"*.tar.bz2 38 | anaconda upload --force "$conda_dir"/win-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 39 | anaconda upload --force "$conda_dir"/osx-64/"$pkg_name"*"$pkg_ver"*.tar.bz2 40 | -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # Minimal makefile for Sphinx documentation 2 | # 3 | 4 | # You can set these variables from the command line. 5 | SPHINXOPTS = 6 | SPHINXBUILD = sphinx-build 7 | SPHINXPROJ = nlpaug 8 | SOURCEDIR = source 9 | BUILDDIR = build 10 | 11 | # Put it first so that "make" without argument is like "make help". 12 | help: 13 | @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) 14 | 15 | .PHONY: help Makefile 16 | 17 | # Catch-all target: route all unknown targets to Sphinx using the new 18 | # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
19 | %: Makefile 20 | @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) -------------------------------------------------------------------------------- /docs/augmenter/audio/audio.rst: -------------------------------------------------------------------------------- 1 | Audio Augmenter 2 | =============== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./crop 8 | ./loudness 9 | ./mask 10 | ./noise 11 | ./normalization 12 | ./pitch 13 | ./shift 14 | ./speed 15 | ./vtlp -------------------------------------------------------------------------------- /docs/augmenter/audio/crop.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.crop 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.crop 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/loudness.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.loudness 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.loudness 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/mask.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.mask 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.mask 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/noise.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.noise 2 | ============================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.audio.noise 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/normalization.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.normalization 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.normalization 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/pitch.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.pitch 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.pitch 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/shift.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.shift 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.shift 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/speed.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.speed 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.audio.speed 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/audio/vtlp.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.audio\.vtlp 2 | ============================ 3 | 4 | .. 
automodule:: nlpaug.augmenter.audio.vtlp 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/augmenter.rst: -------------------------------------------------------------------------------- 1 | Augmenter 2 | ========= 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./audio/audio 8 | ./char/char 9 | ./sentence/sentence 10 | ./spectrogram/spectrogram 11 | ./word/word -------------------------------------------------------------------------------- /docs/augmenter/char/char.rst: -------------------------------------------------------------------------------- 1 | Character Augmenter 2 | =================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./keyboard 8 | ./ocr 9 | ./random -------------------------------------------------------------------------------- /docs/augmenter/char/keyboard.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.keyboard 2 | =============================== 3 | 4 | .. automodule:: nlpaug.augmenter.char.keyboard 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/ocr.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.ocr 2 | ============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.char.ocr 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/char/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.char\.random 2 | ============================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.char.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/abst_summ.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.abst_summ 2 | ===================================================== 3 | 4 | .. automodule:: nlpaug.augmenter.sentence.abst_summ 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/context_word_embs_sentence.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.context_word_embs_sentence 2 | ===================================================== 3 | 4 | .. automodule:: nlpaug.augmenter.sentence.context_word_embs_sentence 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/lambada.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.lambada 2 | ===================================================== 3 | 4 | .. automodule:: nlpaug.augmenter.sentence.lambada 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.sentence\.random 2 | ===================================================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.sentence.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/sentence/sentence.rst: -------------------------------------------------------------------------------- 1 | Sentence Augmenter 2 | ================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./abst_summ 8 | ./context_word_embs_sentence 9 | ./lambada 10 | ./random -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/frequency_masking.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.spectrogram\.frequency_masking 2 | =============================================== 3 | 4 | .. automodule:: nlpaug.augmenter.spectrogram.frequency_masking 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/spectrogram.rst: -------------------------------------------------------------------------------- 1 | Spectrogram Augmenter 2 | ===================== 3 | 4 | .. toctree:: 5 | :maxdepth: 6 6 | 7 | ./frequency_masking 8 | ./time_masking -------------------------------------------------------------------------------- /docs/augmenter/spectrogram/time_masking.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.spectrogram\.time_masking 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.augmenter.spectrogram.time_masking 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/antonym.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.antonym 2 | ============================== 3 | 4 | .. 
automodule:: nlpaug.augmenter.word.antonym 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/back_translation.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.back_translation 2 | ======================================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.back_translation 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/context_word_embs.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.context_word_embs 2 | ======================================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.context_word_embs 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/random.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.random 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.random 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/reserved.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.reserved 2 | ================================ 3 | 4 | .. 
automodule:: nlpaug.augmenter.word.reserved 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/spelling.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.spelling 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.spelling 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/split.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.split 2 | ============================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.split 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/synonym.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.synonym 2 | ============================== 3 | 4 | .. automodule:: nlpaug.augmenter.word.synonym 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/tfidf.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.tfidf 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.tfidf 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/augmenter/word/word.rst: -------------------------------------------------------------------------------- 1 | Word Augmenter 2 | ============== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 6 6 | 7 | ./antonym 8 | ./back_translation 9 | ./context_word_embs 10 | ./random 11 | ./reserved 12 | ./spelling 13 | ./split 14 | ./synonym 15 | ./tfidf 16 | ./word_embs -------------------------------------------------------------------------------- /docs/augmenter/word/word_embs.rst: -------------------------------------------------------------------------------- 1 | nlpaug.augmenter.word\.word_embs 2 | ================================ 3 | 4 | .. automodule:: nlpaug.augmenter.word.word_embs 5 | :members: 6 | :inherited-members: 7 | :show-inheritance: 8 | -------------------------------------------------------------------------------- /docs/example/example.rst: -------------------------------------------------------------------------------- 1 | Example 2 | ======= 3 | 4 | The following examples show a standard use case for augmenter. 5 | 6 | - `Audio augmenters`_ 7 | - `Textual augmenters`_ 8 | - `Spectrogram augmenters`_ 9 | - `Custom augmenter`_ 10 | - `TF-IDF model training`_ 11 | - `Flow`_ 12 | 13 | .. _Audio augmenters: https://github.com/makcedward/nlpaug/blob/master/example/audio_augmenter.ipynb 14 | .. _Textual augmenters: https://github.com/makcedward/nlpaug/blob/master/example/textual_augmenter.ipynb 15 | .. _Spectrogram augmenters: https://github.com/makcedward/nlpaug/blob/master/example/spectrogram_augmenter.ipynb 16 | .. _Custom augmenter: https://github.com/makcedward/nlpaug/blob/master/example/custom_augmenter.ipynb 17 | .. _TF-IDF model training: https://github.com/makcedward/nlpaug/blob/master/example/tfidf-train_model.ipynb 18 | .. _Flow: https://github.com/makcedward/nlpaug/blob/master/example/flow.ipynb -------------------------------------------------------------------------------- /docs/flow/flow.rst: -------------------------------------------------------------------------------- 1 | Flow 2 | ==== 3 | 4 | .. 
toctree:: 5 | :maxdepth: 3 6 | 7 | ./sequential 8 | ./sometimes 9 | -------------------------------------------------------------------------------- /docs/flow/sequential.rst: -------------------------------------------------------------------------------- 1 | nlpaug.flow\.sequential 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.flow.sequential 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/flow/sometimes.rst: -------------------------------------------------------------------------------- 1 | nlpaug.flow\.sometimes 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.flow.sometimes 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/index.rst: -------------------------------------------------------------------------------- 1 | nlpaug 2 | ====== 3 | 4 | `nlpgaug` is a library for textual augmentation in machine learning experiments. 5 | The goal is improving deep learning model performance by generating textual data. 6 | It also able to generate adversarial examples to prevent adversarial attacks. 7 | 8 | .. toctree:: 9 | :maxdepth: 3 10 | :caption: Contents: 11 | 12 | ./overview/overview 13 | ./example/example 14 | ./augmenter/augmenter 15 | ./flow/flow 16 | ./util/util 17 | 18 | See :ref:`modindex` for API. 19 | 20 | Indices and tables 21 | ================== 22 | 23 | * :ref:`genindex` 24 | * :ref:`modindex` 25 | * :ref:`search` -------------------------------------------------------------------------------- /docs/make.bat: -------------------------------------------------------------------------------- 1 | @ECHO OFF 2 | 3 | pushd %~dp0 4 | 5 | REM Command file for Sphinx documentation 6 | 7 | if "%SPHINXBUILD%" == "" ( 8 | set SPHINXBUILD=sphinx-build 9 | ) 10 | set SOURCEDIR=. 
11 | set BUILDDIR=../build 12 | set SPHINXPROJ=nlpaug 13 | 14 | if "%1" == "" goto help 15 | 16 | %SPHINXBUILD% >NUL 2>NUL 17 | if errorlevel 9009 ( 18 | echo. 19 | echo.The 'sphinx-build' command was not found. Make sure you have Sphinx 20 | echo.installed, then set the SPHINXBUILD environment variable to point 21 | echo.to the full path of the 'sphinx-build' executable. Alternatively you 22 | echo.may add the Sphinx directory to PATH. 23 | echo. 24 | echo.If you don't have Sphinx installed, grab it from 25 | echo.http://sphinx-doc.org/ 26 | exit /b 1 27 | ) 28 | 29 | %SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 30 | goto end 31 | 32 | :help 33 | %SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% 34 | 35 | :end 36 | popd 37 | -------------------------------------------------------------------------------- /docs/overview/overview.rst: -------------------------------------------------------------------------------- 1 | Overview 2 | ======== 3 | 4 | This python library helps you with augmenting nlp for your machine learning projects. Visit this introduction to understand about Data Augmentation in NLP. Augmenter is the basic element of augmentation while Flow is a pipeline to orchestra multi augmenter together. 5 | 6 | - `Data Augmentation library for Text`_ 7 | - `Data Augmentation library for Speech Recognition`_ 8 | - `Data Augmentation library for Audio`_ 9 | - `Does your NLP model able to prevent adversarial attack?`_ 10 | 11 | .. _Data Augmentation library for Text: https://towardsdatascience.com/data-augmentation-library-for-text-9661736b13ff 12 | .. _Data Augmentation library for Speech Recognition: https://towardsdatascience.com/data-augmentation-for-speech-recognition-e7c607482e78 13 | .. _Data Augmentation library for Audio: https://towardsdatascience.com/data-augmentation-for-audio-76912b01fdf6 14 | .. 
_Does your NLP model able to prevent adversarial attack?: https://medium.com/hackernoon/does-your-nlp-model-able-to-prevent-adversarial-attack-45b5ab75129c -------------------------------------------------------------------------------- /docs/util/download.rst: -------------------------------------------------------------------------------- 1 | nlpaug.util.file\.download 2 | ========================================== 3 | 4 | .. automodule:: nlpaug.util.file.download 5 | :members: 6 | :show-inheritance: 7 | -------------------------------------------------------------------------------- /docs/util/util.rst: -------------------------------------------------------------------------------- 1 | Util 2 | ==== 3 | 4 | .. toctree:: 5 | :maxdepth: 3 6 | 7 | ./download 8 | -------------------------------------------------------------------------------- /meta.yaml: -------------------------------------------------------------------------------- 1 | {% set name = "nlpaug" %} 2 | {% set version = "1.1.9" %} 3 | 4 | package: 5 | name: "{{ name|lower }}" 6 | version: "{{ version }}" 7 | 8 | requirements: 9 | host: 10 | - pip 11 | - python 12 | run: 13 | - python 14 | 15 | about: 16 | home: "https://github.com/makcedward/nlpaug" 17 | license: MIT 18 | summary: "Natural language processing augmentation library for deep neural networks." 
19 | dev_url: "https://nlpaug.readthedocs.io/" 20 | -------------------------------------------------------------------------------- /nlpaug/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | *.egg-info/ 24 | .installed.cfg 25 | *.egg 26 | MANIFEST 27 | 28 | # PyInstaller 29 | # Usually these files are written by a python script from a template 30 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 31 | *.manifest 32 | *.spec 33 | 34 | # Installer logs 35 | pip-log.txt 36 | pip-delete-this-directory.txt 37 | 38 | # Unit test / coverage reports 39 | htmlcov/ 40 | .tox/ 41 | .coverage 42 | .coverage.* 43 | .cache 44 | nosetests.xml 45 | coverage.xml 46 | *.cover 47 | .hypothesis/ 48 | .pytest_cache/ 49 | 50 | # Translations 51 | *.mo 52 | *.pot 53 | 54 | # Django stuff: 55 | *.log 56 | local_settings.py 57 | db.sqlite3 58 | 59 | # Flask stuff: 60 | instance/ 61 | .webassets-cache 62 | 63 | # Scrapy stuff: 64 | .scrapy 65 | 66 | # Sphinx documentation 67 | docs/_build/ 68 | 69 | # PyBuilder 70 | target/ 71 | 72 | # Jupyter Notebook 73 | .ipynb_checkpoints 74 | 75 | # pyenv 76 | .python-version 77 | 78 | # celery beat schedule file 79 | celerybeat-schedule 80 | 81 | # SageMath parsed files 82 | *.sage.py 83 | 84 | # Environments 85 | .env 86 | .venv 87 | env/ 88 | venv/ 89 | ENV/ 90 | env.bak/ 91 | venv.bak/ 92 | 93 | # Spyder project settings 94 | .spyderproject 95 | .spyproject 96 | 97 | # Rope project settings 98 | .ropeproject 99 | 100 | # mkdocs documentation 101 | /site 102 | 103 | # mypy 104 | .mypy_cache/ 
-------------------------------------------------------------------------------- /nlpaug/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.base_augmenter import * 3 | 4 | __all__ = ['base_augmenter'] 5 | 6 | __version__ = '1.1.11' 7 | __description__ = 'Natural language processing augmentation library for deep neural networks.' 8 | __url__ = 'https://github.com/makcedward/nlpaug' 9 | __author__ = 'Edward Ma' 10 | __author_email__ = 'makcedward@gmail.com' -------------------------------------------------------------------------------- /nlpaug/augmenter/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/nlpaug/augmenter/__init__.py -------------------------------------------------------------------------------- /nlpaug/augmenter/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.audio.audio_augmenter import AudioAugmenter 3 | from nlpaug.augmenter.audio.noise import NoiseAug 4 | from nlpaug.augmenter.audio.shift import ShiftAug 5 | from nlpaug.augmenter.audio.speed import SpeedAug 6 | from nlpaug.augmenter.audio.pitch import PitchAug 7 | from nlpaug.augmenter.audio.loudness import LoudnessAug 8 | from nlpaug.augmenter.audio.crop import CropAug 9 | from nlpaug.augmenter.audio.mask import MaskAug 10 | from nlpaug.augmenter.audio.vtlp import VtlpAug 11 | from nlpaug.augmenter.audio.normalization import NormalizeAug 12 | from nlpaug.augmenter.audio.inversion import PolarityInverseAug 13 | 14 | __all__ = ['audio_augmenter', 'noise', 'shift', 'speed', 'pitch', 'loudness', 'crop', 'mask', 'vtlp', 15 | 'normalization', 'inversion'] 16 | -------------------------------------------------------------------------------- 
class AudioAugmenter(Augmenter):
    """Base class for all audio augmenters.

    Holds the parameters shared by the audio augmentation operations (target
    zone, coverage, factor range, duration) and provides helpers to sample a
    random augmentation factor and to compute the segment of the signal that
    will be augmented.

    :param str action: Augmenter action (e.g. substitute, delete).
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
    :param float coverage: Portion (0-1) of the zone that will be augmented.
    :param tuple factor: (low, high) range from which the augmentation factor is sampled.
    :param float duration: Length (in seconds) of the segment to augment.
    :param str name: Name of this augmenter.
    """

    def __init__(self, action, zone=None, coverage=None, factor=None, duration=None, name='Audio_Aug',
                 device='cpu', verbose=0, stateless=True):
        super(AudioAugmenter, self).__init__(
            name=name, method=Method.AUDIO, action=action, aug_min=None, aug_max=None, device=device,
            verbose=verbose)

        self.zone = zone
        self.coverage = coverage
        self.factor = factor
        self.duration = duration
        self.stateless = stateless

    @classmethod
    def clean(cls, data):
        # Audio input needs no cleaning before augmentation.
        return data

    @classmethod
    def is_duplicate(cls, dataset, data):
        # Two augmented signals are duplicates when they are element-wise equal.
        for d in dataset:
            if np.array_equal(d, data):
                return True
        return False

    def get_random_factor(self, low=None, high=None, dtype='float'):
        """Sample a random factor in [low, high), falling back to ``self.factor``.

        Bug fix: the previous implementation used ``low if low else ...`` /
        ``high if high else ...``, which silently ignored an explicit bound of
        0 (falsy); compare against ``None`` instead. The redundant duplicated
        fallback ``return`` is also removed: any non-'int' dtype samples a
        uniform float.
        """
        lower_bound = low if low is not None else self.factor[0]
        upper_bound = high if high is not None else self.factor[1]
        if dtype == 'int':
            return np.random.randint(lower_bound, upper_bound)
        return np.random.uniform(lower_bound, upper_bound)

    def get_augmentation_segment_size(self, data):
        # Number of samples to augment: zone width * coverage of the whole signal.
        return int(len(data) * (self.zone[1] - self.zone[0]) * self.coverage)

    def get_augment_range_by_coverage(self, data):
        """Pick a random (start, end) segment covering ``coverage`` of the zone."""
        zone_start, zone_end = int(len(data) * self.zone[0]), int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(zone_size * self.coverage)
        last_start = zone_start + int(zone_size * (1 - self.coverage))

        if zone_start == last_start:
            # Coverage (effectively) fills the zone: use the whole zone.
            start_pos = zone_start
            end_pos = zone_end
        else:
            start_pos = np.random.randint(zone_start, last_start)
            end_pos = start_pos + target_size

        return start_pos, end_pos

    def get_augment_range_by_duration(self, data):
        """Pick a random (start, end) segment of ``duration`` seconds within the zone.

        NOTE: relies on ``self.sampling_rate`` being set by the subclass before
        this helper is called.
        """
        zone_start, zone_end = int(len(data) * self.zone[0]), int(len(data) * self.zone[1])
        zone_size = zone_end - zone_start

        target_size = int(self.sampling_rate * self.duration)

        if target_size >= zone_size:
            # Requested duration exceeds the zone: clamp to the whole zone.
            start_pos = zone_start
            end_pos = zone_end
        else:
            last_start = zone_start + zone_size - target_size
            start_pos = np.random.randint(zone_start, last_start)
            end_pos = start_pos + target_size

        return start_pos, end_pos
class CropAug(AudioAugmenter):
    """
    Augmenter that deletes a randomly selected segment of the audio.

    :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 25.2-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int duration: Duration of augmentation (in second). Default is None.
        When provided, `coverage` is ignored.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.CropAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=0.1, duration=None, name='Crop_Aug',
                 verbose=0, stateless=True):
        super().__init__(
            action=Action.DELETE, zone=zone, coverage=coverage, duration=duration, name=name,
            device='cpu', verbose=verbose, stateless=stateless)
        self.sampling_rate = sampling_rate
        self.model = nma.Crop()

    def delete(self, data):
        # Duration-based selection takes precedence over coverage when given.
        if self.duration is not None:
            start_pos, end_pos = self.get_augment_range_by_duration(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)

        if not self.stateless:
            self.start_pos, self.end_pos = start_pos, end_pos

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos)
class PolarityInverseAug(AudioAugmenter):
    """
    Augmenter that flips the sign (polarity) of a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 25.2-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.PolarityInverseAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=0.3, name='PolarityInverse_Aug', verbose=0, stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, name=name, device='cpu', verbose=verbose,
            stateless=stateless)
        self.model = nma.PolarityInversion()

    def substitute(self, data):
        segment = self.get_augment_range_by_coverage(data)
        if not self.stateless:
            # Record the chosen segment for callers that inspect augmenter state.
            self.start_pos, self.end_pos = segment
        return self.model.manipulate(data, start_pos=segment[0], end_pos=segment[1])
class LoudnessAug(AudioAugmenter):
    """
    Augmenter that scales the volume of a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param tuple factor: Range from which the gain is sampled; values between 0 and 1
        reduce the volume.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.LoudnessAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), name='Loudness_Aug', verbose=0,
                 stateless=True):
        super().__init__(action=Action.SUBSTITUTE, name=name, zone=zone, coverage=coverage,
                         factor=factor, device='cpu', verbose=verbose, stateless=stateless)
        self.model = nma.Loudness()

    def substitute(self, data):
        # Sample the gain first, then the segment, preserving the RNG call order.
        gain = self.get_random_factor()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, gain

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos, loudness_level=gain)
class MaskAug(AudioAugmenter):
    """
    Augmenter that replaces a randomly chosen audio segment with noise or silence.

    :param int sampling_rate: Sampling rate of input audio. Mandatory if duration is provided.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int duration: Duration of augmentation (in second). Default is None.
        When provided, `coverage` is ignored.
    :param bool mask_with_noise: If True, the target area is replaced by noise;
        otherwise it is replaced by 0.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.MaskAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate=None, zone=(0.2, 0.8), coverage=1., duration=None,
                 mask_with_noise=True, name='Mask_Aug', verbose=0, stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, duration=duration,
            name=name, device='cpu', verbose=verbose, stateless=stateless)

        # Bug fix: sampling_rate was accepted but never stored, so the documented
        # duration-based behavior could not work (get_augment_range_by_duration
        # reads self.sampling_rate).
        self.sampling_rate = sampling_rate
        self.mask_with_noise = mask_with_noise
        self.model = nma.Mask()

    def substitute(self, data):
        # Bug fix: honor `duration` as documented ("coverage value will be
        # ignored"); previously the coverage path was always taken, unlike the
        # sibling CropAug/VtlpAug augmenters.
        if self.duration is None:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_duration(data)

        if not self.stateless:
            self.start_pos, self.end_pos = start_pos, end_pos

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos,
                                     mask_with_noise=self.mask_with_noise)
class NoiseAug(AudioAugmenter):
    """
    Augmenter that injects noise into a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param str color: Color of generated noise. Supported: 'white', 'pink', 'red',
        'brown', 'brownian', 'blue', 'azure', 'violet', 'purple' and 'random'.
        With 'random', a noise color is picked in each augment.
    :param list noises: Optional background noises (list of numpy arrays); one is
        picked randomly per augment. When provided, `color` is ignored.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.NoiseAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., color='white', noises=None, name='Noise_Aug',
                 verbose=0, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, name=name,
                         device='cpu', verbose=verbose, stateless=stateless)
        self.color = color
        self.noises = noises
        self.model = nma.Noise()
        # Fail fast on unsupported noise colors.
        self.model.validate(color)

    def substitute(self, data):
        start_pos, end_pos = self.get_augment_range_by_coverage(data)
        segment_size = end_pos - start_pos

        noise, picked_color = self.model.get_noise_and_color(segment_size, self.noises, self.color)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, picked_color

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos, noise=noise)
class NormalizeAug(AudioAugmenter):
    """
    Augmenter that normalizes a randomly chosen audio segment.

    :param str method: One of 'minmax', 'max', 'standard' or 'random'. For
        'minmax', data is shifted by the minimum and divided by the (max - min)
        range. For 'max', data is divided by the maximum only. For 'standard',
        data is shifted by the mean and divided by the standard deviation. With
        'random', a method is picked in each augment.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 25.2-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.NormalizeAug()
    """

    def __init__(self, method='max', zone=(0.2, 0.8), coverage=0.3, name='Normalize_Aug', verbose=0,
                 stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, name=name, device='cpu',
            verbose=verbose, stateless=stateless)
        self.model = nma.Normalization()
        self.method = method
        # Reject unsupported methods at construction time.
        self.validate()

    def random_method(self):
        # Pick one supported normalization method uniformly at random.
        return self.sample(self.model.get_support_methods(), 1)[0]

    def substitute(self, data):
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        method = self.method if self.method != 'random' else self.random_method()

        if not self.stateless:
            self.start_pos, self.end_pos, self.run_method = start_pos, end_pos, method

        return self.model.manipulate(data, method=method, start_pos=start_pos, end_pos=end_pos)

    def validate(self):
        supported = ['random'] + self.model.get_support_methods()
        if self.method not in supported:
            raise ValueError('{} does not support yet. You may pick one of {}'.format(
                self.method, supported))
        return True
class PitchAug(AudioAugmenter):
    """
    Augmenter that shifts the pitch of a randomly chosen audio segment.

    :param int sampling_rate: Sampling rate of input audio.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int duration: Duration of augmentation (in second). Default is None.
        When provided, `coverage` is ignored.
    :param tuple factor: Range from which the pitch adjustment is sampled. Pitch
        is reduced if the value is between 0 and 1.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.PitchAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=1., duration=None,
                 factor=(-10, 10), name='Pitch_Aug', verbose=0, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=factor,
                         duration=duration, name=name, device='cpu', verbose=verbose, stateless=stateless)
        self.sampling_rate = sampling_rate
        self.model = nma.Pitch()

    def substitute(self, data):
        pitch_level = self.get_random_factor()

        # Bug fix: honor `duration` as documented ("coverage value will be
        # ignored"); previously the coverage path was always taken even though
        # `duration` was accepted and forwarded to the base class, unlike the
        # sibling CropAug/VtlpAug augmenters.
        if self.duration is None:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_duration(data)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, pitch_level

        return self.model.manipulate(data, start_pos, end_pos, pitch_level, self.sampling_rate)
class ShiftAug(AudioAugmenter):
    """
    Augmenter that shifts the audio forward or backward in time.

    :param int sampling_rate: Sampling rate of input audio.
    :param float duration: Max shifting segment (in second)
    :param str direction: Shifting segment to left, right or one of them. Value can be 'left', 'right' or 'random'
    :param str name: Name of this augmenter

    NOTE(review): both `direction` and `shift_direction` are accepted;
    validation runs on `shift_direction` while the shifting logic reads
    `direction` — confirm whether one of them is a deprecated alias.

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.ShiftAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, duration=3, direction='random', shift_direction='random',
            name='Shift_Aug', verbose=0, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, name=name, duration=duration, device='cpu', verbose=verbose,
            stateless=stateless)

        self.sampling_rate = sampling_rate
        self.direction = direction
        self.shift_direction = shift_direction
        self.model = nma.Shift()

        # Validation is applied to shift_direction, not direction (see class note).
        self.model.validate(shift_direction)

    def _get_aug_shift(self):
        # Shift size in samples: duration (seconds) * sampling rate.
        aug_shift = int(self.sampling_rate * self.duration)
        if self.direction == 'right':
            # Negative shift moves the signal to the right.
            return -aug_shift
        elif self.direction == 'random':
            # NOTE(review): self.sample(4)-1 presumably yields values in
            # {-1, 0, 1, 2}, so `direction == 1` holds in only ~1 of 4 draws —
            # i.e. right-shift is picked ~25% of the time rather than 50%.
            # Confirm against the base Augmenter.sample() semantics whether
            # this bias is intended.
            direction = self.sample(4)-1
            if direction == 1:
                return -aug_shift

        # 'left' (and the remaining random draws) shift to the left.
        return aug_shift

    def substitute(self, data):
        aug_shift = self._get_aug_shift()

        if not self.stateless:
            # Record the applied shift for callers that inspect augmenter state.
            self.aug_factor = aug_shift

        return self.model.manipulate(data, aug_shift)
class SpeedAug(AudioAugmenter):
    """
    Augmenter that changes the speed of a randomly chosen audio segment.

    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param tuple factor: Range from which the speed ratio is sampled; the audio
        is slowed down if the value is between 0 and 1.
    :param tuple speed_range: Deprecated. Use `factor` instead.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.SpeedAug()
    """

    def __init__(self, zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), name='Speed_Aug', verbose=0,
                 stateless=True):
        super().__init__(action=Action.SUBSTITUTE, name=name, zone=zone, coverage=coverage,
                         factor=factor, device='cpu', verbose=verbose, stateless=stateless)
        self.model = nma.Speed()

    def substitute(self, data):
        # Sample the speed ratio first, then the segment, preserving RNG call order.
        ratio = self.get_random_factor()
        start_pos, end_pos = self.get_augment_range_by_coverage(data)

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, ratio

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos, speed=ratio)
class VtlpAug(AudioAugmenter):
    # Reference: https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf
    """
    Augmenter that applies vocal tract length perturbation (VTLP) to a
    randomly chosen audio segment.

    :param int sampling_rate: Sampling rate of input audio.
    :param tuple zone: Fraction range (start, end) of the audio eligible for augmentation.
        Default (0.2, 0.8) leaves the first and last 20% of the audio untouched.
    :param float coverage: Portion (0-1) of the zone to augment. For example, with a
        60-second audio, zone (0.2, 0.8) and coverage 0.7, a 42-second
        ((0.8-0.2)*0.7*60) segment is augmented.
    :param int fhi: Boundary frequency. Default value is 4800.
    :param tuple factor: Range from which the warp factor is sampled.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.audio as naa
    >>> aug = naa.VtlpAug(sampling_rate=44010)
    """

    def __init__(self, sampling_rate, zone=(0.2, 0.8), coverage=0.1, fhi=4800, factor=(0.9, 1.1),
                 name='Vtlp_Aug', verbose=0, stateless=True):
        super().__init__(
            action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=factor, name=name,
            device='cpu', verbose=verbose, stateless=stateless)
        self.sampling_rate = sampling_rate
        self.fhi = fhi
        self.model = nma.Vtlp()

    def substitute(self, data):
        # Duration-based selection takes precedence over coverage when given
        # (duration defaults to None via the base class).
        if self.duration is not None:
            start_pos, end_pos = self.get_augment_range_by_duration(data)
        else:
            start_pos, end_pos = self.get_augment_range_by_coverage(data)

        warp = self.get_random_factor()

        if not self.stateless:
            self.start_pos, self.end_pos, self.aug_factor = start_pos, end_pos, warp

        return self.model.manipulate(data, start_pos=start_pos, end_pos=end_pos,
                                     sampling_rate=self.sampling_rate, warp_factor=warp)
class Augment:
    """Lightweight record of a single augmentation.

    Captures the position of the change within the input, the original value
    at that position, and the value it was replaced with.
    """

    def __init__(self, pos, original, new):
        # Position of the change within the input sequence.
        self.pos = pos
        # Value before augmentation.
        self.original = original
        # Value after augmentation.
        self.new = new
nlpaug.augmenter.char.ocr import * 4 | from nlpaug.augmenter.char.random import * 5 | from nlpaug.augmenter.char.keyboard import * 6 | -------------------------------------------------------------------------------- /nlpaug/augmenter/sentence/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.sentence.sentence_augmenter import * 3 | from nlpaug.augmenter.sentence.context_word_embs_sentence import * 4 | from nlpaug.augmenter.sentence.abst_summ import * 5 | from nlpaug.augmenter.sentence.lambada import * 6 | from nlpaug.augmenter.sentence.random import * -------------------------------------------------------------------------------- /nlpaug/augmenter/sentence/random.py: -------------------------------------------------------------------------------- 1 | """ 2 | Augmenter that apply operation (sentence level) to textual input based on abstractive summarization. 3 | """ 4 | 5 | import os 6 | 7 | 8 | from nlpaug.augmenter.sentence import SentenceAugmenter 9 | import nlpaug.model.word_rule as nmr 10 | from nlpaug.util import Action, Doc 11 | 12 | 13 | class RandomSentAug(SentenceAugmenter): 14 | 15 | """ 16 | Augmenter that apply randomly behavior for augmentation. 17 | 18 | :param str mode: Shuffle sentence to left, right, neighbor or random position. For `left`, target sentence 19 | will be swapped with left sentnece. For `right`, target sentence will be swapped with right sentnece. 20 | For `neighbor`, target sentence will be swapped with left or right sentnece radomly. For `random`, 21 | target sentence will be swapped with any sentnece randomly. 22 | :param float aug_p: Percentage of sentence will be augmented. 23 | :param int aug_min: Minimum number of sentence will be augmented. 24 | :param int aug_max: Maximum number of sentence will be augmented. If None is passed, number of augmentation is 25 | calculated via aup_p. 
If calculated result from aug_p is smaller than aug_max, will use calculated result from 26 | aug_p. Otherwise, using aug_max. 27 | :param func tokenizer: Customize tokenization process 28 | :param str name: Name of this augmenter 29 | 30 | >>> import nlpaug.augmenter.sentence as nas 31 | >>> aug = nas.RandomSentAug() 32 | """ 33 | 34 | def __init__(self, mode='neighbor', action=Action.SWAP, name='RandomSent_Aug', aug_min=1, aug_max=10, aug_p=0.3, 35 | tokenizer=None, verbose=0): 36 | super().__init__( 37 | action=action, name=name, aug_p=aug_p, aug_min=aug_min, aug_max=aug_max, verbose=verbose) 38 | 39 | self.model = nmr.Shuffle(mode=mode, model_type='sentence', tokenizer=tokenizer) 40 | 41 | def pre_skip_aug(self, data): 42 | return list(range(len(data))) 43 | 44 | # https://arxiv.org/abs/1910.13461 45 | def swap(self, data): 46 | if not data: 47 | return data 48 | 49 | if isinstance(data, list): 50 | all_data = data 51 | else: 52 | if data.strip() == '': 53 | return data 54 | all_data = [data] 55 | 56 | for i, d in enumerate(all_data): 57 | sentences = self.model.tokenize(d) 58 | aug_idxes = self._get_random_aug_idxes(sentences) 59 | for aug_idx in aug_idxes: 60 | sentences = self.model.predict(sentences, aug_idx) 61 | all_data[i] = ' '.join(sentences) 62 | 63 | # TODO: always return array 64 | if isinstance(data, list): 65 | return all_data 66 | else: 67 | return all_data[0] 68 | 69 | -------------------------------------------------------------------------------- /nlpaug/augmenter/sentence/sentence_augmenter.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | from nlpaug.util import Method 4 | from nlpaug.util.text.tokenizer import Tokenizer 5 | from nlpaug import Augmenter 6 | from typing import Iterable 7 | 8 | 9 | class SentenceAugmenter(Augmenter): 10 | def __init__(self, action, name='Sentence_Aug', stopwords=None, tokenizer=None, reverse_tokenizer=None, 11 | device='cuda', aug_min=None, aug_max=None, 
aug_p=None, include_detail=False, verbose=0): 12 | super().__init__( 13 | name=name, method=Method.SENTENCE, action=action, aug_min=aug_min, aug_max=aug_max, aug_p=aug_p, 14 | device=device, verbose=verbose, include_detail=include_detail) 15 | self.tokenizer = tokenizer or Tokenizer.tokenizer 16 | self.reverse_tokenizer = reverse_tokenizer or Tokenizer.reverse_tokenizer 17 | self.stopwords = stopwords 18 | 19 | @classmethod 20 | def clean(cls, data): 21 | if isinstance(data, str): 22 | return data.strip() 23 | if isinstance(data, Iterable): 24 | return [d.strip() for d in data] 25 | return str(data).strip() 26 | 27 | @classmethod 28 | def is_duplicate(cls, dataset, data): 29 | for d in dataset: 30 | if d == data: 31 | return True 32 | return False 33 | -------------------------------------------------------------------------------- /nlpaug/augmenter/spectrogram/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.augmenter.spectrogram.spectrogram_augmenter import * 3 | from nlpaug.augmenter.spectrogram.frequency_masking import * 4 | from nlpaug.augmenter.spectrogram.time_masking import * 5 | from nlpaug.augmenter.spectrogram.loudness import * -------------------------------------------------------------------------------- /nlpaug/augmenter/spectrogram/loudness.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nlpaug.augmenter.spectrogram import SpectrogramAugmenter 4 | from nlpaug.util import Action 5 | import nlpaug.model.spectrogram as nms 6 | 7 | 8 | class LoudnessAug(SpectrogramAugmenter): 9 | """ 10 | Augmenter that change loudness on mel spectrogram by random values. 11 | 12 | :param tuple zone: Default value is (0.2, 0.8). Assign a zone for augmentation. By default, no any augmentation 13 | will be applied in first 20% and last 20% of whole audio. 
14 | :param float coverage: Default value is 1 and value should be between 0 and 1. Portion of augmentation. 15 | If `1` is assigned, augment operation will be applied to target audio segment. For example, the audio 16 | duration is 60 seconds while zone and coverage are (0.2, 0.8) and 0.7 respectively. 42 17 | seconds ((0.8-0.2)*0.7*60) audio will be augmented. 18 | :param tuple factor: Default value is (0.5, 2). Volume change value will be picked within the range of this 19 | tuple value. Volume will be reduced if value is between 0 and 1. Otherwise, volume will be increased. 20 | :param str name: Name of this augmenter 21 | """ 22 | def __init__(self, name='Loudness_Aug', zone=(0.2, 0.8), coverage=1., factor=(0.5, 2), verbose=0, 23 | silence=False, stateless=True): 24 | super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=factor, 25 | verbose=verbose, name=name, silence=silence, stateless=stateless) 26 | 27 | self.model = nms.Loudness() 28 | 29 | def substitute(self, data): 30 | # https://arxiv.org/pdf/2001.01401.pdf 31 | 32 | loudness_level = self.get_random_factor() 33 | time_start, time_end = self.get_augment_range_by_coverage(data) 34 | 35 | if not self.stateless: 36 | self.time_start, self.time_end, self.loudness_level = time_start, time_end, loudness_level 37 | 38 | return self.model.manipulate(data, loudness_level=loudness_level, time_start=time_start, time_end=time_end) 39 | -------------------------------------------------------------------------------- /nlpaug/augmenter/spectrogram/spectrogram_augmenter.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | 3 | from nlpaug.util import Method 4 | from nlpaug import Augmenter 5 | 6 | 7 | class SpectrogramAugmenter(Augmenter): 8 | def __init__(self, action, zone=None, coverage=None, factor=None, name='Spectrogram_Aug', device='cpu', 9 | verbose=0, stateless=True, silence=False): 10 | super().__init__(name=name, 
method=Method.SPECTROGRAM, action=action, aug_min=None, 11 | aug_max=None, device=device, verbose=verbose) 12 | 13 | self.zone = zone 14 | self.coverage = coverage 15 | self.factor = factor 16 | self.stateless = stateless 17 | self.silence = silence 18 | 19 | if self.zone[0] < 0: 20 | raise ValueError('Lower bound of zone is smaller than {}.'.format(0) + 21 | ' It should be larger than {}'.format(0)) 22 | 23 | if self.zone[1] > 1: 24 | raise ValueError('Upper bound of zone is larger than {}.'.format(1) + 25 | ' It should be smaller than {}'.format(1)) 26 | 27 | if self.coverage < 0 or self.coverage > 1: 28 | raise ValueError('Coverage value should be between than 0 and 1 while ' + 29 | 'input value is {}'.format(self.coverage)) 30 | 31 | @classmethod 32 | def clean(cls, data): 33 | return data 34 | 35 | @classmethod 36 | def is_duplicate(cls, dataset, data): 37 | for d in dataset: 38 | if np.array_equal(d, data): 39 | return True 40 | return False 41 | 42 | def get_random_factor(self, low=None, high=None, dtype='float'): 43 | lower_bound = self.factor[0] if low is None else low 44 | upper_bound = self.factor[1] if high is None else high 45 | if dtype == 'int': 46 | return np.random.randint(lower_bound, upper_bound) 47 | elif dtype == 'float': 48 | return np.random.uniform(lower_bound, upper_bound) 49 | else: 50 | return np.random.uniform(lower_bound, upper_bound) 51 | 52 | def get_augment_range_by_coverage(self, data): 53 | zone_start, zone_end = int(data.shape[1] * self.zone[0]), int(data.shape[1] * self.zone[1]) 54 | zone_size = zone_end - zone_start 55 | 56 | target_size = int(zone_size * self.coverage) 57 | last_start = zone_start + int(zone_size * (1 - self.coverage)) 58 | 59 | if zone_start == last_start: 60 | start_pos = zone_start 61 | end_pos = zone_end 62 | else: 63 | start_pos = np.random.randint(zone_start, last_start) 64 | end_pos = start_pos + target_size 65 | 66 | return start_pos, end_pos 67 | 
class TimeMaskingAug(SpectrogramAugmenter):
    """
    Augmenter that masks a block of consecutive time steps of a mel spectrogram
    with random values.

    :param tuple zone: Default value is (0.2, 0.8). Assign a zone for augmentation. By default, no
        augmentation will be applied in first 20% and last 20% of whole audio.
    :param float coverage: Default value is 1 and value should be between 0 and 1. Portion of augmentation.
        If `1` is assigned, augment operation will be applied to target audio segment. For example, the
        audio duration is 60 seconds while zone and coverage are (0.2, 0.8) and 0.7 respectively. 42
        seconds ((0.8-0.2)*0.7*60) audio will be chosen for augmentation.
    :param str name: Name of this augmenter

    >>> import nlpaug.augmenter.spectrogram as nas
    >>> aug = nas.TimeMaskingAug()
    """

    def __init__(self, name='TimeMasking_Aug', zone=(0.2, 0.8), coverage=1., verbose=0,
                 silence=False, stateless=True):
        super().__init__(action=Action.SUBSTITUTE, zone=zone, coverage=coverage, factor=(1, 1),
                         verbose=verbose, name=name, silence=silence, stateless=stateless)

        self.model = nms.TimeMasking()

    def substitute(self, data):
        # From https://arxiv.org/pdf/1904.08779.pdf: t consecutive time steps
        # [t0, t0 + t) are masked, where t is drawn from a uniform distribution
        # and t0 from the configured zone.
        tau = data.shape[1]
        t0, time_end = self.get_augment_range_by_coverage(data)
        t = self.get_random_factor(high=time_end, dtype='int')

        if not self.stateless:
            self.tau, self.t, self.t0 = tau, t, t0

        return self.model.manipulate(data, t=t, t0=t0)
class Sequential(Pipeline):
    """
    Flow that applies every augmenter in `flow` one after another.

    :param list flow: list of flow or augmenter
    :param str name: Name of this augmenter

    >>> import nlpaug.flow as naf
    >>> import nlpaug.augmenter.char as nac
    >>> import nlpaug.augmenter.word as naw
    >>> flow = naf.Sequential([nac.RandomCharAug(), naw.RandomWordAug()])
    """

    def __init__(self, flow=None, name='Sequential_Pipeline', verbose=0):
        super().__init__(name=name, action=Action.SEQUENTIAL, flow=flow,
                         include_detail=False, verbose=verbose)

    def draw(self):
        # A sequential pipeline always executes every step.
        return True
15 | :param str name: Name of this augmenter 16 | 17 | >>> import nlpaug.flow as naf 18 | >>> import nlpaug.augmenter.char as nac 19 | >>> import nlpaug.augmenter.word as naw 20 | >>> flow = naf.Sometimes([nac.RandomCharAug(), naw.RandomWordAug()]) 21 | """ 22 | 23 | def __init__(self, flow=None, name='Sometimes_Pipeline', aug_p=0.8, verbose=0): 24 | Pipeline.__init__(self, name=name, action=Action.SOMETIMES, 25 | flow=flow, aug_p=aug_p, include_detail=False, verbose=verbose) 26 | 27 | def draw(self): 28 | return self.aug_p > self.prob() 29 | -------------------------------------------------------------------------------- /nlpaug/model/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/nlpaug/model/__init__.py -------------------------------------------------------------------------------- /nlpaug/model/audio/__init__.py: -------------------------------------------------------------------------------- 1 | from __future__ import absolute_import 2 | from nlpaug.model.audio.audio import * 3 | from nlpaug.model.audio.noise import * 4 | from nlpaug.model.audio.shift import * 5 | from nlpaug.model.audio.speed import * 6 | from nlpaug.model.audio.pitch import * 7 | from nlpaug.model.audio.loudness import * 8 | from nlpaug.model.audio.crop import * 9 | from nlpaug.model.audio.mask import * 10 | from nlpaug.model.audio.vtlp import * 11 | from nlpaug.model.audio.normalization import * 12 | from nlpaug.model.audio.inversion import * 13 | -------------------------------------------------------------------------------- /nlpaug/model/audio/audio.py: -------------------------------------------------------------------------------- 1 | class Audio: 2 | def manipulate(self, data): 3 | raise NotImplementedError 4 | -------------------------------------------------------------------------------- /nlpaug/model/audio/crop.py: 
class Mask(Audio):
    """Replace a segment of the audio signal with noise or silence."""

    def manipulate(self, data, start_pos, end_pos, mask_with_noise):
        segment_len = end_pos - start_pos
        if mask_with_noise:
            replacement = np.random.randn(segment_len)
        else:
            replacement = np.zeros(segment_len)

        augmented = data.copy()
        augmented[start_pos:end_pos] = replacement

        return augmented
class Normalization(Audio):
    """Normalize a segment of an audio signal and return a modified copy."""

    def manipulate(self, data, method, start_pos, end_pos):
        """Return a copy of `data` with [start_pos:end_pos] normalized.

        :param str method: One of 'minmax', 'max' or 'standard'.
        :raises ValueError: If `method` is unsupported (previously this path
            crashed later with a confusing UnboundLocalError).
        """
        if method not in self.get_support_methods():
            raise ValueError('method should be one of {} while {} is passed.'.format(
                self.get_support_methods(), method))

        aug_data = data.copy()
        if method == 'minmax':
            new_data = self._min_max(aug_data[start_pos:end_pos])
        elif method == 'max':
            new_data = self._max(aug_data[start_pos:end_pos])
        elif method == 'standard':
            new_data = self._standard(aug_data[start_pos:end_pos])

        aug_data[start_pos:end_pos] = new_data

        return aug_data

    def get_support_methods(self):
        # Methods accepted by manipulate().
        return ['minmax', 'max', 'standard']

    def _standard(self, data):
        # Z-score normalization.
        return (data - np.mean(data)) / np.std(data)

    def _max(self, data):
        # Scale by the maximum absolute amplitude.
        return data / np.amax(np.abs(data))

    def _min_max(self, data):
        # Rescale relative to the minimum absolute amplitude.
        lower = np.amin(np.abs(data))
        return (data - lower) / (np.amax(np.abs(data)) - lower)
# Reference: https://www.kaggle.com/CVxTz/audio-data-augmentation
class Shift(Audio):
    """Time-shift an audio signal, silencing the wrapped-around samples."""

    def validate(self, direction):
        # Message fixed: accepted values are left/right/random ('both' was never valid).
        if direction not in ['left', 'right', 'random']:
            raise ValueError(
                'shift_direction should be either left, right or random while {} is passed.'.format(direction))

    def manipulate(self, data, shift):
        """Roll `data` by `shift` samples and zero the vacated head/tail."""
        aug_data = np.roll(data.copy(), shift)
        # Set to silence for heading/ tailing. A zero shift must be a no-op;
        # previously `aug_data[0:] = 0` silenced the entire signal.
        if shift > 0:
            aug_data[:shift] = 0
        elif shift < 0:
            aug_data[shift:] = 0
        return aug_data
Install it via `pip install librosa`') 20 | 21 | def manipulate(self, data, start_pos, end_pos, speed): 22 | aug_data = librosa.effects.time_stretch(y=data[start_pos:end_pos], rate=speed) 23 | return np.concatenate((data[:start_pos], aug_data, data[end_pos:]), axis=0) 24 | -------------------------------------------------------------------------------- /nlpaug/model/audio/vtlp.py: -------------------------------------------------------------------------------- 1 | import numpy as np 2 | try: 3 | import librosa 4 | except ImportError: 5 | # No installation required if not using this function 6 | pass 7 | 8 | from nlpaug.model.audio import Audio 9 | 10 | 11 | class Vtlp(Audio): 12 | # https://pdfs.semanticscholar.org/3de0/616eb3cd4554fdf9fd65c9c82f2605a17413.pdf 13 | def __init__(self): 14 | super().__init__() 15 | 16 | try: 17 | import librosa 18 | except ModuleNotFoundError: 19 | raise ModuleNotFoundError('Missed librosa library. Install import librosa by `pip install librosa`') 20 | 21 | # http://www.cs.toronto.edu/~hinton/absps/perturb.pdf 22 | @classmethod 23 | def get_scale_factors(cls, freq_dim, sampling_rate, fhi=4800, alpha=0.9): 24 | factors = [] 25 | freqs = np.linspace(0, 1, freq_dim) 26 | 27 | scale = fhi * min(alpha, 1) 28 | f_boundary = scale / alpha 29 | half_sr = sampling_rate / 2 30 | 31 | for f in freqs: 32 | f *= sampling_rate 33 | if f <= f_boundary: 34 | factors.append(f * alpha) 35 | else: 36 | warp_freq = half_sr - (half_sr - scale) / (half_sr - scale / alpha) * (half_sr - f) 37 | factors.append(warp_freq) 38 | 39 | return np.array(factors) 40 | 41 | # https://github.com/YerevaNN/Spoken-language-identification/blob/master/augment_data.py#L26 42 | def _manipulate(self, audio, sampling_rate, factor): 43 | stft = librosa.core.stft(audio) 44 | freq_dim, time_dim = stft.shape 45 | data_type = type(stft[0][0]) 46 | 47 | factors = self.get_scale_factors(freq_dim, sampling_rate, alpha=factor) 48 | factors *= (freq_dim - 1) / max(factors) 49 | new_stft 
class Model:
    @classmethod
    def sample(cls, x, num=None):
        """Randomly sample from `x`.

        :param x: A list (sampled without replacement) or an int upper bound
            (random integers drawn from [0, x)).
        :param num: Number of samples; None yields a single scalar.
        :raises ValueError: If `x` is neither a list nor an int (previously
            this silently returned None).
        """
        if isinstance(x, list):
            return np.random.choice(x, size=num, replace=False)
        if isinstance(x, int):
            return np.random.randint(0, x, size=num)
        raise ValueError('x should be a list or an int while {} is passed.'.format(type(x).__name__))
    def __init__(self, special_char=True, numeric=True, upper_case=True, cache=True, lang="en", model_path=None):
        """Keyboard-distance character model.

        :param bool special_char: Include special characters in the mapping.
        :param bool numeric: Include digits in the mapping.
        :param bool upper_case: Include upper-case variants in the mapping.
        :param bool cache: Passed through to the base Character model.
        :param str lang: Keyboard layout language code (e.g. "en").
        :param str model_path: Path to the JSON keyboard-mapping file; must exist.
        """
        super().__init__(cache)

        self.special_char = special_char
        self.numeric = numeric
        self.upper_case = upper_case
        self.lang = lang
        self.model_path = model_path
        # Build the character-substitution mapping eagerly from the JSON layout file.
        self.model = self.get_model(model_path=model_path, special_char=special_char, numeric=numeric,
                                    upper_case=upper_case, lang=lang)
class Ocr(Character):
    """Character model mapping characters to visually-confusable OCR errors."""

    def __init__(self, model, cache=True):
        """
        :param dict model: Mapping from a character to the list of characters
            OCR engines commonly confuse it with.
        :param bool cache: Passed through to the base Character model.
        """
        super().__init__(cache)

        self.model = self.generate_mapping(model)

    def generate_mapping(self, mapping):
        """Return `mapping` extended with the reverse direction of every entry."""
        result = {}

        for k in mapping:
            result[k] = mapping[k]

        # Reverse mapping: if 'o' can be misread as '0', then '0' can be misread as 'o'.
        for k in mapping:
            for v in mapping[k]:
                if v not in result:
                    result[v] = []

                if k not in result[v]:
                    result[v].append(k)
        return result

    def predict(self, data):
        """Return the substitution candidates for a character (KeyError if unknown)."""
        return self.model[data]
class XSumTransformers(LanguageModels):
    """Abstractive summarization model backed by a HuggingFace `summarization` pipeline.

    :param model_name: model name or path understood by `transformers.pipeline` (default "t5-base")
    :param tokenizer_name: tokenizer name; falls back to `model_name` when None
    :param min_length: minimum length of the generated summary
    :param max_length: maximum length of the generated summary
    :param temperature: sampling temperature forwarded to the pipeline
    :param top_k: top-k sampling parameter
    :param top_p: nucleus sampling parameter
    :param batch_size: number of texts summarized per pipeline call
    :param device: 'cuda', 'cpu' or a device id understood by `convert_device`
    :param silence: suppress transformers' expected weight-initialization warnings
    """

    def __init__(self, model_name="t5-base", tokenizer_name=None, min_length=10, max_length=20,
                 temperature=1.0, top_k=50, top_p=0.9, batch_size=32, device='cuda', silence=True):
        super().__init__(device, model_type=None, silence=silence)
        try:
            from transformers import pipeline
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed transformers library. Install transfomers by `pip install transformers`')

        self.model_name = model_name
        self.tokenizer_name = model_name if tokenizer_name is None else tokenizer_name
        self.min_length = min_length
        self.max_length = max_length
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.batch_size = batch_size

        device = self.convert_device(device)

        if silence:
            # Transformers throws an expected warning regarding weight initialization;
            # temporarily raise the log level while the pipeline is built.
            logger = logging.getLogger('transformers.modeling_utils')
            orig_log_level = logger.getEffectiveLevel()
            logger.setLevel(logging.ERROR)
            self.model = pipeline("summarization", model=self.model_name, tokenizer=self.tokenizer_name,
                                  device=device, framework="pt")
            # BUG FIX: `orig_log_level` was previously referenced on the restore line even
            # when silence=False (where it is never assigned), raising NameError; the
            # restore now happens only on the silenced path, mirroring TextGenTransformers.
            logger.setLevel(orig_log_level)
        else:
            self.model = pipeline("summarization", model=self.model_name, tokenizer=self.tokenizer_name,
                                  device=device, framework="pt")

    def get_device(self):
        """Return the device the underlying pipeline model lives on (e.g. 'cpu', 'cuda:0')."""
        return str(self.model.device)

    def predict(self, texts, target_words=None, n=1):
        """Summarize `texts` in batches of `self.batch_size`; return a list of summary strings.

        `target_words` and `n` are accepted for interface compatibility and ignored.
        """
        results = []
        with torch.no_grad():
            for i in range(0, len(texts), self.batch_size):
                predict_result = self.model(texts[i:i+self.batch_size],
                                            min_length=self.min_length,
                                            max_length=self.max_length,
                                            temperature=self.temperature,
                                            top_k=self.top_k,
                                            top_p=self.top_p,
                                            num_workers=1)
                # Pipelines return a list for batched input, a single mapping otherwise.
                if isinstance(predict_result, list):
                    results.extend(predict_result)
                else:
                    results.append(predict_result)
        results = [r['summary_text'] for r in results]

        return results
class TextGenTransformers(LanguageModels):
    """Text generation model backed by a HuggingFace `text-generation` pipeline (e.g. GPT-2).

    :param model_path: model name or path understood by `transformers.pipeline`
    :param device: 'cuda', 'cpu' or a device id understood by `convert_device`
    :param min_length: minimum length of generated text
    :param max_length: maximum length of generated text
    :param batch_size: number of prompts generated per pipeline call
    :param temperature: sampling temperature
    :param top_k: top-k sampling parameter
    :param top_p: nucleus sampling parameter
    :param silence: suppress transformers' expected weight-initialization warnings
    """

    def __init__(self, model_path='gpt2', device='cuda', min_length=100, max_length=300,
                 batch_size=32, temperature=1.0, top_k=50, top_p=0.9, silence=True):
        super().__init__(device, model_type=None, silence=silence)
        try:
            from transformers import pipeline
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed transformers library. Install transfomers by `pip install transformers`')

        self.min_length = min_length
        self.max_length = max_length
        self.batch_size = batch_size
        self.temperature = temperature
        self.top_k = top_k
        self.top_p = top_p
        self.model_path = model_path
        self.device = self.convert_device(device)

        if silence:
            # Transformers throws an expected warning regarding weight initialization;
            # temporarily raise the log level while the pipeline is built.
            logger = logging.getLogger('transformers.modeling_utils')
            orig_log_level = logger.getEffectiveLevel()
            logger.setLevel(logging.ERROR)
            self.model = pipeline("text-generation", model=model_path, device=self.device)
            logger.setLevel(orig_log_level)
        else:
            self.model = pipeline("text-generation", model=model_path, device=self.device)

    def to(self, device):
        """Move the underlying model to `device`."""
        self.model.model.to(device)

    def get_device(self):
        """Return the device the underlying pipeline model lives on (e.g. 'cpu', 'cuda:0')."""
        return str(self.model.device)

    def predict(self, texts, target_words=None, n=1):
        """Generate one continuation per input text; return a list of generated strings.

        `target_words` and `n` are accepted for interface compatibility and ignored.
        """
        results = []
        with torch.no_grad():
            for i in range(0, len(texts), self.batch_size):
                predict_result = self.model(
                    texts[i:i+self.batch_size],
                    pad_token_id=50256,  # GPT-2 EOS token doubles as the pad token
                    min_length=self.min_length,
                    max_length=self.max_length,
                    temperature=self.temperature,
                    top_k=self.top_k,
                    top_p=self.top_p,
                    do_sample=True,
                    num_return_sequences=1,
                    num_workers=1
                )
                # BUG FIX: previously only list results were collected and anything else
                # was dropped silently (no `else` branch), so single/non-list pipeline
                # outputs vanished. Handle both nested-list and flat/dict results,
                # consistent with XSumTransformers.predict.
                if isinstance(predict_result, list):
                    for item in predict_result:
                        if isinstance(item, list):
                            results.extend(item)
                        else:
                            results.append(item)
                else:
                    results.append(predict_result)

        return [r['generated_text'] for r in results]
class Spectrogram:
    """Base interface for spectrogram manipulation models."""

    def manipulate(self, data):
        raise NotImplementedError


class FrequencyMasking(Spectrogram):
    """Mask a band of consecutive mel-frequency channels.

    https://arxiv.org/pdf/1904.08779.pdf, https://arxiv.org/pdf/2001.01401.pdf
    Frequency masking is applied so that f consecutive mel frequency channels
    [f0, f0 + f) are masked, where f is first chosen from a uniform distribution
    from 0 to the frequency mask parameter F, and f0 is chosen from [0, v - f),
    v being the number of mel frequency channels.
    """

    def __init__(self):
        super().__init__()

    def manipulate(self, data, f, f0, time_start, time_end):
        """Return a copy of `data` with channels [f0, f0+f) zeroed over [time_start, time_end)."""
        masked = data.copy()
        masked[f0:f0 + f, time_start:time_end] = 0
        return masked


class Loudness(Spectrogram):
    """Scale the loudness of a time segment of a spectrogram.

    https://arxiv.org/pdf/2001.01401.pdf
    """

    def __init__(self):
        super().__init__()

    def manipulate(self, data, loudness_level, time_start, time_end):
        """Return a copy of `data` with columns [time_start, time_end) scaled by loudness_level * 1000."""
        scaled = data.copy()
        scaled[:, time_start:time_end] = scaled[:, time_start:time_end] * loudness_level * 1000
        return scaled
class TimeMasking(Spectrogram):
    """Mask a run of consecutive time steps in a spectrogram.

    From: https://arxiv.org/pdf/1904.08779.pdf,
    Time masking is applied so that t consecutive time steps [t0, t0 + t) are
    masked, where t is first chosen from a uniform distribution from 0 to the
    time mask parameter T, and t0 is chosen from [0, tau - t).
    """

    def __init__(self):
        super().__init__()

    def manipulate(self, data, t, t0):
        """Return a copy of `data` with time columns [t0, t0+t) zeroed across all channels."""
        masked = data.copy()
        masked[:, t0:t0 + t] = 0
        return masked
class Spelling(WordDictionary):
    """Spelling-mistake dictionary loaded from a space-separated mapping file.

    Source data:
    English Neutral Rewriting: https://github.com/ybisk/charNMT-noise/blob/master/noise/en.natural
    """

    def __init__(self, dict_path, include_reverse=True, cache=True):
        super().__init__(cache)

        self.dict_path = dict_path
        self.include_reverse = include_reverse

        self._init()

    def _init(self):
        # word -> list of misspelling candidates (plus the reverse direction when enabled)
        self.dict = {}
        self.read(self.dict_path)

    def read(self, model_path):
        """Populate `self.dict` from `model_path`; each line is `word candidate [candidate ...]`."""
        with open(model_path, 'r', encoding="utf-8") as f:
            for raw_line in f:
                parts = raw_line.split(' ')
                # The last token still carries the newline separator.
                parts[-1] = parts[-1].replace('\n', '')

                word, candidates = parts[0], parts[1:]

                entry = self.dict.setdefault(word, [])
                entry.extend(candidates)
                # Remove duplicate mappings.
                self.dict[word] = list(set(self.dict[word]))

                if self.include_reverse:
                    # Also index each candidate back to its source word.
                    for candidate in candidates:
                        reverse_entry = self.dict.setdefault(candidate, [])
                        if word not in reverse_entry:
                            reverse_entry.append(word)

    def predict(self, data):
        """Return the candidate replacements for `data`, or None when it is unknown."""
        return self.dict.get(data)
class WordDictionary:
    """Abstract base class for dictionary-backed word models (e.g. spelling, WordNet, PPDB)."""

    def __init__(self, cache=True):
        # cache: whether downstream augmenters may cache this model's predictions.
        self.cache = cache

    # pylint: disable=R0201
    def train(self, data):
        """Build the dictionary from raw data; must be overridden by subclasses."""
        raise NotImplementedError

    # pylint: disable=R0201
    def predict(self, data):
        """Return candidate substitutions for `data`; must be overridden by subclasses."""
        raise NotImplementedError

    # pylint: disable=R0201
    def save(self, model_path):
        """Persist the dictionary to `model_path`; must be overridden by subclasses."""
        raise NotImplementedError

    # pylint: disable=R0201
    def read(self, model_path):
        """Load the dictionary from `model_path`; must be overridden by subclasses."""
        raise NotImplementedError
class WordNet(WordDictionary):
    """WordNet-backed dictionary returning synonyms or antonyms for a word."""

    def __init__(self, lang, is_synonym=True):
        super().__init__(cache=True)

        self.lang = lang          # WordNet language code (e.g. 'eng')
        self.is_synonym = is_synonym  # True -> synonyms, False -> antonyms

        try:
            import nltk
            from nltk.corpus import wordnet
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed nltk library. Install nltk by `pip install nltk`')

        self.model = self.read()

    def read(self):
        """Return the wordnet corpus, downloading the required data on first use."""
        try:
            wordnet.synsets('testing')
        except LookupError:
            nltk.download('wordnet')
            nltk.download('omw-1.4')
        return wordnet

    def predict(self, word, pos=None):
        """Return synonym (or antonym) lemma names for `word`, optionally filtered by `pos`."""
        candidates = []
        for synset in self.model.synsets(word, pos=pos, lang=self.lang):
            for lemma in synset.lemmas(lang=self.lang):
                if self.is_synonym:
                    candidates.append(lemma.name())
                else:
                    candidates.extend(antonym.name() for antonym in lemma.antonyms())
        return candidates

    @classmethod
    def pos_tag(cls, tokens):
        """POS-tag `tokens`, downloading the tagger data on first use."""
        try:
            return nltk.pos_tag(tokens)
        except LookupError:
            nltk.download('averaged_perceptron_tagger')
            return nltk.pos_tag(tokens)
class Fasttext(WordEmbeddings):
    """fastText embedding wrapper. https://arxiv.org/pdf/1712.09405.pdf"""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

        try:
            from gensim.models import KeyedVectors
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed gensim library. Install transfomers by `pip install gensim`')

        self.model = None
        self.words = []

    def read(self, file_path, max_num_vector=None):
        """Load fastText vectors stored in word2vec text format."""
        self.model = KeyedVectors.load_word2vec_format(file_path, limit=max_num_vector)
        super()._read()


# Download locations for the published pre-trained GloVe archives.
pre_trained_model_url = {
    'glove_6b': 'http://nlp.stanford.edu/data/glove.6B.zip',
    'glove_42b_300d': 'http://nlp.stanford.edu/data/glove.42B.300d.zip',
    'glove_840b_300d': 'http://nlp.stanford.edu/data/glove.840B.300d.zip',
    'glove_twitter_27b': 'http://nlp.stanford.edu/data/glove.twitter.27B.zip',
}


class GloVe(WordEmbeddings):
    """GloVe embedding wrapper. https://nlp.stanford.edu/pubs/glove.pdf"""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

        try:
            from gensim.models import KeyedVectors
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed gensim library. Install transfomers by `pip install gensim`')

        self.model = None
        self.words = []

    def read(self, file_path, max_num_vector=None):
        """Load GloVe vectors (plain text, no header line)."""
        self.model = KeyedVectors.load_word2vec_format(file_path, binary=False, no_header=True, limit=max_num_vector)
        super()._read()


class Word2vec(WordEmbeddings):
    """word2vec embedding wrapper. https://arxiv.org/pdf/1301.3781.pdf"""

    def __init__(self, top_k=100, skip_check=False):
        super().__init__(top_k, skip_check)

        try:
            from gensim.models import KeyedVectors
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed gensim library. Install transfomers by `pip install gensim`')

        self.model = None
        self.words = []

    def read(self, file_path, max_num_vector=None):
        """Load word2vec vectors stored in binary format."""
        self.model = KeyedVectors.load_word2vec_format(file_path, binary=True, limit=max_num_vector)
        super()._read()
class WordEmbeddings:
    """Base class for gensim KeyedVectors-backed word embedding models.

    :param top_k: number of similar words returned by `predict`
    :param skip_check: subclass flag; not used directly in this base class
    """

    def __init__(self, top_k=100, skip_check=True):
        self.top_k = top_k
        self.skip_check = skip_check
        self.emb_size = 0    # embedding dimensionality, filled in by _read()
        self.vocab_size = 0  # number of words in the loaded vocabulary
        self.words = []

    def read(self, file_path, max_num_vector):
        """Load the vectors from disk; must be overridden by subclasses."""
        raise NotImplementedError

    def _read(self):
        """Populate vocabulary metadata from the loaded gensim model (`self.model`)."""
        self.words = list(self.model.index_to_key)
        # BUG FIX: emb_size previously stored the first word's embedding *vector*
        # (self.model[...]), inconsistent with its integer default of 0; it now
        # stores the embedding dimensionality.
        self.emb_size = len(self.model[self.model.index_to_key[0]])
        self.vocab_size = len(self.words)

    def download(self, model_path):
        """Fetch pre-trained vectors; must be overridden by subclasses that support it."""
        raise NotImplementedError

    def get_vocab(self):
        """Return the list of known words."""
        return self.words

    @classmethod
    def _normalize(cls, vectors, norm='l2'):
        """Normalize `vectors` with the chosen scheme ('l2', 'l1' or 'standard')."""
        if norm == 'l2':
            return normalization.l2_norm(vectors)
        elif norm == 'l1':
            return normalization.l1_norm(vectors)
        elif norm == 'standard':
            return normalization.standard_norm(vectors)

    def predict(self, word, n=1):
        """Return up to `top_k` most similar words, excluding `word` itself (case-insensitive)."""
        # Fetch one extra candidate since the query word itself may be in the result.
        result = self.model.most_similar(word, topn=self.top_k + 1)
        result = [w for w, s in result if w.lower() != word.lower()]
        return result[:self.top_k]
class Shuffle(WordRule):
    """Shuffle units within a document; currently only sentence-level shuffling is supported.

    :param model_type: unit granularity; must be one of `TYPES` (currently 'sentence')
    :param mode: 'neighbor' (random left/right), 'left', 'right' or 'random'
    :param tokenizer: custom tokenizer callable; defaults to nltk's sent_tokenize
    """

    TYPES = ['sentence']

    def __init__(self, model_type, mode='neighbor', tokenizer=None):
        super().__init__(cache=True)

        self.model_type = model_type  # sentence, word or character (only sentence implemented)
        self.mode = mode

        if tokenizer:
            self.tokenizer = tokenizer
        else:
            if self.model_type == 'sentence':
                try:
                    from nltk.tokenize import sent_tokenize
                except ModuleNotFoundError:
                    raise ModuleNotFoundError('Missed nltk library. Install transfomers by `pip install nltk`')
                self.tokenizer = sent_tokenize

    def tokenize(self, data):
        """Split `data` into units with the configured tokenizer."""
        return self.tokenizer(data)

    def predict(self, data, idx):
        """Shuffle the unit at position `idx` within `data` according to `self.mode`."""
        if self.model_type == 'sentence':
            return self._predict_sentence(data, idx)

        # BUG FIX: the error was previously *returned* (`return Exception(...)`) instead
        # of raised, so callers silently received an Exception object as the result.
        raise ValueError('{} is unexpected model_type. Possible value is {}'.format(
            self.model_type, self.TYPES))

    def _predict_sentence(self, sentences, idx):
        """Swap sentence `idx` with a neighbor (wrapping at the ends) or a random partner, in place."""
        last_idx = len(sentences) - 1
        direction = ''
        if self.mode == 'neighbor':
            # Pick left or right uniformly at random.
            if self.sample(2) == 0:
                direction = 'left'
            else:
                direction = 'right'
        if self.mode == 'left' or direction == 'left':
            if idx == 0:
                # Wrap around: swap the first sentence with the last one.
                sentences[0], sentences[last_idx] = sentences[last_idx], sentences[0]
            else:
                sentences[idx], sentences[idx-1] = sentences[idx-1], sentences[idx]
        elif self.mode == 'right' or direction == 'right':
            if idx == last_idx:
                sentences[0], sentences[idx] = sentences[idx], sentences[0]
            else:
                sentences[idx], sentences[idx+1] = sentences[idx+1], sentences[idx]
        elif self.mode == 'random':
            # Swap with the first sampled index that differs from `idx`.
            idxes = self.sample(list(range(len(sentences))), num=2)
            for _id in idxes:
                if _id != idx:
                    sentences[_id], sentences[idx] = sentences[idx], sentences[_id]
                    break
        return sentences
class WordRule(Model):
    """Abstract base class for rule-based word/sentence manipulation models."""

    def __init__(self, cache=True):
        # cache: whether downstream augmenters may cache this model's predictions.
        self.cache = cache

    # pylint: disable=R0201
    def predict(self, data):
        """Apply the rule to `data`; must be overridden by subclasses."""
        raise NotImplementedError
class WordStatistics:
    """Abstract base class for word-statistics models (e.g. TF-IDF)."""

    def __init__(self, cache=True):
        # Whether downstream augmenters may cache this model's output.
        self.cache = cache

    def train(self, data):
        """Fit the statistics model on `data`; must be overridden by subclasses."""
        raise NotImplementedError

    def predict(self, data, top_k):
        """Return the `top_k` scored candidates for `data`; must be overridden by subclasses."""
        raise NotImplementedError

    def save(self, model_path):
        """Persist the model to `model_path`; must be overridden by subclasses."""
        raise NotImplementedError

    def read(self, model_path):
        """Load the model from `model_path`; must be overridden by subclasses."""
        raise NotImplementedError

    @classmethod
    def choice(cls, x, p, size=1):
        """Sample `size` indices of `x` according to the probability weights `p`."""
        return np.random.choice(len(x), size, p=p)
"n"], 26 | "h": ["t", "z", "u", "g", "j", "b", "n", "m"], 27 | "j": ["z", "u", "i", "h", "k", "n", "m", ",", ";"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", ";", ".", ":"], 29 | "l": ["i", "o", "p", "k", "ö", "Ö", ",", ";", ".", ":", "-", "_"], 30 | "y": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "y", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", ";"], 37 | "!": ["\"", "q"], 38 | "\"": ["!", "§", "q", "w"], 39 | "§": ["\"", "$", "w", "e"], 40 | "$": ["§", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/en.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "@", "q", "w"], 3 | "2": ["@", "1", "!", "3", "#", "q", "w", "e"], 4 | "3": ["#", "2", "@", "4", "$", "w", "e"], 5 | "4": ["$", "3", "#", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "^", "r", "t", "y"], 7 | "6": ["^", "5", "%", "7", "&", "t", "y", "u"], 8 | "7": ["&", "6", "^", "8", "*", "y", "u", "i"], 9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"], 10 | "9": ["(", "8", "*", "0", ")", "i", "o", "p"], 11 | "!": ["@", "q"], 12 | "@": ["!", "#", "q", "w"], 13 | "#": ["@", "$", "w", "e"], 14 | "$": ["#", "%", "e", "r"], 15 | "%": "$", 16 | "q": ["1", "!", "2", "@", "w", "a", "s"], 17 | "w": ["1", "!", "2", "@", "3", "#", "q", "e", "a", "s", "d"], 18 | "e": ["2", "@", "3", "#", "4", "$", "w", "r", "s", "d", "f"], 19 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 20 | "t": ["4", "$", "5", "%", "6", "^", "r", "y", "f", "g", "h"], 21 | "y": ["5", "%", "6", "^", "7", "&", "t", "u", "g", "h", "j"], 22 | "u": ["6", "^", "7", "&", "8", "*", " t", "i", "h", "j", "k"], 23 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 24 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"], 25 
| "p": ["9", "(", "0", ")", "o", "l"], 26 | "a": ["q", "w", "a", "s", "z", "x"], 27 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 28 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 29 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 30 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 31 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 32 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"], 33 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"], 34 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"], 35 | "z": ["a", "s", "x"], 36 | "x": ["a", "s", "d", "z", "c"], 37 | "c": ["s", "d", "f", "x", "v"], 38 | "v": ["d", "f", "g", "c", "b"], 39 | "b": ["f", "g", "h", "v", "n"], 40 | "n": ["g", "h", "j", "b", "m"], 41 | "m": ["h", "j", "k", "n", ",", "<"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/es.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["¡", "2", "!", "q", "w"], 3 | "2": ["!", "1", "¡", "3", "#", "q", "w", "e"], 4 | "3": ["#", "2", "!", "4", "$", "w", "e"], 5 | "4": ["$", "3", "#", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "/", "r", "t", "y"], 7 | "6": ["/", "5", "%", "7", "&", "t", "y", "u"], 8 | "7": ["&", "6", "/", "8", "*", "y", "u", "i"], 9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"], 10 | "9": ["(", "8", "*", "0", ")", "i", "o", "p"], 11 | "q": ["1", "¡", "2", "!", "w", "a", "s"], 12 | "w": ["1", "¡", "2", "!", "3", "#", "q", "e", "a", "s", "d"], 13 | "e": ["2", "!", "3", "#", "4", "$", "w", "r", "s", "d", "f"], 14 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 15 | "t": ["4", "$", "5", "%", "6", "/", "r", "y", "f", "g", "h"], 16 | "y": ["5", "%", "6", "/", "7", "&", "t", "u", "g", "h", "j"], 17 | "u": ["6", "/", "7", "&", "8", "*", "i", "h", "j", "k"], 18 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 19 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", 
"l"], 20 | "p": ["9", "(", "0", ")", "o", "l"], 21 | "a": ["q", "w", "a", "s", "z", "x"], 22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "¿"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", "¿", ".", "?"], 29 | "l": ["i", "o", "p", "k", "ñ", "Ñ", ",", "¿", ".", "?", "ç", "Ç"], 30 | "z": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "z", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", "¿"], 37 | "¡": ["!", "q"], 38 | "!": ["¡", "#", "q", "w"], 39 | "#": ["!", "$", "w", "e"], 40 | "$": ["#", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/fr.json: -------------------------------------------------------------------------------- 1 | { 2 | "&": ["1", "é", "2", "a", "z"], 3 | "é": ["2", "&", "1", "\"", "3", "a", "z", "e"], 4 | "\"": ["3", "é", "2", "'", "4", "z", "e"], 5 | "'": ["4", "\"", "3", "(", "5", "e", "r"], 6 | "(": ["5", "'", "4", "§", "6", "r", "t", "y"], 7 | "§": ["6", "(", "5", "è", "7", "t", "y", "u"], 8 | "è": ["7", "§", "6", "!", "8", "y", "u", "i"], 9 | "!": ["8", "è", "7", "ç", "9", "u", "i", "o"], 10 | "ç": ["9", "!", "8", "à", "0", "i", "o", "p"], 11 | "a": ["&", "1", "é", "2", "z", "q", "s"], 12 | "z": ["&", "1", "é", "2", "\"", "3", "a", "e", "q", "s", "d"], 13 | "e": ["é", "2", "\"", "3", "'", "4", "z", "r", "s", "d", "f"], 14 | "r": ["\"", "3", "'", "4", "(", "5", "e", "t", "d", "f", "g"], 15 | "t": ["'", "4", "(", "5", "§", "6", "r", "y", "f", "g", "h"], 16 | "y": ["(", "5", "§", "6", "è", "7", "t", "u", "g", "h", "j"], 17 | "u": ["§", "6", "è", "7", "!", 
"8", "i", "h", "j", "k"], 18 | "i": ["è", "7", "!", "8", "ç", "9", "u", "o", "j", "k", "l"], 19 | "o": ["!", "8", "ç", "9", "à", "0", "i", "p", "k", "l"], 20 | "p": ["ç", "9", "à", "0", "o", "l"], 21 | "q": ["a", "z", "q", "s", "w", "x"], 22 | "s": ["a", "z", "e", "q", "d", "w", "x", "c"], 23 | "d": ["z", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", ","], 27 | "j": ["y", "u", "i", "h", "k", "n", ",", ";", "."], 28 | "k": ["u", "i", "o", "j", "l", ",", ";", ".", ":", "/"], 29 | "l": ["i", "o", "p", "k", "m", "M", ";", ".", ":", "/", "=", "+"], 30 | "w": ["q", "s", "x"], 31 | "x": ["q", "s", "d", "w", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", ","], 36 | ",": ["h", "j", "k", "n", ";", "."], 37 | "1": ["2", "a"], 38 | "2": ["1", "3", "a", "z"], 39 | "3": ["2", "4", "z", "e"], 40 | "4": ["3", "5", "e", "r"], 41 | "5": ["4"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/he.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "@", "/", "׳"], 3 | "2": ["@", "1", "!", "3", "#", "/", "׳", "ק"], 4 | "3": ["#", "2", "@", "4", "$", "׳", "ק"], 5 | "4": ["$", "3", "#", "5", "%", "ק", "ר"], 6 | "5": ["%", "4", "$", "6", "^", "ר", "א", "ט"], 7 | "6": ["^", "5", "%", "7", "₪", "א", "ט", "ו"], 8 | "7": ["₪", "6", "^", "8", "*", "ט", "ו", "ן"], 9 | "8": ["*", "7", "₪", "9", ")", "ו", "ן", "ם"], 10 | "9": [")", "8", "*", "0", "(", "ן", "ם", "פ"], 11 | "/": ["1", "!", "2", "@", "׳", "ש", "ד"], 12 | "׳": ["1", "!", "2", "@", "3", "#", "/", "ק", "ש", "ד", "ג"], 13 | "ק": ["2", "@", "3", "#", "4", "$", "׳", "ר", "ד", "ג", "כ"], 14 | "ר": ["3", "#", "4", "$", "5", "%", "ק", "א", "ג", "כ", "ע"], 15 | "א": ["4", "$", "5", 
"%", "6", "^", "ר", "ט", "כ", "ע", "י"], 16 | "ט": ["5", "%", "6", "^", "7", "₪", "א", "ו", "ע", "י", "ח"], 17 | "ו": ["6", "^", "7", "₪", "8", "*", "ן", "י", "ח", "ל"], 18 | "ן": ["7", "₪", "8", "*", "9", ")", "ו", "ם", "ח", "ל", "ך"], 19 | "ם": ["8", "*", "9", ")", "0", "(", "ן", "פ", "ל", "ך"], 20 | "פ": ["9", ")", "0", "(", "ם", "ך"], 21 | "ש": ["/", "׳", "ש", "ד", "ז", "ס"], 22 | "ד": ["/", "׳", "ק", "ש", "ג", "ז", "ס", "ב"], 23 | "ג": ["׳", "ק", "ר", "ד", "כ", "ס", "ב", "ה"], 24 | "כ": ["ק", "ר", "א", "ג", "ע", "ב", "ה", "נ"], 25 | "ע": ["ר", "א", "ט", "כ", "י", "ה", "נ", "מ"], 26 | "י": ["א", "ט", "ו", "ע", "ח", "נ", "מ", "צ"], 27 | "ח": ["ט", "ו", "ן", "י", "ל", "מ", "צ", "ת", ">"], 28 | "ל": ["ו", "ן", "ם", "ח", "ך", "צ", "ת", ">", "ץ", "<"], 29 | "ך": ["ן", "ם", "פ", "ל", "ף", ":", "ת", ">", "ץ", "<", ".", "?"], 30 | "ז": ["ש", "ד", "ס"], 31 | "ס": ["ש", "ד", "ג", "ז", "ב"], 32 | "ב": ["ד", "ג", "כ", "ס", "ה"], 33 | "ה": ["ג", "כ", "ע", "ב", "נ"], 34 | "נ": ["כ", "ע", "י", "ה", "מ"], 35 | "מ": ["ע", "י", "ח", "נ", "צ"], 36 | "צ": ["י", "ח", "ל", "מ", "ת", ">"], 37 | "!": ["@", "/"], 38 | "@": ["!", "#", "/", "׳"], 39 | "#": ["@", "$", "׳", "ק"], 40 | "$": ["#", "%", "ק", "ר"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/it.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "\"", "q", "w"], 3 | "2": ["\"", "1", "!", "3", "£", "q", "w", "e"], 4 | "3": ["£", "2", "\"", "4", "$", "w", "e"], 5 | "4": ["$", "3", "£", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "&", "r", "t", "y"], 7 | "6": ["&", "5", "%", "7", "/", "t", "y", "u"], 8 | "7": ["/", "6", "&", "8", "(", "y", "u", "i"], 9 | "8": ["(", "7", "/", "9", ")", "u", "i", "o"], 10 | "9": [")", "8", "(", "0", "=", "i", "o", "p"], 11 | "q": ["1", "!", "2", "\"", "w", "a", "s"], 12 | "w": ["1", "!", "2", "\"", "3", "£", "q", "e", "a", "s", "d"], 13 | "e": 
["2", "\"", "3", "£", "4", "$", "w", "r", "s", "d", "f"], 14 | "r": ["3", "£", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 15 | "t": ["4", "$", "5", "%", "6", "&", "r", "y", "f", "g", "h"], 16 | "y": ["5", "%", "6", "&", "7", "/", "t", "u", "g", "h", "j"], 17 | "u": ["6", "&", "7", "/", "8", "(", "i", "h", "j", "k"], 18 | "i": ["7", "/", "8", "(", "9", ")", "u", "o", "j", "k", "l"], 19 | "o": ["8", "(", "9", ")", "0", "=", "i", "p", "k", "l"], 20 | "p": ["9", ")", "0", "=", "o", "l"], 21 | "a": ["q", "w", "a", "s", "z", "x"], 22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", ";"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", ";", ".", ":"], 29 | "l": ["i", "o", "p", "k", "ò", "ç", ",", ";", ".", ":", "-", "_"], 30 | "z": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "z", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", ";"], 37 | "!": ["\"", "q"], 38 | "\"": ["!", "£", "q", "w"], 39 | "£": ["\"", "$", "w", "e"], 40 | "$": ["£", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/nl.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "@", "q", "w"], 3 | "2": ["@", "1", "!", "3", "#", "q", "w", "e"], 4 | "3": ["#", "2", "@", "4", "$", "w", "e"], 5 | "4": ["$", "3", "#", "5", "%", "e", "r"], 6 | "5": ["%", "4", "$", "6", "^", "r", "t", "y"], 7 | "6": ["^", "5", "%", "7", "&", "t", "y", "u"], 8 | "7": ["&", "6", "^", "8", "*", "y", "u", "i"], 9 | "8": ["*", "7", "&", "9", "(", "u", "i", "o"], 10 | "9": ["(", "8", "*", "0", 
")", "i", "o", "p"], 11 | "q": ["1", "!", "2", "@", "w", "a", "s"], 12 | "w": ["1", "!", "2", "@", "3", "#", "q", "e", "a", "s", "d"], 13 | "e": ["2", "@", "3", "#", "4", "$", "w", "r", "s", "d", "f"], 14 | "r": ["3", "#", "4", "$", "5", "%", "e", "t", "d", "f", "g"], 15 | "t": ["4", "$", "5", "%", "6", "^", "r", "y", "f", "g", "h"], 16 | "y": ["5", "%", "6", "^", "7", "&", "t", "u", "g", "h", "j"], 17 | "u": ["6", "^", "7", "&", "8", "*", "i", "h", "j", "k"], 18 | "i": ["7", "&", "8", "*", "9", "(", "u", "o", "j", "k", "l"], 19 | "o": ["8", "*", "9", "(", "0", ")", "i", "p", "k", "l"], 20 | "p": ["9", "(", "0", ")", "o", "l"], 21 | "a": ["q", "w", "a", "s", "z", "x"], 22 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "y", "f", "h", "v", "b", "n"], 26 | "h": ["t", "y", "u", "g", "j", "b", "n", "m"], 27 | "j": ["y", "u", "i", "h", "k", "n", "m", ",", "<"], 28 | "k": ["u", "i", "o", "j", "l", "m", ",", "<", ".", ">"], 29 | "l": ["i", "o", "p", "k", ";", ":", ",", "<", ".", ">", "/", "?"], 30 | "z": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "z", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ",", "<"], 37 | "!": ["@", "q"], 38 | "@": ["!", "#", "q", "w"], 39 | "#": ["@", "$", "w", "e"], 40 | "$": ["#", "%", "e", "r"], 41 | "%": ["$"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/pl.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["§", "2", "%", "q", "w"], 3 | "2": ["%", "1", "§", "3", "!", "q", "w", "e"], 4 | "3": ["!", "2", "%", "4", "?", "w", "e"], 5 | "4": ["?", "3", "!", "5", "+", "e", "r"], 6 | "5": ["+", "4", "?", "6", "=", "r", "t", "z"], 7 | "6": ["=", "5", "+", "7", ":", "t", 
"z", "u"], 8 | "7": [":", "6", "=", "8", "_", "z", "u", "i"], 9 | "8": ["_", "7", ":", "9", "/", "u", "i", "o"], 10 | "9": ["/", "8", "_", "0", "\"", "i", "o", "p"], 11 | "q": ["1", "§", "2", "%", "w", "a", "s"], 12 | "w": ["1", "§", "2", "%", "3", "!", "q", "e", "a", "s", "d"], 13 | "e": ["2", "%", "3", "!", "4", "?", "w", "r", "s", "d", "f"], 14 | "r": ["3", "!", "4", "?", "5", "+", "e", "t", "d", "f", "g"], 15 | "t": ["4", "?", "5", "+", "6", "=", "r", "z", "f", "g", "h"], 16 | "z": ["5", "+", "6", "=", "7", ":", "t", "u", "g", "h", "j"], 17 | "u": ["6", "=", "7", ":", "8", "_", "i", "h", "j", "k"], 18 | "i": ["7", ":", "8", "_", "9", "/", "u", "o", "j", "k", "l"], 19 | "o": ["8", "_", "9", "/", "0", "\"", "i", "p", "k", "l"], 20 | "p": ["9", "/", "0", "\"", "o", "l"], 21 | "a": ["q", "w", "a", "s", "y", "x"], 22 | "s": ["q", "w", "e", "a", "d", "y", "x", "c"], 23 | "d": ["w", "e", "r", "s", "f", "x", "c", "v"], 24 | "f": ["e", "r", "t", "d", "g", "c", "v", "b"], 25 | "g": ["r", "t", "z", "f", "h", "v", "b", "n"], 26 | "h": ["t", "z", "u", "g", "j", "b", "n", "m"], 27 | "j": ["z", "u", "i", "h", "k", "n", "m", ".", "ś"], 28 | "k": ["u", "i", "o", "j", "l", "m", ".", "ś", ",", "ń"], 29 | "l": ["i", "o", "p", "k", "ł", "Ł", ".", "ś", ",", "ń", "-", "ć"], 30 | "y": ["a", "s", "x"], 31 | "x": ["a", "s", "d", "y", "c"], 32 | "c": ["s", "d", "f", "x", "v"], 33 | "v": ["d", "f", "g", "c", "b"], 34 | "b": ["f", "g", "h", "v", "n"], 35 | "n": ["g", "h", "j", "b", "m"], 36 | "m": ["h", "j", "k", "n", ".", "ś"], 37 | "§": ["%", "q"], 38 | "%": ["§", "!", "q", "w"], 39 | "!": ["%", "?", "w", "e"], 40 | "?": ["!", "+", "e", "r"], 41 | "+": ["?"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/tr.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["q", "w", "2", "'"], 3 | "2": ["1", "q", "w", "e", "3", "!", "^"], 4 | "3": ["2", "w", "e", "r", "4", "^", "%"], 5 
| "4": ["3", "e", "r", "t", "5", "^", "%"], 6 | "5": ["4", "r", "t", "y", "6", "+", "&"], 7 | "6": ["5", "t", "y", "u", "7", "%", "/"], 8 | "7": ["6", "y", "u", "8", "&", "(", ")"], 9 | "8": ["7", "u", "ı", "9", "/", ")"], 10 | "9": ["8", "ı", "o", "0", "(", "ı", "o", "0"], 11 | "q": ["1", "2", "w", "a", "s", "!", "'"], 12 | "w": ["1", "2", "3", "q", "e", "a", "s", "d", "!", "'", "^"], 13 | "e": ["3", "4", "w", "r", "s", "d", "f", "^", "+"], 14 | "r": ["4", "5", "e", "t", "d", "f", "g", "+", "%"], 15 | "t": ["5", "6", "r", "y", "f", "g", "h", "%", "&"], 16 | "y": ["6", "7", "t", "u", "g", "h", "j", "&", "/"], 17 | "u": ["7", "8", "y", "ı", "h", "j", "k", "/", "("], 18 | "ı": ["8", "9", "u", "o", "j", "k", "l", "(", ")"], 19 | "o": ["9", "0", "ı", "p", "k", "l", "ş", ")", "="], 20 | "p": ["0", "*", "o", "ğ", "l", "ş", "i", "=", "?"], 21 | "ğ": ["*", "-", "p", "ü", "ş", "i", ",", "=", "?", "_", ";"], 22 | "a": ["q", "w", "s", "x", "z", "<", ">"], 23 | "s": ["q", "w", "e", "a", "d", "z", "x", "c"], 24 | "d": ["w", "e", "r", "s", "f", "x", "c"], 25 | "f": ["r", "t", "d", "g", "c", "v"], 26 | "g": ["r", "t", "y", "f", "h", "v", "b"], 27 | "h": ["y", "u", "g", "j", "b", "n"], 28 | "j": ["u", "ı", "h", "k", "n", "m"], 29 | "k": ["ı", "o", "j", "l", "m", "ö"], 30 | "l": ["o", "p", "k", "ş", "ö", "ç"], 31 | "ş": ["p", "ğ", "l", "i", "ç", ".", ":"], 32 | "i": ["ğ", "ü", "ş", ",", ".", ";"], 33 | "z": ["a", "s", "x", "<", ">"], 34 | "x": ["s", "d", "z", "c"], 35 | "c": ["d", "f", "x", "v"], 36 | "v": ["f", "g", "c", "b"], 37 | "b": ["g", "h", "v", "n"], 38 | "n": ["h", "j", "b", "m"], 39 | "m": ["j", "k", "n", "ö"], 40 | "ö": ["k", "l", "m", "ç"], 41 | "ç": ["l", "ş", "ö", ".", ":"] 42 | } 43 | -------------------------------------------------------------------------------- /nlpaug/res/char/keyboard/uk.json: -------------------------------------------------------------------------------- 1 | { 2 | "1": ["!", "2", "\"", "й", "ц"], 3 | "2": ["\"", "1", "!", "3", "№", "й", "ц", 
"у"], 4 | "3": ["№", "2", "\"", "4", ";", "ц", "у"], 5 | "4": [";", "3", "№", "5", "%", "у", "к"], 6 | "5": ["%", "4", ";", "6", ":", "к", "е", "н"], 7 | "6": [":", "5", "%", "7", "?", "е", "н", "г"], 8 | "7": ["?", "6", ":", "8", "*", "н", "г", "ш"], 9 | "8": ["*", "7", "?", "9", "(", "г", "ш", "щ"], 10 | "9": ["(", "8", "*", "0", ")", "ш", "щ", "з"], 11 | "й": ["1", "!", "2", "\"", "ц", "ф", "і"], 12 | "ц": ["1", "!", "2", "\"", "3", "№", "й", "у", "ф", "і", "в"], 13 | "у": ["2", "\"", "3", "№", "4", ";", "ц", "к", "і", "в", "а"], 14 | "к": ["3", "№", "4", ";", "5", "%", "у", "е", "в", "а", "п"], 15 | "е": ["4", ";", "5", "%", "6", ":", "к", "н", "а", "п", "р"], 16 | "н": ["5", "%", "6", ":", "7", "?", "е", "г", "п", "р", "о"], 17 | "г": ["6", ":", "7", "?", "8", "*", "ш", "р", "о", "л"], 18 | "ш": ["7", "?", "8", "*", "9", "(", "г", "щ", "о", "л", "д"], 19 | "щ": ["8", "*", "9", "(", "0", ")", "ш", "з", "л", "д"], 20 | "з": ["9", "(", "0", ")", "щ", "д"], 21 | "ф": ["й", "ц", "ф", "і", "я", "ч"], 22 | "і": ["й", "ц", "у", "ф", "в", "я", "ч", "с"], 23 | "в": ["ц", "у", "к", "і", "а", "ч", "с", "м"], 24 | "а": ["у", "к", "е", "в", "п", "с", "м", "и"], 25 | "п": ["к", "е", "н", "а", "р", "м", "и", "т"], 26 | "р": ["е", "н", "г", "п", "о", "и", "т", "ь"], 27 | "о": ["н", "г", "ш", "р", "л", "т", "ь", "б", "Б"], 28 | "л": ["г", "ш", "щ", "о", "д", "ь", "б", "Б", "ю", "Ю"], 29 | "д": ["ш", "щ", "з", "л", "ж", "Ж", "б", "Б", "ю", "Ю", ".", ","], 30 | "я": ["ф", "і", "ч"], 31 | "ч": ["ф", "і", "в", "я", "с"], 32 | "с": ["і", "в", "а", "ч", "м"], 33 | "м": ["в", "а", "п", "с", "и"], 34 | "и": ["а", "п", "р", "м", "т"], 35 | "т": ["п", "р", "о", "и", "ь"], 36 | "ь": ["р", "о", "л", "т", "б", "Б"], 37 | "!": ["\"", "й"], 38 | "\"": ["!", "№", "й", "ц"], 39 | "№": ["\"", ";", "ц", "у"], 40 | ";": ["№", "%", "у", "к"], 41 | "%": [";"] 42 | } -------------------------------------------------------------------------------- /nlpaug/res/char/ocr/en.json: 
class Action:
    """Enumerates the augmentation action labels used across the library."""

    # Token/element-level operations
    INSERT = 'insert'
    SUBSTITUTE = 'substitute'
    DELETE = 'delete'
    SWAP = 'swap'
    SPLIT = 'split'
    ALIGN = 'align'
    CROP = 'crop'

    # Flow (pipeline) operations
    SEQUENTIAL = 'sequential'
    SOMETIMES = 'sometimes'

    @staticmethod
    def getall():
        """Return every known action label (order preserved for compatibility)."""
        actions = [
            Action.INSERT,
            Action.SUBSTITUTE,
            Action.SWAP,
            Action.DELETE,
            Action.SPLIT,
            Action.CROP,
            Action.SEQUENTIAL,
            Action.SOMETIMES,
            Action.ALIGN,
        ]
        return actions
class AudioLoader:
    """Thin wrapper around librosa for reading audio files and mel spectrograms."""

    @staticmethod
    def load_audio(file_path):
        """Read *file_path* and return ``(samples, sampling_rate)`` via librosa."""
        try:
            import librosa
        except ModuleNotFoundError:
            # librosa is an optional dependency; surface an actionable message.
            raise ModuleNotFoundError('Missed librosa library. Install import librosa by `pip install librosa`')

        return librosa.load(file_path)

    @staticmethod
    def load_mel_spectrogram(file_path, n_mels=128, fmax=8000):
        """Read *file_path* and return its mel-scaled spectrogram.

        :param str file_path: Path of the audio file.
        :param int n_mels: Number of mel bands.
        :param int fmax: Highest frequency (in Hz) considered.
        """
        try:
            import librosa
        except ModuleNotFoundError:
            raise ModuleNotFoundError('Missed librosa library. Install import librosa by `pip install librosa`')

        samples, rate = AudioLoader.load_audio(file_path)
        return librosa.feature.melspectrogram(y=samples, sr=rate, n_mels=n_mels, fmax=fmax)
def deprecated(deprecate_from, deprecate_to, msg):
    """Decorator factory that marks a class or function as deprecated.

    :param deprecate_from: Version in which the API became deprecated.
    :param deprecate_to: Version in which the API will be removed.
    :param msg: Extra guidance appended to the warning (e.g. replacement API).
    """
    def decorator(obj):
        # Classes need a different wrapper than plain functions.
        if isinstance(obj, type):
            return _decorate_class(obj, deprecate_from, deprecate_to, msg)
        # TODO: property support (_decorate_prop) is not wired up yet.
        return _decorate_func(obj, deprecate_from, deprecate_to, msg)
    return decorator
{msg}' 20 | 21 | @functools.wraps(cls) 22 | def wrapped(*args, **kwargs): 23 | warnings.simplefilter('always', DeprecationWarning) 24 | warnings.warn( 25 | msg_template.format( 26 | name=cls.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg), 27 | category=DeprecationWarning 28 | ) 29 | warnings.simplefilter('default', DeprecationWarning) 30 | return cls(*args, **kwargs) 31 | 32 | return wrapped 33 | 34 | 35 | def _decorate_func(func, deprecate_from, deprecate_to, msg): 36 | msg_template = 'Function {name} is deprecated from {deprecate_from} version.' 37 | msg_template += ' It will be removed from {deprecate_to} version. {msg}' 38 | 39 | @functools.wraps(func) 40 | def wrapped(*args, **kwargs): 41 | warnings.simplefilter('always', DeprecationWarning) 42 | warnings.warn( 43 | msg_template.format( 44 | name=func.__name__, deprecate_from=deprecate_from, deprecate_to=deprecate_to, msg=msg), 45 | category=DeprecationWarning 46 | ) 47 | warnings.simplefilter('default', DeprecationWarning) 48 | return func(*args, **kwargs) 49 | 50 | return wrapped 51 | 52 | 53 | def _decorate_prop(prop, msg): 54 | @functools.wraps(prop) 55 | @property 56 | def wrapped(*args, **kwargs): 57 | msg_template = 'Property {name} is deprecated. 
{msg}' 58 | warnings.simplefilter('always', DeprecationWarning) 59 | warnings.warn( 60 | msg_template.format(name=prop.__name__, msg=msg), category=DeprecationWarning 61 | ) 62 | warnings.simplefilter('default', DeprecationWarning) 63 | return prop.fget(*args, **kwargs) 64 | 65 | return wrapped 66 | -------------------------------------------------------------------------------- /nlpaug/util/doc/__init__.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.doc import * 2 | from nlpaug.util.doc.change_log import * 3 | from nlpaug.util.doc.token import * 4 | -------------------------------------------------------------------------------- /nlpaug/util/doc/change_log.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.token import Token 2 | 3 | 4 | class ChangeLog: 5 | def __init__(self, orig_token): 6 | self.orig_token = orig_token 7 | self.change_logs = [] 8 | self.add(orig_token.token, 'original', orig_token.change_seq) 9 | self._is_changed = False 10 | 11 | def add(self, token, action, change_seq): 12 | if action != 'original' and not self._is_changed: 13 | self._is_changed = True 14 | self.change_logs.append(Token(token=token, action=action, change_seq=change_seq)) 15 | 16 | def update(self, idx, token=None, action=None, change_seq=None): 17 | if not self._is_changed: 18 | self._is_changed = True 19 | 20 | if token: 21 | self.change_logs[idx].token = token 22 | if action: 23 | self.change_logs[idx].action = action 24 | if change_seq: 25 | self.change_logs[idx].change_seq = change_seq 26 | 27 | def size(self): 28 | return len(self.change_logs) - 1 29 | 30 | def is_changed(self): 31 | return self._is_changed 32 | 33 | def get_latest_token(self): 34 | return self.change_logs[-1] 35 | 36 | def update_last_token(self, start_pos): 37 | self.change_logs[-1].start_pos = start_pos 38 | 39 | def to_changed_dict(self): 40 | return { 41 | 'orig_token': 
self.orig_token.token, 42 | 'orig_start_pos': self.orig_token.start_pos, 43 | 'new_token': self.get_latest_token().token, 44 | 'new_start_pos': self.get_latest_token().start_pos, 45 | 'change_seq': self.get_latest_token().change_seq, 46 | 'action': self.get_latest_token().action 47 | } 48 | 49 | def to_dict(self): 50 | return { 51 | 'orig_token': self.orig_token.to_dict(), 52 | 'change_logs': [t.to_dict() for t in self.change_logs] 53 | } 54 | -------------------------------------------------------------------------------- /nlpaug/util/doc/doc.py: -------------------------------------------------------------------------------- 1 | from nlpaug.util.doc.token import Token 2 | from nlpaug.util.doc.change_log import ChangeLog 3 | 4 | 5 | class Doc: 6 | def __init__(self, doc='', tokens=None): 7 | self.doc = doc 8 | if tokens is not None and len(tokens) > 0: 9 | self.tokens = self.token2obj(tokens) 10 | else: 11 | self.tokens = [] 12 | self.changed_cnt = 0 13 | 14 | def token2obj(self, tokens): 15 | objs = [] 16 | start_pos = 0 17 | for t in tokens: 18 | token_obj = Token(token=t, start_pos=start_pos+self.doc[start_pos:].find(t)) 19 | change_log = ChangeLog(orig_token=token_obj) 20 | objs.append(change_log) 21 | 22 | start_pos += len(token_obj.token) 23 | start_pos += 1 # TODO: for textual only 24 | 25 | return objs 26 | 27 | def add_token(self, idx, token, action, change_seq): 28 | token_obj = Token(token=token, start_pos=-1, action=action, change_seq=change_seq) 29 | change_log = ChangeLog(orig_token=token_obj) 30 | self.tokens.insert(idx, change_log) 31 | 32 | def add_change_log(self, idx, new_token, action, change_seq): 33 | self.changed_cnt += 1 34 | self.tokens[idx].add(new_token, action=action, change_seq=change_seq) 35 | 36 | def update_change_log(self, token_idx, change_idx=None, token=None, action=None, change_seq=None): 37 | change_idx = self.tokens[token_idx].size() if change_idx is None else change_idx 38 | self.tokens[token_idx].update(change_idx, 
token=token, action=action, change_seq=change_seq) 39 | 40 | def get_token(self, idx): 41 | return self.tokens[idx] 42 | 43 | def get_original_tokens(self): 44 | return [t.orig_token.token for t in self.tokens] 45 | 46 | def get_augmented_tokens(self): 47 | return [t.get_latest_token().token for t in self.tokens if len(t.get_latest_token().token) > 0] 48 | 49 | def size(self): 50 | return len(self.tokens) 51 | 52 | def changed_count(self): 53 | return self.changed_cnt 54 | 55 | def get_change_logs(self, start_pos=0): 56 | for i, t in enumerate(self.tokens): 57 | self.tokens[i].update_last_token(start_pos) 58 | 59 | start_pos += len(t.get_latest_token().token) 60 | if len(t.get_latest_token().token) > 0: 61 | # TODO: for textual only 62 | start_pos += 1 63 | 64 | change_logs = [t for t in self.tokens if t.is_changed()] 65 | change_logs.sort(key=lambda x: x.get_latest_token().change_seq) 66 | return [c.to_changed_dict() for c in change_logs] 67 | -------------------------------------------------------------------------------- /nlpaug/util/doc/token.py: -------------------------------------------------------------------------------- 1 | class Token: 2 | def __init__(self, token, start_pos=-1, action='', change_seq=0): 3 | self._token = token 4 | self._start_pos = start_pos 5 | self._action = action 6 | self._change_seq = change_seq 7 | 8 | @property 9 | def start_pos(self): 10 | return self._start_pos 11 | 12 | @start_pos.setter 13 | def start_pos(self, v): 14 | self._start_pos = v 15 | 16 | @property 17 | def token(self): 18 | return self._token 19 | 20 | @token.setter 21 | def token(self, v): 22 | self._token = v 23 | 24 | @property 25 | def action(self): 26 | return self._action 27 | 28 | @action.setter 29 | def action(self, v): 30 | self._action = v 31 | 32 | @property 33 | def change_seq(self): 34 | return self._change_seq 35 | 36 | @change_seq.setter 37 | def change_seq(self, v): 38 | self._change_seq = v 39 | 40 | def to_dict(self): 41 | return { 42 | 'token': 
class ExceptionInfo:
    """Record describing a library warning/exception occurrence."""

    def __init__(self, name, exp_type, code, msg):
        self.name = name          # human-readable issue name
        self.exp_type = exp_type  # category, e.g. ExceptionType.WARNING
        self.code = code          # short identifier such as 'W001'
        self.msg = msg            # detailed message

    def output(self):
        """Print the record in the canonical '[type] Name:..., Code:..., Message:...' form."""
        formatted = '[{}] Name:{}, Code:{}, Message:{}'.format(
            self.exp_type, self.name, self.code, self.msg)
        print(formatted)
class LibraryUtil:
    """
    Helper for retrieving files bundled with the installed nlpaug package.

    >>> from nlpaug.util.file.library import LibraryUtil
    """

    @staticmethod
    def get_res_dir():
        """
        Return the absolute path of nlpaug's bundled ``res`` directory.

        >>> LibraryUtil.get_res_dir()

        """
        package_dir = os.path.dirname(nlpaug.__file__)
        return os.path.join(package_dir, 'res')
def _optional_lib_ver(module_name):
    """Return *module_name*'s ``__version__`` if it is importable, else None."""
    try:
        module = __import__(module_name)
        return module.__version__
    # Narrow catch: a bare `except:` here would also swallow KeyboardInterrupt
    # and SystemExit. Missing package -> ImportError; package without a
    # __version__ attribute -> AttributeError.
    except (ImportError, AttributeError):
        return None


def get_lib_ver():
    """Collect version strings of Python, nlpaug and its (optional) dependencies.

    :return: dict mapping library name to version string. Optional libraries
        (transformers, torch, fairseq, nltk) appear only when installed.
    """
    lib_ver = {
        'python': python_version(),
        'nlpaug': nlpaug.__version__,
        'numpy': np.__version__,
    }

    # Optional dependencies: record a version only when the import succeeds.
    for name in ('transformers', 'torch', 'fairseq', 'nltk'):
        version = _optional_lib_ver(name)
        if version is not None:
            lib_ver[name] = version

    return lib_ver
def standard_norm(data):
    """Z-score each row: subtract the row mean, divide by the row std (ddof=1).

    NaNs produced by zero-variance rows are mapped to 0.
    """
    row_means = data.mean(axis=1)
    row_stds = data.std(axis=1, ddof=1)
    scaled = (data - row_means[:, np.newaxis]) / row_stds[:, np.newaxis]
    return np.nan_to_num(scaled)


def l1_norm(data):
    """Scale each row so its entries sum to 1 (NaNs from all-zero rows become 0)."""
    row_sums = np.array([row.sum(axis=0) for row in data])
    return np.nan_to_num(data / row_sums[:, np.newaxis])


def l2_norm(data):
    """Scale each row to unit Euclidean length (NaNs from all-zero rows become 0)."""
    row_norms = np.array([np.sqrt((row * row).sum(axis=0)) for row in data])
    return np.nan_to_num(data / row_norms[:, np.newaxis])
class Randomness:
    """Utility to seed every random number generator nlpaug may rely on."""

    @staticmethod
    def seed(seed):
        """Seed Python's ``random``, numpy and (when installed) torch RNGs.

        :param int seed: Seed value applied to every generator.
        """
        random.seed(seed)
        np.random.seed(seed)
        # torch may be absent (the module-level import is guarded) and
        # the CUDA calls can fail on CPU-only builds, hence the guard.
        # ``except Exception`` rather than a bare except so that
        # KeyboardInterrupt/SystemExit still propagate.
        try:
            torch.manual_seed(seed)
            torch.cuda.manual_seed(seed)
            # Bug fix: previously seeded all CUDA devices with the
            # hard-coded constant 2021 instead of the caller's seed.
            torch.cuda.manual_seed_all(seed)
        except Exception:
            pass
class Tokenizer:
    """Regex-based word tokenizer and its inverse (detokenizer)."""

    @staticmethod
    def tokenizer(text):
        # Split on the module-level pattern and drop whitespace-only pieces.
        pieces = TOKENIZER_REGEX.split(text)
        result = []
        for piece in pieces:
            if piece.strip():
                result.append(piece)
        return result

    @staticmethod
    def reverse_tokenizer(tokens):
        # Join with spaces, then undo the spacing around punctuation and
        # brackets via the module-level detokenizer substitution rules.
        joined = ' '.join(tokens)
        for pattern, replacement in DETOKENIZER_REGEXS:
            joined = pattern.sub(replacement, joined)
        return joined.strip()
https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/audio_example.png -------------------------------------------------------------------------------- /res/lambada_algo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/lambada_algo.png -------------------------------------------------------------------------------- /res/logo_small.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/logo_small.png -------------------------------------------------------------------------------- /res/textual_example.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/res/textual_example.png -------------------------------------------------------------------------------- /script.txt: -------------------------------------------------------------------------------- 1 | # Generate requirements.txt 2 | pipreqs . 
def prepare_mlm_data(labels, texts, output_file_path, sep_token):
    """Write one ``<label> <sep_token> <text>`` line per record to mlm_data.txt.

    :param list labels: Class labels, written first on each line.
    :param list texts: Input texts, paired positionally with labels.
    :param str output_file_path: Directory that receives ``mlm_data.txt``.
    :param str sep_token: Separator token placed between label and text.
    """
    out_path = os.path.join(output_file_path, 'mlm_data.txt')
    with open(out_path, 'w') as f:
        f.writelines(
            ' '.join([str(label), sep_token, text]) + '\n'
            for label, text in zip(labels, texts)
        )

def main(args):
    # Read the classification CSV and emit LAMBADA-style generation data.
    data = pd.read_csv(args.data_path)
    prepare_mlm_data(data['label'].tolist(), data['text'].tolist(), args.output_dir, '[SEP]')
import sys

# Guard: the package is Python 3 only (also enforced via python_requires
# in the setup() call below).
if sys.version_info < (3,):
    sys.exit("Sorry, Python3 is required.")

# The PyPI long description is the project README, rendered as Markdown.
with open("README.md", encoding="utf8") as f:
    readme = f.read()

# Runtime dependencies are maintained in requirements.txt, one per line.
with open('requirements.txt') as f:
    install_reqs = f.read().splitlines()

setup(
    name="nlpaug",
    version="1.1.11",
    author="Edward Ma",
    author_email="makcedward@gmail.com",
    url="https://github.com/makcedward/nlpaug",
    license="MIT",
    description="Natural language processing augmentation library for deep neural networks",
    long_description=readme,
    long_description_content_type="text/markdown",
    packages=find_packages(exclude="test"),  # ship library code, not tests
    include_package_data=True,
    install_requires=install_reqs,
    keywords=[
        "deep learning", "neural network", "machine learning",
        "nlp", "natural language processing", "text", "audio", "spectrogram",
        "augmentation", "adversarial attack", "ai", "ml"],
    python_requires=">=3.7"
)
class TestAudio(unittest.TestCase):
    """Shared smoke tests that exercise several audio augmenters at once."""

    @classmethod
    def setUpClass(cls):
        # Load the repo-level .env so os.environ["TEST_DIR"] resolves to
        # the test resource directory.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_multi_thread(self):
        # augment(..., n=n) must return exactly n outputs for any thread count.
        n = 3
        augs = [
            naa.CropAug(sampling_rate=self.sampling_rate),
            naa.PitchAug(sampling_rate=self.sampling_rate)
        ]

        for num_thread in [1, 3]:
            for aug in augs:
                augmented_data = aug.augment(self.audio, n=n, num_thread=num_thread)
                self.assertEqual(len(augmented_data), n)

    def test_coverage_and_zone(self):
        # zone bounds the region eligible for augmentation; coverage is the
        # fraction of that zone actually augmented. stateless=False keeps
        # start_pos/end_pos on the augmenter for inspection after augment().
        params = [
            ((0.3, 0.7), 1),
            ((0, 1), 1)
        ]

        for zone, coverage in params:
            augs = [
                naa.LoudnessAug(zone=zone, coverage=coverage, stateless=False),
                naa.MaskAug(zone=zone, coverage=coverage, stateless=False),
                naa.NoiseAug(zone=zone, coverage=coverage, stateless=False),
                naa.PitchAug(zone=zone, coverage=coverage, stateless=False, sampling_rate=self.sampling_rate),
                naa.SpeedAug(zone=zone, coverage=coverage, stateless=False),
                naa.VtlpAug(zone=zone, coverage=coverage, stateless=False, sampling_rate=self.sampling_rate),
                naa.NormalizeAug(zone=zone, coverage=coverage, stateless=False),
                naa.PolarityInverseAug(zone=zone, coverage=coverage, stateless=False)
            ]

            for aug in augs:
                aug_data = aug.augment(self.audio)
                aug_audio = aug_data[0]
                # NOTE(review): assertTrue(x, y) treats y as a failure
                # message, not a comparison -- assertEqual was likely
                # intended here. Left unchanged to preserve behavior.
                self.assertTrue(len(aug_audio[aug.start_pos:aug.end_pos]), int(len(self.audio) * (zone[1] - zone[0]) * coverage))
class TestInversion(unittest.TestCase):
    """Tests for naa.PolarityInverseAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # An empty signal must be passed through untouched.
        empty = np.array([])
        result = naa.PolarityInverseAug().augment(empty)
        self.assertTrue(np.array_equal(empty, result))

    def test_inverse(self):
        # Polarity inversion changes sample values but never the length.
        result = naa.PolarityInverseAug().augment(self.audio)
        inverted = result[0]
        self.assertFalse(np.array_equal(self.audio, inverted))
        self.assertEqual(len(self.audio), len(inverted))
class TestLoudness(unittest.TestCase):
    """Tests for naa.LoudnessAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # An empty signal must be returned unchanged.
        empty = np.array([])
        result = naa.LoudnessAug().augment(empty)
        self.assertTrue(np.array_equal(empty, result))

    def test_substitute(self):
        # A loudness change alters sample values without changing length.
        result = naa.LoudnessAug().augment(self.audio)
        adjusted = result[0]
        self.assertFalse(np.array_equal(self.audio, adjusted))
        self.assertEqual(len(self.audio), len(adjusted))
        self.assertTrue(self.sampling_rate > 0)
class TestNoise(unittest.TestCase):
    """Tests for naa.NoiseAug (colored and background noise injection)."""

    @classmethod
    def setUpClass(cls):
        # Load the repo-level .env so TEST_DIR resolves to test resources.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        # https://en.wikipedia.org/wiki/Colors_of_noise
        cls.noise_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Pink_noise.ogg'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)
        cls.noise, cls.noise_sampling_rate = AudioLoader.load_audio(cls.noise_wav_file)

    def test_empty_input(self):
        # An empty signal must be passed through untouched.
        audio = np.array([])
        aug = naa.NoiseAug()
        augmented_data = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_data))

    def test_substitute(self):
        # Default (random) noise changes the signal content.
        aug = naa.NoiseAug()
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        # NOTE(review): assertTrue with two arguments treats the second as a
        # failure message, not a comparison; assertEqual was likely intended.
        self.assertTrue(len(self.audio), len(augmented_audio))
        self.assertTrue(self.sampling_rate > 0)

    def test_color_noise(self):
        # Every supported noise color must produce a modified signal.
        colors = naa.NoiseAug().model.COLOR_NOISES

        for color in colors:
            aug = naa.NoiseAug(color=color)
            augmented_data = aug.augment(self.audio)
            augmented_audio = augmented_data[0]

            self.assertFalse(np.array_equal(self.audio, augmented_audio))
            self.assertTrue(len(self.audio), len(augmented_audio))
            self.assertTrue(self.sampling_rate > 0)

    def test_background_noise(self):
        # Background noise longer than the audio: noise > audio
        aug = naa.NoiseAug(noises=[self.noise])
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]
        self.assertTrue(augmented_audio is not None)

        # Background noise shorter than (equal to) the audio: audio > noise
        aug = naa.NoiseAug(noises=[self.audio])
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]
        self.assertTrue(augmented_audio is not None)
class TestNormalization(unittest.TestCase):
    """Tests for naa.NormalizeAug and its normalization methods."""

    @classmethod
    def setUpClass(cls):
        # Load the repo-level .env so TEST_DIR resolves to test resources.
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_empty_input(self):
        # An empty signal must be passed through untouched.
        audio = np.array([])
        aug = naa.NormalizeAug()
        augmented_data = aug.augment(audio)

        self.assertTrue(np.array_equal(audio, augmented_data))

    def test_non_exist_method(self):
        # Unknown method names must be rejected at construction time.
        with self.assertRaises(ValueError) as error:
            aug = naa.NormalizeAug(method='test1234')
        self.assertTrue('does not support yet. You may pick one' in str(error.exception))

    def test_minmax(self):
        # Min-max scaling changes sample values but not the length.
        aug = naa.NormalizeAug(method='minmax')
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def test_max(self):
        # Max scaling changes sample values but not the length.
        aug = naa.NormalizeAug(method='max')
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def test_standard(self):
        # Standard (z-score) scaling changes values but not the length.
        aug = naa.NormalizeAug(method='standard')
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))

    def test_random_method(self):
        # method='random' picks one of the supported methods; stateless=False
        # exposes the chosen method via aug.run_method for verification.
        aug = naa.NormalizeAug(method='random', stateless=False)
        augmented_data = aug.augment(self.audio)
        augmented_audio = augmented_data[0]

        self.assertTrue(aug.run_method in aug.model.get_support_methods())

        self.assertFalse(np.array_equal(self.audio, augmented_audio))
        self.assertEqual(len(self.audio), len(augmented_audio))
class TestShift(unittest.TestCase):
    """Tests for naa.ShiftAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_substitute(self):
        # Reload the fixture so the comparison uses a pristine copy.
        reference, rate = AudioLoader.load_audio(self.sample_wav_file)

        shifted = naa.ShiftAug(rate, duration=0.5).augment(self.audio)[0]

        # Shifting moves samples around but the content must change.
        self.assertFalse(np.array_equal(reference, shifted))
        self.assertTrue(len(reference), len(shifted))
class TestSpeed(unittest.TestCase):
    """Tests for naa.SpeedAug."""

    @classmethod
    def setUpClass(cls):
        # Resolve the repo-level .env so TEST_DIR points at test resources.
        dotenv_path = os.path.abspath(
            os.path.join(os.path.dirname(__file__), '..', '..', '..', '.env')
        )
        load_dotenv(dotenv_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )
        cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file)

    def test_substitute(self):
        # Repeat to hit both speed-up and slow-down factors; stateless=False
        # exposes the drawn factor via aug.aug_factor.
        for _ in range(10):
            aug = naa.SpeedAug(stateless=False)
            resampled = aug.augment(self.audio)[0]

            if aug.aug_factor < 1:
                # Slower playback stretches the signal.
                self.assertGreater(len(resampled), len(self.audio))
            else:
                # Faster playback shortens it.
                self.assertLess(len(resampled), len(self.audio))
class TestOcr(unittest.TestCase):
    """Tests for OcrAug character-level substitution."""

    def test_ocr_single_word(self):
        # Words made of characters with OCR mappings should be changed.
        texts = ['Zoology', 'roku123456']
        aug = OcrAug()
        for text in texts:
            augmented_data = aug.augment(text)
            augmented_text = augmented_data[0]
            self.assertNotEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)

    def test_ocr_single_word_nonexist_char(self):
        # Characters with no OCR mapping must pass through unchanged.
        texts = ['AAAAA', 'KKKKK']
        aug = OcrAug()
        for text in texts:
            augmented_data = aug.augment(text)
            augmented_text = augmented_data[0]
            self.assertEqual(text, augmented_text)

        self.assertTrue(len(texts) > 0)

    def test_ocr_multi_words(self):
        texts = ['The quick brown fox jumps over the lazy dog']
        aug = OcrAug()

        for text in texts:
            # Since non-exist mapping word may be drawn, try several times
            is_augmented = False
            for _ in range(10):
                augmented_data = aug.augment(text)
                augmented_text = augmented_data[0]
                is_equal = text == augmented_text
                if not is_equal:
                    is_augmented = True
                    break

            self.assertTrue(is_augmented)

        self.assertTrue(len(texts) > 0)

    def test_ocr_model_from_dict(self):
        # A plain dict can be supplied directly as the substitution model.
        mapping = {'0': ['2']}
        aug = OcrAug(dict_of_path=mapping)
        augmented_data = aug.augment('0000000')
        augmented_text = augmented_data[0]
        self.assertIn('2', augmented_text)

    def test_ocr_model_from_json(self):
        # The substitution model can also be loaded from a JSON file path.
        sample_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'res', 'common', 'sample.json'))
        aug = OcrAug(dict_of_path=sample_path)
        augmented_data = aug.augment('0000000')
        augmented_text = augmented_data[0]
        self.assertIn('3', augmented_text)

        # A non-existent path must raise with a descriptive message.
        with self.assertRaises(Exception) as error:
            sample_path = os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..', 'res', 'common', 'non_exist.json'))
            aug = OcrAug(dict_of_path=sample_path)
        self.assertIn('The dict_of_path does not exist', str(error.exception))
11 | 12 | def test_mode(self): 13 | for mode in ['left', 'right', 'neighbor', 'random']: 14 | aug = nas.RandomSentAug(mode='left') 15 | aug_data = aug.augment(self.data) 16 | self.assertNotEqual(self.data, aug_data[0]) 17 | self.assertEqual(4, len(aug.model.tokenize(aug_data[0]))) 18 | -------------------------------------------------------------------------------- /test/augmenter/sentence/test_sentence.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | import nlpaug.augmenter.sentence as nas 6 | from nlpaug.util import Action, Doc 7 | 8 | 9 | class TestSentence(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | cls.model_paths = [ 17 | 'xlnet-base-cased', 18 | 'gpt2', 19 | 'distilgpt2' 20 | ] 21 | 22 | cls.text = 'The quick brown fox jumps over the lazy dog.' 
class TestFrequencyMasking(unittest.TestCase):
    """Tests for FrequencyMaskingAug on mel spectrograms."""

    @classmethod
    def setUpClass(cls):
        env_config_path = os.path.abspath(os.path.join(
            os.path.dirname(__file__), '..', '..', '..', '.env'))
        load_dotenv(env_config_path)
        # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm
        cls.sample_wav_file = os.path.join(
            os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav'
        )

    def test_empty_input(self):
        """An empty spectrogram passes through untouched."""
        empty = np.array([])
        aug = nas.FrequencyMaskingAug()
        result = aug.augment(empty)
        self.assertTrue(np.array_equal(np.array([]), result))

    def test_no_change_source(self):
        """The augmented copy must differ somewhere from the source."""
        spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128)
        aug = nas.FrequencyMaskingAug()
        augmented = aug.augment(spectrogram)[0]

        self.assertFalse((spectrogram == augmented).all())

    def test_substitute(self):
        """Band f0 is zeroed in [time_start, time_end) and intact outside it."""
        spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128)
        aug = nas.FrequencyMaskingAug(stateless=False)

        augmented = aug.augment(spectrogram)[0]

        # The source row f0 has no zeros, so any zeros below come from masking.
        self.assertEqual(len(spectrogram[aug.f0]), np.count_nonzero(spectrogram[aug.f0]))
        # Inside the masked window: all zero.
        self.assertEqual(0, np.count_nonzero(augmented[aug.f0][aug.time_start:aug.time_end]))
        # Outside the window: no zeros introduced on either side.
        self.assertEqual(0, len(np.where(augmented[aug.f0][:aug.time_start] == 0)[0]))
        self.assertEqual(0, len(np.where(augmented[aug.f0][aug.time_end:] == 0)[0]))
aug.time_end:] 42 | self.assertTrue(comparison.all()) 43 | -------------------------------------------------------------------------------- /test/augmenter/spectrogram/test_spectrogram.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | from nlpaug.util import AudioLoader 6 | import nlpaug.augmenter.spectrogram as nas 7 | 8 | 9 | class TestFrequencyMasking(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 16 | cls.sample_wav_file = os.path.join( 17 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 18 | ) 19 | 20 | def test_multi_thread(self): 21 | mel_spectrogram = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128) 22 | n = 3 23 | augs = [ 24 | nas.FrequencyMaskingAug(), 25 | nas.TimeMaskingAug() 26 | ] 27 | 28 | for num_thread in [1, 3]: 29 | for aug in augs: 30 | augmented_data = aug.augment(mel_spectrogram, n=n, num_thread=num_thread) 31 | self.assertEqual(len(augmented_data), n) 32 | 33 | def test_zone_parameter(self): 34 | aug = nas.LoudnessAug(zone=(0, 1)) 35 | aug = nas.LoudnessAug(zone=(0.5, 0.7)) 36 | aug = nas.LoudnessAug(zone=(0.6, 1)) 37 | 38 | with self.assertRaises(ValueError) as context: 39 | aug = nas.LoudnessAug(zone=(-1, 1)) 40 | self.assertTrue('Lower bound of zone is smaller than' in str(context.exception)) 41 | 42 | with self.assertRaises(ValueError) as context: 43 | aug = nas.LoudnessAug(zone=(0, 1.2)) 44 | self.assertTrue('Upper bound of zone is larger than' in str(context.exception)) 45 | 46 | def test_coverage_parameter(self): 47 | aug = nas.LoudnessAug(coverage=0) 48 | aug = nas.LoudnessAug(coverage=0.5) 49 | aug = nas.LoudnessAug(coverage=1) 50 | 51 | with 
self.assertRaises(ValueError) as context: 52 | aug = nas.LoudnessAug(coverage=-1) 53 | self.assertTrue('Coverage value should be between than 0 and 1 while' in str(context.exception)) 54 | 55 | with self.assertRaises(ValueError) as context: 56 | aug = nas.LoudnessAug(coverage=1.1) 57 | self.assertTrue('Coverage value should be between than 0 and 1 while' in str(context.exception)) 58 | -------------------------------------------------------------------------------- /test/augmenter/spectrogram/test_time_masking.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | import numpy as np 5 | 6 | from nlpaug.util import AudioLoader 7 | import nlpaug.augmenter.spectrogram as nas 8 | 9 | 10 | class TestTimeMasking(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | env_config_path = os.path.abspath(os.path.join( 14 | os.path.dirname(__file__), '..', '..', '..', '.env')) 15 | load_dotenv(env_config_path) 16 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 17 | cls.sample_wav_file = os.path.join( 18 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 19 | ) 20 | cls.num_of_freq_channel = 128 21 | 22 | def test_no_change_source(self): 23 | data = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=128) 24 | aug = nas.TimeMaskingAug() 25 | aug_data = aug.augment(data) 26 | 27 | comparison = data == aug_data 28 | self.assertFalse(comparison.all()) 29 | 30 | def test_substitute(self): 31 | data = AudioLoader.load_mel_spectrogram(self.sample_wav_file, n_mels=self.num_of_freq_channel) 32 | aug = nas.TimeMaskingAug(stateless=False) 33 | 34 | aug_data = aug.augment(data) 35 | aug_audio = aug_data[0] 36 | 37 | self.assertEqual(len(data[:, aug.t0]), np.count_nonzero(data[:, aug.t0])) 38 | self.assertEqual(0, np.count_nonzero(aug_audio[:, aug.t0])) 39 | 
-------------------------------------------------------------------------------- /test/augmenter/test_audio_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.audio as naa 7 | from nlpaug.util.audio import AudioLoader 8 | 9 | 10 | class TestAudioAugmenter(unittest.TestCase): 11 | @classmethod 12 | def setUpClass(cls): 13 | env_config_path = os.path.abspath(os.path.join( 14 | os.path.dirname(__file__), '..', '..', '.env')) 15 | load_dotenv(env_config_path) 16 | # https://freewavesamples.com/yamaha-v50-rock-beat-120-bpm 17 | cls.sample_wav_file = os.path.join( 18 | os.environ.get("TEST_DIR"), 'res', 'audio', 'Yamaha-V50-Rock-Beat-120bpm.wav' 19 | ) 20 | cls.audio, cls.sampling_rate = AudioLoader.load_audio(cls.sample_wav_file) 21 | 22 | cls.audio_augs = [ 23 | naa.CropAug(sampling_rate=cls.sampling_rate), 24 | naa.SpeedAug(), 25 | ] 26 | 27 | def test_augmenter_n_output(self): 28 | n = 3 29 | for aug in self.audio_augs: 30 | augmented_audios = aug.augment(self.audio, n=n) 31 | self.assertEqual(len(augmented_audios), n) 32 | for augmented_audio in augmented_audios: 33 | self.assertFalse(np.array_equal(augmented_audio, self.audio)) 34 | 35 | data = [self.audio, self.audio, self.audio] 36 | for aug in self.audio_augs: 37 | augmented_audios = aug.augment(data, n=1) 38 | self.assertEqual(len(augmented_audios), len(data)) 39 | for d, augmented_audio in zip(data, augmented_audios): 40 | self.assertFalse(np.array_equal(augmented_audio, d)) 41 | 42 | def test_augmenter_n_output_thread(self): 43 | n = 3 44 | for aug in self.audio_augs: 45 | augmented_audios = aug.augment([self.audio]*2, n=n, num_thread=n) 46 | self.assertGreater(len(augmented_audios), 1) 47 | for augmented_audio in augmented_audios: 48 | self.assertFalse(np.array_equal(augmented_audio, self.audio)) 49 | 
-------------------------------------------------------------------------------- /test/augmenter/test_base_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | from nlpaug import Augmenter 7 | 8 | 9 | class TestBaseAugmenter(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | cls.aug = Augmenter(name='base', method='flow', action='insert', 17 | aug_min=1, aug_max=10, aug_p=0.5) 18 | 19 | def test_generate_aug_cnt(self): 20 | self.assertEqual(0, self.aug.generate_aug_cnt(0)) 21 | self.assertEqual(1, self.aug.generate_aug_cnt(1)) 22 | self.assertGreater(self.aug.generate_aug_cnt(10), 1) 23 | -------------------------------------------------------------------------------- /test/augmenter/test_text_augmenter.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import torch 4 | import numpy as np 5 | from dotenv import load_dotenv 6 | 7 | import nlpaug.augmenter.char as nac 8 | import nlpaug.augmenter.word as naw 9 | import nlpaug.augmenter.sentence as nas 10 | 11 | 12 | class TestTextAugmenter(unittest.TestCase): 13 | @classmethod 14 | def setUpClass(cls): 15 | env_config_path = os.path.abspath(os.path.join( 16 | os.path.dirname(__file__), '..', '..', '.env')) 17 | load_dotenv(env_config_path) 18 | 19 | cls.augs = [ 20 | nac.RandomCharAug(), 21 | naw.ContextualWordEmbsAug(), 22 | nas.ContextualWordEmbsForSentenceAug() 23 | ] 24 | 25 | def test_augmenter_n_output(self): 26 | text = 'The quick brown fox jumps over the lazy dog' 27 | n = 3 28 | for aug in self.augs: 29 | augmented_texts = aug.augment(text, n=n) 30 | self.assertGreater(len(augmented_texts), 1) 31 | for augmented_text in augmented_texts: 32 
| self.assertNotEqual(augmented_text, text) 33 | 34 | for aug in self.augs: 35 | augmented_texts = aug.augment([text]*2, n=1, num_thread=1) 36 | self.assertGreater(len(augmented_texts), 1) 37 | for augmented_text in augmented_texts: 38 | self.assertNotEqual(augmented_text, text) 39 | 40 | def test_augmenter_n_output_thread(self): 41 | text = 'The quick brown fox jumps over the lazy dog' 42 | n = 3 43 | for aug in self.augs: 44 | augmented_texts = aug.augment([text]*2, n=n, num_thread=n) 45 | self.assertGreater(len(augmented_texts), 1) 46 | for augmented_text in augmented_texts: 47 | self.assertNotEqual(augmented_text, text) 48 | 49 | def test_multiprocess_gpu(self): 50 | text = 'The quick brown fox jumps over the lazy dog' 51 | n = 3 52 | if torch.cuda.is_available(): 53 | aug = naw.ContextualWordEmbsAug(force_reload=True, device='cuda') 54 | 55 | augmented_texts = aug.augment(text, n=n, num_thread=n) 56 | self.assertGreater(len(augmented_texts), 1) 57 | for augmented_text in augmented_texts: 58 | self.assertNotEqual(augmented_text, text) 59 | 60 | self.assertTrue(True) 61 | 62 | def test_get_aug_range_idxes(self): 63 | aug = naw.RandomWordAug() 64 | self.assertTrue(len(aug._get_aug_range_idxes([])) == 0) -------------------------------------------------------------------------------- /test/augmenter/word/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/augmenter/word/__init__.py -------------------------------------------------------------------------------- /test/augmenter/word/test_antonym.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | import nlpaug.augmenter.word as naw 6 | 7 | 8 | class TestAntonym(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | env_config_path = 
os.path.abspath(os.path.join( 12 | os.path.dirname(__file__), '..', '..', '..', '.env')) 13 | load_dotenv(env_config_path) 14 | 15 | cls.augs = [ 16 | naw.AntonymAug() 17 | ] 18 | 19 | def test_substitute(self): 20 | texts = [ 21 | 'Older people feel more youthful when they also feel in control.', 22 | 'Good bad', 23 | 'Heart patients may benefit more from exercise than healthy people.', 24 | 'Beer first or wine, either way might not be fine.' 25 | ] 26 | 27 | for aug in self.augs: 28 | for text in texts: 29 | for _ in range(5): 30 | augmented_data = aug.augment(text) 31 | augmented_text = augmented_data[0] 32 | self.assertNotEqual(text, augmented_text) 33 | 34 | def test_unable_to_substitute(self): 35 | texts = [ 36 | 'Insomnia, sleep apnea diagnoses up sharply in U.S. Army.' 37 | ] 38 | 39 | for aug in self.augs: 40 | for text in texts: 41 | augmented_data = aug.augment(text) 42 | augmented_text = augmented_data[0] 43 | self.assertEqual(text, augmented_text) 44 | 45 | def test_skip_punctuation(self): 46 | text = '. . . . ! ? 
# @' 47 | 48 | for aug in self.augs: 49 | augmented_data = aug.augment(text) 50 | augmented_text = augmented_data[0] 51 | self.assertEqual(text, augmented_text) 52 | -------------------------------------------------------------------------------- /test/augmenter/word/test_spelling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | from dotenv import load_dotenv 4 | 5 | import nlpaug.augmenter.word as naw 6 | 7 | 8 | class TestSpelling(unittest.TestCase): 9 | @classmethod 10 | def setUpClass(cls): 11 | env_config_path = os.path.abspath(os.path.join( 12 | os.path.dirname(__file__), '..', '..', '..', '.env')) 13 | load_dotenv(env_config_path) 14 | 15 | cls.model_dir = os.path.join(os.environ.get("PACKAGE_DIR"), 'res', 'word', 'spelling') 16 | 17 | def test_read_default_dict(self): 18 | text = 'abcdef' 19 | 20 | aug = naw.SpellingAug() 21 | self.assertTrue(aug.model.dict_path) 22 | aug.augment(text) 23 | self.assertTrue(True) 24 | 25 | def test_oov(self): 26 | text = 'aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa' 27 | 28 | aug = naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt')) 29 | augmented_data = aug.augment(text) 30 | augmented_text = augmented_data[0] 31 | 32 | self.assertEqual(text, augmented_text) 33 | 34 | def test_substitute(self): 35 | texts = [ 36 | 'The quick brown fox jumps over the lazy dog' 37 | ] 38 | 39 | aug = naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt')) 40 | 41 | for text in texts: 42 | self.assertLess(0, len(text)) 43 | augmented_data = aug.augment(text) 44 | augmented_text = augmented_data[0] 45 | 46 | self.assertNotEqual(text, augmented_text) 47 | 48 | self.assertLess(0, len(texts)) 49 | 50 | def test_substitute_stopwords(self): 51 | texts = [ 52 | 'The quick brown fox jumps over the lazy dog' 53 | ] 54 | 55 | stopwords = [t.lower() for t in texts[0].split(' ')[:3]] 56 | aug_n = 3 57 | 58 | aug = 
naw.SpellingAug(dict_path=os.path.join(self.model_dir, 'spelling_en.txt'), stopwords=stopwords) 59 | 60 | for text in texts: 61 | self.assertLess(0, len(text)) 62 | augmented_data = aug.augment(text) 63 | augmented_text = augmented_data[0] 64 | 65 | augmented_tokens = aug.tokenizer(augmented_text) 66 | tokens = aug.tokenizer(text) 67 | 68 | augmented_cnt = 0 69 | 70 | for token, augmented_token in zip(tokens, augmented_tokens): 71 | if token.lower() in stopwords and len(token) > aug_n: 72 | self.assertEqual(token.lower(), augmented_token) 73 | else: 74 | augmented_cnt += 1 75 | 76 | self.assertGreater(augmented_cnt, 0) 77 | 78 | self.assertLess(0, len(texts)) 79 | -------------------------------------------------------------------------------- /test/augmenter/word/test_split.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | 3 | import nlpaug.augmenter.word as naw 4 | 5 | 6 | class TestSplit(unittest.TestCase): 7 | def test_split(self): 8 | texts = [ 9 | 'The quick brown fox jumps over the lazy dog' 10 | ] 11 | aug = naw.SplitAug() 12 | 13 | for text in texts: 14 | augmented_data = aug.augment(text) 15 | augmented_text = augmented_data[0] 16 | 17 | self.assertLess(len(text), len(augmented_text)) 18 | 19 | def test_split_min_char(self): 20 | texts = [ 21 | 'quick brown' 22 | ] 23 | aug = naw.SplitAug(min_char=6) 24 | 25 | for text in texts: 26 | augmented_data = aug.augment(text) 27 | augmented_text = augmented_data[0] 28 | 29 | self.assertEqual(text, augmented_text) 30 | -------------------------------------------------------------------------------- /test/flow/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/flow/__init__.py -------------------------------------------------------------------------------- /test/flow/test_sometimes.py: 
class TestSometimes(unittest.TestCase):
    """Tests for the Sometimes flow, which applies augmenters probabilistically."""

    def test_dry_run(self):
        """Augmenting an empty list returns an empty list."""
        seq = naf.Sometimes()
        results = seq.augment([])
        self.assertEqual(0, len(results))

    def test_single_action(self):
        """A one-augmenter Sometimes flow changes at least one text.

        Bug fix: ``flow.augment`` returns a list of augmented texts, so the
        old ``text != augmented_text`` compared a str against a list and was
        always True, making the test vacuous.  Compare against the first
        element instead, consistent with the rest of the test suite.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584 s@#'
        ]

        # Since prob may be low and causing do-nothing runs, retry 5 times.
        at_least_one_not_equal = False
        for _ in range(0, 5):
            flow = naf.Sometimes([nac.RandomCharAug(action=Action.INSERT)], aug_p=0.6)
            for text in texts:
                augmented_text = flow.augment(text)[0]

                if text != augmented_text:
                    at_least_one_not_equal = True

                self.assertLess(0, len(text))

            if at_least_one_not_equal:
                break

        self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(texts))

    def test_multiple_actions(self):
        """Multi-augmenter Sometimes flows change at least one text each.

        Same fix as ``test_single_action``: compare against the first element
        of the returned list rather than the list itself.
        """
        texts = [
            'The quick brown fox jumps over the lazy dog',
            'Zology raku123456 fasdasd asd4123414 1234584'
        ]

        flows = [
            naf.Sometimes([nac.RandomCharAug(action=Action.INSERT),
                           nac.RandomCharAug(action=Action.INSERT),
                           nac.RandomCharAug(action=Action.DELETE)],
                          aug_p=0.8),
            naf.Sometimes(
                [nac.OcrAug(), nac.KeyboardAug(aug_char_min=1),
                 nac.RandomCharAug(action=Action.SUBSTITUTE, aug_char_min=1, aug_char_p=0.6, aug_word_p=0.6),
                 nac.RandomCharAug(action=Action.INSERT), nac.RandomCharAug(action=Action.DELETE)],
                aug_p=0.6)
        ]

        # Since prob may be low and causing do-nothing runs, retry 5 times.
        for flow in flows:
            at_least_one_not_equal = False
            for _ in range(0, 5):
                for text in texts:
                    self.assertLess(0, len(text))
                    augmented_text = flow.augment(text)[0]

                    if text != augmented_text:
                        at_least_one_not_equal = True

                    self.assertLess(0, len(text))

                if at_least_one_not_equal:
                    break

            self.assertTrue(at_least_one_not_equal)
        self.assertLess(0, len(flows))
        self.assertLess(0, len(texts))
test_bogus_fasttext_loading(self): 16 | test_file = os.path.join(os.environ.get("PACKAGE_DIR"), 'res', 'text', 'bogus_fasttext.vec') 17 | 18 | # Change to not supporting incorrect format file after switching to use gensim package 19 | with self.assertRaises(Exception) as error: 20 | fasttext = nmw.Fasttext() 21 | fasttext.read(test_file) 22 | self.assertIn('cannot copy sequence with size 11 to array axis with dimension 10', str(error.exception)) 23 | 24 | # for word in fasttext.get_vocab(): 25 | # self.assertSequenceEqual(list(fasttext.model[word]), expected_vector) 26 | 27 | # self.assertSequenceEqual(["test1", "test2", "test_3", "test 4", "test -> 5"], fasttext.get_vocab()) 28 | 29 | # self.assertEqual(len(fasttext.get_vocab()), 5) 30 | -------------------------------------------------------------------------------- /test/profiler.py: -------------------------------------------------------------------------------- 1 | import nlpaug, transformers, torch, fairseq, nltk 2 | from platform import python_version 3 | import nlpaug.augmenter.audio as naa 4 | import nlpaug.augmenter.char as nac 5 | import nlpaug.augmenter.word as naw 6 | import nlpaug.augmenter.sentence as nas 7 | 8 | from pyinstrument import Profiler 9 | 10 | profiler = Profiler() 11 | 12 | def main(): 13 | model_paths = [ 14 | # 'distilbert-base-uncased', 15 | 'bert-base-uncased', 16 | # 'bert-base-cased', 17 | # 'xlnet-base-cased', 18 | # 'roberta-base', 19 | # 'distilroberta-base' 20 | ] 21 | for model_path in model_paths: 22 | print('-----------------:', model_path) 23 | aug = naw.ContextualWordEmbsAug(model_path=model_path) 24 | text = 'The quick brown fox jumps over the lazaaaaaaaaay dog' 25 | augmented_text = aug.augment([text]*2) 26 | # print(augmented_text) 27 | 28 | 29 | if __name__ == '__main__': 30 | print('python_version:{}'.format(python_version())) 31 | print('nlpaug:{}'.format(nlpaug.__version__)) 32 | print('transformers:{}'.format(transformers.__version__)) 33 | 
print('torch:{}'.format(torch.__version__)) 34 | print('fairseq:{}'.format(fairseq.__version__)) 35 | print('nltk:{}'.format(nltk.__version__)) 36 | 37 | # yappi.set_clock_type("cpu") # Use set_clock_type("wall") for wall time 38 | # yappi.start() 39 | profiler.start() 40 | main() 41 | profiler.stop() 42 | print(profiler.output_text(unicode=True, color=True)) 43 | # yappi.get_func_stats().print_all() 44 | # yappi.get_thread_stats().print_all() -------------------------------------------------------------------------------- /test/profiling/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/profiling/__init__.py -------------------------------------------------------------------------------- /test/profiling/sentence/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/profiling/sentence/__init__.py -------------------------------------------------------------------------------- /test/profiling/sentence/test_context_word_embs_sentence_profiling.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import time 4 | from dotenv import load_dotenv 5 | 6 | import nlpaug.augmenter.sentence as nas 7 | 8 | 9 | class TestContextualWordEmbsAugProfiling(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | cls.text = 'The quick brown fox jumps over the lazy dog.' 
17 | 18 | def test_optimize(self): 19 | model_paths = ['gpt2', 'distilgpt2'] 20 | device = 'cpu' 21 | enable_optimize = {'external_memory': 1024, 'return_proba': True} 22 | disable_optimize = {'external_memory': 0, 'return_proba': True} 23 | epoch = 10 24 | 25 | for model_path in model_paths: 26 | # Optimized 27 | durations = [] 28 | aug = nas.ContextualWordEmbsForSentenceAug( 29 | model_path=model_path, device=device, optimize=enable_optimize, force_reload=True) 30 | for i in range(epoch): 31 | start_dt = time.monotonic() 32 | for j in range(epoch): 33 | aug.augment(self.text) 34 | end_dt = time.monotonic() 35 | durations.append(round(end_dt-start_dt, 2)) 36 | 37 | optimized_total_duration = sum(durations) 38 | optimized_average_duration = round(optimized_total_duration/len(durations), 2) 39 | 40 | # No optimized 41 | durations = [] 42 | aug.model.optimize = disable_optimize 43 | for _ in range(epoch): 44 | start_dt = time.monotonic() 45 | for _ in range(epoch): 46 | aug.augment(self.text) 47 | end_dt = time.monotonic() 48 | durations.append(round(end_dt - start_dt, 2)) 49 | 50 | no_optimized_total_duration = sum(durations) 51 | no_optimized_average_duration = round(no_optimized_total_duration / len(durations), 2) 52 | 53 | print('Model:{}, Optimized: {}({}), No Optimized: {}({})'.format( 54 | model_path, optimized_total_duration, optimized_average_duration, 55 | no_optimized_total_duration, no_optimized_average_duration 56 | )) 57 | 58 | self.assertGreater(no_optimized_total_duration, optimized_total_duration) 59 | self.assertGreater(no_optimized_average_duration, optimized_average_duration) 60 | -------------------------------------------------------------------------------- /test/profiling/word/profile_context_word_embs.py: -------------------------------------------------------------------------------- 1 | import datetime 2 | 3 | def run_core(): 4 | print(datetime.datetime.now(), 'before import') 5 | import nlpaug.augmenter.word as naw 6 | 7 | 
print(datetime.datetime.now(), 'before init') 8 | aug = naw.ContextualWordEmbsAug(model_path='bert-base-uncased', model_type="bert", use_custom_api=True) 9 | text = 'The quick brown fox jumps over the lazy dog.' 10 | print(datetime.datetime.now(), 'before augment') 11 | aug.augment([text] * 2) 12 | print(datetime.datetime.now(), 'done') 13 | 14 | if __name__ == '__main__': 15 | run_core() 16 | -------------------------------------------------------------------------------- /test/res/audio/Pink_noise.ogg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/res/audio/Pink_noise.ogg -------------------------------------------------------------------------------- /test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/res/audio/Yamaha-V50-Rock-Beat-120bpm.wav -------------------------------------------------------------------------------- /test/res/common/sample.json: -------------------------------------------------------------------------------- 1 | { 2 | "0": [ 3 | "3" 4 | ] 5 | } -------------------------------------------------------------------------------- /test/run_profile_context_word_embs.sh: -------------------------------------------------------------------------------- 1 | py-spy record -o profile.svg -- python ./test/profiling/word/profile_context_word_embs.py -------------------------------------------------------------------------------- /test/run_profile_import.sh: -------------------------------------------------------------------------------- 1 | python -X importtime -c 'import nlpaug' 2> nlpaug-imports.log -------------------------------------------------------------------------------- /test/util/__init__.py: 
-------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/util/__init__.py -------------------------------------------------------------------------------- /test/util/selection/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/makcedward/nlpaug/23800cbb9632c7fc8c4a88d46f9c4ecf68a96299/test/util/selection/__init__.py -------------------------------------------------------------------------------- /test/util/text/test_tokenizer.py: -------------------------------------------------------------------------------- 1 | import unittest 2 | import os 3 | import numpy as np 4 | from dotenv import load_dotenv 5 | 6 | from nlpaug.util.text.tokenizer import Tokenizer 7 | 8 | 9 | class TestTokenizer(unittest.TestCase): 10 | @classmethod 11 | def setUpClass(cls): 12 | env_config_path = os.path.abspath(os.path.join( 13 | os.path.dirname(__file__), '..', '..', '.env')) 14 | load_dotenv(env_config_path) 15 | 16 | def test_tokenizer(self): 17 | text = 'The quick brown fox jumps over the lazy dog?' 18 | 19 | tokens = Tokenizer.tokenizer(text) 20 | expected_tokens = ['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '?'] 21 | self.assertEqual(tokens, expected_tokens) 22 | 23 | 24 | def test_reverse_tokenizer(self): 25 | text = 'The quick (brown) [fox] {jumps} over the lazy dog?' 26 | 27 | tokens = Tokenizer.tokenizer(text) 28 | self.assertEqual(text, Tokenizer.reverse_tokenizer(tokens)) --------------------------------------------------------------------------------