├── .github └── workflows │ ├── doc-build.yml │ ├── docker_pull.yml │ ├── docker_tts_sdp_test.yml │ ├── importmanager.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __init__.py ├── dataset_configs ├── arabic │ ├── everyayah │ │ └── config.yaml │ ├── fleurs │ │ └── config.yaml │ ├── masc │ │ ├── config.yaml │ │ └── config_filter_noisy_train.yaml │ ├── mcv │ │ └── config.yaml │ ├── mediaspeech │ │ └── config.yaml │ └── readme.md ├── armenian │ ├── audio_books │ │ └── config.yaml │ ├── fleurs │ │ └── config.yaml │ ├── text_mcv │ │ └── config.yaml │ └── toloka │ │ ├── pipeline_get_final_res.yaml │ │ ├── pipeline_start.yaml │ │ └── pipeline_validate_answers.yaml ├── commoncrawl │ └── README.md ├── english │ ├── coraal │ │ └── config.yaml │ ├── earnings │ │ └── config.yaml │ ├── hifitts2 │ │ ├── config_22khz.yaml │ │ ├── config_44khz.yaml │ │ └── config_bandwidth.yaml │ ├── librispeech │ │ ├── all.yaml │ │ ├── config.yaml │ │ └── mini.yaml │ └── slr83 │ │ └── config.yaml ├── georgian │ └── mcv │ │ └── config.yaml ├── ipl │ ├── config.yaml │ └── nemo_run_config.yaml ├── italian │ ├── mcv │ │ └── config.yaml │ ├── mls │ │ ├── config.yaml │ │ └── config_nopc.yaml │ └── voxpopuli │ │ └── config.yaml ├── kazakh │ ├── ksc2 │ │ └── config.yaml │ ├── mcv │ │ └── config.yaml │ ├── slr102 │ │ └── config.yaml │ └── slr140 │ │ └── config.yaml ├── multilingual │ └── granary │ │ └── readme.md ├── portuguese │ ├── coraa │ │ └── config.yaml │ ├── mcv │ │ └── config.yaml │ ├── mls │ │ └── config.yaml │ └── mtedx │ │ └── config.yaml ├── spanish │ └── mls │ │ ├── config.yaml │ │ └── unique_processors │ │ ├── 1-100_roman_numeral_table.csv │ │ └── clean_roman_numerals.py ├── spanish_pc │ ├── fisher │ │ ├── config.yaml │ │ └── unique_processors │ │ │ └── create_initial_manifest_fisher_spanish.py │ ├── mcv12 │ │ └── config.yaml │ ├── mls │ │ └── config.yaml │ └── voxpopuli │ │ └── config.yaml ├── tts │ └── ytc │ │ └── 
config.yaml └── uzbek │ ├── fleurs │ └── config.yaml │ ├── mcv │ └── config.yaml │ └── uzbekvoice │ └── config.yaml ├── docker ├── Dockerfile └── Dockerfile.tts_sdp ├── docs ├── Makefile ├── README.md ├── gen_docs.py └── src │ ├── _static │ ├── css │ │ └── custom.css │ └── js │ │ └── pk_scripts.js │ ├── _templates │ └── layout.html │ ├── conf.py │ ├── favicon.ico │ ├── index.rst │ └── sdp │ ├── adding_processors.rst │ ├── api.rst │ ├── config_structure.rst │ └── existing_configs.rst ├── main.py ├── pytest.ini ├── requirements ├── docs.txt ├── huggingface.txt ├── ipl.txt ├── main.txt ├── tests.txt └── tts.txt ├── sdp ├── __init__.py ├── logging.py ├── processors │ ├── __init__.py │ ├── base_processor.py │ ├── datasets │ │ ├── __init__.py │ │ ├── commoncrawl │ │ │ ├── __init__.py │ │ │ ├── commoncrawl.py │ │ │ └── harv_utils.py │ │ ├── coraa │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── coraal │ │ │ ├── __init__.py │ │ │ ├── create_initial_manifest.py │ │ │ └── data_splits.py │ │ ├── earnings │ │ │ ├── __init__.py │ │ │ ├── apply_normalizations.py │ │ │ └── create_initial_manifest.py │ │ ├── fleurs │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── hifitts2 │ │ │ ├── __init__.py │ │ │ ├── download_dataset.py │ │ │ └── remove_failed_chapters.py │ │ ├── ksc2 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── lhotse.py │ │ ├── librispeech │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── masc │ │ │ ├── __init__.py │ │ │ ├── aggregate_segments.py │ │ │ ├── apply_reg_exp_on_vtt_entries.py │ │ │ ├── create_initial_manifest.py │ │ │ ├── get_caption_file_segments.py │ │ │ └── utils.py │ │ ├── mcv │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── mediaspeech │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── mls │ │ │ ├── __init__.py │ │ │ ├── create_initial_manifest.py │ │ │ └── restore_pc.py │ │ ├── mtedx │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── 
slr102 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── slr140 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── slr83 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── uzbekvoice │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── voxpopuli │ │ │ ├── __init__.py │ │ │ ├── create_initial_manifest.py │ │ │ └── normalize_from_non_pc_text.py │ │ └── ytc │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ ├── huggingface │ │ ├── __init__.py │ │ ├── create_initial_manifest.py │ │ └── speech_recognition.py │ ├── ipl │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ipl_processors.py │ │ └── nemo_run_processor.py │ ├── langs │ │ ├── __init__.py │ │ ├── arabic.py │ │ ├── armenian.py │ │ └── kazakh.py │ ├── modify_manifest │ │ ├── __init__.py │ │ ├── common.py │ │ ├── create_manifest.py │ │ ├── data_to_data.py │ │ ├── data_to_dropbool.py │ │ └── make_letters_uppercase_after_period.py │ ├── nemo │ │ ├── __init__.py │ │ ├── asr_inference.py │ │ ├── estimate_bandwidth.py │ │ ├── pc_inference.py │ │ └── transcribe_speech.py │ ├── toloka │ │ ├── __init__.py │ │ ├── accept_if.py │ │ ├── create_pool.py │ │ ├── create_project.py │ │ ├── create_sentence_set.py │ │ ├── create_task_set.py │ │ ├── download_responses.py │ │ └── reject_if.py │ └── tts │ │ ├── README.md │ │ ├── __init__.py │ │ ├── merge_alignment_diarization.py │ │ ├── metrics.py │ │ ├── nemo_asr_align.py │ │ ├── prepare_tts_segments.py │ │ ├── pyannote.py │ │ ├── split.py │ │ └── text.py ├── run_processors.py └── utils │ ├── __init__.py │ ├── apply_operators.py │ ├── bootstrap_estimates.py │ ├── common.py │ ├── edit_spaces.py │ ├── get_diff.py │ ├── import_manager.py │ ├── ipl_utils.py │ ├── metrics_computation.py │ ├── nemo_run_utils.py │ └── skills_utils.py ├── setup.py └── tests ├── README.md ├── __init__.py ├── prepare_test_data ├── prepare_coraa_data.py ├── prepare_fleurs_data.py ├── prepare_hifitts2_data.py ├── prepare_huggingface_data.py ├── 
prepare_ksc2_data.py ├── prepare_masc_data.py ├── prepare_mcv_data.py ├── prepare_mediaspeech_data.py ├── prepare_mls_data.py ├── prepare_mtedx_data.py ├── prepare_slr102_data.py ├── prepare_slr140_data.py ├── prepare_voxpopuli_data.py └── prepare_ytc_data.py ├── test_bootstrap_estimate.py ├── test_cfg_end_to_end_tests.py ├── test_cfg_runtime_tests.py ├── test_data_to_data.py ├── test_data_to_dropbool.py ├── test_import_manager.py ├── test_lhotse.py ├── test_manifest_chunking.py ├── test_modify_manifest.py ├── test_normalize_text.py ├── test_tts_sdp_end_to_end.py └── test_utils.py /.github/workflows/doc-build.yml: -------------------------------------------------------------------------------- 1 | name: Deploy static content to Pages 2 | 3 | on: 4 | push: 5 | branches: ["main"] 6 | 7 | # Allows you to run this workflow manually from the Actions tab 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | pages: write 13 | id-token: write 14 | 15 | # Allow only one concurrent deployment, skipping runs queued between the run in-progress and latest queued. 16 | # However, do NOT cancel in-progress runs as we want to allow these production deployments to complete. 
17 | concurrency: 18 | group: "pages" 19 | cancel-in-progress: false 20 | 21 | jobs: 22 | # Build docs and deploy to the website 23 | deploy: 24 | environment: 25 | name: github-pages 26 | url: ${{ steps.deployment.outputs.page_url }} 27 | runs-on: ubuntu-latest 28 | steps: 29 | - name: Checkout 30 | uses: actions/checkout@v4 31 | - name: Setup Pages 32 | uses: actions/configure-pages@v4 33 | - name: Install dependencies 34 | run: | 35 | python -m pip install --upgrade pip 36 | pip install -r requirements/docs.txt 37 | - name: Build docs with sphinx 38 | run: | 39 | cd docs && make clean && make html 40 | 41 | - name: Upload artifact 42 | uses: actions/upload-pages-artifact@v3 43 | with: 44 | path: 'docs/html' 45 | - name: Deploy to GitHub Pages 46 | id: deployment 47 | uses: actions/deploy-pages@v3 48 | -------------------------------------------------------------------------------- /.github/workflows/docker_pull.yml: -------------------------------------------------------------------------------- 1 | name: Docker Build and Test 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | workflow_dispatch: 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | build-and-test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v3 18 | 19 | - name: Set up Docker Buildx 20 | uses: docker/setup-buildx-action@v3 21 | 22 | - name: Build Docker image 23 | run: | 24 | docker build -t sdp-test-image:${{ github.sha }} -f docker/Dockerfile . 
25 | 26 | - name: Run test tests 27 | run: | 28 | docker run --rm \ 29 | -v ${{ github.workspace }}:/workspace \ 30 | -w /workspace \ 31 | sdp-test-image:${{ github.sha }} \ 32 | bash -c "python -m pytest tests/test_utils.py -v" 33 | 34 | - name: Get test results 35 | if: always() 36 | uses: actions/upload-artifact@v4 37 | with: 38 | name: test-results 39 | path: | 40 | pytest.xml 41 | coverage.xml 42 | 43 | - name: Docker cleanup 44 | run: docker system prune -af -------------------------------------------------------------------------------- /.github/workflows/docker_tts_sdp_test.yml: -------------------------------------------------------------------------------- 1 | name: SDP TTS Docker Build and Test 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | workflow_dispatch: 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | build-and-test: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Checkout code 17 | uses: actions/checkout@v3 18 | 19 | - name: Set up Docker Buildx 20 | uses: docker/setup-buildx-action@v3 21 | 22 | - name: Build Docker image 23 | run: | 24 | docker build -t sdp-test-image:${{ github.sha }} -f docker/Dockerfile.tts_sdp . 
25 | 26 | - name: Run sdp tts tests 27 | env: 28 | AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }} 29 | AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} 30 | HF_SECRET_KEY: ${{ secrets.HF_SECRET_KEY }} 31 | CLEAN_UP_TMP_PATH: 1 32 | run: | 33 | docker run --rm \ 34 | -v ${{ github.workspace }}:/workspace \ 35 | -w /workspace \ 36 | --shm-size=4g \ 37 | -e AWS_SECRET_KEY="${AWS_SECRET_KEY}" \ 38 | -e AWS_ACCESS_KEY="${AWS_ACCESS_KEY}" \ 39 | -e HF_SECRET_KEY="${HF_SECRET_KEY}" \ 40 | -e CLEAN_UP_TMP_PATH="${CLEAN_UP_TMP_PATH}" \ 41 | sdp-test-image:${{ github.sha }} \ 42 | bash -c "python -m pytest tests/test_tts_sdp_end_to_end.py -v" 43 | 44 | - name: Get test results 45 | if: always() 46 | uses: actions/upload-artifact@v4 47 | with: 48 | name: test-results 49 | path: | 50 | pytest.xml 51 | coverage.xml 52 | 53 | - name: Docker cleanup 54 | run: docker system prune -af -------------------------------------------------------------------------------- /.github/workflows/importmanager.yml: -------------------------------------------------------------------------------- 1 | name: ImportManager Test 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | workflow_dispatch: # Allows manual workflow triggering 7 | 8 | permissions: 9 | contents: read 10 | 11 | jobs: 12 | import-manager-check: 13 | runs-on: ubuntu-latest 14 | 15 | steps: 16 | - name: Check out the repository 17 | uses: actions/checkout@v3 18 | 19 | - name: Set up Python 3.10 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: "3.10" 23 | 24 | - name: Install dependencies 25 | run: | 26 | python -m pip install --upgrade pip 27 | pip install -r requirements/tests.txt 28 | pip install -r requirements/main.txt 29 | 30 | - name: Run ImportManager Test 31 | run: | 32 | python -m pytest tests/test_import_manager.py --maxfail=1 --disable-warnings 33 | 34 | 35 | 36 | -------------------------------------------------------------------------------- /.github/workflows/tests.yml: 
-------------------------------------------------------------------------------- 1 | name: SDP tests 2 | 3 | on: 4 | pull_request: 5 | branches: [ "main" ] 6 | 7 | # Allows you to run this workflow manually from the Actions tab 8 | workflow_dispatch: 9 | 10 | permissions: 11 | contents: read 12 | 13 | jobs: 14 | doc-building: 15 | runs-on: ubuntu-latest 16 | 17 | steps: 18 | - uses: actions/checkout@v3 19 | - name: Set up Python 3.10 20 | uses: actions/setup-python@v3 21 | with: 22 | python-version: "3.10" 23 | - name: Install dependencies 24 | run: | 25 | python -m pip install --upgrade pip 26 | pip install -r requirements/docs.txt 27 | pip install nemo_text_processing 28 | python -m pip cache purge 29 | # we are being quite strict here, but hopefully that will not be too inconvenient 30 | - name: Checking that documentation builds with no warnings and all links are working 31 | run: | 32 | cd docs && make clean && make html SPHINXOPTS="-b linkcheck -W --keep-going -n" 33 | 34 | no-nemo-tests: 35 | runs-on: ubuntu-latest 36 | 37 | steps: 38 | - uses: actions/checkout@v3 39 | - name: Set up Python 3.10 40 | uses: actions/setup-python@v3 41 | with: 42 | python-version: "3.10" 43 | - name: Install dependencies 44 | run: | 45 | python -m pip install --upgrade pip 46 | pip install -r requirements/main.txt 47 | pip install -r requirements/tests.txt 48 | pip install -r requirements/huggingface.txt 49 | pip install nemo_text_processing 50 | python -m pip cache purge 51 | - name: Checking that SDP can be imported and basic configs can be run without nemo 52 | # in the future this might fail if some runtime tests require nemo 53 | # in that case this test will need to be changed 54 | run: | 55 | python -m pytest tests/test_cfg_runtime_tests.py 56 | 57 | main-tests: 58 | runs-on: ubuntu-latest 59 | 60 | steps: 61 | - uses: actions/checkout@v3 62 | - name: Set up Python 3.10 63 | uses: actions/setup-python@v3 64 | with: 65 | python-version: "3.10" 66 | - name: Install 
dependencies 67 | run: | 68 | python -m pip install --upgrade pip 69 | pip install -r requirements/main.txt 70 | pip install -r requirements/tests.txt 71 | sudo apt-get update 72 | sudo apt-get install -y libsndfile1 ffmpeg sox libsox-fmt-mp3 73 | pip install pytorch_lightning 74 | pip install Cython wheel # need to pre-install to avoid error in nemo installation 75 | pip install nemo-toolkit[asr,nlp]==1.23.0 76 | pip install nemo_text_processing 77 | pip install -r requirements/huggingface.txt 78 | pip install certifi # this is needed to avoid problems with certificates [CORAAL] 79 | export SSL_CERT_FILE=$(python -m certifi) 80 | python -m pip cache purge 81 | 82 | 83 | - name: Run all tests 84 | env: 85 | AWS_SECRET_KEY: ${{ secrets.AWS_SECRET_KEY }} 86 | AWS_ACCESS_KEY: ${{ secrets.AWS_ACCESS_KEY }} 87 | CLEAN_UP_TMP_PATH: 1 88 | run: | 89 | 90 | wget https://uit.stanford.edu/sites/default/files/2023/10/11/incommon-rsa-ca2.pem # downloading cert manually [for CORAAL] 91 | sudo cp incommon-rsa-ca2.pem /usr/local/share/ca-certificates/incommon-rsa-server-ca-2.crt # [cert for CORAAL] 92 | sudo update-ca-certificates # [cert for CORAAL] 93 | set -o pipefail # this will make sure next line returns non-0 exit code if tests fail 94 | python -m pytest tests/ --junitxml=pytest.xml --ignore=tests/test_tts_sdp_end_to_end.py --cov-report=term-missing:skip-covered --cov=sdp --durations=30 -rs | tee pytest-coverage.txt 95 | 96 | 97 | # TODO: add some way to see if e2e tests were skipped 98 | # (which will be the case for PRs from public forks). 
99 | # below step is supposed to do that, but not working yet 100 | 101 | # - name: Pytest coverage comment 102 | # uses: MishaKav/pytest-coverage-comment@main 103 | # with: 104 | # pytest-coverage-path: ./pytest-coverage.txt 105 | # junitxml-path: ./pytest.xml 106 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # log and data files 2 | test_data 3 | workdir 4 | lightning_logs 5 | 6 | # unit test / coverage reports 7 | .hypothesis 8 | .coverage 9 | pytest.xml 10 | pytest-coverage.txt 11 | 12 | # vscode 13 | .vscode 14 | 15 | # docs 16 | docs/html 17 | docs/src/sdp/config-docs/ 18 | 19 | # venv 20 | asr-venv 21 | test-venv 22 | 23 | # byte-compiled / optimized / DLL files 24 | __pycache__ 25 | 26 | # egg-info 27 | sdp.egg-info 28 | 29 | # build 30 | build -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | 15 | default_language_version: 16 | python: python3 17 | 18 | ci: 19 | autofix_prs: true 20 | autoupdate_commit_msg: '[pre-commit.ci] pre-commit suggestions' 21 | autoupdate_schedule: quarterly 22 | 23 | repos: 24 | - repo: https://github.com/pre-commit/pre-commit-hooks 25 | rev: v4.3.0 26 | hooks: 27 | - id: check-yaml 28 | - id: check-case-conflict 29 | - id: detect-private-key 30 | - id: check-added-large-files 31 | args: ['--maxkb=1000'] 32 | - id: requirements-txt-fixer 33 | 34 | - repo: https://github.com/PyCQA/isort 35 | rev: 5.12.0 36 | hooks: 37 | - id: isort 38 | name: Format imports 39 | args: ["--profile", "black"] 40 | exclude: docs/ 41 | 42 | - repo: https://github.com/psf/black 43 | rev: 23.3.0 44 | hooks: 45 | - id: black 46 | name: Format code 47 | args: [--skip-string-normalization, --line-length=119] 48 | additional_dependencies: ['click==8.1.3'] 49 | -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- 1 | Contributions to SDP are welcome and encouraged! 2 | 3 | Before you create a PR, please double check the following: 4 | 5 | - All tests are passing. See [tests/README.md](tests/README.md) for details on how to run tests. 6 | - Code style conforms to our guidelines. The easiest way to ensure this is to 7 | install a pre-commit hook by running `pip install pre-commit` and then 8 | `pre-commit install`. 9 | 10 | TODO: information on what docs to add for new features -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Speech Data Processor (SDP) Toolkit 2 | 3 | The Speech Data Processor (SDP) is a toolkit designed to simplify the processing of speech datasets. It minimizes the boilerplate code required and allows for easy sharing of processing steps. 
SDP's philosophy is to represent processing operations as 'processor' classes, which take in a path to a NeMo-style data manifest as input (or a path to the raw data directory if you do not have a NeMo-style manifest to start with), apply some processing to it, and then save the output manifest file. 4 | 5 | ## Features 6 | 7 | - **Creating Manifests:** Generate manifests for your datasets. 8 | - **Running ASR Inference:** Automatically run ASR inference to remove utterances where the reference text differs greatly from ASR predictions. 9 | - **Text Transformations:** Apply text-based transformations to lines in the manifest. 10 | - **Removing Inaccurate Transcripts:** Remove lines from the manifest which may contain inaccurate transcripts. 11 | - **Custom Processors:** Write your own processor classes if the provided ones do not meet your needs. 12 | 13 | ## Installation 14 | 15 | SDP is officially supported for Python 3.10, but might work for other versions. 16 | 17 | 1. Clone the repository: 18 | 19 | ```bash 20 | git clone https://github.com/NVIDIA/NeMo-speech-data-processor.git 21 | cd NeMo-speech-data-processor 22 | ``` 23 | 2. Install dependencies: 24 | ```bash 25 | pip install -r requirements/main.txt 26 | ``` 27 | 28 | 3. Optional: If you need to use ASR, NLP parts, or NeMo Text Processing, follow the NeMo installation instructions: 29 | - [NeMo Installation](https://github.com/NVIDIA/NeMo) 30 | 31 | ## Example: 32 | 1. In this example we will load librispeech using SDP. 33 | * For downloading all available data - replace config.yaml with all.yaml 34 | * For mini dataset - replace with mini.yaml. 35 | ```bash 36 | python NeMo-speech-data-processor/main.py \ 37 | --config-path="dataset_configs/english/librispeech" \ 38 | --config-name="config.yaml" \ 39 | processors_to_run="0:" \ 40 | workspace_dir="librispeech_data_dir" 41 | ``` 42 | ## Usage 43 | 44 | 1. 
Create a Configuration YAML File: 45 | 46 | Here is a simplified example of a `config.yaml` file: 47 | 48 | ```yaml 49 | processors: 50 | - _target_: sdp.processors.CreateInitialManifestMCV 51 | output_manifest_file: "${data_split}_initial_manifest.json" 52 | language_id: es 53 | - _target_: sdp.processors.ASRInference 54 | pretrained_model: "stt_es_quartznet15x5" 55 | - _target_: sdp.processors.SubRegex 56 | regex_params_list: 57 | - {"pattern": "¡", "repl": "."} 58 | - {"pattern": "ó", "repl": "o"} 59 | test_cases: 60 | - {input: {text: "hey!"}, output: {text: "hey."}} 61 | - _target_: sdp.processors.DropNonAlphabet 62 | alphabet: "abcdefghijklmnopqrstuvwxyzáéiñóúüABCDEFGHIJKLMNOPQRSTUVWXYZÁÉÍÑÓÚÜ" 63 | test_cases: 64 | - {input: {text: "test Тест ¡"}, output: null} 65 | - {input: {text: "test"}, output: {text: "test"}} 66 | - _target_: sdp.processors.KeepOnlySpecifiedFields 67 | output_manifest_file: "${data_split}_final_manifest.json" 68 | fields_to_keep: 69 | - "audio_filepath" 70 | - "text" 71 | - "duration" 72 | ``` 73 | 74 | 2. Run the Processor: 75 | 76 | Use the following command to process your dataset: 77 | 78 | ```bash 79 | python <SDP_ROOT>/main.py \ 80 | --config-path="dataset_configs/<lang>/<dataset>/" \ 81 | --config-name="config.yaml" \ 82 | processors_to_run="all" \ 83 | data_split="train" \ 84 | workspace_dir="<dir_to_store_processed_data>" 85 | ``` 86 | 87 | ![SDP overview](https://github.com/NVIDIA/NeMo/releases/download/v1.17.0/sdp_overview_diagram.png) 88 | 89 | To learn more about SDP, have a look at our [documentation](https://nvidia.github.io/NeMo-speech-data-processor/). 90 | 91 | 92 | ## Contributing 93 | We welcome community contributions! Please refer to the [CONTRIBUTING.md](CONTRIBUTING.md) for the process. 94 | -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /dataset_configs/arabic/everyayah/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | Tarteel AI's Everyayah 3 | ################################### 4 | 5 | The config performs the following data processing. 6 | 1. Drops any data that contains symbols not in the supported alphabet. 7 | 2. Can be used to remove punctuation and diacritical marks. 8 | 3. Can be used to replace positional forms of Arabic letters with general unicodes. 9 | 4. Can be used to normalize Arabic ligatures. 10 | 11 | **Required arguments**. 12 | 13 | * **raw_data_dir**: path to the tarred dataset. 14 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 15 | * **data_split**: should be "train", "validation" or "test". 16 | * **remove_punctuation**: specify whether to remove punctuation or not. Should be "True" or "False". Defaults to False. 17 | * **remove_diacritics**: specify whether to remove diacritics or not. Should be "True" or "False". Defaults to True. 18 | * **remove_tatweel**: specify whether to remove tatweel marks or not. Should be "True" or "False". Defaults to True. 19 | * **normalize_ligature**: specify whether to normalize ligature or not. Should be "True" or "False". Defaults to True. 
20 | * **apply_nfkc**: Applies NFKC normalization to the text. Find more here https://docs.python.org/3/library/unicodedata.html#unicodedata.normalize. 21 | Defaults to True. 22 | * **min_duration**: minimal duration of segment in seconds. Defaults to 0.1s. 23 | * **max_duration**: maximal duration of segment in seconds. Defaults to 20s. 24 | 25 | **Output format**. 26 | 27 | This config dumps the final manifest at ``${workspace_dir}/${data_split}/manifest.json`` and wav files ``${workspace_dir}/${data_split}/audios``. 28 | The output manifest contains the following fields: 29 | 30 | * **audio_filepath (str)**: relative path to the audio files. 31 | * **text (str)**: transcription. 32 | * **duration (float)**: audio duration in seconds. 33 | 34 | processors_to_run: "0:" 35 | 36 | data_split: test # specify dataset type (clean_train, clean_test, ...) 37 | workspace_dir: ??? 38 | 39 | min_duration: 0.1 # minimal duration of the segment 40 | max_duration: 20.0 # maximal duration of the segment 41 | 42 | raw_data_dir: ${workspace_dir} 43 | already_downloaded: True 44 | 45 | remove_punctuation: False 46 | remove_diacritics: True 47 | remove_tatweel: True 48 | normalize_ligature: True 49 | apply_nfkc: True 50 | 51 | final_manifest: ${workspace_dir}/${data_split}/manifest.json 52 | 53 | processors: 54 | # 0 creating manifest {sample_id, audio_path} 55 | - _target_: sdp.processors.CreateInitialManifestHuggingFace 56 | dataset_name: tarteel-ai/everyayah 57 | data_split: ${data_split} 58 | target_samplerate: 16000 59 | resampled_audio_dir: ${workspace_dir}/${data_split}/audios 60 | raw_data_dir: ${raw_data_dir} 61 | already_downloaded: ${already_downloaded} 62 | output_manifest_file: ${workspace_dir}/${data_split}/manifest0.json 63 | 64 | # 1 Dropping too short and too long segments 65 | - _target_: sdp.processors.DropHighLowDuration 66 | high_duration_threshold: ${max_duration} 67 | low_duration_threshold: ${min_duration} 68 | duration_key: duration 69 | 
output_manifest_file: ${workspace_dir}/${data_split}/manifest1.json 70 | 71 | # 2 removing punctuation, diacritics, dotted letters and tatweel 72 | - _target_: sdp.processors.langs.arabic.ArabicTextPreprocessor 73 | input_text_key: text 74 | remove_diacritics: ${remove_diacritics} 75 | remove_punctuation: ${remove_punctuation} 76 | remove_tatweel: ${remove_tatweel} 77 | normalize_ligature: ${normalize_ligature} 78 | apply_nfkc: ${apply_nfkc} 79 | output_manifest_file: ${workspace_dir}/${data_split}/manifest2.json 80 | 81 | # 3 dropping non alphabetical symbols 82 | - _target_: sdp.processors.DropNonAlphabet 83 | alphabet: " \u0631\u0630\u062F\u062E\u062D\u062C\u062B\u062A\u0629\u0628\u0627\u0626\u0625\u0624\u0623\u0622\u0621\u064A\u0649\u0648\u0647\u0646\u0645\u0644\u0643\u0642\u0641\u063A\u0639\u0638\u0637\u0636\u0635\u0634\u0633\u0632\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652" 84 | output_manifest_file: ${final_manifest} 85 | -------------------------------------------------------------------------------- /dataset_configs/arabic/mediaspeech/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | MediaSpeech 3 | ############ 4 | The dataset must be downloaded manually beforehand. 5 | 6 | The config creates the initial manifest for the MediaSpeech dataset. 7 | 8 | **Required arguments**. 9 | 10 | * **raw_data_dir**: path to the tarred dataset. 11 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 12 | 13 | **Output format**. 14 | 15 | This config dumps the final manifest at ``${workspace_dir}/manifest.json`` and wav files ``${workspace_dir}/audios``. 16 | The output manifest contains the following fields: 17 | 18 | * **audio_filepath (str)**: relative path to the audio files. 19 | * **text (str)**: transcription. 20 | * **duration (float)**: audio duration in seconds. 21 | 22 | processors_to_run: "0:" 23 | 24 | workspace_dir: ?? 
25 | final_manifest: ${workspace_dir}/manifest.json 26 | 27 | processors: 28 | # 0 creating initial manifest 29 | - _target_: sdp.processors.CreateInitialManifestMediaSpeech 30 | raw_data_dir: ${workspace_dir} 31 | extract_archive_dir: ${workspace_dir} 32 | resampled_audios_dir: ${workspace_dir}/audios 33 | output_manifest_file: ${workspace_dir}/manifest0.json 34 | 35 | # 1 calculating durations 36 | - _target_: sdp.processors.GetAudioDuration 37 | audio_filepath_key: audio_filepath 38 | duration_key: duration 39 | output_manifest_file: ${workspace_dir}/manifest1.json 40 | 41 | # 2 dropping non alphabetical symbols 42 | - _target_: sdp.processors.DropNonAlphabet 43 | alphabet: " \u0631\u0630\u062F\u062E\u062D\u062C\u062B\u062A\u0629\u0628\u0627\u0626\u0625\u0624\u0623\u0622\u0621\u064A\u0649\u0648\u0647\u0646\u0645\u0644\u0643\u0642\u0641\u063A\u0639\u0638\u0637\u0636\u0635\u0634\u0633\u0632\u064B\u064C\u064D\u064E\u064F\u0650\u0651\u0652" 44 | output_manifest_file: ${workspace_dir}/manifest2.json 45 | 46 | # 3 changing paths to relative 47 | - _target_: sdp.processors.ChangeToRelativePath 48 | base_dir: ${workspace_dir} 49 | output_manifest_file: ${final_manifest} 50 | -------------------------------------------------------------------------------- /dataset_configs/arabic/readme.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | This folder is designated for Arabic speech processing configuration files, which will be added soon. It is associated with a forthcoming paper, which will detail the work done within this project. 4 | 5 | 6 | Note: This folder is a work in progress. -------------------------------------------------------------------------------- /dataset_configs/armenian/fleurs/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | FLEURS 3 | ###### 4 | This config can be used to prepare 5 | `FLEURS <https://huggingface.co/datasets/google/fleurs>`_ 6 | dataset in the NeMo format. 
7 | It produces a manifest for the dev split of the Armenian language. 8 | This config performs the following data processing. 9 | 10 | 1. Downloads FLEURS data 11 | 2. Calculates the length of wav files 12 | 13 | **Required arguments**. 14 | 15 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 16 | 17 | Note that you can customize any part of this config either directly or from command-line. 18 | 19 | **Output format** 20 | 21 | This config generates the following output manifest file: 22 | 23 | * ``${workspace_dir}/${data_split}_manifest.json`` - dev subset of the data. 24 | 25 | Output manifest contains the following keys: 26 | 27 | * **audio_filepath (str)**: relative path to the audio files. 28 | * **text (str)**: transcription (lower-case without punctuation). 29 | * **duration (float)**: audio duration in seconds. 30 | processors_to_run: all 31 | workspace_dir: ??? 32 | data_split: dev 33 | final_manifest: ${workspace_dir}/${data_split}_manifest.json 34 | 35 | processors: 36 | # creating manifest for armenian dev set 37 | - _target_: sdp.processors.CreateInitialManifestFleurs 38 | lang: "hy_am" 39 | split: ${data_split} 40 | raw_data_dir: ${workspace_dir}/raw_data 41 | 42 | - _target_: sdp.processors.GetAudioDuration 43 | audio_filepath_key: audio_filepath 44 | duration_key: duration 45 | output_manifest_file: ${final_manifest} 46 | -------------------------------------------------------------------------------- /dataset_configs/armenian/toloka/pipeline_get_final_res.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | Getting final results from Toloka 3 | ################################# 4 | 5 | This configuration represents the final stage of processing Armenian language datasets for the Toloka platform. 6 | It processes all accepted results from the Toloka pool and prepares the data for training by refining and resampling audio files and ensuring text formatting consistency. 
7 | 8 | **Stage Overview**: 9 | 10 | This stage includes the following steps: 11 | 1. Downloading all the ACCEPTED results from the Toloka platform. 12 | 2. Filtering out damaged audio files. 13 | 3. Resampling audio files to ensure compatibility with ASR models (16 kHz, mono channel). 14 | 4. Ensuring all utterances end with a proper Armenian end symbol; adding `:` if not. 15 | 5. Dropping all unnecessary fields, keeping only `text` and `audio_filepath` for training. 16 | 6. Calculating the audio duration for each utterance. 17 | 18 | **Required Arguments**: 19 | - `workspace_dir`: Specify the directory for storing intermediate and final output files. 20 | 21 | **Output Files**: 22 | - `${workspace_dir}/manifest-1.json`: Manifest of all accepted results. 23 | - `${workspace_dir}/manifest0.json`: Manifest after filtering out damaged audio files. 24 | - `${workspace_dir}/manifest1.json`: Manifest with resampled audio files. 25 | - `${workspace_dir}/manifest3.json`: Manifest with text formatting corrections. 26 | - `${workspace_dir}/manifest4.json`: Manifest with only the necessary fields (`text`, `audio_filepath`). 27 | - `${final_manifest}`: Final manifest with audio durations. 28 | 29 | processors_to_run: all 30 | workspace_dir: ??? 
31 | final_manifest: ${workspace_dir}/results.json 32 | 33 | processors: 34 | - _target_: sdp.processors.GetTolokaResults 35 | input_pool_file: ${workspace_dir}/taskpool.json 36 | input_data_file: ${workspace_dir}/data_file.json 37 | status: ACCEPTED 38 | output_dir: ${workspace_dir}/results 39 | output_manifest_file: ${workspace_dir}/manifest-1.json 40 | 41 | - _target_: sdp.processors.ASRFileCheck 42 | audio_filepath_key: audio_filepath 43 | workspace_dir: ${workspace_dir} 44 | corrupted_audio_dir: ${workspace_dir}/curr 45 | output_manifest_file: ${workspace_dir}/manifest0.json 46 | 47 | - _target_: sdp.processors.SoxConvert 48 | output_manifest_file: ${workspace_dir}/manifest1.json 49 | converted_audio_dir: ${workspace_dir}/16k 50 | input_audio_file_key: "audio_filepath" 51 | output_audio_file_key: "audio_filepath" 52 | output_format: "wav" 53 | rate: 16000 54 | channels: 1 55 | workspace_dir: ${workspace_dir} 56 | 57 | - _target_: sdp.processors.MakeSentence 58 | text_key: "text" 59 | end_symbol: ":" 60 | make_uppercase: True 61 | output_manifest_file: ${workspace_dir}/manifest3.json 62 | 63 | - _target_: sdp.processors.KeepOnlySpecifiedFields 64 | output_manifest_file: ${workspace_dir}/manifest4.json 65 | fields_to_keep: ["text", "audio_filepath"] 66 | 67 | - _target_: sdp.processors.GetAudioDuration 68 | audio_filepath_key: audio_filepath 69 | duration_key: duration 70 | output_manifest_file: ${final_manifest} 71 | 72 | 73 | -------------------------------------------------------------------------------- /dataset_configs/commoncrawl/README.md: -------------------------------------------------------------------------------- 1 | # Multilingual Labelled Speech Transcription Pipeline for Common Crawl 2 | 3 | -------------------------------------------------------------------------------- /dataset_configs/english/hifitts2/config_22khz.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | HiFiTTS-2 22kHz 3 | 
############### 4 | 5 | This config can be used to download the audio data for 6 | `HiFiTTS-2 22kHz `_ 7 | 8 | 1. Downloads HiFiTTS-2 audio from LibriVox. 9 | 2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they 10 | were removed from the website) are removed. 11 | 12 | **Required arguments**. 13 | 14 | * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored. 15 | 16 | Note that you can customize any part of this config either directly or from command-line. 17 | 18 | **Output format**. 19 | 20 | This config outputs 2 manifest files: 21 | 22 | * ``${workspace_dir}/errors_22khz.json`` - entries from the input chapters file which failed to download from LibriVox. 23 | * ``${workspace_dir}/manifest_filtered_22khz.json`` - input manifest file without utterances from failed chapters. 24 | 25 | processors_to_run: all 26 | workspace_dir: ??? 27 | manifest_filename: manifest_22khz.json 28 | output_filename: manifest_filtered_22khz.json 29 | chapter_filename: chapters_22khz.json 30 | error_filename: errors_22khz.json 31 | audio_dir_name: audio_22khz 32 | chapter_audio_dir_name: chapters 33 | sample_rate: 22050 34 | delete_chapter_files: true 35 | exit_on_error: false 36 | use_dask: false 37 | max_workers: 8 38 | chunksize: 50 39 | 40 | input_manifest_file: ${workspace_dir}/${manifest_filename} 41 | chapter_file: ${workspace_dir}/${chapter_filename} 42 | error_file: ${workspace_dir}/${error_filename} 43 | audio_dir: ${workspace_dir}/${audio_dir_name} 44 | chapter_dir: ${workspace_dir}/${chapter_audio_dir_name} 45 | final_manifest: ${workspace_dir}/${output_filename} 46 | 47 | processors: 48 | - _target_: sdp.processors.DownloadHiFiTTS2 49 | audio_dir: ${audio_dir} 50 | chapter_dir: ${chapter_dir} 51 | sample_rate: ${sample_rate} 52 | delete_chapter_files: ${delete_chapter_files} 53 | exit_on_error: ${exit_on_error} 54 | input_manifest_file: ${chapter_file} 55 | output_manifest_file: 
${error_file} 56 | use_dask: ${use_dask} 57 | max_workers: ${max_workers} 58 | chunksize: ${chunksize} 59 | 60 | - _target_: sdp.processors.RemovedFailedChapters 61 | input_manifest_file: ${input_manifest_file} 62 | output_manifest_file: ${final_manifest} 63 | error_file: ${error_file} 64 | -------------------------------------------------------------------------------- /dataset_configs/english/hifitts2/config_44khz.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | HiFiTTS-2 44kHz 3 | ################## 4 | 5 | This config can be used to download the audio data for 6 | `HiFiTTS-2 44kHz `_ 7 | 8 | 9 | 1. Downloads HiFiTTS-2 audio from LibriVox. 10 | 2. Outputs a new manifest in which LibriVox audiobook chapters which could not be downloaded (e.g. because they 11 | were removed from the website) are removed. 12 | 13 | **Required arguments**. 14 | 15 | * **workspace_dir**: specify the workspace folder where all audio files and manifests will be stored. 16 | 17 | Note that you can customize any part of this config either directly or from command-line. 18 | 19 | **Output format**. 20 | 21 | This config outputs 2 manifest files: 22 | 23 | * ``${workspace_dir}/errors_44khz.json`` - entries from the input chapters file which failed to download from LibriVox. 24 | * ``${workspace_dir}/manifest_filtered_44khz.json`` - input manifest file without utterances from failed chapters. 25 | 26 | processors_to_run: all 27 | workspace_dir: ??? 
28 | manifest_filename: manifest_44khz.json 29 | output_filename: manifest_filtered_44khz.json 30 | chapter_filename: chapters_44khz.json 31 | error_filename: errors_44khz.json 32 | audio_dir_name: audio_44khz 33 | chapter_audio_dir_name: chapters 34 | sample_rate: 44100 35 | delete_chapter_files: true 36 | exit_on_error: false 37 | use_dask: false 38 | max_workers: 8 39 | chunksize: 50 40 | 41 | input_manifest_file: ${workspace_dir}/${manifest_filename} 42 | chapter_file: ${workspace_dir}/${chapter_filename} 43 | error_file: ${workspace_dir}/${error_filename} 44 | audio_dir: ${workspace_dir}/${audio_dir_name} 45 | chapter_dir: ${workspace_dir}/${chapter_audio_dir_name} 46 | final_manifest: ${workspace_dir}/${output_filename} 47 | 48 | processors: 49 | - _target_: sdp.processors.DownloadHiFiTTS2 50 | audio_dir: ${audio_dir} 51 | chapter_dir: ${chapter_dir} 52 | sample_rate: ${sample_rate} 53 | delete_chapter_files: ${delete_chapter_files} 54 | exit_on_error: ${exit_on_error} 55 | input_manifest_file: ${chapter_file} 56 | output_manifest_file: ${error_file} 57 | use_dask: ${use_dask} 58 | max_workers: ${max_workers} 59 | chunksize: ${chunksize} 60 | 61 | - _target_: sdp.processors.RemovedFailedChapters 62 | input_manifest_file: ${input_manifest_file} 63 | output_manifest_file: ${final_manifest} 64 | error_file: ${error_file} 65 | -------------------------------------------------------------------------------- /dataset_configs/english/hifitts2/config_bandwidth.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | HiFiTTS-2 Bandwidth Estimation 3 | ############################## 4 | 5 | This config contains the bandwidth estimation code used for HiFiTTS and HiFiTTS-2. 6 | This config can be used to estimate bandwidth for any dataset. 
For HiFiTTS-2 bandwidth 7 | was estimated using the first 30 seconds of every audiobook chapter, but the estimate is still 8 | reasonably accurate if run over a shorter duration or with individual utterances. 9 | 10 | **Required arguments**. 11 | 12 | * **workspace_dir**: The workspace folder where all audio files and manifests are stored. 13 | * **audio_dir_name**: Folder in workspace containing audio files to estimate bandwidth of. 14 | * **input_manifest_filename**: Manifest file in workspace containing relative paths to audio. 15 | 16 | **Output format**. 17 | 18 | This config outputs a single manifest with the following field(s): 19 | 20 | * **bandwidth (int)**: Estimated bandwidth of the audio file. 21 | 22 | processors_to_run: all 23 | workspace_dir: ??? 24 | audio_dir_name: ??? 25 | input_manifest_filename: ??? 26 | output_manifest_filename: manifest_bandwidth.json 27 | audio_key: audio_filepath 28 | use_dask: false 29 | max_workers: 1 30 | chunksize: 1 31 | 32 | input_manifest_file: ${workspace_dir}/${input_manifest_filename} 33 | final_manifest: ${workspace_dir}/${output_manifest_filename} 34 | audio_dir: ${workspace_dir}/${audio_dir_name} 35 | 36 | processors: 37 | - _target_: sdp.processors.EstimateBandwidth 38 | input_manifest_file: ${input_manifest_file} 39 | output_manifest_file: ${final_manifest} 40 | audio_dir: ${audio_dir} 41 | input_audio_key: ${audio_key} 42 | use_dask: ${use_dask} 43 | max_workers: ${max_workers} 44 | chunksize: ${chunksize} 45 | -------------------------------------------------------------------------------- /dataset_configs/english/librispeech/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | Librispeech 3 | ########### 4 | 5 | This config can be used to prepare 6 | `Librispeech `_ 7 | dataset in the NeMo format. 8 | 9 | It produces manifests for the dev-clean split (for other splits, please configure). 
10 | The options are: 11 | 12 | - ``"dev-clean"`` 13 | - ``"dev-other"`` 14 | - ``"test-clean"`` 15 | - ``"test-other"`` 16 | - ``"train-clean-100"`` 17 | - ``"train-clean-360"`` 18 | - ``"train-other-500"`` 19 | - ``"dev-clean-2"`` 20 | - ``"train-clean-5"`` 21 | 22 | This config performs the following data processing. 23 | 24 | 1. Downloads Librispeech data 25 | 2. Converts flac files to wav files 26 | 3. Calculates the length of wav files 27 | 4. Makes capitalization lowercase 28 | 29 | **Required arguments**. 30 | 31 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 32 | 33 | Note that you can customize any part of this config either directly or from command-line. 34 | 35 | **Output format**. 36 | 37 | This config generates the following output manifest file: 38 | 39 | * ``${workspace_dir}/manifest.json`` - dev-clean subset of the data. 40 | 41 | Output manifest contains the following fields: 42 | 43 | * **audio_filepath (str)**: relative path to the audio files. 44 | * **text (str)**: transcription (lower-case without punctuation). 45 | * **duration (float)**: audio duration in seconds. 46 | 47 | processors_to_run: all 48 | workspace_dir: ??? 
49 | data_split: "dev-clean" 50 | final_manifest: ${workspace_dir}/manifest.json 51 | 52 | processors: 53 | # creating manifest for dev-clean set 54 | - _target_: sdp.processors.CreateInitialManifestLibrispeech 55 | split: ${data_split} 56 | raw_data_dir: ${workspace_dir}/raw_data 57 | 58 | - _target_: sdp.processors.SoxConvert 59 | converted_audio_dir: ${workspace_dir}/audio 60 | input_audio_file_key: "audio_filepath" 61 | output_audio_file_key: "audio_filepath" 62 | output_format: "wav" 63 | 64 | - _target_: sdp.processors.GetAudioDuration 65 | audio_filepath_key: audio_filepath 66 | duration_key: duration 67 | 68 | - _target_: sdp.processors.SubMakeLowercase 69 | output_manifest_file: ${final_manifest} 70 | -------------------------------------------------------------------------------- /dataset_configs/english/librispeech/mini.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | Librispeech (mini) 3 | ################## 4 | 5 | This config can be used to prepare 6 | `Librispeech mini `_ 7 | dataset in the NeMo format. 8 | 9 | It produces manifests for the mini split of Librispeech. 10 | 11 | This config performs the following data processing. 12 | 13 | 1. Downloads Librispeech data 14 | 2. Converts flac files to wav files 15 | 3. Calculates the length of wav files 16 | 4. Makes capitalization lowercase 17 | 18 | **Required arguments**. 19 | 20 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 21 | 22 | Note that you can customize any part of this config either directly or from command-line. 23 | 24 | **Output format**. 25 | 26 | This config generates 2 output manifest files: 27 | 28 | * ``${workspace_dir}/dev-clean-2.json`` - mini dev-clean subset of the data. 29 | * ``${workspace_dir}/train-clean-5.json`` - mini train-clean subset of the data. 
30 | 31 | Output manifest contains the following fields: 32 | 33 | * **audio_filepath (str)**: relative path to the audio files. 34 | * **text (str)**: transcription (lower-case without punctuation). 35 | * **duration (float)**: audio duration in seconds. 36 | 37 | processors_to_run: all 38 | workspace_dir: ??? 39 | 40 | processors: 41 | # creating manifest for mini dev-clean set 42 | - _target_: sdp.processors.CreateInitialManifestLibrispeech 43 | split: dev-clean-2 44 | raw_data_dir: ${workspace_dir}/raw_data 45 | 46 | - _target_: sdp.processors.SoxConvert 47 | converted_audio_dir: ${workspace_dir}/audio 48 | input_audio_file_key: "audio_filepath" 49 | output_audio_file_key: "audio_filepath" 50 | output_format: "wav" 51 | 52 | - _target_: sdp.processors.GetAudioDuration 53 | audio_filepath_key: audio_filepath 54 | duration_key: duration 55 | 56 | - _target_: sdp.processors.SubMakeLowercase 57 | output_manifest_file: ${workspace_dir}/dev-clean-2.json 58 | 59 | # creating manifest for mini train-clean set 60 | - _target_: sdp.processors.CreateInitialManifestLibrispeech 61 | split: train-clean-5 62 | raw_data_dir: ${workspace_dir}/raw_data 63 | 64 | - _target_: sdp.processors.SoxConvert 65 | converted_audio_dir: ${workspace_dir}/audio 66 | input_audio_file_key: "audio_filepath" 67 | output_audio_file_key: "audio_filepath" 68 | output_format: "wav" 69 | 70 | - _target_: sdp.processors.GetAudioDuration 71 | audio_filepath_key: audio_filepath 72 | duration_key: duration 73 | 74 | - _target_: sdp.processors.SubMakeLowercase 75 | output_manifest_file: ${workspace_dir}/train-clean-5.json -------------------------------------------------------------------------------- /dataset_configs/english/slr83/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | SLR83 3 | ##### 4 | 5 | This config can be used to prepare 6 | `UK and Ireland English Dialect (SLR83) `_ 7 | datasets in the NeMo format. 
The original data does not contain any 8 | splits, so we provide a custom way to split the data. 9 | See https://arxiv.org/abs/2210.03255 for details on the data splits. 10 | 11 | Note that SLR83 consists of 11 different accents and dialects and we do not 12 | combine them together. You will need to run this config 11 times with different 13 | command-line parameters to get all the datasets and if you want to combine 14 | them all together, this currently needs to be done manually. 15 | 16 | This config performs the following data processing. 17 | 18 | 1. Downloads and extracts the data from the official website. 19 | 2. Lower-cases all text and removes ``-`` characters (that's the only 20 | punctuation available in the transcription). 21 | 3. Drops all utterances with non-alphabet symbols. 22 | 4. Splits the data into train, dev or test, depending on the config parameters. 23 | 24 | **Required arguments**. 25 | 26 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 27 | * **data_split**: should be "train", "dev" or "test". 28 | * **dialect**: should be one of the following: 29 | 30 | * ``irish_english_male`` 31 | * ``midlands_english_female`` 32 | * ``midlands_english_male`` 33 | * ``northern_english_female`` 34 | * ``northern_english_male`` 35 | * ``scottish_english_female`` 36 | * ``scottish_english_male`` 37 | * ``southern_english_female`` 38 | * ``southern_english_male`` 39 | * ``welsh_english_female`` 40 | * ``welsh_english_male`` 41 | 42 | Note that you can customize any part of this config either directly or from command-line. 43 | 44 | **Output format**. 45 | 46 | This config dumps the final manifest at ``${workspace_dir}/${dialect}/${data_split}_manifest.json``. 47 | The output manifest contains the following fields: 48 | 49 | * **audio_filepath (str)**: relative path to the audio files. 50 | * **text (str)**: transcription (lower-case without punctuation). 51 | * **duration (float)**: audio duration in seconds. 
52 | 53 | 54 | processors_to_run: all 55 | data_split: ??? 56 | workspace_dir: ??? 57 | final_manifest: ${workspace_dir}/${dialect}/${data_split}_manifest.json 58 | dialect: irish_english_male 59 | 60 | processors: 61 | - _target_: sdp.processors.CreateInitialManifestSLR83 62 | dialect: ${dialect} 63 | raw_data_dir: ${workspace_dir}/${dialect}/raw_data 64 | 65 | - _target_: sdp.processors.SubMakeLowercase 66 | 67 | - _target_: sdp.processors.SubRegex 68 | regex_params_list: 69 | - {"pattern": "’", "repl": "'"} 70 | - {"pattern": "[-–—]", "repl": " "} 71 | 72 | - _target_: sdp.processors.DropNonAlphabet 73 | alphabet: " 'abcdefghijklmnopqrstuvwxyz" 74 | 75 | - _target_: sdp.processors.CustomDataSplitSLR83 76 | data_split: ${data_split} 77 | dialect: ${dialect} 78 | 79 | - _target_: sdp.processors.ChangeToRelativePath 80 | base_dir: ${workspace_dir}/${dialect} 81 | 82 | - _target_: sdp.processors.KeepOnlySpecifiedFields 83 | output_manifest_file: ${final_manifest} 84 | fields_to_keep: 85 | - audio_filepath 86 | - text 87 | - duration 88 | -------------------------------------------------------------------------------- /dataset_configs/ipl/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | TopIPL 3 | ###### 4 | 5 | This config is used to run the `TopIPL: Iterative Pseudo-Labeling for ASR `_ training algorithm using NeMo-Run. 6 | 7 | TopIPL is a **semi-supervised training method** for automatic speech recognition (ASR) that iteratively alternates between model training and pseudo-label generation for unlabeled data. It uses a **top-N checkpoint averaging strategy** to create a strong teacher model and maintains a **dynamic cache** of pseudo-labels throughout the process. 8 | 9 | The pipeline is implemented as a processor compatible with the `nemo_run` framework. It generates an output manifest containing updated labels based on pseudo-labeling iterations. 
10 | 11 | This config performs the following steps: 12 | 13 | 1. Runs training and inference commands using NeMo-Run. 14 | 2. Periodically stops training to generate pseudo-labels with a top-N checkpoint ensemble. 15 | 3. Maintains a dynamic cache of pseudo-labels for unlabeled data. 16 | 4. Produces a new output manifest after each iteration. 17 | 18 | **Required arguments** 19 | 20 | - **output_manifest_file**: path where the final manifest with pseudo-labels will be saved. 21 | - **nemo_run_config**: YAML config file specifying the training, inference, and IPL parameters. 22 | 23 | **Training config requirements** 24 | 25 | Your training config must include the following setting to enable IPL: 26 | 27 | .. code-block:: yaml 28 | 29 | exp_manager: 30 | create_ipl_epoch_stopper_callback: True 31 | 32 | If you're not using Lhotse, also include: 33 | 34 | .. code-block:: yaml 35 | 36 | ipl_epoch_stopper_callback_params: 37 | stop_every_n_epochs: 2 38 | 39 | ### Prerequisites 40 | 41 | - nemo_run 42 | - ``pip install -r ipl.txt`` 43 | 44 | processors_to_run: all 45 | 46 | processors: 47 | - _target_: sdp.processors.IPL.nemo_run_processor.NemoRunIPLProcessor 48 | config_path: ./nemo_run_config.yaml 49 | output_manifest_file: ??? 50 | -------------------------------------------------------------------------------- /dataset_configs/ipl/nemo_run_config.yaml: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | # The script to be run. 16 | script: # Script path to run relative to directory 17 | script_config: # Training config file for the script. ipl_epoch_stopper_callback should be provided in the config 18 | inference_config: # Inference config file of unlabeled data for transcribe_speech_parallel 19 | 20 | exp_name: null # populated by exp_manager.name if not provided 21 | results_dir: # Where to store the results of the run 22 | 23 | # Path to the local NeMo repository. This is used to locate scripts and configs from NeMo. 24 | # To set this up: 25 | # 1. Clone the NeMo repository: 26 | # git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo 27 | # 2. Set the path here: 28 | # Make sure this path is valid and NeMo is up to date if you're using its scripts. 29 | nemo_directory: # Nemo directory path 30 | do_average: # Boolean value indicating whether to do average of checkpoints for pseudo-label generation 31 | p_cache: # Probability with which update pseudo-labeled set 32 | num_ipl_epochs: # How many epochs do pseudo-labeling 33 | 34 | # Optional arguments 35 | num_runs: 36 | num_gpus: 37 | num_tasks_per_node: 38 | max_runtime: # Specify for clusters 39 | 40 | ######################################################################################################################## 41 | 42 | executor: slurm # or local 43 | 44 | USER: 45 | 46 | # Fields for cluster run 47 | ssh_tunnel: 48 | host: 49 | # ------------------------------- Fill this up! 
------------------------------- 50 | user: "${USER}" # your username; or resolved from ${USER} environment variable ; or can be null which resolved from ${USER} environment variable 51 | job_dir: "" # Job directory to keep created files 52 | identity: "" 53 | # ----------------------------------------------------------------------------- 54 | 55 | account: 56 | partition: 57 | job_name_prefix: 58 | 59 | containers: 60 | asr: # Container image 61 | 62 | 63 | env_vars: 64 | - 'TOKENIZERS_PARALLELISM=' 65 | - 'AIS_ENDPOINT=' 66 | - 'LHOTSE_AUDIO_DURATION_MISMATCH_TOLERANCE=' 67 | - 'TORCH_CUDNN_V8_API_ENABLED=' 68 | - 'PYTORCH_CUDA_ALLOC_CONF=' 69 | - 'HYDRA_FULL_ERROR=1' 70 | 71 | required_env_vars: 72 | - 'HF_TOKEN=' 73 | - 'WANDB_KEY=' 74 | 75 | mounts: 76 | # Replace with your own paths in your cluster config 77 | - /path/to/mount:/where/to/mount/ 78 | 79 | timeouts: 80 | partition_name: # Specify time 81 | -------------------------------------------------------------------------------- /dataset_configs/italian/mcv/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | MCV Italian 3 | ########### 4 | 5 | This config was originally designed for the 6 | `Mozilla Common Voice (MCV) `_ dataset 7 | 12.0 release, but should work for any subsequent releases as well. 8 | 9 | It performs the following data processing. 10 | 11 | 1. Extracts and converts all data to the NeMo format. 12 | 2. Replaces certain non-supported characters and punctuation marks with equivalent supported versions. 13 | 3. Drops any data that contains symbols not in the supported alphabet. 14 | 4. Drops a few manually specified audio files that were found to contain transcription errors. 15 | 16 | **Required arguments**. 17 | 18 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 19 | You need to manually place the downloaded MCV Italian data inside 20 | ``/raw_data/`` subfolder. 
21 | * **data_split**: should be "train", "dev" or "test". 22 | 23 | Note that you can customize any part of this config either directly or from command-line. 24 | Here are some common customizations to consider: 25 | 26 | * **remove_pc**: set to True if P&C is not needed. Defaults to False. 27 | 28 | **Output format**. 29 | 30 | This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``. 31 | The output manifest contains the following fields: 32 | 33 | * **audio_filepath (str)**: relative path to the audio files. 34 | * **text (str)**: transcription, including punctuation ".,?" and capitalization. 35 | * **duration (float)**: audio duration in seconds. 36 | 37 | 38 | processors_to_run: all 39 | data_split: ??? 40 | workspace_dir: ??? 41 | final_manifest: ${workspace_dir}/${data_split}_manifest.json 42 | remove_pc: False 43 | 44 | processors: 45 | - _target_: sdp.processors.CreateInitialManifestMCV 46 | output_manifest_file: ${workspace_dir}/${data_split}_manifest0.json 47 | language_id: it 48 | extract_archive_dir: ${workspace_dir}/raw_data 49 | resampled_audio_dir: ${workspace_dir}/${data_split}/audio/ 50 | data_split: ${data_split} 51 | raw_data_dir: ${workspace_dir}/raw_data 52 | 53 | - _target_: sdp.processors.SubRegex 54 | regex_params_list: 55 | - {"pattern": "!", "repl": "."} 56 | - {"pattern": "…", "repl": "."} 57 | - {"pattern": "’", "repl": "'"} 58 | - {"pattern": '[\":\(\)“”;]', "repl": ''} 59 | - {"pattern": "[-/]", "repl": " "} 60 | # note that we exclude î and ó - according to wikipedia they are very 61 | # rarely used in modern italian. 
So it's safer to replace them, as they 62 | # often represent other languages (e.g., french or spanish, most often 63 | # in names), rather than actual italian 64 | - {"pattern": "î", "repl": "i"} 65 | - {"pattern": "ó", "repl": "o"} 66 | - {"pattern": "Î", "repl": "I"} 67 | - {"pattern": "Ó", "repl": "O"} 68 | test_cases: 69 | - {input: {text: "Wow!"}, output: {text: "Wow."}} 70 | 71 | - _target_: sdp.processors.DropNonAlphabet 72 | alphabet: ".,? 'abcdefghijklmnopqrstuvwxyzàèéìíòùúABCDEFGHIJKLMNOPQRSTUVWXYZÀÈÉÌÍÒÙÚ" 73 | test_cases: 74 | - {input: {text: "test тест 测试"}, output: null} 75 | - {input: {text: "test"}, output: {text: "test"}} 76 | 77 | - _target_: sdp.processors.DropIfRegexMatch 78 | regex_patterns: [ 79 | # transcription errors 80 | "common_voice_it_17553281.wav", 81 | "common_voice_it_19976820.wav", 82 | "common_voice_it_17553352.wav", 83 | ] 84 | text_key: audio_filepath 85 | 86 | # ------------------------ if P&C is not needed ------------------------ 87 | 88 | - _target_: sdp.processors.SubMakeLowercase 89 | should_run: ${remove_pc} 90 | 91 | - _target_: sdp.processors.SubRegex 92 | should_run: ${remove_pc} 93 | regex_params_list: 94 | - {"pattern": '[\?\.,]', "repl": ""} 95 | 96 | # ---------------------------------------------------------------------- 97 | 98 | 99 | - _target_: sdp.processors.ChangeToRelativePath 100 | base_dir: ${workspace_dir} 101 | 102 | - _target_: sdp.processors.KeepOnlySpecifiedFields 103 | output_manifest_file: ${final_manifest} 104 | fields_to_keep: 105 | - audio_filepath 106 | - text 107 | - duration 108 | -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/readme.md: -------------------------------------------------------------------------------- 1 | # README 2 | 3 | This folder is designated for Granary speech data processing configuration files, which will be added soon. 
It is associated with a forthcoming paper, which will detail the work done within this project. 4 | 5 | Note: This folder is a work in progress. 6 | -------------------------------------------------------------------------------- /dataset_configs/portuguese/coraa/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | Coraa Portuguese 3 | ################ 4 | 5 | The config performs the following data processing. 6 | 7 | 1. Downloads and extracts all the data from the "https://huggingface.co/datasets/gabrielrstan/CORAA-v1.1/tree/main" 8 | 2. Replaces certain non-supported characters, abbreviations and punctuation marks with equivalent supported versions. 9 | 3. Drops any data that contains high/low character occurrence. 10 | 4. Drops any data that contains symbols not in the supported alphabet. 11 | 12 | **Required arguments**. 13 | 14 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 15 | * **data_split**: should be "train", "dev" or "test". 16 | 17 | **Output format**. 18 | 19 | This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``. 20 | The output manifest contains the following fields: 21 | 22 | * **audio_filepath (str)**: relative path to the audio files. 23 | * **text (str)**: transcription, including punctuation ".,?" and capitalization. 24 | * **duration (float)**: audio duration in seconds. 25 | 26 | 27 | processors_to_run: all 28 | workspace_dir: ??? 29 | data_split: ??? 30 | final_manifest: ??? 
31 | 32 | 33 | processors: 34 | - _target_: sdp.processors.CreateInitialManifestCORAA 35 | raw_data_dir: ${workspace_dir} 36 | data_split: ${data_split} 37 | extract_archive_dir: ${workspace_dir}/extracted 38 | resampled_audio_dir: ${workspace_dir}/extracted/16k 39 | already_downloaded: false 40 | already_extracted: false 41 | output_manifest_file: ${workspace_dir}/${data_split}_manifest0.json 42 | 43 | - _target_: sdp.processors.SubRegex 44 | regex_params_list: 45 | - {"pattern": "(Aplausos)", "repl": " "} 46 | - {"pattern": "(Risos)", "repl": " "} 47 | - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} 48 | - {"pattern": "'", "repl": " "} 49 | - {"pattern": '[\$\&\¡\(\)]', "repl": " "} 50 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 51 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 52 | - {"pattern": '[\‘\’\“\”\„]', "repl": " "} 53 | - {"pattern": '[\:\;\`\ʻ]', "repl": " "} 54 | - {"pattern": "!", "repl": "."} 55 | - {"pattern": "…\\s$", "repl": "."} # '\\s' is to account for the fact that SDP inserts spaces at start and end 56 | - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to account for the fact that SDP inserts spaces at start and end 57 | 58 | # remove remaining repeated periods since most of the time they are unnecessary in this data 59 | - {"pattern": "\\.{2,20}", "repl": " "} 60 | 61 | - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} 62 | - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} 63 | - {"pattern": " ([Ss])rta.", "repl": '\1enhorita'} 64 | - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } 65 | - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} 66 | - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} 67 | - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} 68 | 69 | - {"pattern": " um km ", "repl" : " um quilômetro "} 70 | - {"pattern": " km ", "repl" : " quilômetros "} 71 | 72 | - _target_: sdp.processors.DropHighLowDuration 73 | high_duration_threshold: 20 74 | low_duration_threshold: 0.5 75 | 76 | - _target_: 
sdp.processors.DropHighLowCharrate 77 | high_charrate_threshold: 21 78 | low_charrate_threshold: 1 79 | 80 | - _target_: sdp.processors.DropNonAlphabet 81 | output_manifest_file: ${final_manifest} 82 | alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" 83 | -------------------------------------------------------------------------------- /dataset_configs/portuguese/mcv/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | MCV Portuguese 3 | ############## 4 | 5 | This config was originally designed for the 6 | `Mozilla Common Voice (MCV) <https://commonvoice.mozilla.org/>`_ dataset 7 | 15.0 release, but should work for any subsequent releases as well. 8 | 9 | It performs the following data processing. 10 | 11 | 1. Extracts and converts all data to the NeMo format. 12 | 2. Replaces certain non-supported characters, abbreviations and punctuation marks with equivalent supported versions. 13 | 3. Drops any data that contains high/low character occurrence. 14 | 4. Drops any data that contains symbols not in the supported alphabet. 15 | 16 | **Required arguments**. 17 | 18 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 19 | You need to manually place the downloaded MCV Portuguese data inside 20 | ``<workspace_dir>/raw_data/`` subfolder. 21 | * **data_split**: should be "train", "dev" or "test". 22 | 23 | **Output format**. 24 | 25 | This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``. 26 | The output manifest contains the following fields: 27 | 28 | * **audio_filepath (str)**: relative path to the audio files. 29 | * **text (str)**: transcription, including punctuation ".,?" and capitalization. 30 | * **duration (float)**: audio duration in seconds. 31 | 32 | 33 | 34 | processors_to_run: all 35 | workspace_dir: ??? 36 | data_split: ??? 37 | final_manifest: ??? 
38 | 39 | 40 | processors: 41 | - _target_: sdp.processors.CreateInitialManifestMCV 42 | raw_data_dir: ${workspace_dir}/raw_data 43 | extract_archive_dir: ${workspace_dir}/raw 44 | resampled_audio_dir: ${workspace_dir}/${data_split}/audio 45 | data_split: ${data_split} 46 | language_id: pt 47 | output_manifest_file: ${workspace_dir}/${data_split}_manifest0.json 48 | 49 | - _target_: sdp.processors.SubRegex 50 | regex_params_list: 51 | - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} 52 | - {"pattern": "'", "repl": " "} 53 | - {"pattern": '[\$\&\¡\(\)]', "repl": " "} 54 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 55 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 56 | - {"pattern": '[\‘\’\“\”\„]', "repl": " "} 57 | - {"pattern": '[\:\;\`\ʻ]', "repl": " "} 58 | - {"pattern": "!", "repl": "."} 59 | - {"pattern": "…\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end 60 | - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end 61 | 62 | # remove remaining repeated periods since most of the time they are unnecessary in this data 63 | - {"pattern": "\\.{2,20}", "repl": " "} 64 | 65 | - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} 66 | - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} 67 | - {"pattern": " ([Ss])rta.", "repl": '\1enhorita'} 68 | - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } 69 | - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} 70 | - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} 71 | - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} 72 | 73 | - {"pattern": " um km ", "repl" : " um quilômetro "} 74 | - {"pattern": " km ", "repl" : " quilômetros "} 75 | 76 | - _target_: sdp.processors.DropHighLowCharrate 77 | high_charrate_threshold: 21 78 | low_charrate_threshold: 1 79 | 80 | - _target_: sdp.processors.DropHighLowDuration 81 | high_duration_threshold: 16 82 | low_duration_threshold: 1 83 | 84 | - _target_: sdp.processors.DropNonAlphabet 85 
| output_manifest_file: ${final_manifest} 86 | alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" 87 | 88 | -------------------------------------------------------------------------------- /dataset_configs/portuguese/mls/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | MLS Portuguese 3 | ############## 4 | The config performs the following data processing. 5 | 6 | 1. Downloads and extracts all the data from the "https://www.openslr.org/94/" in Portuguese 7 | 2. Converts all flac audio files to wav format. 8 | 3. Replaces certain non-supported characters, abbreviations and punctuation marks with equivalent supported versions. 9 | 4. Drops any data that contains high/low character occurrence. 10 | 5. Drops any data that contains symbols not in the supported alphabet. 11 | 12 | **Required arguments**. 13 | 14 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 15 | * **data_split**: should be "train", "dev" or "test". 16 | 17 | **Output format**. 18 | 19 | This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``. 20 | The output manifest contains the following fields: 21 | 22 | * **audio_filepath (str)**: relative path to the audio files. 23 | * **text (str)**: transcription, including punctuation ".,?" and capitalization. 24 | * **duration (float)**: audio duration in seconds. 25 | 26 | 27 | processors_to_run: all 28 | workspace_dir: ??? 29 | data_split: ??? 30 | final_manifest: ??? 
31 | 32 | processors: 33 | - _target_: sdp.processors.CreateInitialManifestMLS 34 | output_manifest_file: ${workspace_dir}/mls_portuguese_processed/${data_split}_manifest.json 35 | raw_data_dir: ${workspace_dir} 36 | language: portuguese 37 | resampled_audio_dir: "" #not passing an argument here to convert it with ffmpeg 38 | data_split: ${data_split} 39 | 40 | - _target_: sdp.processors.FfmpegConvert 41 | converted_audio_dir: ${workspace_dir}/resampled 42 | input_file_key: audio_filepath 43 | output_file_key: audio_filepath 44 | 45 | - _target_: sdp.processors.SubRegex 46 | regex_params_list: 47 | - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} 48 | - {"pattern": "'", "repl": " "} 49 | - {"pattern": '[\$\&\¡\(\)]', "repl": " "} 50 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 51 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 52 | - {"pattern": '[\‘\’\“\”\„]', "repl": " "} 53 | - {"pattern": '[\:\;\`\ʻ]', "repl": " "} 54 | - {"pattern": "!", "repl": "."} 55 | - {"pattern": "…\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end 56 | - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end 57 | 58 | # remove remaining repeated periods since most of the time they are unnecessary in this data 59 | - {"pattern": "\\.{2,20}", "repl": " "} 60 | 61 | - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} 62 | - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} 63 | - {"pattern": " ([Ss])rta.", "repl": '\1enhorita'} 64 | - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } 65 | - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} 66 | - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} 67 | - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} 68 | 69 | - {"pattern": " um km ", "repl" : " um quilômetro "} 70 | - {"pattern": " km ", "repl" : " quilômetros "} 71 | - _target_: sdp.processors.DropHighLowCharrate 72 | high_charrate_threshold: 21 73 | low_charrate_threshold: 1 74 | 75 | - 
_target_: sdp.processors.DropHighLowDuration 76 | high_duration_threshold: 20 77 | low_duration_threshold: 1 78 | 79 | - _target_: sdp.processors.DropNonAlphabet 80 | output_manifest_file: ${final_manifest} 81 | alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" 82 | -------------------------------------------------------------------------------- /dataset_configs/portuguese/mtedx/config.yaml: -------------------------------------------------------------------------------- 1 | documentation: | 2 | MTEDX Portuguese 3 | ################ 4 | The config performs the following data processing. 5 | 6 | 1. Downloads and extracts the data from the "https://www.openslr.org/100/" in Portuguese 7 | 2. Converts all flac audio files to wav format. 8 | 3. Splits audio by the given time steps in vtt files. 9 | 4. Replaces certain non-supported characters, abbreviations and punctuation marks with equivalent supported versions. 10 | 5. Drops any data that contains high/low character occurrence. 11 | 6. Drops any data that contains symbols not in the supported alphabet. 12 | 13 | **Required arguments**. 14 | 15 | * **workspace_dir**: specify the workspace folder where all audio files will be stored. 16 | * **raw_data_dir**: specify in which folder the data will be downloaded. 17 | * **data_split**: should be "train", "valid" or "test". 18 | 19 | **Output format**. 20 | 21 | This config dumps the final manifest at ``${workspace_dir}/${data_split}_manifest.json``. 22 | The output manifest contains the following fields: 23 | 24 | * **audio_filepath (str)**: relative path to the audio files. 25 | * **text (str)**: transcription, including punctuation ".,?" and capitalization. 26 | * **duration (float)**: audio duration in seconds. 27 | 28 | 29 | 30 | processors_to_run: all 31 | workspace_dir: ??? 32 | data_split: ??? 33 | final_manifest: ??? 
34 | 35 | 36 | processors: 37 | - _target_: sdp.processors.CreateInitialManifestMTEDX 38 | raw_data_dir: ${workspace_dir}/raw_data 39 | data_split: ${data_split} 40 | language_id: pt 41 | already_extracted: False 42 | output_manifest_file: ${workspace_dir}/${data_split}_manifest0.json 43 | 44 | - _target_: sdp.processors.FfmpegConvert 45 | converted_audio_dir: ${workspace_dir}/resampled 46 | input_file_key: audio_filepath 47 | output_file_key: audio_filepath 48 | 49 | - _target_: sdp.processors.datasets.commoncrawl.SplitByVttSentence 50 | output_manifest_file: ${workspace_dir}/manifest_vtt.json 51 | input_manifest_file: ${workspace_dir}/${data_split}_manifest0.json 52 | splited_audio_dir: ${workspace_dir}/splited 53 | source_audio_field: audio_filepath 54 | target_audio_field: audio_filepath 55 | duration_field: duration 56 | text_field: text 57 | vtt_field: vtt_filepath 58 | additional_fields: [] 59 | duration_threshold: 20.0 60 | 61 | - _target_: sdp.processors.SubRegex 62 | regex_params_list: 63 | - {"pattern": "(Aplausos)", "repl": " "} 64 | - {"pattern": "(Risos)", "repl": " "} 65 | - {"pattern": '[\-\‐\‑\–\—\―\"]', "repl": " "} 66 | - {"pattern": "'", "repl": " "} 67 | - {"pattern": '[\$\&\¡\(\)]', "repl": " "} 68 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 69 | - {"pattern": '[\«\°\´\·\»]', "repl": " "} 70 | - {"pattern": '[\‘\’\“\”\„]', "repl": " "} 71 | - {"pattern": '[\:\;\`\ʻ]', "repl": " "} 72 | - {"pattern": "!", "repl": "."} 73 | - {"pattern": "…\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end 74 | - {"pattern": "\\.{2,20}\\s$", "repl": "."} # '\\s' is to to account for the fact that SDP insert spaces at start and end 75 | 76 | # remove remaining repeated periods since most of the time they are unnecessary in this data 77 | - {"pattern": "\\.{2,20}", "repl": " "} 78 | 79 | - {"pattern": " ([Pp])rofa ", "repl" : ' \1rofessora '} 80 | - {"pattern": " ([Ss])ra.", "repl" : ' \1enhora'} 81 | - {"pattern": " 
([Ss])rta.", "repl": '\1enhorita'} 82 | - {"pattern": " ([Ss])r.", 'repl': '\1enhor' } 83 | - {"pattern": " ([Dd])r ", "repl" : ' \1octor '} 84 | - {"pattern": " ([Dd])r.", "repl" : ' \1octor '} 85 | - {"pattern": " ([Dd])ra ", "repl" : ' \1octora '} 86 | 87 | - {"pattern": " um km ", "repl" : " um quilômetro "} 88 | - {"pattern": " km ", "repl" : " quilômetros "} 89 | 90 | - _target_: sdp.processors.DropHighLowDuration 91 | high_duration_threshold: 20 92 | low_duration_threshold: 1 93 | 94 | - _target_: sdp.processors.DropHighLowCharrate 95 | high_charrate_threshold: 21 96 | low_charrate_threshold: 1 97 | 98 | - _target_: sdp.processors.DropNonAlphabet 99 | output_manifest_file: ${final_manifest} 100 | alphabet: " ÁÃÀÂÇÉÊÍÕÓÔÚÜáãàâçéêíõóôúüABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz,.?" 101 | 102 | -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- 1 | FROM nvidia/cuda:12.1.1-cudnn8-devel-ubuntu22.04 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | ENV TZ=America/Los_Angeles 5 | 6 | # Install basics 7 | RUN apt-get update \ 8 | && apt-get install -y --no-install-recommends \ 9 | python3 python3-pip python3-dev python-is-python3 \ 10 | build-essential \ 11 | curl \ 12 | ffmpeg \ 13 | git \ 14 | sox \ 15 | libsox-fmt-mp3 \ 16 | unzip \ 17 | wget \ 18 | && apt-get clean \ 19 | && rm -rf /var/lib/apt/lists/* 20 | 21 | # Update pip 22 | RUN pip install --upgrade pip 23 | 24 | #install typing-ext manually 25 | RUN pip install typing-extensions 26 | 27 | # Clone the NeMo SDP repository 28 | COPY . 
/src/NeMo-speech-data-processor 29 | RUN rm -rf /src/NeMo-speech-data-processor/.git 30 | 31 | 32 | WORKDIR /src/NeMo-speech-data-processor 33 | # numpy must be installed before the requirements, even though it is listed in them, because it is required to install [python-sox]; otherwise the installation fails 34 | RUN pip install numpy 35 | RUN find requirements/ -name "*.txt" -exec pip install -r {} \; 36 | # Set working directory back to NeMo-speech-data-processor 37 | WORKDIR /src/NeMo-speech-data-processor 38 | 39 | # Set up entrypoint 40 | CMD ["bash"] 41 | -------------------------------------------------------------------------------- /docker/Dockerfile.tts_sdp: -------------------------------------------------------------------------------- 1 | FROM pytorch/pytorch:2.4.1-cuda12.1-cudnn9-devel 2 | 3 | ARG DEBIAN_FRONTEND=noninteractive 4 | 5 | ENV TZ=America/Los_Angeles 6 | 7 | # Install basics 8 | RUN apt-get update && apt-get install -y --no-install-recommends \ 9 | build-essential \ 10 | bzip2 \ 11 | ca-certificates \ 12 | libsox-fmt-mp3 \ 13 | cmake \ 14 | curl \ 15 | ffmpeg \ 16 | g++ \ 17 | sox \ 18 | unzip \ 19 | vim \ 20 | wget 21 | 22 | # Update pip 23 | RUN pip install --upgrade pip 24 | 25 | # Link all cudnn .so libraries for runtime 26 | RUN ln -s /opt/conda/lib/python3.11/site-packages/nvidia/cudnn/include/cudnn*.h /usr/include/ 27 | RUN mkdir -p /usr/local/cuda/lib64 28 | RUN ln -s /opt/conda/lib/python3.11/site-packages/nvidia/cudnn/lib/libcudnn*.so* /usr/local/cuda/lib64/ 29 | ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH 30 | 31 | 32 | # Copy NeMo SDP 33 | WORKDIR /src 34 | COPY . 
/src/NeMo-speech-data-processor 35 | RUN rm -rf /src/NeMo-speech-data-processor/.git 36 | 37 | # Install requirements 38 | WORKDIR /src/NeMo-speech-data-processor 39 | RUN pip install -r requirements/main.txt 40 | RUN pip install -r requirements/tts.txt 41 | RUN pip install flash-attn --no-build-isolation 42 | RUN pip install https://github.com/LahiLuk/YouTokenToMe/archive/master.zip 43 | RUN pip install megatron-core transformer_engine[pytorch]==2.4.0 44 | RUN pip install nemo_toolkit['all']==2.1.0 45 | 46 | WORKDIR /src/NeMo-speech-data-processor -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- 1 | # You can set these variables from the command line. 2 | SPHINXOPTS = 3 | SPHINXBUILD = sphinx-build 4 | 5 | # User-friendly check for sphinx-build 6 | ifeq ($(shell which $(SPHINXBUILD) >/dev/null 2>&1; echo $$?), 1) 7 | $(error The '$(SPHINXBUILD)' command was not found. Did you install requirements/docs.txt?) 8 | endif 9 | 10 | .PHONY: help 11 | help: 12 | @echo "Please use \`make ' where is one of" 13 | @echo " html to make standalone HTML files" 14 | @echo " clean to fully remove the previous docs build" 15 | 16 | .PHONY: clean 17 | clean: 18 | rm -rf html && rm -rf src/sdp/config-docs 19 | 20 | .PHONY: html 21 | html: 22 | python gen_docs.py && $(SPHINXBUILD) src html $(SPHINXOPTS) 23 | @echo 24 | @echo "Build finished. The HTML pages are in 'html' subfolder." 25 | -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- 1 | To build SDP documentation, make sure to first install dependencies by running 2 | 3 | ``` 4 | pip install -r requirements/docs.txt 5 | ``` 6 | 7 | Then you can run `make clean` from this folder to remove any previously generated docs 8 | and `make html` to build the new documentation. 
Open `html/index.html` to view the docs 9 | locally. -------------------------------------------------------------------------------- /docs/gen_docs.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | """Run this file to generate documentation for SDP config files. 16 | 17 | Will parse all the yaml files and include any built-in documentation in 18 | the expected format. 
def gen_docs():
    """Generate one ``.rst`` documentation page per dataset config.

    Walks the ``dataset_configs`` folder (a sibling of the folder that holds
    this script), mirrors its directory tree under ``src/sdp/config-docs`` and,
    for every ``.yaml`` file not listed in ``IGNORE_CONFIGS``, writes an
    ``.rst`` file containing the config's top-level ``documentation`` entry
    followed by a link to the config built from ``ROOT_LINK``.

    Configs that are empty, are not a mapping, or have no (or a null)
    ``documentation`` entry get a placeholder sentence instead of crashing
    the whole docs build.
    """
    docs_dir = Path(__file__).absolute().parent
    config_dir = docs_dir.parent / 'dataset_configs'
    config_docs_dir = docs_dir / 'src' / 'sdp' / 'config-docs'

    for root, _dirs, files in os.walk(config_dir):
        relative_root = Path(root).relative_to(config_dir)

        # os.walk visits every directory as ``root``, so creating the mirror of
        # ``root`` here reproduces the whole tree, including the top-level
        # output folder that is needed when a config sits directly in
        # ``dataset_configs``.
        destination_root = config_docs_dir / relative_root
        destination_root.mkdir(parents=True, exist_ok=True)

        for file in files:
            if not file.endswith('.yaml'):
                continue

            # POSIX separators keep both the IGNORE_CONFIGS lookup and the
            # generated URL stable regardless of the platform running the build.
            config_path = (relative_root / file).as_posix()
            if config_path in IGNORE_CONFIGS:
                continue

            with open(Path(root) / file, "rt", encoding="utf-8") as fin:
                config = yaml.safe_load(fin)

            # safe_load returns None for an empty file and may return a
            # non-mapping; ``documentation:`` with no value loads as None.
            docs = config.get('documentation') if isinstance(config, dict) else None
            if docs is None:
                docs = "Documentation is not yet available."

            # with_suffix only touches the extension, unlike str.replace which
            # would also rewrite a ".yaml" occurring elsewhere in the path.
            destination_path = (destination_root / file).with_suffix('.rst')
            link = f"Config link: `dataset_configs/{config_path} <{ROOT_LINK}/{config_path}>`_"
            with open(destination_path, "wt", encoding="utf-8") as fout:
                fout.write(f"{docs}\n\n{link}")
return params; 5 | } 6 | 7 | var values = param.split("="); 8 | var name = values[0]; 9 | var value = values[1]; 10 | params[name] = value; 11 | return params; 12 | }, {}); 13 | 14 | var form = document.getElementById("feedback-form"); 15 | for (var name in params) { 16 | var input = form.querySelector("[name=" + name + "]"); 17 | input.value = params[name]; 18 | } 19 | }); -------------------------------------------------------------------------------- /docs/src/_templates/layout.html: -------------------------------------------------------------------------------- 1 | {% extends "!layout.html" %} 2 | 3 | {% block extrahead %} 4 | 5 | 7 | 8 | {% endblock %} 9 | 10 | {% block footer %} 11 | 12 | 13 | 14 | {% endblock %} -------------------------------------------------------------------------------- /docs/src/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/docs/src/favicon.ico -------------------------------------------------------------------------------- /docs/src/index.rst: -------------------------------------------------------------------------------- 1 | .. _sdp-introduction: 2 | 3 | Speech Data Processor 4 | ===================== 5 | 6 | Speech Data Processor (SDP) is a toolkit to make it easy to: 7 | 8 | 1. Write code to process a new dataset, minimizing the amount of boilerplate code required. 9 | 2. Share the steps for processing a speech dataset. 10 | 11 | SDP is hosted here: https://github.com/NVIDIA/NeMo-speech-data-processor. 12 | It's mainly used to prepare datasets for `NeMo toolkit `_. 
13 | 14 | SDP's philosophy is to represent processing operations as 'processor' classes, which take in a path to a NeMo-style 15 | data manifest as input (or a path to the raw data directory if you do not have a NeMo-style manifest to start with), 16 | apply some processing to it, and then save the output manifest file. 17 | 18 | You specify which processors you want to run using a YAML config file. Many common processing operations are provided, 19 | and it is easy to add your own. 20 | 21 | .. image:: https://github.com/NVIDIA/NeMo/releases/download/v1.17.0/sdp_overview_diagram.png 22 | :alt: Overview diagram of Speech Data Processor 23 | 24 | To learn more about SDP, have a look at the following sections. 25 | 26 | .. toctree:: 27 | :maxdepth: 1 28 | 29 | sdp/config_structure 30 | sdp/adding_processors 31 | sdp/existing_configs 32 | sdp/api 33 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
@hydra.main(version_base=None)
def main(cfg: DictConfig):
    """
    Entry point of the Speech Data Processor (SDP).

    When the config carries ``mode == 'update_imports'`` the processor imports
    are refreshed for ``cfg.config_path`` first. In every case the processors
    are then run, with ``use_dask`` defaulting to ``True`` when the config does
    not set it.

    Args:
        cfg: Hydra configuration object containing processing settings
    """
    # Import manager mode is opt-in through the ``mode`` key
    import_manager_requested = hasattr(cfg, 'mode') and cfg.mode == 'update_imports'
    if import_manager_requested:
        update_processor_imports(cfg.config_path)

    # Dask is used unless the config says otherwise
    dask_flag_missing = not hasattr(cfg, 'use_dask')
    if dask_flag_missing:
        # open_dict temporarily allows adding a key that the config lacks
        with open_dict(cfg):
            cfg.use_dask = True

    # Run the processors
    run_processors(cfg)
-------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- 1 | pydata-sphinx-theme 2 | pyyaml 3 | Sphinx 4 | sphinx-book-theme 5 | sphinx-copybutton 6 | sphinxext-opengraph 7 | -------------------------------------------------------------------------------- /requirements/huggingface.txt: -------------------------------------------------------------------------------- 1 | accelerate 2 | transformers>=0.2.1 3 | huggingface_hub>=0.20.3,<0.24.0 # https://github.com/NVIDIA/NeMo/issues/9793 4 | -------------------------------------------------------------------------------- /requirements/ipl.txt: -------------------------------------------------------------------------------- 1 | nemo_run 2 | 3 | # Nemo repository path is also required, it is used to locate scripts and configs from NeMo. 4 | # 5 | # To set this up: 6 | # 1. Clone the NeMo repository: 7 | # git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo 8 | # 2. Set the path in nemo_run_config.yaml: 9 | # nemo_directory: /your/desired/path/to/nemo 10 | # 11 | # Make sure this path is valid and NeMo is up to date if you're using its scripts. 12 | -------------------------------------------------------------------------------- /requirements/main.txt: -------------------------------------------------------------------------------- 1 | diff_match_patch 2 | editdistance 3 | ffmpeg 4 | hydra-core 5 | joblib 6 | librosa>=0.10.0 # specify >=0.10.0 so that librosa.get_duration(path=...) 
will work 7 | numpy>=1.26, <2.0 # the code was written against numpy 1.x and may crash on 2.x 8 | omegaconf 9 | pandas 10 | rarfile 11 | regex 12 | sox 13 | tqdm 14 | gdown 15 | webvtt-py 16 | wget 17 | python-docx 18 | pydub 19 | dask 20 | distributed 21 | jiwer>=3.1.0,<4.0.0 22 | pyarrow>=8.0.0,<14.0.0 23 | datasets>=2.14.0,<3.0.0 24 | # toloka-kit # Temporarily disabled due to Toloka's technical pause; keep as reference for past and future API support 25 | # for some processors, additionally https://github.com/NVIDIA/NeMo is required 26 | # for some processors, additionally nemo_text_processing is required 27 | # for mcv: apt-get update && apt-get upgrade -y && apt-get install -y sox libsox-fmt-all 28 | -------------------------------------------------------------------------------- /requirements/tests.txt: -------------------------------------------------------------------------------- 1 | boto3 2 | # additional packages required to run tests 3 | pytest 4 | pytest-cov 5 | # lhotse requires torch and torchaudio to be present 6 | lhotse 7 | torch 8 | torchaudio -------------------------------------------------------------------------------- /requirements/tts.txt: -------------------------------------------------------------------------------- 1 | transformers 2 | accelerate 3 | torchaudio 4 | pyannote-audio 5 | ffmpeg-python 6 | whisperx==3.3.1 7 | -------------------------------------------------------------------------------- /sdp/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /sdp/logging.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
14 | import logging 15 | 16 | # overriding with the library specific logger, so that it's possible to 17 | # customize in any downstream applications 18 | logger = logging.getLogger("sdp") 19 | -------------------------------------------------------------------------------- /sdp/processors/datasets/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/datasets/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/commoncrawl/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
def parse_hours(inp):
    """Parse a VTT timestamp into a ``datetime`` anchored at 1900-01-01.

    The result is meant to be subtracted from the parse of ``00:00:00.000``
    (which is 1900-01-01 00:00:00) to obtain an elapsed duration.

    Args:
        inp: timestamp string in ``H:M:S.f`` form. The hour field may be 24
            or larger.

    Returns:
        ``datetime`` equal to 1900-01-01 00:00:00 plus the encoded duration.

    Raises:
        ValueError: if the string does not match the expected format.
    """
    # Local import: the module only imports ``datetime`` from ``datetime``.
    from datetime import timedelta

    inp_list = inp.split(":")
    if len(inp_list) == 3 and int(inp_list[0]) >= 24:
        # strptime's %H rejects hours >= 24, so parse the minutes/seconds part
        # on its own and add the hours as a plain duration. Mapping the hours
        # onto calendar days/months would give wrong offsets, or invalid dates
        # such as Feb 30, once the duration runs past January.
        within_hour = datetime.strptime(":".join(inp_list[1:]), '%M:%S.%f')
        return within_hour + timedelta(hours=int(inp_list[0]))
    return datetime.strptime(inp, '%H:%M:%S.%f')


def split_by_vtt(vtt_file, samplerate):
    """Read a VTT file and return its captions with sample-based boundaries.

    Args:
        vtt_file: path of the VTT file, passed to ``webvtt.read``.
        samplerate: number of audio samples per second, used to convert
            caption times from seconds to sample offsets.

    Returns:
        Tuple ``(text_list, start_s, end_s)``: caption texts (newlines replaced
        by spaces, stripped) and the matching start/end offsets in samples
        (truncated to ``int``). If anything fails while reading or parsing the
        file, a warning is logged and ``(None, None, None)`` is returned.
    """
    try:
        _begin = datetime.strptime('00:00:00.000', '%H:%M:%S.%f')
        text_list, start_s, end_s = [], [], []
        for caption in webvtt.read(vtt_file):
            text = ' '.join(caption.text.split('\n'))

            start = (parse_hours(caption.start) - _begin).total_seconds()
            end = (parse_hours(caption.end) - _begin).total_seconds()

            text_list.append(text.strip())
            start_s.append(int(start * samplerate))
            end_s.append(int(end * samplerate))
        return text_list, start_s, end_s
    except Exception as e:
        # Deliberately broad: a single unreadable caption file must not abort
        # the caller, which is signalled through the all-None return value.
        # Lazy %-formatting also works when ``vtt_file`` is not a ``str``.
        logger.warning("Failed to parse VTT file %s: %s", vtt_file, e)
        return None, None, None
https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/datasets/coraa/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/coraal/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from .create_initial_manifest import CreateInitialManifestCORAAL 16 | from .data_splits import TrainDevTestSplitCORAAL 17 | -------------------------------------------------------------------------------- /sdp/processors/datasets/earnings/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | from sdp.processors.datasets.earnings.create_initial_manifest import ( 16 | CreateInitialAudioAndManifest, 17 | CreateFullAudioManifestEarnings21, 18 | SpeakerSegmentedManifest, 19 | CreateSentenceSegmentedManifest, 20 | NeMoForcedAligner, 21 | ) 22 | from sdp.processors.datasets.earnings.apply_normalizations import ( 23 | ApplyEarnings21Normalizations, 24 | ) -------------------------------------------------------------------------------- /sdp/processors/datasets/fleurs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/datasets/fleurs/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/hifitts2/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/datasets/hifitts2/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/hifitts2/remove_failed_chapters.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
class RemovedFailedChapters(BaseProcessor):
    """
    Drops from the input manifest every utterance listed in a chapter error file.

    The error file is read as a manifest whose rows each carry an ``"utterances"`` list;
    the ``"audio_filepath"`` of every listed utterance is excluded from the output. It is
    meant to be fed the failed-chapter file written by the DownloadHiFiTTS2 processor.

    Args:
        error_file (str): Path to file with chapter download errors.

    Returns:
        A manifest identical to the input manifest, minus the rows whose
        ``audio_filepath`` appears in 'error_file'.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.RemovedFailedChapters
              input_manifest_file: ${workspace_dir}/manifest_22khz.json
              output_manifest_file: ${workspace_dir}/manifest_filtered_22khz.json
              error_file: ${workspace_dir}/errors_22khz.json
    """

    def __init__(
        self,
        error_file: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.error_file = Path(error_file)

    def process(self):
        # Collect the audio paths of all utterances belonging to failed chapters.
        failed_audio = {
            utterance["audio_filepath"]
            for chapter in load_manifest(self.error_file)
            for utterance in chapter["utterances"]
        }

        entries = load_manifest(Path(self.input_manifest_file))
        with open(self.output_manifest_file, "w", encoding="utf-8") as fout:
            for entry in tqdm(entries):
                if entry["audio_filepath"] not in failed_audio:
                    fout.write(json.dumps(entry, ensure_ascii=False) + "\n")
class LhotseImport(BaseProcessor):
    """Processor that builds an initial manifest out of a Lhotse CutSet.

    ``input_manifest_file`` must point to a Lhotse CutSet manifest; such files
    usually have ``cuts`` in their name and a ``.jsonl`` or ``.jsonl.gz`` extension.

    Lhotse is a library for speech data processing and loading; see:

    * https://github.com/lhotse-speech/lhotse
    * https://lhotse.readthedocs.io

    It can be installed using ``pip install lhotse``.

    .. caution:: Only cut sets describing single-channel,
        single-audio-file-per-utterance datasets can be imported at the moment.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <source of the cut's only audio file>,
                "duration": <duration of the cut>,
                "lhotse_cut_id": <id of the cut>,
            }

        ``text``, ``speaker``, ``gender`` and ``language`` are added when the
        supervision defines them, followed by every entry of the supervision's
        ``custom`` dictionary (if there is one).
    """

    def process(self):
        from lhotse import CutSet

        cut_set = CutSet.from_file(self.input_manifest_file)
        with open(self.output_manifest_file, "w") as fout:
            for cut in cut_set:
                self.check_entry(cut)
                # check_entry guarantees exactly one supervision and one audio source
                supervision = cut.supervisions[0]
                entry = {
                    "audio_filepath": cut.recording.sources[0].source,
                    "duration": cut.duration,
                    "lhotse_cut_id": cut.id,
                }
                for field in ("text", "speaker", "gender", "language"):
                    value = getattr(supervision, field)
                    if value is not None:
                        entry[field] = value
                extra = supervision.custom
                if extra is not None:
                    entry.update(extra)
                fout.write(json.dumps(entry) + "\n")

    def check_entry(self, cut) -> None:
        """Assert that ``cut`` is a mono, single-file, single-supervision ``MonoCut``."""
        from lhotse import MonoCut

        # The checks must stay in this order: later ones read attributes that
        # are only known to be valid once the earlier ones have passed.
        assert isinstance(cut, MonoCut), f"Currently, only MonoCut import is supported. Received: {cut}"
        assert cut.has_recording, f"Currently, we only support cuts with recordings. Received: {cut}"
        recording = cut.recording
        assert (
            recording.num_channels == 1
        ), f"Currently, we only supports recordings with a single channel. Received: {cut}"
        assert (
            len(recording.sources) == 1
        ), f"Currently, we only support recordings with a single AudioSource. Received: {cut}"
        assert (
            recording.sources[0].type == "file"
        ), f"Currently, we only suppport AudioSources of type='file'. Received: {cut}"
        assert len(cut.supervisions) == 1, f"Currently, we only support cuts with a single supervision. Received: {cut}"
class RegExpVttEntries(BaseParallelProcessor):
    """
    Applies a regular expression substitution to every caption of a .vtt (WebVTT) file and
    stores the processed file in the specified directory.

    Args:
        regex_params (Dict): Arguments forwarded to ``re.sub``. ``"pattern"`` and ``"repl"`` are
            required; ``"count"`` is optional and defaults to 0 (replace every occurrence).
        input_filepath_key (str): Key that stores path to the input `.vtt` file.
        output_filtered_vtt_dir (str): Directory where the processed `.vtt` files will be stored.
        output_filepath_key (str): Key to store the output `.vtt` file path.

    Returns:
        Manifest with additional field::

            {
                "output_filepath_key": <path to the processed .vtt file>
            }

        Entries whose caption file cannot be read, processed or saved are emitted
        with ``data=None`` and a warning is logged.
    """

    def __init__(
        self,
        regex_params: Dict,
        input_filepath_key: str = "vtt_filepath",
        output_filtered_vtt_dir: str = "filtered_vtt_filepath",
        output_filepath_key: str = "filtered_vtt_filepath",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.input_filepath_key = input_filepath_key
        self.output_filepath_key = output_filepath_key
        self.output_filtered_vtt_dir = output_filtered_vtt_dir
        self.regex_params = regex_params

    def prepare(self):
        """Create the output directory for the processed caption files."""
        os.makedirs(self.output_filtered_vtt_dir, exist_ok=True)

    def process_dataset_entry(self, data_entry):
        """Rewrite the captions of one entry and record the new file path."""
        # Local import: this module does not import ``logging`` at file level.
        import logging

        try:
            vtt = webvtt.read(data_entry[self.input_filepath_key])

            for caption in vtt:
                caption.text = re.sub(
                    pattern=self.regex_params["pattern"],
                    repl=self.regex_params["repl"],
                    string=caption.text,
                    count=self.regex_params.get("count", 0),
                )

            # The processed file keeps the input file name, inside the output directory.
            basename = os.path.basename(data_entry[self.input_filepath_key])
            filtered_vtt_filepath = os.path.join(self.output_filtered_vtt_dir, basename)
            data_entry[self.output_filepath_key] = filtered_vtt_filepath

            vtt.save(filtered_vtt_filepath)
            return [DataEntry(data=data_entry)]
        except Exception as e:
            # Still best-effort (the entry is dropped via data=None), but no longer
            # silent, and KeyboardInterrupt/SystemExit are allowed to propagate.
            logging.getLogger("sdp").warning(
                "Failed to apply regular expression to %s: %s",
                data_entry.get(self.input_filepath_key) if isinstance(data_entry, dict) else data_entry,
                e,
            )
            return [DataEntry(data=None)]
class GetCaptionFileSegments(BaseParallelProcessor):
    """
    Reads the subtitle segments of a .vtt (WebVTT) caption file into the manifest.
    One segment is produced per caption of the file.

    Args:
        input_caption_file_key (str): The field name in the input manifest containing path to the caption file.
        output_segments_key (str): The field name to store segment information. Defaults to "segments".
        verbose (bool): If True, an info message is logged for every missing caption file.

    Returns:
        The input manifest with an extra ``output_segments_key`` field holding a list of segments.
        Entries whose caption file does not exist are dropped.
        Each segment has a structure::

            {
                "segment_id": <index of the caption in the file>,
                "start_time": <start time in seconds>,
                "end_time": <end time in seconds>,
                "text": <caption text>
            }
    """

    def __init__(
        self,
        input_caption_file_key: str,
        output_segments_key: str = "segments",
        verbose: bool = True,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.verbose = verbose
        self.output_segments_key = output_segments_key
        self.caption_file_key = input_caption_file_key

    def process_dataset_entry(self, data_entry):
        caption_file = data_entry[self.caption_file_key]

        if os.path.exists(caption_file):
            data_entry[self.output_segments_key] = parse_captions(caption_file)
            return [DataEntry(data=data_entry)]

        # Missing caption file: the entry is skipped.
        if self.verbose:
            logging.info(f"File {caption_file} does not exist.")
        return []
def save_audio_segment(audio, start_time: float, end_time: float, output_audio_filepath: Optional[str]):
    """
    Cuts a segment out of ``audio`` and optionally exports it as a wav file.

    Args:
        audio: input audio; it is sliced by millisecond position and the slice must
            provide ``export`` when ``output_audio_filepath`` is given.
        start_time (float): segment start time in seconds.
        end_time (float): segment end time in seconds.
        output_audio_filepath (Optional[str]): filepath to store the segment; nothing is
            written when it is empty or None.

    Returns:
        audio_segment: the extracted audio segment.

    Raises:
        IndexError: if the start or the end position is not smaller than ``len(audio)``.
    """
    # Positions are compared against len(audio) and used for slicing in milliseconds.
    start_ms = start_time * 1000
    end_ms = end_time * 1000

    audio_length = len(audio)
    if start_ms >= audio_length or end_ms >= audio_length:
        raise IndexError("Segment boundaries are out of range.")

    segment = audio[start_ms:end_ms]
    if output_audio_filepath:
        segment.export(output_audio_filepath, format="wav")
    return segment


def parse_captions(captions_filepath: str):
    """
    Builds the list of segments described by a .vtt caption file.
    Each segment has a structure:
    {
        "segment_id": int,   # Index of the caption inside the file
        "start_time": float, # Start time of the segment (in seconds)
        "end_time": float,   # End time of the segment (in seconds)
        "text": str          # Caption text, lines stripped and joined with spaces
    }

    Args:
        captions_filepath (str): path to .vtt file.

    Returns:
        list: one segment dictionary per caption, in file order.
    """
    # parse_hours() results are turned into offsets by subtracting this zero timestamp.
    zero_timestamp = datetime.strptime('00:00:00.000', '%H:%M:%S.%f')

    segments = []
    for position, cue in enumerate(webvtt.read(captions_filepath)):
        cue_text = ' '.join(line.strip() for line in cue.text.split('\n'))
        begin_offset = parse_hours(cue.start) - zero_timestamp
        finish_offset = parse_hours(cue.end) - zero_timestamp
        segments.append(
            {
                "segment_id": position,
                "start_time": begin_offset.total_seconds(),
                "end_time": finish_offset.total_seconds(),
                "text": cue_text,
            }
        )
    return segments
MTEDX_URL = "https://www.openslr.org/resources/100/mtedx_{language_id}.tgz"


class CreateInitialManifestMTEDX(BaseParallelProcessor):
    """Processor to create initial manifest for the Multilingual TEDx (mTEDx) dataset.

    Dataset link: https://www.openslr.org/100/

    Downloads dataset for the specified language and creates initial manifest with the provided
    audio and vtt files.

    Args:
        raw_data_dir (str): the directory where the downloaded data will be/is saved.
            This is also where the extracted and processed data will be.
        language_id (str): the ID of the language of the data. E.g., "en", "es", "it", etc.
        data_split (str): "train", "dev" or "test".
        already_extracted: (bool): if True, the raw data is assumed to be extracted already,
            so the archive is neither downloaded nor extracted.
            Defaults to False.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "vtt_filepath": <path to the corresponding vtt file>,
                "duration": <duration of the audio in seconds>
            }
    """

    def __init__(
        self,
        raw_data_dir: str,
        language_id: str,
        data_split: str,
        already_extracted: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.raw_data_dir = Path(raw_data_dir)
        self.language_id = language_id
        self.data_split = data_split
        self.already_extracted = already_extracted

    def prepare(self):
        """Downloading and extracting data (unless already done)."""
        os.makedirs(self.raw_data_dir, exist_ok=True)

        if not self.already_extracted:
            url = MTEDX_URL.format(language_id=self.language_id)
            archive_path = self.raw_data_dir / os.path.basename(url)
            # The archive is only needed for extraction, so it is fetched only
            # when extraction is requested and it is not on disk yet.
            if not archive_path.exists():
                download_file(url, str(self.raw_data_dir))
            extract_archive(str(archive_path), str(self.raw_data_dir))

        data_folder = self.raw_data_dir / f"{self.language_id}-{self.language_id}" / "data" / self.data_split
        self.audio_path_prefix = data_folder / "wav"
        self.vtt_path_prefix = data_folder / "vtt"

    def read_manifest(self):
        """Pair every file of the ``wav`` folder with its expected vtt file.

        The vtt name is built from the audio file name up to its first dot,
        followed by ``.<language_id>.vtt``.

        Returns:
            list: ``(audio_filepath, vtt_filepath)`` tuples.
        """
        audio_filepaths = []
        for audio_file in os.listdir(self.audio_path_prefix):
            vtt_filepath = os.path.join(
                self.vtt_path_prefix, audio_file.split('.')[0] + "." + self.language_id + ".vtt"
            )
            audio_filepath = os.path.join(self.audio_path_prefix, audio_file)
            audio_filepaths.append((audio_filepath, vtt_filepath))
        return audio_filepaths

    def process_dataset_entry(self, data_entry) -> List[DataEntry]:
        """Turn one ``(audio_filepath, vtt_filepath)`` pair into a manifest entry."""
        audio_filepath, vtt_filepath = data_entry

        data = {
            'audio_filepath': audio_filepath,
            'vtt_filepath': vtt_filepath,
            'duration': float(librosa.get_duration(path=audio_filepath)),
        }
        return [DataEntry(data=data)]
2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /sdp/processors/datasets/voxpopuli/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/datasets/voxpopuli/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/ytc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/datasets/ytc/__init__.py -------------------------------------------------------------------------------- /sdp/processors/huggingface/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/huggingface/__init__.py -------------------------------------------------------------------------------- /sdp/processors/huggingface/create_initial_manifest.py: -------------------------------------------------------------------------------- 1 | import os 2 | import glob 3 | 4 | import soundfile as sf 5 | 6 | from sdp.processors.base_processor import 
class CreateInitialManifestHuggingFace(BaseParallelProcessor):
    """Processor to create initial manifest for HuggingFace dataset.

    Downloads HuggingFace dataset and creates an initial manifest.

    Args:
        dataset_name (str): the name of the dataset. E.g., "tarteel-ai/everyayah"
        resampled_audio_dir (str): directory where the audio files will be written.
        data_split (str): "train", "validation" or "test".
        raw_data_dir (str): the path to the directory containing the raw dataset files.
            Only used when ``already_downloaded`` is True: the first ``*.hf`` entry found
            in it is loaded with ``datasets.load_from_disk``.
        already_downloaded (bool): if True, the dataset is loaded from ``raw_data_dir``
            instead of HuggingFace. If no local copy is found, the dataset is downloaded.
            Defaults to False.
        target_samplerate (int): sample rate (Hz) used when writing the audio files.
            Defaults to 16000.

    Returns:
        This processor generates an initial manifest file with the following fields::

            {
                "audio_filepath": <path to the audio file>,
                "duration": <duration of the audio in seconds>,
                "text": <transcription>,
            }
    """

    def __init__(
        self,
        dataset_name: str,
        resampled_audio_dir: str,
        data_split: str,
        raw_data_dir: Optional[str] = None,
        already_downloaded: bool = False,
        target_samplerate: int = 16000,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.data_split = data_split
        self.target_samplerate = target_samplerate
        self.resampled_audio_dir = resampled_audio_dir
        self.dataset_name = dataset_name
        self.raw_data_dir = raw_data_dir
        self.already_downloaded = already_downloaded

    def prepare(self):
        """Create the directory the audio files are written to."""
        os.makedirs(self.resampled_audio_dir, exist_ok=True)

    def read_manifest(self):
        """Load the dataset (from disk or HuggingFace) and return its sample indices."""
        import datasets

        self.dataset = None

        # checking if dataset should be loaded from disk
        if self.already_downloaded:
            hf_files = []
            if self.raw_data_dir is not None and os.path.exists(self.raw_data_dir):
                hf_files = glob.glob(f'{self.raw_data_dir}/*.hf')
            if hf_files:
                # glob already returns paths that include raw_data_dir, so the
                # match is used as is (re-joining would duplicate a relative prefix).
                self.dataset = datasets.load_from_disk(hf_files[0])
            else:
                logger.info("Dataset not found locally. Initiating download from Hugging Face.")

        # Download when loading from disk was not requested or found nothing.
        if self.dataset is None:
            logger.info(f"Initiating download of dataset '{self.dataset_name}' from Hugging Face.")
            self.dataset = datasets.load_dataset(self.dataset_name, split=self.data_split)
            logger.info(f"Finished download of dataset '{self.dataset_name}' from Hugging Face.")
        return range(0, len(self.dataset))

    def process_dataset_entry(self, data_id):
        """Write the audio of sample ``data_id`` to disk and build its manifest entry."""
        sample_data = self.dataset[data_id]
        sample_audio = sample_data["audio"]["array"]
        audio_filepath = os.path.join(self.resampled_audio_dir, f"{data_id}.wav")
        # NOTE(review): the array is written with target_samplerate as is; no
        # resampling happens here, so this presumably relies on the dataset
        # already being at that rate - confirm.
        sf.write(
            audio_filepath,
            sample_audio,
            self.target_samplerate,
        )
        duration = len(sample_audio) / self.target_samplerate
        text = sample_data["text"]

        return [
            DataEntry(
                data={
                    # NOTE(review): the manifest path uses a fixed "audios" folder
                    # rather than the path written above - verify against the configs.
                    "audio_filepath": os.path.join("audios", f"{data_id}.wav"),
                    "duration": duration,
                    "text": text,
                }
            )
        ]
To use it, you must pass: 13 | 14 | - `output_manifest_file`: Path where the resulting manifest will be saved. 15 | - `nemo_run_config`: YAML file containing IPL setup, training/inference configs, and NeMo-Run settings. 16 | 17 | ### 🔧 Training Config Requirements 18 | 19 | Your training config must include: 20 | 21 | ```yaml 22 | exp_manager: 23 | create_ipl_epoch_stopper_callback: True 24 | ``` 25 | If you're not using Lhotse, also include: 26 | 27 | ```yaml 28 | ipl_epoch_stopper_callback_params: 29 | stop_every_n_epochs: 2 30 | 31 | ``` 32 | 33 | ### Prerequisites 34 | 35 | Before using TopIPL, make sure the following are set up: 36 | 37 | - Clone the NeMo repository: 38 | ```bash 39 | git clone https://github.com/NVIDIA/NeMo.git /your/desired/path/to/nemo 40 | ``` 41 | - Set the path to NeMo in your `nemo_run_config.yaml`: `nemo_directory: /your/desired/path/to/nemo` 42 | - Install the IPL requirements: `pip install -r requirements/ipl.txt` 43 | 44 | ### Running the Code 45 | 46 | ```bash 47 | python main.py --config-path=/path/to/directory/config --config-name=config.yaml 48 | ``` -------------------------------------------------------------------------------- /sdp/processors/ipl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/64f9d132f01b9f1d319994383962a27f3625dcf7/sdp/processors/ipl/__init__.py -------------------------------------------------------------------------------- /sdp/processors/langs/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2023, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
class GetSourceBookName(BaseParallelProcessor):
    """
    Derives a source book name from a file path stored in each manifest entry.

    The name is the last "/"-separated component of the path with its
    extension removed.

    Args:
        source_file_key (str): The field containing the file path in the manifest.
        source_key (str): The field to store the extracted source book name in the manifest.
        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.

    """

    def __init__(
        self,
        source_file_key: str,
        source_key: str,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.source_key = source_key
        self.source_file_key = source_file_key

    def process_dataset_entry(self, data_entry):
        # Drop the extension first, then keep only what follows the last "/".
        path_without_ext, _ = os.path.splitext(data_entry[self.source_file_key])
        data_entry[self.source_key] = path_without_ext.rsplit("/", 1)[-1]
        return [DataEntry(data=data_entry)]


class MakeTsv(BaseProcessor):
    """
    Converts the input JSON manifest into a TSV (Tab-Separated Values) file.

    The records loaded from ``input_manifest_file`` are written to
    ``output_manifest_file`` without an index column.

    Args:
        **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`.

    """

    def process(self):
        records = load_manifest(Path(self.input_manifest_file))
        frame = pd.DataFrame.from_records(records)
        frame.to_csv(self.output_manifest_file, index=None, sep='\t')
class RandomTsvPart(BaseProcessor):
    """
    Writes a randomly sampled fraction of the rows of a TSV file to a new TSV file.

    Args:
        part (float): The fraction of rows to keep; forwarded as ``frac`` to ``DataFrame.sample``.
        random_state (int): Seed forwarded to ``DataFrame.sample`` for reproducibility. Defaults to 100.
        **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`.

    """

    def __init__(
        self,
        part: float,
        random_state: int = 100,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.random_state = random_state
        self.part = part

    def process(self):
        full_table = pd.read_csv(self.input_manifest_file, sep='\t')
        subset = full_table.sample(frac=self.part, random_state=self.random_state)
        subset.to_csv(self.output_manifest_file, index=None, sep='\t')
class LatinToCyrillic(BaseParallelProcessor):
    """Converts visually identical latin letters to cyrillic equivalents.

    Each character of ``LATIN`` is mapped to the character at the same
    position in ``CYRILLIC``.

    Args:
        text_key (str): a string indicating which key of the data entries
            should be used to find the utterance transcript. Defaults to "text".

    Returns:
        The same data as in the input manifest with latin letters replaced with cyrillic ones.
    """

    LATIN = "AaƏəBEeKkMHOoPpCcTYyXxhi"
    CYRILLIC = "АаӘәВЕеКкМНОоРрСсТУуХхһі"

    def __init__(
        self,
        text_key: str = "text",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.text_key = text_key

    def process_dataset_entry(self, data_entry) -> List:
        """Replace look-alike latin letters in the entry and count each occurrence."""
        latin_counter = collections.defaultdict(int)

        text_in = data_entry[self.text_key]
        for char in text_in:
            if char in self.LATIN:
                latin_counter[char] += 1

        # One translate pass replaces the former per-character str.replace over
        # the whole text. The table is built from the class attributes at call
        # time so subclasses overriding LATIN/CYRILLIC keep working.
        translation = str.maketrans(self.LATIN, self.CYRILLIC)
        data_entry[self.text_key] = text_in.translate(translation)
        return [DataEntry(data=data_entry, metrics=latin_counter)]

    def finalize(self, metrics):
        """Log the total count of every replaced latin character."""
        total_counter = collections.defaultdict(int)
        for counter in metrics:
            for char, value in counter.items():
                total_counter[char] += value
        logger.info("Num of Latin characters")
        for char, count in total_counter.items():
            logger.info(f"{char}: {count}")
        super().finalize(metrics)
class CreateInitialManifestByExt(BaseParallelProcessor):
    """
    Builds an initial manifest with one entry per file that has a given extension.

    Args:
        raw_data_dir (str): The root directory that is searched recursively for files
            ending in ``'.' + extension``.
        output_file_key (str): The key under which each file path is stored. Defaults to "audio_filepath".
        extension (str): The file extension (without the dot) to look for. Defaults to "mp3".
        **kwargs: Additional keyword arguments to be passed to the base class `BaseParallelProcessor`.

    """

    def __init__(
        self,
        raw_data_dir: str,
        output_file_key: str = "audio_filepath",
        extension: str = "mp3",
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.extension = extension
        self.output_file_key = output_file_key
        self.raw_data_dir = Path(raw_data_dir)

    def read_manifest(self):
        root = self.raw_data_dir
        manifest_paths = []
        for found in root.rglob('*.' + self.extension):
            # Rebuild each path from the root and its relative part, as a string.
            manifest_paths.append(str(root / found.relative_to(root)))
        return manifest_paths

    def process_dataset_entry(self, data_entry):
        return [DataEntry(data={self.output_file_key: data_entry})]
class CreateCombinedManifests(BaseParallelProcessor):
    """Reads JSON lines from specified files and creates a combined manifest.

    This processor iterates over the files listed in `manifest_list`, reads each file
    line by line, and yields the parsed JSON data from each non-blank line.

    Args:
        manifest_list (list(str)): A list of manifest file paths to combine. Every entry
            is opened as a UTF-8 text file in which each line holds one JSON object
            (directories are not expanded).
        **kwargs: Additional keyword arguments passed to the base class `BaseParallelProcessor`.

    Returns:
        A combined manifest containing every entry from the files listed in `manifest_list`,
        in the order the files are given.
    """

    def __init__(
        self,
        manifest_list: list[str],
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.manifest_list = manifest_list

    def read_manifest(self):
        """Yield the parsed JSON object of every non-blank line of every listed file."""
        for file in self.manifest_list:
            with open(file, "rt", encoding="utf8") as fin:
                for line in fin:
                    # A blank (e.g. trailing) line is not valid JSON; skip it
                    # instead of failing the whole run.
                    if not line.strip():
                        continue
                    yield json.loads(line)

    def process_dataset_entry(self, data_entry):
        """Pass the entry through unchanged."""
        return [DataEntry(data=data_entry)]
class MakeLettersUppercaseAfterPeriod(BaseParallelProcessor):
    """Upper-cases the character that follows "<punctuation><space>".

    Args:
        punctuation (str): string with all punctuation characters to consider.
            Defaults to ".!?".
        text_key (str): a string indicating which key of the data entries
            should be used to find the utterance transcript. Defaults to "text".

    Returns:
        The same data as in the input manifest with the ``text_key`` field changed.
    """

    def __init__(
        self, punctuation=".!?", text_key: str = "text", **kwargs,
    ):
        super().__init__(**kwargs)
        self.text_key = text_key
        self.punctuation = punctuation

    def process_dataset_entry(self, data_entry) -> List:
        replace_word_counter = collections.defaultdict(int)

        text = data_entry[self.text_key]
        text_len = len(text)
        # pieces are collected in a list and joined once at the end
        pieces = []

        pos = 0
        while pos < text_len:
            current = text[pos]
            followed_by_space_and_char = pos + 2 < text_len and text[pos + 1] == " "
            if current in self.punctuation and followed_by_space_and_char:
                # upper() leaves anything that is not a letter untouched
                pieces.extend([current, " ", text[pos + 2].upper()])
                replace_word_counter[text[pos : pos + 3]] += 1
                # punctuation, space and the upper-cased character are consumed
                pos += 3
            else:
                pieces.append(current)
                pos += 1
        data_entry[self.text_key] = "".join(pieces)

        return [DataEntry(data=data_entry, metrics=replace_word_counter)]

    def finalize(self, metrics):
        totals = collections.defaultdict(int)
        for counter in metrics:
            for word, count in counter.items():
                totals[word] += count
        logger.info("Some of the substrings that were uppercased")
        # most frequent substrings first; ones seen only once are not logged
        ranked = sorted(totals.items(), key=lambda x: x[1], reverse=True)
        for word, count in ranked:
            if count > 1:
                logger.info(f"{word} {count}")
        super().finalize(metrics)
class ASRInference(BaseProcessor):
    """This processor performs ASR inference on each utterance of the input manifest.

    ASR predictions will be saved in the ``pred_text`` key.

    Args:
        pretrained_model (str, Optional): the name or the filepath of the pretrained NeMo ASR model
            which will be used to do inference. A value ending in ``.nemo`` is passed to the
            transcription script as ``model_path``, anything else as ``pretrained_name``.
            Must be set before :meth:`process` is called.
        batch_size (int): the batch size to use for ASR inference. Defaults to 32.
        **kwargs: Additional keyword arguments to be passed to the base class `BaseProcessor`.

    Returns:
        The same data as in the input manifest with an additional field
        ``pred_text`` containing ASR model's predictions.
    """

    def __init__(
        self,
        pretrained_model: Optional[str] = None,
        batch_size: int = 32,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.script_path = Path(__file__).parents[1] / "nemo" / "transcribe_speech.py"
        self.pretrained_model = pretrained_model
        self.batch_size = batch_size

    def process(self):
        """This will add "pred_text" key into the output manifest.

        Raises:
            ValueError: if ``pretrained_model`` was not provided.
            subprocess.CalledProcessError: if the transcription script exits with a non-zero code.
        """
        if not self.pretrained_model:
            raise ValueError("ASRInference requires `pretrained_model` (a model name or a path to a .nemo file).")

        # dirname is "" for a bare filename, and os.makedirs("") would fail
        output_dir = os.path.dirname(self.output_manifest_file)
        if output_dir:
            os.makedirs(output_dir, exist_ok=True)

        model_arg = "model_path" if self.pretrained_model.endswith(".nemo") else "pretrained_name"
        # The arguments are passed as a list (no shell), so paths containing
        # spaces or shell metacharacters reach the script verbatim.
        command = [
            "python",
            str(self.script_path),
            f"{model_arg}={self.pretrained_model}",
            f"dataset_manifest={self.input_manifest_file}",
            f"output_filename={self.output_manifest_file}",
            f"batch_size={self.batch_size}",
        ]
        subprocess.run(command, check=True)
class EstimateBandwidth(BaseParallelProcessor):
    """
    Adds an estimated bandwidth to each utterance in the input manifest file.

    Args:
        audio_dir (str): Root directory where audio files are stored.
        input_audio_key (str): Manifest key with relative audio paths.
        output_bandwidth_key (str): Manifest key to store estimated bandwidth in.
        max_seconds (float): The maximum length of audio to use for bandwidth estimation.
            By default, uses the first 30 seconds.
        sample_rate (int): Sample rate to resample audio to before doing bandwidth estimation.
            Defaults to 44100, upsampling the input audio as needed.
        n_fft (int): Number of FFT bins to use for bandwidth estimation. Defaults to 512.
        hop_length (int): Audio frame hop length to use for bandwidth estimation.
            Defaults to 441, corresponding to 0.01 seconds for 44100 sample rate.
        top_db (float): top_db threshold to use for bandwidth estimation.
        frequency_threshold (float): Bandwidth estimation finds the highest frequency with mean power spectrum that is
            within 'frequency_threshold' dB of its peak power. Defaults to -50 dB.

    Returns:
        This processor estimates the bandwidth of the audio file in the `input_audio_key` field and saves the estimate
        in the `output_bandwidth_key` field.

    Example:
        .. code-block:: yaml

            - _target_: sdp.processors.EstimateBandwidth
              input_manifest_file: ${workspace_dir}/manifest.json
              output_manifest_file: ${workspace_dir}/manifest_bandwidth.json
              audio_dir: ${workspace_dir}/audio_22khz
              max_workers: 8
    """

    def __init__(
        self,
        audio_dir: str,
        input_audio_key: str = "audio_filepath",
        output_bandwidth_key: str = "bandwidth",
        max_seconds: float = 30.0,
        sample_rate: int = 44100,
        n_fft: int = 512,
        hop_length: int = 441,
        top_db: float = 100.0,
        frequency_threshold: float = -50.0,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.audio_directory = Path(audio_dir)
        self.input_audio_key = input_audio_key
        self.output_bandwidth_key = output_bandwidth_key
        # analysis settings
        self.max_seconds = max_seconds
        self.sample_rate = sample_rate
        self.n_fft = n_fft
        self.hop_length = hop_length
        self.top_db = top_db
        self.frequency_threshold = frequency_threshold

    def _estimate_bandwidth(self, audio, sample_rate):
        stft_frames = librosa.stft(y=audio, n_fft=self.n_fft, hop_length=self.hop_length, window="blackmanharris")
        # mean power per frequency bin, converted to dB
        mean_power = np.mean(np.abs(stft_frames) ** 2, axis=1)
        mean_power_db = librosa.power_to_db(mean_power, ref=self.n_fft, top_db=self.top_db)

        peak = np.max(mean_power_db)
        freq_width = sample_rate / self.n_fft
        # scan from the highest bin down; the first bin above the threshold wins
        for bin_idx in reversed(range(len(mean_power_db))):
            if mean_power_db[bin_idx] - peak > self.frequency_threshold:
                return bin_idx * freq_width

        return 0

    def process_dataset_entry(self, data_entry):
        audio_path = self.audio_directory / data_entry[self.input_audio_key]
        audio, sr = librosa.load(path=audio_path, sr=self.sample_rate, duration=self.max_seconds)
        estimate = self._estimate_bandwidth(audio=audio, sample_rate=sr)
        data_entry[self.output_bandwidth_key] = int(estimate)
        return [DataEntry(data=data_entry)]
class CreateSentenceSet(BaseParallelProcessor):
    """Splits the text of a DOCX file into sentences, one manifest entry per sentence.

    The paragraphs of the document are joined with newlines and the result is
    split on the Armenian full stop character ('։').

    Args:
        **kwargs: Additional arguments passed to the base `BaseParallelProcessor` class.

    Returns:
        A list of `DataEntry` objects, each holding one piece of the split text
        under the ``"text"`` key.
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def parse_docx(self, file_path):
        document = Document(file_path)
        combined_text = '\n'.join(paragraph.text for paragraph in document.paragraphs)
        return combined_text.split('։')

    def process_dataset_entry(self, data_entry):
        # the DOCX path is read from the "source_filepath" field of the entry
        sentences = self.parse_docx(data_entry["source_filepath"])
        return [DataEntry(data={"text": sentence}) for sentence in sentences]
def remove_extra_spaces(input_string):
    """
    Collapses every run of whitespace into a single space and strips
    whitespace from both ends of the string.
    e.g. "abc  xyz   abc xyz" --> "abc xyz abc xyz"
    e.g. "  abc xyz  " --> "abc xyz"
    """
    # split() without arguments drops leading/trailing and repeated whitespace
    words = input_string.split()
    return " ".join(words)
def add_start_end_spaces(input_string):
    """
    Returns the input with extra spaces removed and exactly one space added
    at the start and at the end.

    This is useful when searching for a particular word written as
    " <word> ": the padding guarantees a match even when the word is the
    first or the last one of the utterance.

    e.g. "abc xyz" --> " abc xyz "
    """
    # normalise the spacing first so the padding is the only edge whitespace
    return " " + remove_extra_spaces(input_string) + " "
def get_diff(orig_words: str, pred_words: str) -> List[tuple]:
    """
    Produces a list of word-level diffs between two strings.

    Both strings are normalised to one word per line so that the line-based
    mode of diff_match_patch operates on whole words.

    Args:
        orig_words: a string containing the ground truth.
        pred_words: a string containing the text predicted by ASR.

    Returns:
        A list of ``(operation, text)`` tuples in which the words of ``text``
        are each followed by a space.
    """

    def _one_word_per_line(text):
        return remove_extra_spaces(text).replace(" ", "\n") + "\n"

    orig_lines = _one_word_per_line(orig_words)
    pred_lines = _one_word_per_line(pred_words)

    orig_enc, pred_enc, enc = diff.diff_linesToChars(orig_lines, pred_lines)
    diffs = diff.diff_main(orig_enc, pred_enc, False)
    # updates `diffs` in place (its return value is not used)
    diff.diff_charsToLines(diffs, enc)

    return [(d[0], d[1].replace("\n", " ")) for d in diffs]
def get_diff_with_subs_grouped(orig_words: str, pred_words: str) -> List[tuple]:
    """
    Produces word-level diffs in which each substitution is grouped into one item.

    e.g.
    orig_words = "hello there nemo"
    pred_words = "hello my name is nemo"
    will give an output of:
    [(0, 'hello '), ((-1, 'there '), (1, 'my name is ')), (0, 'nemo ')]
    (the 'there' and 'my name is' entries are grouped together in a tuple)

    diff_match_patch reports a substitution only as a deletion followed by an
    insertion; pairing the two makes substitutions easy to find.

    Args:
        orig_words: a string containing the ground truth.
        pred_words: a string containing the text predicted by ASR.

    Returns:
        A list of tuples containing the word-level diffs between the ground truth
        and ASR.
    """
    diffs = get_diff(orig_words, pred_words)

    grouped = []
    total = len(diffs)
    pos = 0
    while pos < total:
        has_next = pos + 1 < total
        if has_next and diffs[pos][0] == -1 and diffs[pos + 1][0] == 1:
            # deletion immediately followed by insertion: one substitution
            grouped.append((diffs[pos], diffs[pos + 1]))
            pos += 2
        else:
            grouped.append(diffs[pos])
            pos += 1

    return grouped
import difflib

# Module-level matcher reused by get_wmr; set_seqs() replaces both sequences on
# every call, so no state leaks between calls.
sm = difflib.SequenceMatcher()


def _error_rate(distance, ref_len):
    """Return ``distance / ref_len`` as a percentage rounded to 2 decimals.

    An empty reference (``ref_len == 0``) would otherwise divide by zero. It
    is reported as 0.0 when there are no errors and as ``float("inf")`` when
    the prediction is non-empty, so threshold-based filters treat it as a
    maximally bad entry instead of crashing.
    """
    if ref_len == 0:
        return 0.0 if distance == 0 else float("inf")
    return round(distance / ref_len * 100.0, 2)


def get_cer(text, pred_text):
    """Return the character error rate of ``pred_text`` w.r.t. ``text``, in percent.

    Args:
        text: reference string.
        pred_text: predicted string.

    Returns:
        Character-level edit distance divided by ``len(text)``, times 100,
        rounded to 2 decimals. For an empty ``text`` the result is 0.0 if
        ``pred_text`` is empty as well and ``float("inf")`` otherwise.
    """
    # Imported here so the helpers that do not need the third-party
    # ``editdistance`` package stay importable without it.
    import editdistance

    return _error_rate(editdistance.eval(text, pred_text), len(text))


def get_wer(text, pred_text):
    """Return the word error rate of ``pred_text`` w.r.t. ``text``, in percent.

    Words are obtained with ``str.split()`` (any whitespace separates words).

    Args:
        text: reference string.
        pred_text: predicted string.

    Returns:
        Word-level edit distance divided by the number of reference words,
        times 100, rounded to 2 decimals. For a reference without words the
        result is 0.0 if the prediction has no words either and
        ``float("inf")`` otherwise.
    """
    import editdistance

    text_words = text.split()
    pred_text_words = pred_text.split()
    return _error_rate(editdistance.eval(text_words, pred_text_words), len(text_words))


def get_charrate(text, duration):
    """Return the number of characters per unit of ``duration``, rounded to 2 decimals.

    Raises:
        ZeroDivisionError: if ``duration`` is 0.
    """
    return round(len(text) / duration, 2)


def get_wordrate(text, duration):
    """Return the number of whitespace-separated words per unit of ``duration``.

    The value is rounded to 2 decimals.

    Raises:
        ZeroDivisionError: if ``duration`` is 0.
    """
    return round(len(text.split()) / duration, 2)


def get_wmr(text, pred_text):
    """Return the word match rate between ``text`` and ``pred_text``, in percent.

    The rate is the number of reference words covered by the matching blocks
    that ``difflib.SequenceMatcher`` finds between the two word sequences,
    divided by the number of reference words.

    Args:
        text: reference string.
        pred_text: predicted string.

    Returns:
        The match rate times 100, rounded to 2 decimals; 0.0 when ``text``
        contains no words (nothing can be matched).
    """
    orig = text.strip().split()
    if not orig:
        return 0.0
    sm.set_seqs(orig, pred_text.strip().split())
    # Every matching block covers ``size`` consecutive reference words; the
    # terminating dummy block has size 0 and contributes nothing.
    num_matches = sum(block.size for block in sm.get_matching_blocks())
    return round(num_matches / len(orig) * 100.0, 2)
from setuptools import find_packages, setup


def parse_requirements(filename):
    """Read a requirements file and return its requirement lines.

    Args:
        filename: path to a pip-style requirements file.

    Returns:
        A list with one stripped string per line of the file; blank lines and
        full-line ``#`` comments are left out.
    """
    # Explicit encoding: the default depends on the locale of the build machine.
    with open(filename, encoding="utf-8") as f:
        stripped_lines = (line.strip() for line in f)
        return [line for line in stripped_lines if line and not line.startswith("#")]


# Read the requirements from the requirements/main.txt file
requirements = parse_requirements('requirements/main.txt')

# Read the long description through a context manager so the handle is closed,
# and decode it as UTF-8 regardless of the platform default.
with open("README.md", encoding="utf-8") as readme_file:
    long_description = readme_file.read()

setup(
    name="sdp",
    version="0.1.0",
    description="NeMo-Speech-Data-Processor is a toolkit for processing speech data and creating speech datasets",
    long_description=long_description,
    long_description_content_type="text/markdown",
    license="Apache License, Version 2.0",
    url="https://github.com/NVIDIA/NeMo-speech-data-processor",
    packages=find_packages(include=["sdp*"]),
    python_requires=">=3.10",
    install_requires=requirements,
    classifiers=[
        "Programming Language :: Python :: 3",
        "Programming Language :: Python :: 3.10",
        "License :: OSI Approved :: Apache Software License",
        "Operating System :: OS Independent",
    ],
)
These tests are run by the `tests/test_cfg_end_to_end_tests.py` file. These tests also require the processor that creates the initial manifest to have a `raw_data_dir` parameter. 9 | - unit tests and doc tests for various SDP components. 10 | 11 | ### For SDP maintainers - how to set up end-to-end tests for a dataset. 12 | Once you are happy with the config & code for a dataset, you can also set up an end-to-end test for it to make sure that future changes to SDP will not affect the workings of your config & code. 13 | 14 | The steps for this are as follows: 15 | 16 | 1. Create a script like `tests/prepare_test_data/prepare_mls_data.py` which you will use to make a mini version of the initial dataset that is read by the first SDP processor for your dataset. Run this script. 17 | 2. Run the SDP dataset creation process for your dataset but with flags like `data_split=True, final_manifest=test_data_reference.json, processor.0.raw_data_dir=<path to the mini dataset>, workspace_dir=<path to a workspace folder>`. 18 | 3. Save the mini initial dataset produced in step 1, and the final manifest produced in step 2 in the location of `<TEST_DATA_ROOT>/<language>/<dataset>`. 19 | 20 | a. If you save the files locally, the end-to-end test will work locally. 21 | 22 | b. If you save the files in the SDP tests AWS S3 bucket (you can only do this if you have access), the tests will be able to work when the GitHub CI is run. 23 | 24 | 4. Update the function `get_test_cases()` inside `tests/test_cfg_end_to_end_tests.py` so it will run your test. 25 | -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License.
5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_coraa_data.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 
import argparse
import csv
import glob
import os
import random
import shutil
import subprocess  # For external commands (e.g., for rar)
import tempfile
import zipfile
from pathlib import Path


def create_zip_archive(source_dir, output_path):
    """Zip every file under ``source_dir`` into ``output_path``.

    Archive member names are relative to the parent of ``source_dir``, so the
    name of ``source_dir`` itself is the top-level folder inside the archive.

    Args:
        source_dir: directory whose files are archived (walked recursively).
        output_path: path of the ``.zip`` file to create.
    """
    parent_dir = os.path.join(source_dir, '..')
    with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, _dirs, files in os.walk(source_dir):
            for file in files:
                file_path = os.path.join(root, file)
                zipf.write(file_path, os.path.relpath(file_path, parent_dir))


def create_rar_archive(source_dir, output_path):
    """Archive ``source_dir`` with the external ``rar`` command.

    ``rar`` is run from the parent of ``source_dir`` so that the archive
    contains the folder name as its top-level entry. The ``-v20m`` switch asks
    ``rar`` for 20 MB volumes, so more than one file may be produced.

    Args:
        source_dir: directory to archive.
        output_path: archive path passed to ``rar``.

    Raises:
        subprocess.CalledProcessError: if ``rar`` exits with a non-zero status.
        FileNotFoundError: if the ``rar`` executable is not available.
    """
    parent_dir = os.path.dirname(source_dir)
    target_folder_name = os.path.basename(source_dir)
    # The command runs with cwd=parent_dir, so a relative output path would be
    # resolved against that folder instead of the caller's working directory.
    absolute_output_path = os.path.abspath(output_path)
    command = ['rar', 'a', '-r', '-v20m', absolute_output_path, target_folder_name]
    subprocess.run(command, check=True, cwd=parent_dir)


def sample_and_copy_entries(transcript_path, tmpdir_path, num_entries, extracted_data_path, output_metadata_path):
    """Randomly pick metadata rows and copy the files they reference.

    Args:
        transcript_path: csv file with a header row; the first column of every
            other row is a file path relative to ``extracted_data_path``.
        tmpdir_path: folder that receives the copied files (same relative paths).
        num_entries: number of rows to sample; when the csv has fewer rows,
            all of them are used.
        extracted_data_path: root folder of the extracted dataset.
        output_metadata_path: csv file to write (header plus the sampled rows);
            its parent folder is created if needed.
    """
    with open(transcript_path, "rt", encoding="utf8") as fin:
        reader = csv.reader(fin)
        header = next(reader)
        rows = list(reader)

    # random.sample raises ValueError when asked for more items than available.
    selected_rows = random.sample(rows, min(num_entries, len(rows)))

    Path(output_metadata_path).parent.mkdir(parents=True, exist_ok=True)
    with open(output_metadata_path, "wt", encoding="utf8", newline='') as fout:
        writer = csv.writer(fout)
        writer.writerow(header)  # Write the header
        for row in selected_rows:
            filepath = row[0]
            src_path = os.path.join(extracted_data_path, filepath)
            tgt_path = os.path.join(tmpdir_path, filepath)
            os.makedirs(os.path.dirname(tgt_path), exist_ok=True)
            shutil.copy(src_path, tgt_path)
            writer.writerow(row)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Preparing Coraa test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument("--num_entries", default=200, type=int, help="Number of entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    os.makedirs(args.test_data_folder, exist_ok=True)

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)

        for split in ["train", "dev", "test"]:
            metadata_name = f"metadata_{split}_final.csv"
            transcript_path = Path(args.extracted_data_path) / metadata_name
            output_metadata_path = Path(args.test_data_folder) / metadata_name
            sample_and_copy_entries(
                transcript_path, tmpdir_path, args.num_entries, args.extracted_data_path, output_metadata_path
            )
            archive_path = os.path.join(args.test_data_folder, split)
            source_dir = os.path.join(tmpdir_path, split)
            if split in ['dev', 'test']:
                create_zip_archive(source_dir, f"{archive_path}.zip")
            elif split == 'train':
                # The train split is stored as rar volume(s) inside "train_dividido".
                train_folder = os.path.join(args.test_data_folder, "train_dividido")
                os.makedirs(train_folder, exist_ok=True)
                create_rar_archive(source_dir, archive_path)
                pattern = os.path.join(args.test_data_folder, 'train*.rar')
                for file_path in glob.glob(pattern):
                    # Spell out the destination file so a rerun replaces an
                    # older archive instead of failing because it exists.
                    shutil.move(file_path, os.path.join(train_folder, os.path.basename(file_path)))
"""Will take the downloaded .tsv file and audios directory and create a version with only X entries."""

import argparse
import os
import csv
import shutil
import tarfile
import tempfile
from pathlib import Path

if __name__ == "__main__":
    parser = argparse.ArgumentParser("Preparing Fleurs test data")
    parser.add_argument("--extracted_tsv_path", required=True, help="Path to the downloaded .tsv file.")
    parser.add_argument("--extracted_audios_dir", required=True, help="Path to the downloaded and extracted audios directory.")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the 'tar.gz' bit) of the new archive file should be",
    )
    parser.add_argument("--num_entries", default=20, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    os.makedirs(args.test_data_folder, exist_ok=True)
    output_tsv_path = os.path.join(args.test_data_folder, args.archive_file_stem + '.tsv')

    # Audio files are staged in a temporary folder that is archived at the end.
    with tempfile.TemporaryDirectory() as tmpdir:
        # newline="" is required by the csv module for files handed to
        # csv.reader / csv.writer; without it the writer's "\r\n" terminator
        # is translated a second time on platforms that use "\r\n" line endings.
        with open(args.extracted_tsv_path, "rt", encoding="utf8", newline="") as fin, open(
            output_tsv_path, "wt", encoding="utf8", newline=""
        ) as fout:
            csv_reader = csv.reader(fin, delimiter='\t')  # creating CSV reader object
            csv_writer = csv.writer(fout, delimiter='\t')  # creating CSV writer object

            for idx, row in enumerate(csv_reader):
                if idx == args.num_entries:
                    break

                # The second column holds the audio file name.
                src_audio_path = os.path.join(args.extracted_audios_dir, row[1])
                dst_audio_path = os.path.join(tmpdir, row[1])
                shutil.copy(src_audio_path, dst_audio_path)

                csv_writer.writerow(row)

        with tarfile.open(os.path.join(args.test_data_folder, f"{args.archive_file_stem}.tar.gz"), "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=args.archive_file_stem)
"""Copies HiFiTTS-2 manifests and audio into a new directory with fewer entries."""

import argparse
import json
import os
from pathlib import Path
import shutil

if __name__ == "__main__":
    parser = argparse.ArgumentParser("Preparing HiFiTTS-2 test data")
    parser.add_argument(
        "--workspace_folder", required=True, type=Path, help="Path to workspace where dataset was downloaded."
    )
    parser.add_argument(
        "--audio_folder", default="audio_22khz", type=Path, required=False, help="Name of root folder with audio."
    )
    parser.add_argument("--test_data_folder", required=True, type=Path, help="Where to place the prepared data")
    parser.add_argument(
        "--manifest_filename", default="manifest_22khz.json", type=str, required=False, help="Name of manifest file."
    )
    parser.add_argument(
        "--chapters_filename", default="chapters_22khz.json", type=str, required=False, help="Name of chapter manifest."
    )
    parser.add_argument(
        "--error_filename", default="errors_22khz.json", type=str, required=False, help="Name of chapter error manifest."
    )
    parser.add_argument("--num_entries", default=20, type=int, help="How many entries to keep from each manifest")

    args = parser.parse_args()

    files_to_copy = [args.manifest_filename, args.chapters_filename, args.error_filename]

    os.makedirs(args.test_data_folder, exist_ok=True)
    # Copy manifest files, keeping only the first num_entries lines of each.
    for filename in files_to_copy:
        input_path = args.workspace_folder / filename
        output_path = args.test_data_folder / filename
        with open(input_path, "r", encoding="utf-8") as input_f, open(output_path, "w", encoding="utf-8") as output_f:
            for i, line in enumerate(input_f):
                if i >= args.num_entries:
                    break
                output_f.write(line)

    # Copy audio. The manifest read here is the truncated copy written above,
    # so every entry it contains is wanted and no further cap is needed.
    manifest_path = args.test_data_folder / args.manifest_filename
    input_audio_dir = args.workspace_folder / args.audio_folder
    output_audio_dir = args.test_data_folder / args.audio_folder
    with open(manifest_path, "r", encoding="utf-8") as input_f:
        for line in input_f:
            if not line.strip():
                continue  # a blank line is not a JSON document
            row = json.loads(line)
            audio_filepath = row["audio_filepath"]
            input_path = input_audio_dir / audio_filepath
            output_path = output_audio_dir / audio_filepath
            output_path.parent.mkdir(exist_ok=True, parents=True)
            shutil.copy(src=input_path, dst=output_path)
"""Will take a Hugging Face dataset and create a version with only X entries."""

import argparse
import os
import tempfile
import itertools
from pathlib import Path

if __name__ == "__main__":
    from datasets import load_dataset, Dataset

    parser = argparse.ArgumentParser("Preparing TarteelAI's EveryAyah test data")
    parser.add_argument("--dataset_name", required=True, help="Hugging Face dataset name. E.g., 'tarteel-ai/everyayah'")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the '.hf' bit) of the new archive file should be",
    )
    # The split used to be hard-coded to "train" and this option was ignored.
    # The default is "train" so that runs without the flag sample the same
    # split as before, while an explicit --data_split is now honoured.
    parser.add_argument("--data_split", default="train", help="Dataset data split")
    parser.add_argument("--num_entries", default=20, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()

    os.makedirs(args.test_data_folder, exist_ok=True)

    # Stream the split and take its first num_entries examples.
    dataset = load_dataset(args.dataset_name, split=args.data_split, streaming=True)
    sampled_dataset = list(itertools.islice(dataset, args.num_entries))
    sampled_dataset = Dataset.from_list(sampled_dataset)
    sampled_dataset.save_to_disk(os.path.join(args.test_data_folder, f"{args.archive_file_stem}.hf"))
"""Will take the downloaded tar file and create a version with only X entries."""

import argparse
import csv
import json
import os
import shutil
import tarfile
import tempfile
import zipfile
from pathlib import Path


def _parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser("Preparing KSC2 test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the 'tar.gz' bit) of the new archive file should be",
    )
    parser.add_argument("--num_entries", default=200, type=int, help="How many entries to keep (in each audio)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")
    return parser.parse_args()


def main():
    """Copy up to num_entries flac/transcript pairs per source folder of "Train" and archive them."""
    args = _parse_args()

    with tempfile.TemporaryDirectory() as tmpdir:
        staged_train = Path(tmpdir) / "Train"
        staged_train.mkdir(exist_ok=True)

        for source_dir in Path(args.extracted_data_path, "Train").glob("*"):
            staged_source = staged_train / source_dir.stem
            staged_source.mkdir(exist_ok=True)

            for idx, audio_filepath in enumerate(source_dir.glob('*.flac')):
                if idx == args.num_entries:
                    break

                # The transcript sits next to the audio as <stem>.txt ...
                transcript = (audio_filepath.parent / audio_filepath.stem).with_suffix('.txt')
                if not transcript.exists():
                    # ... and otherwise the ".txt.txt" name is used instead.
                    transcript = transcript.with_suffix('.txt.txt')

                shutil.copy(audio_filepath, staged_source / audio_filepath.name)
                shutil.copy(transcript, staged_source / transcript.name)

        Path(args.test_data_folder).mkdir(exist_ok=True, parents=True)

        archive_path = os.path.join(args.test_data_folder, f"{args.archive_file_stem}.tar.gz")
        with tarfile.open(archive_path, "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=args.archive_file_stem)


if __name__ == "__main__":
    main()
"""Will take the downloaded tar file and create a version with only X entries."""

import argparse
import os
import shutil
import tarfile
import tempfile
import csv
from pathlib import Path

if __name__ == "__main__":
    parser = argparse.ArgumentParser("Preparing MASC test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the 'tar.gz' bit) of the new archive file should be",
    )
    parser.add_argument("--num_entries", default=10, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()

    # Define a dictionary to map splits to filenames
    filename_map = {
        "train": "clean_train.csv",
        "dev": "clean_dev_meta.csv",
        "test": "clean_test_meta.csv"
    }

    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        os.makedirs(tmpdir_path / "audios")
        os.makedirs(tmpdir_path / "subtitles")
        os.makedirs(tmpdir_path / "subsets")

        for split in ["train", "dev", "test"]:
            transcript_path = Path(args.extracted_data_path) / "subsets" / filename_map[split]
            # newline="" is required by the csv module for files handed to
            # csv.reader / csv.writer; without it the writer's "\r\n" terminator
            # is translated a second time on platforms that use "\r\n" line endings.
            with open(transcript_path, "rt", encoding="utf8", newline="") as fin, open(
                tmpdir_path / "subsets" / filename_map[split], "wt", encoding="utf8", newline=""
            ) as fout:
                csv_reader = csv.reader(fin)  # creating CSV reader object
                csv_writer = csv.writer(fout)  # creating CSV writer object

                csv_writer.writerow(next(csv_reader))  # writing the columns (header) line
                for idx, row in enumerate(csv_reader):
                    if idx == args.num_entries:
                        break
                    utt_id = row[0]  # first column names the audio/subtitle files

                    # copying audio file
                    src_audio_path = os.path.join(args.extracted_data_path, "audios", f"{utt_id}.wav")
                    tgt_audio_path = os.path.join(tmpdir_path, "audios", f"{utt_id}.wav")
                    shutil.copy(src_audio_path, tgt_audio_path)

                    # copying transcription file
                    src_transcript_path = os.path.join(args.extracted_data_path, "subtitles", f"{utt_id}.ar.vtt")
                    tgt_transcript_path = os.path.join(tmpdir_path, "subtitles", f"{utt_id}.ar.vtt")
                    shutil.copy(src_transcript_path, tgt_transcript_path)

                    csv_writer.writerow(row)

        os.makedirs(args.test_data_folder, exist_ok=True)
        with tarfile.open(os.path.join(args.test_data_folder, f"{args.archive_file_stem}.tar.gz"), "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=args.archive_file_stem)
"""Will take the downloaded tar file and create a version with only X entries."""

import argparse
import os
import shutil
import tarfile
import tempfile
from pathlib import Path


def _parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser("Preparing MCV test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the 'tar.gz' bit) of the new archive file should be",
    )
    parser.add_argument("--num_entries", default=200, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")
    return parser.parse_args()


def main():
    """Copy the header and first num_entries rows of each split's tsv, plus the clips they name, into a tar.gz."""
    args = _parse_args()

    with tempfile.TemporaryDirectory() as tmpdir:
        staging_root = Path(tmpdir)
        os.makedirs(staging_root / "clips")

        for split in ("train", "dev", "test"):
            tsv_name = f"{split}.tsv"
            source_tsv = Path(args.extracted_data_path) / tsv_name
            with open(source_tsv, "rt", encoding="utf8") as fin, open(
                staging_root / tsv_name, "wt", encoding="utf8"
            ) as fout:
                # The first line is the header and is copied unchanged.
                fout.write(fin.readline())
                for idx, line in enumerate(fin):
                    if idx == args.num_entries:
                        break
                    # The second tab-separated column is the clip file name.
                    clip_name = line.split("\t")[1]
                    fout.write(line)
                    shutil.copy(
                        os.path.join(args.extracted_data_path, "clips", clip_name),
                        os.path.join(staging_root, "clips", clip_name),
                    )

        os.makedirs(args.test_data_folder, exist_ok=True)
        archive_path = os.path.join(args.test_data_folder, f"{args.archive_file_stem}.tar.gz")
        with tarfile.open(archive_path, "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=args.archive_file_stem)


if __name__ == "__main__":
    main()
"""Will take the downloaded tar file and create a version with only X entries."""

import argparse
import os
import shutil
import glob
import tarfile
import tempfile
from pathlib import Path


def _parse_args():
    """Build the command-line parser and return the parsed arguments."""
    parser = argparse.ArgumentParser("Preparing Mediaspeech test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the 'tar.gz' bit) of the new archive file should be",
    )
    parser.add_argument("--num_entries", default=20, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")
    return parser.parse_args()


def main():
    """Copy up to num_entries flac files and their .txt transcripts into a tar.gz archive."""
    args = _parse_args()
    os.makedirs(args.test_data_folder, exist_ok=True)

    with tempfile.TemporaryDirectory() as tmpdir:
        flac_paths = glob.glob(f"{args.extracted_data_path}/*.flac")
        for idx, src_audio_filepath in enumerate(flac_paths):
            if idx == args.num_entries:
                break

            # The sample id is the file name up to its first dot; the
            # transcript shares that id with a .txt extension.
            sample_id = os.path.basename(src_audio_filepath).split(".")[0]
            text_name = f"{sample_id}.txt"
            audio_name = f"{sample_id}.flac"

            shutil.copy(os.path.join(args.extracted_data_path, text_name), os.path.join(tmpdir, text_name))
            shutil.copy(src_audio_filepath, os.path.join(tmpdir, audio_name))

        archive_path = os.path.join(args.test_data_folder, f"{args.archive_file_stem}.tar.gz")
        with tarfile.open(archive_path, "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=args.archive_file_stem)


if __name__ == "__main__":
    main()
if __name__ == "__main__":
    # Produces "mls_<language>.tar.gz" holding the first --num_entries transcript
    # lines of every split together with the flac files those lines refer to.
    parser = argparse.ArgumentParser("Preparing MLS test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--language",
        required=True,
        help="The name of the language, used to determine output file name mls_{language}.tar.gz",
    )
    parser.add_argument("--num_entries", default=200, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    with tempfile.TemporaryDirectory() as tmpdir:
        staging_root = Path(tmpdir)
        for split in ["train", "dev", "test"]:
            os.makedirs(staging_root / split / "audio")
            src_transcripts = Path(args.extracted_data_path) / split / "transcripts.txt"
            dst_transcripts = staging_root / split / "transcripts.txt"
            with open(src_transcripts, "rt", encoding="utf8") as fin, open(
                dst_transcripts, "wt", encoding="utf8"
            ) as fout:
                for line_no, transcript_line in enumerate(fin):
                    if line_no == args.num_entries:
                        break
                    # the utterance id is everything before the first tab
                    utt_id = transcript_line.partition("\t")[0]
                    # the first two "_"-separated id fields are the nested audio sub-folders
                    subdirs = utt_id.split("_")[:2]
                    flac_name = utt_id + ".flac"
                    src_flac_path = os.path.join(args.extracted_data_path, split, "audio", *subdirs, flac_name)
                    fout.write(transcript_line)
                    tgt_flac_dir = os.path.join(staging_root, split, "audio", *subdirs)
                    os.makedirs(tgt_flac_dir, exist_ok=True)
                    shutil.copy(src_flac_path, os.path.join(tgt_flac_dir, flac_name))
        os.makedirs(args.test_data_folder, exist_ok=True)
        archive_path = os.path.join(args.test_data_folder, f"mls_{args.language}.tar.gz")
        with tarfile.open(archive_path, "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=f"mls_{args.language}")
if __name__ == "__main__":
    # Builds "mtedx_<language_id>.tgz" containing, for every split, the first
    # --num_entries vtt files and the flac file sharing each one's leading name part.
    parser = argparse.ArgumentParser("Preparing MTEDX data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--language_id",
        required=True,
        help="The name of the language, used to determine output file name mtedx_{language}.tgz",
    )
    parser.add_argument("--num_entries", default=2, type=int, help="How many flac files to be splitted")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        data_path = os.path.join(tmpdir_path, "data")
        os.makedirs(data_path, exist_ok=True)
        for split in ["train", "valid", "test"]:
            vtt_path_src = os.path.join(args.extracted_data_path, "data", split, "vtt")
            flac_path_src = os.path.join(args.extracted_data_path, "data", split, "wav")
            vtt_path_dest = os.path.join(data_path, split, "vtt")
            flac_path_dest = os.path.join(data_path, split, "wav")
            os.makedirs(vtt_path_dest, exist_ok=True)
            os.makedirs(flac_path_dest, exist_ok=True)
            # os.listdir order is arbitrary; sort so every run keeps the same files.
            for idx, vtt_file in enumerate(sorted(os.listdir(vtt_path_src))):
                if idx == args.num_entries:
                    break
                # the audio file is named after the part of the vtt name before its first dot
                flac_file = vtt_file.split(".")[0] + ".flac"
                shutil.copy(os.path.join(vtt_path_src, vtt_file), vtt_path_dest)
                shutil.copy(os.path.join(flac_path_src, flac_file), flac_path_dest)
        # tarfile.open does not create missing parent folders
        os.makedirs(args.test_data_folder, exist_ok=True)
        with tarfile.open(os.path.join(args.test_data_folder, f"mtedx_{args.language_id}.tgz"), "w:gz") as tar:
            tar.add(tmpdir, arcname=f"mtedx_{args.language_id}")
def subsample_metadata(src_csv, dst_csv, num_entries, delimiter=" "):
    """Copy the header and the first ``num_entries`` rows of a metadata csv.

    Args:
        src_csv: path of the csv file to read; it must have a ``uttID`` column.
        dst_csv: path of the csv file to write.
        num_entries: number of data rows to keep.
        delimiter: column separator used for both reading and writing.

    Returns:
        The ``uttID`` values of the rows that were written, in file order.
    """
    utt_used = []
    # newline="" is what the csv module expects from the files it is handed
    with open(src_csv, "rt", encoding="utf8", newline="") as csvfile_in, open(
        dst_csv, "wt", encoding="utf8", newline=""
    ) as csvfile_out:
        reader = csv.DictReader(csvfile_in, delimiter=delimiter)
        # DictReader consumes the header row itself; calling next(reader) to
        # "skip the header" would silently drop the first data row instead.
        fieldnames = reader.fieldnames
        if fieldnames is None:  # empty input file: nothing to copy
            return utt_used
        writer = csv.DictWriter(csvfile_out, fieldnames=fieldnames, delimiter=delimiter)
        writer.writeheader()

        for idx, row in enumerate(reader):
            if idx == num_entries:
                break
            writer.writerow(row)
            utt_used.append(row["uttID"])
    return utt_used


if __name__ == "__main__":
    # Builds "<archive_file_stem>.tar.gz" with a truncated Meta/train.csv plus the
    # audio and transcription files of the utterances that remain listed in it.
    parser = argparse.ArgumentParser("Preparing SLR102 test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the 'tar.gz' bit) of the new archive file should be",
    )
    parser.add_argument("--num_entries", default=200, type=int, help="How many entries to keep (in each audio)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)

        split_dir = Path(args.extracted_data_path, "Meta")
        tmp_split_dir = Path(tmpdir_path, "Meta")
        tmp_split_dir.mkdir(exist_ok=True)

        utt_used = subsample_metadata(
            Path(split_dir, "train.csv"), Path(tmp_split_dir, "train.csv"), args.num_entries
        )

        transcript_dir = Path(args.extracted_data_path, "Transcriptions")
        tmp_transcript_dir = Path(tmpdir_path, "Transcriptions")
        tmp_transcript_dir.mkdir(exist_ok=True)

        audios_dir = Path(args.extracted_data_path, "Audios_flac")
        tmp_audios_dir = Path(tmpdir_path, "Audios_flac")
        tmp_audios_dir.mkdir(exist_ok=True)

        for utt in utt_used:
            # Append the extension instead of with_suffix(), which would replace
            # everything after the last dot of an id that itself contains dots.
            audio_name = f"{utt}.flac"
            transcript_name = f"{utt}.txt"

            shutil.copy(audios_dir / audio_name, tmp_audios_dir / audio_name)
            shutil.copy(transcript_dir / transcript_name, tmp_transcript_dir / transcript_name)

        test_data_folder = Path(args.test_data_folder)
        test_data_folder.mkdir(exist_ok=True, parents=True)

        with tarfile.open(os.path.join(args.test_data_folder, f"{args.archive_file_stem}.tar.gz"), "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=args.archive_file_stem)
if __name__ == "__main__":
    # For every sub-folder of --extracted_data_path: keeps the first --num_entries
    # records of its train.json, copies the wav files they reference, zips the
    # result, and finally packs all zips into "<archive_file_stem>.tar.gz".
    parser = argparse.ArgumentParser("Preparing SLR140 test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--archive_file_stem",
        required=True,
        help="What the stem (ie without the 'tar.gz' bit) of the new archive file should be",
    )
    parser.add_argument("--num_entries", default=200, type=int, help="How many entries to keep (in each audio)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)

        for audio_dir in Path(args.extracted_data_path).glob('*'):
            if not audio_dir.is_dir():
                continue

            transcript_path = audio_dir / "train.json"
            # the folder name is repeated so that the zip unpacks to <name>/<name>/...
            audio_tmpdir = tmpdir_path / audio_dir.stem / audio_dir.stem
            audio_tmpdir.mkdir(exist_ok=True, parents=True)

            with open(transcript_path, "rt", encoding="utf-8-sig") as fin, open(
                audio_tmpdir / "train.json", "wt", encoding="utf-8-sig"
            ) as fout:
                # only the first line is used: it holds the whole list of records
                sample = json.loads(fin.readline())[: args.num_entries]

                for entry in sample:
                    # the last two path components are <utterance folder>/<file name>
                    utt_dir, utt_id = entry['wav'].split('/')[-2:]

                    utt_tmp_dir = audio_tmpdir / utt_dir
                    utt_tmp_dir.mkdir(exist_ok=True)

                    shutil.copy(audio_dir / utt_dir / utt_id, utt_tmp_dir / utt_id)

                # Serialise with json: str() plus quote replacement yields invalid
                # JSON as soon as a value holds an apostrophe, None or a boolean.
                fout.write(json.dumps(sample, ensure_ascii=False))

            shutil.make_archive((tmpdir_path / audio_dir.stem), 'zip', (tmpdir_path / audio_dir.stem))

            print(os.listdir(tmpdir_path))

            # only the zip is shipped; drop the unpacked copy
            shutil.rmtree((tmpdir_path / audio_dir.stem))

            print(os.listdir(tmpdir_path))

        test_data_folder = Path(args.test_data_folder)
        test_data_folder.mkdir(exist_ok=True, parents=True)

        with tarfile.open(os.path.join(args.test_data_folder, f"{args.archive_file_stem}.tar.gz"), "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=args.archive_file_stem)
if __name__ == "__main__":
    # Builds "transcribed_data.tar.gz" with the header and the first --num_entries
    # rows of each asr_<split>.tsv plus the .ogg files those rows refer to.
    parser = argparse.ArgumentParser("Preparing VoxPopuli test data")
    parser.add_argument("--data_path", required=True, help="Path to the processed data.")
    parser.add_argument(
        "--language_id",
        required=True,
        help="The id of the language",
    )
    parser.add_argument("--num_entries", default=200, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        src_lang_dir = Path(args.data_path) / "transcribed_data" / args.language_id
        tgt_lang_dir = tmpdir_path / "transcribed_data" / args.language_id
        os.makedirs(tgt_lang_dir)

        for split in ["train", "dev", "test"]:
            tsv_name = f"asr_{split}.tsv"
            with open(src_lang_dir / tsv_name, "rt", encoding="utf8") as fin, open(
                tgt_lang_dir / tsv_name, "wt", encoding="utf8"
            ) as fout:
                for idx, line in enumerate(fin):
                    # +1 because the header row does not count as an entry
                    if idx == args.num_entries + 1:
                        break
                    fout.write(line)
                    if idx == 0:  # skipping header
                        continue
                    # only the first column (utterance id) is needed here
                    utt_id = line.split("\t", 1)[0]
                    # audio files are grouped in folders named by the id's first four characters
                    year = utt_id[:4]
                    src_audio_path = src_lang_dir / year / (utt_id + ".ogg")
                    target_audio_dir = tgt_lang_dir / year
                    os.makedirs(target_audio_dir, exist_ok=True)
                    shutil.copy(src_audio_path, target_audio_dir / (utt_id + ".ogg"))
        # tarfile.open does not create missing parent folders
        os.makedirs(args.test_data_folder, exist_ok=True)
        # even though the voxpopuli processor expects untarred folder,
        # we still tar it to save time on the download from s3
        with tarfile.open(os.path.join(args.test_data_folder, "transcribed_data.tar.gz"), "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir_path / "transcribed_data", arcname="transcribed_data")
if __name__ == "__main__":
    # Produces "ytc_<language>.tar.gz": up to --num_entries files taken from
    # <extracted_data_path>/audios plus a manifest.jsonl describing them.
    parser = argparse.ArgumentParser("Preparing YTC test data")
    parser.add_argument("--extracted_data_path", required=True, help="Path to the downloaded and extracted data.")
    parser.add_argument(
        "--language",
        required=True,
        help="The name of the language, used to determine output file name ytc_{language}.tar.gz",
    )
    parser.add_argument("--num_entries", default=200, type=int, help="How many entries to keep (in each split)")
    parser.add_argument("--test_data_folder", required=True, help="Where to place the prepared data")

    args = parser.parse_args()
    with tempfile.TemporaryDirectory() as tmpdir:
        tmpdir_path = Path(tmpdir)
        split = "test"
        split_root = tmpdir_path / split
        source_root = Path(args.extracted_data_path)
        os.makedirs(split_root / "audio")
        with open(split_root / "manifest.jsonl", "w", encoding="utf-8") as manifest:
            for count, source_audio in enumerate(source_root.glob("audios/*")):
                if count == args.num_entries:
                    break

                # Mirror the file under <split>/audio, keeping its path relative to the source root
                copied_audio = split_root / "audio" / source_audio.relative_to(source_root)
                copied_audio.parent.mkdir(parents=True, exist_ok=True)
                shutil.copy2(source_audio, copied_audio)

                # One JSON object per line; the audio path is stored relative to the split folder
                record = {
                    "audio_filepath": str(copied_audio.relative_to(split_root)),
                    "audio_item_id": source_audio.stem,
                }
                manifest.write(f"{json.dumps(record)}\n")

        os.makedirs(args.test_data_folder, exist_ok=True)
        archive_path = os.path.join(args.test_data_folder, f"ytc_{args.language}.tar.gz")
        with tarfile.open(archive_path, "w:gz") as tar:
            # has to be the same as what's before .tar.gz
            tar.add(tmpdir, arcname=f"ytc_{args.language}")
def get_test_cases():
    """Yield the path of every yaml config under DATASET_CONFIGS_ROOT.

    Files whose name ends with ``nemo_run_config.yaml`` are left out.
    """
    pattern = f"{DATASET_CONFIGS_ROOT}/**/*.yaml"
    yield from (
        candidate
        for candidate in glob.glob(pattern, recursive=True)
        if not candidate.endswith("nemo_run_config.yaml")
    )
@pytest.mark.parametrize("config_path", get_test_cases())
def test_configs(config_path: str):
    """Instantiate every processor that declares ``test_cases`` and run its self-test."""
    # custom resolvers that some of the configs rely on
    resolvers = (
        ("subfield", lambda node, field: node[field]),
        ("not", lambda x: not x),
        ("equal", lambda field, value: field == value),
    )
    try:
        for resolver_name, resolver_fn in resolvers:
            OmegaConf.register_new_resolver(resolver_name, resolver_fn)
    except ValueError:  # already registered
        pass

    cfg = OmegaConf.load(config_path)
    cfg["data_split"] = "train"  # in case it's required for tests
    for processor_cfg in cfg.processors:
        if "test_cases" not in processor_cfg:
            continue
        with open_dict(processor_cfg):
            # blank out both manifest paths so that instantiating the processor
            # does not raise a MissingMandatoryValue error
            processor_cfg["output_manifest_file"] = None
            processor_cfg["input_manifest_file"] = None
            # should_run, when present, has to be removed before instantiation
            processor_cfg.pop("should_run", None)
        hydra.utils.instantiate(processor_cfg).test()
def _write_manifest(file_path, content: List[Dict]):
    """Write ``content`` to ``file_path`` as JSON lines, one entry per line."""
    with open(file_path, "w") as fout:
        fout.writelines(json.dumps(record) + "\n" for record in content)


def test_import_manager_with_workspace():
    """
    Check that ImportManager.sync_with_config writes the expected imports into
    ``__init__.py`` and leaves the manifests of the workspace untouched.
    """
    with tempfile.TemporaryDirectory() as tmp_workspace:
        workspace_dir = Path(tmp_workspace)

        # Only the first manifest is created up front
        test1_path, test2_path, test3_path = (
            workspace_dir / file_name for file_name in ("test1.json", "test2.json", "test3.json")
        )
        _write_manifest(test1_path, EXAMPLE_MANIFEST)

        # Render the yaml template for this workspace and store it
        yaml_file = workspace_dir / "config.yaml"
        with open(yaml_file, "w") as fout:
            fout.write(TEST_YAML_CONTENT.format(workspace_dir=workspace_dir))

        # Let ImportManager generate the __init__.py from the config
        init_file = workspace_dir / "__init__.py"
        ImportManager().sync_with_config(yaml_config=str(yaml_file), init_file=str(init_file))

        assert init_file.exists(), "__init__.py file should be created"

        with open(init_file, "r") as fin:
            init_content = fin.read()

        # One import line is expected per processor named in the config
        for expected_import in (
            "from sdp.processors.modify_manifest.common import DuplicateFields",
            "from sdp.processors.modify_manifest.common import RenameFields",
        ):
            assert expected_import in init_content, f"Expected import '{expected_import}' not found"

        # Syncing must not have produced the downstream manifests
        assert test1_path.exists(), "test1.json should exist"
        assert not test2_path.exists(), "test2.json should not be overwritten yet"
        assert not test3_path.exists(), "test3.json should not be overwritten yet"
@pytest.fixture
def cuts_path(tmp_path):
    """
    Create tmpdir with audio data referenced by a CutSet
    (two 1s utterances with text, speaker, gender, and language values of 'irrelevant').

    Returns the path of the saved ``cuts.jsonl.gz`` manifest.
    """
    p = tmp_path / "cuts.jsonl.gz"

    def drop_custom(c):
        # clear the custom fields of the cut and of its first supervision
        c.custom = None
        c.supervisions[0].custom = None
        return c

    (
        DummyManifest(CutSet, begin_id=0, end_id=2, with_data=True)
        .map(drop_custom)
        .save_audios(tmp_path / "audios")
        .to_file(p)
    )

    return p


def test_lhotse_import(tmp_path, cuts_path):
    """Run LhotseImport on the dummy CutSet and check every produced manifest entry."""
    out_path = tmp_path / "nemo_manifest.json"

    processor = LhotseImport(
        input_manifest_file=cuts_path, output_manifest_file=out_path
    )
    processor.process()

    EXPECTED_KEYS = {
        "audio_filepath",
        "lhotse_cut_id",
        "text",
        "duration",
        "speaker",
        "gender",
        "language",
    }

    # read inside a context manager so the file handle is closed deterministically
    with out_path.open() as fin:
        data = [json.loads(line) for line in fin]
    assert len(data) == 2

    for item in data:
        assert set(item.keys()) == EXPECTED_KEYS
        assert item["duration"] == 1.0
        audio, sr = soundfile.read(item["audio_filepath"])
        assert audio.shape == (16000,)
        for key in ("text", "speaker", "gender", "language"):
            assert item[key] == "irrelevant"
def test_submakelowercase_with_chunking(tmp_path):
    """SubMakeLowercase must lowercase all nine entries when only two are held in memory at a time."""
    input_lines = [
        {"text": text} for text in ("ABC", "DEF", "GHI", "JKL", "MNO", "PQR", "STU", "VWX", "YZ")
    ]
    expected_output_lines = [
        {"text": text} for text in ("abc", "def", "ghi", "jkl", "mno", "pqr", "stu", "vwx", "yz")
    ]

    # write the input manifest as JSON lines
    input_manifest_file = tmp_path / "input_manifest.json"
    with open(input_manifest_file, "w") as fout:
        fout.writelines(json.dumps(entry) + "\n" for entry in input_lines)

    # run the processor with a chunk size smaller than the manifest
    output_manifest_file = tmp_path / "output_manifest_make_lowercase.json"
    SubMakeLowercase(
        input_manifest_file=input_manifest_file,
        output_manifest_file=output_manifest_file,
        in_memory_chunksize=2,
    ).process()

    # the output manifest has to match the expected entries, in order
    with open(output_manifest_file, "r") as fin:
        output_lines = list(map(json.loads, fin))

    assert output_lines == expected_output_lines
def test_dropnonalphabet_with_chunking(tmp_path):
    """DropNonAlphabet with alphabet "ABC" must keep only the "ABC" entry when chunking is used."""
    input_lines = [
        {"text": text} for text in ("ABC", "DEF", "GHI", "JKL", "MNO", "PQR", "STU", "VWX", "YZ")
    ]
    expected_output_lines = [{"text": "ABC"}]

    # write the input manifest as JSON lines
    input_manifest_file = tmp_path / "input_manifest.json"
    with open(input_manifest_file, "w") as fout:
        fout.writelines(json.dumps(entry) + "\n" for entry in input_lines)

    # run the DropNonAlphabet processor with a chunk size smaller than the manifest
    output_manifest_file = tmp_path / "output_manifest_make_lowercase.json"
    DropNonAlphabet(
        input_manifest_file=input_manifest_file,
        output_manifest_file=output_manifest_file,
        in_memory_chunksize=2,
        alphabet="ABC",
    ).process()

    # the output manifest has to match the expected entries
    with open(output_manifest_file, "r") as fin:
        output_lines = list(map(json.loads, fin))

    assert output_lines == expected_output_lines
def _text_case(language, output_key, source_text, target_text):
    """Build one (class_kwargs, test_input, expected_output) parametrization tuple."""
    class_kwargs = {
        "input_text_key": "text",
        "input_language": language,
        "input_case": "cased",
        "output_text_key": output_key,
    }
    return class_kwargs, {"text": source_text}, {"text": source_text, output_key: target_text}


# written form -> spoken form
normalize_test_params_list = [
    _text_case("en", "normalized_text", "$12", "twelve dollars"),
    _text_case("en", "normalized_text", "120", "one hundred and twenty"),
    _text_case("hy", "normalized_text", "11", "տասնմեկ"),
]


@pytest.mark.parametrize("class_kwargs,test_input,expected_output", normalize_test_params_list, ids=str)
def test_normalize_text(class_kwargs, test_input, expected_output):
    """NormalizeText must add the normalized text under the configured output key."""
    processor = NormalizeText(**class_kwargs, output_manifest_file=None)
    processor.prepare()

    first_entry = processor.process_dataset_entry(test_input)[0]

    assert first_entry.data == expected_output


# spoken form -> written form
inverse_normalize_test_params_list = [
    _text_case("en", "inverse_normalized_text", "twelve dollars", "$12"),
    _text_case("en", "inverse_normalized_text", "one hundred and twenty", "120"),
    _text_case("hy", "inverse_normalized_text", "տասնմեկ", "11"),
]


@pytest.mark.parametrize("class_kwargs,test_input,expected_output", inverse_normalize_test_params_list, ids=str)
def test_inverse_normalize_text(class_kwargs, test_input, expected_output):
    """InverseNormalizeText must add the inverse-normalized text under the configured output key."""
    processor = InverseNormalizeText(**class_kwargs, output_manifest_file=None)
    processor.prepare()

    first_entry = processor.process_dataset_entry(test_input)[0]

    assert first_entry.data == expected_output
s3.download_file( 22 | "sdp-test-data", 23 | "test_data/tts/ytc/test_data_reference.json", 24 | tmpdir/"test_data_reference.json", 25 | ) 26 | 27 | s3.download_file( 28 | "sdp-test-data", 29 | "test_data/tts/ytc/ytc.en.tar.gz", 30 | tmpdir/"ytc.en.tar.gz", 31 | ) 32 | 33 | # Extract the tar.gz file 34 | with tarfile.open(tmpdir/"ytc.en.tar.gz", "r:gz") as tar: 35 | tar.extractall(tmpdir) 36 | 37 | audio_files = Path(tmpdir).glob("audios/*") 38 | with open(os.path.join(tmpdir, "input_manifest.jsonl"), "w") as f: 39 | for audio_file in audio_files: 40 | data = { 41 | "audio_filepath": f"{tmpdir}/audios/{audio_file.name}", 42 | "audio_item_id": audio_file.stem, 43 | } 44 | f.write(json.dumps(data) + "\n") 45 | 46 | return tmpdir 47 | 48 | def test_tts_sdp_end_to_end(get_tts_ytc_data): 49 | data_dir = get_tts_ytc_data 50 | assert os.path.exists(data_dir) 51 | config_path = DATASET_CONFIGS_ROOT / "tts/ytc/config.yaml" 52 | input_manifest_file = os.path.join(data_dir, "input_manifest.jsonl") 53 | reference_manifest_file = os.path.join(data_dir, "test_data_reference.json") 54 | 55 | cfg = OmegaConf.load(config_path) 56 | cfg.hf_token = os.getenv("HF_SECRET_KEY") 57 | cfg.final_manifest = os.path.join(data_dir, "output_manifest.jsonl") 58 | cfg.raw_audio_dir = os.path.join(data_dir, "audios") 59 | cfg.data_split = "train" 60 | cfg.device = "cpu" 61 | cfg.language_short = "en" 62 | cfg.processors[3].model_name = "nvidia/stt_en_fastconformer_ctc_large" 63 | cfg.processors[3].parakeet = False 64 | cfg.processors[3].ctc = True 65 | cfg.processors[0].input_manifest_file = input_manifest_file 66 | 67 | run_processors(cfg) 68 | 69 | assert os.path.exists(cfg.final_manifest) 70 | output_file_data = {} 71 | output_data = load_manifest(cfg.final_manifest, encoding="utf8") 72 | for item in output_data: 73 | output_file_data[item["audio_item_id"]] = item 74 | 75 | reference_file_data = {} 76 | reference_data = load_manifest(reference_manifest_file, encoding="utf8") 77 | for item in 
reference_data: 78 | reference_file_data[item["audio_item_id"]] = item 79 | 80 | assert len(output_file_data) == len(reference_file_data) 81 | assert len(output_file_data) == 2 82 | for audio_item_id in output_file_data: 83 | assert output_file_data[audio_item_id]["segments"] == reference_file_data[audio_item_id]["segments"] -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- 1 | # Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved. 2 | # 3 | # Licensed under the Apache License, Version 2.0 (the "License"); 4 | # you may not use this file except in compliance with the License. 5 | # You may obtain a copy of the License at 6 | # 7 | # http://www.apache.org/licenses/LICENSE-2.0 8 | # 9 | # Unless required by applicable law or agreed to in writing, software 10 | # distributed under the License is distributed on an "AS IS" BASIS, 11 | # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 12 | # See the License for the specific language governing permissions and 13 | # limitations under the License. 14 | 15 | import pytest 16 | 17 | from sdp.utils.edit_spaces import add_start_end_spaces, remove_extra_spaces 18 | 19 | 20 | @pytest.mark.parametrize("input,expected_output", [("abc xyz abc xyz", "abc xyz abc xyz"), (" abc xyz ", "abc xyz")]) 21 | def test_remove_extra_spaces(input, expected_output): 22 | assert remove_extra_spaces(input) == expected_output 23 | 24 | 25 | @pytest.mark.parametrize("input,expected_output", [("abc", " abc "), ("abc xyz", " abc xyz ")]) 26 | def test_add_start_end_spaces(input, expected_output): 27 | assert add_start_end_spaces(input) == expected_output 28 | --------------------------------------------------------------------------------