├── .github └── workflows │ ├── doc-build.yml │ ├── docker_pull.yml │ ├── docker_tts_sdp_test.yml │ ├── importmanager.yml │ └── tests.yml ├── .gitignore ├── .pre-commit-config.yaml ├── CONTRIBUTING.md ├── LICENSE ├── README.md ├── __init__.py ├── dataset_configs ├── arabic │ ├── everyayah │ │ └── config.yaml │ ├── fleurs │ │ └── config.yaml │ ├── masc │ │ ├── config.yaml │ │ └── config_filter_noisy_train.yaml │ ├── mcv │ │ └── config.yaml │ ├── mediaspeech │ │ └── config.yaml │ └── readme.md ├── armenian │ ├── audio_books │ │ └── config.yaml │ ├── fleurs │ │ └── config.yaml │ ├── text_mcv │ │ └── config.yaml │ └── toloka │ │ ├── pipeline_get_final_res.yaml │ │ ├── pipeline_start.yaml │ │ └── pipeline_validate_answers.yaml ├── commoncrawl │ └── README.md ├── english │ ├── coraal │ │ └── config.yaml │ ├── earnings │ │ └── config.yaml │ ├── hifitts2 │ │ ├── config_22khz.yaml │ │ ├── config_44khz.yaml │ │ └── config_bandwidth.yaml │ ├── librispeech │ │ ├── all.yaml │ │ ├── config.yaml │ │ └── mini.yaml │ └── slr83 │ │ └── config.yaml ├── georgian │ └── mcv │ │ └── config.yaml ├── ipl │ ├── config.yaml │ └── nemo_run_config.yaml ├── italian │ ├── mcv │ │ └── config.yaml │ ├── mls │ │ ├── config.yaml │ │ └── config_nopc.yaml │ └── voxpopuli │ │ └── config.yaml ├── kazakh │ ├── ksc2 │ │ └── config.yaml │ ├── mcv │ │ └── config.yaml │ ├── slr102 │ │ └── config.yaml │ └── slr140 │ │ └── config.yaml ├── multilingual │ └── granary │ │ ├── README.md │ │ ├── config.yaml │ │ └── partials │ │ ├── common_phrases │ │ ├── bg.txt │ │ ├── cs.txt │ │ ├── da.txt │ │ ├── de.txt │ │ ├── el.txt │ │ ├── en.txt │ │ ├── es.txt │ │ ├── et.txt │ │ ├── fi.txt │ │ ├── fr.txt │ │ ├── hr.txt │ │ ├── hu.txt │ │ ├── it.txt │ │ ├── lt.txt │ │ ├── lv.txt │ │ ├── mt.txt │ │ ├── nl.txt │ │ ├── pl.txt │ │ ├── pt.txt │ │ ├── ro.txt │ │ ├── sk.txt │ │ ├── sl.txt │ │ └── sv.txt │ │ ├── pr_recovery_prompts │ │ ├── bg.yaml │ │ ├── cs.yaml │ │ ├── da.yaml │ │ ├── de.yaml │ │ ├── el.yaml │ │ ├── en.yaml │ │ ├── es.yaml │ │ ├── et.yaml │ │ ├── fi.yaml │ │ ├── fr.yaml │ │ ├── hr.yaml │ │ ├── hu.yaml │ │ ├── it.yaml │ │ ├── lt.yaml │ │ ├── lv.yaml │ │ ├── mt.yaml │ │ ├── nl.yaml │ │ ├── pl.yaml │ │ ├── pt.yaml │ │ ├── ro.yaml │ │ ├── ru.yaml │ │ ├── sk.yaml │ │ ├── sl.yaml │ │ ├── sv.yaml │ │ └── uk.yaml │ │ └── subregex_params │ │ ├── bg.yaml │ │ ├── common.yaml │ │ ├── cs.yaml │ │ ├── da.yaml │ │ ├── de.yaml │ │ ├── el.yaml │ │ ├── en.yaml │ │ ├── es.yaml │ │ ├── et.yaml │ │ ├── fi.yaml │ │ ├── fr.yaml │ │ ├── hr.yaml │ │ ├── hu.yaml │ │ ├── it.yaml │ │ ├── lt.yaml │ │ ├── lv.yaml │ │ ├── mt.yaml │ │ ├── nl.yaml │ │ ├── pl.yaml │ │ ├── pt.yaml │ │ ├── ro.yaml │ │ ├── ru.yaml │ │ ├── sk.yaml │ │ ├── sl.yaml │ │ ├── sv.yaml │ │ └── uk.yaml ├── portuguese │ ├── coraa │ │ └── config.yaml │ ├── mcv │ │ └── config.yaml │ ├── mls │ │ └── config.yaml │ ├── mtedx │ │ └── config.yaml │ └── unlabeled │ │ └── config.yaml ├── spanish │ └── mls │ │ ├── config.yaml │ │ └── unique_processors │ │ ├── 1-100_roman_numeral_table.csv │ │ └── clean_roman_numerals.py ├── spanish_pc │ ├── fisher │ │ ├── config.yaml │ │ └── unique_processors │ │ │ └── create_initial_manifest_fisher_spanish.py │ ├── mcv12 │ │ └── config.yaml │ ├── mls │ │ └── config.yaml │ └── voxpopuli │ │ └── config.yaml ├── tts │ └── ytc │ │ └── config.yaml └── uzbek │ ├── fleurs │ └── config.yaml │ ├── mcv │ └── config.yaml │ └── uzbekvoice │ └── config.yaml ├── docker ├── Dockerfile └── Dockerfile.tts_sdp ├── docs ├── Makefile ├── README.md ├── gen_docs.py └── src │ ├── _static │ ├── css │ │ └── custom.css │ └── js │ │ └── pk_scripts.js │ ├── _templates │ └── layout.html │ ├── conf.py │ ├── favicon.ico │ ├── index.rst │ └── sdp │ ├── adding_processors.rst │ ├── api.rst │ ├── config_structure.rst │ └── existing_configs.rst ├── main.py ├── pytest.ini ├── requirements ├── docs.txt ├── huggingface.txt ├── ipl.txt ├── main.txt ├── tests.txt └── tts.txt ├── sdp ├── __init__.py ├── logging.py ├── processors │ ├── __init__.py │ ├── base_processor.py │ ├── datasets │ │ ├── __init__.py │ │ ├── commoncrawl │ │ │ ├── __init__.py │ │ │ ├── commoncrawl.py │ │ │ └── harv_utils.py │ │ ├── coraa │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── coraal │ │ │ ├── __init__.py │ │ │ ├── create_initial_manifest.py │ │ │ └── data_splits.py │ │ ├── earnings │ │ │ ├── __init__.py │ │ │ ├── apply_normalizations.py │ │ │ └── create_initial_manifest.py │ │ ├── fleurs │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── hifitts2 │ │ │ ├── __init__.py │ │ │ ├── download_dataset.py │ │ │ └── remove_failed_chapters.py │ │ ├── ksc2 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── lhotse.py │ │ ├── librispeech │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── masc │ │ │ ├── __init__.py │ │ │ ├── aggregate_segments.py │ │ │ ├── apply_reg_exp_on_vtt_entries.py │ │ │ ├── create_initial_manifest.py │ │ │ ├── get_caption_file_segments.py │ │ │ └── utils.py │ │ ├── mcv │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── mediaspeech │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── mls │ │ │ ├── __init__.py │ │ │ ├── create_initial_manifest.py │ │ │ └── restore_pc.py │ │ ├── mtedx │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── slr102 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── slr140 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── slr83 │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── uzbekvoice │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ │ ├── voxpopuli │ │ │ ├── __init__.py │ │ │ ├── create_initial_manifest.py │ │ │ └── normalize_from_non_pc_text.py │ │ └── ytc │ │ │ ├── __init__.py │ │ │ └── create_initial_manifest.py │ ├── huggingface │ │ ├── __init__.py │ │ └── create_initial_manifest.py │ ├── inference │ │ ├── asr │ │ │ ├── faster_whisper │ │ │ │ └── faster_whisper_inference.py │ │ │ ├── nemo │ │ │ │ ├── asr_inference.py │ │ │ │ ├── lid_inference.py │ │ │ │ └── utils │ │ │ │ │ ├── frame_vad_infer_postprocess.yaml │ │ │ │ │ ├── speech_to_text_with_vad.py │ │ │ │ │ └── transcribe_speech.py │ │ │ ├── transformers │ │ │ │ └── speech_recognition.py │ │ │ └── utils │ │ │ │ ├── rttm.py │ │ │ │ └── whisper_hallucinations.py │ │ ├── llm │ │ │ ├── utils │ │ │ │ └── qwen_cleaning.py │ │ │ └── vllm │ │ │ │ └── vllm.py │ │ ├── nlp │ │ │ ├── fasttext │ │ │ │ └── fasttext.py │ │ │ └── nemo │ │ │ │ └── pc_inference.py │ │ └── quality_estimation │ │ │ └── pymarian.py │ ├── ipl │ │ ├── README.md │ │ ├── __init__.py │ │ ├── ipl_processors.py │ │ └── nemo_run_processor.py │ ├── langs │ │ ├── __init__.py │ │ ├── arabic.py │ │ ├── armenian.py │ │ └── kazakh.py │ ├── manage_files │ │ ├── __init__.py │ │ ├── convert_audio.py │ │ ├── convert_to_tarred_audio_dataset.py │ │ ├── extract.py │ │ ├── remove.py │ │ └── utils │ │ │ ├── convert_to_tarred_audio_dataset.py │ │ │ └── create_dali_tarred_dataset_index.py │ ├── modify_manifest │ │ ├── __init__.py │ │ ├── common.py │ │ ├── create_manifest.py │ │ ├── data_to_data.py │ │ ├── data_to_dropbool.py │ │ └── make_letters_uppercase_after_period.py │ ├── toloka │ │ ├── __init__.py │ │ ├── accept_if.py │ │ ├── create_pool.py │ │ ├── create_project.py │ │ ├── create_sentence_set.py │ │ ├── create_task_set.py │ │ ├── download_responses.py │ │ └── reject_if.py │ └── tts │ │ ├── README.md │ │ ├── __init__.py │ │ ├── merge_alignment_diarization.py │ │ ├── metrics.py │ │ ├── nemo_asr_align.py │ │ ├── prepare_tts_segments.py │ │ ├── pyannote.py │ │ ├── split.py │ │ └── text.py ├── run_processors.py └── utils │ ├── __init__.py │ ├── apply_operators.py │ ├── bootstrap_estimates.py │ ├── common.py │ ├── edit_spaces.py │ ├── get_diff.py │ ├── import_manager.py │ ├── ipl_utils.py │ ├── metrics_computation.py │ ├── nemo_run_utils.py │ └── skills_utils.py ├── setup.py └── tests ├── README.md ├── __init__.py ├── prepare_test_data ├── prepare_coraa_data.py ├── prepare_fleurs_data.py ├── prepare_hifitts2_data.py ├── prepare_huggingface_data.py ├── prepare_ksc2_data.py ├── prepare_masc_data.py ├── prepare_mcv_data.py ├── prepare_mediaspeech_data.py ├── prepare_mls_data.py ├── prepare_mtedx_data.py ├── prepare_slr102_data.py ├── prepare_slr140_data.py ├── prepare_voxpopuli_data.py └── prepare_ytc_data.py ├── test_bootstrap_estimate.py ├── test_cfg_end_to_end_tests.py ├── test_cfg_runtime_tests.py ├── test_cometoid_qe.py ├── test_convert_to_tarred_audio_dataset.py ├── test_data_to_data.py ├── test_data_to_dropbool.py ├── test_fasttext_inference.py ├── test_import_manager.py ├── test_lhotse.py ├── test_manifest_chunking.py ├── test_modify_manifest.py ├── test_normalize_text.py ├── test_tts_sdp_end_to_end.py └── test_utils.py /.github/workflows/doc-build.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/.github/workflows/doc-build.yml -------------------------------------------------------------------------------- /.github/workflows/docker_pull.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/.github/workflows/docker_pull.yml -------------------------------------------------------------------------------- /.github/workflows/docker_tts_sdp_test.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/.github/workflows/docker_tts_sdp_test.yml -------------------------------------------------------------------------------- /.github/workflows/importmanager.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/.github/workflows/importmanager.yml -------------------------------------------------------------------------------- /.github/workflows/tests.yml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/.github/workflows/tests.yml -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/.gitignore -------------------------------------------------------------------------------- /.pre-commit-config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/.pre-commit-config.yaml -------------------------------------------------------------------------------- /CONTRIBUTING.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/CONTRIBUTING.md -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/README.md -------------------------------------------------------------------------------- /__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/__init__.py -------------------------------------------------------------------------------- /dataset_configs/arabic/everyayah/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/arabic/everyayah/config.yaml -------------------------------------------------------------------------------- /dataset_configs/arabic/fleurs/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/arabic/fleurs/config.yaml -------------------------------------------------------------------------------- /dataset_configs/arabic/masc/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/arabic/masc/config.yaml -------------------------------------------------------------------------------- /dataset_configs/arabic/masc/config_filter_noisy_train.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/arabic/masc/config_filter_noisy_train.yaml -------------------------------------------------------------------------------- /dataset_configs/arabic/mcv/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/arabic/mcv/config.yaml -------------------------------------------------------------------------------- /dataset_configs/arabic/mediaspeech/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/arabic/mediaspeech/config.yaml -------------------------------------------------------------------------------- /dataset_configs/arabic/readme.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/arabic/readme.md -------------------------------------------------------------------------------- /dataset_configs/armenian/audio_books/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/armenian/audio_books/config.yaml -------------------------------------------------------------------------------- /dataset_configs/armenian/fleurs/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/armenian/fleurs/config.yaml -------------------------------------------------------------------------------- /dataset_configs/armenian/text_mcv/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/armenian/text_mcv/config.yaml -------------------------------------------------------------------------------- /dataset_configs/armenian/toloka/pipeline_get_final_res.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/armenian/toloka/pipeline_get_final_res.yaml -------------------------------------------------------------------------------- /dataset_configs/armenian/toloka/pipeline_start.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/armenian/toloka/pipeline_start.yaml -------------------------------------------------------------------------------- /dataset_configs/armenian/toloka/pipeline_validate_answers.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/armenian/toloka/pipeline_validate_answers.yaml -------------------------------------------------------------------------------- /dataset_configs/commoncrawl/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/commoncrawl/README.md -------------------------------------------------------------------------------- /dataset_configs/english/coraal/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/coraal/config.yaml -------------------------------------------------------------------------------- /dataset_configs/english/earnings/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/earnings/config.yaml -------------------------------------------------------------------------------- /dataset_configs/english/hifitts2/config_22khz.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/hifitts2/config_22khz.yaml -------------------------------------------------------------------------------- /dataset_configs/english/hifitts2/config_44khz.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/hifitts2/config_44khz.yaml -------------------------------------------------------------------------------- /dataset_configs/english/hifitts2/config_bandwidth.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/hifitts2/config_bandwidth.yaml -------------------------------------------------------------------------------- /dataset_configs/english/librispeech/all.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/librispeech/all.yaml -------------------------------------------------------------------------------- /dataset_configs/english/librispeech/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/librispeech/config.yaml -------------------------------------------------------------------------------- /dataset_configs/english/librispeech/mini.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/librispeech/mini.yaml -------------------------------------------------------------------------------- /dataset_configs/english/slr83/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/english/slr83/config.yaml -------------------------------------------------------------------------------- /dataset_configs/georgian/mcv/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/georgian/mcv/config.yaml -------------------------------------------------------------------------------- /dataset_configs/ipl/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/ipl/config.yaml -------------------------------------------------------------------------------- /dataset_configs/ipl/nemo_run_config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/ipl/nemo_run_config.yaml -------------------------------------------------------------------------------- /dataset_configs/italian/mcv/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/italian/mcv/config.yaml -------------------------------------------------------------------------------- /dataset_configs/italian/mls/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/italian/mls/config.yaml -------------------------------------------------------------------------------- /dataset_configs/italian/mls/config_nopc.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/italian/mls/config_nopc.yaml -------------------------------------------------------------------------------- /dataset_configs/italian/voxpopuli/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/italian/voxpopuli/config.yaml -------------------------------------------------------------------------------- /dataset_configs/kazakh/ksc2/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/kazakh/ksc2/config.yaml -------------------------------------------------------------------------------- /dataset_configs/kazakh/mcv/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/kazakh/mcv/config.yaml -------------------------------------------------------------------------------- /dataset_configs/kazakh/slr102/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/kazakh/slr102/config.yaml -------------------------------------------------------------------------------- /dataset_configs/kazakh/slr140/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/kazakh/slr140/config.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/README.md -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/config.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/bg.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/bg.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/cs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/cs.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/da.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/da.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/de.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/el.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/el.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/en.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/es.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/es.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/et.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/et.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/fi.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/fi.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/fr.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/fr.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/hr.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/hr.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/hu.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/hu.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/it.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/it.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/lt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/lt.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/lv.txt: -------------------------------------------------------------------------------- 1 | Paldies 3222 2 | Plāksmē 64 3 | -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/mt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/mt.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/nl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/nl.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/pl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/pl.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/pt.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/pt.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/ro.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/ro.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/sk.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/sk.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/sl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/sl.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/common_phrases/sv.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/common_phrases/sv.txt -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/bg.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/bg.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/cs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/cs.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/da.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/da.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/de.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/de.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/el.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/el.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/en.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/en.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/es.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/es.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/et.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/et.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/fi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/fi.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/fr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/fr.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/hr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/hr.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/hu.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/hu.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/it.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/it.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/lt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/lt.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/lv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/lv.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/mt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/mt.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/nl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/nl.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/pl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/pl.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/pt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/pt.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/ro.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/ro.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/ru.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/ru.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/sk.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/sk.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/sl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/sl.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/sv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/sv.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/pr_recovery_prompts/uk.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/pr_recovery_prompts/uk.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/bg.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/bg.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/common.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/common.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/cs.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/cs.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/da.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/da.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/de.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/de.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/el.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/el.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/en.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/en.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/es.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/es.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/et.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/et.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/fi.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/fi.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/fr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/fr.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/hr.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/hr.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/hu.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/hu.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/it.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/it.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/lt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/lt.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/lv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/lv.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/mt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/mt.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/nl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/nl.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/pl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/pl.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/pt.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/pt.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/ro.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/ro.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/ru.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/ru.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/sk.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/sk.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/sl.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/sl.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/sv.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/sv.yaml -------------------------------------------------------------------------------- /dataset_configs/multilingual/granary/partials/subregex_params/uk.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/multilingual/granary/partials/subregex_params/uk.yaml -------------------------------------------------------------------------------- /dataset_configs/portuguese/coraa/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/portuguese/coraa/config.yaml -------------------------------------------------------------------------------- /dataset_configs/portuguese/mcv/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/portuguese/mcv/config.yaml -------------------------------------------------------------------------------- /dataset_configs/portuguese/mls/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/portuguese/mls/config.yaml -------------------------------------------------------------------------------- /dataset_configs/portuguese/mtedx/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/portuguese/mtedx/config.yaml -------------------------------------------------------------------------------- /dataset_configs/portuguese/unlabeled/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/portuguese/unlabeled/config.yaml -------------------------------------------------------------------------------- /dataset_configs/spanish/mls/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish/mls/config.yaml -------------------------------------------------------------------------------- /dataset_configs/spanish/mls/unique_processors/1-100_roman_numeral_table.csv: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish/mls/unique_processors/1-100_roman_numeral_table.csv -------------------------------------------------------------------------------- /dataset_configs/spanish/mls/unique_processors/clean_roman_numerals.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish/mls/unique_processors/clean_roman_numerals.py -------------------------------------------------------------------------------- /dataset_configs/spanish_pc/fisher/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish_pc/fisher/config.yaml -------------------------------------------------------------------------------- /dataset_configs/spanish_pc/fisher/unique_processors/create_initial_manifest_fisher_spanish.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish_pc/fisher/unique_processors/create_initial_manifest_fisher_spanish.py -------------------------------------------------------------------------------- /dataset_configs/spanish_pc/mcv12/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish_pc/mcv12/config.yaml -------------------------------------------------------------------------------- /dataset_configs/spanish_pc/mls/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish_pc/mls/config.yaml -------------------------------------------------------------------------------- /dataset_configs/spanish_pc/voxpopuli/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/spanish_pc/voxpopuli/config.yaml -------------------------------------------------------------------------------- /dataset_configs/tts/ytc/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/tts/ytc/config.yaml -------------------------------------------------------------------------------- /dataset_configs/uzbek/fleurs/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/uzbek/fleurs/config.yaml -------------------------------------------------------------------------------- /dataset_configs/uzbek/mcv/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/uzbek/mcv/config.yaml -------------------------------------------------------------------------------- /dataset_configs/uzbek/uzbekvoice/config.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/dataset_configs/uzbek/uzbekvoice/config.yaml -------------------------------------------------------------------------------- /docker/Dockerfile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docker/Dockerfile -------------------------------------------------------------------------------- /docker/Dockerfile.tts_sdp: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docker/Dockerfile.tts_sdp -------------------------------------------------------------------------------- /docs/Makefile: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/Makefile -------------------------------------------------------------------------------- /docs/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/README.md -------------------------------------------------------------------------------- /docs/gen_docs.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/gen_docs.py -------------------------------------------------------------------------------- /docs/src/_static/css/custom.css: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/_static/css/custom.css -------------------------------------------------------------------------------- /docs/src/_static/js/pk_scripts.js: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/_static/js/pk_scripts.js -------------------------------------------------------------------------------- /docs/src/_templates/layout.html: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/_templates/layout.html -------------------------------------------------------------------------------- /docs/src/conf.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/conf.py -------------------------------------------------------------------------------- /docs/src/favicon.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/favicon.ico -------------------------------------------------------------------------------- /docs/src/index.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/index.rst -------------------------------------------------------------------------------- /docs/src/sdp/adding_processors.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/sdp/adding_processors.rst -------------------------------------------------------------------------------- /docs/src/sdp/api.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/sdp/api.rst -------------------------------------------------------------------------------- /docs/src/sdp/config_structure.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/sdp/config_structure.rst -------------------------------------------------------------------------------- /docs/src/sdp/existing_configs.rst: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/docs/src/sdp/existing_configs.rst -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/main.py -------------------------------------------------------------------------------- /pytest.ini: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/pytest.ini -------------------------------------------------------------------------------- /requirements/docs.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/requirements/docs.txt -------------------------------------------------------------------------------- /requirements/huggingface.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/requirements/huggingface.txt -------------------------------------------------------------------------------- /requirements/ipl.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/requirements/ipl.txt -------------------------------------------------------------------------------- /requirements/main.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/requirements/main.txt -------------------------------------------------------------------------------- /requirements/tests.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/requirements/tests.txt -------------------------------------------------------------------------------- /requirements/tts.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/requirements/tts.txt -------------------------------------------------------------------------------- /sdp/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/__init__.py -------------------------------------------------------------------------------- /sdp/logging.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/logging.py -------------------------------------------------------------------------------- /sdp/processors/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/__init__.py -------------------------------------------------------------------------------- /sdp/processors/base_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/base_processor.py -------------------------------------------------------------------------------- /sdp/processors/datasets/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/commoncrawl/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/commoncrawl/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/commoncrawl/commoncrawl.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/commoncrawl/commoncrawl.py -------------------------------------------------------------------------------- /sdp/processors/datasets/commoncrawl/harv_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/commoncrawl/harv_utils.py -------------------------------------------------------------------------------- /sdp/processors/datasets/coraa/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/coraa/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/coraa/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/coraal/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/coraal/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/coraal/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/coraal/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/coraal/data_splits.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/coraal/data_splits.py -------------------------------------------------------------------------------- /sdp/processors/datasets/earnings/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/earnings/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/earnings/apply_normalizations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/earnings/apply_normalizations.py -------------------------------------------------------------------------------- /sdp/processors/datasets/earnings/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/earnings/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/fleurs/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/fleurs/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/fleurs/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/hifitts2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/hifitts2/download_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/hifitts2/download_dataset.py -------------------------------------------------------------------------------- /sdp/processors/datasets/hifitts2/remove_failed_chapters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/hifitts2/remove_failed_chapters.py -------------------------------------------------------------------------------- /sdp/processors/datasets/ksc2/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/ksc2/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/ksc2/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/lhotse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/lhotse.py -------------------------------------------------------------------------------- /sdp/processors/datasets/librispeech/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/librispeech/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/librispeech/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/masc/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/masc/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/masc/aggregate_segments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/masc/aggregate_segments.py -------------------------------------------------------------------------------- /sdp/processors/datasets/masc/apply_reg_exp_on_vtt_entries.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/masc/apply_reg_exp_on_vtt_entries.py -------------------------------------------------------------------------------- /sdp/processors/datasets/masc/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/masc/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/masc/get_caption_file_segments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/masc/get_caption_file_segments.py -------------------------------------------------------------------------------- /sdp/processors/datasets/masc/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/masc/utils.py -------------------------------------------------------------------------------- /sdp/processors/datasets/mcv/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/mcv/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/mcv/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/mediaspeech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/mediaspeech/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/mediaspeech/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/mediaspeech/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/mls/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/mls/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/mls/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/mls/restore_pc.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/mls/restore_pc.py -------------------------------------------------------------------------------- /sdp/processors/datasets/mtedx/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/mtedx/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/mtedx/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/slr102/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/slr102/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/slr102/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/slr140/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/slr140/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/slr140/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/slr83/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/slr83/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/slr83/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/uzbekvoice/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/uzbekvoice/__init__.py -------------------------------------------------------------------------------- /sdp/processors/datasets/uzbekvoice/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/uzbekvoice/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/voxpopuli/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/voxpopuli/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/voxpopuli/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/datasets/voxpopuli/normalize_from_non_pc_text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/voxpopuli/normalize_from_non_pc_text.py -------------------------------------------------------------------------------- /sdp/processors/datasets/ytc/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/datasets/ytc/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/datasets/ytc/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/huggingface/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/huggingface/create_initial_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/huggingface/create_initial_manifest.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/faster_whisper/faster_whisper_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/faster_whisper/faster_whisper_inference.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/nemo/asr_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/nemo/asr_inference.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/nemo/lid_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/nemo/lid_inference.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/nemo/utils/frame_vad_infer_postprocess.yaml: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/nemo/utils/frame_vad_infer_postprocess.yaml -------------------------------------------------------------------------------- /sdp/processors/inference/asr/nemo/utils/speech_to_text_with_vad.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/nemo/utils/speech_to_text_with_vad.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/nemo/utils/transcribe_speech.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/nemo/utils/transcribe_speech.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/transformers/speech_recognition.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/transformers/speech_recognition.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/utils/rttm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/utils/rttm.py -------------------------------------------------------------------------------- /sdp/processors/inference/asr/utils/whisper_hallucinations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/asr/utils/whisper_hallucinations.py -------------------------------------------------------------------------------- /sdp/processors/inference/llm/utils/qwen_cleaning.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/llm/utils/qwen_cleaning.py -------------------------------------------------------------------------------- /sdp/processors/inference/llm/vllm/vllm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/llm/vllm/vllm.py -------------------------------------------------------------------------------- /sdp/processors/inference/nlp/fasttext/fasttext.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/nlp/fasttext/fasttext.py -------------------------------------------------------------------------------- /sdp/processors/inference/nlp/nemo/pc_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/nlp/nemo/pc_inference.py -------------------------------------------------------------------------------- /sdp/processors/inference/quality_estimation/pymarian.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/inference/quality_estimation/pymarian.py -------------------------------------------------------------------------------- /sdp/processors/ipl/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/ipl/README.md -------------------------------------------------------------------------------- /sdp/processors/ipl/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /sdp/processors/ipl/ipl_processors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/ipl/ipl_processors.py -------------------------------------------------------------------------------- /sdp/processors/ipl/nemo_run_processor.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/ipl/nemo_run_processor.py -------------------------------------------------------------------------------- /sdp/processors/langs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/langs/__init__.py -------------------------------------------------------------------------------- /sdp/processors/langs/arabic.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/langs/arabic.py -------------------------------------------------------------------------------- /sdp/processors/langs/armenian.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/langs/armenian.py -------------------------------------------------------------------------------- /sdp/processors/langs/kazakh.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/langs/kazakh.py -------------------------------------------------------------------------------- /sdp/processors/manage_files/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/manage_files/__init__.py -------------------------------------------------------------------------------- /sdp/processors/manage_files/convert_audio.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/manage_files/convert_audio.py -------------------------------------------------------------------------------- /sdp/processors/manage_files/convert_to_tarred_audio_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/manage_files/convert_to_tarred_audio_dataset.py -------------------------------------------------------------------------------- /sdp/processors/manage_files/extract.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/manage_files/extract.py -------------------------------------------------------------------------------- /sdp/processors/manage_files/remove.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/manage_files/remove.py -------------------------------------------------------------------------------- /sdp/processors/manage_files/utils/convert_to_tarred_audio_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/manage_files/utils/convert_to_tarred_audio_dataset.py -------------------------------------------------------------------------------- /sdp/processors/manage_files/utils/create_dali_tarred_dataset_index.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/manage_files/utils/create_dali_tarred_dataset_index.py -------------------------------------------------------------------------------- /sdp/processors/modify_manifest/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/modify_manifest/__init__.py -------------------------------------------------------------------------------- /sdp/processors/modify_manifest/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/modify_manifest/common.py -------------------------------------------------------------------------------- /sdp/processors/modify_manifest/create_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/modify_manifest/create_manifest.py -------------------------------------------------------------------------------- /sdp/processors/modify_manifest/data_to_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/modify_manifest/data_to_data.py -------------------------------------------------------------------------------- /sdp/processors/modify_manifest/data_to_dropbool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/modify_manifest/data_to_dropbool.py -------------------------------------------------------------------------------- /sdp/processors/modify_manifest/make_letters_uppercase_after_period.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/modify_manifest/make_letters_uppercase_after_period.py -------------------------------------------------------------------------------- /sdp/processors/toloka/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/__init__.py -------------------------------------------------------------------------------- /sdp/processors/toloka/accept_if.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/accept_if.py -------------------------------------------------------------------------------- /sdp/processors/toloka/create_pool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/create_pool.py -------------------------------------------------------------------------------- /sdp/processors/toloka/create_project.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/create_project.py -------------------------------------------------------------------------------- /sdp/processors/toloka/create_sentence_set.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/create_sentence_set.py -------------------------------------------------------------------------------- /sdp/processors/toloka/create_task_set.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/create_task_set.py -------------------------------------------------------------------------------- /sdp/processors/toloka/download_responses.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/download_responses.py -------------------------------------------------------------------------------- /sdp/processors/toloka/reject_if.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/toloka/reject_if.py -------------------------------------------------------------------------------- /sdp/processors/tts/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/README.md -------------------------------------------------------------------------------- /sdp/processors/tts/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/__init__.py -------------------------------------------------------------------------------- /sdp/processors/tts/merge_alignment_diarization.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/merge_alignment_diarization.py -------------------------------------------------------------------------------- /sdp/processors/tts/metrics.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/metrics.py -------------------------------------------------------------------------------- /sdp/processors/tts/nemo_asr_align.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/nemo_asr_align.py -------------------------------------------------------------------------------- /sdp/processors/tts/prepare_tts_segments.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/prepare_tts_segments.py -------------------------------------------------------------------------------- /sdp/processors/tts/pyannote.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/pyannote.py -------------------------------------------------------------------------------- /sdp/processors/tts/split.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/split.py -------------------------------------------------------------------------------- /sdp/processors/tts/text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/processors/tts/text.py -------------------------------------------------------------------------------- /sdp/run_processors.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/run_processors.py -------------------------------------------------------------------------------- /sdp/utils/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/__init__.py -------------------------------------------------------------------------------- /sdp/utils/apply_operators.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/apply_operators.py -------------------------------------------------------------------------------- /sdp/utils/bootstrap_estimates.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/bootstrap_estimates.py -------------------------------------------------------------------------------- /sdp/utils/common.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/common.py -------------------------------------------------------------------------------- /sdp/utils/edit_spaces.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/edit_spaces.py -------------------------------------------------------------------------------- /sdp/utils/get_diff.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/get_diff.py -------------------------------------------------------------------------------- /sdp/utils/import_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/import_manager.py -------------------------------------------------------------------------------- /sdp/utils/ipl_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/ipl_utils.py -------------------------------------------------------------------------------- /sdp/utils/metrics_computation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/metrics_computation.py -------------------------------------------------------------------------------- /sdp/utils/nemo_run_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/nemo_run_utils.py -------------------------------------------------------------------------------- /sdp/utils/skills_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/sdp/utils/skills_utils.py -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/setup.py -------------------------------------------------------------------------------- /tests/README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/README.md -------------------------------------------------------------------------------- /tests/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/__init__.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_coraa_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_coraa_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_fleurs_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_fleurs_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_hifitts2_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_hifitts2_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_huggingface_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_huggingface_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_ksc2_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_ksc2_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_masc_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_masc_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_mcv_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_mcv_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_mediaspeech_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_mediaspeech_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_mls_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_mls_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_mtedx_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_mtedx_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_slr102_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_slr102_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_slr140_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_slr140_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_voxpopuli_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_voxpopuli_data.py -------------------------------------------------------------------------------- /tests/prepare_test_data/prepare_ytc_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/prepare_test_data/prepare_ytc_data.py -------------------------------------------------------------------------------- /tests/test_bootstrap_estimate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_bootstrap_estimate.py -------------------------------------------------------------------------------- /tests/test_cfg_end_to_end_tests.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_cfg_end_to_end_tests.py -------------------------------------------------------------------------------- /tests/test_cfg_runtime_tests.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_cfg_runtime_tests.py -------------------------------------------------------------------------------- /tests/test_cometoid_qe.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_cometoid_qe.py -------------------------------------------------------------------------------- /tests/test_convert_to_tarred_audio_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_convert_to_tarred_audio_dataset.py -------------------------------------------------------------------------------- /tests/test_data_to_data.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_data_to_data.py -------------------------------------------------------------------------------- /tests/test_data_to_dropbool.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_data_to_dropbool.py -------------------------------------------------------------------------------- /tests/test_fasttext_inference.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_fasttext_inference.py -------------------------------------------------------------------------------- /tests/test_import_manager.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_import_manager.py -------------------------------------------------------------------------------- /tests/test_lhotse.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_lhotse.py -------------------------------------------------------------------------------- /tests/test_manifest_chunking.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_manifest_chunking.py -------------------------------------------------------------------------------- /tests/test_modify_manifest.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_modify_manifest.py -------------------------------------------------------------------------------- /tests/test_normalize_text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_normalize_text.py -------------------------------------------------------------------------------- /tests/test_tts_sdp_end_to_end.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_tts_sdp_end_to_end.py -------------------------------------------------------------------------------- /tests/test_utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/NVIDIA/NeMo-speech-data-processor/HEAD/tests/test_utils.py --------------------------------------------------------------------------------