├── .gitignore ├── LICENSE ├── README.md ├── dataspeech ├── __init__.py ├── cpu_enrichments │ ├── __init__.py │ └── rate.py └── gpu_enrichments │ ├── __init__.py │ ├── pitch.py │ ├── snr_and_reverb.py │ └── squim.py ├── examples ├── prompt_creation │ ├── run_prompt_creation_10k.sh │ ├── run_prompt_creation_1k.sh │ ├── run_prompt_creation_1k_with_speaker_consistency.sh │ ├── run_prompt_creation_45k.sh │ ├── run_prompt_creation_dummy.sh │ ├── run_prompt_creation_jenny.sh │ └── speaker_ids_to_names.json ├── prompt_creation_llm_swarm │ ├── nginx.template.conf │ ├── run_prompt_creation_10k.sh │ ├── run_prompt_creation_1k.sh │ ├── run_prompt_creation_dummy.sh │ ├── run_prompt_creation_full_mls.sh │ └── tgi_h100.template.slurm ├── tagging │ ├── run_main_10k.sh │ ├── run_main_1k.sh │ ├── run_main_45k.sh │ └── run_main_dummy.sh └── tags_to_annotations │ ├── run_metadata_to_text_10k.sh │ ├── run_metadata_to_text_10k_v02.sh │ ├── run_metadata_to_text_for_finetuning.sh │ ├── v01_bin_edges.json │ ├── v01_text_bins.json │ ├── v02_bin_edges.json │ └── v02_text_bins.json ├── main.py ├── requirements.txt └── scripts ├── filter_audio_separation.py ├── merge_audio_to_metadata.py ├── metadata_to_text.py ├── per_dataset_script ├── add_gender_to_MLS.py ├── add_gender_to_libritts_r.py └── clean_libritts_r.py ├── run_prompt_creation.py └── run_prompt_creation_llm_swarm.py /.gitignore: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/.gitignore -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/README.md -------------------------------------------------------------------------------- /dataspeech/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/dataspeech/__init__.py -------------------------------------------------------------------------------- /dataspeech/cpu_enrichments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/dataspeech/cpu_enrichments/__init__.py -------------------------------------------------------------------------------- /dataspeech/cpu_enrichments/rate.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/dataspeech/cpu_enrichments/rate.py -------------------------------------------------------------------------------- /dataspeech/gpu_enrichments/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/dataspeech/gpu_enrichments/__init__.py -------------------------------------------------------------------------------- /dataspeech/gpu_enrichments/pitch.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/dataspeech/gpu_enrichments/pitch.py -------------------------------------------------------------------------------- /dataspeech/gpu_enrichments/snr_and_reverb.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/dataspeech/gpu_enrichments/snr_and_reverb.py -------------------------------------------------------------------------------- /dataspeech/gpu_enrichments/squim.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/dataspeech/gpu_enrichments/squim.py -------------------------------------------------------------------------------- /examples/prompt_creation/run_prompt_creation_10k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation/run_prompt_creation_10k.sh -------------------------------------------------------------------------------- /examples/prompt_creation/run_prompt_creation_1k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation/run_prompt_creation_1k.sh -------------------------------------------------------------------------------- /examples/prompt_creation/run_prompt_creation_1k_with_speaker_consistency.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation/run_prompt_creation_1k_with_speaker_consistency.sh -------------------------------------------------------------------------------- /examples/prompt_creation/run_prompt_creation_45k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation/run_prompt_creation_45k.sh -------------------------------------------------------------------------------- /examples/prompt_creation/run_prompt_creation_dummy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation/run_prompt_creation_dummy.sh -------------------------------------------------------------------------------- /examples/prompt_creation/run_prompt_creation_jenny.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation/run_prompt_creation_jenny.sh -------------------------------------------------------------------------------- /examples/prompt_creation/speaker_ids_to_names.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation/speaker_ids_to_names.json -------------------------------------------------------------------------------- /examples/prompt_creation_llm_swarm/nginx.template.conf: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation_llm_swarm/nginx.template.conf -------------------------------------------------------------------------------- /examples/prompt_creation_llm_swarm/run_prompt_creation_10k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation_llm_swarm/run_prompt_creation_10k.sh -------------------------------------------------------------------------------- /examples/prompt_creation_llm_swarm/run_prompt_creation_1k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation_llm_swarm/run_prompt_creation_1k.sh -------------------------------------------------------------------------------- /examples/prompt_creation_llm_swarm/run_prompt_creation_dummy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation_llm_swarm/run_prompt_creation_dummy.sh -------------------------------------------------------------------------------- /examples/prompt_creation_llm_swarm/run_prompt_creation_full_mls.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation_llm_swarm/run_prompt_creation_full_mls.sh -------------------------------------------------------------------------------- /examples/prompt_creation_llm_swarm/tgi_h100.template.slurm: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/prompt_creation_llm_swarm/tgi_h100.template.slurm -------------------------------------------------------------------------------- /examples/tagging/run_main_10k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tagging/run_main_10k.sh -------------------------------------------------------------------------------- /examples/tagging/run_main_1k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tagging/run_main_1k.sh -------------------------------------------------------------------------------- /examples/tagging/run_main_45k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tagging/run_main_45k.sh -------------------------------------------------------------------------------- /examples/tagging/run_main_dummy.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tagging/run_main_dummy.sh -------------------------------------------------------------------------------- /examples/tags_to_annotations/run_metadata_to_text_10k.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tags_to_annotations/run_metadata_to_text_10k.sh -------------------------------------------------------------------------------- /examples/tags_to_annotations/run_metadata_to_text_10k_v02.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tags_to_annotations/run_metadata_to_text_10k_v02.sh -------------------------------------------------------------------------------- /examples/tags_to_annotations/run_metadata_to_text_for_finetuning.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tags_to_annotations/run_metadata_to_text_for_finetuning.sh -------------------------------------------------------------------------------- /examples/tags_to_annotations/v01_bin_edges.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tags_to_annotations/v01_bin_edges.json -------------------------------------------------------------------------------- /examples/tags_to_annotations/v01_text_bins.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tags_to_annotations/v01_text_bins.json -------------------------------------------------------------------------------- /examples/tags_to_annotations/v02_bin_edges.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tags_to_annotations/v02_bin_edges.json -------------------------------------------------------------------------------- /examples/tags_to_annotations/v02_text_bins.json: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/examples/tags_to_annotations/v02_text_bins.json -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/main.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/requirements.txt -------------------------------------------------------------------------------- /scripts/filter_audio_separation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/filter_audio_separation.py -------------------------------------------------------------------------------- /scripts/merge_audio_to_metadata.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/merge_audio_to_metadata.py -------------------------------------------------------------------------------- /scripts/metadata_to_text.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/metadata_to_text.py -------------------------------------------------------------------------------- /scripts/per_dataset_script/add_gender_to_MLS.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/per_dataset_script/add_gender_to_MLS.py -------------------------------------------------------------------------------- /scripts/per_dataset_script/add_gender_to_libritts_r.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/per_dataset_script/add_gender_to_libritts_r.py -------------------------------------------------------------------------------- /scripts/per_dataset_script/clean_libritts_r.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/per_dataset_script/clean_libritts_r.py -------------------------------------------------------------------------------- /scripts/run_prompt_creation.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/run_prompt_creation.py -------------------------------------------------------------------------------- /scripts/run_prompt_creation_llm_swarm.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/huggingface/dataspeech/HEAD/scripts/run_prompt_creation_llm_swarm.py --------------------------------------------------------------------------------