├── LICENSE ├── README.md ├── architecture.svg ├── config.yaml.sample ├── data_server ├── append_low_frequency_chars.py ├── character_frequency.py ├── clone.py ├── create_dataset.py ├── create_filehashes.py ├── dataset_filters.py ├── db_pool_proxy.py ├── exclusion_chars │ ├── de.txt │ ├── en.txt │ └── es.txt ├── import_dataset.py ├── sanity_check.py ├── schema.psql ├── server.py ├── start_wsgi.sh ├── test_batch_whisper.py ├── test_batch_workflow.py ├── training_session_pg.py ├── update_durations.py ├── update_lang.py ├── utils.py ├── validate_media_entries.py ├── whisper_benchmark.py ├── whisper_multiple_files.py ├── whisper_single_file.py └── worker.py ├── podcasts ├── generate_list_from_podcastindex.py ├── html_stats.py ├── podcast_lists │ ├── rss_feeds_de │ ├── rss_feeds_de_at │ ├── rss_feeds_de_ch │ └── rss_feeds_fr ├── search_ddg.py ├── simple_podcast_downloader.py └── utils.py ├── requirements.txt ├── requirements_worker.txt ├── stats_screenshot.png └── tedx ├── filter_by_language.py ├── get_tedx_titles.sh └── tedx_yt_videolist /LICENSE: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/LICENSE -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/README.md -------------------------------------------------------------------------------- /architecture.svg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/architecture.svg -------------------------------------------------------------------------------- /config.yaml.sample: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/config.yaml.sample -------------------------------------------------------------------------------- /data_server/append_low_frequency_chars.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/append_low_frequency_chars.py -------------------------------------------------------------------------------- /data_server/character_frequency.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/character_frequency.py -------------------------------------------------------------------------------- /data_server/clone.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/clone.py -------------------------------------------------------------------------------- /data_server/create_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/create_dataset.py -------------------------------------------------------------------------------- /data_server/create_filehashes.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/create_filehashes.py -------------------------------------------------------------------------------- /data_server/dataset_filters.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/dataset_filters.py -------------------------------------------------------------------------------- /data_server/db_pool_proxy.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/db_pool_proxy.py -------------------------------------------------------------------------------- /data_server/exclusion_chars/de.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/exclusion_chars/de.txt -------------------------------------------------------------------------------- /data_server/exclusion_chars/en.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/exclusion_chars/en.txt -------------------------------------------------------------------------------- /data_server/exclusion_chars/es.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/exclusion_chars/es.txt -------------------------------------------------------------------------------- /data_server/import_dataset.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/import_dataset.py -------------------------------------------------------------------------------- /data_server/sanity_check.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/sanity_check.py -------------------------------------------------------------------------------- /data_server/schema.psql: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/schema.psql -------------------------------------------------------------------------------- /data_server/server.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/server.py -------------------------------------------------------------------------------- /data_server/start_wsgi.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/start_wsgi.sh -------------------------------------------------------------------------------- /data_server/test_batch_whisper.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/test_batch_whisper.py -------------------------------------------------------------------------------- /data_server/test_batch_workflow.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/test_batch_workflow.py -------------------------------------------------------------------------------- /data_server/training_session_pg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/training_session_pg.py -------------------------------------------------------------------------------- /data_server/update_durations.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/update_durations.py -------------------------------------------------------------------------------- /data_server/update_lang.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/update_lang.py -------------------------------------------------------------------------------- /data_server/utils.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/utils.py -------------------------------------------------------------------------------- /data_server/validate_media_entries.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/validate_media_entries.py -------------------------------------------------------------------------------- /data_server/whisper_benchmark.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/whisper_benchmark.py -------------------------------------------------------------------------------- /data_server/whisper_multiple_files.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/whisper_multiple_files.py -------------------------------------------------------------------------------- /data_server/whisper_single_file.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/whisper_single_file.py -------------------------------------------------------------------------------- /data_server/worker.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/data_server/worker.py -------------------------------------------------------------------------------- /podcasts/generate_list_from_podcastindex.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/podcasts/generate_list_from_podcastindex.py -------------------------------------------------------------------------------- /podcasts/html_stats.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/podcasts/html_stats.py -------------------------------------------------------------------------------- /podcasts/podcast_lists/rss_feeds_de: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/podcasts/podcast_lists/rss_feeds_de -------------------------------------------------------------------------------- /podcasts/podcast_lists/rss_feeds_de_at: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/podcasts/podcast_lists/rss_feeds_de_at -------------------------------------------------------------------------------- /podcasts/podcast_lists/rss_feeds_de_ch: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/podcasts/podcast_lists/rss_feeds_de_ch -------------------------------------------------------------------------------- /podcasts/podcast_lists/rss_feeds_fr: -------------------------------------------------------------------------------- 1 | https://cgwhy.net/feed/mp3/ 2 | -------------------------------------------------------------------------------- /podcasts/search_ddg.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/podcasts/search_ddg.py -------------------------------------------------------------------------------- /podcasts/simple_podcast_downloader.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/podcasts/simple_podcast_downloader.py -------------------------------------------------------------------------------- /podcasts/utils.py: -------------------------------------------------------------------------------- 1 | ../data_server/utils.py -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/requirements.txt -------------------------------------------------------------------------------- /requirements_worker.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/requirements_worker.txt -------------------------------------------------------------------------------- /stats_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/stats_screenshot.png -------------------------------------------------------------------------------- /tedx/filter_by_language.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/tedx/filter_by_language.py -------------------------------------------------------------------------------- /tedx/get_tedx_titles.sh: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/tedx/get_tedx_titles.sh -------------------------------------------------------------------------------- /tedx/tedx_yt_videolist: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/speechcatcher-asr/speechcatcher-data/HEAD/tedx/tedx_yt_videolist --------------------------------------------------------------------------------