├── input
│   ├── channels_id_example.txt
│   └── playlists_id _example.txt
├── imgs
│   └── katube-process.png
├── tools
│   ├── Azure
│   │   ├── config.py
│   │   └── azure_cloud.py
│   ├── Google
│   │   └── google-stt.py
│   └── AWS
│       └── aws_transcribe.ipynb
├── Dockerfile
├── requirements.txt
├── utils
│   ├── select_min_lev.py
│   ├── change_filepath_metadata1.py
│   ├── create_ignore_youtube_videos_list.py
│   ├── size_dataset.py
│   ├── corrigir_colunas_metadata.py
│   ├── create_metadata_min_lev.py
│   ├── delete_wavs_from_csv.py
│   ├── change_filepath_metadata0.py
│   ├── delete_wavs.py
│   ├── verify_wavs_folder_metadata.py
│   ├── clear_dataset.py
│   ├── brspeech_generation.py
│   ├── verificar_metadata_wavs.py
│   ├── downsampling_wavs.py
│   ├── downsampling.py
│   ├── move_downsampled_wavs_folder.py
│   ├── verificar_wavs_metadata.py
│   ├── create_internal_metadata_min_lev.py
│   ├── create_compressed_package.py
│   ├── recreate_metadata.py
│   ├── delete_folders_with_erros.py
│   ├── exclude_unecessary_files.py
│   └── number_to_text.py
├── config.py
├── environment.yml
├── synchronization.py
├── .gitignore
├── selection.py
├── search.py
├── download.py
├── transcribe.py
├── validation.py
├── audio_segmentation.py
├── README.md
├── text_normalization.py
├── main.py
└── LICENSE

/input/channels_id_example.txt:
--------------------------------------------------------------------------------
1 | # Globo
2 | UCEPRQVF6hxGGM9gi1ELaWHg
--------------------------------------------------------------------------------
/imgs/katube-process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freds0/katube/HEAD/imgs/katube-process.png
--------------------------------------------------------------------------------
/input/playlists_id _example.txt:
--------------------------------------------------------------------------------
1 | # https://www.youtube.com/watch?v=5tSIDFYm0xk&list=PLMsxmUeVRKYlVMAMCAxVB8yQSAFPXL3uu
2 | PLMsxmUeVRKYlVMAMCAxVB8yQSAFPXL3uu
--------------------------------------------------------------------------------
/tools/Azure/config.py:
--------------------------------------------------------------------------------
1 | class Config:
2 |     base_dir = 'dataset_path'
3 |     output_name = 'dataset_name'
4 |     output_path = 'csv_files/'
5 |     speech_key = ""
6 |     service_region = "brazilsouth"
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # docker build -t katube .
2 | FROM ubuntu:18.04
3 | 
4 | RUN set -x \
5 |     && apt-get update \
6 |     && apt-get install -y espeak ffmpeg libespeak-dev libsndfile1 libsndfile1-dev python python-dev python-pip python-numpy python-lxml \
7 |     && rm -rf /var/lib/apt/lists/*
8 | RUN apt-get update
9 | RUN apt-get install -y build-essential python3.6 python3.6-dev python3-pip python3.6-venv sox
10 | RUN apt-get install -y wget git nano
11 | 
12 | # update pip
13 | RUN python3.6 -m pip install pip --upgrade
14 | RUN python3.6 -m pip install wheel
15 | 
16 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
17 | 
18 | ENV PYTHONIOENCODING=UTF-8
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | apiclient==1.0.4
2 | audioread==2.1.8
3 | beautifulsoup4==4.9.0
4 | cachetools==4.0.0
5 | certifi==2019.11.28
6 | cffi==1.13.2
7 | chardet==3.0.4
8 | decorator==4.4.1
9 | google-api-python-client==1.7.11
10 | 
google-auth==1.10.1 11 | google-auth-httplib2==0.0.3 12 | httplib2==0.15.0 13 | idna==2.8 14 | joblib==0.14.1 15 | librosa==0.7.2 16 | llvmlite==0.31.0 17 | lxml==4.5.0 18 | numba==0.47.0 19 | numpy==1.22.4 20 | oauth2client==3.0.0 21 | pandas==1.0.3 22 | pyasn1==0.4.8 23 | pyasn1-modules==0.2.8 24 | pycparser==2.19 25 | pydub==0.23.1 26 | pysubs2==0.2.4 27 | python-dateutil==2.8.1 28 | pytube3==9.6.4 29 | pytz==2019.3 30 | requests==2.22.0 31 | resampy==0.2.2 32 | rsa==4.0 33 | scikit-learn==0.22.1 34 | scipy==1.4.1 35 | six==1.13.0 36 | SoundFile==0.10.3.post1 37 | soupsieve==2.0 38 | textdistance==4.1.5 39 | tqdm==4.41.1 40 | typing-extensions==3.7.4.2 41 | uritemplate==3.0.1 42 | urllib3==1.25.7 43 | youtube-dl==2021.4.17 44 | youtube-transcript-api==0.3.1 45 | -------------------------------------------------------------------------------- /utils/select_min_lev.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | import pandas as pd 4 | from os.path import join 5 | import csv 6 | 7 | 8 | def generate_metadata(args): 9 | 10 | df = pd.read_csv(join(args.base_dir, args.csv_file), sep = '|', header=None, quoting=csv.QUOTE_NONE) 11 | new_df = df[df[3] >= float(args.min_value)] 12 | new_df.to_csv(join(args.base_dir, args.output_file), sep = '|', header=False, index=False, quoting=csv.QUOTE_NONE) 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--base_dir', default='./') 18 | parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file') 19 | parser.add_argument('--min_value', default=0.90, help='Minimal value of levenshtein distance') 20 | parser.add_argument('--output_file', default='metadata_sub.csv', help='Name of csv file') 21 | args = parser.parse_args() 22 | generate_metadata(args) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | 27 | -------------------------------------------------------------------------------- /tools/Google/google-stt.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./config.json" 5 | 6 | # Imports the Google Cloud client library 7 | from google.cloud import speech 8 | 9 | # Instantiates a client 10 | client = speech.SpeechClient() 11 | 12 | # The name of the audio file to transcribe 13 | # file_name = os.path.join(os.path.dirname(__file__), "resources", "audio.raw") 14 | 15 | file_name = './340_CO_bpubmn11.wav' 16 | 17 | # Loads the audio into memory 18 | with io.open(file_name, "rb") as audio_file: 19 | content = audio_file.read() 20 | audio = speech.RecognitionAudio(content=content) 21 | 22 | config = speech.RecognitionConfig( 23 | encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, 24 | sample_rate_hertz=16000, 25 | language_code="pt-BR", 26 | ) 27 | 28 | # Detects speech in the audio file 29 | response = client.recognize(config=config, audio=audio) 30 | 31 | for result in response.results: 32 | print("Transcript: {}".format(result.alternatives[0].transcript)) -------------------------------------------------------------------------------- /utils/change_filepath_metadata1.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | import tqdm 5 | import argparse 6 | 7 | def remove_folder(args): 8 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 9 | if os.path.exists(folder) and 
os.path.isdir(folder):
10 |             old_file = os.path.join(folder, 'metadata.csv')
11 |             new_file = os.path.join(folder, 'metadata_new.csv')
12 |             if not os.path.exists(old_file):
13 |                 print(folder)
14 |                 continue
15 |             if not os.path.exists(new_file):
16 |                 print(folder)
17 |                 continue
18 | 
19 |             os.remove(old_file)
20 |             os.rename(new_file, old_file)
21 |             #print(old_file)
22 |             #print(new_file)
23 | 
24 | 
25 | def main():
26 |     parser = argparse.ArgumentParser()
27 |     parser.add_argument('--base_dir', default='./BRSpeech-ASR-beta3/')
28 |     args = parser.parse_args()
29 |     remove_folder(args)
30 | 
31 | if __name__ == "__main__":
32 |     main()
33 | 
--------------------------------------------------------------------------------
/utils/create_ignore_youtube_videos_list.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import csv
5 | 
6 | 
7 | def generate_file(args):
8 |     try:
9 |         f = open(os.path.join(args.base_dir, args.output_file), 'w')
10 |         for folder in sorted(glob.glob(os.path.join(args.base_dir, args.input_folder) + '/*/*')):
11 |             if os.path.exists(folder) and os.path.isdir(folder):
12 |                 youtube_link = 'https://www.youtube.com/watch?v=' + folder.split('/')[-1]
13 |                 f.write(youtube_link + '\n')
14 |         f.close()
15 |     except IOError:
16 |         print("Error: could not create file {}.".format(args.output_file))
17 |         return False
18 | 
19 |     return True
20 | 
21 | def main():
22 |     parser = argparse.ArgumentParser()
23 |     parser.add_argument('--base_dir', default='./')
24 |     parser.add_argument('--input_folder', default='./output/playlist')
25 |     parser.add_argument('--output_file', default='youtube_ignored_videos.txt', help='Name of output file')
26 |     args = parser.parse_args()
27 |     generate_file(args)
28 | 
29 | if __name__ == "__main__":
30 |     main()
31 | 
32 | 
--------------------------------------------------------------------------------
/utils/size_dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import soundfile as sf
3 | from os.path import isfile, join, dirname
4 | import pandas as pd
5 | import os
6 | import csv
7 | import tqdm
8 | 
9 | def get_seconds(x):
10 |     f = sf.SoundFile(x)
11 |     t = len(f) / f.samplerate
12 |     return t
13 | 
14 | 
15 | def calcular_horas(args):
16 |     metadata = os.path.join(args.base_dir, args.csv_file)
17 |     df = pd.read_csv(metadata, sep = '|', quoting=csv.QUOTE_NONE)
18 |     total = 0
19 |     for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
20 |         path_file = os.path.join(row[0])
21 |         temp = get_seconds(path_file)
22 |         total += temp
23 | 
24 |     print('Total in seconds: {}'.format(total))
25 |     print('Hours: {}'.format(total/3600))
26 |     print('Minutes: {}'.format(total%3600/60))
27 |     print('Seconds: {}'.format( (total%3600)%60))
28 | 
29 | def main():
30 |     parser = argparse.ArgumentParser()
31 |     parser.add_argument('--base_dir', default='./')
32 |     parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file')
33 |     args = parser.parse_args()
34 |     calcular_horas(args)
35 | 
36 | if __name__ == "__main__":
37 |     main()
38 | 
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | class Config:
2 |     base_dir = './'
3 |     dest_dir = 'output'
4 |     ## search and ingest
5 |     api_key = ''
6 |     ## videos origin
7 |     orig_base = 'channel' # ['channel', 'playlist']
8 |     ## Channels and Playlists files
9 |     channels_file = 
'./input/channels_id_example.txt' 10 | playlists_file = './input/playlists_id.txt' 11 | 12 | # Logs 13 | logs_dir = 'logs' 14 | youtube_videos_error = 'error_youtube_videos.txt' 15 | log_file = 'errors.log' 16 | # Ignore videos list 17 | ignored_youtube_videos = '' 18 | downloaded_youtube_videos = logs_dir + '/downloaded_youtube_videos.txt' 19 | 20 | output_search_file = 'youtube_videos.txt' 21 | # text_normalization 22 | min_words = 15 23 | max_words = 30 24 | # split_audio 25 | wavs_dir = 'wavs' 26 | metadata_subtitles_file = 'subtitles.csv' 27 | # convertion to transcribe format 28 | tmp_wavs_dir = 'wavs_tmp' 29 | tmp_sampling_rate = 16000 30 | # transcribe 31 | transcription_file = 'transcript.csv' 32 | #output_converted_wavs_path = '00_16k' 33 | # validation 34 | validation_file = 'validation.csv' 35 | # selection 36 | minimal_levenshtein_distance = 0.9 37 | # downsampling 38 | sampling_rate = 22050 39 | # result 40 | result_file = 'metadata.csv' 41 | delete_temp_files = True -------------------------------------------------------------------------------- /utils/corrigir_colunas_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import shutil 5 | import pandas as pd 6 | import csv 7 | import tqdm 8 | 9 | def deletar(args): 10 | total = 0 11 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 12 | if os.path.exists(folder) and os.path.isdir(folder): 13 | metadata = os.path.join(folder, args.input_file) 14 | if not os.path.exists(metadata): 15 | continue 16 | df = pd.read_csv(metadata, sep = '|', quoting=csv.QUOTE_NONE) 17 | #new_df = df[df['levenshtein'] >= float(args.min_value)].copy() 18 | if set(['levenshtein']).issubset(df.columns): 19 | continue 20 | print(metadata) 21 | df.rename(columns={"text": "subtitle", "similarity" : "levenshtein"}, inplace=True) 22 | df.to_csv(os.path.join(folder, args.input_file), sep = '|', index=False, quoting=csv.QUOTE_NONE) 23 | total += 1 24 | print('Total created metadata: ', total) 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--base_dir', default='./') 30 | parser.add_argument('--input_file', default='validation.csv') 31 | #parser.add_argument('--delete_file', default='delete.csv') 32 | args = parser.parse_args() 33 | deletar(args) 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /utils/create_metadata_min_lev.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | import pandas as pd 4 | from os.path import join, exists 5 | import csv 6 | 7 | 8 | def generate_metadata(args): 9 | 10 | metadata_file = join(args.base_dir, args.csv_file) 11 | if not exists(metadata_file): 12 | print('File {} not found.'.format(metadata_file)) 13 | return 14 | 15 | df = pd.read_csv(metadata_file, sep = '|', header=None, quoting=csv.QUOTE_NONE) 16 | new_df = df[df[3] >= float(args.min_value)] 17 | new_df.to_csv(join(args.base_dir, args.save_file), sep = '|', header=False, index=False, quoting=csv.QUOTE_NONE) 18 | new_df = df[df[3] < float(args.min_value)] 19 | new_df.to_csv(join(args.base_dir, args.delete_file), sep = '|', header=False, index=False, quoting=csv.QUOTE_NONE) 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--base_dir', default='./') 24 | parser.add_argument('--csv_file', 
default='metadata_complete.csv', help='Name of csv file') 25 | parser.add_argument('--min_value', default=0.90, help='Minimal value of levenshtein distance') 26 | parser.add_argument('--save_file', default='save.csv', help='Name of csv file') 27 | parser.add_argument('--delete_file', default='delete.csv', help='Name of csv file') 28 | args = parser.parse_args() 29 | generate_metadata(args) 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /utils/delete_wavs_from_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | from os.path import isfile, join, dirname, exists 4 | import pandas as pd 5 | import os 6 | import csv 7 | import tqdm 8 | 9 | def delete_wavs(args): 10 | metadata_file = os.path.join(args.base_dir, args.csv_file) 11 | if not exists(metadata_file): 12 | print('File {} not found.'.format(metadata_file)) 13 | return 14 | 15 | df = pd.read_csv(metadata_file, sep = '|', quoting=csv.QUOTE_NONE) 16 | total = 0 17 | total_deleted = 0 18 | 19 | for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]): 20 | path_file = os.path.join(row[0]) 21 | if os.path.exists(path_file): 22 | total_deleted += 1 23 | if not(args.force): 24 | print(path_file) 25 | else: 26 | os.remove(path_file) 27 | 28 | if not(args.force): 29 | print('Total wavs to be deleted: ', total_deleted) 30 | else: 31 | print('Total wavs deleted: ', total_deleted) 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('--base_dir', default='./') 36 | parser.add_argument('--csv_file', default='delete.csv', help='Name of csv file') 37 | parser.add_argument('--force', action='store_true', default=False) 38 | args = parser.parse_args() 39 | delete_wavs(args) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /utils/change_filepath_metadata0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | import pandas as pd 4 | import csv 5 | import glob 6 | import os 7 | import tqdm 8 | 9 | def generate_metadata(args): 10 | separator = '|' 11 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 12 | if os.path.isdir(folder) and os.path.exists(os.path.join(folder, args.csv_file)): 13 | output_file = open(os.path.join(folder, args.output_file), 'w') 14 | line = separator.join(['filename', 'subtitle', 'transcript', 'levenshtein']) 15 | output_file.write(line + '\n') 16 | df = pd.read_csv(os.path.join(folder, args.csv_file), sep = '|', quoting=csv.QUOTE_NONE) 17 | for index, row in df.iterrows(): 18 | filename = row[0].split('/')[-1] 19 | line = separator.join([filename, row[1], row[2], str(row[3])]) 20 | output_file.write(line + '\n') 21 | output_file.close() 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--base_dir', default='./') 27 | parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file') 28 | parser.add_argument('--output_file', default='metadata_new.csv', help='Name of csv file') 29 | args = parser.parse_args() 30 | generate_metadata(args) 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /utils/delete_wavs.py: -------------------------------------------------------------------------------- 1 | import 
argparse
2 | import glob
3 | import os
4 | import pandas as pd
5 | import csv
6 | import tqdm
7 | 
8 | def deletar(args):
9 |     total = 0
10 |     for metadata_file in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*/" + args.input_file))):
11 |         if os.path.isfile(metadata_file):
12 |             df = pd.read_csv(metadata_file, sep = '|', quoting=csv.QUOTE_NONE)
13 |             folder_path = os.path.join(*metadata_file.split('/')[0:-1])
14 |             for index, row in df.iterrows():
15 |                 path_file = os.path.join(folder_path, args.wavs_folder, row[0])
16 |                 if os.path.exists(path_file):
17 |                     total += 1
18 |                     if not(args.force):
19 |                         print(path_file)
20 |                     else:
21 |                         os.remove(path_file)
22 | 
23 |     if args.force:
24 |         print('Total wav files erased: ', total)
25 |     else:
26 |         print('Total wav files to be erased: ', total)
27 | 
28 | def main():
29 |     parser = argparse.ArgumentParser()
30 |     parser.add_argument('--base_dir', default='./output/channel/')
31 |     parser.add_argument('--input_file', default='delete.csv')
32 |     parser.add_argument('--wavs_folder', default='wavs', help='Input wavs folder')
33 |     parser.add_argument('--force', action='store_true', default=False)
34 |     args = parser.parse_args()
35 |     deletar(args)
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 | 
--------------------------------------------------------------------------------
/utils/verify_wavs_folder_metadata.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from os import makedirs
5 | from os.path import join, exists
6 | import tqdm
7 | 
8 | separator = '|'
9 | 
10 | def verify_folder(args):
11 | 
12 |     i = 0
13 |     for wav_file in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/wavs/*.wav"))):
14 | 
15 |         filename = wav_file.split('/')[-1]
16 |         folder = '/'.join(wav_file.split('/')[0:-2])
17 | 
18 |         metadata_path = join(folder, args.csv_file)
19 |         if not exists(metadata_path):
20 |             continue
21 |         f = open(metadata_path, 'r')
22 |         content = f.readlines()
23 |         found = False
24 |         for line in content:
25 |             filename_metadata, _, _, _ = line.split(separator)
26 |             if filename_metadata == filename:
27 |                 found = True
28 |                 break
29 |         if not found:
30 |             i+=1
31 |             if not args.erase:
32 |                 print('Delete file: ' + wav_file)
33 |             else:
34 |                 os.remove(wav_file)
35 | 
36 | 
37 |     print('Total: ' + str(i) + ' files')
38 | 
39 | def main():
40 |     parser = argparse.ArgumentParser()
41 |     parser.add_argument('--base_dir', default='./')
42 |     parser.add_argument('--folder', default='', help='Name of the origin directory of wav files')
43 |     parser.add_argument('--csv_file', default='metadata.csv')
44 |     parser.add_argument('--erase', action='store_true', default=False)
45 |     args = parser.parse_args()
46 |     verify_folder(args)
47 | 
48 | if __name__ == "__main__":
49 |     main()
50 | 
--------------------------------------------------------------------------------
/utils/clear_dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | def clear_dataset(args):
6 |     total = 0
7 |     for folder in sorted(glob.glob(args.base_dir + "/*/*")):
8 |         if os.path.isdir(folder) and not os.path.exists(os.path.join(folder, 'metadata.csv')):
9 |             total+=1
10 |             if not args.force:
11 |                 print(folder)
12 |             else:
13 |                 shutil.rmtree(folder)
14 |         if os.path.isdir(folder) and not os.listdir(folder):
15 |             total+=1
16 |             if not args.force:
17 |                 print(folder)
18 |             else:
19 |                 shutil.rmtree(folder)
20 |         wavs_folder = os.path.join(folder, 
args.wavs_folder)
21 |         if os.path.exists(wavs_folder) and os.path.isdir(wavs_folder):
22 |             if not os.listdir(wavs_folder): #if len (os.listdir(wavs_folder)) == 0:
23 |                 total+=1
24 |                 if not args.force:
25 |                     print(folder)
26 |                 else:
27 |                     shutil.rmtree(folder)
28 | 
29 |     if args.force:
30 |         print('Total folders erased: ', total)
31 |     else:
32 |         print('Total folders with problems: ', total)
33 | 
34 | def main():
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument('--base_dir', default='./output/channel')
37 |     parser.add_argument('--wavs_folder', default='wavs', help='Input wavs folder')
38 |     parser.add_argument('--force', action='store_true', default=False)
39 | 
40 |     args = parser.parse_args()
41 |     clear_dataset(args)
42 | 
43 | if __name__ == "__main__":
44 |     main()
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
--------------------------------------------------------------------------------
/utils/brspeech_generation.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | base_dir = 'output/channel/'
4 | def main():
5 | 
6 |     command_line = "python utils/recreate_metadata.py --base_dir {} ".format(base_dir)
7 |     subprocess.call(command_line, shell=True)
8 | 
9 |     command_line = "python utils/delete_folders_with_erros.py --base_dir {} --force".format(base_dir)
10 |     subprocess.call(command_line, shell=True)
11 | 
12 |     command_line = "python utils/clear_dataset.py --base_dir {} --force".format(base_dir)
13 |     subprocess.call(command_line, shell=True)
14 | 
15 |     command_line = "python utils/create_metadata_min_lev.py --base_dir {}".format(base_dir)
16 |     subprocess.call(command_line, shell=True)
17 | 
18 |     command_line = "python utils/create_internal_metadata_min_lev.py --base_dir {}".format(base_dir)
19 |     subprocess.call(command_line, shell=True)
20 | 
21 |     command_line = "python utils/delete_wavs.py --base_dir {} --force".format(base_dir)
22 |     subprocess.call(command_line, shell=True)
23 | 
24 |     command_line = "python utils/downsampling_wavs.py --base_dir {} --force".format(base_dir)
25 |     subprocess.call(command_line, shell=True)
26 | 
27 |     command_line = "python utils/move_downsampled_wavs_folder.py --base_dir {} --force".format(base_dir)
28 |     subprocess.call(command_line, shell=True)
29 | 
30 |     command_line = "python utils/exclude_unecessary_files.py --base_dir {} --force".format(base_dir)
31 |     subprocess.call(command_line, shell=True)
32 | 
33 |     command_line = "python utils/change_filepath_metadata0.py --base_dir {0} && python utils/change_filepath_metadata1.py --base_dir {0}".format(base_dir)
34 |     subprocess.call(command_line, shell=True)
35 | 
36 | if __name__ == "__main__":
37 |     main()
38 | 
--------------------------------------------------------------------------------
/utils/verificar_metadata_wavs.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | import pandas as pd
6 | import csv
7 | import tqdm
8 | 
9 | def deletar(args):
10 |     total = 0
11 |     separator = '|'
12 |     for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))):
13 |         if os.path.exists(folder) and os.path.isdir(folder):
14 |             metadata_path = os.path.join(folder, args.csv_file)
15 |             if not os.path.exists(metadata_path):
16 |                 continue
17 |             f = open(metadata_path, 'r')
18 |             content = f.readlines()[1:]
19 |             for line in content:
20 |                 filename_metadata, _, _, _ = line.split(separator)
21 |                 filepath = os.path.join(folder, 'wavs', filename_metadata)
22 | 
23 |                 if 
os.path.exists(filepath):
24 |                     continue
25 |                 else:
26 |                     total+=1
27 |                     if not args.erase:
28 |                         print('Delete file: ' + filename_metadata)
29 |                     else:
30 |                         os.remove(filename_metadata)
31 | 
32 |     if not args.erase:
33 |         print('Total wavs to be erased: ', total)
34 |     else:
35 |         print('Total erased: ', total)
36 | 
37 | 
38 | def main():
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument('--base_dir', default='./')
41 |     parser.add_argument('--csv_file', default='metadata.csv')
42 |     parser.add_argument('--erase', action='store_true', default=False)
43 |     #parser.add_argument('--delete_file', default='delete.csv')
44 | 
45 |     args = parser.parse_args()
46 |     deletar(args)
47 | 
48 | if __name__ == "__main__":
49 |     main()
50 | 
--------------------------------------------------------------------------------
/utils/downsampling_wavs.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from os import makedirs
5 | from os.path import join, exists
6 | import tqdm
7 | 
8 | number_bits = 16
9 | encoding = "signed-integer"
10 | number_channels = 1
11 | 
12 | def downsampling(args):
13 |     sample_rate = args.sample_rate
14 | 
15 |     for folder in tqdm.tqdm(glob.glob(args.base_dir + "/*/*")):
16 |         for wav_path in glob.glob(join(folder, args.wav_dir) + "/*.wav"):
17 |             prev = '/'.join(wav_path.split('/')[0:5])
18 |             filename = wav_path.split('/')[-1]
19 |             new_wav_path = join(prev, args.new_wav_dir, filename)
20 |             dir_path = os.path.dirname(new_wav_path)
21 |             if not args.force:
22 |                 print("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path))
23 |             else:
24 |                 os.makedirs(dir_path, exist_ok=True)
25 |                 os.system("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path))
26 | 
27 | def main():
28 |     parser = argparse.ArgumentParser()
29 |     parser.add_argument('--base_dir', default='./output/channel/')
30 |     parser.add_argument('--wav_dir', default='wavs', help='Name of the origin directory of wav files')
31 |     parser.add_argument('--new_wav_dir', default='wavs22', help='Name of the destination directory of wav files')
32 |     parser.add_argument('--sample_rate', default=22050, help='Sample rate of destination wav files')
33 |     parser.add_argument('--force', action='store_true', default=False)
34 |     args = parser.parse_args()
35 |     downsampling(args)
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 | 
--------------------------------------------------------------------------------
/utils/downsampling.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from os import makedirs
5 | from os.path import join, exists
6 | import tqdm
7 | 
8 | number_bits = 16
9 | encoding = "signed-integer"
10 | number_channels = 1
11 | 
12 | def downsampling(folder, wav_dir, new_wav_dir, sample_rate, force):
13 |     for wav_path in glob.glob(join(folder, wav_dir) + "/*.wav"):
14 |         prev = '/'.join(wav_path.split('/')[0:5])
15 |         filename = wav_path.split('/')[-1]
16 |         new_wav_path = join(prev, new_wav_dir, filename)
17 |         dir_path = os.path.dirname(new_wav_path)
18 |         if not force:
19 |             print("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path))
20 |         else:
21 |             os.makedirs(dir_path, exist_ok=True)
22 |             os.system("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, 
int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path)) 23 | return True 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--base_dir', default='./output/channel/') 28 | parser.add_argument('--wav_dir', default='wavs', help='Name of the origin directory of wav files') 29 | parser.add_argument('--new_wav_dir', default='wavs22', help='Name of the origin directory of wav files') 30 | parser.add_argument('--sample_rate', default=22050, help='Sample rate of destination wav files') 31 | parser.add_argument('--force', action='store_true', default=False) 32 | args = parser.parse_args() 33 | for folder in glob.glob(args.base_dir + "/**"): 34 | downsampling(folder, args.wav_dir, args.new_wav_dir, args.sample_rate, args.force) 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /utils/move_downsampled_wavs_folder.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | import tqdm 5 | import argparse 6 | 7 | def remove_old_folder_wavs(args): 8 | total_erased = 0 9 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 10 | if os.path.exists(folder) and os.path.isdir(folder): 11 | old_folder = os.path.join(folder, args.old_folder) 12 | new_folder = os.path.join(folder, args.new_folder) 13 | if not os.path.exists(old_folder): 14 | print('Verify folder: ' + old_folder) 15 | continue 16 | #exit() 17 | if not os.path.exists(new_folder): 18 | print('Verify folder: ' + new_folder) 19 | continue 20 | #exit() 21 | total_erased+=1 22 | if not args.force: 23 | print('rm ' + old_folder) 24 | print('mv ' + new_folder + ' ' + old_folder) 25 | else: 26 | shutil.rmtree(old_folder) 27 | os.rename(new_folder, old_folder) 28 | 29 | if args.force: 30 | print('Total modified folders ', total_erased) 31 | else: 32 | print('Total to be modified folders ', total_erased) 33 | 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--base_dir', default='./output/channel/') 38 | parser.add_argument('--old_folder', default='wavs', help='Name of old wavs folder, to erase') 39 | parser.add_argument('--new_folder', default='wavs22', help='Name of new wavs folder') 40 | parser.add_argument('--force', action='store_true', default=False) 41 | args = parser.parse_args() 42 | remove_old_folder_wavs(args) 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /utils/verificar_wavs_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import shutil 5 | import pandas as pd 6 | import csv 7 | import tqdm 8 | 9 | def deletar(args): 10 | total = 0 11 | separator = '|' 12 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 13 | if os.path.exists(folder) and os.path.isdir(folder): 14 | for wav_file in sorted(glob.glob(folder + "/wavs/*.wav")): 15 | filename = wav_file.split('/')[-1] 16 | folder = '/'.join(wav_file.split('/')[0:-2]) 17 | metadata_path = os.path.join(folder, args.csv_file) 18 | if not os.path.exists(metadata_path): 19 | continue 20 | f = open(metadata_path, 'r') 21 | content = f.readlines() 22 | found = False 23 | for line in content: 24 | filename_metadata, _, _, _ = line.split(separator) 25 | if filename_metadata == filename: 26 | found = True 27 | break 28 | if not found: 
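                    # no row in metadata.csv matches this wav file -> orphaned audio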
29 |                     total+=1
30 |                     if not args.erase:
31 |                         print('Delete file: ' + wav_file)
32 |                     else:
33 |                         os.remove(wav_file)
34 | 
35 |     if not args.erase:
36 |         print('Total wavs to be erased: ', total)
37 |     else:
38 |         print('Total erased: ', total)
39 | 
40 | 
41 | def main():
42 |     parser = argparse.ArgumentParser()
43 |     parser.add_argument('--base_dir', default='./')
44 |     parser.add_argument('--csv_file', default='metadata.csv')
45 |     parser.add_argument('--erase', action='store_true', default=False)
46 |     #parser.add_argument('--delete_file', default='delete.csv')
47 | 
48 |     args = parser.parse_args()
49 |     deletar(args)
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 
--------------------------------------------------------------------------------
/utils/create_internal_metadata_min_lev.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | import pandas as pd
6 | import csv
7 | import tqdm
8 | 
9 | def deletar(args):
10 |     total = 0
11 |     for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))):
12 |         if os.path.exists(folder) and os.path.isdir(folder):
13 |             metadata = os.path.join(folder, args.input_file)
14 |             if not os.path.exists(metadata):
15 |                 continue
16 |             df = pd.read_csv(metadata, sep = '|', quoting=csv.QUOTE_NONE)
17 |             # Creating save files
18 |             save_df = df[df['similarity'] >= float(args.min_value)].copy()
19 |             filenames = save_df['filename'].apply(lambda x: x.split('/')[-1])
20 |             save_df['filename'] = filenames
21 |             save_df.to_csv(os.path.join(folder, args.save_file), sep = '|', index=False, quoting=csv.QUOTE_NONE)
22 |             # Creating delete files
23 |             delete_df = df[df['similarity'] < float(args.min_value)].copy()
24 |             filenames = delete_df['filename'].apply(lambda x: x.split('/')[-1])
25 |             delete_df['filename'] = filenames
26 |             delete_df.to_csv(os.path.join(folder, args.delete_file), sep = '|', index=False, quoting=csv.QUOTE_NONE)
27 | 
28 |             total += 1
29 | 
30 |     print('Total save/delete metadata files created: ', total)
31 | 
32 | 
33 | def main():
34 |     parser = argparse.ArgumentParser()
35 |     parser.add_argument('--base_dir', default='./output/channel/')
36 |     parser.add_argument('--input_file', default='validation.csv')
37 |     parser.add_argument('--save_file', default='save.csv')
38 |     parser.add_argument('--delete_file', default='delete.csv')
39 |     parser.add_argument('--min_value', default=0.90, help='Minimum similarity value (normalized Levenshtein)')
40 |     args = parser.parse_args()
41 |     deletar(args)
42 | 
43 | if __name__ == "__main__":
44 |     main()
45 | 
--------------------------------------------------------------------------------
/utils/create_compressed_package.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import tarfile
3 | import os
4 | import tqdm
5 | 
6 | def create_brspeech_file(input_file, output_file, min_value):
7 |     root = 'BRSpeech-ASR'
8 |     write_mode = 'w:bz2' # 'w' or 'w:gz' or 'w:bz2'
9 |     internal_folder = 'wavs'
10 | 
11 |     tar_file = tarfile.open(output_file, mode=write_mode)
12 | 
13 |     num_lines = sum(1 for line in open(input_file,'r'))
14 |     in_file = open(input_file, "r")
15 |     folders_list = []
16 |     for line in tqdm.tqdm(in_file, total=num_lines):
17 | 
18 |         file, subtitle, transcript, levenshtein = line.split('|')
19 | 
20 |         folder = file.split('/')[-3]
21 |         folder_path = '/'.join(file.split('/')[:-2])
22 |         filename = file.split('/')[-1]
23 | 
24 |         #if float(levenshtein) > float(min_value):
25 | 
26 |         tar_file.add(file, 
arcname=os.path.join(root, folder, internal_folder, filename))
27 |         if folder_path not in folders_list: folders_list.append(folder_path)
28 | 
29 |     for folder in folders_list:
30 |         tar_file.add(os.path.join(folder, 'validation.csv'), arcname=os.path.join(root, os.path.basename(folder), 'validation.csv'))
31 | 
32 |     tar_file.add(input_file, arcname=os.path.join(root, 'metadata.csv'))
33 |     tar_file.close()
34 |     in_file.close()
35 | 
36 | 
37 | def main():
38 | 
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument('--base_dir', default='./')
41 |     parser.add_argument('--metadata_file', default='metadata_all.csv', help='Input filename')
42 |     parser.add_argument('--output_file', default='BRSpeech.tar.bz', help='Tar.bz file')
43 |     parser.add_argument('--min_value', default=0.95, help='Minimum similarity value')
44 | 
45 |     args = parser.parse_args()
46 | 
47 |     input_file = os.path.join(args.base_dir, args.metadata_file)
48 |     output_file = os.path.join(args.base_dir, args.output_file)
49 | 
50 |     create_brspeech_file(input_file, output_file, args.min_value)
51 | 
52 | if __name__ == "__main__":
53 |     main()
54 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: katube
2 | channels:
3 |   - defaults
4 | dependencies:
5 |   - _libgcc_mutex=0.1=main
6 |   - _openmp_mutex=5.1=1_gnu
7 |   - ca-certificates=2022.4.26=h06a4308_0
8 |   - ld_impl_linux-64=2.38=h1181459_1
9 |   - libffi=3.3=he6710b0_2
10 |   - libgcc-ng=11.2.0=h1234567_1
11 |   - libgomp=11.2.0=h1234567_1
12 |   - libstdcxx-ng=11.2.0=h1234567_1
13 |   - ncurses=6.3=h7f8727e_2
14 |   - openssl=1.1.1o=h7f8727e_0
15 |   - pip=21.2.4=py38h06a4308_0
16 |   - python=3.8.13=h12debd9_0
17 |   - readline=8.1.2=h7f8727e_1
18 |   - setuptools=61.2.0=py38h06a4308_0
19 |   - sqlite=3.38.3=hc218d9a_0
20 |   - tk=8.6.12=h1ccaba5_0
21 |   - wheel=0.37.1=pyhd3eb1b0_0
22 |   - xz=5.2.5=h7f8727e_1
23 |   - zlib=1.2.12=h7f8727e_2
24 |   - pip:
25 |     - apiclient==1.0.4
26 |     - audioread==2.1.8
27 |     - beautifulsoup4==4.9.0
28 |     - cachetools==4.0.0
29 |     - certifi==2019.11.28
30 |     - cffi==1.13.2
31 |     - chardet==3.0.4
32 |     - decorator==4.4.1
33 |     - google-api-python-client==1.7.11
34 |     - google-auth==1.10.1
35 |     - google-auth-httplib2==0.0.3
36 |     - httplib2==0.15.0
37 |     - idna==2.8
38 |     - joblib==0.14.1
39 |     - librosa==0.7.2
40 |     - llvmlite==0.31.0
41 |     - lxml==4.5.0
42 |     - numba==0.47.0
43 |     - numpy==1.22.4
44 |     - oauth2client==3.0.0
45 |     - pandas==1.0.3
46 |     - pyasn1==0.4.8
47 |     - pyasn1-modules==0.2.8
48 |     - pycparser==2.19
49 |     - pydub==0.23.1
50 |     - pysubs2==0.2.4
51 |     - python-dateutil==2.8.1
52 |     - pytube3==9.6.4
53 |     - pytz==2019.3
54 |     - requests==2.22.0
55 |     - resampy==0.2.2
56 |     - rsa==4.0
57 |     - scikit-learn==0.22.1
58 |     - scipy==1.4.1
59 |     - six==1.13.0
60 |     - soundfile==0.10.3.post1
61 |     - soupsieve==2.0
62 |     - textdistance==4.1.5
63 |     - tqdm==4.41.1
64 |     - typing-extensions==3.7.4.2
65 |     - uritemplate==3.0.1
66 |     - urllib3==1.25.7
67 |     - youtube-dl==2021.4.17
68 |     - youtube-transcript-api==0.3.1
69 | prefix: /opt/anaconda3/envs/katube
--------------------------------------------------------------------------------
/utils/recreate_metadata.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | from os import makedirs
4 | from os.path import join, exists, isdir
5 | import csv
6 | import tqdm
7 | 
8 | def regenerate_metadata(input_file1, basename, output_file):
9 | 
10 |     try:
11 |         f = open(input_file1)
12 | 
content_file1 = f.readlines()[1:] 13 | except IOError: 14 | print("Error: File {} does not appear to exist.".format(input_file1)) 15 | return False 16 | else: 17 | f.close() 18 | 19 | output_file = open(output_file, 'a') 20 | separator = '|' 21 | 22 | for line1 in content_file1: 23 | file1, text1, text2, lev = line1.split('|') 24 | filepath = join(basename, file1) 25 | line = separator.join([filepath, text1.rstrip(), text2.strip(), str(lev)]) 26 | output_file.write(line) 27 | 28 | output_file.close() 29 | return True 30 | 31 | def main(): 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--base_dir', default='./output/channel/') 35 | parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file') 36 | parser.add_argument('--wav_folder', default='wavs', help='Name of wavs folder') 37 | parser.add_argument('--internal_csv_file', default='metadata.csv', help='Name of csv file') 38 | args = parser.parse_args() 39 | 40 | separator = '|' 41 | output_path_file = join(args.base_dir, args.csv_file) 42 | output_file = open(output_path_file, 'w') 43 | header = separator.join(['filename', 'subtitle', 'transcript', 'similarity']) + '\n' 44 | output_file.write(header) 45 | output_file.close() 46 | 47 | for folder_path in tqdm.tqdm(sorted(glob.glob(args.base_dir + '/*/*'))): 48 | if not isdir(folder_path): 49 | continue 50 | foldername = join(folder_path, args.wav_folder) 51 | input_path_file1 = join(folder_path, args.internal_csv_file) 52 | regenerate_metadata(input_path_file1, foldername, output_path_file) 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /synchronization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com 5 | # 6 | # 7 | import argparse 8 | import sys 9 | from os.path import split, join 10 | from aeneas.executetask import ExecuteTask 11 | from aeneas.task import Task 12 | 13 | 14 | def create_aeneas_json_file(audio_path, text_path, output_path): 15 | """ 16 | Use the api aeneas to synchronize audio and text. 17 | 18 | Parameters: 19 | audio_path (str): audio filepath. 20 | text_path (str): text filepath. 21 | output_path (str): output json filepath. 22 | 23 | Returns: 24 | Boolean: True or False. 
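
    Example (hypothetical filenames):
        create_aeneas_json_file('video/audio.mp3', 'video/input.txt', 'video/output.json')
        writes a JSON sync map that aligns each line of input.txt to a time interval in audio.mp3.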
25 |     """
26 |     try:
27 |         # create Task object
28 |         config_string = u"task_language=por|is_text_type=plain|os_task_file_format=json|task_adjust_boundary_percent_value=50|mfcc_mask_nonspeech_l2=True"
29 |         task = Task(config_string=config_string)
30 |         task.audio_file_path_absolute = u"{}".format(audio_path)
31 |         task.text_file_path_absolute = u"{}".format(text_path)
32 |         task.sync_map_file_path_absolute = u"{}".format(output_path)
33 | 
34 |         # process Task
35 |         ExecuteTask(task).execute()
36 | 
37 |         # output sync map to file
38 |         task.output_sync_map_file()
39 | 
40 |     except KeyboardInterrupt:
41 |         print("KeyboardInterrupt Detected!")
42 |         exit()
43 | 
44 |     except:
45 |         exc_type, exc_obj, exc_tb = sys.exc_info()
46 |         exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
47 |         print(exc_type, exc_file, exc_tb.tb_lineno)
48 |         return False
49 | 
50 |     return True
51 | 
52 | 
53 | def main():
54 |     parser = argparse.ArgumentParser()
55 |     parser.add_argument('--base_dir', default='./')
56 |     parser.add_argument('--audio_file', default='audio.mp3', help='Filename of input audio file')
57 |     parser.add_argument('--text_file', default='input.txt', help='Filename of input text')
58 |     parser.add_argument('--output_file', default='output.json', help='Output json file')
59 |     args = parser.parse_args()
60 | 
61 |     audio_path = join(args.base_dir, args.audio_file)
62 |     text_path = join(args.base_dir, args.text_file)
63 |     output_path = join(args.base_dir, args.output_file)
64 |     create_aeneas_json_file(audio_path, text_path, output_path)
65 | 
66 | if __name__ == "__main__":
67 |     main()
68 | 
--------------------------------------------------------------------------------
/utils/delete_folders_with_erros.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | 
6 | def erase_folders_with_error(args):
7 |     total = 0
8 |     for folder in sorted(glob.glob(args.base_dir + "/*/*")):
9 |         if os.path.exists(folder) and os.path.isdir(folder):
10 |             if os.path.isfile(os.path.join(folder, args.metada_file1)):
11 |                 try:
12 |                     with open(os.path.join(folder, args.metada_file1)) as f:
13 |                         content_file1 = f.readlines()
14 |                 except IOError:
15 |                     print("Error: File {} does not appear to exist.".format(args.metada_file1))
16 |                     # return False
17 |             else:
18 |                 total += 1
19 |                 if not args.force:
20 |                     print(folder)
21 |                 else:
22 |                     shutil.rmtree(folder)
23 |                 continue
24 |             if os.path.isfile(os.path.join(folder, args.metada_file2)):
25 |                 try:
26 |                     with open(os.path.join(folder, args.metada_file2)) as f:
27 |                         content_file2 = f.readlines()
28 |                 except IOError:
29 |                     print("Error: File {} does not appear to exist.".format(args.metada_file2))
30 |                     #return False
31 |             else:
32 |                 total += 1
33 |                 if not args.force:
34 |                     print(folder)
35 |                 else:
36 |                     shutil.rmtree(folder)
37 |                 continue
38 | 
39 |             if not (len(content_file1) == len(content_file2)):
40 |                 total += 1
41 |                 if not args.force:
42 |                     print(folder)
43 |                 else:
44 |                     shutil.rmtree(folder)
45 |     if args.force:
46 |         print('Total folders erased: ', total)
47 |     else:
48 |         print('Total folders with problems: ', total)
49 | 
50 | def main():
51 |     parser = argparse.ArgumentParser()
52 |     parser.add_argument('--base_dir', default='./output/channel')
53 |     parser.add_argument('--metada_file1', default='subtitles.csv')
54 |     parser.add_argument('--metada_file2', default='transcript.csv')
55 |     parser.add_argument('--force', action='store_true', default=False)
56 |     args = parser.parse_args()
57 |     erase_folders_with_error(args)
58 | 
59 | if 
__name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /tools/Azure/azure_cloud.py: -------------------------------------------------------------------------------- 1 | import azure.cognitiveservices.speech as speechsdk 2 | from config import Config as config 3 | import os 4 | from pathlib import Path 5 | import tqdm 6 | import glob 7 | 8 | import pandas as pd 9 | 10 | def pass_through_files(speech_config=None): 11 | ''' 12 | Realiza a análise de todos os arquivos no diretório desejado 13 | para a transcrição. 
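    Assumes config.base_dir points to a directory tree containing .wav files,
    which are discovered recursively below via glob('**/*.wav').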
14 |     '''
15 | 
16 |     transcribed_texts = []
17 |     file_names = []
18 | 
19 |     for filepath in tqdm.tqdm(sorted(glob.glob(config.base_dir + '/**/*.wav', recursive=True))):
20 |         transcription = run_transcription(filepath, speech_config)
21 | 
22 |         transcribed_texts.append(transcription)
23 |         file_names.append(filepath.split('/')[-1])
24 | 
25 | 
26 |     return transcribed_texts, file_names
27 | 
28 | 
29 | def run_transcription(filepath='./', speech_config=None):
30 |     '''
31 |     Transcribes a single audio file asynchronously.
32 |     '''
33 | 
34 |     audio_input = speechsdk.AudioConfig(filename=filepath)
35 |     speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
36 |                                                    audio_config=audio_input,
37 |                                                    language="pt-BR")
38 | 
39 |     result_future = speech_recognizer.recognize_once_async()
40 | 
41 |     # Retrieve the recognition result. This blocks until recognition is complete.
42 |     result = result_future.get()
43 | 
44 |     # Check the result
45 |     if result.reason == speechsdk.ResultReason.RecognizedSpeech:
46 |         # print(result.text)
47 |         return result.text
48 |     else:
49 |         return ''
50 |     # elif result.reason == speechsdk.ResultReason.NoMatch:
51 |     #     print("No speech could be recognized: {}".format(result.no_match_details))
52 |     # elif result.reason == speechsdk.ResultReason.Canceled:
53 |     #     cancellation_details = result.cancellation_details
54 |     #     print("Speech Recognition canceled: {}".format(cancellation_details.reason))
55 |     #     if cancellation_details.reason == speechsdk.CancellationReason.Error:
56 |     #         print("Error details: {}".format(cancellation_details.error_details))
57 | 
58 | def make_metadata(file_names, transcribed_texts):
59 |     """
60 |     Creates a csv file with the transcribed texts.
61 |     """
62 | 
63 |     os.makedirs(config.output_path, exist_ok=True)
64 | 
65 |     df = pd.DataFrame()
66 | 
67 |     for file_name, text in zip(file_names, transcribed_texts):
68 |         df = df.append({'A': file_name, 'B' : text}, ignore_index=True)
69 | 
70 |     df.to_csv(os.path.join(config.output_path, config.output_name.lower() + '_transcribed_azure' + '.csv'), sep='|', index=False, header=False, quotechar="'")
71 | 
72 | def main():
73 | 
74 |     speech_config = speechsdk.SpeechConfig(subscription=config.speech_key, region=config.service_region)
75 |     transcribed_texts, file_names = pass_through_files(speech_config)
76 |     make_metadata(file_names, transcribed_texts)
77 | 
78 | 
79 | if __name__ == '__main__':
80 |     main()
--------------------------------------------------------------------------------
/selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
5 | #
6 | #
7 | import argparse
8 | import sys
9 | from os import remove
10 | from os.path import basename, join, split
11 | 
12 | 
13 | def select(input_csv_file, output_filepath, min_similarity, force):
14 |     """
15 |     Given a csv file, selects only files with similarity greater than min_similarity and deletes the others.
16 | 
17 |     Parameters:
18 |         input_csv_file (str): Input csv filepath following the template: "filename| subtitle | transcript | similarity"
19 |         output_filepath (str): Output csv filepath following the template: "filename| subtitle | transcript | similarity"
20 |         min_similarity (float): Threshold that defines which files will be excluded.
21 |         force (boolean): if True, it will remove the files, otherwise only show what files will be removed.
22 | 
23 |     Returns:
24 |         Boolean: returns True or False.
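
        Example (hypothetical rows): with min_similarity=0.90, the row
        "a.wav|ola mundo|ola mundo|0.95" is kept in the output csv, while
        "b.wav|bom dia|boa noite|0.40" has its wav deleted (only listed when force is False).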
25 | """ 26 | 27 | try: 28 | f = open(input_csv_file) 29 | content_file = f.readlines()[1:] 30 | 31 | except IOError: 32 | print("Error: File {} does not appear to exist.".format(input_csv_file)) 33 | return False 34 | 35 | else: 36 | f.close() 37 | 38 | try: 39 | separator = '|' 40 | output_file = open(output_filepath, 'w') 41 | header = separator.join(['filename', 'subtitle', 'transcript', 'similarity']) + '\n' 42 | output_file.write(header) 43 | 44 | for line in content_file: 45 | filepath, text1, text2, similarity = line.split(separator) 46 | 47 | # Selects only files with similarity greater than min_similarity 48 | if float(similarity) >= float(min_similarity): 49 | filename = basename(filepath) 50 | line = separator.join([filename.strip(), text1.strip(), text2.strip(), str(similarity).strip()]) 51 | output_file.write(line + '\n') 52 | 53 | # otherwise, delete the file. 54 | else: 55 | if force: 56 | remove(filepath) 57 | else: 58 | print('rm {}'.format(filepath)) 59 | 60 | output_file.close() 61 | 62 | except KeyboardInterrupt: 63 | print("KeyboardInterrupt Detected!") 64 | exit() 65 | 66 | except: 67 | exc_type, exc_obj, exc_tb = sys.exc_info() 68 | exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1] 69 | print(exc_type, exc_file, exc_tb.tb_lineno) 70 | return False 71 | 72 | return True 73 | 74 | 75 | def main(): 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--base_dir', default='./') 78 | parser.add_argument('--csv_file', default='validation.csv', help='Name of csv file') 79 | parser.add_argument('--min_value', default=0.90, help='Minimal value of levenshtein distance') 80 | parser.add_argument('--save_file', default='metadata.csv') 81 | parser.add_argument('--force', action='store_true', default=False) 82 | args = parser.parse_args() 83 | 84 | input_csv_file = join(args.base_dir, args.csv_file) 85 | output_filepath = join(args.base_dir, args.save_file) 86 | 87 | select(input_csv_file, output_filepath, args.min_value, args.force) 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /utils/exclude_unecessary_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import tqdm 5 | 6 | def exclude_files(args): 7 | 8 | total = 0 9 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + '/*/*'))): 10 | 11 | if not os.path.exists(folder) or not os.path.isdir(folder): 12 | continue 13 | 14 | old_metadata_file = os.path.join(folder, 'save.csv') 15 | metadata_file = os.path.join(folder, 'metadata.csv') 16 | 17 | if os.path.exists(metadata_file): 18 | continue 19 | 20 | if not os.path.exists(old_metadata_file): 21 | print('Verify the folder: ' + folder) 22 | continue 23 | #exit() 24 | 25 | folder_name = folder.split('/')[-1] 26 | json_file = os.path.join(folder, folder_name + '.json') 27 | if not os.path.isfile(json_file): 28 | print(json_file) 29 | continue 30 | srt_file = os.path.join(folder, folder_name + '.srt') 31 | if not os.path.isfile(srt_file): 32 | print(srt_file) 33 | continue 34 | txt_file = os.path.join(folder, folder_name + '.txt') 35 | if not os.path.isfile(txt_file): 36 | print(txt_file) 37 | continue 38 | subtitles_file = os.path.join(folder, 'subtitles.csv') 39 | if not os.path.isfile(subtitles_file): 40 | print(subtitles_file) 41 | continue 42 | transcript_file = os.path.join(folder, 'transcript.csv') 43 | if not os.path.isfile(transcript_file): 44 | print(transcript_file) 
45 |                 continue
46 |             validation_file = os.path.join(folder, 'validation.csv')
47 |             if not os.path.isfile(validation_file):
48 |                 print(validation_file)
49 |                 continue
50 |             delete_file = os.path.join(folder, 'delete.csv')
51 |             if not os.path.isfile(delete_file):
52 |                 print(delete_file)
53 |                 continue
54 | 
55 |             try:
56 |                 f = open(old_metadata_file)
57 |                 content_file = f.readlines()[1:]
58 |             except IOError:
59 |                 print("Error: File {} does not appear to exist.".format(old_metadata_file))
60 |                 return False
61 |             else:
62 |                 f.close()
63 | 
64 |             if (len(content_file) == len(os.listdir(os.path.join(folder, args.wav_folder)))):
65 |                 total +=1
66 |                 if not args.force:
67 |                     print('mv ' + old_metadata_file + ' ' + metadata_file)
68 |                 else:
69 |                     os.remove(json_file)
70 |                     os.remove(srt_file)
71 |                     os.remove(txt_file)
72 |                     os.remove(subtitles_file)
73 |                     os.remove(transcript_file)
74 |                     os.remove(validation_file)
75 |                     os.remove(delete_file)
76 |                     os.rename(old_metadata_file, metadata_file)
77 | 
78 |             else:
79 |                 print('Found differences between ' + folder + ' and wavs.')
80 | 
81 | 
82 | 
83 | def main():
84 |     parser = argparse.ArgumentParser()
85 |     parser.add_argument('--base_dir', default='./output/channel/')
86 |     parser.add_argument('--wav_folder', default='wavs', help='Name of wavs folder')
87 |     parser.add_argument('--force', action='store_true', default=False)
88 |     args = parser.parse_args()
89 |     exclude_files(args)
90 | 
91 | if __name__ == "__main__":
92 |     main()
--------------------------------------------------------------------------------
/search.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
5 | #
6 | #
7 | from config import Config
8 | import argparse
9 | from os import makedirs
10 | from os.path import join, exists, split
11 | from googleapiclient.discovery import build
12 | from tqdm import tqdm
13 | import sys
14 | 
15 | 
16 | def get_videos(youtube, conv_id):
17 |     """
18 |     Get all videos from youtube channel/playlist.
19 | 
20 |     Parameters:
21 |         youtube (str): googleapiclient object.
22 |         conv_id (str): google channel/playlist id.
23 | 
24 |     Returns:
25 |         videos (list): list of video items.
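
    Note: playlistItems.list returns at most 50 items per request, so the loop
    below follows nextPageToken until the whole playlist has been paged through.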
26 | """ 27 | 28 | videos = [] 29 | next_page_token = None 30 | 31 | while True: 32 | res = youtube.playlistItems().list(playlistId = conv_id, 33 | part = 'snippet', 34 | maxResults = 50, 35 | pageToken = next_page_token).execute() 36 | 37 | videos += res['items'] 38 | next_page_token = res.get('nextPageToken') 39 | 40 | if next_page_token is None: 41 | break 42 | 43 | return videos 44 | 45 | 46 | def search_videos(api_key, content_id, output_folderpath, output_result_file): 47 | """ 48 | Search all the videos from a channel 49 | 50 | Parameters: 51 | api_key (str): Google developer Key 52 | content_id (str): Playlist or Channel id 53 | output_folderpath (str): folder 54 | output_result_file (str): output file to save youtube videos list 55 | 56 | Returns: 57 | file_path: returns 58 | """ 59 | youtube_prefix = 'https://www.youtube.com/watch?v=' 60 | 61 | api_service_name = 'youtube' 62 | api_version = 'v3' 63 | 64 | #print('Searching videos from {} - {}...'.format(Config.orig_base, content_id)) 65 | path_dest = join(output_folderpath, Config.orig_base, content_id ) 66 | 67 | if not(exists(path_dest)): 68 | makedirs(path_dest) 69 | 70 | output_filepath = join(path_dest, output_result_file) 71 | 72 | # Checks if it has already been downloaded 73 | if exists(output_filepath): 74 | return output_filepath 75 | 76 | try: 77 | # Open output file 78 | f = open(output_filepath, 'w+') 79 | 80 | youtube = build(api_service_name, api_version, developerKey = api_key) 81 | 82 | if Config.orig_base == 'playlist': 83 | conv_id = content_id 84 | elif Config.orig_base == 'channel': 85 | res = youtube.channels().list(id = content_id, 86 | part = 'contentDetails').execute() 87 | conv_id = res['items'][0]['contentDetails']['relatedPlaylists']['uploads'] 88 | else: 89 | conv_id = None 90 | 91 | # Get all videos from youtube channel/playlist. 
        videos = get_videos(youtube, conv_id)

        print('Writing video links to file...')
        for video in tqdm(videos):
            f.write(youtube_prefix + video['snippet']['resourceId']['videoId'] + '\n')

        print("Total videos: {0}".format(len(videos)))
        f.close()

    except KeyboardInterrupt:
        print("KeyboardInterrupt Detected!")
        exit()

    except:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, exc_file, exc_tb.tb_lineno)
        return False

    return output_filepath


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--api_key', default='')
    parser.add_argument('--content_id', default='')
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--dest_dir', default='output')
    parser.add_argument('--output_search_file', default='youtube_videos.txt')
    args = parser.parse_args()
    output_path = join(args.base_dir, args.dest_dir)
    search_videos(args.api_key, args.content_id, output_path, args.output_search_file)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/download.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import argparse
import sys
from os import makedirs
from os.path import join, exists, split
import time
import youtube_dl
from youtube_transcript_api import YouTubeTranscriptApi
from pathlib import Path
from urllib.parse import parse_qs, urlparse
from random import randint

def my_progress(d):
    '''
    Show download progress.
    '''
    if d['status'] == 'finished':
        print('Done downloading, now converting ...')


def download_audio_and_subtitles_from_youtube(yt_url, output_path): # function for ingesting when given a url
    '''
    Download audio and subtitle from a youtube video given a url.
    Parameters:
        yt_url (str): Youtube URL format https://www.youtube.com/watch?v=XXXXXXXXXXX
        output_path (str): folder to save the youtube audio.

    Returns:
        Boolean: returns True if the download succeeded, False otherwise.

    '''
    # Use vid as the directory name for download and processing
    vids = parse_qs(urlparse(yt_url).query, keep_blank_values=True).get('v')
    vid = None if vids is None else vids[0]

    video_dir = join(output_path, vid)

    # Filename for audio stream (.mp4) and subtitle (.srt) files
    audio = join(video_dir, vid + '.webm')
    subtitle = join(video_dir, vid + '.srt')

    if Path(audio).exists() and Path(subtitle).exists():
        return False

    if exists(audio.replace('.webm', '.mp3')) and exists(subtitle):
        return False

    # Get information on the YouTube content
    try:
        # Wait a random time to avoid youtube access blocking
        t = randint(30, 60)
        print('Waiting %d seconds ...' % (t))
        time.sleep(t)  # Overcome YouTube blocking

        if not (exists(video_dir)):
            makedirs(video_dir)

        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '320',
            }],
            'outtmpl': audio,
            'noplaylist' : True,
            'progress_hooks': [my_progress],
        }
        # Download audio stream and convert to mp3
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([yt_url])

        # get video_id from the youtube url
        video_id = yt_url.replace('https://www.youtube.com/watch?v=','')
        # Download subtitle and write to an .srt file
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # filter first for manually created transcripts and second for automatically generated ones
        transcript = transcript_list.find_transcript(['pt'])
        # get only the text from the transcript
        text_transcript_list = []
        for line in transcript.fetch():
            text_transcript_list.append(line['text'])
        text_transcript = ' '.join(text_transcript_list)

        # Write transcript to file
        output_file = open(subtitle, 'w')
        output_file.write(text_transcript)
        output_file.close()

    except KeyboardInterrupt:
        print("KeyboardInterrupt Detected!")
        exit()

    except:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, exc_file, exc_tb.tb_lineno)
        return False

    return True

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--youtube_url', help="URL of the youtube video.")
    parser.add_argument('--output_dir', default='data', help='Directory to save downloaded audio and transcript files.')

    args = parser.parse_args()

    if args.youtube_url.startswith('https://'):
        download_audio_and_subtitles_from_youtube(args.youtube_url, args.output_dir)

    else:
        print("URL of the video file should start with https://")
        sys.exit(1)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/transcribe.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import argparse
import sys
from os import makedirs
from os.path import join, exists, basename, split
from glob import glob
from tqdm import tqdm
import librosa
import requests
import soundfile as sf
import json


def convert_audios_samplerate(input_path, output_path, new_sample_rate):
    """
    Converts all audio files within a folder to a new sample rate.
    Parameters:
        input_path: input folder path with wav files.
        output_path: output folder path to save the converted wav files.
        new_sample_rate: target sample rate.

    Returns:
        Boolean: True or False.
    """

    if not(exists(output_path)):
        makedirs(output_path)

    for wavfile_path in tqdm(sorted(glob(input_path + "/*.wav"))):
        try:
            filename = basename(wavfile_path)
            data, sample_rate = librosa.load(wavfile_path)
            data = data.T
            new_data = librosa.resample(data, sample_rate, new_sample_rate)
            output_file = join(output_path, filename)
            sf.write(output_file, new_data, new_sample_rate)
        except:
            print('Error converting ' + wavfile_path)
            return False

    return True


def get_transcript(wavefile_path):
    """
    Custom function to access an STT service. You must adapt it to use your contracted STT service.
    Parameters:
        wavefile_path: wav filepath which will be transcribed.

    Returns:
        Text (str): Transcription of the wav file.
    """
    with open(wavefile_path, 'rb') as file_data:
        headers_raw = {
            'Content-Type': "application/x-www-form-urlencoded",
            'endpointer.enabled': "true",
            'endpointer.waitEnd': "5000",
            'endpointer.levelThreshold': "5",
            'decoder.confidenceThreshold': "10",
            'decoder.maxSentences': "1",
            'decoder.wordDetails': "0",
        }
        try:
            res = requests.post(url='https://your_url_here',
                                data=file_data,
                                headers=headers_raw)

            res.encoding = 'utf-8'
        except KeyboardInterrupt:
            print("KeyboardInterrupt Detected!")
            exit()
        except:
            #json_data=[{"message": "ERROR NO SPEECH"}]
            #return json_data
            return False
        return res.text


def transcribe_audios(input_path, output_file):
    """
    Iterate over the wav files inside a folder and transcribe them all.
    Parameters:
        input_path: input wavs folder.
        output_file: output file to save the transcriptions following the template: "filename|transcription"

    Returns:
        Boolean: True or False.
    """

    out = open(output_file, 'w')

    for wavfile_path in tqdm(sorted(glob(input_path + "/*.wav"))):
        filename = basename(wavfile_path)
        # Up to four attempts in case a connection error occurs.
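        # A False return from get_transcript() (e.g. connection refused) and a
        # successfully parsed response both break out of the retry loop at once;
        # only a malformed JSON payload falls through to another attempt. The
        # for-else below assigns an empty text when all four attempts fail.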
        for attempts in range(4):

            if attempts != 0:
                print('Attempt - {}...'.format(attempts))

            transcript = get_transcript(wavfile_path)
            if not transcript:
                text = ''
                break

            try:
                transcript_json = json.loads(str(transcript).replace("'", '"'))
                if transcript_json[0]['result_status'] == 'RECOGNIZED':
                    text = transcript_json[0]['alternatives'][0]['text']
                    break
                else:
                    #print("Erro")
                    text = ''
                    break
            except:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Transcribing error: ")
                print(exc_type, exc_file, exc_tb.tb_lineno)

        else:
            text = ''

        out.write("{}|{}\n".format(str(filename), str(text)))

    out.close()
    return True


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--transcription_file', default='transcript.txt', help='Filename to save the transcripts')
    parser.add_argument('--input_dir', default='wavs', help='Directory of wav files')
    parser.add_argument('--temp_dir', default='wavs_16k', help='Directory to save wav files with sample rate (16k)')
    parser.add_argument('--new_sample_rate', default=16000, help='Sample rate used by the transcription api.')

    args = parser.parse_args()

    input_path = join(args.base_dir, args.input_dir)
    converted_wavs_temp_path = join(args.base_dir, args.temp_dir)
    output_file = join(args.base_dir, args.transcription_file)

    # Convert audio sample rate
    print('Converting wav files...')
    convert_audios_samplerate(input_path, converted_wavs_temp_path, args.new_sample_rate)

    # Transcribe all wavs files
    print('Transcribing...')
    transcribe_audios(converted_wavs_temp_path, output_file)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/validation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import argparse
from os import makedirs
from os.path import join, exists, dirname
from textdistance import levenshtein
from tqdm import tqdm


def remove_punctuations(sentence):
    """
    Removes punctuations and unwanted characters from a sentence.
    """
    punctuations = '''—!()-[]{};:'"\,<>./?@#$%^&*_~'''
    sentence_with_no_punct = ""
    for char in sentence:
        if char not in punctuations:
            sentence_with_no_punct = sentence_with_no_punct + char
    return sentence_with_no_punct.strip()


def clear_sentences(sentence):
    """
    Converts the sentence to lowercase and removes unwanted characters.
    """
    sentence = sentence.lower()
    clean_sentence = remove_punctuations(sentence)
    return clean_sentence


def create_validation_file(input_file1, input_file2, prefix_filepath, output_file):
    """
    Given two files containing different transcriptions of audio files, this function calculates the similarity (levenshtein distance) between the sentences,
    saving the result in a third file.

    Parameters:
        input_file1 (str): First filepath. The contents of the file must follow the template: "filename | text"
        input_file2 (str): Second filepath. The contents of the file must follow the template: "filename | text"
        prefix_filepath: Prefix to be added to the file path within the output file.

    Returns:
        output_file (str): Returns the output filepath. The content of the file follows the template: prefix_filepath/filename | text1 | text2 | similarity
    """

    # Loads the contents of the first input file
    try:
        with open(input_file1) as f:
            content_file1 = f.readlines()

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: File {} does not appear to exist.".format(input_file1))
        return False

    # Loads the contents of the second input file
    try:
        with open(input_file2) as g:
            content_file2 = g.readlines()

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: File {} does not appear to exist.".format(input_file2))
        return False

    # Both files must be the same length, otherwise there is an error.
    if not (len(content_file1) == len(content_file2)):
        print("Error: length of file {} is not equal to file {}.".format(input_file1, input_file2))
        return False

    # Checks if the output folder exists
    output_folderpath = dirname(output_file)

    if not(exists(output_folderpath)):
        makedirs(output_folderpath)

    # Saves the result to the output file.
    try:
        o_file = open(output_file, 'w')

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: problem creating file {}.".format(output_file))
        return False

    # Iterate over the two files' content simultaneously to calculate the similarity between the sentences.
    else:
        separator = '|'
        header = separator.join(['filename', 'subtitle', 'transcript', 'similarity'])
        o_file.write(header + '\n')

        # Input files must be csv files with the character "|" as a separator: filename | text
        for line1, line2 in tqdm(zip(content_file1, content_file2), total=len(content_file1)):

            file1, text1 = line1.split('|')
            file2, text2 = line2.split('|')

            # Clears the sentences by removing unwanted characters.
            clean_text1 = clear_sentences(text1)
            clean_text2 = clear_sentences(text2)
            filepath = join(prefix_filepath, file1)

            # Calculates the levenshtein distance to define the normalized similarity (0-1) between the two sentences.
            l = levenshtein.normalized_similarity(clean_text1, clean_text2)

            # Defines the output content and writes it to the file.
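            # Example row (illustrative values): "wavs/xyz-0001.wav|ola mundo|ola mundo|1.0".
            # With textdistance, normalized_similarity = 1 - distance/max(len), so e.g.
            # levenshtein.normalized_similarity('casa', 'caza') == 0.75 (one edit over length 4).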
            line = separator.join([filepath, text1.strip(), text2.strip(), str(l)])
            o_file.write(line + '\n')

    finally:
        o_file.close()

    return True


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--input_file1', default='metadata1.csv', help='Input first filename')
    parser.add_argument('--input_file2', default='metadata2.csv', help='Input second filename')
    parser.add_argument('--prefix', default='', help='Prefix to filename on metadata output file.')
    parser.add_argument('--output_dir', default='output', help='Directory to save distances')
    parser.add_argument('--output_file', default='validation.csv', help='Output file with the template: "filename, text1, text2, similarity"')

    args = parser.parse_args()

    input_path_file1 = join(args.base_dir, args.input_file1)
    input_path_file2 = join(args.base_dir, args.input_file2)
    output_path_file = join(args.base_dir, args.output_dir, args.output_file)

    create_validation_file(input_path_file1, input_path_file2, args.prefix, output_path_file)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/audio_segmentation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
# Adapted from https://gist.github.com/keithito/771cfc1a1ab69d1957914e377e65b6bd from Keith Ito: kito@kito.us
#
import argparse
import os
import json
from pydub import AudioSegment


class Segment:
    """
    Linked list of segments
    """
    def __init__(self, begin, end, text):
        self.begin = begin
        self.end = end
        self.text = text
        self.next = None
        self.filename = None
        self.gap = 0  # gap between segments (current and next)

    def set_next(self, next):
        self.next = next
        self.gap = next.begin - self.end

    def set_filename_and_id(self, filename, id):
        self.filename = filename
        self.id = id

    def merge_from(self, next):
        # merge two segments (current and next)
        self.next = next.next
        self.gap = next.gap
        self.end = next.end

    def duration(self, sample_rate):
        # begin/end are stored in milliseconds in this project
        return (self.end - self.begin - 1) / sample_rate


def create_segments_list_from_aeneas_json(json_path):
    """
    Creates a list of segments from the json file resulting from aeneas processing.
    """

    head = None
    with open(json_path) as jfile:
        data = json.load(jfile)
        for i, fragment in enumerate(data['fragments']):
            text = fragment['lines']
            begin = float(fragment['begin'])*1000
            end = float(fragment['end'])*1000

            # Build a segment list
            segment = Segment(begin, end, text)
            if head is None:
                head = segment
            else:
                prev.set_next(segment)
            prev = segment

    return head


def create_audio_files_from_segments_list(audio_file, filenames_base, head_list, output_dir):
    """
    Segments an audio file from a segment list, saving the files in a folder.
    Parameters:
        audio_file (str): filepath of the source audio file.
        filenames_base (str): Filename prefix of the segmented audio files.
        head_list (Segment): Reference to the head of the linked list of segments.
        output_dir (str): Folder to save the segmented audio files.

    Returns:
        Boolean: returns True or False
    """

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    sound = AudioSegment.from_file(audio_file)
    curr = head_list
    i = 1
    while curr is not None:
        begin = curr.begin
        end = curr.end
        text = curr.text
        audio_segment = sound[begin:end]
        filename = '{}-{:04d}.wav'.format(filenames_base, i)
        curr.set_filename_and_id(filename, i)
        filepath = os.path.join(output_dir, filename)
        try:
            audio_segment.export(filepath, 'wav')
        except IOError:
            print("Error: problem writing audio file {}.".format(filepath))
            return False
        else:
            curr = curr.next
            i += 1
    return True


def create_metadata_from_segments_list(head_list, output_file):
    """
    Creates a csv file following the template: "filename | text"
    Parameters:
        head_list (Segment): Reference to the head of the linked list of segments.
        output_file (str): csv output filename.

    Returns:
        Boolean: returns True or False
    """
    separator = '|'
    curr = head_list
    try:
        f = open(output_file, "w")
        while curr is not None:
            text = curr.text
            filename = curr.filename  # segment filenames already carry the .wav extension
            f.write(filename + separator + text[0] + '\n')
            curr = curr.next
        f.close()
    except IOError:
        print("Error: problem creating file {}.".format(output_file))
        return False
    return True


def segment_audio(audio_path, json_path, output_path, metadata_output_file, filename_base):
    """
    Performs the segmentation of the audio file and the creation of the csv file.
    Parameters:
        audio_path (str): filepath of the source audio file.
        json_path (str): json file resulting from aeneas processing.
        output_path (str): Folder to save the segmented audio files.
        metadata_output_file (str): csv output filename.
        filename_base (str): Filename prefix of the segmented audio files.

    Returns:
        Boolean: returns True or False
    """
    segments_list = create_segments_list_from_aeneas_json(json_path)
    if not create_audio_files_from_segments_list(audio_path, filename_base, segments_list, output_path):
        return False
    if not create_metadata_from_segments_list(segments_list, metadata_output_file):
        return False
    return True


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--audio_file', default='audio.mp3', help='Filename of the input audio file')
    parser.add_argument('--filename_base', default='audio', help='Filename base of the split audio files, e.g. audio-0001.wav')
    parser.add_argument('--json_file', default='output.json', help='Filename of the input json file')
    parser.add_argument('--output_dir', default='output', help='Output dir')
    parser.add_argument('--metadata_file', default='metadata.csv', help='Filename of the metadata output file')
    args = parser.parse_args()

    audio_path = os.path.join(args.base_dir, args.audio_file)
    json_path = os.path.join(args.base_dir, args.json_file)
    output_dir = os.path.join(args.base_dir, args.output_dir)
    metadata_output_file = os.path.join(args.base_dir, args.output_dir, args.metadata_file)

    segment_audio(audio_path, json_path, output_dir, metadata_output_file, args.filename_base)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/utils/number_to_text.py:
--------------------------------------------------------------------------------
import unicodedata
from math import ceil, floor
import re

ordinals_numbers = {
    '1º':"primeiro", '1ª':"primeira",
    '2º':"segundo", '2ª':"segunda",
    '3º':"terceiro", '3ª':"terceira",
    '4º':"quarto", '4ª':"quarta",
    '5º':"quinto", '5ª':"quinta",
    '6º':"sexto", '6ª':"sexta",
    '7º':"sétimo", '7ª':"sétima",
    '8º':"oitavo", '8ª':"oitava",
    '9º':"nono", '9ª':"nona",
    '10º':"décimo", '10ª':"décima",
    '11º':"décimo primeiro", '11ª':"décima primeira",
    '12º':"décimo segundo", '12ª':"décima segunda",
    '13º':"décimo terceiro", '13ª':"décima terceira",
    '14º':"décimo quarto", '14ª':"décima quarta",
    '15º':"décimo quinto", '15ª':"décima quinta",
    '16º':"décimo sexto", '16ª':"décima sexta",
    '17º':"décimo sétimo", '17ª':"décima sétima",
    '18º':"décimo oitavo", '18ª':"décima oitava",
    '19º':"décimo nono", '19ª':"décima nona",
    '20º':"vigésimo", '20ª':"vigésima",
    '21º':"vigésimo primeiro", '21ª':"vigésima primeira",
    '22º':"vigésimo segundo", '22ª':"vigésima segunda",
    '26º':"vigésimo sexto", '26ª':"vigésima sexta",
    '30º':"trigésimo", '30ª':"trigésima",
    '60º':"sexagésimo", '60ª':"sexagésima",
    '89º':"octogésimo nono",
    '90º':"nonagésimo", '90ª':"nonagésima",
    'nº':"número"
}


class Palavra:
    # Holds the singular and plural forms of a number word.

    def __init__(self, singular, plural):
        self.singular = singular
        self.plural = plural

class Extenso:
    # Writes integers out in full in Brazilian Portuguese.

    def __init__(self):

        self._numero_maximo = 999999999999999999999999999999999999999999999

        # Dictionaries storing the numbers written out in full
        self.unidades = {1: 'um', 2: 'dois', 3: 'três', 4: 'quatro', 5: 'cinco', 6: 'seis', 7: 'sete', 8: 'oito', 9: 'nove', 10: 'dez',
                         11: 'onze', 12: 'doze', 13: 'treze', 14: 'quatorze', 15: 'quinze', 16: 'dezesseis', 17: 'dezessete', 18: 'dezoito', 19: 'dezenove'}

        self.dezenas = {2: 'vinte', 3: 'trinta', 4: 'quarenta', 5: 'cinquenta', 6: 'sessenta', 7: 'setenta', 8: 'oitenta', 9: 'noventa'}

        self.centenas = {1: Palavra('cem', 'cento'), 2: 'duzentos', 3: 'trezentos', 4: 'quatrocentos', 5: 'quinhentos', 6: 'seiscentos', 7: 'setecentos', 8: 'oitocentos', 9: 'novecentos'}

        # Tuple storing the scale words (thousands, millions, ...)
        self.milhares = (Palavra('', ''), Palavra('mil', 'mil'), Palavra('milhão', 'milhões'),
                         Palavra('bilhão', 'bilhões'), Palavra('trilhão', 'trilhões'), Palavra('quatrilhão', 'quatrilhões'),
                         Palavra('quintilhão', 'quintilhões'), Palavra('sextilhão', 'sextilhões'),
                         Palavra('septilhão', 'septilhões'),
                         Palavra('octilhão', 'octilhões'), Palavra('nonilhão', 'nonilhões'), Palavra('decilhão', 'decilhões'),
                         Palavra('undecilhão', 'undecilhões'), Palavra('duodecilhão', 'duodecilhões'), Palavra('tredecilhão', 'tredecilhões'))


    def escrever(self, numero):
        if (numero > self._numero_maximo):
            raise Exception('Input number is larger than the maximum supported number')
        if (numero == 0):
            return 'zero'
        extenso = ''

        # Convert the received number to a string
        numero_string = str(numero)
        # Get the length of the input number
        tamanho = len(numero_string)

        # Round up to find how many groups of three digits there are
        ternarios = ceil(tamanho / 3)

        # Left-pad the number string with zeros up to a length divisible by 3
        numero_string = numero_string.zfill(ternarios * 3)

        # Iterate over the groups of three digits
        for n in range(1, ternarios + 1):
            # Get the part of the number corresponding to the current group
            parte_numero = int(numero_string[(n - 1) * 3 : n * 3])

            # A group of zeros needs no handling
            if parte_numero == 0:
                continue

            # Compute the hundreds digit
            centena = floor(parte_numero / 100)

            # Compute the tens digit
            dezena = floor((parte_numero - (centena*100)) / 10)

            # Compute the units digit
            unidade = parte_numero - (centena*100) - (dezena*10)

            # Handle the hundreds digit, if present
            if (centena > 0):
                if (dezena == 0 and unidade == 0 and extenso != ''):
                    extenso += ' e '
                elif extenso != '':
                    extenso += ', '
                if (centena == 1):  # "cem" uses the singular; with tens or units present, use the plural ("cento")
                    if(dezena > 0 or unidade > 0):
                        extenso += self.centenas[centena].plural
                    else:
                        extenso += self.centenas[centena].singular
                else:
                    extenso += self.centenas[centena]  # For hundreds greater than one, look up the corresponding string in the dictionary

            # Handle the tens digit, if present
            if (dezena > 0):
                if (extenso != ''):  # If the written-out number already has content, add "e"
                    extenso += ' e '

                if (dezena == 1):  # A tens digit of one (10-19) is looked up in the units dictionary
                    dezena = 10 + unidade
                    unidade = 0  # so that the units handling below is skipped
                    extenso += self.unidades[dezena]  # Look up 10-19 in the units dictionary
                else:
                    extenso += self.dezenas[dezena]  # A tens digit greater than one is looked up in the tens dictionary

            # Handle the units digit, if present
            if (unidade > 0):
                if (extenso != ''):  # If hundreds or tens are present, add "e"
                    extenso += ' e '
                extenso += self.unidades[unidade]  # Look up the written-out form in the units dictionary

            # Handle the scale words (thousands, millions, ...)
            if n < ternarios:  # If this is not the last group, append the corresponding scale word
                if (parte_numero > 1):
                    extenso += f' {self.milhares[ternarios - n].plural}'  # Greater than one: plural form
                else:
                    extenso += f' {self.milhares[ternarios - n].singular}'  # Exactly one: singular form
        return extenso.replace('um mil,', 'mil')

def number_to_text(text):
    """
    Given a text, it replaces the numbers (cardinals and ordinals) found with their written-out version.
    """
    ex = Extenso()

    words = re.split(r'([.,;!? ])', text)
    for word in words:
        if 'º' in word or 'ª' in word:
            if word in ordinals_numbers.keys():
                new_word = ordinals_numbers[word]
                text = text.replace(word, new_word)
            else:
                #raise ValueError('The ordinal number ' + word + ' is not in the ordinals_numbers list; fix this!')
                print('The ordinal number "' + word + '" is not in the ordinals_numbers list; fix this!')

        if word.isdigit():
            new_word = ex.escrever(int(word))
            text = text.replace(word, new_word)
    return text
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# KATube - KATube Audio dataset creator from youTube

KATube is a tool to automate the process of creating datasets for training Text-To-Speech (TTS) and Speech-To-Text (STT) models. It is based on the work of Pansori [https://arxiv.org/abs/1812.09798].


From a list of YouTube playlists or YouTube channels, KATube downloads all audios with their respective subtitles and performs audio-text alignment using the external tool [AENEAS](https://github.com/readbeyond/aeneas). From this alignment, KATube segments the audio according to the sentences created.

Finally, a validation step can be performed. For this, KATube must use an external STT (speech-to-text) tool (not available here). This validation calculates the similarity between the subtitle and the transcript, using the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance). This step helps to guarantee that the subtitle and the transcript agree. KATube can also be configured to perform a final selection step, in which audios that do not reach a minimum similarity between the sentence and the transcript are discarded.

Use at your own risk.

![katube-process](imgs/katube-process.png)

## Search and Ingest Videos

In the first stage, KATube performs two procedures: search and download. The search is done by a function that uses the YouTube API, so you will need an [api key](https://developers.google.com/places/web-service/get-api-key) to use this functionality. The search function receives the id of a YouTube channel and the API key, and returns a list of all the videos available on that channel. This functionality is provided by the script named "search.py" and can be used separately. Execute the script using as input arguments the api_key of the Google account, the YouTube channel id (or playlist id), the output directory, and the output file name. For example:

```
$ python search.py --api_key=GOOGLE_DEVELOPER_API_KEY --content_id=CHANNEL_ID --dest_dir=OUTPUT_FOLDER --output_search_file=YOUTUBE_VIDEOS.txt
```

Then, a function is used to download the audio and subtitles of the videos as .mp3 and .srt files, respectively. This process can be time consuming, as it is necessary to wait a few seconds (a value between 30 and 60 seconds, set at random) to avoid having the IP blocked by the YouTube servers. This functionality is provided by the script named "download.py" and can be used separately. Execute the script using as input arguments the URL of the YouTube video and the destination directory. For example:

```
$ python download.py --youtube_url=https://www.youtube.com/watch?v=999999999 --output_dir=OUTPUT_FOLDER
```

Two files will be created: one .srt and the other .mp3.
The names of the files will be the same as the video ID, which is the code after `v=` in the URL (in the example above, 999999999).

## Cleaning and Normalization of the text

The subtitles contain segmented text and timing information which corresponds to the audio contents of the associated video. The timing information is discarded and the subtitle texts are joined. The text is cleaned, normalized and divided into sentences, according to the punctuation. The division into sentences will try to respect the limits defined by the previously defined minimum and maximum number of words. If it is not possible to respect these limits by segmenting the text at the punctuation, the text will be segmented in an arbitrary manner, regardless of the punctuation.

This functionality is provided by the script named "text_normalization.py" and can be used separately. Run the script using as input arguments the subtitles filepath, the minimum and maximum number of words per sentence, and the output filepath. For example:

```
$ python text_normalization.py --input_file=SUBTITLES.txt --min_words=10 --max_words=30 --output_file=CLEAN_AND_NORMALIZED_SUBTITLES.txt
```

## Align (Synchronization) Text-Audio

For alignment, the AENEAS tool is used, which receives an audio file and the clean and normalized text, divided into sentences. A json file will be produced, containing the begin and end time of each sentence in the text. The audio file must be in wav or mp3 format. The text is divided into sentences, one on each line of a txt file. For more information about how AENEAS operates, check the [official documentation](https://pypi.org/project/aeneas/).

This functionality is provided by the script named "synchronization.py" and can be used separately. Execute the script using as input arguments the audio filepath (mp3 or wav), the text filepath, which contains the segmented sentences, and the output filepath, in which the .json file resulting from the alignment produced by the AENEAS tool will be saved. For example:

```
$ python synchronization.py --audio_file=AUDIO_FILE.mp3 --text_file=CLEAN_AND_NORMALIZED_SUBTITLES.txt --output_file=SYNCHRONIZED_AUDIO_TEXT.json
```

## Audio Segmentation

This step receives the json file from the previous step and performs the segmentation of the audio file. This script is based on the script provided by [Keith Ito](https://keithito.com), who kindly provided it via email. In this step, a logical list of segments is first created, storing the filename and the start and end times. Then this list is traversed, splitting the original audio and saving each segment to disk.

This functionality is provided by the script named "audio_segmentation.py" and can be used separately. Run the script using as input arguments the path of the audio file (mp3 or wav) to be segmented, the json file from the previous step, the output directory, where the segmented files will be saved, and the path of the metadata file, which is a csv file that will contain the name of each segmented audio file and the corresponding text.
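At its core, this step just slices the audio at the millisecond marks produced by aeneas. A minimal sketch of that idea (illustrative only, not the project's script; the fragment values are made up):

```
from pydub import AudioSegment

# Hypothetical fragments parsed from the aeneas json: (begin_ms, end_ms, text)
fragments = [(0, 4200, "primeira frase"), (4200, 9100, "segunda frase")]

sound = AudioSegment.from_file("AUDIO_FILE.mp3")
for i, (begin, end, text) in enumerate(fragments, start=1):
    # Slicing an AudioSegment by milliseconds yields the audio between the two marks.
    sound[begin:end].export("audio-{:04d}.wav".format(i), format="wav")
```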
For example:

```
$ python audio_segmentation.py --audio_file=AUDIO_FILE.mp3 --json_file=SYNCHRONIZED_AUDIO_TEXT.json --output_dir=OUTPUT_FOLDER --metadata_file=METADATA.CSV
```

The output file will follow the template:

```
filename1 | text one
filename2 | text two
filename3 | text three
filename4 | text four
```

## Transcribe

A script template is provided to access an external STT API, if you have one available. You need to configure the link used to access the API. Adapt this script as needed. A sample rate conversion function is also available, in case it is necessary to convert the files before using the STT API. This functionality is provided by the script named "transcribe.py" and can be used separately. Run the script using as input arguments the input directory of wav files, the transcription output file, and the new sample rate, to which the wav files will be converted before being sent to the STT API. For example:

```
$ python transcribe.py --input_dir=WAVS_FOLDER --new_sample_rate=16000 --transcription_file=TRANSCRIPTS.CSV
```

Check the "tools" folder for examples of using STT APIs, such as Google, Azure and AWS.

## Validation

Although the audio and text data are force-aligned with each other, several problems can happen that degrade the results.
The text may be unclean or incorrect, the pronunciation may be erroneous, or the audio may be corrupted (for example, by ambient noise or poor recording quality).

KATube can validate the text of the sentence. To do this, you must have an external STT available (not provided here), such as AWS, Google or Azure. Some sample scripts are available in the "tools" folder. The external STT will generate a transcript of the segmented audio. You can then compare the sentence with the transcript using the Levenshtein distance, and thus have some guarantee that the audio really matches the text of the sentence.

This functionality is provided by the script named "validation.py" and can be used separately. Run the script using as input arguments the paths of two csv metadata files, the output directory and the path of the output file, which will contain the texts and the Levenshtein distance between them. For example:

```
$ python validation.py --input_file1=METADATA.CSV --input_file2=TRANSCRIPTS.CSV --output_dir=OUTPUT_FOLDER --output_file=VALIDATION.csv
```

## Selection

After validating the data, it is possible to select only those audios that have a minimum similarity between the transcription and the sentence. KATube can discard audios whose similarity value is less than a value you define (90% is a good start).

This functionality is provided by the script named "selection.py" and can be used separately. Run the script using as input arguments the path of the csv validation file from the previous step, the minimum similarity value (normalized Levenshtein distance), and the result output file. A safety parameter (--force) must be passed in order to effectively delete the files with a similarity below the threshold.
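In essence, this step filters the validation csv on its similarity column, optionally deleting the rejected wav files. A minimal sketch of the idea (illustrative only, not the project's script), assuming the "filename|subtitle|transcript|similarity" layout with a header line described above:

```
import csv
from os import remove

with open("VALIDATION.csv") as f, open("METADATA.csv", "w") as out:
    reader = csv.reader(f, delimiter="|")
    next(reader)  # skip the header line
    for filename, subtitle, transcript, similarity in reader:
        if float(similarity) >= 0.9:
            # Keep the row: the similarity is above the threshold.
            out.write("|".join([filename, subtitle, transcript, similarity]) + "\n")
        else:
            # This is what --force does: the rejected audio file is deleted.
            remove(filename)
```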
For example:

```
$ python selection.py --csv_file=VALIDATION.csv --min_value=0.9 --save_file=METADATA.csv --force
```

# Installation

## How to create a docker image

```sh
$ git clone https://github.com/freds0/katube
$ cd katube
$ docker build -t katube ./
$ sudo docker run --rm --net='host' -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 -v ~/:/root/ -w /root -it katube
```

If you prefer, use a conda environment:

```sh
$ conda create -n katube python=3.6 pip
$ conda activate katube
```

## Aeneas Installation

Install the requirements:

```sh
$ apt-get install ffmpeg espeak libespeak-dev wget git
$ wget https://raw.githubusercontent.com/readbeyond/aeneas/master/install_dependencies.sh
$ bash install_dependencies.sh
```

Install Aeneas:

```sh
$ git clone https://github.com/ReadBeyond/aeneas.git
$ cd aeneas
$ sudo pip install -r requirements.txt
$ python setup.py build_ext --inplace
$ python aeneas_check_setup.py
$ cd ..
$ pip install -e aeneas
```

## KATube Installation

Install the KATube requirements:

```sh
$ pip install -r requirements.txt
$ pip install git+https://github.com/freds0/pytube3
or
$ pip install git+https://github.com/swiftyy-mage/pytube3
```

# Configuration

First, create your Google api_key at:

[https://developers.google.com/places/web-service/get-api-key]

In the "config.py" file, set the api_key variable with your key:

```sh
api_key = 'put_your_google_id_here'
```

Second, in the "config.py" file, choose the source to download the audio data:

- playlist
- channel

If you choose a playlist, set the variable orig_base as follows in the config.py file:

```sh
orig_base = 'playlist' # ['channel', 'playlist']
```

Third, create a list containing the playlist or channel ids from YouTube. For example, to download all audios from a given playlist, configure the file "input/playlists_id.txt" as follows:

```sh
PLZoTAELRMXVPGU70ZGsckrMdr0FteeRUi
```

Check the settings in the "config.py" file.

# Execution

After the configuration, execute the command:

```
python main.py
```

and KATube will start generating the dataset.

# TODO

Try to use [Montreal Forced Alignment](https://montreal-forced-aligner.readthedocs.io/en/latest/).

# References:

- Pansori [sourcecode](https://github.com/yc9701/pansori)
- Pansori [paper](https://arxiv.org/abs/1812.09798)
- [KABooks](https://github.com/freds0/kabooks), our similar tool, used to create datasets from audiobooks.

# Thanks

- [Keith Ito](https://keithito.com)
--------------------------------------------------------------------------------
/text_normalization.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import re
import argparse
import unicodedata
from utils.number_to_text import number_to_text

vocab="abcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû\-0123456789,.;:!?' "
chars_map = {'ï': 'i', 'ù': 'ú', 'ö': 'o', 'î': 'i', 'ñ': 'n', 'ë': 'e', 'ì': 'í', 'ò': 'ó', 'ũ': 'u', 'ẽ': 'e', 'ü': 'u', 'è': 'é', 'æ': 'a', 'å': 'a'}


def get_number_of_words(sentence):
    """
    Count the number of words in a sentence.
    Parameters:
        sentence (str): text sentence.

    Returns:
        int: returns the sentence length.
    """
    sentence_length = len(sentence.split(' '))
    return sentence_length


def get_text_from_subtitle(input_file):
    """
    Extracts the text from a subtitle file.
    Parameters:
        input_file (str): input subtitles file (.srt).

    Returns:
        text (str): returns the text of the subtitles file.
    """
    # Read all lines from file
    try:
        file = open(input_file, "r")
        lines = file.readlines()
        file.close()

    except IOError:
        print("Error: Reading subtitle file {}.".format(input_file))
        return False

    # Declare an empty list variable
    line_list = []
    for line in lines:
        text = ''
        # Look for patterns and parse
        if re.search('^[0-9]+$', line) is None and re.search('^[0-9]{2}:[0-9]{2}:[0-9]{2}', line) is None and re.search('^$', line) is None:
            text += ' ' + line.strip('\n')
            line_list.append(text)

    # Finish with list.join() to bring everything together
    text = '\n'.join(line_list)
    return text


def merge_sentences(sentences, min_words):
    """
    Merge sentences that have a number of words less than min_words.
    Parameters:
        sentences (list): list of sentences.
        min_words (int): minimum quantity of words.

    Returns:
        sentences (list): returns the sentences list with lengths greater than min_words.
    """
    found_short_sentence = True
    while(found_short_sentence):
        found_short_sentence = False
        for index, sentence in enumerate(sentences):
            # Verify the number of words in the sentence
            if (len(sentence.split()) < min_words):
                found_short_sentence = True
                # Merge sentences
                sentences[index:index+2] = [' '.join(sentences[index:index+2])]
    # Removing blank items from list
    nonempty_sentences = list(filter(None, sentences))
    return nonempty_sentences


def tokenize_sentences_on_blank_space(text):
    """
    Divide a text into words, that is, create tokens, breaking it at the blank spaces.
    Parameters:
        text (str): normalized text.

    Returns:
        words (list): returns the list of words.
    """
    # Tokenize on blank spaces
    words = text.split(' ')
    return words


def tokenize_sentences_on_punctuation(text):
    """
    Creates sentences from a text, splitting it at the punctuation.
    Parameters:
        text (str): normalized text.

    Returns:
        sentences (list): returns the list of sentences split on punctuation.
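
    Example (illustrative):
        'uma frase. outra frase!' -> ['uma frase.', ' outra frase!']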
    """
    # Tokenize by punctuation
    # sentences = re.split(r'([.,!?:;])', text)  # Result example: ['Esta é uma frase', '.', 'Esta é outra frase', ',']
    sentences = re.split(r'([.;!?])', text)
    for index, sentence in enumerate(sentences[:-1]):
        sentence = sentence.strip()
        sentences[index:index+2] = [''.join(sentences[index:index+2])]  # Result example: ['Esta é uma frase.', 'Esta é outra frase,']
    # Removing blank items from list
    nonempty_sentences = list(filter(None, sentences))
    sentences = nonempty_sentences
    return sentences


def tokenize_sentences_on_special_words(text):
    """
    Creates sentences from a text, splitting it at special words of the Portuguese language.
    Parameters:
        text (str): normalized text.

    Returns:
        sentences (list): returns the list of sentences split on special words.
    """
    special_words = [' mas ', ' porém ', ' todavia ', ' contudo ', ' entretanto ', ' no entanto ', ' pois ', ' logo ', ' porque ', ' bem como ', ' por isso ', ' isto é ', ' visto que ', ' quando ', ' logo que ', ' desde que']
    # Tokenize by special words, using a single alternation pattern so that
    # every special word takes effect (not just the last one in the list)
    pattern = '|'.join(special_words)
    sentences = re.split(r'({})'.format(pattern), text)

    for index, sentence in enumerate(sentences[:-1]):
        sentences[index:index+2] = [''.join(sentences[index:index+2])]
    # Removing blank items from list
    nonempty_sentences = list(filter(None, sentences))
    sentences = nonempty_sentences
    return sentences


def get_size_of_biggest_sentence(sentences):
    """
    Given a list of sentences, it returns the length of the largest sentence.
    Parameters:
        sentences (list): sentences list.

    Returns:
        int: returns the length of the largest sentence.
    """
    max_length_sentence = 0
    for sentence in sentences:
        length_sentence = get_number_of_words(sentence)
        if length_sentence > max_length_sentence:
            max_length_sentence = length_sentence
    return max_length_sentence


def create_sentences_from_text(text, min_words, max_words):
    """
    Creates sentences from a text, taking into account the minimum and maximum number of words.
    Initially, it divides the text according to the punctuation, then it divides the larger sentences according to special words,
    and, finally, it divides the larger sentences into tokens.
    After the division, the tokens are concatenated until they are within the min and max limits.
    Parameters:
        text (str): normalized text.
        min_words (int): minimum number of words of each sentence.
        max_words (int): maximum number of words of each sentence.

    Returns:
        sentences (list): returns the sentences list.
    """
    # First: tokenize on punctuation
    sentences = tokenize_sentences_on_punctuation(text)

    # Verify the length of the sentences
    length_biggest_sentence = get_size_of_biggest_sentence(sentences)

    # Second: tokenize on special words
    if length_biggest_sentence > max_words:  # very long sentence
        sentences = tokenize_sentences_on_special_words(text)

    # Verify the length of the sentences
    length_biggest_sentence = get_size_of_biggest_sentence(sentences)

    # Third: tokenize on blank space
    if length_biggest_sentence > max_words:  # very long sentence
        sentences = tokenize_sentences_on_blank_space(text)

    # Concatenate small sentences
    sentences = iter(sentences)
    lines, current = [], next(sentences)
    for sentence in sentences:
        if get_number_of_words(current) > min_words:
            lines.append(current)
            current = sentence  # next
        # Concatenate sentences
        else:
            current += " " + sentence  # concatenate two sentences

    lines.append(current)
    nonempty_lines = list(filter(None, lines))
    return nonempty_lines


def remove_html_tags(text):
    """
    Remove html tags from a string using regular expressions.
    """
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)


def text_cleaning(text):
    """
    Performs a series of operations to clean the text in order to normalize it.
    """
    # Removing line breaks.
    text = text.replace('\n', ' ')

    # Removing html tags.
    text = remove_html_tags(text)

    # Normalizing accented characters.
    accents = ('COMBINING ACUTE ACCENT', 'COMBINING GRAVE ACCENT') #portuguese
    chars = [c for c in unicodedata.normalize('NFD', text) if c not in accents]
    text = unicodedata.normalize('NFC', ''.join(chars))

    # Converting to lower case
    text = text.lower()

    # Remove everything not in vocab
    #text = re.sub("[^{}]".format(vocab), " ", text)

    # Collapse sequences of periods ("...") into a single "."
    text = re.sub("[...]+", ".", text)

    # Remove parentheses and brackets
    text = re.sub("[(\[\])]+", "", text)

    # Remove space before punctuation
    text = re.sub(r'\s([.,;:?!"](?:\s|$))', r'\1', text)

    # Removing double blank spaces
    text = re.sub("[ ]+", " ", text)

    # Replacing unusual characters according to chars_map
    # (the replacements must be applied to the text itself).
    for c, replacement in chars_map.items():
        text = text.replace(c, replacement)

    return text


def create_normalized_text_from_subtitles_file(subtitle_file, output_file, min_words, max_words):
    """
    Given a subtitle file (.srt) it cleans and normalizes the text, dividing it into sentences,
    according to the number of words (min_words and max_words),
    saving the result in output_file.
    Parameters:
        subtitle_file (str): subtitles .srt file.
        output_file (str): file path to save the normalized text.
        min_words (int): minimum number of words of each sentence.
        max_words (int): maximum number of words of each sentence.

    Returns:
        Boolean: returns True or False.
    """

    # If the file comes with the time for each subtitle, uncomment this line so that only the subtitles text will be extracted.
272 | #text = get_text_from_subtitle(subtitle_file) 273 | 274 | # Read all lines from file 275 | try: 276 | file = open(subtitle_file, "r") 277 | text = '\n'.join(file.readlines()) 278 | file.close() 279 | except IOError: 280 | print("Error: Reading subtitle file {}.".format(subtitle_file)) 281 | return False 282 | 283 | # If it was unable to extract the text. 284 | if not text: 285 | return False 286 | 287 | # Clear and normalize the text. 288 | text = text_cleaning(text) 289 | 290 | # Creates a list of sentences. 291 | sentences = create_sentences_from_text(text, int(min_words), int(max_words)) 292 | 293 | # Save the sentences to the output file. 294 | try: 295 | f = open(output_file, "w") 296 | for sentence in sentences: 297 | # Converting numbers by its full version. 298 | sentence = number_to_text(sentence) 299 | f.write(sentence.strip() + '\n') 300 | f.close() 301 | 302 | except IOError: 303 | print("Error: Writing audio file {}.".format(output_file)) 304 | return False 305 | 306 | return True 307 | 308 | 309 | def main(): 310 | parser = argparse.ArgumentParser() 311 | parser.add_argument('--base_dir', default='./') 312 | parser.add_argument('--input_file', default='subtitles.txt', help='Subtitles filename (only text)') 313 | parser.add_argument('--output_file', default='output.txt', help='Filename to save the normalize text') 314 | parser.add_argument('--min_words', default=10, help='Minimal number of words on sentence') 315 | parser.add_argument('--max_words', default=30, help='Maximal number of words on sentence') 316 | args = parser.parse_args() 317 | 318 | min_words = int(args.min_words) 319 | max_words = int(args.max_words) 320 | 321 | create_normalized_text_from_subtitles_file(args.input_file, args.output_file, min_words, max_words) 322 | 323 | if __name__ == "__main__": 324 | main() 325 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com 5 | # 6 | # 7 | from config import Config 8 | from urllib.parse import parse_qs, urlparse 9 | from search import search_videos 10 | from download import download_audio_and_subtitles_from_youtube 11 | from text_normalization import create_normalized_text_from_subtitles_file 12 | from synchronization import create_aeneas_json_file 13 | from audio_segmentation import segment_audio 14 | from transcribe import convert_audios_samplerate, transcribe_audios 15 | from validation import create_validation_file 16 | from selection import select 17 | from utils.downsampling import downsampling 18 | import shutil 19 | import os 20 | import logging 21 | 22 | ###################################################### 23 | # Logs Config 24 | ###################################################### 25 | if not(os.path.exists(Config.logs_dir)): 26 | os.makedirs(Config.logs_dir) 27 | 28 | log_path = os.path.join(Config.logs_dir, Config.log_file) 29 | if not os.path.exists(Config.logs_dir): 30 | os.makedirs(Config.logs_dir) 31 | open(log_path, 'w').close() 32 | 33 | level = logging.DEBUG # Options: logging.DEBUG | logging.INFO | logging.WARNING | logging.ERROR | logging.CRITICAL 34 | logging.basicConfig(filename=log_path, filemode='w', format='%(message)s', level=level) 35 | 36 | 37 | # Argument Parser from File 38 | ''' 39 | class LoadFromFile (argparse.Action): 40 | def __call__ (self, parser, namespace, values, 
option_string = None): 41 | with values as f: 42 | print(f.read().split()) 43 | parser.parse_args(f.read().split(), namespace) 44 | ''' 45 | 46 | 47 | def main(): 48 | if Config.orig_base == 'channel': 49 | g = open(Config.channels_file, "r", encoding='utf-8') 50 | elif Config.orig_base == 'playlist': 51 | g = open(Config.playlists_file, "r", encoding='utf-8') 52 | else: 53 | g = None 54 | 55 | # Errors youtube videos file 56 | log_error_file = open(os.path.join(Config.logs_dir, Config.youtube_videos_error), "w") 57 | 58 | ###################################################### 59 | # Youtube ignored videos 60 | ###################################################### 61 | if Config.ignored_youtube_videos: 62 | try: 63 | f = open(Config.ignored_youtube_videos, encoding='utf-8') 64 | ignored_youtube_videos = f.readlines() 65 | f.close() 66 | except IOError: 67 | print("Error: File {} does not appear to exist.".format(Config.ignored_youtube_videos)) 68 | return exit(False) 69 | 70 | ###################################################### 71 | # Youtube already downloaded videos 72 | ###################################################### 73 | if Config.downloaded_youtube_videos: 74 | try: 75 | if os.path.exists(Config.downloaded_youtube_videos): 76 | f = open(Config.downloaded_youtube_videos, "r", encoding='utf-8') 77 | downloaded_youtube_videos = f.readlines() 78 | f.close() 79 | else: 80 | f = open(Config.downloaded_youtube_videos, "w", encoding='utf-8') 81 | downloaded_youtube_videos = [] 82 | f.close() 83 | except IOError: 84 | print("Error: File {} does not appear to exist.".format(Config.downloaded_youtube_videos)) 85 | return exit(False) 86 | 87 | ###################################################### 88 | # Iterates over the youtube channels list 89 | ###################################################### 90 | for content_id in g: 91 | content_id = content_id.rstrip() 92 | # ignore channel description 93 | if content_id.startswith('#'): 94 | print('Ignoring {}: {}'.format(Config.orig_base, content_id)) 95 | continue 96 | # Defining output paths 97 | base_path = os.path.join(Config.base_dir, Config.dest_dir) 98 | output_path = os.path.join(base_path, Config.orig_base, content_id) 99 | 100 | ###################################################### 101 | # Searching all videos from Youtube channel 102 | ###################################################### 103 | print('Searching videos from {} - {}...'.format(Config.orig_base, content_id)) 104 | # content_file contains the list of all videos on the youtube channel 105 | content_file = search_videos(Config.api_key, content_id, base_path, Config.output_search_file) 106 | if not content_file: 107 | logging.error('Error downloading channel video list: ' + content_id) 108 | continue 109 | 110 | # Open youtube videos list of the channel 111 | f = open(content_file, "r", encoding='utf-8') 112 | 113 | ###################################################### 114 | # Iterate over youtube videos of the channel 115 | ###################################################### 116 | i = 0 117 | for youtube_link in f: 118 | youtube_link = youtube_link.strip() 119 | ###################################################### 120 | # Ignoring videos commented or found on list "Config.ignored_youtube_videos" 121 | ###################################################### 122 | if youtube_link.startswith('#') or (Config.ignored_youtube_videos and youtube_link + '\n' in ignored_youtube_videos): 123 | print('Ignoring youtube video: {} '.format(youtube_link)) 124 | continue 125 
126 | videos = parse_qs(urlparse(youtube_link).query, keep_blank_values=True).get('v') 127 | video_id = None if videos is None else videos[0] 128 | if video_id is None: logging.error('No video id found in link: ' + youtube_link); i += 1; continue 129 | ###################################################### 130 | # Download mp3 from youtube_link 131 | ###################################################### 132 | print('Downloading {} - {}...'.format(i, youtube_link)) 133 | # Ignore videos with no Portuguese caption or no caption at all 134 | if os.path.exists(os.path.join(output_path, video_id)) or (not download_audio_and_subtitles_from_youtube(youtube_link, output_path)): 135 | logging.error('YouTube video already downloaded or unavailable: ' + youtube_link) 136 | log_error_file.write(youtube_link + ': ingest_dataset' + '\n') 137 | i += 1 138 | continue 139 | 140 | ###################################################### 141 | # Normalizing text in preparation for text-audio synchronization 142 | ###################################################### 143 | print('Normalizing text {} - {}...'.format(i, youtube_link)) 144 | subtitle_file = os.path.join(output_path, video_id, video_id + ".srt") 145 | text_file = os.path.join(output_path, video_id, video_id + ".txt") 146 | if not create_normalized_text_from_subtitles_file(subtitle_file, text_file, Config.min_words, Config.max_words): 147 | logging.error('Error creating normalized text from subtitles file: ' + youtube_link) 148 | log_error_file.write(youtube_link + ': create_normalized_text_from_subtitles_file' + '\n') 149 | i += 1 150 | continue 151 | if Config.delete_temp_files: 152 | os.remove(subtitle_file) 153 | 154 | ###################################################### 155 | # Synchronizing text-audio using aeneas 156 | ###################################################### 157 | print('Synchronizing Text-Audio {} - {}...'.format(i, youtube_link)) 158 | json_filename = video_id + ".json" 159 | audio_filename = video_id + ".mp3" 160 | json_file = os.path.join(output_path, video_id, json_filename) 161 | audio_file = os.path.join(output_path, video_id, audio_filename) 162 | if not create_aeneas_json_file(audio_file, text_file, json_file): 163 | logging.error('Error creating aeneas json file: ' + youtube_link) 164 | log_error_file.write(youtube_link + ': create_aeneas_json_file' + '\n') 165 | i += 1 166 | continue 167 | if Config.delete_temp_files: 168 | os.remove(text_file) 169 | 170 | ###################################################### 171 | # Segmenting audio using aeneas output 172 | ###################################################### 173 | print('Segmenting audio {} - {}...'.format(i, youtube_link)) 174 | wavs_dir = os.path.join(output_path, video_id, Config.wavs_dir) 175 | metadata_subtitles_file = os.path.join(output_path, video_id, Config.metadata_subtitles_file) 176 | filename_base = video_id 177 | if not segment_audio(audio_file, json_file, wavs_dir, metadata_subtitles_file, filename_base): 178 | logging.error('Error segmenting audio: ' + youtube_link) 179 | log_error_file.write(youtube_link + ': segment_audio' + '\n') 180 | i += 1 181 | continue 182 | # Removing original audio file 183 | if Config.delete_temp_files: 184 | os.remove(audio_file) 185 | os.remove(json_file) 186 | 
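# Illustrative sketch (an assumption about convert_audios_samplerate, whose implementation is not shown here): it resamples every wav in wavs_dir to Config.tmp_sampling_rate, the rate the external ASR service expects, roughly like: y, _ = librosa.load(in_path, sr=Config.tmp_sampling_rate); soundfile.write(out_path, y, Config.tmp_sampling_rate) -- where in_path/out_path are hypothetical names and librosa/SoundFile are project dependencies; the actual code may differ.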
187 | ###################################################### 188 | # Converting audios: adjusting audios to the transcription tool's expected format 189 | ###################################################### 190 | print('Converting {} - {}...'.format(i, youtube_link)) 191 | tmp_wavs_dir = os.path.join(output_path, video_id, Config.tmp_wavs_dir) 192 | if not convert_audios_samplerate(wavs_dir, tmp_wavs_dir, Config.tmp_sampling_rate): 193 | logging.error('Error converting audio: ' + youtube_link) 194 | log_error_file.write(youtube_link + ': convert_audios_samplerate' + '\n') 195 | i += 1 196 | continue 197 | 198 | ###################################################### 199 | # Transcribing: using an external ASR API 200 | ###################################################### 201 | print('Transcribing {} - {}...'.format(i, youtube_link)) 202 | transcription_file = os.path.join(output_path, video_id, Config.transcription_file) 203 | if not transcribe_audios(tmp_wavs_dir, transcription_file): 204 | logging.error('Error transcribing: ' + youtube_link) 205 | log_error_file.write(youtube_link + ': transcribe_audios' + '\n') 206 | # Removing temp dir 207 | shutil.rmtree(tmp_wavs_dir, ignore_errors=True) 208 | i += 1 209 | continue 210 | # Removing temp dir 211 | shutil.rmtree(tmp_wavs_dir, ignore_errors=True) 212 | 213 | ###################################################### 214 | # Validating: using Levenshtein distance 215 | ###################################################### 216 | print('Validating {} - {}...'.format(i, youtube_link)) 217 | basename = wavs_dir 218 | validation_file = os.path.join(output_path, video_id, Config.validation_file) 219 | if not create_validation_file(metadata_subtitles_file, transcription_file, basename, validation_file): 220 | logging.error('Error calculating Levenshtein distance: ' + youtube_link) 221 | log_error_file.write(youtube_link + ': create_validation_file' + '\n') 222 | i += 1 223 | continue 224 | if Config.delete_temp_files: 225 | os.remove(metadata_subtitles_file) 226 | os.remove(transcription_file) 227 | 228 | ###################################################### 229 | # Selection: keeping only files with similarity (Levenshtein) >= Config.minimal_levenshtein_distance 230 | ###################################################### 231 | print('Selection {} - {}...'.format(i, youtube_link)) 232 | basename = wavs_dir 233 | output_filepath = os.path.join(output_path, video_id, Config.result_file) 234 | if not select(validation_file, output_filepath, Config.minimal_levenshtein_distance, Config.delete_temp_files): 235 | logging.error('Error selecting files: ' + youtube_link) 236 | log_error_file.write(youtube_link + ': select' + '\n') 237 | i += 1 238 | continue 239 | if Config.delete_temp_files: 240 | os.remove(validation_file) 241 | 242 | ###################################################### 243 | # Downsampling: downsampling wav files 244 | ###################################################### 245 | print('Downsampling {} - {}...'.format(i, youtube_link)) 246 | if not downsampling(os.path.join(output_path, video_id), Config.wavs_dir, Config.tmp_wavs_dir, Config.sampling_rate, True): 247 | logging.error('Error downsampling: ' + youtube_link) 248 | log_error_file.write(youtube_link + ': downsampling' + '\n') 249 | i += 1 250 | continue 251 | shutil.rmtree(os.path.join(output_path, video_id, Config.wavs_dir)) 252 | if os.path.exists(os.path.join(output_path, video_id, Config.tmp_wavs_dir)): 253 | os.rename(os.path.join(output_path, video_id, Config.tmp_wavs_dir), os.path.join(output_path, video_id, Config.wavs_dir)) 254 | 
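# Illustrative sketch (an assumption about the metric; validation.py and selection.py hold the actual implementation): the validation and selection steps above compare each segment's subtitle text with its ASR transcription via a normalized Levenshtein similarity in [0, 1], e.g. score = textdistance.levenshtein.normalized_similarity(subtitle_text, transcribed_text) using the textdistance project dependency, and segments whose score falls below Config.minimal_levenshtein_distance are dropped.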
255 | ###################################################### 256 | # Excluding folders with no wav files 257 | ###################################################### 258 | if not os.path.isdir(wavs_dir) or not os.listdir(wavs_dir): 259 | shutil.rmtree(os.path.join(output_path, video_id)) 260 | 261 | print('Finished {} - {}...'.format(i, youtube_link)) 262 | 263 | # Add youtube_link to the already-downloaded videos file 264 | if Config.downloaded_youtube_videos: 265 | with open(Config.downloaded_youtube_videos, 'a', encoding='utf-8') as out: 266 | out.write(youtube_link + "\n") 267 | 268 | i += 1 # Next 269 | 270 | f.close() # youtube videos list 271 | 272 | log_error_file.close() 273 | 274 | g.close() # channels or playlists list 275 | 276 | if __name__ == "__main__": 277 | main() 278 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. 
"You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. 
Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. 
Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. 
* 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. 
Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /tools/AWS/aws_transcribe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/", 9 | "height": 289 10 | }, 11 | "id": "KH2ZxvYyaQ-I", 12 | "outputId": "e98e5c90-4278-483e-c87f-bfba03c60954" 13 | }, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Collecting boto3\n", 20 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2f/08/f1ff665147a5d75b871bbe5ba76916f6490419c52a33e588385c4b69281b/boto3-1.15.18-py2.py3-none-any.whl (129kB)\n", 21 | "\u001b[K |████████████████████████████████| 133kB 2.7MB/s \n", 22 | "\u001b[?25hCollecting botocore<1.19.0,>=1.18.18\n", 23 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2d/72/984ac8f33b5c8df5ff63f323a8724f65b4d0f8956968b942b77d35d3a1ef/botocore-1.18.18-py2.py3-none-any.whl (6.7MB)\n", 24 | "\u001b[K |████████████████████████████████| 6.7MB 6.8MB/s \n", 25 | "\u001b[?25hCollecting jmespath<1.0.0,>=0.7.1\n", 26 | " Downloading https://files.pythonhosted.org/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl\n", 27 | "Collecting s3transfer<0.4.0,>=0.3.0\n", 28 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/69/79/e6afb3d8b0b4e96cefbdc690f741d7dd24547ff1f94240c997a26fa908d3/s3transfer-0.3.3-py2.py3-none-any.whl (69kB)\n", 29 | "\u001b[K |████████████████████████████████| 71kB 6.6MB/s \n", 30 | "\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.19.0,>=1.18.18->boto3) (2.8.1)\n", 31 | "Requirement already satisfied: urllib3<1.26,>=1.20; python_version != \"3.4\" in /usr/local/lib/python3.6/dist-packages (from botocore<1.19.0,>=1.18.18->boto3) (1.24.3)\n", 32 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.19.0,>=1.18.18->boto3) (1.15.0)\n", 33 | "Installing collected packages: jmespath, botocore, s3transfer, boto3\n", 34 | "Successfully installed 
boto3-1.15.18 botocore-1.18.18 jmespath-0.10.0 s3transfer-0.3.3\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "!pip install boto3" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "id": "DuUJr2DyFYWT" 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "class config:\n", 51 | " AWS_ACCESS_KEY_ID = ''\n", 52 | " AWS_SECRET_ACCESS_KEY = ''\n", 53 | " region_name = 'sa-east-1'\n", 54 | " bucket_name = 'amazon-transcribe'\n", 55 | " audio_format = 'wav'\n", 56 | " output_path = '/content'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 22, 62 | "metadata": { 63 | "id": "HkalGSJgWun0" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "from boto3 import client\n", 68 | "from time import sleep\n", 69 | "from urllib.request import urlopen\n", 70 | "from json import loads\n", 71 | "import pandas as pd\n", 72 | "from os.path import join\n", 73 | "from tqdm import tqdm\n", 74 | "\n", 75 | "def get_transcription_from_job(transcribe, job_name):\n", 76 | " status = transcribe.get_transcription_job(TranscriptionJobName=job_name)\n", 77 | " response = urlopen(status['TranscriptionJob']['Transcript']['TranscriptFileUri'])\n", 78 | " data = loads(response.read())\n", 79 | " text = data['results']['transcripts'][0]['transcript']\n", 80 | "\n", 81 | " return text\n", 82 | "\n", 83 | "def get_bucket_names():\n", 84 | " \"\"\"\n", 85 | " Searches for the buckets whose names start with the name\n", 86 | " given in config.bucket_name and returns them.\n", 87 | " \"\"\"\n", 88 | "\n", 89 | " s3 = client('s3', \n", 90 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 91 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 92 | " region_name=config.region_name)\n", 93 | "\n", 94 | " response = s3.list_buckets()\n", 95 | "\n", 96 | " bucket_names = []\n", 97 | "\n", 98 | " for bucket_info in response['Buckets']:\n", 99 | " bucket = bucket_info['Name']\n", 100 | " if bucket.startswith(config.bucket_name):\n", 101 | " bucket_names.append(bucket)\n", 102 | "\n", 103 | " return bucket_names\n", 104 | "\n", 105 | "def get_audio_files_url(bucket_name=None):\n", 106 | " \"\"\"\n", 107 | " Fetches the URLs of the audio files in a given bucket.\n", 108 | " If a bucket name is not passed as a parameter, \n", 109 | " the most recently created bucket is used by default.\n", 110 | " \"\"\"\n", 111 | "\n", 112 | " URLS = []\n", 113 | "\n", 114 | " s3 = client('s3', \n", 115 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 116 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 117 | " region_name=config.region_name)\n", 118 | "\n", 119 | " if bucket_name is None:\n", 120 | " response = s3.list_buckets()\n", 121 | "\n", 122 | " bucket_name = response['Buckets'][-1]['Name']\n", 123 | " \n", 124 | " # print(f' {bucket_name}')\n", 125 | " # print(len(response))\n", 126 | " # for idx, _ in enumerate(response):\n", 127 | " # print(response['Buckets'][idx]['Name'])\n", 128 | "\n", 129 | "\n", 130 | " url_prefix = 'https://' + bucket_name + '.s3' + '-' + config.region_name + '.amazonaws.com'\n", 131 | "\n", 132 | " for key in s3.list_objects(Bucket=bucket_name)['Contents']:\n", 133 | " if key['Key'].endswith('.' + config.audio_format):\n",
134 | " URLS.append(url_prefix + '/' + key['Key'])\n", 135 | "\n", 136 | " # print(URLS)\n", 137 | "\n", 138 | " return URLS\n", 139 | "\n", 140 | "\n", 141 | "def transcribe_audio_files(URLS):\n", 142 | " \"\"\"\n", 143 | " Transcribes the audio files.\n", 144 | " Takes as a parameter the URLs of the audio files in a bucket.\n", 145 | " \"\"\"\n", 146 | "\n", 147 | " file_names = []\n", 148 | "\n", 149 | " transcribed_texts = []\n", 150 | "\n", 151 | " transcribe = client('transcribe', \n", 152 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 153 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 154 | " region_name=config.region_name)\n", 155 | "\n", 156 | " for counter, url in enumerate(tqdm(URLS)):\n", 157 | " file_name = str(url).split('/')[-1]\n", 158 | " file_names.append(file_name)\n", 159 | "\n", 160 | " # print(f\"Transcribing {file_name}... \")\n", 161 | " # A different job name for each iteration\n", 162 | " job_name = file_name.split('.')[0]\n", 163 | " \n", 164 | " try:\n", 165 | " transcribe.start_transcription_job(TranscriptionJobName=job_name, \n", 166 | " Media={'MediaFileUri': url}, \n", 167 | " MediaFormat=config.audio_format, \n", 168 | " LanguageCode='pt-BR')\n", 169 | " except transcribe.exceptions.ConflictException:\n", 170 | " print(f\"\\tFile '{file_name}' has already been transcribed, moving on to the next file...\")\n", 171 | " text = get_transcription_from_job(transcribe, job_name)\n", 172 | " transcribed_texts.append(text)\n", 173 | " continue\n", 174 | "\n", 175 | " while True:\n", 176 | " status = transcribe.get_transcription_job(TranscriptionJobName=job_name)\n", 177 | " if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:\n", 178 | " break\n", 179 | " sleep(2)\n", 180 | " \n", 181 | " # print(f\"{status['TranscriptionJob']['TranscriptionJobStatus']}\\n\")\n", 182 | "\n", 183 | " if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':\n", 184 | " response = urlopen(status['TranscriptionJob']['Transcript']['TranscriptFileUri'])\n", 185 | " data = loads(response.read())\n", 186 | " text = data['results']['transcripts'][0]['transcript']\n", 187 | " transcribed_texts.append(text)\n", 188 | "\n", 189 | " return file_names, transcribed_texts\n", 190 | "\n", 191 | "def get_completed_job_names():\n", 192 | "\n", 193 | " transcribe = client('transcribe', \n", 194 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 195 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 196 | " region_name=config.region_name)\n", 197 | " \n", 198 | " job_names = []\n", 199 | " \n", 200 | " response = transcribe.list_transcription_jobs(Status='COMPLETED', MaxResults=100)\n", 201 | " response_more = response\n", 202 | "\n", 203 | " # Do while\n", 204 | " while True:\n", 205 | " for job in response_more['TranscriptionJobSummaries']:\n", 206 | " job_names.append(job['TranscriptionJobName'])\n", 207 | " \n", 208 | " if 'NextToken' not in response_more.keys():\n", 209 | " break\n", 210 | "\n", 211 | " response_more = transcribe.list_transcription_jobs(Status='COMPLETED', NextToken=response_more['NextToken'], MaxResults=100)\n", 212 | "\n", 213 | " return job_names\n", 214 | "\n", 215 | "\n", 216 | "def delete_completed_jobs(completed_job_names):\n", 217 | " \"\"\"\n", 218 | " Deletes the completed 'Transcription jobs'.\n", 219 | " This function is needed when the same \n", 220 | " 'job_name' (in the config class) is used across more than one run of the script.\n", 221 | "\n", 222 | " 'Transcription jobs' must be unique.\n",
223 | " \"\"\"\n", 224 | "\n", 225 | " transcribe = client('transcribe', \n", 226 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 227 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 228 | " region_name=config.region_name)\n", 229 | " \n", 230 | " for job_name in tqdm(completed_job_names):\n", 231 | " transcribe.delete_transcription_job(TranscriptionJobName=job_name)\n", 232 | "\n", 233 | "def make_metadata(file_names, transcribed_texts):\n", 234 | " df = pd.DataFrame()\n", 235 | "\n", 236 | " for file_name, text in zip(file_names, transcribed_texts):\n", 237 | " df = df.append({'A': file_name, 'B': text}, ignore_index=True)\n", 238 | "\n", 239 | " df.to_csv(join(config.output_path, 'transcribed_text.csv'), sep='|', index=False, header=False, quotechar=\"'\")\n", 240 | "\n", 241 | "def run_transcribe():\n", 242 | "\n", 243 | " # completed_jobs = get_completed_job_names()\n", 244 | " # print(completed_jobs)\n", 245 | "\n", 246 | " bucket_names = get_bucket_names()\n", 247 | "\n", 248 | " for idx, bucket in enumerate(bucket_names):\n", 249 | " print(\"\\nAccessing bucket {0} -> {1} of {2}\".format(bucket, idx+1, len(bucket_names)))\n", 250 | "\n", 251 | " URLS = get_audio_files_url(bucket)\n", 252 | " file_names, transcribed_texts = transcribe_audio_files(URLS)\n", 253 | " make_metadata(file_names, transcribed_texts)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 24, 259 | "metadata": { 260 | "colab": { 261 | "base_uri": "https://localhost:8080/", 262 | "height": 85 263 | }, 264 | "id": "7sL4wWD66KGz", 265 | "outputId": "19f6ef24-3e8b-42eb-ced7-e4da49ea7020" 266 | }, 267 | "outputs": [ 268 | { 269 | "name": "stderr", 270 | "output_type": "stream", 271 | "text": [ 272 | "\n", 273 | "0it [00:00, ?it/s]" 274 | ] 275 | }, 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "[]\n", 281 | "[]\n" 282 | ] 283 | }, 284 | { 285 | "name": "stderr", 286 | "output_type": "stream", 287 | "text": [ 288 | "\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "run_transcribe()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "id": "KYPZLDovCpjN" 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "from google.colab import files\n", 305 | "files.download('transcribed_text.csv') " 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "id": "1YR2jSYnqTGx" 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "import logging\n", 317 | "from boto3 import client\n", 318 | "from botocore.exceptions import ClientError\n", 319 | "\n", 320 | "import os\n", 321 | "import sys\n", 322 | "import threading\n", 323 | "\n", 324 | "import ntpath\n", 325 | "\n", 326 | "class ProgressPercentage(object):\n", 327 | "\n", 328 | " def __init__(self, filename):\n", 329 | " self._filename = filename\n", 330 | " self._size = float(os.path.getsize(filename))\n", 331 | " self._seen_so_far = 0\n", 332 | " self._lock = threading.Lock()\n", 333 | "\n", 334 | " def __call__(self, bytes_amount):\n", 335 | " # To simplify, assume this is hooked up to a single filename\n", 336 | " with self._lock:\n", 337 | " self._seen_so_far += bytes_amount\n", 338 | " percentage = (self._seen_so_far / self._size) * 100\n", 339 | " sys.stdout.write(\n", 340 | " \"\\r%s %s / %s (%.2f%%)\" % (\n", 341 | " self._filename, self._seen_so_far, self._size,\n", 342 | " percentage))\n", 343 | " sys.stdout.flush()\n", 344 | "\n",
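"# Note: ProgressPercentage above is the upload Callback passed to upload_file below;\n", "# boto3 invokes it with the number of bytes transferred in each chunk, so it prints running progress.\n",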
345 | "def upload_file(file_path, bucket, object_name=None):\n", 346 | "\n", 347 | " if object_name is None:\n", 348 | " object_name = ntpath.basename(file_path)\n", 349 | "\n", 350 | " # Upload the file\n", 351 | " s3 = client('s3', \n", 352 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 353 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 354 | " region_name=config.region_name)\n", 355 | " try:\n", 356 | " response = s3.upload_file(file_path, \n", 357 | " bucket, \n", 358 | " object_name,\n", 359 | " Callback=ProgressPercentage(file_path))\n", 360 | " except ClientError as e:\n", 361 | " logging.error(e)\n", 362 | " return False\n", 363 | " return True\n", 364 | "\n", 365 | "def upload_multiple_files(files_path, bucket_name):\n", 366 | " \"\"\"\n", 367 | " Uploads multiple files.\n", 368 | " \n", 369 | " ---\n", 370 | "\n", 371 | " files_path is the path to the directory containing\n", 372 | " the audio files.\n", 373 | "\n", 374 | " bucket_name is the name of the specific bucket to which the \n", 375 | " files will be uploaded.\n", 376 | " \"\"\"\n", 377 | "\n", 378 | " files = os.listdir(files_path)\n", 379 | "\n", 380 | " for file in files:\n", 381 | " upload_file(os.path.join(files_path, file), bucket_name)\n", 382 | "\n", 383 | "def create_bucket(bucket_name, region=None):\n", 384 | " \"\"\"\n", 385 | " Creates a bucket with private permissions.\n", 386 | "\n", 387 | " ---\n", 388 | "\n", 389 | " bucket_name is the name of the bucket to be created.\n", 390 | "\n", 391 | " region is the region code (region_name); if it is not given,\n", 392 | " the bucket is created in the S3 default region ('us-east-1').\n", 393 | " \"\"\"\n", 394 | "\n", 395 | " try:\n", 396 | " if region is None:\n", 397 | " s3 = client('s3', \n", 398 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 399 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY)\n", 400 | " s3.create_bucket(Bucket=bucket_name, ACL='private')\n", 401 | "\n", 402 | "\n", 403 | " else:\n", 404 | " s3 = client('s3', \n", 405 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 406 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 407 | " region_name=region)\n", 408 | " location = {'LocationConstraint': region}\n", 409 | " s3.create_bucket(Bucket=bucket_name,\n", 410 | " CreateBucketConfiguration=location,\n", 411 | " ACL='private')\n", 412 | " \n", 413 | " response_public = s3.put_public_access_block(\n", 414 | " Bucket=bucket_name,\n", 415 | " PublicAccessBlockConfiguration={\n", 416 | " 'BlockPublicAcls': True,\n", 417 | " 'IgnorePublicAcls': True,\n", 418 | " 'BlockPublicPolicy': True,\n", 419 | " 'RestrictPublicBuckets': True\n", 420 | " },\n", 421 | " )\n", 422 | "\n", 423 | " except ClientError as e:\n", 424 | " logging.error(e)\n", 425 | " return False\n", 426 | " return True" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "id": "7T-tRyzPB6Dq" 433 | }, 434 | "source": [ 435 | "# Usage examples" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": { 442 | "colab": { 443 | "base_uri": "https://localhost:8080/", 444 | "height": 34 445 | }, 446 | "id": "gC_FPhosrkGu", 447 | "outputId": "6942a7fd-c613-4c2b-dc9d-6f7b7eb55149" 448 | }, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "True" 454 | ] 455 | }, 456 | "execution_count": 37, 457 | "metadata": { 458 | "tags": [] 459 | }, 460 | "output_type": "execute_result" 461 | } 462 | ], 463 | "source": [ 
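"# Note: 'cbtest0' is just an example name; S3 bucket names are globally unique, so choose your own.\n",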
"create_bucket('cbtest0', config.region_name)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "colab": { 472 | "base_uri": "https://localhost:8080/", 473 | "height": 34 474 | }, 475 | "id": "-aO_Rbslxjbm", 476 | "outputId": "4f793dc5-1901-4da6-82a2-492659411863" 477 | }, 478 | "outputs": [ 479 | { 480 | "name": "stdout", 481 | "output_type": "stream", 482 | "text": [ 483 | "/content/a/015.wav 62044 / 62044.0 (100.00%)" 484 | ] 485 | } 486 | ], 487 | "source": [ 488 | "upload_multiple_files('/content/audio', 'cbtest0')" 489 | ] 490 | } 491 | ], 492 | "metadata": { 493 | "colab": { 494 | "collapsed_sections": [], 495 | "name": "aws_transcribe_2_1.ipynb", 496 | "provenance": [], 497 | "toc_visible": true 498 | }, 499 | "kernelspec": { 500 | "display_name": "Python 3", 501 | "language": "python", 502 | "name": "python3" 503 | }, 504 | "language_info": { 505 | "codemirror_mode": { 506 | "name": "ipython", 507 | "version": 3 508 | }, 509 | "file_extension": ".py", 510 | "mimetype": "text/x-python", 511 | "name": "python", 512 | "nbconvert_exporter": "python", 513 | "pygments_lexer": "ipython3", 514 | "version": "3.7.10" 515 | } 516 | }, 517 | "nbformat": 4, 518 | "nbformat_minor": 1 519 | } 520 | --------------------------------------------------------------------------------