├── input
│   ├── channels_id_example.txt
│   └── playlists_id _example.txt
├── imgs
│   └── katube-process.png
├── tools
│   ├── Azure
│   │   ├── config.py
│   │   └── azure_cloud.py
│   ├── Google
│   │   └── google-stt.py
│   └── AWS
│       └── aws_transcribe.ipynb
├── Dockerfile
├── requirements.txt
├── utils
│   ├── select_min_lev.py
│   ├── change_filepath_metadata1.py
│   ├── create_ignore_youtube_videos_list.py
│   ├── size_dataset.py
│   ├── corrigir_colunas_metadata.py
│   ├── create_metadata_min_lev.py
│   ├── delete_wavs_from_csv.py
│   ├── change_filepath_metadata0.py
│   ├── delete_wavs.py
│   ├── verify_wavs_folder_metadata.py
│   ├── clear_dataset.py
│   ├── brspeech_generation.py
│   ├── verificar_metadata_wavs.py
│   ├── downsampling_wavs.py
│   ├── downsampling.py
│   ├── move_downsampled_wavs_folder.py
│   ├── verificar_wavs_metadata.py
│   ├── create_internal_metadata_min_lev.py
│   ├── create_compressed_package.py
│   ├── recreate_metadata.py
│   ├── delete_folders_with_erros.py
│   ├── exclude_unecessary_files.py
│   └── number_to_text.py
├── config.py
├── environment.yml
├── synchronization.py
├── .gitignore
├── selection.py
├── search.py
├── download.py
├── transcribe.py
├── validation.py
├── audio_segmentation.py
├── README.md
├── text_normalization.py
├── main.py
└── LICENSE

/input/channels_id_example.txt:
--------------------------------------------------------------------------------
1 | # Globo
2 | UCEPRQVF6hxGGM9gi1ELaWHg
--------------------------------------------------------------------------------
/imgs/katube-process.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/freds0/katube/HEAD/imgs/katube-process.png
--------------------------------------------------------------------------------
/input/playlists_id _example.txt:
--------------------------------------------------------------------------------
1 | # https://www.youtube.com/watch?v=5tSIDFYm0xk&list=PLMsxmUeVRKYlVMAMCAxVB8yQSAFPXL3uu
2 | PLMsxmUeVRKYlVMAMCAxVB8yQSAFPXL3uu
--------------------------------------------------------------------------------
/tools/Azure/config.py:
--------------------------------------------------------------------------------
1 | class Config:
2 |     base_dir = 'dataset_path'
3 |     output_name = 'dataset_name'
4 |     output_path = 'csv_files/'
5 |     speech_key = ""
6 |     service_region = "brazilsouth"
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
1 | # docker build -t katube .
2 | FROM ubuntu:18.04
3 | 
4 | RUN set -x \
5 |     && apt-get update \
6 |     && apt-get install -y espeak ffmpeg libespeak-dev libsndfile1 libsndfile1-dev python python-dev python-pip python-numpy python-lxml \
7 |     && rm -rf /var/lib/apt/lists/*
8 | RUN apt-get update
9 | RUN apt-get install -y build-essential python3.6 python3.6-dev python3-pip python3.6-venv sox
10 | RUN apt-get install -y wget git nano
11 | 
12 | # update pip
13 | RUN python3.6 -m pip install pip --upgrade
14 | RUN python3.6 -m pip install wheel
15 | 
16 | RUN update-alternatives --install /usr/bin/python python /usr/bin/python3 10
17 | 
18 | ENV PYTHONIOENCODING=UTF-8
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | apiclient==1.0.4
2 | audioread==2.1.8
3 | beautifulsoup4==4.9.0
4 | cachetools==4.0.0
5 | certifi==2019.11.28
6 | cffi==1.13.2
7 | chardet==3.0.4
8 | decorator==4.4.1
9 | google-api-python-client==1.7.11
10 | 
google-auth==1.10.1 11 | google-auth-httplib2==0.0.3 12 | httplib2==0.15.0 13 | idna==2.8 14 | joblib==0.14.1 15 | librosa==0.7.2 16 | llvmlite==0.31.0 17 | lxml==4.5.0 18 | numba==0.47.0 19 | numpy==1.22.4 20 | oauth2client==3.0.0 21 | pandas==1.0.3 22 | pyasn1==0.4.8 23 | pyasn1-modules==0.2.8 24 | pycparser==2.19 25 | pydub==0.23.1 26 | pysubs2==0.2.4 27 | python-dateutil==2.8.1 28 | pytube3==9.6.4 29 | pytz==2019.3 30 | requests==2.22.0 31 | resampy==0.2.2 32 | rsa==4.0 33 | scikit-learn==0.22.1 34 | scipy==1.4.1 35 | six==1.13.0 36 | SoundFile==0.10.3.post1 37 | soupsieve==2.0 38 | textdistance==4.1.5 39 | tqdm==4.41.1 40 | typing-extensions==3.7.4.2 41 | uritemplate==3.0.1 42 | urllib3==1.25.7 43 | youtube-dl==2021.4.17 44 | youtube-transcript-api==0.3.1 45 | -------------------------------------------------------------------------------- /utils/select_min_lev.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | import pandas as pd 4 | from os.path import join 5 | import csv 6 | 7 | 8 | def generate_metadata(args): 9 | 10 | df = pd.read_csv(join(args.base_dir, args.csv_file), sep = '|', header=None, quoting=csv.QUOTE_NONE) 11 | new_df = df[df[3] >= float(args.min_value)] 12 | new_df.to_csv(join(args.base_dir, args.output_file), sep = '|', header=False, index=False, quoting=csv.QUOTE_NONE) 13 | 14 | 15 | def main(): 16 | parser = argparse.ArgumentParser() 17 | parser.add_argument('--base_dir', default='./') 18 | parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file') 19 | parser.add_argument('--min_value', default=0.90, help='Minimal value of levenshtein distance') 20 | parser.add_argument('--output_file', default='metadata_sub.csv', help='Name of csv file') 21 | args = parser.parse_args() 22 | generate_metadata(args) 23 | 24 | if __name__ == "__main__": 25 | main() 26 | 27 | -------------------------------------------------------------------------------- /tools/Google/google-stt.py: -------------------------------------------------------------------------------- 1 | import io 2 | import os 3 | 4 | os.environ["GOOGLE_APPLICATION_CREDENTIALS"]="./config.json" 5 | 6 | # Imports the Google Cloud client library 7 | from google.cloud import speech 8 | 9 | # Instantiates a client 10 | client = speech.SpeechClient() 11 | 12 | # The name of the audio file to transcribe 13 | # file_name = os.path.join(os.path.dirname(__file__), "resources", "audio.raw") 14 | 15 | file_name = './340_CO_bpubmn11.wav' 16 | 17 | # Loads the audio into memory 18 | with io.open(file_name, "rb") as audio_file: 19 | content = audio_file.read() 20 | audio = speech.RecognitionAudio(content=content) 21 | 22 | config = speech.RecognitionConfig( 23 | encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16, 24 | sample_rate_hertz=16000, 25 | language_code="pt-BR", 26 | ) 27 | 28 | # Detects speech in the audio file 29 | response = client.recognize(config=config, audio=audio) 30 | 31 | for result in response.results: 32 | print("Transcript: {}".format(result.alternatives[0].transcript)) -------------------------------------------------------------------------------- /utils/change_filepath_metadata1.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | import tqdm 5 | import argparse 6 | 7 | def remove_folder(args): 8 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 9 | if os.path.exists(folder) and 
os.path.isdir(folder):
10 |             old_file = os.path.join(folder, 'metadata.csv')
11 |             new_file = os.path.join(folder, 'metadata_new.csv')
12 |             if not os.path.exists(old_file):
13 |                 print(folder)
14 |                 continue
15 |             if not os.path.exists(new_file):
16 |                 print(folder)
17 |                 continue
18 | 
19 |             os.remove(old_file)
20 |             os.rename(new_file, old_file)
21 |             #print(old_file)
22 |             #print(new_file)
23 | 
24 | 
25 | def main():
26 |     parser = argparse.ArgumentParser()
27 |     parser.add_argument('--base_dir', default='./BRSpeech-ASR-beta3/')
28 |     args = parser.parse_args()
29 |     remove_folder(args)
30 | 
31 | if __name__ == "__main__":
32 |     main()
33 | 
--------------------------------------------------------------------------------
/utils/create_ignore_youtube_videos_list.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import csv
5 | 
6 | 
7 | def generate_file(args):
8 |     try:
9 |         f = open(os.path.join(args.base_dir, args.output_file), 'w')
10 |         for folder in sorted(glob.glob(os.path.join(args.base_dir, args.input_folder) + '/*/*')):
11 |             if os.path.exists(folder) and os.path.isdir(folder):
12 |                 youtube_link = 'https://www.youtube.com/watch?v=' + folder.split('/')[-1]
13 |                 f.write(youtube_link + '\n')
14 |         f.close()
15 |     except IOError:
16 |         print("Error: could not create file {}.".format(args.output_file))
17 |         return False
18 | 
19 |     return True
20 | 
21 | def main():
22 |     parser = argparse.ArgumentParser()
23 |     parser.add_argument('--base_dir', default='./')
24 |     parser.add_argument('--input_folder', default='./output/playlist')
25 |     parser.add_argument('--output_file', default='youtube_ignored_videos.txt', help='Name of output file')
26 |     args = parser.parse_args()
27 |     generate_file(args)
28 | 
29 | if __name__ == "__main__":
30 |     main()
31 | 
32 | 
--------------------------------------------------------------------------------
/utils/size_dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import soundfile as sf
3 | from os.path import isfile, join, dirname
4 | import pandas as pd
5 | import os
6 | import csv
7 | import tqdm
8 | 
9 | def get_seconds(x):
10 |     f = sf.SoundFile(x)
11 |     t = len(f) / f.samplerate
12 |     return t
13 | 
14 | 
15 | def calcular_horas(args):
16 |     metadata = os.path.join(args.base_dir, args.csv_file)
17 |     df = pd.read_csv(metadata, sep = '|', quoting=csv.QUOTE_NONE)
18 |     total = 0
19 |     for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]):
20 |         path_file = os.path.join(row[0])
21 |         temp = get_seconds(path_file)
22 |         total += temp
23 | 
24 |     print('Total in seconds: {}'.format(total))
25 |     print('Hours: {}'.format(total/3600))
26 |     print('Minutes: {}'.format(total%3600/60))
27 |     print('Seconds: {}'.format( (total%3600)%60))
28 | 
29 | def main():
30 |     parser = argparse.ArgumentParser()
31 |     parser.add_argument('--base_dir', default='./')
32 |     parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file')
33 |     args = parser.parse_args()
34 |     calcular_horas(args)
35 | 
36 | if __name__ == "__main__":
37 |     main()
38 | 
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 | class Config:
2 |     base_dir = './'
3 |     dest_dir = 'output'
4 |     ## search and ingest
5 |     api_key = ''
6 |     ## videos origin
7 |     orig_base = 'channel' # ['channel', 'playlist']
8 |     ## Channels and Playlists files
9 |     channels_file = 
'./input/channels_id_example.txt' 10 | playlists_file = './input/playlists_id.txt' 11 | 12 | # Logs 13 | logs_dir = 'logs' 14 | youtube_videos_error = 'error_youtube_videos.txt' 15 | log_file = 'errors.log' 16 | # Ignore videos list 17 | ignored_youtube_videos = '' 18 | downloaded_youtube_videos = logs_dir + '/downloaded_youtube_videos.txt' 19 | 20 | output_search_file = 'youtube_videos.txt' 21 | # text_normalization 22 | min_words = 15 23 | max_words = 30 24 | # split_audio 25 | wavs_dir = 'wavs' 26 | metadata_subtitles_file = 'subtitles.csv' 27 | # convertion to transcribe format 28 | tmp_wavs_dir = 'wavs_tmp' 29 | tmp_sampling_rate = 16000 30 | # transcribe 31 | transcription_file = 'transcript.csv' 32 | #output_converted_wavs_path = '00_16k' 33 | # validation 34 | validation_file = 'validation.csv' 35 | # selection 36 | minimal_levenshtein_distance = 0.9 37 | # downsampling 38 | sampling_rate = 22050 39 | # result 40 | result_file = 'metadata.csv' 41 | delete_temp_files = True -------------------------------------------------------------------------------- /utils/corrigir_colunas_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import shutil 5 | import pandas as pd 6 | import csv 7 | import tqdm 8 | 9 | def deletar(args): 10 | total = 0 11 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 12 | if os.path.exists(folder) and os.path.isdir(folder): 13 | metadata = os.path.join(folder, args.input_file) 14 | if not os.path.exists(metadata): 15 | continue 16 | df = pd.read_csv(metadata, sep = '|', quoting=csv.QUOTE_NONE) 17 | #new_df = df[df['levenshtein'] >= float(args.min_value)].copy() 18 | if set(['levenshtein']).issubset(df.columns): 19 | continue 20 | print(metadata) 21 | df.rename(columns={"text": "subtitle", "similarity" : "levenshtein"}, inplace=True) 22 | df.to_csv(os.path.join(folder, args.input_file), sep = '|', index=False, quoting=csv.QUOTE_NONE) 23 | total += 1 24 | print('Total created metadata: ', total) 25 | 26 | 27 | def main(): 28 | parser = argparse.ArgumentParser() 29 | parser.add_argument('--base_dir', default='./') 30 | parser.add_argument('--input_file', default='validation.csv') 31 | #parser.add_argument('--delete_file', default='delete.csv') 32 | args = parser.parse_args() 33 | deletar(args) 34 | 35 | if __name__ == "__main__": 36 | main() 37 | -------------------------------------------------------------------------------- /utils/create_metadata_min_lev.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | import pandas as pd 4 | from os.path import join, exists 5 | import csv 6 | 7 | 8 | def generate_metadata(args): 9 | 10 | metadata_file = join(args.base_dir, args.csv_file) 11 | if not exists(metadata_file): 12 | print('File {} not found.'.format(metadata_file)) 13 | return 14 | 15 | df = pd.read_csv(metadata_file, sep = '|', header=None, quoting=csv.QUOTE_NONE) 16 | new_df = df[df[3] >= float(args.min_value)] 17 | new_df.to_csv(join(args.base_dir, args.save_file), sep = '|', header=False, index=False, quoting=csv.QUOTE_NONE) 18 | new_df = df[df[3] < float(args.min_value)] 19 | new_df.to_csv(join(args.base_dir, args.delete_file), sep = '|', header=False, index=False, quoting=csv.QUOTE_NONE) 20 | 21 | def main(): 22 | parser = argparse.ArgumentParser() 23 | parser.add_argument('--base_dir', default='./') 24 | parser.add_argument('--csv_file', 
default='metadata_complete.csv', help='Name of csv file') 25 | parser.add_argument('--min_value', default=0.90, help='Minimal value of levenshtein distance') 26 | parser.add_argument('--save_file', default='save.csv', help='Name of csv file') 27 | parser.add_argument('--delete_file', default='delete.csv', help='Name of csv file') 28 | args = parser.parse_args() 29 | generate_metadata(args) 30 | 31 | if __name__ == "__main__": 32 | main() 33 | -------------------------------------------------------------------------------- /utils/delete_wavs_from_csv.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | from os.path import isfile, join, dirname, exists 4 | import pandas as pd 5 | import os 6 | import csv 7 | import tqdm 8 | 9 | def delete_wavs(args): 10 | metadata_file = os.path.join(args.base_dir, args.csv_file) 11 | if not exists(metadata_file): 12 | print('File {} not found.'.format(metadata_file)) 13 | return 14 | 15 | df = pd.read_csv(metadata_file, sep = '|', quoting=csv.QUOTE_NONE) 16 | total = 0 17 | total_deleted = 0 18 | 19 | for index, row in tqdm.tqdm(df.iterrows(), total=df.shape[0]): 20 | path_file = os.path.join(row[0]) 21 | if os.path.exists(path_file): 22 | total_deleted += 1 23 | if not(args.force): 24 | print(path_file) 25 | else: 26 | os.remove(path_file) 27 | 28 | if not(args.force): 29 | print('Total wavs to be deleted: ', total_deleted) 30 | else: 31 | print('Total wavs deleted: ', total_deleted) 32 | 33 | def main(): 34 | parser = argparse.ArgumentParser() 35 | parser.add_argument('--base_dir', default='./') 36 | parser.add_argument('--csv_file', default='delete.csv', help='Name of csv file') 37 | parser.add_argument('--force', action='store_true', default=False) 38 | args = parser.parse_args() 39 | delete_wavs(args) 40 | 41 | if __name__ == "__main__": 42 | main() 43 | -------------------------------------------------------------------------------- /utils/change_filepath_metadata0.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import soundfile as sf 3 | import pandas as pd 4 | import csv 5 | import glob 6 | import os 7 | import tqdm 8 | 9 | def generate_metadata(args): 10 | separator = '|' 11 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 12 | if os.path.isdir(folder) and os.path.exists(os.path.join(folder, args.csv_file)): 13 | output_file = open(os.path.join(folder, args.output_file), 'w') 14 | line = separator.join(['filename', 'subtitle', 'transcript', 'levenshtein']) 15 | output_file.write(line + '\n') 16 | df = pd.read_csv(os.path.join(folder, args.csv_file), sep = '|', quoting=csv.QUOTE_NONE) 17 | for index, row in df.iterrows(): 18 | filename = row[0].split('/')[-1] 19 | line = separator.join([filename, row[1], row[2], str(row[3])]) 20 | output_file.write(line + '\n') 21 | output_file.close() 22 | 23 | 24 | def main(): 25 | parser = argparse.ArgumentParser() 26 | parser.add_argument('--base_dir', default='./') 27 | parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file') 28 | parser.add_argument('--output_file', default='metadata_new.csv', help='Name of csv file') 29 | args = parser.parse_args() 30 | generate_metadata(args) 31 | 32 | if __name__ == "__main__": 33 | main() 34 | -------------------------------------------------------------------------------- /utils/delete_wavs.py: -------------------------------------------------------------------------------- 1 | import 
argparse
2 | import glob
3 | import os
4 | import pandas as pd
5 | import csv
6 | import tqdm
7 | 
8 | def deletar(args):
9 |     total = 0
10 |     for metadata_file in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*/" + args.input_file))):
11 |         if os.path.isfile(metadata_file):
12 |             df = pd.read_csv(metadata_file, sep = '|', quoting=csv.QUOTE_NONE)
13 |             folder_path = os.path.join(*metadata_file.split('/')[0:-1])
14 |             for index, row in df.iterrows():
15 |                 path_file = os.path.join(folder_path, args.wavs_folder, row[0])
16 |                 if os.path.exists(path_file):
17 |                     total += 1
18 |                     if not(args.force):
19 |                         print(path_file)
20 |                     else:
21 |                         os.remove(path_file)
22 | 
23 |     if args.force:
24 |         print('Total wav files erased: ', total)
25 |     else:
26 |         print('Total wav files to be erased: ', total)
27 | 
28 | def main():
29 |     parser = argparse.ArgumentParser()
30 |     parser.add_argument('--base_dir', default='./output/channel/')
31 |     parser.add_argument('--input_file', default='delete.csv')
32 |     parser.add_argument('--wavs_folder', default='wavs', help='Input wavs folder')
33 |     parser.add_argument('--force', action='store_true', default=False)
34 |     args = parser.parse_args()
35 |     deletar(args)
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 | 
--------------------------------------------------------------------------------
/utils/verify_wavs_folder_metadata.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from os import makedirs
5 | from os.path import join, exists
6 | import tqdm
7 | 
8 | separator = '|'
9 | 
10 | def verify_folder(args):
11 | 
12 |     i = 0
13 |     for wav_file in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/wavs/*.wav"))):
14 | 
15 |         filename = wav_file.split('/')[-1]
16 |         folder = '/'.join(wav_file.split('/')[0:-2])
17 | 
18 |         metadata_path = join(folder, args.csv_file)
19 |         if not exists(metadata_path):
20 |             continue
21 |         f = open(metadata_path, 'r')
22 |         content = f.readlines()
23 |         found = False
24 |         for line in content:
25 |             filename_metadata, _, _, _ = line.split(separator)
26 |             if filename_metadata == filename:
27 |                 found = True
28 |                 break
29 |         if not found:
30 |             i+=1
31 |             if not args.erase:
32 |                 print('Delete file: ' + wav_file)
33 |             else:
34 |                 os.remove(wav_file)
35 | 
36 | 
37 |     print('Total: ' + str(i) + ' files')
38 | 
39 | def main():
40 |     parser = argparse.ArgumentParser()
41 |     parser.add_argument('--base_dir', default='./')
42 |     parser.add_argument('--folder', default='', help='Name of the origin directory of wav files')
43 |     parser.add_argument('--csv_file', default='metadata.csv')
44 |     parser.add_argument('--erase', action='store_true', default=False)
45 |     args = parser.parse_args()
46 |     verify_folder(args)
47 | 
48 | if __name__ == "__main__":
49 |     main()
50 | 
--------------------------------------------------------------------------------
/utils/clear_dataset.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | def clear_dataset(args):
6 |     total = 0
7 |     for folder in sorted(glob.glob(args.base_dir + "/*/*")):
8 |         if os.path.isdir(folder) and not os.path.exists(os.path.join(folder, 'metadata.csv')):
9 |             total+=1
10 |             if not args.force:
11 |                 print(folder)
12 |             else:
13 |                 shutil.rmtree(folder)
14 |         if os.path.isdir(folder) and not os.listdir(folder):
15 |             total+=1
16 |             if not args.force:
17 |                 print(folder)
18 |             else:
19 |                 shutil.rmtree(folder)
20 |         wavs_folder = os.path.join(folder, 
args.wavs_folder)
21 |         if os.path.exists(wavs_folder) and os.path.isdir(wavs_folder):
22 |             if not os.listdir(wavs_folder): #if len (os.listdir(wavs_folder)) == 0:
23 |                 total+=1
24 |                 if not args.force:
25 |                     print(folder)
26 |                 else:
27 |                     shutil.rmtree(folder)
28 | 
29 |     if args.force:
30 |         print('Total folders erased: ', total)
31 |     else:
32 |         print('Total folders with problems: ', total)
33 | 
34 | def main():
35 |     parser = argparse.ArgumentParser()
36 |     parser.add_argument('--base_dir', default='./output/channel')
37 |     parser.add_argument('--wavs_folder', default='wavs', help='Input wavs folder')
38 |     parser.add_argument('--force', action='store_true', default=False)
39 | 
40 |     args = parser.parse_args()
41 |     clear_dataset(args)
42 | 
43 | if __name__ == "__main__":
44 |     main()
45 | 
46 | 
47 | 
48 | 
49 | 
50 | 
--------------------------------------------------------------------------------
/utils/brspeech_generation.py:
--------------------------------------------------------------------------------
1 | import subprocess
2 | 
3 | base_dir = 'output/channel/'
4 | def main():
5 | 
6 |     command_line = "python utils/recreate_metadata.py --base_dir {} ".format(base_dir)
7 |     subprocess.call(command_line, shell=True)
8 | 
9 |     command_line = "python utils/delete_folders_with_erros.py --base_dir {} --force".format(base_dir)
10 |     subprocess.call(command_line, shell=True)
11 | 
12 |     command_line = "python utils/clear_dataset.py --base_dir {} --force".format(base_dir)
13 |     subprocess.call(command_line, shell=True)
14 | 
15 |     command_line = "python utils/create_metadata_min_lev.py --base_dir {}".format(base_dir)
16 |     subprocess.call(command_line, shell=True)
17 | 
18 |     command_line = "python utils/create_internal_metadata_min_lev.py --base_dir {}".format(base_dir)
19 |     subprocess.call(command_line, shell=True)
20 | 
21 |     command_line = "python utils/delete_wavs.py --base_dir {} --force".format(base_dir)
22 |     subprocess.call(command_line, shell=True)
23 | 
24 |     command_line = "python utils/downsampling_wavs.py --base_dir {} --force".format(base_dir)
25 |     subprocess.call(command_line, shell=True)
26 | 
27 |     command_line = "python utils/move_downsampled_wavs_folder.py --base_dir {} --force".format(base_dir)
28 |     subprocess.call(command_line, shell=True)
29 | 
30 |     command_line = "python utils/exclude_unecessary_files.py --base_dir {} --force".format(base_dir)
31 |     subprocess.call(command_line, shell=True)
32 | 
33 |     command_line = "python utils/change_filepath_metadata0.py --base_dir {0} && python utils/change_filepath_metadata1.py --base_dir {0}".format(base_dir)
34 |     subprocess.call(command_line, shell=True)
35 | 
36 | if __name__ == "__main__":
37 |     main()
38 | 
--------------------------------------------------------------------------------
/utils/verificar_metadata_wavs.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | import pandas as pd
6 | import csv
7 | import tqdm
8 | 
9 | def deletar(args):
10 |     total = 0
11 |     separator = '|'
12 |     for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))):
13 |         if os.path.exists(folder) and os.path.isdir(folder):
14 |             metadata_path = os.path.join(folder, args.csv_file)
15 |             if not os.path.exists(metadata_path):
16 |                 continue
17 |             f = open(metadata_path, 'r')
18 |             content = f.readlines()[1:]
19 |             for line in content:
20 |                 filename_metadata, _, _, _ = line.split(separator)
21 |                 filepath = os.path.join(folder, 'wavs', filename_metadata)
22 | 
23 |                 if 
os.path.exists(filepath):
24 |                     continue
25 |                 else:
26 |                     total+=1
27 |                     if not args.erase:
28 |                         print('Delete file: ' + filename_metadata)
29 |                     else:
30 |                         os.remove(filename_metadata)
31 | 
32 |     if not args.erase:
33 |         print('Total wavs to be erased: ', total)
34 |     else:
35 |         print('Total erased: ', total)
36 | 
37 | 
38 | def main():
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument('--base_dir', default='./')
41 |     parser.add_argument('--csv_file', default='metadata.csv')
42 |     parser.add_argument('--erase', action='store_true', default=False)
43 |     #parser.add_argument('--delete_file', default='delete.csv')
44 | 
45 |     args = parser.parse_args()
46 |     deletar(args)
47 | 
48 | if __name__ == "__main__":
49 |     main()
50 | 
--------------------------------------------------------------------------------
/utils/downsampling_wavs.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from os import makedirs
5 | from os.path import join, exists
6 | import tqdm
7 | 
8 | number_bits = 16
9 | encoding = "signed-integer"
10 | number_channels = 1
11 | 
12 | def downsampling(args):
13 |     sample_rate = args.sample_rate
14 | 
15 |     for folder in tqdm.tqdm(glob.glob(args.base_dir + "/*/*")):
16 |         for wav_path in glob.glob(join(folder, args.wav_dir) + "/*.wav"):
17 |             prev = '/'.join(wav_path.split('/')[0:5])
18 |             filename = wav_path.split('/')[-1]
19 |             new_wav_path = join(prev, args.new_wav_dir, filename)
20 |             dir_path = os.path.dirname(new_wav_path)
21 |             if not args.force:
22 |                 print("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path))
23 |             else:
24 |                 os.makedirs(dir_path, exist_ok=True)
25 |                 os.system("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path))
26 | 
27 | def main():
28 |     parser = argparse.ArgumentParser()
29 |     parser.add_argument('--base_dir', default='./output/channel/')
30 |     parser.add_argument('--wav_dir', default='wavs', help='Name of the origin directory of wav files')
31 |     parser.add_argument('--new_wav_dir', default='wavs22', help='Name of the destination directory of wav files')
32 |     parser.add_argument('--sample_rate', default=22050, help='Sample rate of destination wav files')
33 |     parser.add_argument('--force', action='store_true', default=False)
34 |     args = parser.parse_args()
35 |     downsampling(args)
36 | 
37 | if __name__ == "__main__":
38 |     main()
39 | 
--------------------------------------------------------------------------------
/utils/downsampling.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | from os import makedirs
5 | from os.path import join, exists
6 | import tqdm
7 | 
8 | number_bits = 16
9 | encoding = "signed-integer"
10 | number_channels = 1
11 | 
12 | def downsampling(folder, wav_dir, new_wav_dir, sample_rate, force):
13 |     for wav_path in glob.glob(join(folder, wav_dir) + "/*.wav"):
14 |         prev = '/'.join(wav_path.split('/')[0:5])
15 |         filename = wav_path.split('/')[-1]
16 |         new_wav_path = join(prev, new_wav_dir, filename)
17 |         dir_path = os.path.dirname(new_wav_path)
18 |         if not force:
19 |             print("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path))
20 |         else:
21 |             os.makedirs(dir_path, exist_ok=True)
22 |             os.system("sox %s -V0 -c %d -r %d -b %d -e %s %s"% (wav_path, 
int(number_channels), int(sample_rate), number_bits, encoding, new_wav_path)) 23 | return True 24 | 25 | def main(): 26 | parser = argparse.ArgumentParser() 27 | parser.add_argument('--base_dir', default='./output/channel/') 28 | parser.add_argument('--wav_dir', default='wavs', help='Name of the origin directory of wav files') 29 | parser.add_argument('--new_wav_dir', default='wavs22', help='Name of the origin directory of wav files') 30 | parser.add_argument('--sample_rate', default=22050, help='Sample rate of destination wav files') 31 | parser.add_argument('--force', action='store_true', default=False) 32 | args = parser.parse_args() 33 | for folder in glob.glob(args.base_dir + "/**"): 34 | downsampling(folder, args.wav_dir, args.new_wav_dir, args.sample_rate, args.force) 35 | 36 | if __name__ == "__main__": 37 | main() 38 | -------------------------------------------------------------------------------- /utils/move_downsampled_wavs_folder.py: -------------------------------------------------------------------------------- 1 | import glob 2 | import os 3 | import shutil 4 | import tqdm 5 | import argparse 6 | 7 | def remove_old_folder_wavs(args): 8 | total_erased = 0 9 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 10 | if os.path.exists(folder) and os.path.isdir(folder): 11 | old_folder = os.path.join(folder, args.old_folder) 12 | new_folder = os.path.join(folder, args.new_folder) 13 | if not os.path.exists(old_folder): 14 | print('Verify folder: ' + old_folder) 15 | continue 16 | #exit() 17 | if not os.path.exists(new_folder): 18 | print('Verify folder: ' + new_folder) 19 | continue 20 | #exit() 21 | total_erased+=1 22 | if not args.force: 23 | print('rm ' + old_folder) 24 | print('mv ' + new_folder + ' ' + old_folder) 25 | else: 26 | shutil.rmtree(old_folder) 27 | os.rename(new_folder, old_folder) 28 | 29 | if args.force: 30 | print('Total modified folders ', total_erased) 31 | else: 32 | print('Total to be modified folders ', total_erased) 33 | 34 | 35 | def main(): 36 | parser = argparse.ArgumentParser() 37 | parser.add_argument('--base_dir', default='./output/channel/') 38 | parser.add_argument('--old_folder', default='wavs', help='Name of old wavs folder, to erase') 39 | parser.add_argument('--new_folder', default='wavs22', help='Name of new wavs folder') 40 | parser.add_argument('--force', action='store_true', default=False) 41 | args = parser.parse_args() 42 | remove_old_folder_wavs(args) 43 | 44 | if __name__ == "__main__": 45 | main() 46 | -------------------------------------------------------------------------------- /utils/verificar_wavs_metadata.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import shutil 5 | import pandas as pd 6 | import csv 7 | import tqdm 8 | 9 | def deletar(args): 10 | total = 0 11 | separator = '|' 12 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))): 13 | if os.path.exists(folder) and os.path.isdir(folder): 14 | for wav_file in sorted(glob.glob(folder + "/wavs/*.wav")): 15 | filename = wav_file.split('/')[-1] 16 | folder = '/'.join(wav_file.split('/')[0:-2]) 17 | metadata_path = os.path.join(folder, args.csv_file) 18 | if not os.path.exists(metadata_path): 19 | continue 20 | f = open(metadata_path, 'r') 21 | content = f.readlines() 22 | found = False 23 | for line in content: 24 | filename_metadata, _, _, _ = line.split(separator) 25 | if filename_metadata == filename: 26 | found = True 27 | break 28 | if not found: 
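                    # no row in metadata.csv matches this wav file -> orphaned audio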
29 |                     total+=1
30 |                     if not args.erase:
31 |                         print('Delete file: ' + wav_file)
32 |                     else:
33 |                         os.remove(wav_file)
34 | 
35 |     if not args.erase:
36 |         print('Total wavs to be erased: ', total)
37 |     else:
38 |         print('Total erased: ', total)
39 | 
40 | 
41 | def main():
42 |     parser = argparse.ArgumentParser()
43 |     parser.add_argument('--base_dir', default='./')
44 |     parser.add_argument('--csv_file', default='metadata.csv')
45 |     parser.add_argument('--erase', action='store_true', default=False)
46 |     #parser.add_argument('--delete_file', default='delete.csv')
47 | 
48 |     args = parser.parse_args()
49 |     deletar(args)
50 | 
51 | if __name__ == "__main__":
52 |     main()
53 | 
--------------------------------------------------------------------------------
/utils/create_internal_metadata_min_lev.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | import pandas as pd
6 | import csv
7 | import tqdm
8 | 
9 | def deletar(args):
10 |     total = 0
11 |     for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + "/*/*"))):
12 |         if os.path.exists(folder) and os.path.isdir(folder):
13 |             metadata = os.path.join(folder, args.input_file)
14 |             if not os.path.exists(metadata):
15 |                 continue
16 |             df = pd.read_csv(metadata, sep = '|', quoting=csv.QUOTE_NONE)
17 |             # Creating save files
18 |             save_df = df[df['similarity'] >= float(args.min_value)].copy()
19 |             filenames = save_df['filename'].apply(lambda x: x.split('/')[-1])
20 |             save_df['filename'] = filenames
21 |             save_df.to_csv(os.path.join(folder, args.save_file), sep = '|', index=False, quoting=csv.QUOTE_NONE)
22 |             # Creating delete files
23 |             delete_df = df[df['similarity'] < float(args.min_value)].copy()
24 |             filenames = delete_df['filename'].apply(lambda x: x.split('/')[-1])
25 |             delete_df['filename'] = filenames
26 |             delete_df.to_csv(os.path.join(folder, args.delete_file), sep = '|', index=False, quoting=csv.QUOTE_NONE)
27 | 
28 |             total += 1
29 | 
30 |     print('Total save/delete metadata files created: ', total)
31 | 
32 | 
33 | def main():
34 |     parser = argparse.ArgumentParser()
35 |     parser.add_argument('--base_dir', default='./output/channel/')
36 |     parser.add_argument('--input_file', default='validation.csv')
37 |     parser.add_argument('--save_file', default='save.csv')
38 |     parser.add_argument('--delete_file', default='delete.csv')
39 |     parser.add_argument('--min_value', default=0.90, help='Minimum similarity value (normalized Levenshtein)')
40 |     args = parser.parse_args()
41 |     deletar(args)
42 | 
43 | if __name__ == "__main__":
44 |     main()
45 | 
--------------------------------------------------------------------------------
/utils/create_compressed_package.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import tarfile
3 | import os
4 | import tqdm
5 | 
6 | def create_brspeech_file(input_file, output_file, min_value):
7 |     root = 'BRSpeech-ASR'
8 |     write_mode = 'w:bz2' # 'w' or 'w:gz' or 'w:bz2'
9 |     internal_folder = 'wavs'
10 | 
11 |     tar_file = tarfile.open(output_file, mode=write_mode)
12 | 
13 |     num_lines = sum(1 for line in open(input_file,'r'))
14 |     in_file = open(input_file, "r")
15 |     folders_list = []
16 |     for line in tqdm.tqdm(in_file, total=num_lines):
17 | 
18 |         file, subtitle, transcript, levenshtein = line.split('|')
19 | 
20 |         folder = file.split('/')[-3]
21 |         folder_path = '/'.join(file.split('/')[:-2])
22 |         filename = file.split('/')[-1]
23 | 
24 |         #if float(levenshtein) > float(min_value):
25 | 
26 |         tar_file.add(file, 
arcname=os.path.join(root, folder, internal_folder, filename))
27 |         if folder_path not in folders_list: folders_list.append(folder_path)
28 | 
29 |     for folder in folders_list:
30 |         tar_file.add(os.path.join(folder, 'validation.csv'), arcname=os.path.join(root, os.path.basename(folder), 'validation.csv'))
31 | 
32 |     tar_file.add(input_file, arcname=os.path.join(root, 'metadata.csv'))
33 |     tar_file.close()
34 |     in_file.close()
35 | 
36 | 
37 | def main():
38 | 
39 |     parser = argparse.ArgumentParser()
40 |     parser.add_argument('--base_dir', default='./')
41 |     parser.add_argument('--metadata_file', default='metadata_all.csv', help='Input filename')
42 |     parser.add_argument('--output_file', default='BRSpeech.tar.bz', help='Tar.bz file')
43 |     parser.add_argument('--min_value', default=0.95, help='Minimum similarity value')
44 | 
45 |     args = parser.parse_args()
46 | 
47 |     input_file = os.path.join(args.base_dir, args.metadata_file)
48 |     output_file = os.path.join(args.base_dir, args.output_file)
49 | 
50 |     create_brspeech_file(input_file, output_file, args.min_value)
51 | 
52 | if __name__ == "__main__":
53 |     main()
54 | 
--------------------------------------------------------------------------------
/environment.yml:
--------------------------------------------------------------------------------
1 | name: katube
2 | channels:
3 |   - defaults
4 | dependencies:
5 |   - _libgcc_mutex=0.1=main
6 |   - _openmp_mutex=5.1=1_gnu
7 |   - ca-certificates=2022.4.26=h06a4308_0
8 |   - ld_impl_linux-64=2.38=h1181459_1
9 |   - libffi=3.3=he6710b0_2
10 |   - libgcc-ng=11.2.0=h1234567_1
11 |   - libgomp=11.2.0=h1234567_1
12 |   - libstdcxx-ng=11.2.0=h1234567_1
13 |   - ncurses=6.3=h7f8727e_2
14 |   - openssl=1.1.1o=h7f8727e_0
15 |   - pip=21.2.4=py38h06a4308_0
16 |   - python=3.8.13=h12debd9_0
17 |   - readline=8.1.2=h7f8727e_1
18 |   - setuptools=61.2.0=py38h06a4308_0
19 |   - sqlite=3.38.3=hc218d9a_0
20 |   - tk=8.6.12=h1ccaba5_0
21 |   - wheel=0.37.1=pyhd3eb1b0_0
22 |   - xz=5.2.5=h7f8727e_1
23 |   - zlib=1.2.12=h7f8727e_2
24 |   - pip:
25 |     - apiclient==1.0.4
26 |     - audioread==2.1.8
27 |     - beautifulsoup4==4.9.0
28 |     - cachetools==4.0.0
29 |     - certifi==2019.11.28
30 |     - cffi==1.13.2
31 |     - chardet==3.0.4
32 |     - decorator==4.4.1
33 |     - google-api-python-client==1.7.11
34 |     - google-auth==1.10.1
35 |     - google-auth-httplib2==0.0.3
36 |     - httplib2==0.15.0
37 |     - idna==2.8
38 |     - joblib==0.14.1
39 |     - librosa==0.7.2
40 |     - llvmlite==0.31.0
41 |     - lxml==4.5.0
42 |     - numba==0.47.0
43 |     - numpy==1.22.4
44 |     - oauth2client==3.0.0
45 |     - pandas==1.0.3
46 |     - pyasn1==0.4.8
47 |     - pyasn1-modules==0.2.8
48 |     - pycparser==2.19
49 |     - pydub==0.23.1
50 |     - pysubs2==0.2.4
51 |     - python-dateutil==2.8.1
52 |     - pytube3==9.6.4
53 |     - pytz==2019.3
54 |     - requests==2.22.0
55 |     - resampy==0.2.2
56 |     - rsa==4.0
57 |     - scikit-learn==0.22.1
58 |     - scipy==1.4.1
59 |     - six==1.13.0
60 |     - soundfile==0.10.3.post1
61 |     - soupsieve==2.0
62 |     - textdistance==4.1.5
63 |     - tqdm==4.41.1
64 |     - typing-extensions==3.7.4.2
65 |     - uritemplate==3.0.1
66 |     - urllib3==1.25.7
67 |     - youtube-dl==2021.4.17
68 |     - youtube-transcript-api==0.3.1
69 | prefix: /opt/anaconda3/envs/katube
--------------------------------------------------------------------------------
/utils/recreate_metadata.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | from os import makedirs
4 | from os.path import join, exists, isdir
5 | import csv
6 | import tqdm
7 | 
8 | def regenerate_metadata(input_file1, basename, output_file):
9 | 
10 |     try:
11 |         f = open(input_file1)
12 | 
content_file1 = f.readlines()[1:] 13 | except IOError: 14 | print("Error: File {} does not appear to exist.".format(input_file1)) 15 | return False 16 | else: 17 | f.close() 18 | 19 | output_file = open(output_file, 'a') 20 | separator = '|' 21 | 22 | for line1 in content_file1: 23 | file1, text1, text2, lev = line1.split('|') 24 | filepath = join(basename, file1) 25 | line = separator.join([filepath, text1.rstrip(), text2.strip(), str(lev)]) 26 | output_file.write(line) 27 | 28 | output_file.close() 29 | return True 30 | 31 | def main(): 32 | 33 | parser = argparse.ArgumentParser() 34 | parser.add_argument('--base_dir', default='./output/channel/') 35 | parser.add_argument('--csv_file', default='metadata.csv', help='Name of csv file') 36 | parser.add_argument('--wav_folder', default='wavs', help='Name of wavs folder') 37 | parser.add_argument('--internal_csv_file', default='metadata.csv', help='Name of csv file') 38 | args = parser.parse_args() 39 | 40 | separator = '|' 41 | output_path_file = join(args.base_dir, args.csv_file) 42 | output_file = open(output_path_file, 'w') 43 | header = separator.join(['filename', 'subtitle', 'transcript', 'similarity']) + '\n' 44 | output_file.write(header) 45 | output_file.close() 46 | 47 | for folder_path in tqdm.tqdm(sorted(glob.glob(args.base_dir + '/*/*'))): 48 | if not isdir(folder_path): 49 | continue 50 | foldername = join(folder_path, args.wav_folder) 51 | input_path_file1 = join(folder_path, args.internal_csv_file) 52 | regenerate_metadata(input_path_file1, foldername, output_path_file) 53 | 54 | if __name__ == "__main__": 55 | main() 56 | -------------------------------------------------------------------------------- /synchronization.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com 5 | # 6 | # 7 | import argparse 8 | import sys 9 | from os.path import split, join 10 | from aeneas.executetask import ExecuteTask 11 | from aeneas.task import Task 12 | 13 | 14 | def create_aeneas_json_file(audio_path, text_path, output_path): 15 | """ 16 | Use the api aeneas to synchronize audio and text. 17 | 18 | Parameters: 19 | audio_path (str): audio filepath. 20 | text_path (str): text filepath. 21 | output_path (str): output json filepath. 22 | 23 | Returns: 24 | Boolean: True or False. 
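
    Example (hypothetical filenames):
        create_aeneas_json_file('video/audio.mp3', 'video/input.txt', 'video/output.json')
        writes a JSON sync map that aligns each line of input.txt to a time interval in audio.mp3.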
25 |     """
26 |     try:
27 |         # create Task object
28 |         config_string = u"task_language=por|is_text_type=plain|os_task_file_format=json|task_adjust_boundary_percent_value=50|mfcc_mask_nonspeech_l2=True"
29 |         task = Task(config_string=config_string)
30 |         task.audio_file_path_absolute = u"{}".format(audio_path)
31 |         task.text_file_path_absolute = u"{}".format(text_path)
32 |         task.sync_map_file_path_absolute = u"{}".format(output_path)
33 | 
34 |         # process Task
35 |         ExecuteTask(task).execute()
36 | 
37 |         # output sync map to file
38 |         task.output_sync_map_file()
39 | 
40 |     except KeyboardInterrupt:
41 |         print("KeyboardInterrupt Detected!")
42 |         exit()
43 | 
44 |     except:
45 |         exc_type, exc_obj, exc_tb = sys.exc_info()
46 |         exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
47 |         print(exc_type, exc_file, exc_tb.tb_lineno)
48 |         return False
49 | 
50 |     return True
51 | 
52 | 
53 | def main():
54 |     parser = argparse.ArgumentParser()
55 |     parser.add_argument('--base_dir', default='./')
56 |     parser.add_argument('--audio_file', default='audio.mp3', help='Filename of input audio file')
57 |     parser.add_argument('--text_file', default='input.txt', help='Filename of input text')
58 |     parser.add_argument('--output_file', default='output.json', help='Output json file')
59 |     args = parser.parse_args()
60 | 
61 |     audio_path = join(args.base_dir, args.audio_file)
62 |     text_path = join(args.base_dir, args.text_file)
63 |     output_path = join(args.base_dir, args.output_file)
64 |     create_aeneas_json_file(audio_path, text_path, output_path)
65 | 
66 | if __name__ == "__main__":
67 |     main()
68 | 
--------------------------------------------------------------------------------
/utils/delete_folders_with_erros.py:
--------------------------------------------------------------------------------
1 | import argparse
2 | import glob
3 | import os
4 | import shutil
5 | 
6 | def erase_folders_with_error(args):
7 |     total = 0
8 |     for folder in sorted(glob.glob(args.base_dir + "/*/*")):
9 |         if os.path.exists(folder) and os.path.isdir(folder):
10 |             if os.path.isfile(os.path.join(folder, args.metada_file1)):
11 |                 try:
12 |                     with open(os.path.join(folder, args.metada_file1)) as f:
13 |                         content_file1 = f.readlines()
14 |                 except IOError:
15 |                     print("Error: File {} does not appear to exist.".format(args.metada_file1))
16 |                     # return False
17 |             else:
18 |                 total += 1
19 |                 if not args.force:
20 |                     print(folder)
21 |                 else:
22 |                     shutil.rmtree(folder)
23 |                 continue
24 |             if os.path.isfile(os.path.join(folder, args.metada_file2)):
25 |                 try:
26 |                     with open(os.path.join(folder, args.metada_file2)) as f:
27 |                         content_file2 = f.readlines()
28 |                 except IOError:
29 |                     print("Error: File {} does not appear to exist.".format(args.metada_file2))
30 |                     #return False
31 |             else:
32 |                 total += 1
33 |                 if not args.force:
34 |                     print(folder)
35 |                 else:
36 |                     shutil.rmtree(folder)
37 |                 continue
38 | 
39 |             if not (len(content_file1) == len(content_file2)):
40 |                 total += 1
41 |                 if not args.force:
42 |                     print(folder)
43 |                 else:
44 |                     shutil.rmtree(folder)
45 |     if args.force:
46 |         print('Total folders erased: ', total)
47 |     else:
48 |         print('Total folders with problems: ', total)
49 | 
50 | def main():
51 |     parser = argparse.ArgumentParser()
52 |     parser.add_argument('--base_dir', default='./output/channel')
53 |     parser.add_argument('--metada_file1', default='subtitles.csv')
54 |     parser.add_argument('--metada_file2', default='transcript.csv')
55 |     parser.add_argument('--force', action='store_true', default=False)
56 |     args = parser.parse_args()
57 |     erase_folders_with_error(args)
58 | 
59 | if 
__name__ == "__main__": 60 | main() 61 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | pip-wheel-metadata/ 24 | share/python-wheels/ 25 | *.egg-info/ 26 | .installed.cfg 27 | *.egg 28 | MANIFEST 29 | 30 | # PyInstaller 31 | # Usually these files are written by a python script from a template 32 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 33 | *.manifest 34 | *.spec 35 | 36 | # Installer logs 37 | pip-log.txt 38 | pip-delete-this-directory.txt 39 | 40 | # Unit test / coverage reports 41 | htmlcov/ 42 | .tox/ 43 | .nox/ 44 | .coverage 45 | .coverage.* 46 | .cache 47 | nosetests.xml 48 | coverage.xml 49 | *.cover 50 | *.py,cover 51 | .hypothesis/ 52 | .pytest_cache/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | target/ 76 | 77 | # Jupyter Notebook 78 | .ipynb_checkpoints 79 | 80 | # IPython 81 | profile_default/ 82 | ipython_config.py 83 | 84 | # pyenv 85 | .python-version 86 | 87 | # pipenv 88 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 89 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 90 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 91 | # install all needed dependencies. 92 | #Pipfile.lock 93 | 94 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow 95 | __pypackages__/ 96 | 97 | # Celery stuff 98 | celerybeat-schedule 99 | celerybeat.pid 100 | 101 | # SageMath parsed files 102 | *.sage.py 103 | 104 | # Environments 105 | .env 106 | .venv 107 | env/ 108 | venv/ 109 | ENV/ 110 | env.bak/ 111 | venv.bak/ 112 | 113 | # Spyder project settings 114 | .spyderproject 115 | .spyproject 116 | 117 | # Rope project settings 118 | .ropeproject 119 | 120 | # mkdocs documentation 121 | /site 122 | 123 | # mypy 124 | .mypy_cache/ 125 | .dmypy.json 126 | dmypy.json 127 | 128 | # Pyre type checker 129 | .pyre/ 130 | -------------------------------------------------------------------------------- /tools/Azure/azure_cloud.py: -------------------------------------------------------------------------------- 1 | import azure.cognitiveservices.speech as speechsdk 2 | from config import Config as config 3 | import os 4 | from pathlib import Path 5 | import tqdm 6 | import glob 7 | 8 | import pandas as pd 9 | 10 | def pass_through_files(speech_config=None): 11 | ''' 12 | Realiza a análise de todos os arquivos no diretório desejado 13 | para a transcrição. 
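    Assumes config.base_dir points to a directory tree containing .wav files,
    which are discovered recursively below via glob('**/*.wav').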
14 |     '''
15 | 
16 |     transcribed_texts = []
17 |     file_names = []
18 | 
19 |     for filepath in tqdm.tqdm(sorted(glob.glob(config.base_dir + '/**/*.wav', recursive=True))):
20 |         transcription = run_transcription(filepath, speech_config)
21 | 
22 |         transcribed_texts.append(transcription)
23 |         file_names.append(filepath.split('/')[-1])
24 | 
25 | 
26 |     return transcribed_texts, file_names
27 | 
28 | 
29 | def run_transcription(filepath='./', speech_config=None):
30 |     '''
31 |     Transcribes a single audio file asynchronously.
32 |     '''
33 | 
34 |     audio_input = speechsdk.AudioConfig(filename=filepath)
35 |     speech_recognizer = speechsdk.SpeechRecognizer(speech_config=speech_config,
36 |                                                    audio_config=audio_input,
37 |                                                    language="pt-BR")
38 | 
39 |     result_future = speech_recognizer.recognize_once_async()
40 | 
41 |     # Retrieve the recognition result. This blocks until recognition is complete.
42 |     result = result_future.get()
43 | 
44 |     # Check the result
45 |     if result.reason == speechsdk.ResultReason.RecognizedSpeech:
46 |         # print(result.text)
47 |         return result.text
48 |     else:
49 |         return ''
50 |     # elif result.reason == speechsdk.ResultReason.NoMatch:
51 |     #     print("No speech could be recognized: {}".format(result.no_match_details))
52 |     # elif result.reason == speechsdk.ResultReason.Canceled:
53 |     #     cancellation_details = result.cancellation_details
54 |     #     print("Speech Recognition canceled: {}".format(cancellation_details.reason))
55 |     #     if cancellation_details.reason == speechsdk.CancellationReason.Error:
56 |     #         print("Error details: {}".format(cancellation_details.error_details))
57 | 
58 | def make_metadata(file_names, transcribed_texts):
59 |     """
60 |     Creates a csv file with the transcribed texts.
61 |     """
62 | 
63 |     os.makedirs(config.output_path, exist_ok=True)
64 | 
65 |     df = pd.DataFrame()
66 | 
67 |     for file_name, text in zip(file_names, transcribed_texts):
68 |         df = df.append({'A': file_name, 'B' : text}, ignore_index=True)
69 | 
70 |     df.to_csv(os.path.join(config.output_path, config.output_name.lower() + '_transcribed_azure' + '.csv'), sep='|', index=False, header=False, quotechar="'")
71 | 
72 | def main():
73 | 
74 |     speech_config = speechsdk.SpeechConfig(subscription=config.speech_key, region=config.service_region)
75 |     transcribed_texts, file_names = pass_through_files(speech_config)
76 |     make_metadata(file_names, transcribed_texts)
77 | 
78 | 
79 | if __name__ == '__main__':
80 |     main()
--------------------------------------------------------------------------------
/selection.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
5 | #
6 | #
7 | import argparse
8 | import sys
9 | from os import remove
10 | from os.path import basename, join, split
11 | 
12 | 
13 | def select(input_csv_file, output_filepath, min_similarity, force):
14 |     """
15 |     Given a csv file, selects only files with similarity greater than min_similarity and deletes the others.
16 | 
17 |     Parameters:
18 |         input_csv_file (str): Input csv filepath following the template: "filename| subtitle | transcript | similarity"
19 |         output_filepath (str): Output csv filepath following the template: "filename| subtitle | transcript | similarity"
20 |         min_similarity (float): Threshold that defines which files will be excluded.
21 |         force (boolean): if True, it will remove the files, otherwise only show what files will be removed.
22 | 
23 |     Returns:
24 |         Boolean: returns True or False.
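
        Example (hypothetical rows): with min_similarity=0.90, the row
        "a.wav|ola mundo|ola mundo|0.95" is kept in the output csv, while
        "b.wav|bom dia|boa noite|0.40" has its wav deleted (only listed when force is False).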
25 | """ 26 | 27 | try: 28 | f = open(input_csv_file) 29 | content_file = f.readlines()[1:] 30 | 31 | except IOError: 32 | print("Error: File {} does not appear to exist.".format(input_csv_file)) 33 | return False 34 | 35 | else: 36 | f.close() 37 | 38 | try: 39 | separator = '|' 40 | output_file = open(output_filepath, 'w') 41 | header = separator.join(['filename', 'subtitle', 'transcript', 'similarity']) + '\n' 42 | output_file.write(header) 43 | 44 | for line in content_file: 45 | filepath, text1, text2, similarity = line.split(separator) 46 | 47 | # Selects only files with similarity greater than min_similarity 48 | if float(similarity) >= float(min_similarity): 49 | filename = basename(filepath) 50 | line = separator.join([filename.strip(), text1.strip(), text2.strip(), str(similarity).strip()]) 51 | output_file.write(line + '\n') 52 | 53 | # otherwise, delete the file. 54 | else: 55 | if force: 56 | remove(filepath) 57 | else: 58 | print('rm {}'.format(filepath)) 59 | 60 | output_file.close() 61 | 62 | except KeyboardInterrupt: 63 | print("KeyboardInterrupt Detected!") 64 | exit() 65 | 66 | except: 67 | exc_type, exc_obj, exc_tb = sys.exc_info() 68 | exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1] 69 | print(exc_type, exc_file, exc_tb.tb_lineno) 70 | return False 71 | 72 | return True 73 | 74 | 75 | def main(): 76 | parser = argparse.ArgumentParser() 77 | parser.add_argument('--base_dir', default='./') 78 | parser.add_argument('--csv_file', default='validation.csv', help='Name of csv file') 79 | parser.add_argument('--min_value', default=0.90, help='Minimal value of levenshtein distance') 80 | parser.add_argument('--save_file', default='metadata.csv') 81 | parser.add_argument('--force', action='store_true', default=False) 82 | args = parser.parse_args() 83 | 84 | input_csv_file = join(args.base_dir, args.csv_file) 85 | output_filepath = join(args.base_dir, args.save_file) 86 | 87 | select(input_csv_file, output_filepath, args.min_value, args.force) 88 | 89 | 90 | if __name__ == "__main__": 91 | main() 92 | -------------------------------------------------------------------------------- /utils/exclude_unecessary_files.py: -------------------------------------------------------------------------------- 1 | import argparse 2 | import glob 3 | import os 4 | import tqdm 5 | 6 | def exclude_files(args): 7 | 8 | total = 0 9 | for folder in tqdm.tqdm(sorted(glob.glob(args.base_dir + '/*/*'))): 10 | 11 | if not os.path.exists(folder) or not os.path.isdir(folder): 12 | continue 13 | 14 | old_metadata_file = os.path.join(folder, 'save.csv') 15 | metadata_file = os.path.join(folder, 'metadata.csv') 16 | 17 | if os.path.exists(metadata_file): 18 | continue 19 | 20 | if not os.path.exists(old_metadata_file): 21 | print('Verify the folder: ' + folder) 22 | continue 23 | #exit() 24 | 25 | folder_name = folder.split('/')[-1] 26 | json_file = os.path.join(folder, folder_name + '.json') 27 | if not os.path.isfile(json_file): 28 | print(json_file) 29 | continue 30 | srt_file = os.path.join(folder, folder_name + '.srt') 31 | if not os.path.isfile(srt_file): 32 | print(srt_file) 33 | continue 34 | txt_file = os.path.join(folder, folder_name + '.txt') 35 | if not os.path.isfile(txt_file): 36 | print(txt_file) 37 | continue 38 | subtitles_file = os.path.join(folder, 'subtitles.csv') 39 | if not os.path.isfile(subtitles_file): 40 | print(subtitles_file) 41 | continue 42 | transcript_file = os.path.join(folder, 'transcript.csv') 43 | if not os.path.isfile(transcript_file): 44 | print(transcript_file) 
45 |                 continue
46 |             validation_file = os.path.join(folder, 'validation.csv')
47 |             if not os.path.isfile(validation_file):
48 |                 print(validation_file)
49 |                 continue
50 |             delete_file = os.path.join(folder, 'delete.csv')
51 |             if not os.path.isfile(delete_file):
52 |                 print(delete_file)
53 |                 continue
54 | 
55 |             try:
56 |                 f = open(old_metadata_file)
57 |                 content_file = f.readlines()[1:]
58 |             except IOError:
59 |                 print("Error: File {} does not appear to exist.".format(old_metadata_file))
60 |                 return False
61 |             else:
62 |                 f.close()
63 | 
64 |             if (len(content_file) == len(os.listdir(os.path.join(folder, args.wav_folder)))):
65 |                 total +=1
66 |                 if not args.force:
67 |                     print('mv ' + old_metadata_file + ' ' + metadata_file)
68 |                 else:
69 |                     os.remove(json_file)
70 |                     os.remove(srt_file)
71 |                     os.remove(txt_file)
72 |                     os.remove(subtitles_file)
73 |                     os.remove(transcript_file)
74 |                     os.remove(validation_file)
75 |                     os.remove(delete_file)
76 |                     os.rename(old_metadata_file, metadata_file)
77 | 
78 |             else:
79 |                 print('Found differences between ' + folder + ' and wavs.')
80 | 
81 | 
82 | 
83 | def main():
84 |     parser = argparse.ArgumentParser()
85 |     parser.add_argument('--base_dir', default='./output/channel/')
86 |     parser.add_argument('--wav_folder', default='wavs', help='Name of wavs folder')
87 |     parser.add_argument('--force', action='store_true', default=False)
88 |     args = parser.parse_args()
89 |     exclude_files(args)
90 | 
91 | if __name__ == "__main__":
92 |     main()
--------------------------------------------------------------------------------
/search.py:
--------------------------------------------------------------------------------
1 | #!/usr/bin/env python
2 | # -*- coding: utf-8 -*-
3 | #
4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
5 | #
6 | #
7 | from config import Config
8 | import argparse
9 | from os import makedirs
10 | from os.path import join, exists, split
11 | from googleapiclient.discovery import build
12 | from tqdm import tqdm
13 | import sys
14 | 
15 | 
16 | def get_videos(youtube, conv_id):
17 |     """
18 |     Get all videos from youtube channel/playlist.
19 | 
20 |     Parameters:
21 |         youtube (str): googleapiclient object.
22 |         conv_id (str): google channel/playlist id.
23 | 
24 |     Returns:
25 |         videos (list): list of video items.
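
    Note: playlistItems.list returns at most 50 items per request, so the loop
    below follows nextPageToken until the whole playlist has been paged through.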
26 | """ 27 | 28 | videos = [] 29 | next_page_token = None 30 | 31 | while True: 32 | res = youtube.playlistItems().list(playlistId = conv_id, 33 | part = 'snippet', 34 | maxResults = 50, 35 | pageToken = next_page_token).execute() 36 | 37 | videos += res['items'] 38 | next_page_token = res.get('nextPageToken') 39 | 40 | if next_page_token is None: 41 | break 42 | 43 | return videos 44 | 45 | 46 | def search_videos(api_key, content_id, output_folderpath, output_result_file): 47 | """ 48 | Search all the videos from a channel 49 | 50 | Parameters: 51 | api_key (str): Google developer Key 52 | content_id (str): Playlist or Channel id 53 | output_folderpath (str): folder 54 | output_result_file (str): output file to save youtube videos list 55 | 56 | Returns: 57 | file_path: returns 58 | """ 59 | youtube_prefix = 'https://www.youtube.com/watch?v=' 60 | 61 | api_service_name = 'youtube' 62 | api_version = 'v3' 63 | 64 | #print('Searching videos from {} - {}...'.format(Config.orig_base, content_id)) 65 | path_dest = join(output_folderpath, Config.orig_base, content_id ) 66 | 67 | if not(exists(path_dest)): 68 | makedirs(path_dest) 69 | 70 | output_filepath = join(path_dest, output_result_file) 71 | 72 | # Checks if it has already been downloaded 73 | if exists(output_filepath): 74 | return output_filepath 75 | 76 | try: 77 | # Open output file 78 | f = open(output_filepath, 'w+') 79 | 80 | youtube = build(api_service_name, api_version, developerKey = api_key) 81 | 82 | if Config.orig_base == 'playlist': 83 | conv_id = content_id 84 | elif Config.orig_base == 'channel': 85 | res = youtube.channels().list(id = content_id, 86 | part = 'contentDetails').execute() 87 | conv_id = res['items'][0]['contentDetails']['relatedPlaylists']['uploads'] 88 | else: 89 | conv_id = None 90 | 91 | # Get all videos from youtube channel/playlist. 
        videos = get_videos(youtube, conv_id)

        print('Writing video links to file...')
        for video in tqdm(videos):
            f.write(youtube_prefix + video['snippet']['resourceId']['videoId'] + '\n')

        print("Total videos: {0}".format(len(videos)))
        f.close()

    except KeyboardInterrupt:
        print("KeyboardInterrupt Detected!")
        exit()

    except:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, exc_file, exc_tb.tb_lineno)
        return False

    return output_filepath


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--api_key', default='')
    parser.add_argument('--content_id', default='')
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--dest_dir', default='output')
    parser.add_argument('--output_search_file', default='youtube_videos.txt')
    args = parser.parse_args()
    output_path = join(args.base_dir, args.dest_dir)
    search_videos(args.api_key, args.content_id, output_path, args.output_search_file)


if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/download.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import argparse
import sys
from os import makedirs
from os.path import join, exists, split
import time
import youtube_dl
from youtube_transcript_api import YouTubeTranscriptApi
from pathlib import Path
from urllib.parse import parse_qs, urlparse
from random import randint

def my_progress(d):
    '''
    Show download progress.
    '''
    if d['status'] == 'finished':
        print('Done downloading, now converting ...')


def download_audio_and_subtitles_from_youtube(yt_url, output_path): # function for ingesting when given a url
    '''
    Download audio and subtitle from a youtube video given a url.
    Parameters:
        yt_url (str): Youtube URL format https://www.youtube.com/watch?v=XXXXXXXXXXX
        output_path (str): folder to save the youtube audio.

    Returns:
        Boolean: returns True if the download succeeded, False otherwise.

    '''
    # Use vid as the directory name for download and processing
    vids = parse_qs(urlparse(yt_url).query, keep_blank_values=True).get('v')
    vid = None if vids is None else vids[0]

    video_dir = join(output_path, vid)

    # Filename for audio stream (.mp4) and subtitle (.srt) files
    audio = join(video_dir, vid + '.webm')
    subtitle = join(video_dir, vid + '.srt')

    if Path(audio).exists() and Path(subtitle).exists():
        return False

    if exists(audio.replace('.webm', '.mp3')) and exists(subtitle):
        return False

    # Get information on the YouTube content
    try:
        # Wait a random time to avoid youtube access blocking
        t = randint(30, 60)
        print('Waiting %d seconds ...' % (t))
        time.sleep(t)  # Overcome YouTube blocking

        if not (exists(video_dir)):
            makedirs(video_dir)

        ydl_opts = {
            'format': 'bestaudio/best',
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '320',
            }],
            'outtmpl': audio,
            'noplaylist' : True,
            'progress_hooks': [my_progress],
        }
        # Download audio stream and convert to mp3
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([yt_url])

        # get video_id from the youtube url
        video_id = yt_url.replace('https://www.youtube.com/watch?v=','')
        # Download subtitle and write to an .srt file
        transcript_list = YouTubeTranscriptApi.list_transcripts(video_id)

        # filter first for manually created transcripts and second for automatically generated ones
        transcript = transcript_list.find_transcript(['pt'])
        # get only the text from the transcript
        text_transcript_list = []
        for line in transcript.fetch():
            text_transcript_list.append(line['text'])
        text_transcript = ' '.join(text_transcript_list)

        # Write transcript to file
        output_file = open(subtitle, 'w')
        output_file.write(text_transcript)
        output_file.close()

    except KeyboardInterrupt:
        print("KeyboardInterrupt Detected!")
        exit()

    except:
        exc_type, exc_obj, exc_tb = sys.exc_info()
        exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
        print(exc_type, exc_file, exc_tb.tb_lineno)
        return False

    return True

def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--youtube_url', help="URL of the youtube video.")
    parser.add_argument('--output_dir', default='data', help='Directory to save downloaded audio and transcript files.')

    args = parser.parse_args()

    if args.youtube_url.startswith('https://'):
        download_audio_and_subtitles_from_youtube(args.youtube_url, args.output_dir)

    else:
        print("URL of the video file should start with https://")
        sys.exit(1)

if __name__ == '__main__':
    main()
--------------------------------------------------------------------------------
/transcribe.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import argparse
import sys
from os import makedirs
from os.path import join, exists, basename, split
from glob import glob
from tqdm import tqdm
import librosa
import requests
import soundfile as sf
import json


def convert_audios_samplerate(input_path, output_path, new_sample_rate):
    """
    Converts all audio files within a folder to a new sample rate.
    Parameters:
        input_path: input folder path with wav files.
        output_path: output folder path to save the converted wav files.
        new_sample_rate: target sample rate.

    Returns:
        Boolean: True or False.
    """

    if not(exists(output_path)):
        makedirs(output_path)

    for wavfile_path in tqdm(sorted(glob(input_path + "/*.wav"))):
        try:
            filename = basename(wavfile_path)
            data, sample_rate = librosa.load(wavfile_path)
            data = data.T
            new_data = librosa.resample(data, sample_rate, new_sample_rate)
            output_file = join(output_path, filename)
            sf.write(output_file, new_data, new_sample_rate)
        except:
            print('Error converting ' + wavfile_path)
            return False

    return True


def get_transcript(wavefile_path):
    """
    Custom function to access an STT service. You must adapt it to use your contracted STT service.
    Parameters:
        wavefile_path: wav filepath which will be transcribed.

    Returns:
        Text (str): Transcription of the wav file.
    """
    with open(wavefile_path, 'rb') as file_data:
        headers_raw = {
            'Content-Type': "application/x-www-form-urlencoded",
            'endpointer.enabled': "true",
            'endpointer.waitEnd': "5000",
            'endpointer.levelThreshold': "5",
            'decoder.confidenceThreshold': "10",
            'decoder.maxSentences': "1",
            'decoder.wordDetails': "0",
        }
        try:
            res = requests.post(url='https://your_url_here',
                                data=file_data,
                                headers=headers_raw)

            res.encoding = 'utf-8'
        except KeyboardInterrupt:
            print("KeyboardInterrupt Detected!")
            exit()
        except:
            #json_data=[{"message": "ERROR NO SPEECH"}]
            #return json_data
            return False
        return res.text


def transcribe_audios(input_path, output_file):
    """
    Iterate over the wav files inside a folder and transcribe them all.
    Parameters:
        input_path: input wavs folder.
        output_file: output file to save the transcriptions following the template: "filename|transcription"

    Returns:
        Boolean: True or False.
    """

    out = open(output_file, 'w')

    for wavfile_path in tqdm(sorted(glob(input_path + "/*.wav"))):
        filename = basename(wavfile_path)
        # Up to four attempts in case a connection error occurs.
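        # A False return from get_transcript() (e.g. connection refused) and a
        # successfully parsed response both break out of the retry loop at once;
        # only a malformed JSON payload falls through to another attempt. The
        # for-else below assigns an empty text when all four attempts fail.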
        for attempts in range(4):

            if attempts != 0:
                print('Attempt - {}...'.format(attempts))

            transcript = get_transcript(wavfile_path)
            if not transcript:
                text = ''
                break

            try:
                transcript_json = json.loads(str(transcript).replace("'", '"'))
                if transcript_json[0]['result_status'] == 'RECOGNIZED':
                    text = transcript_json[0]['alternatives'][0]['text']
                    break
                else:
                    #print("Erro")
                    text = ''
                    break
            except:
                exc_type, exc_obj, exc_tb = sys.exc_info()
                exc_file = split(exc_tb.tb_frame.f_code.co_filename)[1]
                print("Transcribing error: ")
                print(exc_type, exc_file, exc_tb.tb_lineno)

        else:
            text = ''

        out.write("{}|{}\n".format(str(filename), str(text)))

    out.close()
    return True


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--transcription_file', default='transcript.txt', help='Filename to save the transcripts')
    parser.add_argument('--input_dir', default='wavs', help='Directory of wav files')
    parser.add_argument('--temp_dir', default='wavs_16k', help='Directory to save wav files with sample rate (16k)')
    parser.add_argument('--new_sample_rate', default=16000, help='Sample rate used by the transcription api.')

    args = parser.parse_args()

    input_path = join(args.base_dir, args.input_dir)
    converted_wavs_temp_path = join(args.base_dir, args.temp_dir)
    output_file = join(args.base_dir, args.transcription_file)

    # Convert audio sample rate
    print('Converting wav files...')
    convert_audios_samplerate(input_path, converted_wavs_temp_path, args.new_sample_rate)

    # Transcribe all wavs files
    print('Transcribing...')
    transcribe_audios(converted_wavs_temp_path, output_file)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/validation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import argparse
from os import makedirs
from os.path import join, exists, dirname
from textdistance import levenshtein
from tqdm import tqdm


def remove_punctuations(sentence):
    """
    Removes punctuations and unwanted characters from a sentence.
    """
    punctuations = '''—!()-[]{};:'"\,<>./?@#$%^&*_~'''
    sentence_with_no_punct = ""
    for char in sentence:
        if char not in punctuations:
            sentence_with_no_punct = sentence_with_no_punct + char
    return sentence_with_no_punct.strip()


def clear_sentences(sentence):
    """
    Converts the sentence to lowercase and removes unwanted characters.
    """
    sentence = sentence.lower()
    clean_sentence = remove_punctuations(sentence)
    return clean_sentence


def create_validation_file(input_file1, input_file2, prefix_filepath, output_file):
    """
    Given two files containing different transcriptions of audio files, this function calculates the similarity (levenshtein distance) between the sentences,
    saving the result in a third file.

    Parameters:
        input_file1 (str): First filepath. The contents of the file must follow the template: "filename | text"
        input_file2 (str): Second filepath. The contents of the file must follow the template: "filename | text"
        prefix_filepath: Prefix to be added to the file path within the output file.

    Returns:
        output_file (str): Returns the output filepath. The content of the file follows the template: prefix_filepath/filename | text1 | text2 | similarity
    """

    # Loads the contents of the first input file
    try:
        with open(input_file1) as f:
            content_file1 = f.readlines()

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: File {} does not appear to exist.".format(input_file1))
        return False

    # Loads the contents of the second input file
    try:
        with open(input_file2) as g:
            content_file2 = g.readlines()

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: File {} does not appear to exist.".format(input_file2))
        return False

    # Both files must be the same length, otherwise there is an error.
    if not (len(content_file1) == len(content_file2)):
        print("Error: length of file {} is not equal to file {}.".format(input_file1, input_file2))
        return False

    # Checks if the output folder exists
    output_folderpath = dirname(output_file)

    if not(exists(output_folderpath)):
        makedirs(output_folderpath)

    # Saves the result to the output file.
    try:
        o_file = open(output_file, 'w')

    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()

    except IOError:
        print("Error: problem creating file {}.".format(output_file))
        return False

    # Iterate over the two files' content simultaneously to calculate the similarity between the sentences.
    else:
        separator = '|'
        header = separator.join(['filename', 'subtitle', 'transcript', 'similarity'])
        o_file.write(header + '\n')

        # Input files must be csv files with the character "|" as a separator: filename | text
        for line1, line2 in tqdm(zip(content_file1, content_file2), total=len(content_file1)):

            file1, text1 = line1.split('|')
            file2, text2 = line2.split('|')

            # Clears the sentences by removing unwanted characters.
            clean_text1 = clear_sentences(text1)
            clean_text2 = clear_sentences(text2)
            filepath = join(prefix_filepath, file1)

            # Calculates the levenshtein distance to define the normalized similarity (0-1) between the two sentences.
            l = levenshtein.normalized_similarity(clean_text1, clean_text2)

            # Defines the output content and writes it to the file.
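            # Example row (illustrative values): "wavs/xyz-0001.wav|ola mundo|ola mundo|1.0".
            # With textdistance, normalized_similarity = 1 - distance/max(len), so e.g.
            # levenshtein.normalized_similarity('casa', 'caza') == 0.75 (one edit over length 4).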
            line = separator.join([filepath, text1.strip(), text2.strip(), str(l)])
            o_file.write(line + '\n')

    finally:
        o_file.close()

    return True


def main():

    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--input_file1', default='metadata1.csv', help='Input first filename')
    parser.add_argument('--input_file2', default='metadata2.csv', help='Input second filename')
    parser.add_argument('--prefix', default='', help='Prefix to filename on metadata output file.')
    parser.add_argument('--output_dir', default='output', help='Directory to save distances')
    parser.add_argument('--output_file', default='validation.csv', help='Output file with the template: "filename, text1, text2, similarity"')

    args = parser.parse_args()

    input_path_file1 = join(args.base_dir, args.input_file1)
    input_path_file2 = join(args.base_dir, args.input_file2)
    output_path_file = join(args.base_dir, args.output_dir, args.output_file)

    create_validation_file(input_path_file1, input_path_file2, args.prefix, output_path_file)


if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/audio_segmentation.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
# Adapted from https://gist.github.com/keithito/771cfc1a1ab69d1957914e377e65b6bd from Keith Ito: kito@kito.us
#
import argparse
import os
import json
from pydub import AudioSegment


class Segment:
    """
    Linked list of segments
    """
    def __init__(self, begin, end, text):
        self.begin = begin
        self.end = end
        self.text = text
        self.next = None
        self.filename = None
        self.gap = 0  # gap between segments (current and next)

    def set_next(self, next):
        self.next = next
        self.gap = next.begin - self.end

    def set_filename_and_id(self, filename, id):
        self.filename = filename
        self.id = id

    def merge_from(self, next):
        # merge two segments (current and next)
        self.next = next.next
        self.gap = next.gap
        self.end = next.end

    def duration(self, sample_rate):
        # begin/end are stored in milliseconds in this project
        return (self.end - self.begin - 1) / sample_rate


def create_segments_list_from_aeneas_json(json_path):
    """
    Creates a list of segments from the json file resulting from aeneas processing.
    """

    head = None
    with open(json_path) as jfile:
        data = json.load(jfile)
        for i, fragment in enumerate(data['fragments']):
            text = fragment['lines']
            begin = float(fragment['begin'])*1000
            end = float(fragment['end'])*1000

            # Build a segment list
            segment = Segment(begin, end, text)
            if head is None:
                head = segment
            else:
                prev.set_next(segment)
            prev = segment

    return head


def create_audio_files_from_segments_list(audio_file, filenames_base, head_list, output_dir):
    """
    Segments an audio file from a segment list, saving the files in a folder.
    Parameters:
        audio_file (str): filepath of the source audio file.
        filenames_base (str): Filename prefix of the segmented audio files.
        head_list (Segment): Reference to the head of the linked list of segments.
        output_dir (str): Folder to save the segmented audio files.

    Returns:
        Boolean: returns True or False
    """

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    sound = AudioSegment.from_file(audio_file)
    curr = head_list
    i = 1
    while curr is not None:
        begin = curr.begin
        end = curr.end
        text = curr.text
        audio_segment = sound[begin:end]
        filename = '{}-{:04d}.wav'.format(filenames_base, i)
        curr.set_filename_and_id(filename, i)
        filepath = os.path.join(output_dir, filename)
        try:
            audio_segment.export(filepath, 'wav')
        except IOError:
            print("Error: problem writing audio file {}.".format(filepath))
            return False
        else:
            curr = curr.next
            i += 1
    return True


def create_metadata_from_segments_list(head_list, output_file):
    """
    Creates a csv file following the template: "filename | text"
    Parameters:
        head_list (Segment): Reference to the head of the linked list of segments.
        output_file (str): csv output filename.

    Returns:
        Boolean: returns True or False
    """
    separator = '|'
    curr = head_list
    try:
        f = open(output_file, "w")
        while curr is not None:
            text = curr.text
            filename = curr.filename  # segment filenames already carry the .wav extension
            f.write(filename + separator + text[0] + '\n')
            curr = curr.next
        f.close()
    except IOError:
        print("Error: problem creating file {}.".format(output_file))
        return False
    return True


def segment_audio(audio_path, json_path, output_path, metadata_output_file, filename_base):
    """
    Performs the segmentation of the audio file and the creation of the csv file.
    Parameters:
        audio_path (str): filepath of the source audio file.
        json_path (str): json file resulting from aeneas processing.
        output_path (str): Folder to save the segmented audio files.
        metadata_output_file (str): csv output filename.
        filename_base (str): Filename prefix of the segmented audio files.

    Returns:
        Boolean: returns True or False
    """
    segments_list = create_segments_list_from_aeneas_json(json_path)
    if not create_audio_files_from_segments_list(audio_path, filename_base, segments_list, output_path):
        return False
    if not create_metadata_from_segments_list(segments_list, metadata_output_file):
        return False
    return True


def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('--base_dir', default='./')
    parser.add_argument('--audio_file', default='audio.mp3', help='Filename of the input audio file')
    parser.add_argument('--filename_base', default='audio', help='Filename base of the split audio files, e.g. audio-0001.wav')
    parser.add_argument('--json_file', default='output.json', help='Filename of the input json file')
    parser.add_argument('--output_dir', default='output', help='Output dir')
    parser.add_argument('--metadata_file', default='metadata.csv', help='Filename of the metadata output file')
    args = parser.parse_args()

    audio_path = os.path.join(args.base_dir, args.audio_file)
    json_path = os.path.join(args.base_dir, args.json_file)
    output_dir = os.path.join(args.base_dir, args.output_dir)
    metadata_output_file = os.path.join(args.base_dir, args.output_dir, args.metadata_file)

    segment_audio(audio_path, json_path, output_dir, metadata_output_file, args.filename_base)

if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------
/utils/number_to_text.py:
--------------------------------------------------------------------------------
import unicodedata
from math import ceil, floor
import re

ordinals_numbers = {
    '1º':"primeiro", '1ª':"primeira",
    '2º':"segundo", '2ª':"segunda",
    '3º':"terceiro", '3ª':"terceira",
    '4º':"quarto", '4ª':"quarta",
    '5º':"quinto", '5ª':"quinta",
    '6º':"sexto", '6ª':"sexta",
    '7º':"sétimo", '7ª':"sétima",
    '8º':"oitavo", '8ª':"oitava",
    '9º':"nono", '9ª':"nona",
    '10º':"décimo", '10ª':"décima",
    '11º':"décimo primeiro", '11ª':"décima primeira",
    '12º':"décimo segundo", '12ª':"décima segunda",
    '13º':"décimo terceiro", '13ª':"décima terceira",
    '14º':"décimo quarto", '14ª':"décima quarta",
    '15º':"décimo quinto", '15ª':"décima quinta",
    '16º':"décimo sexto", '16ª':"décima sexta",
    '17º':"décimo sétimo", '17ª':"décima sétima",
    '18º':"décimo oitavo", '18ª':"décima oitava",
    '19º':"décimo nono", '19ª':"décima nona",
    '20º':"vigésimo", '20ª':"vigésima",
    '21º':"vigésimo primeiro", '21ª':"vigésima primeira",
    '22º':"vigésimo segundo", '22ª':"vigésima segunda",
    '26º':"vigésimo sexto", '26ª':"vigésima sexta",
    '30º':"trigésimo", '30ª':"trigésima",
    '60º':"sexagésimo", '60ª':"sexagésima",
    '89º':"octogésimo nono",
    '90º':"nonagésimo", '90ª':"nonagésima",
    'nº':"número"
}


class Palavra:
    # Holds the singular and plural forms of a number word.

    def __init__(self, singular, plural):
        self.singular = singular
        self.plural = plural

class Extenso:
    # Writes integers out in full in Brazilian Portuguese.

    def __init__(self):

        self._numero_maximo = 999999999999999999999999999999999999999999999

        # Dictionaries storing the numbers written out in full
        self.unidades = {1: 'um', 2: 'dois', 3: 'três', 4: 'quatro', 5: 'cinco', 6: 'seis', 7: 'sete', 8: 'oito', 9: 'nove', 10: 'dez',
                         11: 'onze', 12: 'doze', 13: 'treze', 14: 'quatorze', 15: 'quinze', 16: 'dezesseis', 17: 'dezessete', 18: 'dezoito', 19: 'dezenove'}

        self.dezenas = {2: 'vinte', 3: 'trinta', 4: 'quarenta', 5: 'cinquenta', 6: 'sessenta', 7: 'setenta', 8: 'oitenta', 9: 'noventa'}

        self.centenas = {1: Palavra('cem', 'cento'), 2: 'duzentos', 3: 'trezentos', 4: 'quatrocentos', 5: 'quinhentos', 6: 'seiscentos', 7: 'setecentos', 8: 'oitocentos', 9: 'novecentos'}

        # Tuple storing the scale words (thousands, millions, ...)
        self.milhares = (Palavra('', ''), Palavra('mil', 'mil'), Palavra('milhão', 'milhões'),
                         Palavra('bilhão', 'bilhões'), Palavra('trilhão', 'trilhões'), Palavra('quatrilhão', 'quatrilhões'),
                         Palavra('quintilhão', 'quintilhões'), Palavra('sextilhão', 'sextilhões'),
                         Palavra('septilhão', 'septilhões'),
                         Palavra('octilhão', 'octilhões'), Palavra('nonilhão', 'nonilhões'), Palavra('decilhão', 'decilhões'),
                         Palavra('undecilhão', 'undecilhões'), Palavra('duodecilhão', 'duodecilhões'), Palavra('tredecilhão', 'tredecilhões'))


    def escrever(self, numero):
        if (numero > self._numero_maximo):
            raise Exception('Input number is larger than the maximum supported number')
        if (numero == 0):
            return 'zero'
        extenso = ''

        # Convert the received number to a string
        numero_string = str(numero)
        # Get the length of the input number
        tamanho = len(numero_string)

        # Round up to find how many groups of three digits there are
        ternarios = ceil(tamanho / 3)

        # Left-pad the number string with zeros up to a length divisible by 3
        numero_string = numero_string.zfill(ternarios * 3)

        # Iterate over the groups of three digits
        for n in range(1, ternarios + 1):
            # Get the part of the number corresponding to the current group
            parte_numero = int(numero_string[(n - 1) * 3 : n * 3])

            # A group of zeros needs no handling
            if parte_numero == 0:
                continue

            # Compute the hundreds digit
            centena = floor(parte_numero / 100)

            # Compute the tens digit
            dezena = floor((parte_numero - (centena*100)) / 10)

            # Compute the units digit
            unidade = parte_numero - (centena*100) - (dezena*10)

            # Handle the hundreds digit, if present
            if (centena > 0):
                if (dezena == 0 and unidade == 0 and extenso != ''):
                    extenso += ' e '
                elif extenso != '':
                    extenso += ', '
                if (centena == 1):  # "cem" uses the singular; with tens or units present, use the plural ("cento")
                    if(dezena > 0 or unidade > 0):
                        extenso += self.centenas[centena].plural
                    else:
                        extenso += self.centenas[centena].singular
                else:
                    extenso += self.centenas[centena]  # For hundreds greater than one, look up the corresponding string in the dictionary

            # Handle the tens digit, if present
            if (dezena > 0):
                if (extenso != ''):  # If the written-out number already has content, add "e"
                    extenso += ' e '

                if (dezena == 1):  # A tens digit of one (10-19) is looked up in the units dictionary
                    dezena = 10 + unidade
                    unidade = 0  # so that the units handling below is skipped
                    extenso += self.unidades[dezena]  # Look up 10-19 in the units dictionary
                else:
                    extenso += self.dezenas[dezena]  # A tens digit greater than one is looked up in the tens dictionary

            # Handle the units digit, if present
            if (unidade > 0):
                if (extenso != ''):  # If hundreds or tens are present, add "e"
                    extenso += ' e '
                extenso += self.unidades[unidade]  # Look up the written-out form in the units dictionary

            # Handle the scale words (thousands, millions, ...)
            if n < ternarios:  # If this is not the last group, append the corresponding scale word
                if (parte_numero > 1):
                    extenso += f' {self.milhares[ternarios - n].plural}'  # Greater than one: plural form
                else:
                    extenso += f' {self.milhares[ternarios - n].singular}'  # Exactly one: singular form
        return extenso.replace('um mil,', 'mil')

def number_to_text(text):
    """
    Given a text, it replaces the numbers (cardinals and ordinals) found with their written-out version.
    """
    ex = Extenso()

    words = re.split(r'([.,;!? ])', text)
    for word in words:
        if 'º' in word or 'ª' in word:
            if word in ordinals_numbers.keys():
                new_word = ordinals_numbers[word]
                text = text.replace(word, new_word)
            else:
                #raise ValueError('The ordinal number ' + word + ' is not in the ordinals_numbers list; fix this!')
                print('The ordinal number "' + word + '" is not in the ordinals_numbers list; fix this!')

        if word.isdigit():
            new_word = ex.escrever(int(word))
            text = text.replace(word, new_word)
    return text
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
# KATube - KATube Audio dataset creator from youTube

KATube is a tool to automate the process of creating datasets for training Text-To-Speech (TTS) and Speech-To-Text (STT) models. It is based on the work of Pansori [https://arxiv.org/abs/1812.09798].


From a list of YouTube playlists or YouTube channels, KATube downloads all audios with their respective subtitles and performs audio-text alignment using the external tool [AENEAS](https://github.com/readbeyond/aeneas). From this alignment, KATube segments the audio according to the sentences created.

Finally, a validation step can be performed. For this, KATube must use an external STT (speech-to-text) tool (not available here). This validation calculates the similarity between the subtitle and the transcript, using the [Levenshtein distance](https://en.wikipedia.org/wiki/Levenshtein_distance). This step helps to guarantee that the subtitle and the transcript agree. KATube can also be configured to perform a final selection step, in which audios that do not reach a minimum similarity between the sentence and the transcript are discarded.

Use at your own risk.

![katube-process](imgs/katube-process.png)

## Search and Ingest Videos

In the first stage, KATube performs two procedures: search and download. The search is done by a function that uses the YouTube API, so you will need an [api key](https://developers.google.com/places/web-service/get-api-key) to use this functionality. The search function receives the id of a YouTube channel and the API key, and returns a list of all the videos available on that channel. This functionality is provided by the script named "search.py" and can be used separately. Execute the script using as input arguments the api_key of the Google account, the YouTube channel id (or playlist id), the output directory, and the output file name. For example:

```
$ python search.py --api_key=GOOGLE_DEVELOPER_API_KEY --content_id=CHANNEL_ID --dest_dir=OUTPUT_FOLDER --output_search_file=YOUTUBE_VIDEOS.txt
```

Then, a function is used to download the audio and subtitles of the videos as .mp3 and .srt files, respectively. This process can be time consuming, as it is necessary to wait a few seconds (a value between 30 and 60 seconds, set at random) to avoid having the IP blocked by the YouTube servers. This functionality is provided by the script named "download.py" and can be used separately. Execute the script using as input arguments the URL of the YouTube video and the destination directory. For example:

```
$ python download.py --youtube_url=https://www.youtube.com/watch?v=999999999 --output_dir=OUTPUT_FOLDER
```

Two files will be created: one .srt and the other .mp3.
The names of the files will be the same as the video ID, which is the code after `v=` in the URL (in the example above, 999999999).

## Cleaning and Normalization of the text

The subtitles contain segmented text and timing information which corresponds to the audio contents of the associated video. The timing information is discarded and the subtitle texts are joined. The text is cleaned, normalized and divided into sentences, according to the punctuation. The division into sentences will try to respect the limits defined by the previously defined minimum and maximum number of words. If it is not possible to respect these limits by segmenting the text at the punctuation, the text will be segmented in an arbitrary manner, regardless of the punctuation.

This functionality is provided by the script named "text_normalization.py" and can be used separately. Run the script using as input arguments the subtitles filepath, the minimum and maximum number of words per sentence, and the output filepath. For example:

```
$ python text_normalization.py --input_file=SUBTITLES.txt --min_words=10 --max_words=30 --output_file=CLEAN_AND_NORMALIZED_SUBTITLES.txt
```

## Align (Synchronization) Text-Audio

For alignment, the AENEAS tool is used, which receives an audio file and the clean and normalized text, divided into sentences. A json file will be produced, containing the begin and end time of each sentence in the text. The audio file must be in wav or mp3 format. The text is divided into sentences, one on each line of a txt file. For more information about how AENEAS operates, check the [official documentation](https://pypi.org/project/aeneas/).

This functionality is provided by the script named "synchronization.py" and can be used separately. Execute the script using as input arguments the audio filepath (mp3 or wav), the text filepath, which contains the segmented sentences, and the output filepath, in which the .json file resulting from the alignment produced by the AENEAS tool will be saved. For example:

```
$ python synchronization.py --audio_file=AUDIO_FILE.mp3 --text_file=CLEAN_AND_NORMALIZED_SUBTITLES.txt --output_file=SYNCHRONIZED_AUDIO_TEXT.json
```

## Audio Segmentation

This step receives the json file from the previous step and performs the segmentation of the audio file. This script is based on the script provided by [Keith Ito](https://keithito.com), who kindly provided it via email. In this step, a logical list of segments is first created, storing the filename and the start and end times. Then this list is traversed, splitting the original audio and saving each segment to disk.

This functionality is provided by the script named "audio_segmentation.py" and can be used separately. Run the script using as input arguments the path of the audio file (mp3 or wav) to be segmented, the json file from the previous step, the output directory, where the segmented files will be saved, and the path of the metadata file, which is a csv file that will contain the name of each segmented audio file and the corresponding text.
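At its core, this step just slices the audio at the millisecond marks produced by aeneas. A minimal sketch of that idea (illustrative only, not the project's script; the fragment values are made up):

```
from pydub import AudioSegment

# Hypothetical fragments parsed from the aeneas json: (begin_ms, end_ms, text)
fragments = [(0, 4200, "primeira frase"), (4200, 9100, "segunda frase")]

sound = AudioSegment.from_file("AUDIO_FILE.mp3")
for i, (begin, end, text) in enumerate(fragments, start=1):
    # Slicing an AudioSegment by milliseconds yields the audio between the two marks.
    sound[begin:end].export("audio-{:04d}.wav".format(i), format="wav")
```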
For example:

```
$ python audio_segmentation.py --audio_file=AUDIO_FILE.mp3 --json_file=SYNCHRONIZED_AUDIO_TEXT.json --output_dir=OUTPUT_FOLDER --metadata_file=METADATA.CSV
```

The output file will follow the template:

```
filename1 | text one
filename2 | text two
filename3 | text three
filename4 | text four
```

## Transcribe

A script template is provided to access an external STT API, if you have one available. You need to configure the link used to access the API. Adapt this script as needed. A sample rate conversion function is also available, in case it is necessary to convert the files before using the STT API. This functionality is provided by the script named "transcribe.py" and can be used separately. Run the script using as input arguments the input directory of wav files, the transcription output file, and the new sample rate, to which the wav files will be converted before being sent to the STT API. For example:

```
$ python transcribe.py --input_dir=WAVS_FOLDER --new_sample_rate=16000 --transcription_file=TRANSCRIPTS.CSV
```

Check the "tools" folder for examples of using STT APIs, such as Google, Azure and AWS.

## Validation

Although the audio and text data are force-aligned with each other, several problems can happen that degrade the results.
The text may be unclean or incorrect, the pronunciation may be erroneous, or the audio may be corrupted (for example, by ambient noise or poor recording quality).

KATube can validate the text of the sentence. To do this, you must have an external STT available (not provided here), such as AWS, Google or Azure. Some sample scripts are available in the "tools" folder. The external STT will generate a transcript of the segmented audio. You can then compare the sentence with the transcript using the Levenshtein distance, and thus have some guarantee that the audio really matches the text of the sentence.

This functionality is provided by the script named "validation.py" and can be used separately. Run the script using as input arguments the paths of two csv metadata files, the output directory and the path of the output file, which will contain the texts and the Levenshtein distance between them. For example:

```
$ python validation.py --input_file1=METADATA.CSV --input_file2=TRANSCRIPTS.CSV --output_dir=OUTPUT_FOLDER --output_file=VALIDATION.csv
```

## Selection

After validating the data, it is possible to select only those audios that have a minimum similarity between the transcription and the sentence. KATube can discard audios whose similarity value is less than a value you define (90% is a good start).

This functionality is provided by the script named "selection.py" and can be used separately. Run the script using as input arguments the path of the csv validation file from the previous step, the minimum similarity value (normalized Levenshtein distance), and the result output file. A safety parameter (--force) must be passed in order to effectively delete the files with a similarity below the threshold.
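In essence, this step filters the validation csv on its similarity column, optionally deleting the rejected wav files. A minimal sketch of the idea (illustrative only, not the project's script), assuming the "filename|subtitle|transcript|similarity" layout with a header line described above:

```
import csv
from os import remove

with open("VALIDATION.csv") as f, open("METADATA.csv", "w") as out:
    reader = csv.reader(f, delimiter="|")
    next(reader)  # skip the header line
    for filename, subtitle, transcript, similarity in reader:
        if float(similarity) >= 0.9:
            # Keep the row: the similarity is above the threshold.
            out.write("|".join([filename, subtitle, transcript, similarity]) + "\n")
        else:
            # This is what --force does: the rejected audio file is deleted.
            remove(filename)
```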
For example:

```
$ python selection.py --csv_file=VALIDATION.csv --min_value=0.9 --save_file=METADATA.csv --force
```

# Installation

## How to create a docker image

```sh
$ git clone https://github.com/freds0/katube
$ cd katube
$ docker build -t katube ./
$ sudo docker run --rm --net='host' -e LANG=C.UTF-8 -e LC_ALL=C.UTF-8 -v ~/:/root/ -w /root -it katube
```

If you prefer, use a conda environment:

```sh
$ conda create -n katube python=3.6 pip
$ conda activate katube
```

## Aeneas Installation

Install the requirements:

```sh
$ apt-get install ffmpeg espeak libespeak-dev wget git
$ wget https://raw.githubusercontent.com/readbeyond/aeneas/master/install_dependencies.sh
$ bash install_dependencies.sh
```

Install Aeneas:

```sh
$ git clone https://github.com/ReadBeyond/aeneas.git
$ cd aeneas
$ sudo pip install -r requirements.txt
$ python setup.py build_ext --inplace
$ python aeneas_check_setup.py
$ cd ..
$ pip install -e aeneas
```

## KATube Installation

Install the KATube requirements:

```sh
$ pip install -r requirements.txt
$ pip install git+https://github.com/freds0/pytube3
or
$ pip install git+https://github.com/swiftyy-mage/pytube3
```

# Configuration

First, create your Google api_key at:

[https://developers.google.com/places/web-service/get-api-key]

In the "config.py" file, set the api_key variable with your key:

```sh
api_key = 'put_your_google_id_here'
```

Second, in the "config.py" file, choose the source to download the audio data:

- playlist
- channel

If you choose a playlist, set the variable orig_base as follows in the config.py file:

```sh
orig_base = 'playlist' # ['channel', 'playlist']
```

Third, create a list containing the playlist or channel ids from YouTube. For example, to download all audios from a given playlist, configure the file "input/playlists_id.txt" as follows:

```sh
PLZoTAELRMXVPGU70ZGsckrMdr0FteeRUi
```

Check the settings in the "config.py" file.

# Execution

After the configuration, execute the command:

```
python main.py
```

and KATube will start generating the dataset.

# TODO

Try to use [Montreal Forced Alignment](https://montreal-forced-aligner.readthedocs.io/en/latest/).

# References:

- Pansori [sourcecode](https://github.com/yc9701/pansori)
- Pansori [paper](https://arxiv.org/abs/1812.09798)
- [KABooks](https://github.com/freds0/kabooks), our similar tool, used to create datasets from audiobooks.

# Thanks

- [Keith Ito](https://keithito.com)
--------------------------------------------------------------------------------
/text_normalization.py:
--------------------------------------------------------------------------------
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import re
import argparse
import unicodedata
from utils.number_to_text import number_to_text

vocab="abcdefghijklmnopqrstuvwxyzçãàáâêéíóôõúû\-0123456789,.;:!?' "
chars_map = {'ï': 'i', 'ù': 'ú', 'ö': 'o', 'î': 'i', 'ñ': 'n', 'ë': 'e', 'ì': 'í', 'ò': 'ó', 'ũ': 'u', 'ẽ': 'e', 'ü': 'u', 'è': 'é', 'æ': 'a', 'å': 'a'}


def get_number_of_words(sentence):
    """
    Count the number of words in a sentence.
    Parameters:
        sentence (str): text sentence.

    Returns:
        int: returns the sentence length.
    """
    sentence_length = len(sentence.split(' '))
    return sentence_length


def get_text_from_subtitle(input_file):
    """
    Extracts the text from a subtitle file.
    Parameters:
        input_file (str): input subtitles file (.srt).

    Returns:
        text (str): returns the text of the subtitles file.
    """
    # Read all lines from file
    try:
        file = open(input_file, "r")
        lines = file.readlines()
        file.close()

    except IOError:
        print("Error: Reading subtitle file {}.".format(input_file))
        return False

    # Declare an empty list variable
    line_list = []
    for line in lines:
        text = ''
        # Look for patterns and parse
        if re.search('^[0-9]+$', line) is None and re.search('^[0-9]{2}:[0-9]{2}:[0-9]{2}', line) is None and re.search('^$', line) is None:
            text += ' ' + line.strip('\n')
            line_list.append(text)

    # Finish with list.join() to bring everything together
    text = '\n'.join(line_list)
    return text


def merge_sentences(sentences, min_words):
    """
    Merge sentences that have a number of words less than min_words.
    Parameters:
        sentences (list): list of sentences.
        min_words (int): minimum quantity of words.

    Returns:
        sentences (list): returns the sentences list with lengths greater than min_words.
    """
    found_short_sentence = True
    while(found_short_sentence):
        found_short_sentence = False
        for index, sentence in enumerate(sentences):
            # Verify the number of words in the sentence
            if (len(sentence.split()) < min_words):
                found_short_sentence = True
                # Merge sentences
                sentences[index:index+2] = [' '.join(sentences[index:index+2])]
    # Removing blank items from list
    nonempty_sentences = list(filter(None, sentences))
    return nonempty_sentences


def tokenize_sentences_on_blank_space(text):
    """
    Divide a text into words, that is, create tokens, breaking it at the blank spaces.
    Parameters:
        text (str): normalized text.

    Returns:
        words (list): returns the list of words.
    """
    # Tokenize on blank spaces
    words = text.split(' ')
    return words


def tokenize_sentences_on_punctuation(text):
    """
    Creates sentences from a text, splitting it at the punctuation.
    Parameters:
        text (str): normalized text.

    Returns:
        sentences (list): returns the list of sentences split on punctuation.
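
    Example (illustrative):
        'uma frase. outra frase!' -> ['uma frase.', ' outra frase!']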
    """
    # Tokenize by punctuation
    # sentences = re.split(r'([.,!?:;])', text)  # Result example: ['Esta é uma frase', '.', 'Esta é outra frase', ',']
    sentences = re.split(r'([.;!?])', text)
    for index, sentence in enumerate(sentences[:-1]):
        sentence = sentence.strip()
        sentences[index:index+2] = [''.join(sentences[index:index+2])]  # Result example: ['Esta é uma frase.', 'Esta é outra frase,']
    # Removing blank items from list
    nonempty_sentences = list(filter(None, sentences))
    sentences = nonempty_sentences
    return sentences


def tokenize_sentences_on_special_words(text):
    """
    Creates sentences from a text, splitting it at special words of the Portuguese language.
    Parameters:
        text (str): normalized text.

    Returns:
        sentences (list): returns the list of sentences split on special words.
    """
    special_words = [' mas ', ' porém ', ' todavia ', ' contudo ', ' entretanto ', ' no entanto ', ' pois ', ' logo ', ' porque ', ' bem como ', ' por isso ', ' isto é ', ' visto que ', ' quando ', ' logo que ', ' desde que']
    # Tokenize by special words, using a single alternation pattern so that
    # every special word takes effect (not just the last one in the list)
    pattern = '|'.join(special_words)
    sentences = re.split(r'({})'.format(pattern), text)

    for index, sentence in enumerate(sentences[:-1]):
        sentences[index:index+2] = [''.join(sentences[index:index+2])]
    # Removing blank items from list
    nonempty_sentences = list(filter(None, sentences))
    sentences = nonempty_sentences
    return sentences


def get_size_of_biggest_sentence(sentences):
    """
    Given a list of sentences, it returns the length of the largest sentence.
    Parameters:
        sentences (list): sentences list.

    Returns:
        int: returns the length of the largest sentence.
    """
    max_length_sentence = 0
    for sentence in sentences:
        length_sentence = get_number_of_words(sentence)
        if length_sentence > max_length_sentence:
            max_length_sentence = length_sentence
    return max_length_sentence


def create_sentences_from_text(text, min_words, max_words):
    """
    Creates sentences from a text, taking into account the minimum and maximum number of words.
    Initially, it divides the text according to the punctuation, then it divides the larger sentences according to special words,
    and, finally, it divides the larger sentences into tokens.
    After the division, the tokens are concatenated until they are within the min and max limits.
    Parameters:
        text (str): normalized text.
        min_words (int): minimum number of words of each sentence.
        max_words (int): maximum number of words of each sentence.

    Returns:
        sentences (list): returns the sentences list.
    """
    # First: tokenize on punctuation
    sentences = tokenize_sentences_on_punctuation(text)

    # Verify the length of the sentences
    length_biggest_sentence = get_size_of_biggest_sentence(sentences)

    # Second: tokenize on special words
    if length_biggest_sentence > max_words:  # very long sentence
        sentences = tokenize_sentences_on_special_words(text)

    # Verify the length of the sentences
    length_biggest_sentence = get_size_of_biggest_sentence(sentences)

    # Third: tokenize on blank space
    if length_biggest_sentence > max_words:  # very long sentence
        sentences = tokenize_sentences_on_blank_space(text)

    # Concatenate small sentences
    sentences = iter(sentences)
    lines, current = [], next(sentences)
    for sentence in sentences:
        if get_number_of_words(current) > min_words:
            lines.append(current)
            current = sentence  # next
        # Concatenate sentences
        else:
            current += " " + sentence  # concatenate two sentences

    lines.append(current)
    nonempty_lines = list(filter(None, lines))
    return nonempty_lines


def remove_html_tags(text):
    """
    Remove html tags from a string using regular expressions.
    """
    clean = re.compile('<.*?>')
    return re.sub(clean, '', text)


def text_cleaning(text):
    """
    Performs a series of operations to clean the text in order to normalize it.
    """
    # Removing line breaks.
    text = text.replace('\n', ' ')

    # Removing html tags.
    text = remove_html_tags(text)

    # Normalizing accented characters.
    accents = ('COMBINING ACUTE ACCENT', 'COMBINING GRAVE ACCENT') #portuguese
    chars = [c for c in unicodedata.normalize('NFD', text) if c not in accents]
    text = unicodedata.normalize('NFC', ''.join(chars))

    # Converting to lower case
    text = text.lower()

    # Remove everything not in vocab
    #text = re.sub("[^{}]".format(vocab), " ", text)

    # Collapse sequences of periods ("...") into a single "."
    text = re.sub("[...]+", ".", text)

    # Remove parentheses and brackets
    text = re.sub("[(\[\])]+", "", text)

    # Remove space before punctuation
    text = re.sub(r'\s([.,;:?!"](?:\s|$))', r'\1', text)

    # Removing double blank spaces
    text = re.sub("[ ]+", " ", text)

    # Replacing unusual characters according to chars_map
    # (the replacements must be applied to the text itself).
    for c, replacement in chars_map.items():
        text = text.replace(c, replacement)

    return text


def create_normalized_text_from_subtitles_file(subtitle_file, output_file, min_words, max_words):
    """
    Given a subtitle file (.srt) it cleans and normalizes the text, dividing it into sentences,
    according to the number of words (min_words and max_words),
    saving the result in output_file.
    Parameters:
        subtitle_file (str): subtitles .srt file.
        output_file (str): file path to save the normalized text.
        min_words (int): minimum number of words of each sentence.
        max_words (int): maximum number of words of each sentence.

    Returns:
        Boolean: returns True or False.
    """

    # If the file comes with the time for each subtitle, uncomment this line so that only the subtitles text will be extracted.
272 | #text = get_text_from_subtitle(subtitle_file) 273 | 274 | # Read all lines from file 275 | try: 276 | file = open(subtitle_file, "r") 277 | text = '\n'.join(file.readlines()) 278 | file.close() 279 | except IOError: 280 | print("Error: Reading subtitle file {}.".format(subtitle_file)) 281 | return False 282 | 283 | # If it was unable to extract the text. 284 | if not text: 285 | return False 286 | 287 | # Clear and normalize the text. 288 | text = text_cleaning(text) 289 | 290 | # Creates a list of sentences. 291 | sentences = create_sentences_from_text(text, int(min_words), int(max_words)) 292 | 293 | # Save the sentences to the output file. 294 | try: 295 | f = open(output_file, "w") 296 | for sentence in sentences: 297 | # Converting numbers by its full version. 298 | sentence = number_to_text(sentence) 299 | f.write(sentence.strip() + '\n') 300 | f.close() 301 | 302 | except IOError: 303 | print("Error: Writing audio file {}.".format(output_file)) 304 | return False 305 | 306 | return True 307 | 308 | 309 | def main(): 310 | parser = argparse.ArgumentParser() 311 | parser.add_argument('--base_dir', default='./') 312 | parser.add_argument('--input_file', default='subtitles.txt', help='Subtitles filename (only text)') 313 | parser.add_argument('--output_file', default='output.txt', help='Filename to save the normalize text') 314 | parser.add_argument('--min_words', default=10, help='Minimal number of words on sentence') 315 | parser.add_argument('--max_words', default=30, help='Maximal number of words on sentence') 316 | args = parser.parse_args() 317 | 318 | min_words = int(args.min_words) 319 | max_words = int(args.max_words) 320 | 321 | create_normalized_text_from_subtitles_file(args.input_file, args.output_file, min_words, max_words) 322 | 323 | if __name__ == "__main__": 324 | main() 325 | -------------------------------------------------------------------------------- /main.py: -------------------------------------------------------------------------------- 1 | #!/usr/bin/env python 2 | # -*- coding: utf-8 -*- 3 | # 4 | # (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com 5 | # 6 | # 7 | from config import Config 8 | from urllib.parse import parse_qs, urlparse 9 | from search import search_videos 10 | from download import download_audio_and_subtitles_from_youtube 11 | from text_normalization import create_normalized_text_from_subtitles_file 12 | from synchronization import create_aeneas_json_file 13 | from audio_segmentation import segment_audio 14 | from transcribe import convert_audios_samplerate, transcribe_audios 15 | from validation import create_validation_file 16 | from selection import select 17 | from utils.downsampling import downsampling 18 | import shutil 19 | import os 20 | import logging 21 | 22 | ###################################################### 23 | # Logs Config 24 | ###################################################### 25 | if not(os.path.exists(Config.logs_dir)): 26 | os.makedirs(Config.logs_dir) 27 | 28 | log_path = os.path.join(Config.logs_dir, Config.log_file) 29 | if not os.path.exists(Config.logs_dir): 30 | os.makedirs(Config.logs_dir) 31 | open(log_path, 'w').close() 32 | 33 | level = logging.DEBUG # Options: logging.DEBUG | logging.INFO | logging.WARNING | logging.ERROR | logging.CRITICAL 34 | logging.basicConfig(filename=log_path, filemode='w', format='%(message)s', level=level) 35 | 36 | 37 | # Argument Parser from File 38 | ''' 39 | class LoadFromFile (argparse.Action): 40 | def __call__ (self, parser, namespace, values, 
option_string = None): 41 | with values as f: 42 | print(f.read().split()) 43 | parser.parse_args(f.read().split(), namespace) 44 | ''' 45 | 46 | 47 | def main(): 48 | if Config.orig_base == 'channel': 49 | g = open(Config.channels_file, "r", encoding='utf-8') 50 | elif Config.orig_base == 'playlist': 51 | g = open(Config.playlists_file, "r", encoding='utf-8') 52 | else: 53 | g = None 54 | 55 | # Errors youtube videos file 56 | log_error_file = open(os.path.join(Config.logs_dir, Config.youtube_videos_error), "w") 57 | 58 | ###################################################### 59 | # Youtube ignored videos 60 | ###################################################### 61 | if Config.ignored_youtube_videos: 62 | try: 63 | f = open(Config.ignored_youtube_videos, encoding='utf-8') 64 | ignored_youtube_videos = f.readlines() 65 | f.close() 66 | except IOError: 67 | print("Error: File {} does not appear to exist.".format(Config.ignored_youtube_videos)) 68 | return exit(False) 69 | 70 | ###################################################### 71 | # Youtube already downloaded videos 72 | ###################################################### 73 | if Config.downloaded_youtube_videos: 74 | try: 75 | if os.path.exists(Config.downloaded_youtube_videos): 76 | f = open(Config.downloaded_youtube_videos, "r", encoding='utf-8') 77 | downloaded_youtube_videos = f.readlines() 78 | f.close() 79 | else: 80 | f = open(Config.downloaded_youtube_videos, "w", encoding='utf-8') 81 | downloaded_youtube_videos = [] 82 | f.close() 83 | except IOError: 84 | print("Error: File {} does not appear to exist.".format(Config.downloaded_youtube_videos)) 85 | return exit(False) 86 | 87 | ###################################################### 88 | # Iterates over the youtube channels list 89 | ###################################################### 90 | for content_id in g: 91 | content_id = content_id.rstrip() 92 | # ignore channel description 93 | if content_id.startswith('#'): 94 | print('Ignoring {}: {}'.format(Config.orig_base, content_id)) 95 | continue 96 | # Defining output paths 97 | base_path = os.path.join(Config.base_dir, Config.dest_dir) 98 | output_path = os.path.join(base_path, Config.orig_base, content_id) 99 | 100 | ###################################################### 101 | # Searching all videos from Youtube channel 102 | ###################################################### 103 | print('Searching videos from {} - {}...'.format(Config.orig_base, content_id)) 104 | # content_file contains the list of all videos on the youtube channel 105 | content_file = search_videos(Config.api_key, content_id, base_path, Config.output_search_file) 106 | if not content_file: 107 | logging.error('Error downloading channel video list: ' + content_id) 108 | continue 109 | 110 | # Open youtube videos list of the channel 111 | f = open(content_file, "r", encoding='utf-8') 112 | 113 | ###################################################### 114 | # Iterate over youtube videos of the channel 115 | ###################################################### 116 | i = 0 117 | for youtube_link in f: 118 | youtube_link = youtube_link.strip() 119 | ###################################################### 120 | # Ignoring videos commented or found on list "Config.ignored_youtube_videos" 121 | ###################################################### 122 | if youtube_link.startswith('#') or (Config.ignored_youtube_videos and youtube_link + '\n' in ignored_youtube_videos): 123 | print('Ignoring youtube video: {} '.format(youtube_link)) 124 | continue 125 
126 | videos = parse_qs(urlparse(youtube_link).query, keep_blank_values=True).get('v') 127 | video_id = None if videos is None else videos[0] 128 | if video_id is None: logging.error('No video id found in link: ' + youtube_link); i += 1; continue 129 | ###################################################### 130 | # Download mp3 from youtube_link 131 | ###################################################### 132 | print('Downloading {} - {}...'.format(i, youtube_link)) 133 | # Ignore videos with no Portuguese caption or no caption at all 134 | if os.path.exists(os.path.join(output_path, video_id)) or (not download_audio_and_subtitles_from_youtube(youtube_link, output_path)): 135 | logging.error('YouTube video already downloaded or unavailable: ' + youtube_link) 136 | log_error_file.write(youtube_link + ': ingest_dataset' + '\n') 137 | i += 1 138 | continue 139 | 140 | ###################################################### 141 | # Normalizing text in preparation for text-audio synchronization 142 | ###################################################### 143 | print('Normalizing text {} - {}...'.format(i, youtube_link)) 144 | subtitle_file = os.path.join(output_path, video_id, video_id + ".srt") 145 | text_file = os.path.join(output_path, video_id, video_id + ".txt") 146 | if not create_normalized_text_from_subtitles_file(subtitle_file, text_file, Config.min_words, Config.max_words): 147 | logging.error('Error creating normalized text from subtitles file: ' + youtube_link) 148 | log_error_file.write(youtube_link + ': create_normalized_text_from_subtitles_file' + '\n') 149 | i += 1 150 | continue 151 | if Config.delete_temp_files: 152 | os.remove(subtitle_file) 153 | 154 | ###################################################### 155 | # Synchronizing text-audio using aeneas 156 | ###################################################### 157 | print('Synchronizing Text-Audio {} - {}...'.format(i, youtube_link)) 158 | json_filename = video_id + ".json" 159 | audio_filename = video_id + ".mp3" 160 | json_file = os.path.join(output_path, video_id, json_filename) 161 | audio_file = os.path.join(output_path, video_id, audio_filename) 162 | if not create_aeneas_json_file(audio_file, text_file, json_file): 163 | logging.error('Error creating aeneas json file: ' + youtube_link) 164 | log_error_file.write(youtube_link + ': create_aeneas_json_file' + '\n') 165 | i += 1 166 | continue 167 | if Config.delete_temp_files: 168 | os.remove(text_file) 169 | 170 | ###################################################### 171 | # Segmenting audio using aeneas output 172 | ###################################################### 173 | print('Segmenting audio {} - {}...'.format(i, youtube_link)) 174 | wavs_dir = os.path.join(output_path, video_id, Config.wavs_dir) 175 | metadata_subtitles_file = os.path.join(output_path, video_id, Config.metadata_subtitles_file) 176 | filename_base = video_id 177 | if not segment_audio(audio_file, json_file, wavs_dir, metadata_subtitles_file, filename_base): 178 | logging.error('Error segmenting audio: ' + youtube_link) 179 | log_error_file.write(youtube_link + ': segment_audio' + '\n') 180 | i += 1 181 | continue 182 | # Removing original audio file 183 | if Config.delete_temp_files: 184 | os.remove(audio_file) 185 | os.remove(json_file) 186 | 
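# Illustrative sketch (an assumption about convert_audios_samplerate, whose implementation is not shown here): it resamples every wav in wavs_dir to Config.tmp_sampling_rate, the rate the external ASR service expects, roughly like: y, _ = librosa.load(in_path, sr=Config.tmp_sampling_rate); soundfile.write(out_path, y, Config.tmp_sampling_rate) -- where in_path/out_path are hypothetical names and librosa/SoundFile are project dependencies; the actual code may differ.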
187 | ###################################################### 188 | # Converting audios: adjusting audios to the transcription tool's expected format 189 | ###################################################### 190 | print('Converting {} - {}...'.format(i, youtube_link)) 191 | tmp_wavs_dir = os.path.join(output_path, video_id, Config.tmp_wavs_dir) 192 | if not convert_audios_samplerate(wavs_dir, tmp_wavs_dir, Config.tmp_sampling_rate): 193 | logging.error('Error converting audio: ' + youtube_link) 194 | log_error_file.write(youtube_link + ': convert_audios_samplerate' + '\n') 195 | i += 1 196 | continue 197 | 198 | ###################################################### 199 | # Transcribing: using an external ASR API 200 | ###################################################### 201 | print('Transcribing {} - {}...'.format(i, youtube_link)) 202 | transcription_file = os.path.join(output_path, video_id, Config.transcription_file) 203 | if not transcribe_audios(tmp_wavs_dir, transcription_file): 204 | logging.error('Error transcribing: ' + youtube_link) 205 | log_error_file.write(youtube_link + ': transcribe_audios' + '\n') 206 | # Removing temp dir 207 | shutil.rmtree(tmp_wavs_dir, ignore_errors=True) 208 | i += 1 209 | continue 210 | # Removing temp dir 211 | shutil.rmtree(tmp_wavs_dir, ignore_errors=True) 212 | 213 | ###################################################### 214 | # Validating: using Levenshtein distance 215 | ###################################################### 216 | print('Validating {} - {}...'.format(i, youtube_link)) 217 | basename = wavs_dir 218 | validation_file = os.path.join(output_path, video_id, Config.validation_file) 219 | if not create_validation_file(metadata_subtitles_file, transcription_file, basename, validation_file): 220 | logging.error('Error calculating Levenshtein distance: ' + youtube_link) 221 | log_error_file.write(youtube_link + ': create_validation_file' + '\n') 222 | i += 1 223 | continue 224 | if Config.delete_temp_files: 225 | os.remove(metadata_subtitles_file) 226 | os.remove(transcription_file) 227 | 228 | ###################################################### 229 | # Selection: keeping only files with similarity (Levenshtein) >= Config.minimal_levenshtein_distance 230 | ###################################################### 231 | print('Selection {} - {}...'.format(i, youtube_link)) 232 | basename = wavs_dir 233 | output_filepath = os.path.join(output_path, video_id, Config.result_file) 234 | if not select(validation_file, output_filepath, Config.minimal_levenshtein_distance, Config.delete_temp_files): 235 | logging.error('Error selecting files: ' + youtube_link) 236 | log_error_file.write(youtube_link + ': select' + '\n') 237 | i += 1 238 | continue 239 | if Config.delete_temp_files: 240 | os.remove(validation_file) 241 | 242 | ###################################################### 243 | # Downsampling: downsampling wav files 244 | ###################################################### 245 | print('Downsampling {} - {}...'.format(i, youtube_link)) 246 | if not downsampling(os.path.join(output_path, video_id), Config.wavs_dir, Config.tmp_wavs_dir, Config.sampling_rate, True): 247 | logging.error('Error downsampling: ' + youtube_link) 248 | log_error_file.write(youtube_link + ': downsampling' + '\n') 249 | i += 1 250 | continue 251 | shutil.rmtree(os.path.join(output_path, video_id, Config.wavs_dir)) 252 | if os.path.exists(os.path.join(output_path, video_id, Config.tmp_wavs_dir)): 253 | os.rename(os.path.join(output_path, video_id, Config.tmp_wavs_dir), os.path.join(output_path, video_id, Config.wavs_dir)) 254 | 
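# Illustrative sketch (an assumption about the metric; validation.py and selection.py hold the actual implementation): the validation and selection steps above compare each segment's subtitle text with its ASR transcription via a normalized Levenshtein similarity in [0, 1], e.g. score = textdistance.levenshtein.normalized_similarity(subtitle_text, transcribed_text) using the textdistance project dependency, and segments whose score falls below Config.minimal_levenshtein_distance are dropped.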
255 | ###################################################### 256 | # Excluding folders with no wav files 257 | ###################################################### 258 | if not os.path.isdir(wavs_dir) or not os.listdir(wavs_dir): 259 | shutil.rmtree(os.path.join(output_path, video_id)) 260 | 261 | print('Finished {} - {}...'.format(i, youtube_link)) 262 | 263 | # Add youtube_link to the already-downloaded videos file 264 | if Config.downloaded_youtube_videos: 265 | with open(Config.downloaded_youtube_videos, 'a', encoding='utf-8') as out: 266 | out.write(youtube_link + "\n") 267 | 268 | i += 1 # Next 269 | 270 | f.close() # youtube videos list 271 | 272 | log_error_file.close() 273 | 274 | g.close() # channels or playlists list 275 | 276 | if __name__ == "__main__": 277 | main() 278 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Mozilla Public License Version 2.0 2 | ================================== 3 | 4 | 1. Definitions 5 | -------------- 6 | 7 | 1.1. "Contributor" 8 | means each individual or legal entity that creates, contributes to 9 | the creation of, or owns Covered Software. 10 | 11 | 1.2. "Contributor Version" 12 | means the combination of the Contributions of others (if any) used 13 | by a Contributor and that particular Contributor's Contribution. 14 | 15 | 1.3. "Contribution" 16 | means Covered Software of a particular Contributor. 17 | 18 | 1.4. "Covered Software" 19 | means Source Code Form to which the initial Contributor has attached 20 | the notice in Exhibit A, the Executable Form of such Source Code 21 | Form, and Modifications of such Source Code Form, in each case 22 | including portions thereof. 23 | 24 | 1.5. "Incompatible With Secondary Licenses" 25 | means 26 | 27 | (a) that the initial Contributor has attached the notice described 28 | in Exhibit B to the Covered Software; or 29 | 30 | (b) that the Covered Software was made available under the terms of 31 | version 1.1 or earlier of the License, but not also under the 32 | terms of a Secondary License. 33 | 34 | 1.6. "Executable Form" 35 | means any form of the work other than Source Code Form. 36 | 37 | 1.7. "Larger Work" 38 | means a work that combines Covered Software with other material, in 39 | a separate file or files, that is not Covered Software. 40 | 41 | 1.8. "License" 42 | means this document. 43 | 44 | 1.9. "Licensable" 45 | means having the right to grant, to the maximum extent possible, 46 | whether at the time of the initial grant or subsequently, any and 47 | all of the rights conveyed by this License. 48 | 49 | 1.10. "Modifications" 50 | means any of the following: 51 | 52 | (a) any file in Source Code Form that results from an addition to, 53 | deletion from, or modification of the contents of Covered 54 | Software; or 55 | 56 | (b) any new file in Source Code Form that contains any Covered 57 | Software. 58 | 59 | 1.11. "Patent Claims" of a Contributor 60 | means any patent claim(s), including without limitation, method, 61 | process, and apparatus claims, in any patent Licensable by such 62 | Contributor that would be infringed, but for the grant of the 63 | License, by the making, using, selling, offering for sale, having 64 | made, import, or transfer of either its Contributions or its 65 | Contributor Version. 66 | 67 | 1.12. "Secondary License" 68 | means either the GNU General Public License, Version 2.0, the GNU 69 | Lesser General Public License, Version 2.1, the GNU Affero General 70 | Public License, Version 3.0, or any later versions of those 71 | licenses. 72 | 73 | 1.13. "Source Code Form" 74 | means the form of the work preferred for making modifications. 75 | 76 | 1.14. 
"You" (or "Your") 77 | means an individual or a legal entity exercising rights under this 78 | License. For legal entities, "You" includes any entity that 79 | controls, is controlled by, or is under common control with You. For 80 | purposes of this definition, "control" means (a) the power, direct 81 | or indirect, to cause the direction or management of such entity, 82 | whether by contract or otherwise, or (b) ownership of more than 83 | fifty percent (50%) of the outstanding shares or beneficial 84 | ownership of such entity. 85 | 86 | 2. License Grants and Conditions 87 | -------------------------------- 88 | 89 | 2.1. Grants 90 | 91 | Each Contributor hereby grants You a world-wide, royalty-free, 92 | non-exclusive license: 93 | 94 | (a) under intellectual property rights (other than patent or trademark) 95 | Licensable by such Contributor to use, reproduce, make available, 96 | modify, display, perform, distribute, and otherwise exploit its 97 | Contributions, either on an unmodified basis, with Modifications, or 98 | as part of a Larger Work; and 99 | 100 | (b) under Patent Claims of such Contributor to make, use, sell, offer 101 | for sale, have made, import, and otherwise transfer either its 102 | Contributions or its Contributor Version. 103 | 104 | 2.2. Effective Date 105 | 106 | The licenses granted in Section 2.1 with respect to any Contribution 107 | become effective for each Contribution on the date the Contributor first 108 | distributes such Contribution. 109 | 110 | 2.3. Limitations on Grant Scope 111 | 112 | The licenses granted in this Section 2 are the only rights granted under 113 | this License. No additional rights or licenses will be implied from the 114 | distribution or licensing of Covered Software under this License. 115 | Notwithstanding Section 2.1(b) above, no patent license is granted by a 116 | Contributor: 117 | 118 | (a) for any code that a Contributor has removed from Covered Software; 119 | or 120 | 121 | (b) for infringements caused by: (i) Your and any other third party's 122 | modifications of Covered Software, or (ii) the combination of its 123 | Contributions with other software (except as part of its Contributor 124 | Version); or 125 | 126 | (c) under Patent Claims infringed by Covered Software in the absence of 127 | its Contributions. 128 | 129 | This License does not grant any rights in the trademarks, service marks, 130 | or logos of any Contributor (except as may be necessary to comply with 131 | the notice requirements in Section 3.4). 132 | 133 | 2.4. Subsequent Licenses 134 | 135 | No Contributor makes additional grants as a result of Your choice to 136 | distribute the Covered Software under a subsequent version of this 137 | License (see Section 10.2) or under the terms of a Secondary License (if 138 | permitted under the terms of Section 3.3). 139 | 140 | 2.5. Representation 141 | 142 | Each Contributor represents that the Contributor believes its 143 | Contributions are its original creation(s) or it has sufficient rights 144 | to grant the rights to its Contributions conveyed by this License. 145 | 146 | 2.6. Fair Use 147 | 148 | This License is not intended to limit any rights You have under 149 | applicable copyright doctrines of fair use, fair dealing, or other 150 | equivalents. 151 | 152 | 2.7. Conditions 153 | 154 | Sections 3.1, 3.2, 3.3, and 3.4 are conditions of the licenses granted 155 | in Section 2.1. 156 | 157 | 3. Responsibilities 158 | ------------------- 159 | 160 | 3.1. 
Distribution of Source Form 161 | 162 | All distribution of Covered Software in Source Code Form, including any 163 | Modifications that You create or to which You contribute, must be under 164 | the terms of this License. You must inform recipients that the Source 165 | Code Form of the Covered Software is governed by the terms of this 166 | License, and how they can obtain a copy of this License. You may not 167 | attempt to alter or restrict the recipients' rights in the Source Code 168 | Form. 169 | 170 | 3.2. Distribution of Executable Form 171 | 172 | If You distribute Covered Software in Executable Form then: 173 | 174 | (a) such Covered Software must also be made available in Source Code 175 | Form, as described in Section 3.1, and You must inform recipients of 176 | the Executable Form how they can obtain a copy of such Source Code 177 | Form by reasonable means in a timely manner, at a charge no more 178 | than the cost of distribution to the recipient; and 179 | 180 | (b) You may distribute such Executable Form under the terms of this 181 | License, or sublicense it under different terms, provided that the 182 | license for the Executable Form does not attempt to limit or alter 183 | the recipients' rights in the Source Code Form under this License. 184 | 185 | 3.3. Distribution of a Larger Work 186 | 187 | You may create and distribute a Larger Work under terms of Your choice, 188 | provided that You also comply with the requirements of this License for 189 | the Covered Software. If the Larger Work is a combination of Covered 190 | Software with a work governed by one or more Secondary Licenses, and the 191 | Covered Software is not Incompatible With Secondary Licenses, this 192 | License permits You to additionally distribute such Covered Software 193 | under the terms of such Secondary License(s), so that the recipient of 194 | the Larger Work may, at their option, further distribute the Covered 195 | Software under the terms of either this License or such Secondary 196 | License(s). 197 | 198 | 3.4. Notices 199 | 200 | You may not remove or alter the substance of any license notices 201 | (including copyright notices, patent notices, disclaimers of warranty, 202 | or limitations of liability) contained within the Source Code Form of 203 | the Covered Software, except that You may alter any license notices to 204 | the extent required to remedy known factual inaccuracies. 205 | 206 | 3.5. Application of Additional Terms 207 | 208 | You may choose to offer, and to charge a fee for, warranty, support, 209 | indemnity or liability obligations to one or more recipients of Covered 210 | Software. However, You may do so only on Your own behalf, and not on 211 | behalf of any Contributor. You must make it absolutely clear that any 212 | such warranty, support, indemnity, or liability obligation is offered by 213 | You alone, and You hereby agree to indemnify every Contributor for any 214 | liability incurred by such Contributor as a result of warranty, support, 215 | indemnity or liability terms You offer. You may include additional 216 | disclaimers of warranty and limitations of liability specific to any 217 | jurisdiction. 218 | 219 | 4. 
Inability to Comply Due to Statute or Regulation 220 | --------------------------------------------------- 221 | 222 | If it is impossible for You to comply with any of the terms of this 223 | License with respect to some or all of the Covered Software due to 224 | statute, judicial order, or regulation then You must: (a) comply with 225 | the terms of this License to the maximum extent possible; and (b) 226 | describe the limitations and the code they affect. Such description must 227 | be placed in a text file included with all distributions of the Covered 228 | Software under this License. Except to the extent prohibited by statute 229 | or regulation, such description must be sufficiently detailed for a 230 | recipient of ordinary skill to be able to understand it. 231 | 232 | 5. Termination 233 | -------------- 234 | 235 | 5.1. The rights granted under this License will terminate automatically 236 | if You fail to comply with any of its terms. However, if You become 237 | compliant, then the rights granted under this License from a particular 238 | Contributor are reinstated (a) provisionally, unless and until such 239 | Contributor explicitly and finally terminates Your grants, and (b) on an 240 | ongoing basis, if such Contributor fails to notify You of the 241 | non-compliance by some reasonable means prior to 60 days after You have 242 | come back into compliance. Moreover, Your grants from a particular 243 | Contributor are reinstated on an ongoing basis if such Contributor 244 | notifies You of the non-compliance by some reasonable means, this is the 245 | first time You have received notice of non-compliance with this License 246 | from such Contributor, and You become compliant prior to 30 days after 247 | Your receipt of the notice. 248 | 249 | 5.2. If You initiate litigation against any entity by asserting a patent 250 | infringement claim (excluding declaratory judgment actions, 251 | counter-claims, and cross-claims) alleging that a Contributor Version 252 | directly or indirectly infringes any patent, then the rights granted to 253 | You by any and all Contributors for the Covered Software under Section 254 | 2.1 of this License shall terminate. 255 | 256 | 5.3. In the event of termination under Sections 5.1 or 5.2 above, all 257 | end user license agreements (excluding distributors and resellers) which 258 | have been validly granted by You or Your distributors under this License 259 | prior to termination shall survive termination. 260 | 261 | ************************************************************************ 262 | * * 263 | * 6. Disclaimer of Warranty * 264 | * ------------------------- * 265 | * * 266 | * Covered Software is provided under this License on an "as is" * 267 | * basis, without warranty of any kind, either expressed, implied, or * 268 | * statutory, including, without limitation, warranties that the * 269 | * Covered Software is free of defects, merchantable, fit for a * 270 | * particular purpose or non-infringing. The entire risk as to the * 271 | * quality and performance of the Covered Software is with You. * 272 | * Should any Covered Software prove defective in any respect, You * 273 | * (not any Contributor) assume the cost of any necessary servicing, * 274 | * repair, or correction. This disclaimer of warranty constitutes an * 275 | * essential part of this License. No use of any Covered Software is * 276 | * authorized under this License except under this disclaimer. 
* 277 | * * 278 | ************************************************************************ 279 | 280 | ************************************************************************ 281 | * * 282 | * 7. Limitation of Liability * 283 | * -------------------------- * 284 | * * 285 | * Under no circumstances and under no legal theory, whether tort * 286 | * (including negligence), contract, or otherwise, shall any * 287 | * Contributor, or anyone who distributes Covered Software as * 288 | * permitted above, be liable to You for any direct, indirect, * 289 | * special, incidental, or consequential damages of any character * 290 | * including, without limitation, damages for lost profits, loss of * 291 | * goodwill, work stoppage, computer failure or malfunction, or any * 292 | * and all other commercial damages or losses, even if such party * 293 | * shall have been informed of the possibility of such damages. This * 294 | * limitation of liability shall not apply to liability for death or * 295 | * personal injury resulting from such party's negligence to the * 296 | * extent applicable law prohibits such limitation. Some * 297 | * jurisdictions do not allow the exclusion or limitation of * 298 | * incidental or consequential damages, so this exclusion and * 299 | * limitation may not apply to You. * 300 | * * 301 | ************************************************************************ 302 | 303 | 8. Litigation 304 | ------------- 305 | 306 | Any litigation relating to this License may be brought only in the 307 | courts of a jurisdiction where the defendant maintains its principal 308 | place of business and such litigation shall be governed by laws of that 309 | jurisdiction, without reference to its conflict-of-law provisions. 310 | Nothing in this Section shall prevent a party's ability to bring 311 | cross-claims or counter-claims. 312 | 313 | 9. Miscellaneous 314 | ---------------- 315 | 316 | This License represents the complete agreement concerning the subject 317 | matter hereof. If any provision of this License is held to be 318 | unenforceable, such provision shall be reformed only to the extent 319 | necessary to make it enforceable. Any law or regulation which provides 320 | that the language of a contract shall be construed against the drafter 321 | shall not be used to construe this License against a Contributor. 322 | 323 | 10. Versions of the License 324 | --------------------------- 325 | 326 | 10.1. New Versions 327 | 328 | Mozilla Foundation is the license steward. Except as provided in Section 329 | 10.3, no one other than the license steward has the right to modify or 330 | publish new versions of this License. Each version will be given a 331 | distinguishing version number. 332 | 333 | 10.2. Effect of New Versions 334 | 335 | You may distribute the Covered Software under the terms of the version 336 | of the License under which You originally received the Covered Software, 337 | or under the terms of any subsequent version published by the license 338 | steward. 339 | 340 | 10.3. Modified Versions 341 | 342 | If you create software not governed by this License, and you want to 343 | create a new license for such software, you may create and use a 344 | modified version of this License if you rename the license and remove 345 | any references to the name of the license steward (except to note that 346 | such modified license differs from this License). 347 | 348 | 10.4. 
Distributing Source Code Form that is Incompatible With Secondary 349 | Licenses 350 | 351 | If You choose to distribute Source Code Form that is Incompatible With 352 | Secondary Licenses under the terms of this version of the License, the 353 | notice described in Exhibit B of this License must be attached. 354 | 355 | Exhibit A - Source Code Form License Notice 356 | ------------------------------------------- 357 | 358 | This Source Code Form is subject to the terms of the Mozilla Public 359 | License, v. 2.0. If a copy of the MPL was not distributed with this 360 | file, You can obtain one at http://mozilla.org/MPL/2.0/. 361 | 362 | If it is not possible or desirable to put the notice in a particular 363 | file, then You may include the notice in a location (such as a LICENSE 364 | file in a relevant directory) where a recipient would be likely to look 365 | for such a notice. 366 | 367 | You may add additional accurate notices of copyright ownership. 368 | 369 | Exhibit B - "Incompatible With Secondary Licenses" Notice 370 | --------------------------------------------------------- 371 | 372 | This Source Code Form is "Incompatible With Secondary Licenses", as 373 | defined by the Mozilla Public License, v. 2.0. 374 | -------------------------------------------------------------------------------- /tools/AWS/aws_transcribe.ipynb: -------------------------------------------------------------------------------- 1 | { 2 | "cells": [ 3 | { 4 | "cell_type": "code", 5 | "execution_count": 1, 6 | "metadata": { 7 | "colab": { 8 | "base_uri": "https://localhost:8080/", 9 | "height": 289 10 | }, 11 | "id": "KH2ZxvYyaQ-I", 12 | "outputId": "e98e5c90-4278-483e-c87f-bfba03c60954" 13 | }, 14 | "outputs": [ 15 | { 16 | "name": "stdout", 17 | "output_type": "stream", 18 | "text": [ 19 | "Collecting boto3\n", 20 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2f/08/f1ff665147a5d75b871bbe5ba76916f6490419c52a33e588385c4b69281b/boto3-1.15.18-py2.py3-none-any.whl (129kB)\n", 21 | "\u001b[K |████████████████████████████████| 133kB 2.7MB/s \n", 22 | "\u001b[?25hCollecting botocore<1.19.0,>=1.18.18\n", 23 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/2d/72/984ac8f33b5c8df5ff63f323a8724f65b4d0f8956968b942b77d35d3a1ef/botocore-1.18.18-py2.py3-none-any.whl (6.7MB)\n", 24 | "\u001b[K |████████████████████████████████| 6.7MB 6.8MB/s \n", 25 | "\u001b[?25hCollecting jmespath<1.0.0,>=0.7.1\n", 26 | " Downloading https://files.pythonhosted.org/packages/07/cb/5f001272b6faeb23c1c9e0acc04d48eaaf5c862c17709d20e3469c6e0139/jmespath-0.10.0-py2.py3-none-any.whl\n", 27 | "Collecting s3transfer<0.4.0,>=0.3.0\n", 28 | "\u001b[?25l Downloading https://files.pythonhosted.org/packages/69/79/e6afb3d8b0b4e96cefbdc690f741d7dd24547ff1f94240c997a26fa908d3/s3transfer-0.3.3-py2.py3-none-any.whl (69kB)\n", 29 | "\u001b[K |████████████████████████████████| 71kB 6.6MB/s \n", 30 | "\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.1 in /usr/local/lib/python3.6/dist-packages (from botocore<1.19.0,>=1.18.18->boto3) (2.8.1)\n", 31 | "Requirement already satisfied: urllib3<1.26,>=1.20; python_version != \"3.4\" in /usr/local/lib/python3.6/dist-packages (from botocore<1.19.0,>=1.18.18->boto3) (1.24.3)\n", 32 | "Requirement already satisfied: six>=1.5 in /usr/local/lib/python3.6/dist-packages (from python-dateutil<3.0.0,>=2.1->botocore<1.19.0,>=1.18.18->boto3) (1.15.0)\n", 33 | "Installing collected packages: jmespath, botocore, s3transfer, boto3\n", 34 | "Successfully installed 
boto3-1.15.18 botocore-1.18.18 jmespath-0.10.0 s3transfer-0.3.3\n" 35 | ] 36 | } 37 | ], 38 | "source": [ 39 | "!pip install boto3" 40 | ] 41 | }, 42 | { 43 | "cell_type": "code", 44 | "execution_count": 2, 45 | "metadata": { 46 | "id": "DuUJr2DyFYWT" 47 | }, 48 | "outputs": [], 49 | "source": [ 50 | "class config:\n", 51 | " AWS_ACCESS_KEY_ID = ''\n", 52 | " AWS_SECRET_ACCESS_KEY = ''\n", 53 | " region_name = 'sa-east-1'\n", 54 | " bucket_name = 'amazon-transcribe'\n", 55 | " audio_format = 'wav'\n", 56 | " output_path = '/content'" 57 | ] 58 | }, 59 | { 60 | "cell_type": "code", 61 | "execution_count": 22, 62 | "metadata": { 63 | "id": "HkalGSJgWun0" 64 | }, 65 | "outputs": [], 66 | "source": [ 67 | "from boto3 import client\n", 68 | "from time import sleep\n", 69 | "from urllib.request import urlopen\n", 70 | "from json import loads\n", 71 | "import pandas as pd\n", 72 | "from os.path import join\n", 73 | "from tqdm import tqdm\n", 74 | "\n", 75 | "def get_transcription_from_job(transcribe, job_name):\n", 76 | " status = transcribe.get_transcription_job(TranscriptionJobName=job_name)\n", 77 | " response = urlopen(status['TranscriptionJob']['Transcript']['TranscriptFileUri'])\n", 78 | " data = loads(response.read())\n", 79 | " text = data['results']['transcripts'][0]['transcript']\n", 80 | "\n", 81 | " return text\n", 82 | "\n", 83 | "def get_bucket_names():\n", 84 | " \"\"\"\n", 85 | " Searches for the buckets whose names start with the name\n", 86 | " given in config.bucket_name and returns them.\n", 87 | " \"\"\"\n", 88 | "\n", 89 | " s3 = client('s3', \n", 90 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 91 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 92 | " region_name=config.region_name)\n", 93 | "\n", 94 | " response = s3.list_buckets()\n", 95 | "\n", 96 | " bucket_names = []\n", 97 | "\n", 98 | " for bucket_info in response['Buckets']:\n", 99 | " bucket = bucket_info['Name']\n", 100 | " if bucket.startswith(config.bucket_name):\n", 101 | " bucket_names.append(bucket)\n", 102 | "\n", 103 | " return bucket_names\n", 104 | "\n", 105 | "def get_audio_files_url(bucket_name=None):\n", 106 | " \"\"\"\n", 107 | " Fetches the URLs of the audio files in a given bucket.\n", 108 | " If a bucket name is not passed as a parameter, \n", 109 | " the most recently created bucket is used by default.\n", 110 | " \"\"\"\n", 111 | "\n", 112 | " URLS = []\n", 113 | "\n", 114 | " s3 = client('s3', \n", 115 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 116 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 117 | " region_name=config.region_name)\n", 118 | "\n", 119 | " if bucket_name is None:\n", 120 | " response = s3.list_buckets()\n", 121 | "\n", 122 | " bucket_name = response['Buckets'][-1]['Name']\n", 123 | " \n", 124 | " # print(f' {bucket_name}')\n", 125 | " # print(len(response))\n", 126 | " # for idx, _ in enumerate(response):\n", 127 | " # print(response['Buckets'][idx]['Name'])\n", 128 | "\n", 129 | "\n", 130 | " url_prefix = 'https://' + bucket_name + '.s3' + '-' + config.region_name + '.amazonaws.com'\n", 131 | "\n", 132 | " for key in s3.list_objects(Bucket=bucket_name)['Contents']:\n", 133 | " if key['Key'].endswith('.' + config.audio_format):\n",
134 | " URLS.append(url_prefix + '/' + key['Key'])\n", 135 | "\n", 136 | " # print(URLS)\n", 137 | "\n", 138 | " return URLS\n", 139 | "\n", 140 | "\n", 141 | "def transcribe_audio_files(URLS):\n", 142 | " \"\"\"\n", 143 | " Transcribes the audio files.\n", 144 | " Takes as a parameter the URLs of the audio files in a bucket.\n", 145 | " \"\"\"\n", 146 | "\n", 147 | " file_names = []\n", 148 | "\n", 149 | " transcribed_texts = []\n", 150 | "\n", 151 | " transcribe = client('transcribe', \n", 152 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 153 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 154 | " region_name=config.region_name)\n", 155 | "\n", 156 | " for counter, url in enumerate(tqdm(URLS)):\n", 157 | " file_name = str(url).split('/')[-1]\n", 158 | " file_names.append(file_name)\n", 159 | "\n", 160 | " # print(f\"Transcribing {file_name}... \")\n", 161 | " # A different job name for each iteration\n", 162 | " job_name = file_name.split('.')[0]\n", 163 | " \n", 164 | " try:\n", 165 | " transcribe.start_transcription_job(TranscriptionJobName=job_name, \n", 166 | " Media={'MediaFileUri': url}, \n", 167 | " MediaFormat=config.audio_format, \n", 168 | " LanguageCode='pt-BR')\n", 169 | " except transcribe.exceptions.ConflictException:\n", 170 | " print(f\"\\tFile '{file_name}' has already been transcribed, moving on to the next file...\")\n", 171 | " text = get_transcription_from_job(transcribe, job_name)\n", 172 | " transcribed_texts.append(text)\n", 173 | " continue\n", 174 | "\n", 175 | " while True:\n", 176 | " status = transcribe.get_transcription_job(TranscriptionJobName=job_name)\n", 177 | " if status['TranscriptionJob']['TranscriptionJobStatus'] in ['COMPLETED', 'FAILED']:\n", 178 | " break\n", 179 | " sleep(2)\n", 180 | " \n", 181 | " # print(f\"{status['TranscriptionJob']['TranscriptionJobStatus']}\\n\")\n", 182 | "\n", 183 | " if status['TranscriptionJob']['TranscriptionJobStatus'] == 'COMPLETED':\n", 184 | " response = urlopen(status['TranscriptionJob']['Transcript']['TranscriptFileUri'])\n", 185 | " data = loads(response.read())\n", 186 | " text = data['results']['transcripts'][0]['transcript']\n", 187 | " transcribed_texts.append(text)\n", 188 | "\n", 189 | " return file_names, transcribed_texts\n", 190 | "\n", 191 | "def get_completed_job_names():\n", 192 | "\n", 193 | " transcribe = client('transcribe', \n", 194 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 195 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 196 | " region_name=config.region_name)\n", 197 | " \n", 198 | " job_names = []\n", 199 | " \n", 200 | " response = transcribe.list_transcription_jobs(Status='COMPLETED', MaxResults=100)\n", 201 | " response_more = response\n", 202 | "\n", 203 | " # Do while\n", 204 | " while True:\n", 205 | " for job in response_more['TranscriptionJobSummaries']:\n", 206 | " job_names.append(job['TranscriptionJobName'])\n", 207 | " \n", 208 | " if 'NextToken' not in response_more.keys():\n", 209 | " break\n", 210 | "\n", 211 | " response_more = transcribe.list_transcription_jobs(Status='COMPLETED', NextToken=response_more['NextToken'], MaxResults=100)\n", 212 | "\n", 213 | " return job_names\n", 214 | "\n", 215 | "\n", 216 | "def delete_completed_jobs(completed_job_names):\n", 217 | " \"\"\"\n", 218 | " Deletes the completed 'Transcription jobs'.\n", 219 | " This function is needed when the same \n", 220 | " 'job_name' (in the config class) is used across more than one run of the script.\n", 221 | "\n", 222 | " 'Transcription jobs' must be unique.\n",
223 | " \"\"\"\n", 224 | "\n", 225 | " transcribe = client('transcribe', \n", 226 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 227 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 228 | " region_name=config.region_name)\n", 229 | " \n", 230 | " for job_name in tqdm(completed_job_names):\n", 231 | " transcribe.delete_transcription_job(TranscriptionJobName=job_name)\n", 232 | "\n", 233 | "def make_metadata(file_names, transcribed_texts):\n", 234 | " df = pd.DataFrame()\n", 235 | "\n", 236 | " for file_name, text in zip(file_names, transcribed_texts):\n", 237 | " df = df.append({'A': file_name, 'B': text}, ignore_index=True)\n", 238 | "\n", 239 | " df.to_csv(join(config.output_path, 'transcribed_text.csv'), sep='|', index=False, header=False, quotechar=\"'\")\n", 240 | "\n", 241 | "def run_transcribe():\n", 242 | "\n", 243 | " # completed_jobs = get_completed_job_names()\n", 244 | " # print(completed_jobs)\n", 245 | "\n", 246 | " bucket_names = get_bucket_names()\n", 247 | "\n", 248 | " for idx, bucket in enumerate(bucket_names):\n", 249 | " print(\"\\nAccessing bucket {0} -> {1} of {2}\".format(bucket, idx+1, len(bucket_names)))\n", 250 | "\n", 251 | " URLS = get_audio_files_url(bucket)\n", 252 | " file_names, transcribed_texts = transcribe_audio_files(URLS)\n", 253 | " make_metadata(file_names, transcribed_texts)" 254 | ] 255 | }, 256 | { 257 | "cell_type": "code", 258 | "execution_count": 24, 259 | "metadata": { 260 | "colab": { 261 | "base_uri": "https://localhost:8080/", 262 | "height": 85 263 | }, 264 | "id": "7sL4wWD66KGz", 265 | "outputId": "19f6ef24-3e8b-42eb-ced7-e4da49ea7020" 266 | }, 267 | "outputs": [ 268 | { 269 | "name": "stderr", 270 | "output_type": "stream", 271 | "text": [ 272 | "\n", 273 | "0it [00:00, ?it/s]" 274 | ] 275 | }, 276 | { 277 | "name": "stdout", 278 | "output_type": "stream", 279 | "text": [ 280 | "[]\n", 281 | "[]\n" 282 | ] 283 | }, 284 | { 285 | "name": "stderr", 286 | "output_type": "stream", 287 | "text": [ 288 | "\n" 289 | ] 290 | } 291 | ], 292 | "source": [ 293 | "run_transcribe()" 294 | ] 295 | }, 296 | { 297 | "cell_type": "code", 298 | "execution_count": null, 299 | "metadata": { 300 | "id": "KYPZLDovCpjN" 301 | }, 302 | "outputs": [], 303 | "source": [ 304 | "from google.colab import files\n", 305 | "files.download('transcribed_text.csv') " 306 | ] 307 | }, 308 | { 309 | "cell_type": "code", 310 | "execution_count": null, 311 | "metadata": { 312 | "id": "1YR2jSYnqTGx" 313 | }, 314 | "outputs": [], 315 | "source": [ 316 | "import logging\n", 317 | "from boto3 import client\n", 318 | "from botocore.exceptions import ClientError\n", 319 | "\n", 320 | "import os\n", 321 | "import sys\n", 322 | "import threading\n", 323 | "\n", 324 | "import ntpath\n", 325 | "\n", 326 | "class ProgressPercentage(object):\n", 327 | "\n", 328 | " def __init__(self, filename):\n", 329 | " self._filename = filename\n", 330 | " self._size = float(os.path.getsize(filename))\n", 331 | " self._seen_so_far = 0\n", 332 | " self._lock = threading.Lock()\n", 333 | "\n", 334 | " def __call__(self, bytes_amount):\n", 335 | " # To simplify, assume this is hooked up to a single filename\n", 336 | " with self._lock:\n", 337 | " self._seen_so_far += bytes_amount\n", 338 | " percentage = (self._seen_so_far / self._size) * 100\n", 339 | " sys.stdout.write(\n", 340 | " \"\\r%s %s / %s (%.2f%%)\" % (\n", 341 | " self._filename, self._seen_so_far, self._size,\n", 342 | " percentage))\n", 343 | " sys.stdout.flush()\n", 344 | "\n",
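"# Note: ProgressPercentage above is the upload Callback passed to upload_file below;\n", "# boto3 invokes it with the number of bytes transferred in each chunk, so it prints running progress.\n",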
345 | "def upload_file(file_path, bucket, object_name=None):\n", 346 | "\n", 347 | " if object_name is None:\n", 348 | " object_name = ntpath.basename(file_path)\n", 349 | "\n", 350 | " # Upload the file\n", 351 | " s3 = client('s3', \n", 352 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 353 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 354 | " region_name=config.region_name)\n", 355 | " try:\n", 356 | " response = s3.upload_file(file_path, \n", 357 | " bucket, \n", 358 | " object_name,\n", 359 | " Callback=ProgressPercentage(file_path))\n", 360 | " except ClientError as e:\n", 361 | " logging.error(e)\n", 362 | " return False\n", 363 | " return True\n", 364 | "\n", 365 | "def upload_multiple_files(files_path, bucket_name):\n", 366 | " \"\"\"\n", 367 | " Uploads multiple files.\n", 368 | " \n", 369 | " ---\n", 370 | "\n", 371 | " files_path is the path to the directory containing\n", 372 | " the audio files.\n", 373 | "\n", 374 | " bucket_name is the name of the specific bucket to which the \n", 375 | " files will be uploaded.\n", 376 | " \"\"\"\n", 377 | "\n", 378 | " files = os.listdir(files_path)\n", 379 | "\n", 380 | " for file in files:\n", 381 | " upload_file(os.path.join(files_path, file), bucket_name)\n", 382 | "\n", 383 | "def create_bucket(bucket_name, region=None):\n", 384 | " \"\"\"\n", 385 | " Creates a bucket with private permissions.\n", 386 | "\n", 387 | " ---\n", 388 | "\n", 389 | " bucket_name is the name of the bucket to be created.\n", 390 | "\n", 391 | " region is the region code (region_name); if it is not given,\n", 392 | " the bucket is created in the S3 default region ('us-east-1').\n", 393 | " \"\"\"\n", 394 | "\n", 395 | " try:\n", 396 | " if region is None:\n", 397 | " s3 = client('s3', \n", 398 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 399 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY)\n", 400 | " s3.create_bucket(Bucket=bucket_name, ACL='private')\n", 401 | "\n", 402 | "\n", 403 | " else:\n", 404 | " s3 = client('s3', \n", 405 | " aws_access_key_id=config.AWS_ACCESS_KEY_ID, \n", 406 | " aws_secret_access_key=config.AWS_SECRET_ACCESS_KEY, \n", 407 | " region_name=region)\n", 408 | " location = {'LocationConstraint': region}\n", 409 | " s3.create_bucket(Bucket=bucket_name,\n", 410 | " CreateBucketConfiguration=location,\n", 411 | " ACL='private')\n", 412 | " \n", 413 | " response_public = s3.put_public_access_block(\n", 414 | " Bucket=bucket_name,\n", 415 | " PublicAccessBlockConfiguration={\n", 416 | " 'BlockPublicAcls': True,\n", 417 | " 'IgnorePublicAcls': True,\n", 418 | " 'BlockPublicPolicy': True,\n", 419 | " 'RestrictPublicBuckets': True\n", 420 | " },\n", 421 | " )\n", 422 | "\n", 423 | " except ClientError as e:\n", 424 | " logging.error(e)\n", 425 | " return False\n", 426 | " return True" 427 | ] 428 | }, 429 | { 430 | "cell_type": "markdown", 431 | "metadata": { 432 | "id": "7T-tRyzPB6Dq" 433 | }, 434 | "source": [ 435 | "# Usage examples" 436 | ] 437 | }, 438 | { 439 | "cell_type": "code", 440 | "execution_count": null, 441 | "metadata": { 442 | "colab": { 443 | "base_uri": "https://localhost:8080/", 444 | "height": 34 445 | }, 446 | "id": "gC_FPhosrkGu", 447 | "outputId": "6942a7fd-c613-4c2b-dc9d-6f7b7eb55149" 448 | }, 449 | "outputs": [ 450 | { 451 | "data": { 452 | "text/plain": [ 453 | "True" 454 | ] 455 | }, 456 | "execution_count": 37, 457 | "metadata": { 458 | "tags": [] 459 | }, 460 | "output_type": "execute_result" 461 | } 462 | ], 463 | "source": [ 
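"# Note: 'cbtest0' is just an example name; S3 bucket names are globally unique, so choose your own.\n",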
"create_bucket('cbtest0', config.region_name)" 465 | ] 466 | }, 467 | { 468 | "cell_type": "code", 469 | "execution_count": null, 470 | "metadata": { 471 | "colab": { 472 | "base_uri": "https://localhost:8080/", 473 | "height": 34 474 | }, 475 | "id": "-aO_Rbslxjbm", 476 | "outputId": "4f793dc5-1901-4da6-82a2-492659411863" 477 | }, 478 | "outputs": [ 479 | { 480 | "name": "stdout", 481 | "output_type": "stream", 482 | "text": [ 483 | "/content/a/015.wav 62044 / 62044.0 (100.00%)" 484 | ] 485 | } 486 | ], 487 | "source": [ 488 | "upload_multiple_files('/content/audio', 'cbtest0')" 489 | ] 490 | } 491 | ], 492 | "metadata": { 493 | "colab": { 494 | "collapsed_sections": [], 495 | "name": "aws_transcribe_2_1.ipynb", 496 | "provenance": [], 497 | "toc_visible": true 498 | }, 499 | "kernelspec": { 500 | "display_name": "Python 3", 501 | "language": "python", 502 | "name": "python3" 503 | }, 504 | "language_info": { 505 | "codemirror_mode": { 506 | "name": "ipython", 507 | "version": 3 508 | }, 509 | "file_extension": ".py", 510 | "mimetype": "text/x-python", 511 | "name": "python", 512 | "nbconvert_exporter": "python", 513 | "pygments_lexer": "ipython3", 514 | "version": "3.7.10" 515 | } 516 | }, 517 | "nbformat": 4, 518 | "nbformat_minor": 1 519 | } 520 | --------------------------------------------------------------------------------