├── .gitignore
├── LICENSE
├── README.md
├── old
    ├── outputs
    │   └── README
    └── transcriber.py
├── outputs
    └── README
├── requirements.txt
├── transcriber
└── transcriber.py


/.gitignore:
--------------------------------------------------------------------------------
 1 | *
 2 | !.gitignore
 3 | !LICENSE
 4 | !README.md
 5 | !requirements.txt
 6 | !transcriber.py
 7 | !transcriber
 8 | !outputs/
 9 | !outputs/README
10 | !old/
11 | !old/*
12 | !old/outputs/
13 | !old/outputs/README
14 | 


--------------------------------------------------------------------------------
/LICENSE:
--------------------------------------------------------------------------------
 1 | MIT License
 2 | 
 3 | Copyright (c) 2022 Kostas Ereksonas
 4 | 
 5 | Permission is hereby granted, free of charge, to any person obtaining a copy
 6 | of this software and associated documentation files (the "Software"), to deal
 7 | in the Software without restriction, including without limitation the rights
 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 9 | copies of the Software, and to permit persons to whom the Software is
10 | furnished to do so, subject to the following conditions:
11 | 
12 | The above copyright notice and this permission notice shall be included in all
13 | copies or substantial portions of the Software.
14 | 
15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21 | SOFTWARE.
22 | 


--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
 1 | # Audio-transcriber
 2 | Simple Python audio transcriber using OpenAI's Whisper speech recognition model
 3 | 
 4 | Table of Contents
 5 | =================
 6 | * [Prerequisites](#Prerequisites)
 7 | * [Instructions](#Instructions)
 8 | 
 9 | # Prerequisites
10 | 
11 | ***Python 3.10*** was used to create a virtual environment for the script to run.
12 | 
13 | # Instructions
14 | 
15 | 1. Create the virtual environment with python 3.10:
16 | 
17 | `python3.10 -m venv .`
18 | 
19 | 2. Activate the virtual environment:
20 | 
21 | `source bin/activate`
22 | 
23 | 3. Install required packages with pip
24 | 
25 | `pip install -r requirements.txt`
26 | 
27 | 4. Run a program with Python3
28 | 
29 | 4.1. For v1:
30 | `python3 transcriber.py -u, --url <URL>`
31 | 
32 | 4.2. For v2:
33 | `./transcriber -u <url>`
34 | 


--------------------------------------------------------------------------------
/old/outputs/README:
--------------------------------------------------------------------------------
1 | "Audio and trascription files are saved here"
2 | 


--------------------------------------------------------------------------------
/old/transcriber.py:
--------------------------------------------------------------------------------
  1 | #!/usr/bin/env python3
  2 | """
  3 | Audio transcriber using OpenAI's Whisper speech recognition model.
  4 | Usage: python3 transcriber.py -u, --url <URL>
  5 | """
  6 | import os
  7 | import getopt
  8 | import re
  9 | import sys
 10 | import torch
 11 | import whisper
 12 | from googletrans import Translator
 13 | import yt_dlp as youtube_dl
 14 | 
 15 | AUDIOFILE = "audio.mp3"  # Save audio file as audio.mp3
 16 | 
 17 | def match_pattern(pattern, arg):
 18 |     """If YouTube shorts URL is given, convert it to standard URL."""
 19 |     match = re.search(pattern, arg)
 20 |     if bool(match):
 21 |         url = re.sub(pattern, "watch?v=", arg)
 22 |     else:
 23 |         url = arg
 24 |     return url
 25 | 
 26 | 
 27 | def get_audio(url, argv):
 28 |     """
 29 |     Download mp3 audio of a YouTube video. Credit to Stokry.
 30 |     https://dev.to/stokry/download-youtube-video-to-mp3-with-python-26p
 31 |     """
 32 |     try:
 33 |         opts, args = getopt.getopt(argv, "u:", ["url="])
 34 |     except:
 35 |         print("Usage: python3 transcriber.py -u <url>")
 36 |     for opt, arg in opts:
 37 |         if opt in ['-u', '--url']:
 38 |             url = match_pattern("shorts/", arg)
 39 |     video_info = youtube_dl.YoutubeDL().extract_info(url=url, download=False)
 40 |     options = {
 41 |         'format': 'bestaudio/best',
 42 |         'keepvideo': False,
 43 |         'outtmpl': AUDIOFILE,
 44 |     }
 45 |     with youtube_dl.YoutubeDL(options) as ydl:
 46 |         ydl.download([video_info['webpage_url']])
 47 | 
 48 | 
 49 | def banner(text):
 50 |     """Display a message when the script is working in the background"""
 51 |     print(f"# {text} #")
 52 | 
 53 | 
 54 | def check_device():
 55 |     """Check CUDA availability."""
 56 |     if torch.cuda.is_available() == 1:
 57 |         device = "cuda"
 58 |     else:
 59 |         device = "cpu"
 60 |     return device
 61 | 
 62 | 
 63 | def get_result():
 64 |     """Get speech recognition model."""
 65 |     model_name = input("Select speech recognition model name (tiny, base, small, medium, large): ")
 66 |     banner("Transcribing text")
 67 |     model = whisper.load_model(model_name, device=check_device())
 68 |     result = model.transcribe(AUDIOFILE)
 69 |     format_result('transcription.txt', result["text"])
 70 | 
 71 | 
 72 | def format_result(file_name, text):
 73 |     """Put a newline character after each sentence and prompt user for translation."""
 74 |     format_text = re.sub('\.', '.\n', text)
 75 |     with open(file_name, 'a', encoding="utf-8") as file:
 76 |         banner("Writing transcription to text file")
 77 |         file.write(format_text)
 78 |         choice = input("Do you want to translate audio transcription to English? (Yes/No) ")
 79 |     if choice == "Yes":
 80 |         translate_result('transcription.txt', 'translation.txt')
 81 | 
 82 | 
 83 | def translate_result(org_file, trans_file):
 84 |     """
 85 |     Translate transcribed text. Credit to Harsh Jain at educative.io
 86 |     https://www.educative.io/answers/how-do-you-translate-text-using-python
 87 |     """
 88 |     translator = Translator()  # Create an instance of Translator() class
 89 |     with open(org_file, 'r', encoding="utf-8") as transcription:
 90 |         contents = transcription.read()
 91 |         banner("Translating text")
 92 |         translation = translator.translate(contents)
 93 |     with open(trans_file, 'a', encoding="utf-8") as file:
 94 |         banner("Writing translation to text file")
 95 |         file.write(translation.text)
 96 | 
 97 | 
 98 | def main():
 99 |     """Main function."""
100 |     os.chdir('outputs')
101 |     get_audio(None,sys.argv[1:])    # Download an mp3 audio file to transcribe to text
102 |     get_result()            # Get audio transcription and translation if needed
103 | 
104 | if __name__ == "__main__":
105 |     main()
106 | 


--------------------------------------------------------------------------------
/outputs/README:
--------------------------------------------------------------------------------
1 | "Audio and trascription files are saved here"
2 | 


--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | git+https://github.com/openai/whisper.git
2 | googletrans
3 | torch
4 | yt-dlp
5 | 


--------------------------------------------------------------------------------
/transcriber:
--------------------------------------------------------------------------------
  1 | #!/bin/bash
  2 | 
  3 | delimiter="------------------------------------------------------------------------------------------------------------"
  4 | 
  5 | selectModel(){
  6 | 	# Select whisper model
  7 | 	PS3="[+] Choose model: "
  8 | 	models=("tiny" "base" "small" "medium" "large" "turbo")
  9 | 	select model in "${models[@]}"; do
 10 | 		case "${model}" in
 11 | 			"tiny")
 12 | 				printf "%s\n" "${model}"
 13 | 				break
 14 | 				;;
 15 | 			"base")
 16 | 				printf "%s\n" "${model}"
 17 | 				break
 18 | 				;;
 19 | 			"small")
 20 | 				printf "%s\n" "${model}"
 21 | 				break
 22 | 				;;
 23 | 			"medium")
 24 | 				printf "%s\n" "${model}"
 25 | 				break
 26 | 				;;
 27 | 			"large")
 28 | 				printf "%s\n" "${model}"
 29 | 				break
 30 | 				;;
 31 | 			"turbo")
 32 | 				printf "%s\n" "${model}"
 33 | 				break
 34 | 				;;
 35 | 		esac
 36 | 	done
 37 | }
 38 | 
 39 | downloadAudio(){
 40 | 	# Download audio to transcribe
 41 | 	url="${1}"
 42 | 
 43 | 	cd outputs/
 44 | 	printf "%s\n" "${delimiter}"
 45 | 	printf "[+] Downloading audio\n"
 46 | 	printf "%s\n" "${delimiter}"
 47 | 	yt-dlp -x --audio-format mp3 "${url}" --output audio.mp3 --quiet
 48 | 	cd ../
 49 | }
 50 | 
 51 | checkDependencies(){
 52 | 	# Check if dependencies are met
 53 | 	if ! which yt-dlp 2>/dev/null 1>&2; then missing+=("yt-dlp"); fi
 54 | }
 55 | 
 56 | if [[ $# -eq 0 ]]; then
 57 | 	printf "Missing argument -u. Usage: ./transcriber -u <url>\n"
 58 | 	exit 1
 59 | elif [[ $# -gt 2 ]]; then
 60 | 	printf "To many arguments. Usage: ./transcriber -u <url>\n"
 61 | 	exit 1
 62 | fi
 63 | 
 64 | OPTSTRING=":u:"
 65 | while getopts "${OPTSTRING}" opt; do
 66 | 	case "${opt}" in
 67 | 		u)
 68 | 			url="${OPTARG}"
 69 | 			;;
 70 | 		:)
 71 | 			printf "Option -${OPTARG} requires an argument.\n"
 72 | 			exit 1
 73 | 			;;
 74 | 		?)
 75 | 			printf "Invalid option -${OPTARG}\n"
 76 | 			exit 1
 77 | 			;;
 78 | 	esac
 79 | done
 80 | 
 81 | missing=()
 82 | 
 83 | checkDependencies
 84 | 
 85 | if ! [[ -z "${missing[*]}" ]]; then
 86 | 	printf "%s\n" "${delimiter}"
 87 | 	printf "| [+] Missing dependencies: %s\n" "${missing[*]}"
 88 | 	printf "%s\n" "${delimiter}"
 89 | else
 90 | 	model="$(selectModel)"
 91 | 
 92 | 	printf "%s\n" "${delimiter}"
 93 | 	printf "[+] Model: %s\n" "${model}"
 94 | 	printf "%s\n" "${delimiter}"
 95 | 
 96 | 	downloadAudio "${url}"
 97 | 
 98 | 	printf "%s\n" "${delimiter}"
 99 | 	printf "[+] Transcribing audio\n"
100 | 	printf "%s\n" "${delimiter}"
101 | 
102 | 	sh -c "python3 transcriber.py ${model}"
103 | 
104 | 	cd outputs/ && rm "audio.mp3"
105 | fi
106 | 


--------------------------------------------------------------------------------
/transcriber.py:
--------------------------------------------------------------------------------
 1 | #!/usr/bin/env python3
 2 | 
 3 | import sys
 4 | import whisper
 5 | 
 6 | result_file = "outputs/results.txt"
 7 | 
 8 | model = whisper.load_model(sys.argv[1])
 9 | 
10 | # load audio and pad/trim it to fit 30 seconds
11 | audio = whisper.load_audio("outputs/audio.mp3")
12 | audio_trim = whisper.pad_or_trim(audio)
13 | 
14 | # make log-Mel spectrogram and move to the same device as the model
15 | mel = whisper.log_mel_spectrogram(audio_trim, n_mels=model.dims.n_mels).to(model.device)
16 | 
17 | # detect the spoken language and write to results file
18 | _, probs = model.detect_language(mel)
19 | language_detected = max(probs, key=probs.get)
20 | print(f"Detected language: {language_detected}")
21 | 
22 | with open(result_file, "a") as file:
23 |     file.write(f"\nDetected language: {language_detected}\n")
24 | 
25 | # decode the audio
26 | result = model.transcribe(audio)
27 | 
28 | # write the recognized text to file
29 | with open(result_file, "a") as file:
30 |     file.write("\nTranscription:\n")
31 |     file.write(result["text"])
32 |     file.write("\n")
33 | 
34 | print(f"Transcription saved at: {result_file}")
35 | 


--------------------------------------------------------------------------------