├── .gitignore ├── .vscode ├── launch.json └── settings.json ├── README.md ├── Voice.py ├── app_state.py ├── diarize.py ├── dub_line.py ├── feature_support.py ├── language_detection.py ├── loading subs pseudocode ├── logo.ico ├── logo.png ├── requirements-linux310.txt ├── requirements-min.txt ├── requirements-win-310.txt ├── requirements.txt ├── synth.py ├── tabs ├── ConfigureVoiceTab.py ├── GreeterView.py ├── ListStreams.py ├── SubtitlesTab.py └── __init__.py ├── test.py ├── utils.py ├── video.py ├── video_thumbnail_preview.png ├── vocal_isolation.py ├── weeablind.py └── weeablind.spec /.gitignore: -------------------------------------------------------------------------------- 1 | venv 2 | __pycache__ 3 | .venv 4 | output/ 5 | *.mkv 6 | *.wav 7 | *.mp3 8 | *.mp4 9 | *.webm 10 | pretrained_models 11 | tmp 12 | dist 13 | build 14 | audio_cache 15 | -------------------------------------------------------------------------------- /.vscode/launch.json: -------------------------------------------------------------------------------- 1 | { 2 | // Use IntelliSense to learn about possible attributes. 3 | // Hover to view descriptions of existing attributes. 4 | // For more information, visit: https://go.microsoft.com/fwlink/?linkid=830387 5 | "version": "0.2.0", 6 | "configurations": [ 7 | { 8 | "name": "Python: main file", 9 | "type": "python", 10 | "request": "launch", 11 | "program": "${workspaceFolder}/weeablind.py", 12 | "console": "integratedTerminal", 13 | "justMyCode": false, 14 | "subProcess": true 15 | } 16 | ] 17 | } -------------------------------------------------------------------------------- /.vscode/settings.json: -------------------------------------------------------------------------------- 1 | { 2 | "cSpell.words": [ 3 | "Weeablind" 4 | ] 5 | } 6 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Weeablind 2 | 3 | A program to dub multi-lingual media and anime using modern AI speech synthesis, diarization, language identification, and voice cloning. 4 | 5 | A blind anime girl with an audio waveform for eyes. She's got green and purple hair, a cozy green sweater, and purple barrettes. This is above the words Weea-Blind. The image was generated by Dall-E AI 6 | 7 | ## [Download Release 1.0](https://github.com/florianeagox/weeablind/releases) 8 | 9 | You can try the first binary version, which has access to the basic dubbing features with the non-AI based libraries for Windows and Linux. This is a good way to try out the program, and you can still do basic dubbing with the system voices. If you want to use the advanced features, you'll have to set them up as described in the setup tutorial. 10 | 11 | ## Live Demo and Tutorial 12 | 13 | I made this video to show off how to use all the features and everything the software can do presently. 14 | 15 | [![YouTube link to a video about the software](video_thumbnail_preview.png)](https://youtu.be/V1RDwXwmjFA) 16 | 17 | ## Why 18 | 19 | Many shows, movies, news segments, interviews, and videos will never receive proper dubs in other languages, and dubbing something from scratch can be an enormous undertaking. This presents a common accessibility hurdle for people with blindness, dyslexia, learning disabilities, or simply folks that don't enjoy reading subtitles. This program aims to create a pleasant alternative for folks facing these struggles. 20 | 21 | This software is a product of war.
My sister turned me onto my now-favorite comedy anime "The Disastrous Life of Saiki K." but Netflix never ordered a dub for the 2nd season. I'm blind and cannot and will not ever be able to read subtitles, but I MUST know how the story progresses! Netflix has forced my hand and I will bring AI-dubbed anime to the blind! 22 | 23 | ## How 24 | 25 | This project relies on a rudimentary slapping together of some state-of-the-art technologies. It uses numerous audio processing libraries and techniques to analyze and synthesize speech that tries to stay in line with the source video file. It primarily relies on ffmpeg and pydub for audio and video editing, Coqui TTS for speech synthesis, speechbrain for language identification, and pyannote.audio for speaker diarization. 26 | 27 | You have the option of dubbing every subtitle in the video, setting the start and end times, dubbing only foreign-language content, or full-blown multi-speaker dubbing with speaking rate and volume matching. 28 | 29 | ## When? 30 | 31 | This project is currently in what some might call an alpha state. The major, core functionality is in place, and it's possible to use it by cloning the repo, but it's only starting to be ready for a first release. There are numerous optimizations, UX improvements, and refactors that need to be done before I would call it finished. Stay tuned for regular updates, and feel free to extend a hand with contributions, testing, or suggestions if this is something you're interested in. 32 | 33 | ## The Name 34 | 35 | I had the idea to call the software Weeablind as a portmanteau of Weeaboo (someone a little too obsessed with anime) and blind. I might change it to something else in the future like Blindtaku, DubHub, or something similar and more catchy because the software can be used for far more than just anime. 36 | 37 | ## Setup 38 | 39 | There are currently no prebuilt binaries to download; this is something I am looking into, but many of these dependencies are not easy to bundle with something like PyInstaller. 40 | 41 | The program works best on Linux, but will also run on Windows. 42 | 43 | ### System Prerequisites 44 | You will need to install [FFmpeg](https://ffmpeg.org/download.html) on your system and make sure it's callable from the terminal or on your system PATH. 45 | 46 | For using Coqui TTS, you will also need Espeak-NG, which you can get from your package manager on Linux or [here](https://github.com/espeak-ng/espeak-ng/releases) on Windows. 47 | 48 | On Windows, pip requires MSVC Build Tools to build Coqui. You can install it here: 49 | https://visualstudio.microsoft.com/visual-cpp-build-tools/ 50 | 51 | Coqui TTS and Pyannote diarization will also both perform better if you have CUDA set up on your system to use your GPU. This should work out of the box on Linux, but getting it set up on Windows takes some doing. This [blog post](https://saturncloud.io/blog/how-to-run-mozilla-ttscoqui-tts-training-with-cuda-on-a-windows-system/) should walk you through the process. If you can't get it working, don't fret, you can still use them on your CPU. 52 | 53 | The latest version of Python works on Linux, but Spleeter only works on 3.10, and Pyannote can be finicky with newer versions too. 3.10 seems to work the best on Windows. You can get it from the Microsoft Store. 54 | 55 | ### Setup from Source 56 | To use the project, you'll need to clone the repository and install the dependencies in a virtual environment.
57 | 58 | ``` 59 | git clone https://github.com/FlorianEagox/weeablind.git 60 | cd weeablind 61 | python3.10 -m venv venv 62 | # Windows 63 | .\venv\Scripts\activate 64 | # Linux 65 | source ./venv/bin/activate 66 | ``` 67 | This project has a lot of dependencies, and pip can struggle with conflicts, so it's best to install from the lock file like this: 68 | ``` 69 | pip install -r requirements-win-310.txt --no-deps 70 | ``` 71 | You can try from the regular requirements file, but it can take a heck of a long time and sometimes requires some rejiggering. 72 | 73 | Installing the dependencies can take a hot minute and uses a lot of space (~8 GB). 74 | 75 | If you don't need certain features, for instance language filtering, you can omit speechbrain from the requirements. 76 | 77 | Once this is completed, you can run the program with 78 | 79 | ``` 80 | python weeablind.py 81 | ``` 82 | 83 | ## Usage 84 | Start by either selecting a video from your computer or pasting a link to a YT video and pressing enter. It should download the video and load the subs and audio. 85 | 86 | ### Loading a video 87 | Once a video is loaded, you can preview the subtitles that will be dubbed. If the wrong language is loaded, or the wrong audio stream, switch to the streams tab and select the correct ones. 88 | 89 | ### Cropping 90 | You can specify a start and end time if you only need to dub a section of the video, for example to skip the opening theme and credits of a show. Use timecode syntax like 2:17 and press enter. 91 | 92 | ### Configuring Voices 93 | By default, a "Sample" voice should be initialized. You can play around with different configurations and test the voice before dubbing with the "Sample Voice" button in the "Configure Voices" tab. When you have parameters you're happy with, clicking "Update Voices" will re-assign it to that slot. If you choose the SYSTEM TTS engine, the program will use Windows' SAPI5 Narrator or Linux espeak voices by default. This is extremely fast but sounds very robotic. Selecting Coqui gives you a TON of options to play around with, but you will be prompted to download often very heavy TTS models. VCTK/VITS is my favorite model to dub with as it's very quick, even on CPU, and there are hundreds of speakers to choose from. It is loaded by default. If you have run diarization, you can select different voices from the listbox and change their properties as well. 94 | 95 | ### Language Filtering 96 | In the subtitles tab, you can filter the subtitles to exclude lines spoken in your selected language so only the foreign language gets dubbed. This is useful for multi-lingual videos, but not videos all in one language. 97 | 98 | ### Diarization 99 | Running diarization will attempt to assign the correct speaker to all the subtitles and generate random voices for the total number of speakers detected. In the future, you'll be able to specify the diarization pipeline and number of speakers if you know them ahead of time. Diarization is only useful for videos with multiple speakers, and the accuracy can vary massively. 100 | 101 | ### Background Isolation 102 | In the "Streams" tab, you can run vocal isolation, which will attempt to remove the vocals from your source video track but retain the background. If you're using a multi-lingual video and running language filtering as well, you'll need to run that first to keep the English (or whatever the source language is) vocals. 103 | 104 | ### Dubbing 105 | Once you've configured things how you like, you can press the big, JUICY run dubbing button. This can take a while to run. Once completed, you should have something like "MyVideo-dubbed.mkv" in the `output` directory. This is your finished video! 106 |
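Under the hood, the dubbing step is roughly the loop below (a simplified, untested sketch based on `DubbedLine.dub_line_file` from `dub_line.py`; `dub_all` and its arguments are illustrative names, not the actual entry point, and the real program also handles cropping, error logging, and mixing the result back over the source audio track):

```
from pydub import AudioSegment

def dub_all(subs, total_duration):
	# Start from silence the length of the video, then overlay each dubbed line at its timestamp
	mix = AudioSegment.silent(duration=total_duration * 1000, frame_rate=22050)
	for sub in subs:
		# Synthesize the line with its assigned voice, time-stretched to fit the subtitle duration
		segment, _path = sub.dub_line_file(match_rate=True, match_volume=False)
		mix = mix.overlay(segment, position=sub.start * 1000)
	return mix
```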
107 | ## Things to do 108 | - ~~A better filtering system for language detection. Maybe inclusive and exclusive or confidence threshold~~ 109 | - Find some less copyrighted multi-lingual / non-English content to display demos publicly 110 | - ~~de-anglicize it so the user can select their target language instead of just English~~ 111 | - FIX PYDUB'S STUPID ARRAY DISTORTION so we don't have to perform 5 IO operations per dub!!! 112 | - ~~run a vocal isolation / remover on the source audio to remove / mitigate the original speakers?~~ 113 | - ~~A proper setup guide for all platforms~~ 114 | - ~~remove or fix the broken espeak implementation to be cross-platform~~ 115 | - ~~Uninitialized singletons for heavy models upon startup (e.g. only initialize pyannote/speechbrain pipelines when needed)~~ 116 | - Abstraction for singletons of Coqui voices using the same model to reduce memory footprint 117 | - ~~GUI tab to list and select audio / subtitle streams w/ FFMPEG~~ 118 | - ~~Move the tabs into their own classes~~ 119 | - ~~Add labels and screen reader landmarks to all the controls~~ 120 | - ~~Single speaker or multi speaker control switch~~ 121 | - ~~Download YouTube video with Closed Captions~~ 122 | - ~~GUI to select start and end time for dubbing~~ 123 | - Throw up a Flask server on my website so you can try it with minimal features. 124 | - ~~Use OCR to generate subtitles for videos that don't have sub streams~~ 125 | - ~~Use OCR for non-text based subtitles~~ 126 | - ~~Make a cool logo?~~ 127 | - ~~Learn how to package Python programs as binaries to make releases~~ 128 | - ~~Remove the copyrighted content from this repo (sorry not sorry TV Tokyo)~~ 129 | - ~~Support for all subtitle formats~~ 130 | - Maybe slap in an ASR library for videos without subtitles? 131 | - Maybe support for magnet URLs or the arrLib to pirate media (who knows???) 132 | 133 | ### Diarization 134 | - Filter subtitles by the selected voice from the listbox 135 | - Select from multiple diarization models / pipelines 136 | - Optimize audio tracks for diarization by isolating speech lines based on subtitle timings 137 | - Investigate Diart? 138 | 139 | ### TTS 140 | 141 | - ~~Rework the speed control to use PyDub to speed up audio.~~ 142 | - ~~match the volume of the speaker to TTS~~ 143 | - Checkbox to remove sequential subtitle entries and entries that are tiny, e.g.
"nom" "nom" "nom" "nom"~~ 144 | - ~~investigate voice conversion?~~ 145 | - Build an asynchronous queue of operations to perform 146 | - ~~Asynchronous GUI for Coqui model downloads~~ 147 | - Add support for MyCroft Mimic 3 148 | - Add Support for PiperTTS 149 | 150 | ### Cloning 151 | - ~~Create a cloning mode to select subtitles and export them to a dataset or wav compilation for Coqui XTTS~~ 152 | - Use diaries and subtitles to isolate and build training datasets 153 | - Build a tool to streamline the manual creation of datasets 154 | 155 | ###### (oh god that's literally so many things, the scope of this has gotten so big how will this ever become a thing) 156 | -------------------------------------------------------------------------------- /Voice.py: -------------------------------------------------------------------------------- 1 | from enum import Enum 2 | import abc 3 | import os 4 | import threading 5 | import feature_support 6 | import pyttsx3 7 | import numpy as np 8 | import time 9 | if feature_support.espeak_supported: 10 | import espeakng 11 | if feature_support.coqui_supported: 12 | from TTS.api import TTS 13 | from TTS.utils import manage 14 | 15 | class Voice(abc.ABC): 16 | class VoiceType(Enum): 17 | ESPEAK = ("ESpeak", feature_support.espeak_supported) 18 | COQUI = ("Coqui TTS", feature_support.coqui_supported) 19 | SYSTEM = ("System Voices", True) 20 | 21 | def __new__(cls, voice_type, init_args=[], name="Unnamed"): 22 | if cls is Voice: 23 | if voice_type == cls.VoiceType.ESPEAK: 24 | return super().__new__(ESpeakVoice) 25 | elif voice_type == cls.VoiceType.COQUI: 26 | return super().__new__(CoquiVoice) 27 | elif voice_type == cls.VoiceType.SYSTEM: 28 | return super().__new__(SystemVoice) 29 | else: 30 | return super().__new__(cls) 31 | 32 | def __init__(self, voice_type, init_args=[], name="Unnamed"): 33 | self.voice = None 34 | self.voice_type = voice_type 35 | self.name = name 36 | self.voice_option = None 37 | 38 | @abc.abstractmethod 39 | def speak(self, text, file_name): 40 | pass 41 | 42 | def set_speed(self, speed): 43 | pass 44 | 45 | @abc.abstractmethod 46 | def set_voice_params(self, voice=None, pitch=None): 47 | pass 48 | 49 | @abc.abstractmethod 50 | def list_voice_options(self): 51 | pass 52 | 53 | def calibrate_rate(self): 54 | output_path = './output/calibration.wav' 55 | calibration_phrase_long = "In the early morning light, a vibrant scene unfolds as the quick brown fox jumps gracefully over the lazy dog. The fox's russet fur glistens in the sun, and its swift movements captivate onlookers. With a leap of agility, it soars through the air, showcasing its remarkable prowess. Meanwhile, the dog, relaxed and unperturbed, watches with half-closed eyes, acknowledging the fox's spirited display. The surrounding nature seems to hold its breath, enchanted by this charming spectacle. The gentle rustling of leaves and the distant chirping of birds provide a soothing soundtrack to this magical moment. The two animals, one lively and the other laid-back, showcase the beautiful harmony of nature, an ageless dance that continues to mesmerize all who witness it." 56 | calibration_phrase_chair = "A chair is a piece of furniture with a raised surface used to sit on, commonly for use by one person. Chairs are most often supported by four legs and have a back; however, a chair can have three legs or could have a different shape. A chair without a back or arm rests is a stool, or when raised up, a bar stool." 57 | calibration_phrase = "Hello? Testing, testing. Is.. 
is this thing on? Ah! Hello Gordon! I'm... assuming that's your real name... You wouldn't lie to us. Would you? Well... You finally did it! You survived the resonance cascade! You brought us all to hell and back, alive! You made it to the ultimate birthday bash at the end of the world! You beat the video game! And... now I imagine you'll... shut it down. Move on with your life. Onwards and upwards, ay Gordon? I don't.. know... how much longer I have to send this to you so I'll try to keep it brief. Not my specialty. Perhaps this is presumptuous of me but... Must this really be the end of our time together? Perhaps you could take the science team's data, transfer us somewhere else, hmm? Now... it doesn't have to be Super Punch-Out for the Super Nintendo Entertainment System. Maybe a USB drive, or a spare floppy disk. You could take us with you! We could see the world! We could... I'm getting a little ahead of myself, surely. Welp! The option's always there! You changed our lives, Gordon. I'd like to think it was for the better. And I don't know what's going to happen to us once you exit the game for good. But I know we'll never forget you. I hope you won't forget us. Well... This is where I get off. Goodbye Gordon!" 58 | self.speak(calibration_phrase, output_path) 59 | 60 | def get_wpm(words, duration): 61 | return (len(words.split(' ')) / duration * 60) 62 | 63 | class ESpeakVoice(Voice): 64 | def __init__(self, init_args=[], name="Unnamed"): 65 | super().__init__(Voice.VoiceType.ESPEAK, init_args, name) 66 | self.voice = espeakng.Speaker() 67 | self.voice_option = self.voice.voice 68 | 69 | def speak(self, text, file_name): 70 | self.voice.say(text, wait4prev=True, export_path=file_name) 71 | time.sleep(0.1) # We need to replace this with something that actually checks if the command output is finished tbh 72 | return file_name 73 | 74 | def set_voice_params(self, voice=None, pitch=None): 75 | if voice: 76 | self.voice.voice = self.voice_option = voice 77 | if pitch: 78 | self.voice.pitch = pitch 79 | 80 | def list_voice_options(self): 81 | return ["af","sq","am","ar","an","hy","hyw","as","az","ba","cu","eu","be","bn","bpy","bs","bg","my","ca","chr","yue","hak","haw","cmn","hr","cs","da","nl","en-us","en","en-029","en-gb-x-gbclan","en-gb-x-rp","en-gb-scotland","en-gb-x-gbcwmd","eo","et","fa","fa-latn","fi","fr-be","fr","fr-ch","ga","gd","ka","de","grc","el","kl","gn","gu","ht","he","hi","hu","is","id","ia","io","it","ja","kn","kok","ko","ku","kk","ky","la","lb","ltg","lv","lfn","lt","jbo","mi","mk","ms","ml","mt","mr","nci","ne","nb","nog","or","om","pap","py","pl","pt-br","qdb","qu","quc","qya","pt","pa","piqd","ro","ru","ru-lv","uk","sjn","sr","tn","sd","shn","si","sk","sl","smj","es","es-419","sw","sv","ta","th","tk","tt","te","tr","ug","ur","uz","vi-vn-x-central","vi","vi-vn-x-south","cy"] 82 | 83 | class CoquiVoice(Voice): 84 | def __init__(self, init_args=None, name="Coqui Voice"): 85 | super().__init__(Voice.VoiceType.COQUI, init_args, name) 86 | self.voice = TTS().to('cuda' if feature_support.gpu_supported else 'cpu') 87 | self.langs = ["All Languages"] + list({lang.split("/")[1] for lang in self.voice.models}) 88 | self.langs.sort() 89 | self.selected_lang = 'en' 90 | self.is_multispeaker = False 91 | self.speaker = None 92 | self.is_clonable = False 93 | self.use_vc = False 94 | self.speaker_wav = "" 95 | def speak(self, text, file_path=None): 96 | if file_path: 97 | if not self.use_vc or "xtts" in self.voice.model_name: 98 | return self.voice.tts_to_file( 99 | text, 100 | 
file_path=file_path, 101 | speaker=self.speaker, 102 | language='en' if self.voice.is_multi_lingual else None, 103 | speaker_wav=self.speaker_wav 104 | ) 105 | else: 106 | self.voice.tts_with_vc_to_file( 107 | text, 108 | file_path=file_path, 109 | speaker=self.speaker, 110 | language='en' if self.voice.is_multi_lingual else None, 111 | speaker_wav=self.speaker_wav 112 | ) 113 | return file_path 114 | else: 115 | return np.array(self.voice.tts( 116 | text, 117 | speaker=self.speaker, 118 | language= 'en' if self.voice.is_multi_lingual else None, 119 | speaker_wav=self.speaker_wav 120 | )) 121 | 122 | def set_voice_params(self, voice=None, speaker=None, speaker_wav=None, use_vc=None, progress=None): 123 | if voice and voice != self.voice_option: 124 | if progress: 125 | progress(0, "downloading") 126 | download_thread = threading.Thread(target=self.voice.load_tts_model_by_name, args=(voice,)) 127 | download_thread.start() 128 | while(download_thread.is_alive()): 129 | # I'll remove this check if they accept my PR c: 130 | bar = manage.ModelManager.tqdm_progress if hasattr(manage.ModelManager, "tqdm_progress") else None 131 | if bar: 132 | progress_value = int(100*(bar.n / bar.total)) 133 | progress(progress_value, "downloading") 134 | time.sleep(0.25) # Adjust the interval as needed 135 | progress(-1, "done!") 136 | else: 137 | self.voice.load_tts_model_by_name(voice) 138 | self.voice_option = self.voice.model_name 139 | self.is_multispeaker = self.voice.is_multi_speaker 140 | if "xtts" in self.voice.model_name or self.use_vc: 141 | self.is_clonable = True 142 | # self.is_multispeaker = False 143 | if use_vc is not None: 144 | self.use_vc = use_vc 145 | if speaker_wav is not None: 146 | self.speaker_wav = speaker_wav 147 | if speaker is not None: 148 | self.speaker = speaker 149 | if "xtts" in self.voice.model_name and self.use_vc: 150 | self.speaker = None 151 | 152 | def list_voice_options(self): 153 | return self.voice.models 154 | 155 | def is_model_downloaded(self, model_name): 156 | return os.path.exists(os.path.join(self.voice.manager.output_prefix, self.voice.manager._set_model_item(model_name)[1])) 157 | 158 | def list_speakers(self): 159 | return self.voice.speakers if self.voice.is_multi_speaker else [] 160 | 161 | class SystemVoice(Voice): 162 | def __init__(self, init_args=[], name="Unnamed"): 163 | super().__init__(Voice.VoiceType.SYSTEM, init_args, name) 164 | self.voice = pyttsx3.init() 165 | self.voice_option = self.voice.getProperty('voice') 166 | 167 | def speak(self, text, file_name): 168 | self.voice.save_to_file(text, file_name) 169 | self.voice.runAndWait() 170 | return file_name 171 | 172 | def set_speed(self, speed): 173 | self.voice.setProperty('rate', speed) 174 | 175 | def set_voice_params(self, voice=None, pitch=None): 176 | if voice: 177 | print(voice, self.voice.getProperty('voices')[self.list_voice_options().index(voice)].id) 178 | self.voice.setProperty('voice', self.voice.getProperty('voices')[self.list_voice_options().index(voice)].id) 179 | self.voice_option = voice # self.voice.getProperty('voice') 180 | 181 | def list_voice_options(self): 182 | return [voice.name for voice in self.voice.getProperty('voices')] 183 | -------------------------------------------------------------------------------- /app_state.py: -------------------------------------------------------------------------------- 1 | from Voice import Voice 2 | import feature_support 3 | from video import Video 4 | import sys 5 | 6 | platform = sys.platform 7 | video: Video = None 8 | if 
feature_support.coqui_supported: 9 | speakers = [Voice(Voice.VoiceType.COQUI, name="Sample")] 10 | speakers[0].set_voice_params('tts_models/en/vctk/vits', 'p326') # p340 11 | else: 12 | speakers = [Voice(Voice.VoiceType.SYSTEM, name="Sample")] 13 | current_speaker = speakers[0] 14 | sample_speaker = current_speaker 15 | -------------------------------------------------------------------------------- /diarize.py: -------------------------------------------------------------------------------- 1 | # This file contains all functions related to diarizing a video including optimization and processing a speech diary (rttm file) 2 | # These functions use a functional approach as I didn't wanted to group them and not bloat the video class with such specific functions 3 | # Perhaps going forward I should abstract diary entries as their own objects similar to dub_line, but I haven't decidded yet as diaries might be useful for voice cloning as well 4 | 5 | import app_state 6 | import utils 7 | from Voice import Voice 8 | import random 9 | import feature_support 10 | if feature_support.torch_supported: 11 | import torchaudio.transforms as T 12 | import torchaudio 13 | if feature_support.diarization_supported: 14 | from pyannote.audio import Pipeline 15 | 16 | pipeline = None 17 | 18 | # Read RTTM files generated by Pyannote into an array containing the speaker, start, and end of their speech in the audio 19 | def load_diary(file): 20 | diary = [] 21 | with open(file, 'r', encoding='utf-8') as diary_file: 22 | for line in diary_file.read().strip().split('\n'): 23 | line_values = line.split(' ') 24 | diary.append([line_values[7], float(line_values[3]), float(line_values[4])]) 25 | total_speakers = len(set(line[0] for line in diary)) 26 | app_state.speakers = initialize_speakers(total_speakers) 27 | return diary 28 | 29 | # Time Shift the speech diary to be in line with the start time 30 | def update_diary_timing(diary, start_time): 31 | return [[int(line[0].split('_')[1]), line[1] + start_time, line[2]] for line in diary] 32 | 33 | def initialize_speakers(speaker_count): 34 | speakers = [] 35 | speaker_options = app_state.sample_speaker.list_speakers() 36 | for i in range(speaker_count): 37 | speakers.append(Voice(Voice.VoiceType.COQUI, f"Voice {i}")) 38 | speakers[i].set_voice_params('tts_models/en/vctk/vits', random.choice(speaker_options)) 39 | return speakers 40 | 41 | def find_nearest_speaker(diary, sub): 42 | return diary[ 43 | utils.find_nearest( 44 | [diary_entry[1] for diary_entry in diary], 45 | sub.start 46 | ) 47 | ][0] 48 | 49 | 50 | 51 | def optimize_audio_diarization(video): 52 | if video.vocal_track: 53 | crop = video.vocal_track 54 | else: 55 | crop = video.crop_audio(True) 56 | waveform, sample_rate = torchaudio.load(crop) 57 | # Apply noise reduction 58 | noise_reduce = T.Vad(sample_rate=sample_rate) 59 | clean_waveform = noise_reduce(waveform) 60 | 61 | # Normalize audio 62 | normalize = T.Resample(orig_freq=sample_rate, new_freq=sample_rate) 63 | normalized_waveform = normalize(clean_waveform) 64 | 65 | return normalized_waveform, sample_rate 66 | 67 | def run_diarization(video): 68 | global pipeline # Probably should move this to app state? 
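	# Lazily create the pyannote diarization pipeline on first use and move it to the GPU;
	# the gated "pyannote/speaker-diarization-3.1" model requires a Hugging Face access token
	# (passed below as use_auth_token).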
69 | if not pipeline: 70 | pipeline = Pipeline.from_pretrained("pyannote/speaker-diarization-3.1", use_auth_token="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") 71 | import torch 72 | pipeline.to(torch.device("cuda")) 73 | output = utils.get_output_path(video.file, ".rttm") 74 | optimized, sample_rate = optimize_audio_diarization(video) 75 | diarization = pipeline({"waveform": optimized, "sample_rate": sample_rate}) 76 | with open(output, "w") as rttm: 77 | diarization.write_rttm(rttm) 78 | diary = load_diary(output) 79 | diary = update_diary_timing(diary, video.start_time) 80 | for sub in video.subs_adjusted: 81 | sub.voice = find_nearest_speaker(diary, sub) 82 | -------------------------------------------------------------------------------- /dub_line.py: -------------------------------------------------------------------------------- 1 | from dataclasses import dataclass 2 | import ffmpeg 3 | import utils 4 | import app_state 5 | import srt 6 | from re import compile, sub as substitute 7 | from pydub import AudioSegment 8 | from audiotsm import wsola 9 | from audiotsm.io.wav import WavReader, WavWriter 10 | from audiotsm.io.array import ArrayReader, ArrayWriter 11 | import numpy as np 12 | from language_detection import detect_language 13 | 14 | remove_xml = compile(r'<[^>]+>|\{[^}]+\}') 15 | language_identifier_model = None # EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp") 16 | 17 | @dataclass 18 | class DubbedLine: 19 | start: float 20 | end: float 21 | text: str 22 | index: int 23 | voice: int = 0 24 | language: str = "" 25 | 26 | def update_voice(self, voice): 27 | self.voice = voice 28 | 29 | # This is highly inefficient as it writes and reads the same file many times 30 | def dub_line_file(self, match_rate=True, match_volume=True, output=False): 31 | output_path = utils.get_output_path(str(self.index), '.wav', path='files') 32 | tts_audio = app_state.speakers[self.voice].speak(self.text, output_path) 33 | if match_rate and not self.end == -1: 34 | rate_adjusted = self.match_rate(tts_audio, self.end-self.start) 35 | segment = AudioSegment.from_wav(rate_adjusted) 36 | else: 37 | segment = AudioSegment.from_wav(tts_audio) 38 | if match_volume: 39 | segment = self.match_volume(app_state.video.get_snippet(self.start, self.end), segment) 40 | if output: 41 | segment.export(output_path, format='wav') 42 | return segment, output_path 43 | 44 | # This should ideally be a much more efficient way to dub. 45 | # All functions should pass around numpy arrays rather than reading and writting files. 
For some reason though, it gives distroted results 46 | def dub_line_ram(self, output=True): 47 | output_path = utils.get_output_path(str(self.index), '.wav', path='files') 48 | tts_audio = app_state.speakers[self.voice].speak(self.text) 49 | rate_adjusted = self.match_rate_ram(tts_audio, self.end-self.start) 50 | data = rate_adjusted / np.max(np.abs(rate_adjusted)) 51 | # This causes some kind of wacky audio distrotion we NEED to fix ;C 52 | audio_as_int = (data * (2**15)).astype(np.int16).tobytes() 53 | segment = AudioSegment( 54 | audio_as_int, 55 | frame_rate=22050, 56 | sample_width=2, 57 | channels=1 58 | ) 59 | if output: 60 | segment.export(output_path, format='wav') 61 | return segment 62 | 63 | def match_rate(self, target_path, source_duration, destination_path=None, clamp_min=0, clamp_max=4): 64 | if destination_path == None: 65 | destination_path = target_path.split('.')[0] + '-timeshift.wav' 66 | duration = float(ffmpeg.probe(target_path)["format"]["duration"]) 67 | rate = duration*1/source_duration 68 | rate = np.clip(rate, clamp_min, clamp_max) 69 | with WavReader(target_path) as reader: 70 | with WavWriter(destination_path, reader.channels, reader.samplerate) as writer: 71 | tsm = wsola(reader.channels, speed=rate) 72 | tsm.run(reader, writer) 73 | return destination_path 74 | 75 | def match_rate_ram(self, target, source_duration, outpath=None, clamp_min=0.8, clamp_max=2.5): 76 | num_samples = len(target) 77 | target = target.reshape(1, num_samples) 78 | duration = num_samples / 22050 79 | rate = duration*1/source_duration 80 | rate = np.clip(rate, clamp_min, clamp_max) 81 | reader = ArrayReader(target) 82 | tsm = wsola(reader.channels, speed=rate) 83 | if not outpath: 84 | rate_adjusted = ArrayWriter(channels=1) 85 | tsm.run(reader, rate_adjusted) 86 | return rate_adjusted.data 87 | else: 88 | rate_adjusted = WavWriter(outpath, 1, 22050) 89 | tsm.run(reader, rate_adjusted) 90 | rate_adjusted.close() 91 | return outpath 92 | 93 | def match_volume(self, source_snippet, target): 94 | # ratio = source_snippet.rms / (target.rms | 1) 95 | ratio = source_snippet.dBFS - target.dBFS 96 | # adjusted_audio = target.apply_gain(ratio) 97 | adjusted_audio = target + ratio 98 | return adjusted_audio 99 | # adjusted_audio.export(output_path, format="wav") 100 | 101 | def get_language(self, source_snippet): 102 | if not self.language: 103 | self.language = detect_language(source_snippet) 104 | return self.language 105 | 106 | 107 | def filter_junk(subs, minimum_duration=0.1, remove_repeats=True): 108 | filtered = [] 109 | previous = "" 110 | for sub in subs: 111 | if (sub.end - sub.start) > minimum_duration: 112 | if sub.text != previous: 113 | if previous and sub.text.split("\n")[0] in previous: 114 | sub.text = "".join(sub.text.split("\n")[-1:]) 115 | filtered.append(sub) 116 | previous = sub.text 117 | return filtered 118 | 119 | # This function is designed to handle two cases 120 | # 1 We just have a path to an srt that we want to import 121 | # 2 You have a file containing subs, but not srt (a video file, a vtt, whatever) 122 | # In this case, we must extract or convert the subs to srt, and then read it in (export then import) 123 | def load_subs(import_path="", extract_subs_path=False, filter=True): 124 | if extract_subs_path: # For importing an external subtitles file 125 | ( 126 | ffmpeg 127 | .input(extract_subs_path) 128 | .output(import_path) 129 | .global_args('-loglevel', 'error') 130 | .run(overwrite_output=True) 131 | ) 132 | with open(import_path, "r", encoding="utf-8") as 
f: 133 | original_subs = list(srt.parse(f.read())) 134 | return filter_junk([ 135 | DubbedLine( 136 | sub.start.total_seconds(), 137 | sub.end.total_seconds(), 138 | substitute(remove_xml, '', sub.content), 139 | sub.index 140 | ) 141 | for sub in original_subs 142 | ]) 143 | -------------------------------------------------------------------------------- /feature_support.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import importlib.util 4 | import static_ffmpeg 5 | import subprocess 6 | from utils import get_output_path 7 | 8 | def is_module_available(module_name): 9 | try: 10 | return importlib.util.find_spec(module_name) is not None 11 | except Exception as e: 12 | print(f"failed to import {module_name}: {e}") 13 | return False 14 | 15 | def is_executable(program): 16 | try: 17 | subprocess.run(program, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, check=True) 18 | return True 19 | except Exception as e: 20 | return False 21 | 22 | def check_ffmpeg(): 23 | # First we'll check if FFmpeg was installed automatically by weeablind, a "crumb" will be left behind if so 24 | ffmpeg_path = get_output_path('', '', path='ffmpeg') 25 | if os.path.exists(os.path.join(ffmpeg_path, 'installed.crumb')): 26 | # The dir with the ffmpeg binary will be named based dont he platform, and we need to know wthis to locate it 27 | os.environ["PATH"] = os.pathsep.join([os.path.join(ffmpeg_path, sys.platform), os.environ["PATH"]]) 28 | return is_executable(["ffmpeg", "-version"]) 29 | 30 | ffmpeg_supported = check_ffmpeg() # "ffmpeg" in os.getenv('PATH').lower() 31 | diarization_supported = is_module_available("pyannote") 32 | ocr_supported = is_module_available("video_ocr") 33 | nostril_supported = is_module_available("nostril") 34 | language_detection_supported = is_module_available("speechbrain") 35 | vocal_isolation_supported = is_module_available("spleeter") 36 | downloads_supported = is_module_available("yt_dlp") 37 | espeak_supported = is_module_available("espeakng") and (is_executable(["espeak", "--version"]) or is_executable(["espeak-ng", "--version"])) 38 | coqui_supported = is_module_available("TTS") # and espeak_supported 39 | torch_supported = is_module_available("torch.cuda") 40 | gpu_supported = False 41 | if torch_supported: 42 | from torch.cuda import is_available 43 | gpu_supported = is_available() 44 | # TESTING 45 | # language_detection_supported = coqui_supported = False 46 | 47 | def install_ffmpeg(): 48 | static_ffmpeg.add_paths(False, get_output_path('', '', path='ffmpeg')) 49 | check_ffmpeg() 50 | 51 | # Windows has some voices PyTTSx3 can't access for some reason unless you add them to a different part of the registry, so this will try to do that 52 | def patch_onecore_voices(): 53 | import win32security 54 | import win32api 55 | 56 | old_path = r"SOFTWARE\Microsoft\Speech_OneCore\Voices\Tokens" 57 | 58 | priv_flags = win32security.TOKEN_ADJUST_PRIVILEGES | win32security.TOKEN_QUERY 59 | hToken = win32security.OpenProcessToken (win32api.GetCurrentProcess (), priv_flags) 60 | privilege_id = win32security.LookupPrivilegeValue (None, "SeBackupPrivilege") 61 | win32security.AdjustTokenPrivileges (hToken, 0, [(privilege_id, win32security.SE_PRIVILEGE_ENABLED)]) 62 | 63 | backup_path = get_output_path("onecore", ".reg") 64 | 65 | try: 66 | subprocess.run(["reg", "export", f"HKEY_LOCAL_MACHINE\\{old_path}", backup_path, "-y"], check=True) 67 | # winreg.SaveKey(key, backup_path) 68 | except PermissionError as e: 69 | 
print("Permission denied. Please run the script with admin privileges.", e) 70 | return 71 | 72 | # Replace the old path with the new path in the exported .reg file 73 | with open(backup_path, 'r', encoding='utf-16') as f: 74 | reg_data = f.read() 75 | 76 | reg_data = reg_data.replace("_OneCore", "") 77 | 78 | # Write the modified .reg file 79 | modified_data_path = get_output_path("modified_tokens", ".reg") 80 | with open(modified_data_path, 'w', encoding='utf-16') as f: 81 | f.write(reg_data) 82 | 83 | # Import the modified .reg file to update the registry 84 | os.system("regedit /s " + modified_data_path) 85 | print("Registry modification complete.") 86 | -------------------------------------------------------------------------------- /language_detection.py: -------------------------------------------------------------------------------- 1 | # This is used to detect the spoken language in an audio file 2 | # I wanted to abstract it to it's own file, just like vocal isolation & diarization 3 | import feature_support 4 | if feature_support.language_detection_supported: 5 | from speechbrain.inference.classifiers import EncoderClassifier 6 | 7 | language_identifier_model = None 8 | 9 | def detect_language(file): 10 | global language_identifier_model 11 | if not language_identifier_model: 12 | language_identifier_model = EncoderClassifier.from_hparams(source="speechbrain/lang-id-voxlingua107-ecapa", savedir="tmp") #, run_opts={"device":"cuda"}) 13 | signal = language_identifier_model.load_audio(file) 14 | prediction = language_identifier_model.classify_batch(signal) 15 | return prediction[3][0].split(' ')[1] 16 | -------------------------------------------------------------------------------- /loading subs pseudocode: -------------------------------------------------------------------------------- 1 | user loads video 2 | video is file: 3 | file has subs? 4 | load the first subs 5 | display all subs 6 | user selects new subs: 7 | load subs with given stream index 8 | video is YT link: 9 | download all subs (if any) 10 | subs? 11 | display the subs 12 | user selects subs (vtt) 13 | convert the subs to srt 14 | load subs 15 | there are no subs!?!?!: 16 | This is the spooky zone 17 | offer to upload a subtitle file? 18 | 19 | offer to attempt video OCR??? 20 | attempt ASR + Translation? This would be fucking insane don't do this please don't add this feature this is literally impossible, right??? 
-------------------------------------------------------------------------------- /logo.ico: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlorianEagox/WeeaBlind/c282e341e820a6251950c1a6f2ce971eb5f49751/logo.ico -------------------------------------------------------------------------------- /logo.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlorianEagox/WeeaBlind/c282e341e820a6251950c1a6f2ce971eb5f49751/logo.png -------------------------------------------------------------------------------- /requirements-linux310.txt: -------------------------------------------------------------------------------- 1 | absl-py==2.1.0 2 | aiohttp==3.9.5 3 | aiosignal==1.3.1 4 | alembic==1.13.1 5 | annotated-types==0.6.0 6 | antlr4-python3-runtime==4.9.3 7 | anyascii==0.3.2 8 | anyio==3.7.1 9 | asteroid-filterbanks==0.4.0 10 | astunparse==1.6.3 11 | async-timeout==4.0.3 12 | attrs==23.2.0 13 | audioread==3.0.1 14 | audiotsm==0.1.2 15 | Babel==2.15.0 16 | bangla==0.0.2 17 | blis==0.7.11 18 | bnnumerizer==0.0.2 19 | bnunicodenormalizer==0.1.6 20 | Brotli==1.1.0 21 | cachetools==5.3.3 22 | catalogue==2.0.10 23 | certifi==2024.2.2 24 | cffi==1.16.0 25 | charset-normalizer==3.3.2 26 | click==7.1.2 27 | cloudpathlib==0.16.0 28 | colorama==0.4.6 29 | colorlog==6.8.2 30 | confection==0.1.4 31 | contourpy==1.2.1 32 | coqpit==0.0.17 33 | coqui-tts==0.23.1 34 | coqui-tts-trainer==0.1.1 35 | cycler==0.12.1 36 | cymem==2.0.8 37 | Cython==3.0.10 38 | dateparser==1.1.8 39 | decorator==5.1.1 40 | docopt==0.6.2 41 | einops==0.8.0 42 | encodec==0.1.1 43 | espeakng==1.0.3 44 | exceptiongroup==1.2.1 45 | ffmpeg-python==0.2.0 46 | filelock==3.14.0 47 | flatbuffers==1.12 48 | fonttools==4.51.0 49 | frozenlist==1.4.1 50 | fsspec==2024.3.1 51 | future==1.0.0 52 | g2pkk==0.1.2 53 | gast==0.4.0 54 | google-auth==2.29.0 55 | google-auth-oauthlib==0.4.6 56 | google-pasta==0.2.0 57 | greenlet==3.0.3 58 | grpcio==1.63.0 59 | gruut==2.2.3 60 | gruut-ipa==0.13.0 61 | gruut-lang-de==2.0.0 62 | gruut-lang-en==2.0.0 63 | gruut-lang-es==2.0.0 64 | gruut-lang-fr==2.0.2 65 | h11==0.12.0 66 | h2==4.1.0 67 | h5py==3.11.0 68 | hangul-romanize==0.1.0 69 | hpack==4.0.0 70 | httpcore==0.13.7 71 | httpx==0.19.0 72 | huggingface-hub==0.23.0 73 | humanize==4.9.0 74 | hyperframe==6.0.1 75 | HyperPyYAML==1.2.2 76 | idna==3.7 77 | inflect==7.2.1 78 | iniconfig==2.0.0 79 | jamo==0.4.1 80 | jieba==0.42.1 81 | Jinja2==3.1.4 82 | joblib==1.4.2 83 | jsonlines==1.2.0 84 | julius==0.2.7 85 | keras==2.9.0 86 | Keras-Preprocessing==1.1.2 87 | kiwisolver==1.4.5 88 | langcodes==3.4.0 89 | language_data==1.2.0 90 | lazy_loader==0.4 91 | libclang==18.1.1 92 | librosa==0.10.2 93 | lightning==2.2.4 94 | lightning-utilities==0.11.2 95 | llvmlite==0.42.0 96 | Mako==1.3.3 97 | marisa-trie==1.1.1 98 | Markdown==3.6 99 | markdown-it-py==3.0.0 100 | MarkupSafe==2.1.5 101 | matplotlib==3.8.4 102 | mdurl==0.1.2 103 | more-itertools==10.2.0 104 | mpmath==1.3.0 105 | msgpack==1.0.8 106 | multidict==6.0.5 107 | murmurhash==1.0.10 108 | mutagen==1.47.0 109 | networkx==2.8.8 110 | nltk==3.8.1 111 | norbert==0.2.1 112 | nostril @ git+https://github.com/casics/nostril.git@fbc0c91249283a9fbc9036206391ce1138826fd3 113 | num2words==0.5.13 114 | numba==0.59.1 115 | numpy==1.26.4 116 | nvidia-cublas-cu12==12.1.3.1 117 | nvidia-cuda-cupti-cu12==12.1.105 118 | nvidia-cuda-nvrtc-cu12==12.1.105 119 | nvidia-cuda-runtime-cu12==12.1.105 120 | 
nvidia-cudnn-cu12==8.9.2.26 121 | nvidia-cufft-cu12==11.0.2.54 122 | nvidia-curand-cu12==10.3.2.106 123 | nvidia-cusolver-cu12==11.4.5.107 124 | nvidia-cusparse-cu12==12.1.0.106 125 | nvidia-nccl-cu12==2.20.5 126 | nvidia-nvjitlink-cu12==12.4.127 127 | nvidia-nvtx-cu12==12.1.105 128 | oauthlib==3.2.2 129 | omegaconf==2.3.0 130 | opt-einsum==3.3.0 131 | optuna==3.6.1 132 | packaging==24.0 133 | pandas==1.5.3 134 | pillow==10.3.0 135 | plac==1.4.3 136 | platformdirs==4.2.1 137 | pluggy==1.5.0 138 | pooch==1.8.1 139 | preshed==3.0.9 140 | pretty-errors==1.2.25 141 | primePy==1.3 142 | protobuf==3.20.0 143 | psutil==5.9.8 144 | pyannote.audio==3.2.0 145 | pyannote.core==5.0.0 146 | pyannote.database==5.0.1 147 | pyannote.metrics==3.2.1 148 | pyannote.pipeline==3.0.1 149 | pyasn1==0.6.0 150 | pyasn1_modules==0.4.0 151 | pycparser==2.22 152 | pycryptodomex==3.20.0 153 | pydantic==2.7.1 154 | pydantic_core==2.18.2 155 | pydub==0.25.1 156 | Pygments==2.18.0 157 | pynndescent==0.5.12 158 | pyparsing==3.1.2 159 | pypinyin==0.51.0 160 | pysbd==0.3.4 161 | pytest==8.2.0 162 | python-crfsuite==0.9.10 163 | python-dateutil==2.9.0.post0 164 | pytorch-lightning==2.2.4 165 | pytorch-metric-learning==2.5.0 166 | pyttsx3==2.90 167 | pytz==2024.1 168 | PyYAML==6.0.1 169 | regex==2024.5.10 170 | requests==2.31.0 171 | requests-oauthlib==2.0.0 172 | rfc3986==1.5.0 173 | rich==13.7.1 174 | rsa==4.9 175 | ruamel.yaml==0.18.6 176 | ruamel.yaml.clib==0.2.8 177 | safetensors==0.4.3 178 | scikit-learn==1.4.2 179 | scipy==1.13.0 180 | semver==3.0.2 181 | sentencepiece==0.2.0 182 | shellingham==1.5.4 183 | six==1.16.0 184 | smart-open==6.4.0 185 | sniffio==1.3.1 186 | sortedcontainers==2.4.0 187 | soundfile==0.12.1 188 | soxr==0.3.7 189 | spacy==3.7.4 190 | spacy-legacy==3.0.12 191 | spacy-loggers==1.0.5 192 | speechbrain==1.0.0 193 | spleeter==2.4.0 194 | SQLAlchemy==2.0.30 195 | srsly==2.4.8 196 | srt==3.5.3 197 | static-ffmpeg @ git+https://github.com/FlorianEagox/static_ffmpeg@718839941289012e1e48baf0d9e2b737b9caf91d 198 | SudachiDict-core==20240409 199 | SudachiPy==0.6.8 200 | sympy==1.12 201 | tabulate==0.9.0 202 | tensorboard==2.9.0 203 | tensorboard-data-server==0.6.1 204 | tensorboard-plugin-wit==1.8.1 205 | tensorboardX==2.6.2.2 206 | tensorflow==2.9.0 207 | tensorflow-estimator==2.9.0 208 | tensorflow-io-gcs-filesystem==0.37.0 209 | termcolor==2.4.0 210 | thinc==8.2.3 211 | threadpoolctl==3.5.0 212 | tokenizers==0.19.1 213 | tomli==2.0.1 214 | torch==2.3.0 215 | torch-audiomentations==0.11.1 216 | torch-pitch-shift==1.2.4 217 | torchaudio==2.3.0 218 | torchmetrics==1.4.0 219 | tqdm==4.66.4 220 | transformers==4.40.2 221 | triton==2.3.0 222 | typeguard==4.2.1 223 | typer==0.3.2 224 | typing_extensions==4.11.0 225 | tzlocal==5.2 226 | umap-learn==0.5.6 227 | urllib3==2.2.1 228 | wasabi==1.1.2 229 | weasel==0.3.4 230 | websockets==12.0 231 | Werkzeug==3.0.3 232 | wrapt==1.16.0 233 | wxPython==4.2.1 234 | yarl==1.9.4 235 | yt-dlp==2024.4.9 236 | -------------------------------------------------------------------------------- /requirements-min.txt: -------------------------------------------------------------------------------- 1 | # Absolutely required 2 | ffmpeg-python 3 | git+https://github.com/FlorianEagox/static_ffmpeg # Script to install FFmpeg 4 | srt 5 | pydub 6 | # pyAudio # <--- Needed on Windows, breaks on Linux smh 7 | pyttsx3 # <-- System TTS engine 8 | -f https://extras.wxpython.org/wxPython4/extras/linux/gtk3/ubuntu-22.04 9 | wxpython 10 | audiotsm # <-- Audio timestretching 11 | yt-dlp # <-- 
Downloading YT vids 12 | espeakng 13 | pyinstaller 14 | -------------------------------------------------------------------------------- /requirements-win-310.txt: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlorianEagox/WeeaBlind/c282e341e820a6251950c1a6f2ce971eb5f49751/requirements-win-310.txt -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | # Absolutely required 2 | ffmpeg-python 3 | git+https://github.com/FlorianEagox/static_ffmpeg # Script to install FFmpeg 4 | srt 5 | pydub 6 | # pyAudio # <--- Needed on Windows, breaks on Linux smh 7 | pyttsx3 # <-- System TTS engine 8 | -f https://extras.wxpython.org/wxPython4/extras/linux/gtk3/ubuntu-22.04 9 | wxpython 10 | audiotsm # <-- Audio timestretching 11 | yt-dlp # <-- Downloading YT vids 12 | # Optional 13 | # librosa>=0.10.0 14 | espeakng 15 | spleeter # <-- Vocal / Background isolation 16 | coqui-tts # <-- Coqui TTS engine 17 | pyannote.audio 18 | # git+https://github.com/speechbrain/speechbrain.git # speechbrain # <-- Audio Language Identification 19 | git+https://github.com/casics/nostril.git # <--- GOBBLEDYGOOK OBLITERATOR 20 | -f https://github.com/simonflueckiger/tesserocr-windows_build/releases/download/tesserocr-v2.6.0-tesseract-5.3.1/tesserocr-2.6.0-cp310-cp310-win_amd64.whl 21 | protobuf==3.20 # Coqui breaks without this version specifically -------------------------------------------------------------------------------- /synth.py: -------------------------------------------------------------------------------- 1 | # Formerly the prototypical file, synth. Now it's just a graveyard of functions that may never return? 2 | from pydub import AudioSegment 3 | 4 | 5 | import concurrent.futures 6 | from utils import get_output_path 7 | 8 | 9 | # This function was intended to run with multiprocessing, but Coqui won't play nice with that. 10 | def dub_task(sub, i): 11 | print(f"{i}/{len(subs_adjusted)}") 12 | try: 13 | return dub_line_ram(sub) 14 | # empty_audio = empty_audio.overlay(line, sub.start*1000) 15 | except Exception as e: 16 | print(e) 17 | with open(f"output/errors/{i}-rip.txt", 'w') as f: 18 | f.write(e) 19 | # total_errors += 1 20 | 21 | # This may be used for multithreading? 
22 | def combine_segments(): 23 | empty_audio = AudioSegment.silent(total_duration * 1000, frame_rate=22050) 24 | total_errors = 0 25 | for sub in subs_adjusted: 26 | print(f"{sub.index}/{len(subs_adjusted)}") 27 | try: 28 | segment = AudioSegment.from_file(f'output/files/{sub.index}.wav') 29 | empty_audio = empty_audio.overlay(segment, sub.start*1000) 30 | except: 31 | total_errors += 1 32 | empty_audio.export('new.wav') 33 | print(total_errors) 34 | -------------------------------------------------------------------------------- /tabs/ConfigureVoiceTab.py: -------------------------------------------------------------------------------- 1 | import threading 2 | import app_state 3 | import wx 4 | from Voice import Voice 5 | import utils 6 | import feature_support 7 | 8 | class ConfigureVoiceTab(wx.Panel): 9 | def __init__(self, notebook, parent): 10 | super().__init__(notebook) 11 | self.parent = parent 12 | 13 | # Create a grid sizer with extra padding 14 | grid_sizer = wx.FlexGridSizer(cols=2, hgap=5, vgap=10) 15 | 16 | # Add controls with labels 17 | lbl_voice_name = wx.StaticText(self, label="Name") 18 | self.txt_voice_name = wx.TextCtrl(self, value=app_state.current_speaker.name) 19 | self.add_control_with_label(grid_sizer, lbl_voice_name, self.txt_voice_name) 20 | 21 | lbl_tts_engines = wx.StaticText(self, label="TTS Engine") 22 | self.available_engines = [engine for engine in Voice.VoiceType if engine.value[1]] 23 | 24 | self.cb_tts_engines = wx.Choice(self, choices=[engine.value[0] for engine in self.available_engines]) 25 | self.cb_tts_engines.Bind(wx.EVT_CHOICE, self.change_tts_engine) 26 | self.add_control_with_label(grid_sizer, lbl_tts_engines, self.cb_tts_engines) 27 | 28 | # This is for filtering coqui models by language 29 | self.lbl_coqui_lang = wx.StaticText(self, label="Language") 30 | self.cb_coqui_lang = wx.Choice(self, choices=[]) 31 | self.cb_coqui_lang.Bind(wx.EVT_CHOICE, self.change_model_language) 32 | self.lbl_coqui_lang.Hide() 33 | self.cb_coqui_lang.Hide() # Hide by default, show only when multi-speaker Coqui model is selected 34 | self.add_control_with_label(grid_sizer, self.lbl_coqui_lang, self.cb_coqui_lang) 35 | 36 | lbl_model_options = wx.StaticText(self, label="Model Options") 37 | self.cb_model_options = wx.Choice(self, choices=app_state.current_speaker.list_voice_options()) 38 | self.cb_model_options.Bind(wx.EVT_CHOICE, self.change_voice_params) 39 | self.add_control_with_label(grid_sizer, lbl_model_options, self.cb_model_options) 40 | 41 | self.btn_patch_onecore = wx.Button(self, label="Unlock OneCore Voices (Requires Admin)") 42 | self.btn_patch_onecore.Bind(wx.EVT_BUTTON, self.patch_onecore) 43 | self.btn_patch_onecore.Hide() 44 | grid_sizer.Add((0,0), 1, wx.ALL | wx.ALIGN_LEFT, 5) 45 | grid_sizer.Add(self.btn_patch_onecore, 1, wx.ALL | wx.ALIGN_RIGHT, 5) 46 | 47 | # This is for multispeaker coqui models. 
Should be hidden by default & shown when model is multispeaker 48 | self.lbl_speaker_voices = wx.StaticText(self, label="Speaker Voices") 49 | self.cb_speaker_voices = wx.Choice(self, choices=[]) 50 | self.cb_speaker_voices.Bind(wx.EVT_CHOICE, self.change_voice_params) 51 | self.lbl_speaker_voices.Hide() 52 | self.cb_speaker_voices.Hide() # Hide by default, show only when multi-speaker Coqui model is selected 53 | self.add_control_with_label(grid_sizer, self.lbl_speaker_voices, self.cb_speaker_voices) 54 | 55 | # If you're using XTTS or voice conversion, provide a wav file to sample 56 | self.chk_speaker_wav = wx.CheckBox(self, label="VC / Clone Sample") 57 | self.chk_speaker_wav.Bind(wx.EVT_CHECKBOX, self.change_voice_params) # Set the voice to use VC 58 | self.file_speaker_wav = wx.FilePickerCtrl(self, message="Select a voice sample to clone", wildcard="*.wav") 59 | self.file_speaker_wav.Bind(wx.EVT_FILEPICKER_CHANGED, self.change_voice_params) 60 | self.add_control_with_label(grid_sizer, self.chk_speaker_wav, self.file_speaker_wav) 61 | 62 | 63 | lbl_sample_text = wx.StaticText(self, label="Sample Text") 64 | self.txt_sample_text = wx.TextCtrl(self, value="I do be slurpin' that cheese without my momma's permission") 65 | self.add_control_with_label(grid_sizer, lbl_sample_text, self.txt_sample_text) 66 | 67 | self.btn_sample = wx.Button(self, label="▶️ Sample Voice") 68 | self.btn_sample.Bind(wx.EVT_BUTTON, self.sample) 69 | 70 | self.btn_update_voice = wx.Button(self, label="Update Voice") 71 | self.btn_update_voice.Bind(wx.EVT_BUTTON, self.update_voice) 72 | 73 | # Add the buttons to the grid without labels 74 | grid_sizer.AddStretchSpacer() 75 | grid_sizer.Add(self.btn_sample, 0, wx.ALL | wx.ALIGN_RIGHT, 5) 76 | grid_sizer.Add(self.btn_update_voice, 0, wx.ALL | wx.ALIGN_LEFT, 5) 77 | 78 | # Set the grid sizer as the main sizer for the panel with extra padding 79 | main_sizer = wx.BoxSizer(wx.VERTICAL) 80 | main_sizer.Add(grid_sizer, 0, wx.ALL | wx.EXPAND, 15) 81 | self.SetSizerAndFit(main_sizer) 82 | 83 | def add_control_with_label(self, sizer, label, control): 84 | sizer.Add(label, 0, wx.ALL|wx.ALIGN_LEFT, 5) 85 | sizer.Add(control, 0, wx.ALL|wx.EXPAND, 5) 86 | 87 | def sample(self, event): 88 | utils.sampleVoice(self.txt_sample_text.Value) 89 | 90 | # When the user clicks update voice, asign one in the array to the specification 91 | def update_voice(self, event): 92 | app_state.sample_speaker.name = self.txt_voice_name.Value 93 | app_state.speakers[app_state.speakers.index(app_state.current_speaker)] = app_state.sample_speaker 94 | app_state.current_speaker = app_state.sample_speaker 95 | self.parent.update_voices_list() 96 | 97 | # determines weather to show hidden models based on the state of the selected voice model/engine 98 | def show_hidden(self): 99 | if app_state.sample_speaker.voice_type == Voice.VoiceType.COQUI: 100 | self.lbl_coqui_lang.Show() 101 | self.cb_coqui_lang.Show() 102 | self.cb_coqui_lang.Set(list(app_state.sample_speaker.langs)) 103 | self.cb_coqui_lang.Select(app_state.sample_speaker.langs.index(app_state.sample_speaker.selected_lang)) 104 | self.chk_speaker_wav.Show() 105 | self.file_speaker_wav.Show() 106 | self.chk_speaker_wav.SetValue(app_state.sample_speaker.use_vc) 107 | self.file_speaker_wav.SetPath(app_state.sample_speaker.speaker_wav) 108 | self.change_model_language(None) 109 | if app_state.sample_speaker.is_multispeaker: 110 | self.lbl_speaker_voices.Show() 111 | self.cb_speaker_voices.Show() 112 | 
self.cb_speaker_voices.Set(app_state.sample_speaker.list_speakers()) 113 | if app_state.sample_speaker.speaker: 114 | self.cb_speaker_voices.SetStringSelection(app_state.sample_speaker.speaker) 115 | else: 116 | self.lbl_speaker_voices.Hide() 117 | self.cb_speaker_voices.Hide() 118 | elif app_state.sample_speaker.voice_type == Voice.VoiceType.SYSTEM and app_state.platform == 'win32': 119 | self.btn_patch_onecore.Show() 120 | else: 121 | self.lbl_coqui_lang.Hide() 122 | self.cb_coqui_lang.Hide() 123 | self.chk_speaker_wav.Hide() 124 | self.file_speaker_wav.Hide() 125 | self.lbl_speaker_voices.Hide() 126 | self.cb_speaker_voices.Hide() 127 | self.btn_patch_onecore.Hide() 128 | self.Layout() 129 | 130 | # Populate the form with the current sample speaker's params 131 | def update_voice_fields(self, event): 132 | self.txt_voice_name.Value = app_state.sample_speaker.name 133 | self.cb_tts_engines.Select(self.available_engines.index(app_state.sample_speaker.voice_type)) 134 | 135 | self.cb_model_options.Set(app_state.sample_speaker.list_voice_options()) 136 | self.show_hidden() 137 | try: 138 | self.cb_model_options.Select(self.cb_model_options.GetStrings().index(app_state.sample_speaker.voice_option)) 139 | except: 140 | self.cb_model_options.Select(0) 141 | 142 | def change_tts_engine(self, event): 143 | app_state.sample_speaker = Voice(self.available_engines[self.cb_tts_engines.GetSelection()]) 144 | self.update_voice_fields(event) 145 | 146 | # Update the sample speaker to the specification 147 | def change_voice_params(self, event): 148 | self.SetCursor(wx.Cursor(wx.CURSOR_WAIT)) 149 | self.Layout() 150 | option_name = self.cb_model_options.GetStringSelection() 151 | 152 | def run_after(): 153 | app_state.sample_speaker.set_voice_params(voice=option_name) 154 | if app_state.sample_speaker.voice_type == Voice.VoiceType.COQUI: 155 | app_state.sample_speaker.set_voice_params(speaker_wav=self.file_speaker_wav.GetPath()) 156 | if app_state.sample_speaker.is_multispeaker: 157 | app_state.sample_speaker.set_voice_params(speaker=self.cb_speaker_voices.GetStringSelection()) 158 | app_state.sample_speaker.set_voice_params(use_vc=self.chk_speaker_wav.IsChecked()) 159 | try: 160 | dialog_download.Destroy() 161 | except: 162 | pass 163 | self.update_voice_fields(event) 164 | self.SetCursor(wx.Cursor(wx.CURSOR_DEFAULT)) 165 | 166 | if app_state.sample_speaker.voice_type == Voice.VoiceType.COQUI: 167 | if not app_state.sample_speaker.is_model_downloaded(option_name): 168 | message_download = wx.MessageDialog( 169 | None, 170 | f"You do not have\n{option_name}\n downloaded. Would you like to download it? 
It could take a long time and lots of storage", 171 | "Download this model?", 172 | wx.OK | wx.CANCEL 173 | ).ShowModal() 174 | if(message_download != wx.ID_OK): 175 | return 176 | dialog_download = wx.ProgressDialog("Downloading Model", "starting", 100, self) 177 | 178 | def download_progress(progress, status=None): 179 | if progress == -1: 180 | wx.CallAfter(run_after) 181 | return 182 | wx.CallAfter(dialog_download.Update, progress, f"{progress}% - {status} \n {option_name}") 183 | threading.Thread(target=app_state.sample_speaker.set_voice_params, kwargs={"voice": option_name, "progress": download_progress}).start() 184 | wx.CallAfter(run_after) 185 | 186 | 187 | def change_model_language(self, event): 188 | if self.cb_coqui_lang.GetSelection() == 0: # If they have "All Voices" selected, don't filter 189 | self.cb_model_options.Set(app_state.sample_speaker.list_voice_options()) 190 | else: 191 | self.cb_model_options.Set([model for model in app_state.sample_speaker.list_voice_options() if f"/{self.cb_coqui_lang.GetStringSelection()}/" in model]) 192 | app_state.sample_speaker.selected_lang = self.cb_coqui_lang.GetStringSelection() 193 | 194 | def patch_onecore(self, event): 195 | msg_prompt_patch = wx.MessageDialog(self, 196 | """By default, PyTTSx3 only supports 2 system voices, however, you can add more in 197 | Settings > Time & Language > Speech 198 | To use these voices, you must patch the registry to make them accessible. This requires admin rights and will not impact any other aspect of your system. 199 | Would you like to patch the Windows registry to add these voices?""", 200 | "Add OneCore Voices?", 201 | style=wx.YES_NO) 202 | if msg_prompt_patch.ShowModal() == wx.ID_YES: 203 | feature_support.patch_onecore_voices() 204 | -------------------------------------------------------------------------------- /tabs/GreeterView.py: -------------------------------------------------------------------------------- 1 | import wx 2 | import feature_support 3 | from utils import is_deployed 4 | import sys 5 | import os 6 | 7 | class GreeterView(wx.Panel): 8 | def __init__(self, parent, context): 9 | super().__init__(parent) 10 | 11 | self.scroll_panel = wx.ScrolledWindow(self, style=wx.VSCROLL) 12 | vbox = wx.BoxSizer(wx.VERTICAL) 13 | self.scroll_panel.SetSizer(vbox) 14 | self.scroll_panel.SetScrollRate(0, 20) 15 | 16 | txt_title = wx.StaticText(self.scroll_panel, label="Welcome to WeeaBlind") 17 | title_font = wx.Font(18, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_BOLD) 18 | txt_title.SetFont(title_font) 19 | vbox.Add(txt_title, 0, wx.ALIGN_LEFT | wx.TOP | wx.LEFT, 20) 20 | 21 | img = wx.Image("logo.png" if not is_deployed else os.path.join("_internal", "logo.png"), wx.BITMAP_TYPE_ANY) 22 | img.Rescale(200, 200) 23 | bmp = wx.StaticBitmap(self.scroll_panel, wx.ID_ANY, wx.Bitmap(img)) 24 | vbox.Add(bmp, 0, wx.ALIGN_LEFT | wx.BOTTOM | wx.LEFT, 20) 25 | 26 | intro_text = "Welcome to WeeaBlind, your companion for dubbing multi-lingual media using modern AI technologies. This tool bridges the gap for individuals with visual impairments, dyslexia, or anyone who prefers listening over reading subtitles. Dive into your favorite shows and videos and experiment with innovative speech synthesis, diarization, language identification, and voice cloning technologies. The program is still very much under construction, but is also quite useful in its current state!" 
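		# Note for packaged builds: when is_deployed is True, bundled assets such as logo.png above are loaded from the "_internal" folder, which is where PyInstaller 6+ onedir builds (see weeablind.spec) place data files next to the executable.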
27 | txt_introduction = wx.StaticText(self.scroll_panel, label=intro_text) 28 | txt_introduction.Wrap(400) # Wrap text to a maximum width of 400 pixels 29 | vbox.Add(txt_introduction, 0, wx.ALIGN_LEFT | wx.TOP | wx.LEFT, 20) 30 | 31 | mode_state = "RELEASE" if is_deployed else "DEVELOPMENT" 32 | mode_prompt = "You can install features from the requirements.txt with pip and they will become unlocked" if not is_deployed else "If you'd like access to the full suite of AI-powered features the program can perform, you'll need to install Python and set up WeeaBlind in a virtual environment as described in the README. These are not easily distributable in a binary release format." 33 | txt_mode = wx.StaticText(self.scroll_panel, label=f"You are running in {mode_state} mode.\n{mode_prompt}") 34 | vbox.Add(txt_mode, 0, wx.ALIGN_LEFT | wx.TOP | wx.LEFT, 20) 35 | 36 | txt_usage_header = wx.StaticText(self.scroll_panel, label="How to Use Weeablind:") 37 | usage_section_font = wx.Font(14, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_BOLD) 38 | txt_usage_header.SetFont(usage_section_font) 39 | vbox.Add(txt_usage_header, 0, wx.ALIGN_LEFT | wx.TOP | wx.LEFT, 20) 40 | 41 | 42 | 43 | usage_guide = """ 44 | Start by importing a video file or pasting a YouTube link and pressing enter 45 | If you want subtitles autogenerated by YouTube, specify the language you want, or specify "all" for all subtitle options. 46 | Make sure the correct audio and subtitle track are selected in the List Streams Tab 47 | Next, configure the TTS voice you'd like to dub the video with. You can install more TTS voices in Windows settings and enable them with the "Unlock OneCore" button 48 | Once everything is configured, you can preview how it will sound in the Subtitles Tab 49 | If the video or subtitles are too quiet, you can change and preview the audio mixing in List Streams 50 | If the subtitles for your video were autogenerated by YouTube, the timings may not line up perfectly, so unchecking Match Rate can fix timestretch issues 51 | Finally, click "Run Dubbing" to create your video 52 | All files generated by the program appear in the "output" folder 53 | 54 | --For Advanced Use-- 55 | 56 | If the video contains multiple spoken languages, use the language identification and filter features 57 | If you want to remove spoken vocals in the video's source language, use the "remove vocals" button in List Streams 58 | In the Subtitles Tab, you can select spoken lines for cloning and export them to a wav file 59 | If you have Coqui TTS installed, you can use these clones with XTTS or a Voice Conversion model 60 | For videos with only burned in subs, you can attempt Video OCR in the List Streams tab. 
61 | """ 62 | txt_usage = wx.StaticText(self.scroll_panel, label=usage_guide) 63 | txt_usage.Wrap(400) 64 | vbox.Add(txt_usage, 0, wx.ALIGN_LEFT | wx.TOP | wx.LEFT, 20) 65 | 66 | 67 | txt_features_header = wx.StaticText(self.scroll_panel, label="Currently Supported Features:") 68 | feature_section_font = wx.Font(14, wx.FONTFAMILY_DEFAULT, wx.FONTSTYLE_NORMAL, wx.FONTWEIGHT_BOLD) 69 | txt_features_header.SetFont(feature_section_font) 70 | vbox.Add(txt_features_header, 0, wx.ALIGN_LEFT | wx.TOP | wx.LEFT, 20) 71 | 72 | features = { 73 | "FFmpeg": feature_support.ffmpeg_supported, 74 | "Diarization": feature_support.diarization_supported, 75 | "OCR": feature_support.ocr_supported, 76 | "Language Detection": feature_support.language_detection_supported, 77 | "Vocal Isolation": feature_support.vocal_isolation_supported, 78 | "YouTube Downloads": feature_support.downloads_supported, 79 | "Espeak": feature_support.espeak_supported, 80 | "Coqui TTS": feature_support.coqui_supported, 81 | "PyTorch": feature_support.torch_supported, 82 | "GPU Support": feature_support.gpu_supported, 83 | } 84 | 85 | for feature in features: 86 | txt_feature = wx.StaticText(self.scroll_panel, label=f"""{feature}: {"Not" if not features[feature] else ""} Supported""") 87 | txt_feature.SetForegroundColour((0, 255, 0) if features[feature] else (255, 0, 0)) 88 | vbox.Add(txt_feature, 0, wx.ALIGN_LEFT | wx.TOP | wx.LEFT, 20) 89 | 90 | main_sizer = wx.BoxSizer(wx.VERTICAL) 91 | main_sizer.Add(self.scroll_panel, 1, wx.CENTER | wx.EXPAND | wx.ALL, border=10) 92 | self.SetSizerAndFit(main_sizer) 93 | 94 | -------------------------------------------------------------------------------- /tabs/ListStreams.py: -------------------------------------------------------------------------------- 1 | import wx 2 | import app_state 3 | import vocal_isolation 4 | import dub_line 5 | import re 6 | import feature_support 7 | from pydub import AudioSegment 8 | from pydub.playback import play 9 | import utils 10 | 11 | if feature_support.ocr_supported: 12 | import video_ocr 13 | if feature_support.nostril_supported: 14 | from nostril import nonsense 15 | 16 | class ListStreamsTab(wx.Panel): 17 | def __init__(self, parent, context): 18 | super().__init__(parent) 19 | 20 | self.context = context 21 | 22 | self.scroll_panel = wx.ScrolledWindow(self, style=wx.VSCROLL) 23 | self.scroll_sizer = wx.BoxSizer(wx.VERTICAL) 24 | self.scroll_panel.SetSizer(self.scroll_sizer) 25 | self.scroll_panel.SetScrollRate(0, 20) 26 | 27 | self.rb_audio = wx.RadioBox(self.scroll_panel, majorDimension=1) 28 | self.rb_subs = wx.RadioBox(self.scroll_panel, majorDimension=1) 29 | 30 | btn_remove_vocals = wx.Button(self, label="Remove vocals") 31 | btn_remove_vocals.Bind(wx.EVT_BUTTON, self.remove_vocals) 32 | if not feature_support.vocal_isolation_supported: btn_remove_vocals.Disable() 33 | 34 | btn_ocr = wx.Button(self, label="Extract subs with OCR") 35 | btn_ocr.Bind(wx.EVT_BUTTON, self.run_ocr) 36 | if not feature_support.ocr_supported: btn_ocr.Disable() 37 | lbl_import_external = wx.StaticText(self.scroll_panel, label="Import external Subtitles file") 38 | self.file_import_external = wx.FilePickerCtrl(self.scroll_panel, message="Import External subtitles file", wildcard="Subtitle Files |*.srt;*.vtt;*.ass") 39 | self.file_import_external.Bind(wx.EVT_FILEPICKER_CHANGED, self.import_subs) 40 | 41 | box_mixing = wx.StaticBox(self, label="Audio Mixing Settings") 42 | box_mixing_sizer = wx.StaticBoxSizer(box_mixing) 43 | box_mixing_grid = wx.GridSizer(2) 44 | 45 | 
lbl_mixing_ratio = wx.StaticText(self, label="Volume Mixing Ratio") 46 | self.slider_audio_ratio = wx.Slider(self, value=50) 47 | self.slider_audio_ratio.Bind(wx.EVT_SLIDER, self.change_mix) 48 | btn_sample_mix = wx.Button(self, label="Preview Mix") 49 | btn_sample_mix.Bind(wx.EVT_BUTTON, self.sample_mix) 50 | 51 | btn_remix_audio = wx.Button(self, label="Remix Video") 52 | btn_remix_audio.Bind(wx.EVT_BUTTON, self.remix_audio) 53 | 54 | # Create a sizer for layout 55 | sizer = wx.BoxSizer(wx.VERTICAL) 56 | sizer.Add(btn_remove_vocals, 0, wx.ALL | wx.CENTER, 5) 57 | sizer.Add(btn_ocr, 0, wx.ALL | wx.CENTER, 5) 58 | self.scroll_sizer.Add(wx.StaticText(self.scroll_panel, label="Select an Audio Stream:"), 0, wx.ALL, 5) 59 | self.scroll_sizer.Add(self.rb_audio, 0, wx.ALL | wx.EXPAND, 5) 60 | self.scroll_sizer.Add(wx.StaticText(self.scroll_panel, label="Select a Subtitle Stream:"), 0, wx.ALL, 5) 61 | self.scroll_sizer.Add(self.rb_subs, 0, wx.ALL | wx.EXPAND, 5) 62 | self.scroll_sizer.Add(lbl_import_external, 0, wx.ALL | wx.CENTER, 5) 63 | self.scroll_sizer.Add(self.file_import_external, 0, wx.ALL | wx.CENTER, 5) 64 | sizer.Add(self.scroll_panel, 1, wx.SHRINK | wx.ALL | wx.EXPAND | wx.RIGHT, border=10) 65 | 66 | box_mixing_grid.Add(lbl_mixing_ratio) 67 | box_mixing_grid.Add(self.slider_audio_ratio) 68 | box_mixing_grid.Add(btn_sample_mix) 69 | box_mixing_grid.Add(btn_remix_audio) 70 | box_mixing_sizer.Add(box_mixing_grid, 1) 71 | sizer.Add(box_mixing_sizer, 0, wx.ALIGN_CENTER | wx.SHRINK | wx.ALL, 5) 72 | 73 | self.SetSizer(sizer) 74 | 75 | 76 | 77 | def populate_streams(self, streams): 78 | # This code is some of the worst code, i hate it so much, but WX DOESN'T LET ME RESET THE CHOICES LIKE WITH **EVERY** OTHER LIST COMPONENT 79 | _rb_audio = self.rb_audio 80 | self.rb_audio = wx.RadioBox(self.scroll_panel, 81 | choices=[f"Stream #{stream['index']} ({stream.get('tags', {'language': 'unknown'}).get('language', 'unknown')})" for stream in streams["audio"]], 82 | style=wx.RA_VERTICAL 83 | ) 84 | self.rb_audio.Bind(wx.EVT_RADIOBOX, lambda a: self.on_audio_selection(None)) 85 | self.scroll_sizer.Replace(_rb_audio, self.rb_audio) 86 | _rb_audio.Destroy() 87 | 88 | if not streams["subs"]: 89 | self.SetSizerAndFit(self.GetSizer()) 90 | self.Layout() 91 | return 92 | 93 | _rb_subs_copy = self.rb_subs 94 | self.rb_subs = wx.RadioBox(self.scroll_panel, 95 | choices=[f"Stream #{stream['stream']} ({stream['name']})" for stream in streams["subs"]], 96 | style=wx.RA_VERTICAL 97 | ) 98 | self.rb_subs.Bind(wx.EVT_RADIOBOX, lambda a: self.on_subtitle_selection(None, streams)) 99 | self.scroll_sizer.Replace(_rb_subs_copy, self.rb_subs) 100 | _rb_subs_copy.Destroy() 101 | self.scroll_panel.SetSizerAndFit(self.scroll_sizer) 102 | self.Fit() 103 | self.Layout() 104 | 105 | def on_audio_selection(self, event): 106 | app_state.video.change_audio(self.rb_audio.GetSelection()) 107 | 108 | def on_subtitle_selection(self, event, streams): 109 | # app_state.video.change_subs(stream_index=streams['subs'][self.rb_audio.GetSelection()]) 110 | app_state.video.change_subs(stream_index=self.rb_subs.GetSelection()) 111 | self.context.tab_subtitles.create_entries() 112 | 113 | def run_ocr(self, event): 114 | frames = video_ocr.perform_video_ocr(app_state.video.file, sample_rate=1) 115 | ocr_subs = [] 116 | for index, frame in enumerate(frames): 117 | try: 118 | if sum(not char.isspace() for char in frame.text) > 6 and not nonsense(frame.text): 119 | ocr_subs.append(dub_line.DubbedLine(frame.ts_second, -1, frame.text, index)) 120 | 
except Exception as e: 121 | print(e) 122 | app_state.video.subs_adjusted = ocr_subs 123 | self.context.tab_subtitles.create_entries() 124 | 125 | def remove_vocals(self, event): 126 | utils.attempt_long_running_task(lambda: vocal_isolation.seperate_file(app_state.video), self, "Spleeter Seperating Vocals", "Attempting to Seperate Vocals with Spleeter") 127 | 128 | def import_subs(self, event): 129 | app_state.video.change_subs(external_path=self.file_import_external.GetPath()) 130 | self.context.tab_subtitles.create_entries() 131 | 132 | def change_mix(self, event): 133 | app_state.video.mixing_ratio = self.slider_audio_ratio.GetValue() / 100 134 | 135 | def sample_mix(self, event): 136 | play(app_state.video.sample_mixing()) 137 | 138 | def remix_audio(self, event): 139 | app_state.video.mix_av(app_state.video.mixing_ratio) -------------------------------------------------------------------------------- /tabs/SubtitlesTab.py: -------------------------------------------------------------------------------- 1 | import utils 2 | import app_state 3 | from pydub.playback import play 4 | import wx 5 | import threading 6 | import diarize 7 | import feature_support 8 | 9 | class SubtitleEntry(wx.Panel): 10 | def __init__(self, parent, context, sub): 11 | super().__init__(parent) 12 | self.text = sub.text 13 | self.sub = sub 14 | self.start_time = sub.start 15 | self.end_time = sub.end 16 | self.speaker = sub.voice 17 | self.duration = self.end_time - self.start_time 18 | self.context = context 19 | 20 | entry_box = wx.StaticBox(self, label=f"{utils.seconds_to_timecode(self.start_time)} - {utils.seconds_to_timecode(self.end_time)}") 21 | entry_sizer = wx.StaticBoxSizer(entry_box, wx.VERTICAL) 22 | 23 | lbl_text = wx.StaticText(self, label=f"Speaker: {self.speaker}\nText: {self.text}") 24 | entry_sizer.Add(lbl_text, 0, wx.EXPAND | wx.ALL, border=5) 25 | 26 | lbl_language = wx.StaticText(self, label=f"Language: {sub.language}") 27 | entry_sizer.Add(lbl_language, 0, border=2) 28 | 29 | btn_playback = wx.Button(self, label="Play") 30 | btn_playback.Bind(wx.EVT_BUTTON, self.on_playback_button_click) 31 | entry_sizer.Add(btn_playback, 0, wx.ALIGN_LEFT | wx.ALL, border=5) 32 | 33 | btn_sample = wx.Button(self, label="Sample") 34 | btn_sample.Bind(wx.EVT_BUTTON, self.on_sample_button_click) 35 | entry_sizer.Add(btn_sample, 0, wx.ALIGN_LEFT | wx.ALL, border=5) 36 | 37 | self.chk_mark_export = wx.CheckBox(self, label="Select Subtitle") 38 | entry_sizer.Add(self.chk_mark_export, 0, wx.ALIGN_LEFT) 39 | 40 | self.SetSizerAndFit(entry_sizer) 41 | 42 | def on_playback_button_click(self, event): 43 | play(app_state.video.get_snippet(self.start_time, self.end_time)) 44 | pass 45 | 46 | def on_sample_button_click(self, event): 47 | play(self.sub.dub_line_file(match_rate=self.context.chk_match_rate.GetValue())[0]) 48 | 49 | class SubtitlesTab(wx.Panel): 50 | def __init__(self, notebook, context): 51 | super().__init__(notebook) 52 | self.context = context 53 | self.subs_displayed = [] 54 | tb_controls = wx.ToolBar(self) 55 | 56 | lbl_lang_prompt = wx.StaticText(tb_controls, label="Remove all subs of this language from dubbing") 57 | btn_lang_detect = wx.Button(tb_controls, label="Run Language Detection") 58 | btn_lang_detect.Bind(wx.EVT_BUTTON, self.detect_langs) 59 | btn_language_filter = wx.Button(tb_controls, label="Filter Language") 60 | btn_language_filter.Bind(wx.EVT_BUTTON, self.remove_langs) 61 | 62 | btn_assign_to_voice = wx.Button(tb_controls, label="Assign Selected Voice") 63 | 
btn_assign_to_voice.Bind(wx.EVT_BUTTON, self.assign_voice) 64 | 65 | btn_export_clone = wx.Button(tb_controls, label="Export Clone") 66 | btn_export_clone.Bind(wx.EVT_BUTTON, self.export_clone) 67 | 68 | btn_diarize = wx.Button(tb_controls, label="Run Diarization") 69 | btn_diarize.Bind(wx.EVT_BUTTON, self.run_diarization) 70 | if not feature_support.diarization_supported: btn_diarize.Disable() 71 | self.lb_detected_langs = wx.CheckListBox(tb_controls, choices=["en", "es", "jp"]) 72 | if not feature_support.language_detection_supported: 73 | btn_lang_detect.Disable() 74 | self.lb_detected_langs.Disable() 75 | btn_language_filter.Disable() 76 | 77 | self.scroll_panel = wx.ScrolledWindow(self, style=wx.VSCROLL) 78 | self.scroll_sizer = wx.BoxSizer(wx.VERTICAL) 79 | self.scroll_panel.SetSizer(self.scroll_sizer) 80 | self.scroll_panel.SetScrollRate(0, 20) 81 | tb_controls.AddControl(btn_lang_detect) 82 | tb_controls.AddControl(lbl_lang_prompt) 83 | tb_controls.AddControl(self.lb_detected_langs) 84 | tb_controls.AddControl(btn_language_filter) 85 | tb_controls.AddControl(btn_diarize) 86 | tb_controls.AddControl(btn_assign_to_voice) 87 | tb_controls.AddControl(btn_export_clone) 88 | tb_controls.Realize() 89 | 90 | self.lbl_subs_placecholder = wx.StaticText(self.scroll_panel, label="No Subtitles Loaded") 91 | self.scroll_sizer.Add(self.lbl_subs_placecholder, 0, wx.CENTER) 92 | 93 | main_sizer = wx.BoxSizer(wx.VERTICAL) 94 | main_sizer.Add(tb_controls, 0, wx.CENTER) 95 | main_sizer.Add(self.scroll_panel, 1, wx.EXPAND | wx.ALL, border=10) 96 | 97 | self.SetSizerAndFit(main_sizer) 98 | 99 | def run_diarization(self, event): 100 | diarize.run_diarization(app_state.video) 101 | self.create_entries() 102 | self.context.update_voices_list() 103 | 104 | def detect_langs(self, event): 105 | dialog = wx.ProgressDialog("Filtering Subtitles", "starting", len(app_state.video.subs_adjusted), self) 106 | def update_progress(progress, status): 107 | def run_after(): 108 | self.update_langs() 109 | self.create_entries() 110 | dialog.Destroy() 111 | if progress == -1: 112 | return wx.CallAfter(run_after) 113 | else: 114 | wx.CallAfter(dialog.Update, progress, status) 115 | threading.Thread(target=app_state.video.detect_subs_lang, args=(update_progress, )).start() 116 | 117 | def filter_language(self, event): 118 | exclusions = self.lb_detected_langs.CheckedStrings 119 | print("GWEEP", exclusions) 120 | app_state.video.filter_multilingual_subtiles(exclusions) 121 | self.update_langs() 122 | self.create_entries() 123 | 124 | 125 | def create_entries(self): 126 | self.scroll_sizer.Clear(delete_windows=True) 127 | for sub in app_state.video.subs_adjusted: # self.subs_displayed: 128 | diarization_entry = SubtitleEntry( 129 | self.scroll_panel, 130 | context=self.context, 131 | sub=sub 132 | ) 133 | diarization_entry.SetRefData 134 | self.scroll_sizer.Add(diarization_entry, 0, wx.EXPAND | wx.ALL, border=5) 135 | 136 | self.Layout() 137 | 138 | def update_langs(self): 139 | self.lb_detected_langs.Clear() 140 | self.lb_detected_langs.AppendItems(sorted(list(set(sub.language for sub in app_state.video.subs_adjusted)))) 141 | self.Layout() 142 | 143 | def remove_langs(self, event): 144 | # maybe move this into the video class? 
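		# Keep only the subs whose language is NOT checked in the exclusion list box, then rebuild the language choices and the displayed subtitle entries.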
145 | app_state.video.subs_adjusted = [sub for sub in app_state.video.subs_adjusted if not sub.language in self.lb_detected_langs.GetCheckedStrings()] 146 | self.update_langs() 147 | self.create_entries() 148 | 149 | def assign_voice(self, event): 150 | [child.GetWindow().sub.update_voice(self.context.lb_voices.GetSelection()) for child in self.scroll_sizer.GetChildren() if child.GetWindow().chk_mark_export.IsChecked()], 151 | self.create_entries() 152 | 153 | def export_clone(self, event): 154 | dlg_save = wx.FileDialog(self, "Save a new clone sample", "./output", "voice_sample.wav", "*.wav", wx.FD_SAVE) 155 | if dlg_save.ShowModal() == wx.ID_OK: 156 | app_state.video.export_clone( 157 | [child.GetWindow().sub for child in self.scroll_sizer.GetChildren() if child.GetWindow().chk_mark_export.IsChecked()], 158 | dlg_save.GetPath() 159 | ) 160 | dlg_save.Destroy() 161 | -------------------------------------------------------------------------------- /tabs/__init__.py: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlorianEagox/WeeaBlind/c282e341e820a6251950c1a6f2ce971eb5f49751/tabs/__init__.py -------------------------------------------------------------------------------- /test.py: -------------------------------------------------------------------------------- 1 | import re 2 | 3 | test = """ 4 | f u c k i e s u w u 5 | 6 | """ 7 | 8 | pattern = r"\S{7,}" 9 | 10 | print(bool(re.search(pattern, test))) -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os.path 2 | import app_state 3 | import numpy as np 4 | from pydub.playback import play 5 | from pydub import AudioSegment 6 | import sys 7 | 8 | APP_NAME = "WeeaBlind" 9 | test_video_name = "./output/download.webm" 10 | 11 | test_start_time = 94 12 | test_end_time = 1324 13 | 14 | is_deployed = getattr(sys, 'frozen', False) 15 | 16 | root = __file__ 17 | if is_deployed: 18 | application_path = os.path.dirname(sys.executable) 19 | os.chdir(application_path) 20 | root = sys.executable 21 | 22 | def create_output_dir(): 23 | path = './output/files' 24 | if not os.path.exists(path): 25 | os.makedirs(path) 26 | 27 | def get_output_path(input, suffix='', prefix='', path=''): 28 | filename = os.path.basename(input) 29 | filename_without_extension = os.path.splitext(filename)[0] 30 | return os.path.join(os.path.dirname(os.path.abspath(root)), 'output', path, f"{prefix}{filename_without_extension}{suffix}") 31 | 32 | default_sample_path = get_output_path("sample", ".wav") 33 | print(default_sample_path) 34 | 35 | def timecode_to_seconds(timecode): 36 | parts = list(map(float, timecode.split(':'))) 37 | seconds = parts[-1] 38 | if len(parts) > 1: 39 | seconds += parts[-2] * 60 40 | if len(parts) > 2: 41 | seconds += parts[-3] * 3600 42 | return seconds 43 | 44 | def seconds_to_timecode(seconds): 45 | hours = int(seconds // 3600) 46 | minutes = int((seconds % 3600) // 60) 47 | seconds = seconds % 60 48 | timecode = "" 49 | if hours: 50 | timecode += f"{hours}:" 51 | # if minutes: 52 | timecode += str(minutes).zfill(2) + ':' 53 | timecode = f"{timecode}{seconds:05.2f}" 54 | return timecode 55 | 56 | # Finds the closest element in an arry to the given value 57 | def find_nearest(array, value): 58 | return (np.abs(np.asarray(array) - value)).argmin() 59 | 60 | def sampleVoice(text, output=default_sample_path): 61 | 
play(AudioSegment.from_file(app_state.sample_speaker.speak(text, output))) 62 | 63 | def attempt_long_running_task(function, parent, prompt, description): 64 | import wx 65 | msg_loading = wx.ProgressDialog(description, prompt, parent=parent, style=wx.PD_AUTO_HIDE | wx.PD_SMOOTH) 66 | msg_loading.Update(1) 67 | parent.Update() 68 | try: 69 | function() 70 | except Exception as e: 71 | print(e) 72 | wx.MessageBox(f"Something Went wrong Performing {prompt} \n\n{e}", "Something went wrong", wx.ICON_ERROR, parent) 73 | msg_loading.Destroy() 74 | 75 | snippet_export_path = get_output_path("video_snippet", ".wav") -------------------------------------------------------------------------------- /video.py: -------------------------------------------------------------------------------- 1 | """ 2 | The Video class represents a reference to a video from either a file or web link. This class should implement the ncessary info to dub a video. 3 | """ 4 | import time 5 | import ffmpeg 6 | from yt_dlp import YoutubeDL 7 | import utils 8 | from pydub import AudioSegment 9 | from dub_line import load_subs 10 | import random 11 | from dub_line import DubbedLine 12 | 13 | class Video: 14 | def __init__(self, video_URL, loading_progress_hook=print, lang=None): 15 | self.start_time = self.end_time = 0 16 | self.downloaded = False 17 | self.subs = self.subs_adjusted = self.subs_removed = [] 18 | self.background_track = self.vocal_track = None 19 | self.speech_diary = self.speech_diary_adjusted = None 20 | self.load_video(video_URL, loading_progress_hook, lang) 21 | self.mixing_ratio = 1 22 | 23 | 24 | # This is responsible for loading the app's audio and subtitles from a video file or YT link 25 | def load_video(self, video_path, progress_hook=print, lang=None): 26 | sub_path = "" 27 | if video_path.startswith("http"): 28 | self.downloaded = True 29 | try: 30 | video_path, sub_path, self.yt_sub_streams = self.download_video(video_path, progress_hook, lang=lang) 31 | except: return 32 | progress_hook({"status":"complete"}) 33 | else: 34 | self.downloaded = False 35 | self.file = video_path 36 | if not (self.downloaded and not sub_path): 37 | try: 38 | self.subs = self.subs_adjusted = load_subs(utils.get_output_path(self.file, '.srt'), sub_path or video_path) 39 | except: 40 | progress_hook({"status": "subless"}) 41 | self.audio = AudioSegment.from_file(video_path) 42 | self.duration = float(ffmpeg.probe(video_path)["format"]["duration"]) 43 | if self.subs: 44 | self.update_time(0, self.duration) 45 | 46 | def download_video(self, link, progress_hook=print, lang=None): 47 | options = { 48 | 'outtmpl': 'output/%(id)s.%(ext)s', 49 | 'writesubtitles': True, 50 | 'writeautomaticsub': True, 51 | "progress_hooks": (progress_hook,), 52 | "listsubs": True 53 | } 54 | if lang: 55 | # options["writeautomaticsub"] = False 56 | options["subtitleslangs"] = (".*" + lang + ".*").split(',') 57 | else: 58 | options["subtitleslangs"] = ["all"] 59 | try: 60 | with YoutubeDL(options) as ydl: 61 | 62 | info = ydl.extract_info(link) 63 | output = ydl.prepare_filename(info) 64 | subs = info["subtitles"] | info["automatic_captions"] 65 | print("SUBS:", subs) 66 | subs = {k:v for k, v in subs.items() if v[-1].get("filepath", None)} 67 | print("Detected Subtitles\n", subs) 68 | return output, list(subs.values())[0][-1]["filepath"] if subs else None, subs 69 | except Exception as e: 70 | progress_hook({"status": "error", "error": e}) 71 | raise e 72 | 73 | 74 | def update_time(self, start, end): 75 | self.start_time = start 76 | 
self.end_time = end 77 | # clamp the subs to the crop time specified 78 | start_line = utils.find_nearest([sub.start for sub in self.subs], start) 79 | end_line = utils.find_nearest([sub.start for sub in self.subs], end) 80 | self.subs_adjusted = self.subs[start_line:end_line] 81 | if self.speech_diary: 82 | self.update_diary_timing() 83 | 84 | def list_streams(self): 85 | probe = ffmpeg.probe(self.file)["streams"] 86 | if self.downloaded: 87 | subs = [{"name": stream[-1]['name'], "stream": stream[-1]['filepath']} for stream in self.yt_sub_streams.values()] 88 | else: 89 | subs = [{"name": stream['tags'].get('language', 'unknown'), "stream": stream['index']} for stream in probe if stream["codec_type"] == "subtitle"] 90 | return { 91 | "audio": [stream for stream in probe if stream["codec_type"] == "audio"], 92 | "subs": subs 93 | } 94 | 95 | def get_snippet(self, start, end): 96 | return self.audio[start*1000:end*1000] 97 | 98 | # Crops the video's audio segment to reduce memory size 99 | def crop_audio(self, isolated_vocals): 100 | # ffmpeg -i .\saiki.mkv -vn -ss 84 -to 1325 crop.wav 101 | source_file = self.vocal_track if isolated_vocals and self.vocal_track else self.file 102 | output = utils.get_output_path(source_file, "-crop.wav") 103 | ( 104 | ffmpeg 105 | .input(source_file, ss=self.start_time, to=self.end_time) 106 | .output(output) 107 | .global_args('-loglevel', 'error') 108 | .global_args('-vn') 109 | .run(overwrite_output=True) 110 | ) 111 | return output 112 | 113 | def detect_subs_lang(self, progress_hook=print): 114 | snippet_path = "video_snippet.wav" # utils.get_output_path('video_snippet', '.wav') 115 | for i, sub in enumerate(self.subs_adjusted): 116 | self.get_snippet(sub.start, sub.end).export(snippet_path, format="wav") 117 | sub.get_language(snippet_path) 118 | progress_hook(i, f"{i}/{len(self.subs_adjusted)}: {sub.text}") 119 | progress_hook(-1, "done") 120 | 121 | def filter_multilingual_subtiles(self, exclusion=["English"]): 122 | multi_lingual_subs = [] 123 | removed_subs = [] 124 | for i, sub in enumerate(self.subs_adjusted): 125 | if sub.language not in exclusion: 126 | multi_lingual_subs.append(sub) 127 | else: 128 | removed_subs.append(sub) 129 | self.subs_adjusted = multi_lingual_subs 130 | self.subs_removed = removed_subs 131 | 132 | # This function is used to only get the snippets of the audio that appear in subs_adjusted after language filtration or cropping, regardless of the vocal splitting. 133 | # This should be called AFTER filter multilingual and BEFORE vocal isolation. 
Not useful yet 134 | # OKAY THERE HAS TO BE A FASTER WAY TO DO THIS X_X 135 | 136 | # def isolate_subs(self): 137 | # base = AudioSegment.silent(duration=self.duration*1000, frame_rate=self.audio.frame_rate, channels=self.audio.channels, frame_width=self.audio.frame_width) 138 | # samples = np.array(base.get_array_of_samples()) 139 | # frame_rate = base.frame_rate 140 | 141 | # for sub in self.subs_adjusted: 142 | # copy = np.array(self.get_snippet(sub.start, sub.end).get_array_of_samples()) 143 | # start_sample = int(sub.start * frame_rate) 144 | # end_sample = int(sub.end * frame_rate) 145 | 146 | # # Ensure that the copy array has the same length as the region to replace 147 | # copy = copy[:end_sample - start_sample] # Trim if necessary 148 | 149 | # samples[start_sample:end_sample] = copy 150 | 151 | # return AudioSegment( 152 | # samples.tobytes(), 153 | # frame_rate=frame_rate, 154 | # sample_width=base.sample_width, # Adjust sample_width as needed (2 bytes for int16) 155 | # channels=base.channels 156 | # ) 157 | 158 | def isolate_subs(self, subs): 159 | empty_audio = AudioSegment.silent(self.duration * 1000, frame_rate=self.audio.frame_rate) 160 | empty_audio = self.audio 161 | first_sub = subs[0] 162 | empty_audio = empty_audio[0:first_sub.start].silent((first_sub.end-first_sub.start)*1000) 163 | for i, sub in enumerate(subs[:-1]): 164 | print(sub.text) 165 | empty_audio = empty_audio[sub.end:subs[i+1].start].silent((subs[i+1].start-sub.end)*1000, frame_rate=empty_audio.frame_rate, channels=empty_audio.channels, sample_width=empty_audio.sample_width, frame_width=empty_audio.frame_width) 166 | 167 | return empty_audio 168 | 169 | def run_dubbing(self, progress_hook=None, match_rate=True): 170 | total_errors = 0 171 | operation_start_time = time.process_time() 172 | empty_audio = AudioSegment.silent(self.duration * 1000, frame_rate=22050) 173 | status = "" 174 | # with concurrent.futures.ThreadPoolExecutor(max_workers=100) as pool: 175 | # tasks = [pool.submit(dub_task, sub, i) for i, sub in enumerate(subs_adjusted)] 176 | # for future in concurrent.futures.as_completed(tasks): 177 | # pass 178 | for i, sub in enumerate(self.subs_adjusted): 179 | status = f"{i}/{len(self.subs_adjusted)}" 180 | progress_hook(i, f"{status}: {sub.text}") 181 | try: 182 | line = sub.dub_line_file(match_rate=match_rate, match_volume=False)[0] 183 | empty_audio = empty_audio.overlay(line, sub.start*1000) 184 | except Exception as e: 185 | print(e) 186 | total_errors += 1 187 | self.dub_track = empty_audio.export(utils.get_output_path(self.file, '-dubtrack.wav'), format="wav").name 188 | progress_hook(i+1, "Mixing New Audio") 189 | self.mix_av(mixing_ratio=self.mixing_ratio) 190 | progress_hook(-1) 191 | print(f"TOTAL TIME TAKEN: {time.process_time() - operation_start_time}") 192 | # print(total_errors) 193 | 194 | # This runs an ffmpeg command to combine the audio, video, and subtitles with a specific ratio of how loud to make the dubtrack 195 | def mix_av(self, mixing_ratio=1, dubtrack=None, output_path=None): 196 | # i hate python, plz let me use self in func def 197 | if not dubtrack: dubtrack = self.dub_track 198 | if not output_path: output_path = utils.get_output_path(self.file, '-dubbed.mkv') 199 | 200 | input_video = ffmpeg.input(self.file) 201 | input_audio = input_video.audio 202 | if self.background_track: 203 | input_audio = ffmpeg.input(self.background_track) 204 | input_dub = ffmpeg.input(dubtrack).audio 205 | 206 | mixed_audio = ffmpeg.filter([input_audio, input_dub], 'amix', 
duration='first', weights=f"1 {mixing_ratio}") 207 | 208 | output = ( 209 | # input_video['s'] 210 | ffmpeg.output(input_video['v'], mixed_audio, output_path, vcodec="copy", acodec="aac") 211 | .global_args('-loglevel', 'error') 212 | .global_args('-shortest') 213 | ) 214 | ffmpeg.run(output, overwrite_output=True) 215 | 216 | # Change the subs to either a file or a different stream from the video file 217 | def change_subs(self, stream_index=-1, external_path=""): 218 | if external_path: 219 | convert_srt_path = utils.get_output_path(external_path, '.srt') 220 | ffmpeg.input(external_path).output(convert_srt_path).run(overwrite_output=True) 221 | self.subs = self.subs_adjusted = load_subs(convert_srt_path) 222 | return 223 | if self.downloaded: 224 | sub_path = list(self.yt_sub_streams.values())[stream_index][-1]['filepath'] 225 | self.subs = self.subs_adjusted = load_subs(utils.get_output_path(sub_path, '.srt'), sub_path) 226 | else: 227 | # ffmpeg -i output.mkv -map 0:s:1 frick.srt 228 | sub_path = utils.get_output_path(self.file, '.srt') 229 | ffmpeg.input(self.file).output(sub_path, map=f"0:s:{stream_index}").run(overwrite_output=True) 230 | self.subs = self.subs_adjusted = load_subs(sub_path) 231 | 232 | def change_audio(self, stream_index=-1): 233 | audio_path = utils.get_output_path(self.file, f"-${stream_index}.wav") 234 | ffmpeg.input(self.file).output(audio_path, map=f"0:a:{stream_index}").run(overwrite_output=True) 235 | self.audio = AudioSegment.from_file(audio_path) 236 | 237 | def export_clone(self, snippets, path): 238 | empty_audio = AudioSegment.empty() 239 | for snippet in snippets: 240 | empty_audio += AudioSegment.silent() 241 | empty_audio += self.get_snippet(snippet.start, snippet.end) 242 | empty_audio.export(path, "wav") 243 | 244 | def sample_mixing(self) -> AudioSegment: 245 | random_test_sub: DubbedLine = random.choice(self.subs_adjusted) 246 | dubbed_audio, output_path = random_test_sub.dub_line_file() 247 | self.get_snippet(random_test_sub.start, random_test_sub.end).export(utils.snippet_export_path, format="wav") 248 | source = ffmpeg.input(utils.snippet_export_path) 249 | overlayed_tts = ffmpeg.input(output_path) 250 | mixed_audio = ffmpeg.filter([source, overlayed_tts], 'amix', duration='first', weights=f"1 {self.mixing_ratio}") 251 | mixed_sample_path = utils.get_output_path("mixed_sample", ".wav") 252 | output = ( 253 | ffmpeg.output(mixed_audio, mixed_sample_path) 254 | .global_args('-loglevel', 'error') 255 | .global_args('-shortest') 256 | ) 257 | ffmpeg.run(output, overwrite_output=True) 258 | return AudioSegment.from_file(mixed_sample_path) 259 | -------------------------------------------------------------------------------- /video_thumbnail_preview.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/FlorianEagox/WeeaBlind/c282e341e820a6251950c1a6f2ce971eb5f49751/video_thumbnail_preview.png -------------------------------------------------------------------------------- /vocal_isolation.py: -------------------------------------------------------------------------------- 1 | import feature_support 2 | 3 | if feature_support.vocal_isolation_supported: 4 | from spleeter.separator import Separator 5 | from spleeter.audio import adapter 6 | from pydub import AudioSegment 7 | import numpy as np 8 | import utils 9 | 10 | separator = None # Separator('spleeter:2stems') 11 | # I don't have any clue on how to make this work yet, just ignore for now. 
Ideally we'd never have to serialize the audio to wav and then rea read it but alas, bad implementations of PCM will be the death of me 12 | def seperate_ram(video): 13 | audio_loader = adapter.AudioAdapter.default() 14 | sample_rate = 44100 15 | audio = video.audio 16 | # arr = np.array(audio.get_array_of_samples(), dtype=np.float32).reshape((-1, audio.channels)) / ( 17 | # 1 << (8 * audio.sample_width - 1)), audio.frame_rate 18 | arr = np.array(audio.get_array_of_samples()) 19 | audio, _ = audio_loader.load_waveform(arr) 20 | # waveform, _ = audio_loader.load('/path/to/audio/file', sample_rate=sample_rate) 21 | 22 | print("base audio\n", base_audio, "\n") 23 | # Perform the separation : 24 | # prediction = separator.separate(audio) 25 | 26 | def seperate_file(video, isolate_subs=True): 27 | global separator 28 | if not separator: 29 | separator = Separator('spleeter:2stems') 30 | source_audio_path = utils.get_output_path(video.file, '-audio.wav') 31 | isolated_path = utils.get_output_path(video.file, '-isolate.wav') 32 | separator.separate_to_file( 33 | (video.audio).export(source_audio_path, format="wav").name, 34 | utils.get_output_path('.'), 35 | filename_format='{instrument}.{codec}' 36 | ) 37 | # separator.separate_to_file( 38 | # video.isolate_subs().export(source_audio_path, format="wav").name, 39 | # './output/', 40 | # filename_format='{filename}-{instrument}.{codec}' 41 | # ) 42 | background_track = utils.get_output_path('accompaniment', '.wav') 43 | # If we removed primary langauge subs from a multilingual video, we'll need to add them back to the background. 44 | if video.subs_removed: 45 | background = AudioSegment.from_file(background_track) 46 | for sub in video.subs_removed: 47 | background = background.overlay(video.get_snippet(sub.start, sub.end), int(sub.start*1000)) 48 | background.export(background_track, format="wav") 49 | video.background_track = background_track 50 | video.vocal_track = utils.get_output_path('vocals', '.wav') 51 | -------------------------------------------------------------------------------- /weeablind.py: -------------------------------------------------------------------------------- 1 | import wx 2 | import wx.adv 3 | import sys 4 | from tabs.ConfigureVoiceTab import ConfigureVoiceTab 5 | from tabs.SubtitlesTab import SubtitlesTab 6 | from tabs.ListStreams import ListStreamsTab 7 | from tabs.GreeterView import GreeterView 8 | import threading 9 | import utils 10 | from video import Video 11 | import app_state 12 | import feature_support 13 | from Voice import Voice 14 | import os 15 | 16 | class GUI(wx.Panel): 17 | def __init__(self, parent): 18 | super().__init__(parent) 19 | 20 | 21 | lbl_title = wx.StaticText(self, label="WeeaBlind") 22 | lbl_GPU = wx.StaticText(self, label=f"GPU Detected? 
{feature_support.gpu_supported}") 23 | lbl_GPU.SetForegroundColour((0, 255, 0) if feature_support.gpu_supported else (255, 0, 0)) 24 | 25 | 26 | btn_choose_file = wx.Button(self, label="Choose File") 27 | btn_choose_file.Bind(wx.EVT_BUTTON, self.open_file) 28 | 29 | lbl_main_file = wx.StaticText(self, label="Choose a video file or link to a YouTube video:") 30 | self.txt_main_file = wx.TextCtrl(self, style=wx.TE_PROCESS_ENTER, value=utils.test_video_name) 31 | self.txt_main_file.Bind(wx.EVT_TEXT_ENTER, lambda event: self.load_video(self.txt_main_file.Value)) 32 | lbl_dl_lang = wx.StaticText(self, label="Download subtitle language:") 33 | self.txt_dl_lang = wx.TextCtrl(self, value="en") 34 | 35 | lbl_start_time = wx.StaticText(self, label="Start Time:") 36 | lbl_end_time = wx.StaticText(self, label="End Time:") 37 | self.txt_start = wx.TextCtrl(self, style=wx.TE_PROCESS_ENTER, value=utils.seconds_to_timecode(0)) 38 | self.txt_end = wx.TextCtrl(self, style=wx.TE_PROCESS_ENTER, value=utils.seconds_to_timecode(0)) 39 | self.txt_start.Bind(wx.EVT_TEXT_ENTER, self.change_crop_time) 40 | self.txt_end.Bind(wx.EVT_TEXT_ENTER, self.change_crop_time) 41 | 42 | self.chk_match_rate = wx.CheckBox(self, label="Match Speaker Rate") 43 | self.chk_match_rate.SetValue(True) 44 | 45 | self.lb_voices = wx.ListBox(self, choices=[speaker.name for speaker in app_state.speakers]) 46 | self.lb_voices.Bind(wx.EVT_LISTBOX, self.on_voice_change) 47 | self.lb_voices.Select(0) 48 | 49 | btn_new_speaker = wx.Button(self, label="New Speaker") 50 | btn_new_speaker.Bind(wx.EVT_BUTTON, self.add_speaker) 51 | 52 | tab_control = wx.Notebook(self) 53 | tab_control.AddPage(GreeterView(tab_control, self), "Welcome!") 54 | self.tab_voice_config = ConfigureVoiceTab(tab_control, self) 55 | tab_control.AddPage(self.tab_voice_config, "Configure Voices") 56 | self.tab_subtitles = SubtitlesTab(tab_control, self) 57 | tab_control.AddPage(self.tab_subtitles, "Subtitles") 58 | self.streams_tab = ListStreamsTab(tab_control, self) 59 | tab_control.AddPage(self.streams_tab, "Video Streams") 60 | 61 | btn_run_dub = wx.Button(self, label="Run Dubbing!") 62 | btn_run_dub.Bind(wx.EVT_BUTTON, self.run_dub) 63 | 64 | sizer = wx.GridBagSizer(vgap=5, hgap=5) 65 | sizer.Add(lbl_title, pos=(0, 0), span=(1, 2), flag=wx.CENTER | wx.ALL, border=5) 66 | sizer.Add(lbl_GPU, pos=(0, 3), span=(1, 1), flag=wx.CENTER | wx.ALL, border=5) 67 | sizer.Add(lbl_main_file, pos=(2, 0), span=(1, 2), flag=wx.LEFT | wx.TOP, border=5) 68 | sizer.Add(self.txt_main_file, pos=(3, 0), span=(1, 2), flag=wx.EXPAND | wx.LEFT | wx.RIGHT | wx.BOTTOM, border=5) 69 | sizer.Add(btn_choose_file, pos=(3, 2), span=(1, 1), flag=wx.ALIGN_RIGHT | wx.RIGHT | wx.BOTTOM, border=5) 70 | sizer.Add(lbl_dl_lang, pos=(4, 0), span=(1,1), flag=wx.EXPAND | wx.LEFT | wx.RIGHT | wx.BOTTOM, border=5) 71 | sizer.Add(self.txt_dl_lang, pos=(4, 1), span=(1,1), flag=wx.EXPAND | wx.LEFT | wx.RIGHT | wx.BOTTOM, border=5) 72 | sizer.Add(lbl_start_time, pos=(5, 0), flag=wx.LEFT | wx.TOP, border=3) 73 | sizer.Add(self.txt_start, pos=(5, 1), flag= wx.TOP | wx.RIGHT, border=3) 74 | sizer.Add(lbl_end_time, pos=(5, 2), flag=wx.LEFT | wx.TOP, border=3) 75 | sizer.Add(self.txt_end, pos=(5, 3), flag= wx.TOP | wx.RIGHT, border=3) 76 | sizer.Add(self.chk_match_rate, pos=(6, 0), span=(1, 2), flag=wx.LEFT | wx.TOP, border=5) 77 | sizer.Add(self.lb_voices, pos=(7, 0), span=(2, 1), flag=wx.EXPAND | wx.LEFT | wx.TOP, border=5) 78 | sizer.Add(btn_new_speaker, pos=(9, 0), span=(1, 1), flag=wx.LEFT, border=5) 79 | 
sizer.Add(tab_control, pos=(7, 1), span=(2, 3), flag=wx.SHRINK | wx.ALL, border=5) 80 | sizer.Add(btn_run_dub, pos=(10, 2), span=(1, 1), flag=wx.ALIGN_RIGHT | wx.RIGHT | wx.BOTTOM, border=5) 81 | # sizer.AddGrowableCol(1) 82 | # sizer.AddGrowableRow(7) 83 | self.tab_voice_config.update_voice_fields(None) 84 | 85 | self.SetSizerAndFit(sizer) 86 | wx.CallAfter(self.check_ffmpeg) 87 | 88 | def check_ffmpeg(self): 89 | if not feature_support.ffmpeg_supported: 90 | msg_has_ffmpeg = wx.MessageDialog(self, "FFmpeg is not detected on your system, Would you like to automatically install it?", "Install FFmpeg?", style=wx.YES_NO | wx.ICON_QUESTION) 91 | if msg_has_ffmpeg.ShowModal() == wx.ID_YES: 92 | msg_loading = wx.ProgressDialog("Installing FFmpeg...", "Installing FFmpeg", parent=self, style=wx.PD_AUTO_HIDE | wx.PD_SMOOTH) 93 | msg_loading.Update(1) 94 | try: 95 | feature_support.install_ffmpeg() 96 | except Exception as e: 97 | print(e) 98 | wx.MessageBox(f"Installing FFmpeg failed, please install it manually, and add it to your system envionrment path.\n\n{e}", "FFmpeg Install failed", wx.ICON_ERROR, self) 99 | msg_loading.Destroy() 100 | 101 | 102 | def open_file(self, evenet): 103 | dlg = wx.FileDialog( 104 | frame, message="Choose a file", 105 | wildcard="*.*", 106 | style=wx.FD_OPEN | wx.FD_CHANGE_DIR 107 | ) 108 | if dlg.ShowModal() == wx.ID_OK: 109 | self.load_video(dlg.GetPath()) 110 | dlg.Destroy() 111 | 112 | def load_video(self, video_path): 113 | def update_ui(): 114 | self.txt_main_file.Value = app_state.video.file 115 | self.txt_start.SetValue(utils.seconds_to_timecode(app_state.video.start_time)) 116 | self.txt_end.SetValue(utils.seconds_to_timecode(app_state.video.end_time)) 117 | self.tab_subtitles.create_entries() 118 | 119 | def initialize_video(progress=True): 120 | app_state.video = Video(video_path, update_progress if progress else print, lang=self.txt_dl_lang.Value) 121 | wx.CallAfter(update_ui) 122 | wx.CallAfter(self.streams_tab.populate_streams, app_state.video.list_streams()) 123 | 124 | if video_path.startswith("http"): 125 | dialog = wx.ProgressDialog("Downloading Video", "Download starting", 100, self) 126 | 127 | def update_progress(progress=None): 128 | status = progress['status'] if progress else "waiting" 129 | total = progress.get("fragment_count", progress.get("total_bytes", 0)) 130 | if status == "downloading" and total: 131 | completed = progress.get("fragment_index", progress.get("downloaded_bytes", 1)) 132 | percent_complete = int(100 * (completed / total)) 133 | wx.CallAfter(dialog.Update, percent_complete, f"{status}: {percent_complete}% \n {progress['info_dict'].get('fulltitle', '')}") 134 | elif status == "complete": 135 | if dialog: 136 | wx.CallAfter(dialog.Destroy) 137 | elif status == "error": 138 | wx.CallAfter(wx.MessageBox, 139 | f"Failed to download video with the following Error:\n {str(progress['error'])}", 140 | "Error", 141 | wx.ICON_ERROR 142 | ) 143 | update_progress({"status": "complete"}) 144 | 145 | threading.Thread(target=initialize_video).start() 146 | else: 147 | initialize_video(False) 148 | 149 | def change_crop_time(self, event): 150 | app_state.video.update_time( 151 | utils.timecode_to_seconds(self.txt_start.Value), 152 | utils.timecode_to_seconds(self.txt_end.Value) 153 | ) 154 | self.tab_subtitles.create_entries() 155 | 156 | def update_voices_list(self): 157 | self.lb_voices.Set([speaker.name for speaker in app_state.speakers]) 158 | self.lb_voices.Select(self.lb_voices.Strings.index(app_state.current_speaker.name)) 159 | 
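	# Selecting a voice in the list box makes it both the active dubbing voice and the voice being edited in the Configure Voices tab; current_speaker and sample_speaker point at the same Voice object until the engine is changed or "Update Voice" writes the edits back.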
160 | def on_voice_change(self, event): 161 | app_state.current_speaker = app_state.speakers[self.lb_voices.GetSelection()] 162 | app_state.sample_speaker = app_state.current_speaker 163 | self.tab_voice_config.update_voice_fields(event) 164 | 165 | def add_speaker(self, event): 166 | num_voice = self.lb_voices.GetCount() 167 | app_state.speakers.append(Voice(Voice.VoiceType.SYSTEM, name=f"Voice {num_voice}")) 168 | self.update_voices_list() 169 | self.lb_voices.Select(num_voice) 170 | 171 | def run_dub(self, event): 172 | progress_dialog = wx.ProgressDialog( 173 | "Dubbing Progress", 174 | "Starting...", 175 | maximum=len(app_state.video.subs_adjusted) + 1, # +1 for combining phase 176 | parent=self, 177 | style=wx.PD_APP_MODAL | wx.PD_AUTO_HIDE 178 | ) 179 | dub_thread = None 180 | def update_progress(i, text=""): 181 | if i == -1: 182 | return wx.CallAfter(progress_dialog.Destroy) 183 | wx.CallAfter(progress_dialog.Update, i, text) 184 | 185 | dub_thread = threading.Thread(target=app_state.video.run_dubbing, args=(update_progress,self.chk_match_rate.GetValue())) 186 | dub_thread.start() 187 | 188 | if __name__ == '__main__': 189 | utils.create_output_dir() 190 | app = wx.App(False) 191 | frame = wx.Frame(None, wx.ID_ANY, utils.APP_NAME, size=(1270, 800)) 192 | frame.Center() 193 | icon_path = "logo.ico" if not utils.is_deployed else os.path.join('_internal', 'logo.ico') 194 | frame.SetIcon(wx.Icon(os.path.abspath(icon_path), wx.BITMAP_TYPE_ANY)) 195 | gui = GUI(frame) 196 | frame.Show() 197 | app.MainLoop() 198 | -------------------------------------------------------------------------------- /weeablind.spec: -------------------------------------------------------------------------------- 1 | # -*- mode: python ; coding: utf-8 -*- 2 | 3 | 4 | a = Analysis( 5 | ['weeablind.py'], 6 | pathex=[], 7 | binaries=[], 8 | datas=[('*.png', '.'), ('*.ico', '.')], 9 | hiddenimports=[], 10 | hookspath=[], 11 | hooksconfig={}, 12 | runtime_hooks=[], 13 | excludes=[], 14 | noarchive=False, 15 | optimize=0, 16 | ) 17 | pyz = PYZ(a.pure) 18 | 19 | exe = EXE( 20 | pyz, 21 | a.scripts, 22 | [], 23 | exclude_binaries=True, 24 | name='weeablind', 25 | debug=False, 26 | bootloader_ignore_signals=False, 27 | strip=False, 28 | upx=True, 29 | console=True, 30 | disable_windowed_traceback=False, 31 | argv_emulation=False, 32 | target_arch=None, 33 | codesign_identity=None, 34 | entitlements_file=None, 35 | icon=['logo.ico'], 36 | ) 37 | coll = COLLECT( 38 | exe, 39 | a.binaries, 40 | a.datas, 41 | strip=False, 42 | upx=True, 43 | upx_exclude=[], 44 | name='weeablind', 45 | ) 46 | --------------------------------------------------------------------------------
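# A minimal build sketch for the spec above (assuming PyInstaller 6+ is installed in the project's virtual environment):
#
#   pip install pyinstaller
#   pyinstaller weeablind.spec
#
# The onedir bundle lands in dist/weeablind/, and the datas entries (logo.png, logo.ico) are
# collected into dist/weeablind/_internal/, matching the is_deployed asset paths used in
# weeablind.py and tabs/GreeterView.py.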