├── config.json
├── manifest.json
├── workspace.code-workspace
├── .gitignore
├── __init__.py
├── README.md
├── config.md
├── ffmpeg.py
└── voicevox_gen.py
/config.json:
--------------------------------------------------------------------------------
1 | {
2 | }
--------------------------------------------------------------------------------
/manifest.json:
--------------------------------------------------------------------------------
1 | {
2 | "package": "voicevox-generator",
3 | "name": "VOICEVOX Audio Generator"
4 | }
--------------------------------------------------------------------------------
/workspace.code-workspace:
--------------------------------------------------------------------------------
1 | {
2 | "folders": [
3 | {
4 | "path": "."
5 | }
6 | ],
7 | "settings": {}
8 | }
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 | included_fonts/
2 | meta.json
3 | .code-workspace
4 | *.ankiaddon
5 | ffmpeg.exe
6 | ffmpeg
7 | *.wav
8 | __pycache__/
--------------------------------------------------------------------------------
/__init__.py:
--------------------------------------------------------------------------------
1 | from aqt import browser, gui_hooks, qt
2 | from . import voicevox_gen
3 |
def on_browser_will_show_context_menu(browser: browser.Browser, menu: qt.QMenu):
    """Append the "Generate VOICEVOX Audio" entry to the card browser's context menu."""
    def run_generator():
        voicevox_gen.onVoicevoxOptionSelected(browser)

    menu.addSeparator()
    menu.addAction("Generate VOICEVOX Audio", run_generator)

# Register the menu extension with Anki's GUI hook system.
gui_hooks.browser_will_show_context_menu.append(on_browser_will_show_context_menu)
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # VOICEVOX Audio Generator for Anki
2 | Generate high quality Japanese audio for your Anki cards using the VOICEVOX speech synthesis software
3 |
4 | No more dealing with addons that break randomly after a while or generate low quality audio. You can run VOICEVOX for free on your own computer and generate high quality Japanese audio with a simple right click in the card browser.
5 |
6 | Showoff and setup guide video: https://youtu.be/-V3pnCuEIxw
7 |
8 | Download VOICEVOX: https://voicevox.hiroshiba.jp/
9 |
10 | Download from AnkiWeb: https://ankiweb.net/shared/info/366960193
11 |
12 | View Code on GitHub: https://github.com/Toocanzs/anki-voicevox
13 |
14 | # What does this do?
15 | This is a text to speech addon for Anki that makes use of the VOICEVOX speech synthesis engine to generate audio for Japanese Anki cards.
16 |
17 | # Setup
18 | IMPORTANT: This addon requires that the VOICEVOX engine service is running in order to generate audio.
19 | To setup VOICEVOX for this addon, follow these steps (Video guide here https://youtu.be/-V3pnCuEIxw):
20 | 1. You can download VOICEVOX from here: https://voicevox.hiroshiba.jp/
21 | * I prefer the zip package, but you can use the installer if you like
22 | * Note that it's about 1gb in size. This is a machine learning model so it's quite large
23 |
24 | 2. Download the addon from AnkiWeb: https://ankiweb.net/shared/info/366960193
25 | 3. Navigate to your VOICEVOX install and find `run.exe`. `run.exe` will launch the VOICEVOX engine and allow Anki to communicate with it.
26 | * You must keep `run.exe` running to generate audio in Anki
27 | * You can also run the GUI version `VOICEVOX.exe` instead if you'd prefer
28 |
29 | 4. Open the Anki card browser and right click on any card and select "Generate VOICEVOX Audio" from the dropdown
30 | * You can drag and select multiple cards to generate audio for many cards at once. Note that if you select two different types of cards only the fields that they have in common will appear in the source/destination dropdown.
31 |
32 | 5. Select the source and destination fields for generating audio
33 | * Source refers to which field the addon should read from to generate audio. For example you usually want to read from the `Sentence` field or similar.
34 | * Destination refers to the field that the addon should output the audio to. Fields like `Audio` or `Audio On Front`. Whatever field you want the audio to be placed in. NOTE: This will overwrite the contents of this field, so don't select any field you don't want overwritten with audio
35 |
36 | 6. Select a speaker and a style from the dropdown. You can preview the voices by selecting "Preview Voice"
37 |
38 | 7. Click "Generate Audio" and wait for the audio to be generated
39 | * Note that the time it takes to generate audio can vary based on your hardware. VOICEVOX works whether you run it on a dedicated GPU or just a CPU, but running it on the CPU will be much slower.
40 |
41 | # Building
42 | * Windows
43 |   * Building the .ankiaddon can be done by running `build.bat`
44 | * NOTE: requires powershell 7 ( run `winget upgrade Microsoft.PowerShell` to get powershell 7)
45 | * Linux
46 | * On Linux there currently isn't a one click build setup, but all that needs to be done is to zip everything except for `meta.json`(it may not exist) into a `.zip` file, and then rename to a `.ankiaddon` file
--------------------------------------------------------------------------------
/config.md:
--------------------------------------------------------------------------------
1 |
2 | * `limitedToTheseDecks` Every sub section has the ability to limit it to only certain deck names. Please note that the true deck name might differ from what's displayed visually. To get the correct deck name click the options/gear icon and click "rename" and copy the text. Both this and `globalDeckLimitation` support wildcards. For example `japanese*` will match `japanese kanji`, `japanese sentences` etc.
3 | * `globalDeckLimitation` This is for convenience if you just want all the features enabled but only for certain decks. This saves you having to enter the deck name in the `limitedToTheseDecks` for every option.
4 |     * Note: For a feature to be enabled the card must be a part of a deck in the `globalDeckLimitation` AND `limitedToTheseDecks` for that feature. So if you have a global deck limit of `["Deck A"]` and the font randomizer has a deck limit of `["Deck B"]`, then Deck A will not have the font randomizer enabled, because the global deck limiter restricts that.
5 |
6 | * `katakanaConverter` (Disabled by default) This feature swaps all hiragana and katakana around to allow for some extra katakana reading practice.
7 | * `chance` Controls the percent chance to swap all hiragana and katakana.
8 | * 0 is off, 1 is always swap, 0.5 is 50% chance. Any value between 0 and 1 works.
9 | * `fontRandomizer` Switches randomly between a set of selected fonts
10 | * `fontsToRandomlyChoose` A list of what fonts to randomly change to. The fonts you enter in `fontsToRandomlyChoose` **MUST** be in your `/collections.media` folder in anki. Format `["A.tff", "B.tff", "C.tcc"]`
11 | * NOTE: Do not include your default font in this. It will choose between the fonts in this list AND the one you have on your card already. For example if you want to choose from fonts `A.tff`, `B.tff`, and `C.tff`, and the font on your card is already `A.tff` then `fontsToRandomlyChoose` should be `["B.tff", "C.tff"]`
12 | * A few fonts are included by default
13 |
14 | * `verticalText` Switches the card to layout text in a vertical left to right fashion, just like light novels are displayed. This is for vertical text reading practice
15 | * `chance` The chance to convert to vertical text. NOTE: This is set to `0` by default.
16 |     * NOTE: This feature requires you to mark what section of your card is your expression field by adding the class name `expression-field` to it. Also for ease of use, Migaku cards should work by default as the class name `migaku-word-front` is also used to enable this feature. For example, if you edit your card and see something like `<div class="my-class">{{Expression}}</div>` you just need to add `expression-field` to the class list like so `<div class="my-class expression-field">{{Expression}}</div>`
17 | * This feature may mess with your layout in unexpected ways (although it's only temporary). If you run into issues after turning it on, set it back to 0 and let me know what went wrong through a github issue https://github.com/Toocanzs/AnkiJapaneseCardRandomizer/issues/new
18 | * `styleMaxHeight` This controls the maximum height for vertical text. Default is 80% of the screen size.
19 |
20 | * `sizeRandomizer` Randomizes size of any elements with `expression-field` or `migaku-word-front` as their class. To set this up read the setup instructions for `verticalText`
21 | * `enabled` Enables the feature. `true` or `false`
22 | * `minSize` Minimum size
23 | * `maxSize` Maximum size
24 | * `units` Sets the units to use for styling the font size. For example `px` as a unit would mean that if 50 was randomly chosen to be the font size, the final style would be `50px`
25 |
26 |
27 |
--------------------------------------------------------------------------------
/ffmpeg.py:
--------------------------------------------------------------------------------
1 | from os.path import dirname, join, exists
2 | import os
3 | import stat
4 | import requests
5 | import json
6 | from aqt import mw
7 | from anki.hooks import addHook
8 | import zipfile
9 | import subprocess
10 | import sys
11 | import shutil
12 |
# Platform detection flags: exactly one of these is True.
# Anything that is neither macOS nor Windows (including the BSDs) is
# treated as Linux for the purposes of picking an ffmpeg build.
_platform = sys.platform
is_mac = _platform.startswith("darwin")
is_win = _platform.startswith("win32")
is_lin = not (is_mac or is_win)
17 |
class FFmpegInstaller:
    """Locates (or downloads) an ffmpeg binary used to convert WAV audio.

    Resolution order: a previously downloaded binary in the addon folder,
    then a system-wide `ffmpeg` on PATH, then a download from ffbinaries.com.
    `can_convert` is True only once a usable binary is known.
    """

    def __init__(self):
        self.addonPath = dirname(__file__)
        self.can_convert = False

        self.ffmpeg_filename = "ffmpeg"
        if is_win:
            self.ffmpeg_filename += ".exe"

        # Default location: next to this addon's files; may be replaced by a
        # system-wide ffmpeg in GetFFmpegIfNotExist.
        self.full_ffmpeg_path = join(self.addonPath, self.ffmpeg_filename)

    def _GetDownloadUrl(self):
        """Query ffbinaries for the platform-appropriate ffmpeg 6.1 download URL.

        Returns the URL string, or None if the API is unreachable, responds
        with a non-200 status, or the platform is unrecognized.
        """
        try:
            # BUGFIX: previously this request was not wrapped in try/except, so
            # a network failure propagated out of the profileLoaded hook.
            api_response = requests.get("https://ffbinaries.com/api/v1/version/6.1", timeout=30)
        except Exception:
            return None
        if api_response.status_code != 200:
            return None
        binaries_json = json.loads(api_response.content)
        if is_win:
            return binaries_json['bin']['windows-64']['ffmpeg']
        if is_lin:
            return binaries_json['bin']['linux-64']['ffmpeg']
        if is_mac:
            return binaries_json['bin']['osx-64']['ffmpeg']
        return None

    def GetFFmpegIfNotExist(self):
        """Ensure an ffmpeg binary is available, downloading one if necessary.

        Sets `can_convert` (and possibly `full_ffmpeg_path`) on success;
        failures are logged and leave `can_convert` False.
        """
        if exists(self.full_ffmpeg_path) or self.can_convert:
            self.can_convert = True
            return

        # Prefer a system-wide ffmpeg if one is already on PATH.
        system_ffmpeg_path = shutil.which("ffmpeg")
        if system_ffmpeg_path:
            self.full_ffmpeg_path = system_ffmpeg_path
            self.can_convert = True
            return

        download_url = self._GetDownloadUrl()
        if download_url is None:
            return

        temp_file_path = join(self.addonPath, "ffmpeg.zip")
        try:
            # Download the zip in chunks to avoid holding it all in memory.
            with requests.get(download_url, stream=True) as ffmpeg_request:
                ffmpeg_request.raise_for_status()
                with open(temp_file_path, 'wb') as ffmpeg_file:
                    for chunk in ffmpeg_request.iter_content(chunk_size=8192):
                        if chunk:
                            ffmpeg_file.write(chunk)
            # Extract next to the addon files (where full_ffmpeg_path points).
            with zipfile.ZipFile(temp_file_path) as zf:
                zf.extractall(dirname(self.full_ffmpeg_path))
            # BUGFIX: only report success when the expected binary actually
            # exists after extraction (previously can_convert was set
            # unconditionally).
            if exists(self.full_ffmpeg_path):
                # Mark executable on platforms that need that
                if not is_win:
                    try:
                        st = os.stat(self.full_ffmpeg_path)
                        os.chmod(self.full_ffmpeg_path, st.st_mode | stat.S_IEXEC)
                    except OSError:
                        print("Failed to mark ffmpeg as executable")
                self.can_convert = True
        except Exception as e:
            print("FFmpeg download/extract failed:", e)
        finally:
            # BUGFIX: always clean up the temporary zip, even on failure.
            if exists(temp_file_path):
                try:
                    os.remove(temp_file_path)
                except OSError:
                    pass
83 |
# Module-level shared instance; ConvertWav reads its can_convert flag and
# full_ffmpeg_path, and GetFFmpegIfNotExist is registered on profileLoaded below.
ffmpegInstaller = FFmpegInstaller()
85 |
def ConvertWav(wav_data, format):
    """Convert raw WAV bytes to the given container/codec via ffmpeg.

    format: ffmpeg output format name, e.g. "mp3" or "opus".
    Returns the encoded bytes, or None when ffmpeg is unavailable or the
    conversion fails.
    """
    if not ffmpegInstaller.can_convert:
        return None
    try:
        # If windows provide additional flags to subprocess.Popen
        if is_win:
            # Prevent a console window from flashing up for each conversion.
            startupinfo = subprocess.STARTUPINFO()
            startupinfo.dwFlags |= subprocess.STARTF_USESHOWWINDOW
        else:
            # On MacOS, subprocess.STARTUPINFO() does not exist
            startupinfo = None

        ffmpeg_command = [ffmpegInstaller.full_ffmpeg_path, '-y', '-nostats', '-hide_banner', '-i', 'pipe:', '-f', format]

        if format == "mp3":
            ffmpeg_command += ["-qscale:a", "3"]  # VBR quality ~3
        elif format == "opus":
            ffmpeg_command += ["-b:a", "32k"]

        ffmpeg_command.append('-')  # write encoded output to stdout

        process = subprocess.Popen(ffmpeg_command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, stdin=subprocess.PIPE, startupinfo=startupinfo)
        output, errors = process.communicate(input=wav_data)
        # BUGFIX: previously a failed ffmpeg run returned empty/partial bytes;
        # check the exit code and report stderr instead.
        if process.returncode != 0:
            print("VoiceVox conversion error: ffmpeg exited with", process.returncode,
                  errors.decode('utf-8', 'replace') if errors else "")
            return None
        return output
    except Exception as e:
        print("VoiceVox conversion error:", e)
        return None
113 |
114 | addHook("profileLoaded", ffmpegInstaller.GetFFmpegIfNotExist)
115 |
--------------------------------------------------------------------------------
/voicevox_gen.py:
--------------------------------------------------------------------------------
1 | from aqt.qt import QComboBox, QHBoxLayout, QLabel, QPushButton, QApplication, QMessageBox, QSlider, QLineEdit, QToolButton, QIcon
2 | from aqt import browser, gui_hooks, qt
3 | from aqt import mw
4 | from aqt.sound import av_player
5 | import requests
6 | import json
7 | import urllib.parse
8 | from aqt.utils import showText
9 | from os.path import join, exists, dirname
10 | import random
11 | import base64
12 | import uuid
13 | import re
14 | import zipfile
15 | import io
16 | from . import ffmpeg
17 | import traceback
18 | import re, html
19 | import json
20 | import datetime
21 |
22 | VOICEVOX_CONFIG_NAME = "VOICEVOX_CONFIG"
23 |
def getCommonFields(selected_notes):
    """Return the set of field names shared by every note in selected_notes.

    Returns an empty set for an empty selection. Raises an Exception (with a
    link to the issue tracker) if any note id cannot be resolved to a note.
    """
    common_fields = set()

    first = True

    for note_id in selected_notes:
        note = mw.col.get_note(note_id)
        if note is None:
            # BUGFIX: the original joined Note objects here, which itself raised
            # a TypeError while building this message; join the note ids instead.
            raise Exception(f"Note with id {note_id} is None.\nNotes: {','.join(str(nid) for nid in selected_notes)}.\nPlease submit an issues with more information about what cards caused this at https://github.com/Toocanzs/anki-voicevox/issues/new")
        model = note.note_type()
        model_fields = {f['name'] for f in model['flds']}
        if first:
            # Take the first note type's fields as the baseline
            common_fields = model_fields
        else:
            # Keep only fields every note type seen so far has
            common_fields = common_fields.intersection(model_fields)
        first = False
    return common_fields
def getSpeakersOrNone():
    """Fetch the speaker list from the local VOICEVOX engine.

    Returns the parsed JSON on success, or None when the engine is not
    running, unreachable, or responds with a non-200 status.
    """
    try:
        speakers_response = requests.get("http://127.0.0.1:50021/speakers", timeout=5)
        if speakers_response.status_code == 200:
            return json.loads(speakers_response.content)
        return None
    except Exception:
        # Engine not running / connection refused -> treated as "no speakers".
        return None
49 |
def getSpeakerInfo(speaker_uuid):
    """Fetch detailed info for one speaker from the local VOICEVOX engine.

    Returns the parsed JSON on success, or None on any failure.
    """
    try:
        # URL-encode the uuid defensively (harmless for well-formed uuids).
        speakers_response = requests.get(
            "http://127.0.0.1:50021/speaker_info?speaker_uuid=" + urllib.parse.quote(str(speaker_uuid)),
            timeout=5)
        if speakers_response.status_code == 200:
            return json.loads(speakers_response.content)
        return None
    except Exception:
        return None
58 |
def getSpeakerList(speaker_json):
    """Flatten the raw /speakers JSON into a list of
    (speaker_name, [(style_name, style_id), ...], speaker_uuid) tuples."""
    return [
        (
            entry['name'],
            [(style['name'], style['id']) for style in entry['styles']],
            entry['speaker_uuid'],
        )
        for entry in speaker_json
    ]
68 |
def getSpeaker(speakers, speaker_combo, style_combo):
    """Resolve the dropdown selections into (speaker_id, speaker, style_info).

    speakers is a list of (name, [(style_name, style_id), ...], uuid) tuples
    as produced by getSpeakerList. Raises if either selection is not found.
    """
    chosen_speaker_name = speaker_combo.itemText(speaker_combo.currentIndex())
    speaker = None
    for candidate in speakers:
        if candidate[0] == chosen_speaker_name:
            speaker = candidate
            break
    if speaker is None:
        raise Exception(f"Speaker '{chosen_speaker_name}' not found in getSpeaker")

    chosen_style_name = style_combo.itemText(style_combo.currentIndex())
    style_info = None
    for candidate in speaker[1]:
        if candidate[0] == chosen_style_name:
            style_info = candidate
            break
    if style_info is None:
        raise Exception(f"Style '{chosen_style_name}' not found in getSpeaker")

    # style_info[1] is the engine's numeric speaker (style) id.
    return (style_info[1], speaker, style_info)
84 |
def parse_filename_template(template: str, placeholders: dict) -> str:
    """Expand {{...}} placeholders in *template*.

    Supported placeholders include {{uid}}, {{speaker}}, {{style}}, {{deck}},
    {{deck-full}}, {{date}} and {{field:Name}} (looked up under
    placeholders["fields"]). Unknown placeholders expand to the empty string.
    """
    result = template

    # Replacements are applied sequentially, one placeholder token at a time.
    for token in re.findall(r"{{(.*?)}}", template):
        head, sep, field_name = token.partition(":")
        if sep and head == "field":
            # Field placeholder: look the name up in the per-note field map.
            replacement = placeholders.get("fields", {}).get(field_name, "")
        else:
            # Plain placeholder: missing keys become the empty string.
            replacement = placeholders.get(token, "")
        result = result.replace("{{" + token + "}}", replacement)

    return result
111 |
112 | class MyDialog(qt.QDialog):
    def __init__(self, browser, parent=None) -> None:
        """Build the VOICEVOX generation dialog for the browser's selected notes.

        Lays out the source/destination field pickers, speaker/style pickers,
        preview buttons, output options, a filename template editor with live
        validation, and the audio tuning sliders. Slider values are persisted
        to the addon config as they change.
        """
        super().__init__(parent)
        self.selected_notes = browser.selectedNotes()

        config = mw.addonManager.getConfig(__name__)

        layout = qt.QVBoxLayout()

        layout.addWidget(qt.QLabel("Selected notes: " + str(len(self.selected_notes))))

        self.grid_layout = qt.QGridLayout()

        common_fields = getCommonFields(self.selected_notes)

        # At least two shared fields are needed: one to read text from and one
        # to write the audio into.
        if len(common_fields) < 1:
            QMessageBox.critical(mw, "Error", f"The chosen notes share no fields in common. Make sure you're not selecting two different note types")
        elif len(common_fields) == 1:
            QMessageBox.critical(mw, "Error", f"The chosen notes only share a single field in common '{list(common_fields)[0]}'. This would leave no field to put the generated audio without overwriting the sentence data")

        self.source_combo = qt.QComboBox()
        self.destination_combo = qt.QComboBox()

        # Restore the previously used fields from the config, or fall back to
        # guessing sensible defaults by field name.
        last_source_field = config.get('last_source_field') or None
        last_destination_field = config.get('last_destination_field') or None
        source_field_index = 0
        destination_field_index = 0
        i = 0
        for field in common_fields:
            if last_source_field is None:
                if "expression" == field.lower() or "sentence" == field.lower() or "front" == field.lower():
                    source_field_index = i
            elif field == last_source_field:
                source_field_index = i

            if last_destination_field is None:
                if "audio" == field.lower():
                    destination_field_index = i
            elif field == last_destination_field:
                destination_field_index = i
            self.source_combo.addItem(field)
            self.destination_combo.addItem(field)
            i += 1

        self.source_combo.setCurrentIndex(source_field_index)
        self.destination_combo.setCurrentIndex(destination_field_index)


        source_label = qt.QLabel("Source field: ")
        source_tooltip = "The field to read from. For example if your sentence is in the field 'Expression' you want to choose 'Expression' as the source field to read from"
        source_label.setToolTip(source_tooltip)

        destination_label = qt.QLabel("Destination field: ")
        destination_tooltip = "The field to write the audio to. Typically you want to choose a field like 'Audio' or 'Audio on Front' or wherever you want the audio placed on your card."
        destination_label.setToolTip(destination_tooltip)

        self.source_combo.setToolTip(source_tooltip)
        self.destination_combo.setToolTip(destination_tooltip)

        self.grid_layout.addWidget(source_label, 0, 0)
        self.grid_layout.addWidget(self.source_combo, 0, 1)
        self.grid_layout.addWidget(destination_label, 0, 2)
        self.grid_layout.addWidget(self.destination_combo, 0, 3)

        # TODO: Does anyone actually want to not ignore stuff in brackets? The checkbox is here if we need it but I don't think anyone wants brackets to be read
        self.ignore_brackets_checkbox = qt.QCheckBox("Ignore stuff in brackets [...]")
        self.ignore_brackets_checkbox.setToolTip("Ignores things between brackets. Usually Japanese cards have pitch accent and reading info in brackets. Leave this checked unless you really know what you're doing")
        self.ignore_brackets_checkbox.setChecked(True)
        # self.grid_layout.addWidget(self.ignore_brackets_checkbox, 0, 4)

        # Bail out early (with an explanatory label) when the VOICEVOX engine
        # cannot be reached; the dialog then contains no generation controls.
        speaker_json = getSpeakersOrNone()
        if speaker_json is None:
            layout.addWidget(qt.QLabel("VOICEVOX service was unable to get speakers list. Please make sure the VOICEVOX service is running and reopen this dialog"))
            self.setLayout(layout)
            return

        self.grid_layout.addWidget(qt.QLabel("Speaker: "), 1, 0)
        self.speakers = getSpeakerList(speaker_json)
        self.speaker_combo = qt.QComboBox()
        for speaker in self.speakers:
            self.speaker_combo.addItem(speaker[0])
        self.grid_layout.addWidget(self.speaker_combo, 1, 1)

        self.style_combo = qt.QComboBox()

        # Repopulate the style dropdown whenever the selected speaker changes.
        def update_speaker_style_combo_box():
            speaker_name = self.speaker_combo.itemText(self.speaker_combo.currentIndex())
            speaker = next((x for x in self.speakers if x[0] == speaker_name), None) # grab the first speaker with this name
            if speaker is None:
                print("Speaker not found in update_speaker_style_combo_box")
                return
            self.style_combo.clear()
            for style in speaker[1]:
                self.style_combo.addItem(style[0])

        self.speaker_combo.currentIndexChanged.connect(update_speaker_style_combo_box)
        update_speaker_style_combo_box() # run this the first time so the default speaker style is setup

        last_speaker_name = config.get('last_speaker_name') or None
        last_style_name = config.get('last_style_name') or None

        # find the speaker/style from the previously saved config data and pick it from the dropdown
        speaker_combo_index = 0
        i = 0
        for speaker_item in [self.speaker_combo.itemText(i) for i in range(self.speaker_combo.count())]:
            if speaker_item == last_speaker_name:
                speaker_combo_index = i
                break
            i += 1

        style_combo_index = 0
        i = 0
        for style_item in [self.style_combo.itemText(i) for i in range(self.style_combo.count())]:
            if style_item == last_style_name:
                style_combo_index = i
                break
            i += 1

        self.speaker_combo.setCurrentIndex(speaker_combo_index)
        self.style_combo.setCurrentIndex(style_combo_index) # NOTE: The previous style should probably be stored as a tuple with the speaker, but this is good enough. IE. Person A style X is not the same as Person B style X

        self.grid_layout.addWidget(qt.QLabel("Style: "), 1, 2)
        self.grid_layout.addWidget(self.style_combo, 1, 3)

        # Keep track of the current note index for previewing actual content
        self.preview_note_index = 0
        def resetPreviewIndex(*args):
            self.preview_note_index = 0

        def voiceValueChanged(*args):
            resetPreviewIndex(*args)

        # Connect the voice value changed signals
        self.speaker_combo.currentIndexChanged.connect(voiceValueChanged)
        self.style_combo.currentIndexChanged.connect(voiceValueChanged)
        self.source_combo.currentIndexChanged.connect(voiceValueChanged)

        # Preview buttons: a random sample sentence, and the actual source
        # field contents (cycling through notes on repeated clicks).
        preview_layout = qt.QHBoxLayout()
        label = QLabel("Preview Voice")
        preview_layout.addWidget(label)

        self.preview_voice_button_sample = qt.QPushButton("🎲", self)
        self.preview_voice_button_sample.setToolTip("Preview a random sample sentence")
        self.preview_voice_button_sample.clicked.connect(self.PreviewVoiceSample)
        preview_layout.addWidget(self.preview_voice_button_sample)

        self.preview_voice_button_actual = qt.QPushButton("🗂️", self)
        self.preview_voice_button_actual.setToolTip("Preview the source field\n\nRepeated clicks cycle through each note.")
        self.preview_voice_button_actual.clicked.connect(self.PreviewVoiceActual)
        preview_layout.addWidget(self.preview_voice_button_actual)

        self.grid_layout.addLayout(preview_layout, 1, 4)

        # Output options; stored in the config as the strings "true"/"false".
        self.append_audio = qt.QCheckBox("Append Audio")
        append_audio_checked = config.get('append_audio') or "false"
        self.append_audio.setChecked(True if append_audio_checked == "true" else False)
        self.grid_layout.addWidget(self.append_audio, 2, 0)

        self.use_opus = qt.QCheckBox("Use opus instead of mp3")
        use_opus_checked = config.get('use_opus') or "false"
        self.use_opus.setChecked(True if use_opus_checked == "true" else False)
        self.grid_layout.addWidget(self.use_opus, 2, 1)

        # Filename template
        self.grid_layout.addWidget(qt.QLabel("Filename: "), 3, 0)

        self.filename_template_edit = QLineEdit()
        default_template = config.get('filename_template') or "VOICEVOX_{{speaker}}_{{style}}_{{uid}}"
        self.filename_template_edit.setText(default_template)
        self.filename_template_edit.setToolTip(
            "Use placeholders like {{uid}}, {{speaker}}, {{style}}, {{deck}}, {{deck-full}}, {{date}}, or {{field:}}.\n"
            "Example: VOICEVOX_{{speaker}}_{{style}}_{{field:Card ID}}.mp3\n"
            "If you omit {{uid}}, files may clash unless other placeholders ensure uniqueness."
        )

        self.grid_layout.addWidget(self.filename_template_edit, 3, 1, 1, 3)

        # "?" Button for help
        self.help_button = QToolButton()
        self.help_button.setText("?")
        self.help_button.setToolTip(
            "Possible placeholders:\n"
            " {{uid}} - random unique identifier\n"
            " {{speaker}} - speaker name\n"
            " {{style}} - speaking style\n"
            " {{deck}} - card deck name\n"
            " {{deck-full}} - full card deck name, including parent deck hierarchy\n"
            " {{date}} - current date in ISO format\n"
            " {{field:}} - replaces with note field content\n\n"
            "Example: VOICEVOX_{{speaker}}_{{style}}_{{field:ID}}_{{uid}}"
        )
        self.grid_layout.addWidget(self.help_button, 3, 4)

        # Warning icon if {{uid}} is missing
        self.uid_warning_label = QLabel()
        self.uid_warning_label.setToolTip("Warning: Without {{uid}}, you might get file name collisions. Each generated audio must have a globally unique name.")
        # Use some built-in icon or text: exclamation triangle
        self.uid_warning_label.setPixmap(self.style().standardIcon(qt.QStyle.StandardPixmap.SP_MessageBoxWarning).pixmap(16,16))
        self.uid_warning_label.setVisible(False)
        self.grid_layout.addWidget(self.uid_warning_label, 3, 5)

        # Red border if invalid placeholders
        def validate_template():
            t = self.filename_template_edit.text()
            # If we find placeholders not recognized: we won't parse them now with a big logic,
            # but let's warn if we see something that doesn't match recognized patterns.
            # For simplicity, only check for missing 'uid' for now, or obviously invalid placeholders.
            has_uid = "{{uid}}" in t
            self.uid_warning_label.setVisible(not has_uid)

            # Check for well-formed placeholders (all must be either "uid", "speaker", "style", "deck", "date", or "field:")
            # We'll just do a quick check: anything that doesn't start with field: or match known placeholders is suspect.
            # This is a mild approach; you can expand if needed.
            found_placeholders = re.findall(r"{{(.*?)}}", t)
            invalids = []
            for ph in found_placeholders:
                if ph not in ["uid", "speaker", "style", "deck", "deck-full", "date"] and not ph.startswith("field:"):
                    invalids.append(ph)
                elif ph.startswith("field:"):
                    field_name = ph[len("field:"):].strip()
                    if field_name not in common_fields:
                        invalids.append(ph)

            if invalids:
                self.filename_template_edit.setStyleSheet("border: 1px solid red;")
                self.filename_template_edit.setToolTip(
                    f"Invalid placeholder(s) detected: {', '.join(invalids)}\n"
                    "Valid placeholders: uid, speaker, style, deck, deck-full, date, field:"
                )
            else:
                self.filename_template_edit.setStyleSheet("")
                self.filename_template_edit.setToolTip(
                    "Use placeholders like {{uid}}, {{speaker}}, {{style}}, {{deck}}, {{deck-full}}, {{date}}, or {{field:}}."
                )

        self.filename_template_edit.textChanged.connect(validate_template)
        validate_template()

        self.cancel_button = qt.QPushButton("Cancel")
        self.generate_button = qt.QPushButton("Generate Audio")

        self.cancel_button.clicked.connect(self.reject)
        self.generate_button.clicked.connect(self.pre_accept)

        self.grid_layout.addWidget(self.cancel_button, 4, 0, 1, 2)
        self.grid_layout.addWidget(self.generate_button, 4, 3, 1, 2)

        # Factory producing a slot that refreshes the slider's label and
        # persists the raw slider value (an int; 100 == scale of 1.0) to the
        # addon config on every change.
        def update_slider(slider, label, config_name, slider_desc):
            def update_this_slider(value):
                label.setText(f'{slider_desc} {slider.value() / 100}')
                config[config_name] = slider.value()
                mw.addonManager.writeConfig(__name__, config)
            return update_this_slider

        volume_slider = QSlider(qt.Qt.Orientation.Horizontal)
        volume_slider.setMinimum(0)
        volume_slider.setMaximum(200)
        volume_slider.setValue(config.get('volume_slider_value') or 100)

        volume_label = QLabel(f'Volume scale {volume_slider.value() / 100}')

        volume_slider.valueChanged.connect(update_slider(volume_slider, volume_label, 'volume_slider_value', 'Volume scale'))
        volume_slider.valueChanged.connect(voiceValueChanged)

        self.grid_layout.addWidget(volume_label, 5, 0, 1, 2)
        self.grid_layout.addWidget(volume_slider, 5, 3, 1, 2)

        pitch_slider = QSlider(qt.Qt.Orientation.Horizontal)
        pitch_slider.setMinimum(-15)
        pitch_slider.setMaximum(15)
        pitch_slider.setValue(config.get('pitch_slider_value') or 0)

        pitch_label = QLabel(f'Pitch scale {pitch_slider.value() / 100}')

        pitch_slider.valueChanged.connect(update_slider(pitch_slider, pitch_label, 'pitch_slider_value', 'Pitch scale'))
        pitch_slider.valueChanged.connect(voiceValueChanged)

        self.grid_layout.addWidget(pitch_label, 6, 0, 1, 2)
        self.grid_layout.addWidget(pitch_slider, 6, 3, 1, 2)

        speed_slider = QSlider(qt.Qt.Orientation.Horizontal)
        speed_slider.setMinimum(50)
        speed_slider.setMaximum(200)
        speed_slider.setValue(config.get('speed_slider_value') or 100)

        speed_label = QLabel(f'Speed scale {speed_slider.value() / 100}')

        speed_slider.valueChanged.connect(update_slider(speed_slider, speed_label, 'speed_slider_value', 'Speed scale'))
        speed_slider.valueChanged.connect(voiceValueChanged)

        self.grid_layout.addWidget(speed_label, 7, 0, 1, 2)
        self.grid_layout.addWidget(speed_slider, 7, 3, 1, 2)

        # Intonation slider
        intonation_slider = QSlider(qt.Qt.Orientation.Horizontal)
        intonation_slider.setMinimum(1)
        intonation_slider.setMaximum(200)
        intonation_slider.setValue(config.get('intonation_slider_value') or 100)

        intonation_label = QLabel(f'Intonation scale {intonation_slider.value() / 100}')

        intonation_slider.valueChanged.connect(update_slider(intonation_slider, intonation_label, 'intonation_slider_value', 'Intonation scale'))
        intonation_slider.valueChanged.connect(voiceValueChanged)

        self.grid_layout.addWidget(intonation_label, 8, 0, 1, 2)
        self.grid_layout.addWidget(intonation_slider, 8, 3, 1, 2)

        # Initial silence slider
        initial_silence_slider = QSlider(qt.Qt.Orientation.Horizontal)
        initial_silence_slider.setMinimum(0)
        initial_silence_slider.setMaximum(150)
        initial_silence_slider.setValue(config.get('initial_silence_slider_value') or 10)

        initial_silence_label = QLabel(f'Initial silence scale {initial_silence_slider.value() / 100}')

        initial_silence_slider.valueChanged.connect(update_slider(initial_silence_slider, initial_silence_label, 'initial_silence_slider_value', 'Initial silence scale'))
        initial_silence_slider.valueChanged.connect(voiceValueChanged)

        self.grid_layout.addWidget(initial_silence_label, 9, 0, 1, 2)
        self.grid_layout.addWidget(initial_silence_slider, 9, 3, 1, 2)

        # Final silence slider
        final_silence_slider = QSlider(qt.Qt.Orientation.Horizontal)
        final_silence_slider.setMinimum(0)
        final_silence_slider.setMaximum(150)
        final_silence_slider.setValue(config.get('final_silence_slider_value') or 10)

        final_silence_label = QLabel(f'Final silence length {final_silence_slider.value() / 100}')

        final_silence_slider.valueChanged.connect(update_slider(final_silence_slider, final_silence_label, 'final_silence_slider_value', 'Final silence scale'))
        final_silence_slider.valueChanged.connect(voiceValueChanged)

        self.grid_layout.addWidget(final_silence_label, 10, 0, 1, 2)
        self.grid_layout.addWidget(final_silence_slider, 10, 3, 1, 2)

        layout.addLayout(self.grid_layout)

        self.setLayout(layout)
450 | def pre_accept(self):
451 | if self.source_combo.currentIndex() == self.destination_combo.currentIndex():
452 | source_text = self.source_combo.itemText(self.source_combo.currentIndex())
453 | destination_text = self.destination_combo.itemText(self.destination_combo.currentIndex())
454 | QMessageBox.critical(mw, "Error", f"The chosen source field '{source_text}' is the same as the destination field '{destination_text}'.\nThis would overwrite the field you're reading from.\n\nTypically you want to read from a field like 'sentence' and output to 'audio', but in this case you're trying to read from 'sentence' and write to 'sentence' which cause your sentence to be overwritten")
455 | else:
456 | self.accept()
457 |
458 | def getNoteTextAndSpeaker(self, note_id):
459 | (speaker_index, speaker, style_info) = getSpeaker(self.speakers, self.speaker_combo, self.style_combo)
460 | source_field = self.source_combo.itemText(self.source_combo.currentIndex())
461 | note = mw.col.get_note(note_id)
462 | note_text = note[source_field]
463 |
464 | # Remove html tags https://stackoverflow.com/a/19730306
465 | tag_re = re.compile(r'(|<[^>]*>)')
466 | entity_re = re.compile(r'(&[^;]+;)')
467 |
468 | note_text = entity_re.sub('', note_text)
469 | note_text = tag_re.sub('', note_text)
470 |
471 | # Remove stuff between brackets. Usually japanese cards have pitch accent and reading info in brackets like 「 タイトル[;a,h] を 聞[き,きく;h]いた わけ[;a] じゃ ない[;a] !」
472 | if self.ignore_brackets_checkbox.isChecked():
473 | note_text = re.sub("\[.*?\]", "", note_text)
474 | note_text = re.sub(" ", "", note_text) # there's a lot of spaces for whatever reason which throws off the voice gen so we remove all spaces (japanese doesn't care about them anyway)
475 |
476 | return (note_text, speaker_index)
477 |
478 | def PreviewVoice(self, sample=True):
479 | (speaker_index, speaker, style_info) = getSpeaker(self.speakers, self.speaker_combo, self.style_combo)
480 | if speaker_index is None:
481 | raise Exception('getSpeaker returned None in preview_voice')
482 |
483 | if sample:
484 | preview_sentences = ["こんにちは、これはテスト文章です。", "DVDの再生ボタンを押して、書斎に向かった。", "さてと 、 ご馳走様でした", "真似しないでくれる?", "な 、 なんだよ ? テンション高いな"]
485 | text = random.choice(preview_sentences)
486 | else:
487 | if not self.selected_notes:
488 | return
489 | # If we've gone past the last note, restart from 0
490 | if self.preview_note_index >= len(self.selected_notes):
491 | self.preview_note_index = 0
492 | note_id = self.selected_notes[self.preview_note_index]
493 | text, speaker_index = self.getNoteTextAndSpeaker(note_id)
494 | self.preview_note_index += 1
495 |
496 | tup = (text, speaker_index)
497 | result = GenerateAudioQuery(tup, mw.addonManager.getConfig(__name__))
498 | contents = SynthesizeAudio(result, speaker_index)
499 |
500 | addon_path = dirname(__file__)
501 | preview_path = join(addon_path, "VOICEVOX_preview.wav")
502 | with open(preview_path, "wb") as f:
503 | f.write(contents)
504 | av_player.play_file(preview_path)
505 |
506 | def PreviewVoiceSample(self):
507 | self.PreviewVoice(sample=True)
508 |
509 | def PreviewVoiceActual(self):
510 | self.PreviewVoice(sample=False)
511 |
def GenerateAudioQuery(text_and_speaker_index_tuple, config):
    """Build a VOICEVOX audio query for one (text, speaker_index) pair.

    Posts the text to the local /audio_query endpoint, applies the slider
    settings from *config* (stored as integer percentages, converted here to
    float scales), and returns the query as UTF-8 encoded JSON bytes.

    Raises with the server response and traceback on any failure.
    """
    # Unpack before the try block so the except handler can always reference
    # `text`; likewise pre-bind the response so a failed requests.post doesn't
    # turn into a NameError inside the handler.
    text = text_and_speaker_index_tuple[0]
    speaker_index = text_and_speaker_index_tuple[1]
    audio_query_response = None
    try:
        audio_query_response = requests.post("http://127.0.0.1:50021/audio_query?speaker=" + str(speaker_index) + "&text=" + urllib.parse.quote(text, safe=''))
        if audio_query_response.status_code != 200:
            raise Exception(f"Unable to generate audio for the following text: `{text}`. Response code was {audio_query_response.status_code}\nResponse:{audio_query_response.text}")

        j = json.loads(audio_query_response.text)
        # Slider values are stored as integer percentages; VOICEVOX expects floats
        if config.get('speed_slider_value'):
            j['speedScale'] = config.get('speed_slider_value') / 100
        if config.get('volume_slider_value'):
            j['volumeScale'] = config.get('volume_slider_value') / 100
        if config.get('pitch_slider_value'):
            j['pitchScale'] = config.get('pitch_slider_value') / 100
        if config.get('intonation_slider_value'):
            j['intonationScale'] = config.get('intonation_slider_value') / 100
        if config.get('initial_silence_slider_value'):
            j['prePhonemeLength'] = config.get('initial_silence_slider_value') / 100
        if config.get('final_silence_slider_value'):
            j['postPhonemeLength'] = config.get('final_silence_slider_value') / 100
        return json.dumps(j, ensure_ascii=False).encode('utf8')
    except Exception as e:
        raise Exception(f"Unable to generate audio for the following text: `{text}`.\nResponse: {audio_query_response.text if audio_query_response is not None else 'None'}\n{traceback.format_exc()}")
538 |
def SynthesizeAudio(audio_query_json, speaker_index):
    """POST an audio query to the local VOICEVOX /synthesis endpoint.

    Returns the raw WAV bytes, or None when the server reports an error.
    """
    url = "http://127.0.0.1:50021/synthesis?speaker=" + str(speaker_index)
    response = requests.post(url, data=audio_query_json)
    if response.status_code == 200:
        return response.content
    return None
544 |
def MultiSynthesizeAudio(audio_queries, speaker_index): # NOTE: This returns a zip
    """Synthesize a batch of audio queries in a single request.

    *audio_queries* is a list of JSON-encoded query bytes. Returns the raw
    bytes of a ZIP archive containing one WAV per query, or None when the
    server reports an error.

    Raises if any query in the batch is None (an upstream failure).
    """
    for q in audio_queries:
        if q is None:
            raise Exception("MultiSynthesizeAudio received an audio query that was None")
    # The endpoint expects a JSON array of audio-query objects
    combined = b"[" + b','.join(audio_queries) + b"]"

    synthesis_response = requests.post("http://127.0.0.1:50021/multi_synthesis?speaker=" + str(speaker_index), data=combined)
    if synthesis_response.status_code != 200:
        return None
    return synthesis_response.content
556 |
def DivideIntoChunks(array, n):
    """Yield successive slices of *array*, each at most *n* elements long."""
    start = 0
    while start < len(array):
        yield array[start:start + n]
        start += n
561 |
def onVoicevoxOptionSelected(browser):
    """Context-menu entry point: generate VOICEVOX audio for selected notes.

    Checks that the local VOICEVOX server is reachable, shows the options
    dialog, then for every selected note synthesizes audio (in chunks via the
    multi_synthesis endpoint), optionally converts it with ffmpeg, writes the
    file to Anki's media folder, and stores a [sound:...] tag in the chosen
    destination field.
    """
    voicevox_exists = False
    try:
        response = requests.get("http://127.0.0.1:50021/version", timeout=5)
        if response.status_code == 200:
            print(f"version: {response.content}")
            voicevox_exists = True
    except Exception:
        # Connection refused / timed out — either way the server isn't up
        print("Request timed out!")

    if not voicevox_exists:
        QMessageBox.critical(mw, "Error", f"VOICEVOX service is not running. Navigate to your VOICEVOX install and run 'run.exe'. You can download VOICEVOX from https://voicevox.hiroshiba.jp/ if you do not have it installed")
        return

    dialog = MyDialog(browser)
    if not dialog.exec():
        print("Canceled!")
        return

    (speaker_index, speaker, style_info) = getSpeaker(dialog.speakers, dialog.speaker_combo, dialog.style_combo)
    if speaker_index is None:
        raise Exception('getSpeaker returned None in my_action')

    source_field = dialog.source_combo.itemText(dialog.source_combo.currentIndex())
    destination_field = dialog.destination_combo.itemText(dialog.destination_combo.currentIndex())

    speaker_combo_text = dialog.speaker_combo.itemText(dialog.speaker_combo.currentIndex())
    style_combo_text = dialog.style_combo.itemText(dialog.style_combo.currentIndex())
    user_template = dialog.filename_template_edit.text()

    # Save previously used stuff so the dialog comes back pre-filled next time
    config = mw.addonManager.getConfig(__name__)
    config['last_source_field'] = source_field
    config['last_destination_field'] = destination_field
    config['last_speaker_name'] = speaker_combo_text
    config['last_style_name'] = style_combo_text
    config['append_audio'] = "true" if dialog.append_audio.isChecked() else "false"
    config['use_opus'] = "true" if dialog.use_opus.isChecked() else "false"
    config['filename_template'] = user_template

    mw.addonManager.writeConfig(__name__, config)

    progress_window = qt.QWidget(None)
    progress_window.setWindowTitle("Generating VOICEVOX Audio")
    progress_window.setFixedSize(400, 80)

    progress_text = qt.QLabel("Generating Audio...")

    progress_bar = qt.QProgressBar(progress_window)

    progress_layout = qt.QVBoxLayout()
    progress_layout.addWidget(progress_text)
    progress_layout.addWidget(progress_bar)

    progress_window.setLayout(progress_layout)

    progress_window.show()
    progress_window.setFocus()

    def updateProgress(notes_so_far, total_notes, bottom_text = ''):
        # Refresh the progress window; processEvents keeps the UI responsive
        # since all the work happens on the main thread.
        progress_text.setText(f"Generating Audio {notes_so_far}/{total_notes}\n{bottom_text}")
        progress_bar.setMaximum(total_notes)
        progress_bar.setValue(notes_so_far)
        mw.app.processEvents()

    def sanitize_filename(filename: str, replacement: str = "_") -> str:
        # Replace problematic characters with a replacement character
        sanitized = re.sub(r'[<>:"/\\|?*]', replacement, filename)
        # Strip leading and trailing whitespaces and dots (Windows hates these)
        sanitized = sanitized.strip().strip(".")
        # Limit filename length to something reasonable (255 is typical for most filesystems)
        return sanitized[:255]

    # We split the work into chunks so we can pass a bunch of audio queries to the synthesizer instead of doing them one at time, but we don't want to do all of them at once so chunks make the most sense
    CHUNK_SIZE = 4
    note_chunks = DivideIntoChunks(dialog.selected_notes, CHUNK_SIZE)
    notes_so_far = 0
    total_notes = len(dialog.selected_notes)
    updateProgress(notes_so_far, total_notes)

    # Pre-cache user template for performance
    filename_template = config.get("filename_template", "VOICEVOX_{{speaker}}_{{style}}_{{uid}}")

    for note_chunk in note_chunks:
        # Build the audio queries for this chunk, updating the progress label
        # as we go. (A previous version incremented a local copy of the
        # counter inside a closure, so the "Audio Query: x/y" count never
        # advanced.)
        audio_queries = []
        for query_count, note_tuple in enumerate(map(dialog.getNoteTextAndSpeaker, note_chunk)):
            updateProgress(notes_so_far, total_notes, f"Audio Query: {query_count}/{len(note_chunk)}")
            audio_queries.append(GenerateAudioQuery(note_tuple, config))

        media_dir = mw.col.media.dir()
        updateProgress(notes_so_far, total_notes, f"Synthesizing Audio {notes_so_far} to {min(notes_so_far+CHUNK_SIZE, total_notes)}")
        zip_bytes = MultiSynthesizeAudio(audio_queries, speaker_index)
        # MultiSynthesizeAudio returns None on a non-200 response; raise a
        # clear error instead of letting zipfile crash on None below
        if zip_bytes is None:
            raise Exception("VOICEVOX multi_synthesis request failed (non-200 response)")

        # MultiSynthesis returns zip bytes with ZIP_STORED
        with zipfile.ZipFile(io.BytesIO(zip_bytes), "r", zipfile.ZIP_STORED) as wavs_zip:
            for zip_counter, name in enumerate(wavs_zip.namelist()):
                updateProgress(notes_so_far, total_notes, f"Converting Audio: {zip_counter}/{len(note_chunk)}")
                audio_data = wavs_zip.read(name)
                chunk_note_index = int(name.replace('.wav', '')) - 1 # Starts at 001.wav, this converts to 0 index
                note_id = note_chunk[chunk_note_index]

                audio_extension = "wav"

                # Try to convert the WAV to a smaller format; fall back to the
                # raw WAV if ffmpeg conversion is unavailable/fails
                new_audio_format = "opus" if config['use_opus'] == "true" else "mp3"
                new_audio_data = ffmpeg.ConvertWav(audio_data, new_audio_format)
                if new_audio_data is not None:
                    audio_data = new_audio_data
                    audio_extension = new_audio_format

                # Build placeholders for the filename template
                note_obj = mw.col.get_note(note_id)
                fields_map = {f: note_obj[f] for f in note_obj.keys()}
                cards_of_note = note_obj.cards()
                if cards_of_note:
                    deck_id = cards_of_note[0].did
                    deck_name = mw.col.decks.name(deck_id)
                else:
                    deck_name = "UnknownDeck"

                placeholders = {
                    "uid": str(uuid.uuid4()),
                    "speaker": speaker_combo_text,
                    "style": style_combo_text,
                    "deck": deck_name.split("::")[-1],
                    "deck-full": deck_name,
                    "date": datetime.datetime.now().date().isoformat(),
                    "fields": fields_map
                }

                raw_filename = parse_filename_template(filename_template, placeholders)
                raw_filename = sanitize_filename(raw_filename)
                # We'll add the final extension here
                filename = f"{raw_filename}.{audio_extension}"

                audio_full_path = join(media_dir, filename)

                with open(audio_full_path, "wb") as f:
                    f.write(audio_data)

                # Reference the file we just wrote in the destination field
                audio_field_text = f"[sound:{filename}]"
                note = mw.col.get_note(note_id)
                if config['append_audio'] == "true":
                    note[destination_field] += audio_field_text
                else:
                    note[destination_field] = audio_field_text
                mw.col.update_note(note)
                mw.app.processEvents()
                notes_so_far += 1
    mw.progress.finish()
    mw.reset() # reset mw so our changes are applied
716 |
--------------------------------------------------------------------------------