├── functions
├── __init__.py
├── functions.py
└── transcribe.py
├── ragnar.bat
├── images
└── ragge3.png
├── .gitignore
├── requirements.txt
├── config.py
├── .streamlit
└── config.toml
├── docker-compose.yml
├── Dockerfile_Ploomber
├── Dockerfile
├── README.md
└── app.py
/functions/__init__.py:
--------------------------------------------------------------------------------
1 |
--------------------------------------------------------------------------------
/ragnar.bat:
--------------------------------------------------------------------------------
1 | cmd.exe /k "cd /d "c:\ragnar" & start python -m streamlit run app.py"
--------------------------------------------------------------------------------
/images/ragge3.png:
--------------------------------------------------------------------------------
https://raw.githubusercontent.com/mickekring/TOOL--Ragnar/HEAD/images/ragge3.png
--------------------------------------------------------------------------------
/.gitignore:
--------------------------------------------------------------------------------
1 |
2 | __pycache__
3 | audio
4 | text
5 | cache
6 | .DS_Store
7 | .streamlit/secrets.toml
8 | venv
9 |
10 |
--------------------------------------------------------------------------------
/requirements.txt:
--------------------------------------------------------------------------------
1 | datasets==4.1.1
2 | ffmpeg-python==0.2.0
3 | nltk==3.9.1
4 | openai-whisper==20250625
5 | pydub==0.25.1
6 | python-docx==1.2.0
7 | streamlit==1.49.1
8 | torch==2.8.0
9 | torchvision==0.23.0
10 | transformers==4.56.1
11 |
--------------------------------------------------------------------------------
/config.py:
--------------------------------------------------------------------------------
1 |
2 | ### Ragnar
# Version string shown in the app's sidebar.
app_version = "0.7.5"
### Author: Micke Kring
### Contact: mikael.kring@ri.se

### Fun fact: The app is named after Ragnar Sohlman, the assistant
### of Alfred Nobel.
### https://sv.wikipedia.org/wiki/Ragnar_Sohlman

# Where app.py reads the password settings from: "streamlit" uses st.secrets,
# any other value (e.g. "docker") uses environment variables.
run_mode = "docker"
--------------------------------------------------------------------------------
/.streamlit/config.toml:
--------------------------------------------------------------------------------
1 | [theme]
2 | base="dark"
3 | primaryColor="#5bbf83"
4 | backgroundColor="#333437"
5 | secondaryBackgroundColor="#202123"
6 |
7 |
8 | [server]
9 | maxUploadSize = 10000
10 |
11 |
12 | [browser]
13 | gatherUsageStats = false
14 |
15 |
16 | [client]
17 | toolbarMode = "auto"
18 |
19 |
--------------------------------------------------------------------------------
/docker-compose.yml:
--------------------------------------------------------------------------------
1 | services:
2 | ragnar:
3 | build: .
4 | container_name: ragnar
5 | restart: unless-stopped
6 | env_file:
7 | - .env
8 | networks:
9 | - web
10 | labels:
11 | - "traefik.enable=true"
12 | - "traefik.http.routers.ragnar.rule=Host(`ragnar.labbytan.se`)"
13 | - "traefik.http.routers.ragnar.entrypoints=websecure"
14 | - "traefik.http.routers.ragnar.tls.certresolver=myresolver"
15 | - "traefik.http.services.ragnar.loadbalancer.server.port=8501"
16 |
17 | networks:
18 | web:
19 | external: true
--------------------------------------------------------------------------------
/Dockerfile_Ploomber:
--------------------------------------------------------------------------------
FROM continuumio/miniconda3

WORKDIR /srv

COPY requirements.txt /srv/

RUN apt-get update && apt-get install ffmpeg pandoc -y

# Install the CPU-only PyTorch build first. The versions must match the pins
# in requirements.txt; otherwise the next step replaces this build with the
# default (much larger) build from PyPI.
RUN pip install --no-cache-dir torch==2.8.0 torchvision==0.23.0 --index-url https://download.pytorch.org/whl/cpu
RUN pip install -r requirements.txt --no-cache-dir

COPY . /srv

ENTRYPOINT ["streamlit", "run", "app.py", \
            "--server.port=80", \
            "--server.headless=true", \
            "--server.address=0.0.0.0", \
            "--browser.gatherUsageStats=false", \
            "--server.enableStaticServing=true", \
            "--server.fileWatcherType=none", \
            "--client.toolbarMode=viewer"]
--------------------------------------------------------------------------------
/Dockerfile:
--------------------------------------------------------------------------------
FROM python:3.12-slim

WORKDIR /app

# Install system dependencies including ffmpeg and pandoc
RUN apt-get update && \
    apt-get install -y curl ffmpeg pandoc && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

COPY requirements.txt .

# Install the CPU-only PyTorch build first. The versions must match the pins
# in requirements.txt; otherwise the next step replaces this build with the
# default (much larger) build from PyPI.
RUN pip install --no-cache-dir torch==2.8.0 torchvision==0.23.0 --index-url https://download.pytorch.org/whl/cpu

# Then install the rest of the requirements
RUN pip install --no-cache-dir -r requirements.txt

COPY . .

EXPOSE 8501

HEALTHCHECK --interval=5s --timeout=5s --start-period=60s \
    CMD curl --fail http://localhost:8501/_stcore/health || exit 1

ENTRYPOINT ["streamlit", "run", "app.py", \
            "--server.port=8501", \
            "--server.address=0.0.0.0", \
            "--browser.gatherUsageStats=false", \
            "--server.enableStaticServing=true", \
            "--client.toolbarMode=viewer", \
            "--server.headless=true", \
            "--server.fileWatcherType=none"]
--------------------------------------------------------------------------------
/functions/functions.py:
--------------------------------------------------------------------------------
1 |
2 | ### Functions
3 |
4 | import streamlit as st
5 | from pydub import AudioSegment
6 |
7 |
8 | # Converts and compresses audio or video file to mp3 and a more manageble size
9 |
def convert_to_mono_and_compress(uploaded_file, file_name, target_size_MB=22):
    """Convert an audio or video file to a mono MP3 of roughly ``target_size_MB``.

    Args:
        uploaded_file: Path or file-like object readable by
            ``AudioSegment.from_file``.
        file_name: Base name of the output; the result is written to
            ``audio/<file_name>.mp3``.
        target_size_MB: Desired upper bound for the output size in megabytes.

    Returns:
        The path of the written MP3 file, or ``None`` if the export failed.
    """
    # Bitrate range (bits per second) covered by the MP3 format. Requests
    # outside it are clamped instead of being handed to the encoder as-is.
    min_bitrate = 8_000
    max_bitrate = 320_000

    # Load the audio and down-mix to a single channel.
    audio = AudioSegment.from_file(uploaded_file).set_channels(1)

    # Bitrate needed to hit the target size; pydub reports length in ms.
    duration_seconds = len(audio) / 1000.0
    if duration_seconds > 0:
        target_bitrate = int((target_size_MB * 1024 * 1024 * 8) / duration_seconds)
    else:
        # Zero-length input: avoid dividing by zero, any bitrate fits.
        target_bitrate = max_bitrate
    target_bitrate = max(min_bitrate, min(target_bitrate, max_bitrate))

    output_path = "audio/" + file_name + ".mp3"

    # Compress the audio file
    try:
        audio.export(output_path, format="mp3", bitrate=f"{target_bitrate}")
    except Exception as e:
        print(f"Error during audio export: {e}")
        return None

    return output_path
34 |
--------------------------------------------------------------------------------
/functions/transcribe.py:
--------------------------------------------------------------------------------
1 |
2 | ### Transcribe
3 |
4 | import streamlit as st
5 | import torch
6 | from datasets import load_dataset
7 | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
8 | import whisper
9 |
10 | # Functions that transcribes audio and creates the text files
11 |
def transcribe_with_kb_whisper(file_name_converted, file_name, whisper_model, spoken_language):
    """Transcribe an audio file with a KBLab Whisper model and save the text.

    Args:
        file_name_converted: Path of the audio file to transcribe.
        file_name: Base name of the output; the transcription is written to
            ``text/<file_name>.txt``.
        whisper_model: Model name appended to ``KBLab/`` to form the model id.
        spoken_language: Language code passed to generation, or ``None``.

    Returns:
        The transcribed text.
    """
    use_cuda = torch.cuda.is_available()
    device = "cuda:0" if use_cuda else "cpu"
    torch_dtype = torch.float16 if use_cuda else torch.float32
    model_id = f"KBLab/{whisper_model}"

    model = AutoModelForSpeechSeq2Seq.from_pretrained(
        model_id, torch_dtype=torch_dtype, use_safetensors=True, cache_dir="cache"
    )
    model.to(device)
    # Use the same cache directory as the model so all downloaded files end
    # up in one place.
    processor = AutoProcessor.from_pretrained(model_id, cache_dir="cache")

    pipe = pipeline(
        "automatic-speech-recognition",
        model=model,
        tokenizer=processor.tokenizer,
        feature_extractor=processor.feature_extractor,
        torch_dtype=torch_dtype,
        device=device,
    )

    generate_kwargs = {"task": "transcribe", "language": spoken_language}

    res = pipe(
        file_name_converted,
        chunk_length_s=30,
        generate_kwargs=generate_kwargs,
    )

    transcribed_content = res["text"]

    # errors='replace' keeps characters that cannot be encoded from aborting
    # the write.
    with open('text/' + file_name + '.txt', 'w', encoding='utf-8', errors='replace') as file:
        file.write(transcribed_content)

    return transcribed_content
46 |
47 |
def transcribe_with_whisper(file_name_converted, file_name, whisper_model, spoken_language):
    """Transcribe an audio file with an OpenAI Whisper model and save the text.

    The model named ``whisper_model`` is loaded, ``file_name_converted`` is
    transcribed using ``spoken_language``, and the resulting text is written
    to ``text/<file_name>.txt`` before being returned.
    """
    whisper_net = whisper.load_model(whisper_model)
    outcome = whisper_net.transcribe(file_name_converted, language=spoken_language)
    transcribed_content = outcome["text"]

    text_path = 'text/' + file_name + '.txt'
    with open(text_path, 'w', encoding='utf-8', errors='replace') as text_file:
        text_file.write(transcribed_content)

    return transcribed_content
61 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Ragnar - the slow but secure way to transcribe files
2 | Transcribe your audio and video files locally, totally secure.
3 |
4 | 
5 |
6 | ## What is this?
7 | Ragnar is a simple app built with Python and [Streamlit](https://streamlit.io/) that transcribes your audio and video files locally on your computer, or your own server. Totally secure and without any need to call out to any services but your own computer. It uses [Whisper](https://github.com/openai/whisper) and [KB Whisper (from Kungliga Biblioteket)](https://huggingface.co/collections/KBLab/kb-whisper-67af9eafb24da903b63cc4aa).
8 |
The transcriptions can then be saved as txt, docx, json and srt (subtitles).
9 |
10 | ## How the app works - flow
11 | 1. When you run the app, a web page is opened in your default web browser.
12 | 2. You upload an audio- or video file directly from the app.
13 | 3. When you've uploaded audio, the audio file will be converted into an mp3 file and compressed in size.
14 | 4. The mp3 file will be transcribed using Whisper or KB Whisper locally on your computer based on your settings (language and model).
15 | 5. The transcribed text is presented to you with the possibility to download.
16 |
17 | ## Installation
18 | This is an early beta, but it works. Expect updates as I develop this app. If you have any suggestions, feel free to ask.
19 | PS. I'm not a programmer. It's prototype code. ;)
20 |
21 | * Tested on Mac OSX and Windows 10 with Python 3.12
22 | * Download the files and 'pip install -r requirements.txt'
23 | * Install FFMPEG on your system
24 | * Run with 'streamlit run app.py' alternatively 'python -m streamlit run app.py'
25 | * The first time you run it, it will take some time since the Whisper model is downloaded to your computer.
26 | * If you're on Windows, I included a 'ragnar.bat' file which starts the application if you place all code in 'C:\ragnar'. You can edit this if you place Ragnar in a different folder.
27 |
28 | ## Updates
29 | * v0.7.4
30 | * A bug fix for Windows users where they sometimes get a "UnicodeEncodeError: 'charmap' codec can't...".
31 | * v0.7.2
32 | * Just added some files for deployment to Docker
33 | * v0.7.0
34 | * Partially rewritten. Make sure to update your pip packages from requirements.txt if you've already installed Ragnar
35 | * Added KB (Kungliga Bibliotekets fine tuned Whisper) Whisper and reverted back to vanilla Whisper from OpenAI. Still all local
36 | * Removed translation which a language model does a lot better
37 | * v0.6.2
38 | * Updated requirements.txt and tested with latest versions of eg Streamlit
39 | * Tidying up and moving functions to separate folder
40 | * Replaced the audio recorder with Streamlit's new audio recorder
41 | * v0.6.1
42 | * Windows users getting weird characters instead of å ä ö. Fixed it with utf-8 in transcribe.py
43 | * v0.6.0
44 | * I've split the code into several python files.
45 | * I've fixed the 'record audio' section and it works.
46 | * Added a dropdown menu for language selection of the source audio. By default it's set to automatic detection, which works most of the time. But if you have people speaking with heavy accents or such you can set the language here.
47 | * v0.5.0
48 | * Init upload - early beta
49 |
50 | ## Known bugs
51 | * (v0.5.0) ~~The "record audio" section is a bit wonky~~
52 |
53 | ## License
54 | Some of you have asked why I haven't added a license to Ragnar. The truth is that I have no knowledge about licensing and open source. I've got some suggestions like MIT, and I'm looking into it.
55 | My point is that Ragnar is free to use, modify, distribute and do what you want with for as long as you want to. It's just code that I wrote to solve an issue my colleagues and I had. If it helps more people, great.
56 |
57 | ## Support
58 | Unfortunately, it's not possible for me to assist you guys with support. I just don't have the time. Report bugs and problems and hopefully we can try to solve them together.
59 |
60 |
61 |
--------------------------------------------------------------------------------
/app.py:
--------------------------------------------------------------------------------
1 |
2 | # Python imports
3 | import os
4 | import streamlit as st
5 | import hashlib
6 | from os import environ
7 | import hmac
8 |
9 | # External imports
10 | from docx import Document
11 | from pydub import AudioSegment
12 |
13 | # Local imports
14 | from functions.functions import convert_to_mono_and_compress
15 | from functions.transcribe import transcribe_with_kb_whisper, transcribe_with_whisper
16 | import config as c
17 |
18 |
19 | ### INITIAL VARIABLES
20 |
21 | # Creates folder if they don't exist
# Make sure the working folders exist:
#   audio - audio/video files stored for transcription
#   text  - transcribed documents
for _folder in ("audio", "text"):
    os.makedirs(_folder, exist_ok=True)
24 |
25 |
26 | ### PASSWORD ########################
27 |
# Read the password switch from Streamlit secrets or from the environment,
# depending on the configured run mode.
if c.run_mode == "streamlit":
    st.session_state["pwd_on"] = st.secrets.pwd_on
else:
    st.session_state["pwd_on"] = environ.get("pwd_on")


if st.session_state["pwd_on"] == "true":

    def check_password():
        """Render the password prompt and report whether access is granted.

        Returns:
            True once the correct password has been entered in this session,
            otherwise False (after rendering the input field and, following a
            failed attempt, an error message).
        """
        if c.run_mode == "streamlit":
            passwd = st.secrets["password"]
        else:
            passwd = environ.get("password")

        def password_entered():
            """Compare the entered password with the configured one."""
            entered = st.session_state["password"]

            # hmac.compare_digest only accepts ASCII-only str values, so both
            # sides are compared as UTF-8 bytes; otherwise a password with
            # e.g. å, ä or ö raises TypeError. A missing configured password
            # never matches.
            is_match = passwd is not None and hmac.compare_digest(
                entered.encode("utf-8"), str(passwd).encode("utf-8")
            )

            if is_match:
                st.session_state["password_correct"] = True
                del st.session_state["password"]  # Don't store the password.
            else:
                st.session_state["password_correct"] = False

        if st.session_state.get("password_correct", False):
            return True

        st.text_input("Lösenord", type="password", on_change=password_entered, key="password")
        if "password_correct" in st.session_state:
            st.error("😕 Ooops. Fel lösenord.")
        return False

    # Halt the script run until the correct password has been entered.
    if not check_password():
        st.stop()
62 |
63 | ############
64 |
65 |
66 | # Check and set default values if not set in session_state
67 | # of Streamlit
# Default values for the Streamlit session state; existing values are kept.
_SESSION_DEFAULTS = {
    "translation": False,                    # If audio has been translated
    "cpu_vs_gpu": False,                     # If user device has GPU support
    "spoken_language": "Svenska",            # What language source audio is in
    "transcribe_model": "KB Whisper Small",  # What model of Whisper to use
    "file_name_converted": None,             # Audio file name
}

for _state_key, _state_default in _SESSION_DEFAULTS.items():
    if _state_key not in st.session_state:
        st.session_state[_state_key] = _state_default
78 |
79 |
80 | # Checking if uploaded or recorded audio file has been transcribed
def compute_file_hash(uploaded_file):
    """Return the MD5 hex digest of a seekable binary file-like object.

    The digest is used only to detect whether the uploaded or recorded audio
    has changed, not for any security purpose. The whole content is hashed
    regardless of the current read position, and the file pointer is left at
    the beginning afterwards.
    """
    hasher = hashlib.md5()

    # Rewind first so a stream that was already (partially) read still
    # produces the hash of its full content.
    uploaded_file.seek(0)
    while chunk := uploaded_file.read(65536):
        hasher.update(chunk)
    uploaded_file.seek(0)  # Reset the file pointer to the beginning

    return hasher.hexdigest()
91 |
92 |
93 | ### MAIN APP ###########################
94 |
95 | # Page configuration
# Browser tab title, no icon, centered layout, default sidebar state.
st.set_page_config(page_title="Ragnar", page_icon=None, layout="centered", initial_sidebar_state="auto")
102 |
103 |
# Display name -> model identifier. KB names are passed to
# transcribe_with_kb_whisper, the others to transcribe_with_whisper.
model_map_transcribe_model = {
    "KB Whisper Large": "kb-whisper-large",
    "KB Whisper Medium": "kb-whisper-medium",
    "KB Whisper Small": "kb-whisper-small",
    "KB Whisper Base": "kb-whisper-base",
    "KB Whisper Tiny": "kb-whisper-tiny",
    "OpenAI Whisper Turbo": "turbo",
    "OpenAI Whisper Large": "large",
    "OpenAI Whisper Medium": "medium",
    "OpenAI Whisper Small": "small",
    "OpenAI Whisper Base": "base",
    "OpenAI Whisper Tiny": "tiny",
}

# Display name -> language code handed to the transcription functions
# (None lets the model detect the language).
model_map_spoken_language = {
    "Automatiskt": None,
    "Svenska": "sv",
    "Engelska": "en",
    "Franska": "fr",
    "Tyska": "de",
    "Spanska": "es",  # was "sp", which is not a valid language code
}

_TRANSCRIBE_SPINNER_TEXT = 'Transkriberar. Det här kan ta ett litet tag beroende på hur lång inspelningen är...'
_DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'


def _reset_if_changed(file_obj, hash_key, result_key):
    """Forget the stored transcription when ``file_obj`` has new content."""
    current_file_hash = compute_file_hash(file_obj)

    if hash_key not in st.session_state or st.session_state[hash_key] != current_file_hash:
        st.session_state[hash_key] = current_file_hash

        if result_key in st.session_state:
            del st.session_state[result_key]


def _run_transcription(audio_path, output_name):
    """Transcribe ``audio_path`` with the model and language chosen in the sidebar."""
    model_label = st.session_state["transcribe_model"]
    if "KB" in model_label:
        transcribe = transcribe_with_kb_whisper
    else:
        transcribe = transcribe_with_whisper

    with st.spinner(_TRANSCRIBE_SPINNER_TEXT, show_time=True):
        transcription = transcribe(
            audio_path,
            output_name,
            model_map_transcribe_model[model_label],
            model_map_spoken_language[st.session_state["spoken_language"]],
        )

    st.success('Transkribering klar.')
    st.balloons()

    return transcription


def _show_transcription(output_name, transcription, audio_path, key_prefix):
    """Render download buttons, audio playback and the transcribed text."""
    import io

    # Build the Word document in memory and serve the downloads from this
    # session's own transcription, so a file with the same name written by
    # another session can never be handed out. A copy is still saved to the
    # text folder as before.
    clean_text = transcription.encode('utf-8', errors='replace').decode('utf-8')
    document = Document()
    document.add_paragraph(clean_text)

    docx_buffer = io.BytesIO()
    document.save(docx_buffer)
    docx_bytes = docx_buffer.getvalue()

    with open('text/' + output_name + '.docx', "wb") as docx_file:
        docx_file.write(docx_bytes)

    # Grid of four columns; the first two hold the download buttons.
    col1, col2, _col3, _col4 = st.columns(4)

    # Text
    with col1:
        st.download_button(
            label=":flag-se: Ladda ned text",
            data=clean_text.encode('utf-8'),
            file_name=output_name + '.txt',
            mime='text/plain',
            key=key_prefix + "_download_txt",
        )

    # Word
    with col2:
        st.download_button(
            label=":flag-se: Ladda ned word",
            data=docx_bytes,
            file_name=output_name + '.docx',
            mime=_DOCX_MIME,
            key=key_prefix + "_download_docx",
        )

    st.markdown("### Transkribering")

    if audio_path is not None:
        # The converted files are MP3, hence audio/mpeg.
        st.audio(audio_path, format='audio/mpeg')

    st.write(transcription)


def main():
    """Render the Ragnar app: sidebar settings plus the upload and record tabs.

    Each tab converts its audio to MP3, transcribes it with the selected
    model and language, and offers the result as text and Word downloads.
    The upload and the recording keep separate session-state entries so that
    having both present does not make them invalidate each other on every
    rerun.
    """

    ### SIDEBAR

    # Sidebar image of Ragnar
    st.sidebar.image("images/ragge3.png", width=220)

    ###### SIDEBAR SETTINGS

    st.sidebar.header("Inställningar")
    st.sidebar.markdown("")

    # Dropdown menu - choose Whisper model
    model_options = list(model_map_transcribe_model)
    transcribe_model = st.sidebar.selectbox(
        "Välj modell för transkribering",
        model_options,
        index=model_options.index(st.session_state["transcribe_model"]),
    )

    # Dropdown menu - choose source language of audio
    language_options = list(model_map_spoken_language)
    spoken_language = st.sidebar.selectbox(
        "Välj språk som talas",
        language_options,
        index=language_options.index(st.session_state["spoken_language"]),
    )

    # Update the session_state directly
    st.session_state["transcribe_model"] = transcribe_model
    st.session_state["spoken_language"] = spoken_language

    print(model_map_transcribe_model[transcribe_model])
    print(model_map_spoken_language[spoken_language])

    st.sidebar.markdown("#")
    st.sidebar.markdown(f"\nVersion: {c.app_version}\n")

    ### MAIN PAGE

    # Title
    st.markdown("# Ragnar\n### Din GDPR- och sekretessäkrade transkriberare\n")
    st.markdown(f"**Vald AI-modell:** {transcribe_model}\n**Valt språk:** {spoken_language}")

    # CREATE TWO TABS FOR FILE UPLOAD VS RECORDED AUDIO

    tab1, tab2 = st.tabs(["Ladda upp", "Spela in"])

    # FILE UPLOADER

    with tab1:

        uploaded_file = st.file_uploader(
            "Ladda upp din ljud- eller videofil här",
            type=["mp3", "wav", "flac", "mp4", "m4a", "aifc"],
            help="Max 2GB stora filer", label_visibility="collapsed",
        )

        if uploaded_file:

            # Drop the stored transcription if a different file was uploaded
            _reset_if_changed(uploaded_file, "file_hash", "transcribed")

            # If audio has not been transcribed
            if "transcribed" not in st.session_state:

                # Sends audio to be converted to mp3 and compressed
                with st.spinner('Din ljudfil är lite stor. Jag ska bara komprimera den lite först...', show_time=True):
                    converted_path = convert_to_mono_and_compress(uploaded_file, uploaded_file.name)

                # The converter returns None when the export failed; stop
                # here instead of passing None on to the transcriber.
                if converted_path is None:
                    st.error("Kunde inte konvertera ljudfilen.")
                    st.stop()

                st.session_state.file_name_converted = converted_path
                st.success('Inspelning komprimerad och klar. Startar transkribering.')

                # Transcribes audio with Whisper
                st.session_state.transcribed = _run_transcription(converted_path, uploaded_file.name)

            _show_transcription(
                uploaded_file.name,
                st.session_state.transcribed,
                st.session_state.file_name_converted,
                "upload",
            )

    # AUDIO RECORDER

    with tab2:

        audio = st.audio_input("Spela in")

        if audio:

            # NOTE: these fixed names are shared by every session running on
            # the same server.
            recording_path = "audio/converted.mp3"
            local_recording_name = "local_recording.mp3"

            # Drop the stored transcription if a new recording was made
            _reset_if_changed(audio, "recording_file_hash", "recording_transcribed")

            if "recording_transcribed" not in st.session_state:

                audio_file = AudioSegment.from_file(audio)
                audio_file.export(recording_path, format="mp3", bitrate="16k")

                st.session_state.recording_transcribed = _run_transcription(recording_path, local_recording_name)

            # Play back the recording itself, not the file from the upload tab
            _show_transcription(
                local_recording_name,
                st.session_state.recording_transcribed,
                recording_path,
                "recording",
            )
385 |
# Run the app when this file is executed as a script.
if __name__ == "__main__":
    main()
--------------------------------------------------------------------------------