├── functions ├── __init__.py ├── functions.py └── transcribe.py ├── ragnar.bat ├── images └── ragge3.png ├── .gitignore ├── requirements.txt ├── config.py ├── .streamlit └── config.toml ├── docker-compose.yml ├── Dockerfile_Ploomber ├── Dockerfile ├── README.md └── app.py /functions/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /ragnar.bat: -------------------------------------------------------------------------------- 1 | cmd.exe /k "cd /d "c:\ragnar" & start python -m streamlit run app.py" -------------------------------------------------------------------------------- /images/ragge3.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/mickekring/TOOL--Ragnar/HEAD/images/ragge3.png -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | 2 | __pycache__ 3 | audio 4 | text 5 | cache 6 | .DS_Store 7 | .streamlit/secrets.toml 8 | venv 9 | 10 | -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | datasets==4.1.1 2 | ffmpeg-python==0.2.0 3 | nltk==3.9.1 4 | openai-whisper==20250625 5 | pydub==0.25.1 6 | python-docx==1.2.0 7 | streamlit==1.49.1 8 | torch==2.8.0 9 | torchvision==0.23.0 10 | transformers==4.56.1 11 | -------------------------------------------------------------------------------- /config.py: -------------------------------------------------------------------------------- 1 | 2 | ### Ragnar 3 | app_version = "0.7.5" 4 | ### Author: Micke Kring 5 | ### Contact: mikael.kring@ri.se 6 | 7 | ### Fun fact: The app is named after Ragnar Sohlman, the assistant 8 | ### of Alfred Nobel. 9 | ### https://sv.wikipedia.org/wiki/Ragnar_Sohlman 10 | 11 | run_mode = "docker" -------------------------------------------------------------------------------- /.streamlit/config.toml: -------------------------------------------------------------------------------- 1 | [theme] 2 | base="dark" 3 | primaryColor="#5bbf83" 4 | backgroundColor="#333437" 5 | secondaryBackgroundColor="#202123" 6 | 7 | 8 | [server] 9 | maxUploadSize = 10000 10 | 11 | 12 | [browser] 13 | gatherUsageStats = false 14 | 15 | 16 | [client] 17 | toolbarMode = "auto" 18 | 19 | -------------------------------------------------------------------------------- /docker-compose.yml: -------------------------------------------------------------------------------- 1 | services: 2 | ragnar: 3 | build: . 4 | container_name: ragnar 5 | restart: unless-stopped 6 | env_file: 7 | - .env 8 | networks: 9 | - web 10 | labels: 11 | - "traefik.enable=true" 12 | - "traefik.http.routers.ragnar.rule=Host(`ragnar.labbytan.se`)" 13 | - "traefik.http.routers.ragnar.entrypoints=websecure" 14 | - "traefik.http.routers.ragnar.tls.certresolver=myresolver" 15 | - "traefik.http.services.ragnar.loadbalancer.server.port=8501" 16 | 17 | networks: 18 | web: 19 | external: true -------------------------------------------------------------------------------- /Dockerfile_Ploomber: -------------------------------------------------------------------------------- 1 | FROM continuumio/miniconda3 2 | 3 | WORKDIR /srv 4 | 5 | COPY requirements.txt /srv/ 6 | 7 | RUN apt-get update && apt-get install ffmpeg pandoc -y 8 | 9 | # Use an available version of torchvision 10 | RUN pip install torch==2.3.0+cpu torchvision==0.18.0+cpu --index-url https://download.pytorch.org/whl/cpu 11 | RUN pip install -r requirements.txt --no-cache-dir 12 | 13 | COPY . /srv 14 | 15 | ENTRYPOINT ["streamlit", "run", "app.py", \ 16 | "--server.port=80", \ 17 | "--server.headless=true", \ 18 | "--server.address=0.0.0.0", \ 19 | "--browser.gatherUsageStats=false", \ 20 | "--server.enableStaticServing=true", \ 21 | "--server.fileWatcherType=none", \ 22 | "--client.toolbarMode=viewer"] -------------------------------------------------------------------------------- /Dockerfile: -------------------------------------------------------------------------------- 1 | FROM python:3.12-slim 2 | 3 | WORKDIR /app 4 | 5 | # Install system dependencies including ffmpeg and pandoc 6 | RUN apt-get update && \ 7 | apt-get install -y curl ffmpeg pandoc && \ 8 | apt-get clean && \ 9 | rm -rf /var/lib/apt/lists/* 10 | 11 | COPY requirements.txt . 12 | 13 | # Install PyTorch CPU version first 14 | RUN pip install torch==2.3.0+cpu torchvision==0.18.0+cpu --index-url https://download.pytorch.org/whl/cpu 15 | 16 | # Then install the rest of the requirements 17 | RUN pip install --no-cache-dir -r requirements.txt 18 | 19 | COPY . . 20 | 21 | EXPOSE 8501 22 | 23 | HEALTHCHECK --interval=5s --timeout=5s --start-period=60s \ 24 | CMD curl --fail http://localhost:8501/_stcore/health || exit 1 25 | 26 | ENTRYPOINT ["streamlit", "run", "app.py", \ 27 | "--server.port=8501", \ 28 | "--server.address=0.0.0.0", \ 29 | "--browser.gatherUsageStats=false", \ 30 | "--server.enableStaticServing=true", \ 31 | "--client.toolbarMode=viewer", \ 32 | "--server.headless=true", \ 33 | "--server.fileWatcherType=none"] -------------------------------------------------------------------------------- /functions/functions.py: -------------------------------------------------------------------------------- 1 | 2 | ### Functions 3 | 4 | import streamlit as st 5 | from pydub import AudioSegment 6 | 7 | 8 | # Converts and compresses audio or video file to mp3 and a more manageble size 9 | 10 | def convert_to_mono_and_compress(uploaded_file, file_name, target_size_MB=22): 11 | 12 | global file_name_converted 13 | 14 | # Load the audio file 15 | audio = AudioSegment.from_file(uploaded_file) 16 | 17 | # Convert to mono 18 | audio = audio.set_channels(1) 19 | 20 | # Calculate target bitrate to achieve the desired file size (in bits per second) 21 | duration_seconds = len(audio) / 1000.0 # pydub works in milliseconds 22 | target_bitrate = int((target_size_MB * 1024 * 1024 * 8) / duration_seconds) 23 | 24 | # Compress the audio file 25 | try: 26 | audio.export("audio/" + file_name + ".mp3", format="mp3", bitrate=f"{target_bitrate}") 27 | file_name_converted = "audio/" + file_name + ".mp3" 28 | 29 | except Exception as e: 30 | print(f"Error during audio export: {e}") 31 | return None 32 | 33 | return file_name_converted 34 | -------------------------------------------------------------------------------- /functions/transcribe.py: -------------------------------------------------------------------------------- 1 | 2 | ### Transcribe 3 | 4 | import streamlit as st 5 | import torch 6 | from datasets import load_dataset 7 | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline 8 | import whisper 9 | 10 | # Functions that transcribes audio and creates the text files 11 | 12 | def transcribe_with_kb_whisper(file_name_converted, file_name, whisper_model, spoken_language): 13 | 14 | device = "cuda:0" if torch.cuda.is_available() else "cpu" 15 | torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32 16 | model_id = f"KBLab/{whisper_model}" 17 | 18 | model = AutoModelForSpeechSeq2Seq.from_pretrained( 19 | model_id, torch_dtype=torch_dtype, use_safetensors=True, cache_dir="cache" 20 | ) 21 | model.to(device) 22 | processor = AutoProcessor.from_pretrained(model_id) 23 | 24 | pipe = pipeline( 25 | "automatic-speech-recognition", 26 | model=model, 27 | tokenizer=processor.tokenizer, 28 | feature_extractor=processor.feature_extractor, 29 | torch_dtype=torch_dtype, 30 | device=device, 31 | ) 32 | 33 | generate_kwargs = {"task": "transcribe", "language": spoken_language} 34 | 35 | res = pipe(file_name_converted, 36 | chunk_length_s=30, 37 | generate_kwargs={"task": "transcribe", "language": spoken_language}) 38 | 39 | transcribed_content = res["text"] 40 | 41 | with open('text/' + file_name + '.txt', 'w', encoding='utf-8', errors='replace') as file: 42 | # Write the string to the file 43 | file.write(transcribed_content) 44 | 45 | return transcribed_content 46 | 47 | 48 | def transcribe_with_whisper(file_name_converted, file_name, whisper_model, spoken_language): 49 | 50 | transcribed_content = "" 51 | 52 | model = whisper.load_model(whisper_model) 53 | result = model.transcribe(file_name_converted, language=spoken_language) 54 | transcribed_content = result["text"] 55 | 56 | with open('text/' + file_name + '.txt', 'w', encoding='utf-8', errors='replace') as file: 57 | # Write the string to the file 58 | file.write(transcribed_content) 59 | 60 | return transcribed_content 61 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Ragnar - the slow but secure way to transcribe files 2 | Transcribe your audio and video files locally, totally secure. 3 | 4 | ![ragge_header](https://github.com/user-attachments/assets/fbb54afb-ec4a-462f-b24f-c3ee056e3ea8) 5 | 6 | ## What is this? 7 | Ragnar is a simple app built with Python and [Streamlit](https://streamlit.io/) that transcribes your audio and video files locally on your computer, or your own server. Totally secure and without any need to call out to any services but your own computer. It uses [Whisper](https://github.com/openai/whisper) and [KB Whisper (from Kungliga Biblioteket)](https://huggingface.co/collections/KBLab/kb-whisper-67af9eafb24da903b63cc4aa). 8 |
The transcriptions can then be saved as txt, docx, json and srt (subtitles). 9 | 10 | ## How the app works - flow 11 | 1. When you run the app, a web page is opened in your default web browser. 12 | 2. You upload an audio- or video file directly from the app. 13 | 3. When you've uploaded audio, the audio file will be converted into an mp3 file and compressed in size. 14 | 4. The mp3 file will be transcribed using Whisper or KB Whisper locally on your computer based on your settings (language and model). 15 | 5. The transcribed text is presented to you with the possibility to download. 16 | 17 | ## Installation 18 | This is an early beta, but it works. Expect updates as I develop this app. If you have any suggestions, feel free to ask.
19 | PS. I'm not a programmer. It's prototype code. ;) 20 |
21 | * Tested on Mac OSX and Windows 10 with Python 3.12 22 | * Download the files and 'pip install -r requirements.txt' 23 | * Install FFMPEG on your system 24 | * Run with 'streamlit run app.py' alternatively 'python -m streamlit run app.py' 25 | * The first time you run it, it will take som time since the Whisper model is downloaded to your computer. 26 | * If you're on Windows, I included a 'ragnar.bat' file which starts the application if you place all code in 'C:\ragnar'. You can edit this if you place Ragnar in a different folder. 27 | 28 | ## Updates 29 | * v0.7.4 30 | * A bug fix for Windows users where they sometimes get a "UnicodeEncodeError: 'charmap' codec can't...". 31 | * v0.7.2 32 | * Just added some files for deployment to Docker 33 | * v0.7.0 34 | * Partially rewritten. Make sure to update your pip packages from requirement.txt if you've already installed Ragnar 35 | * Added KB (Kungliga Bibliotekets fine tuned Whisper) Whisper and reverted back to vanilla Whisper from OpenAI. Still all local 36 | * Removed translation which a language model does a lot better 37 | * v0.6.2 38 | * Updated requirements.txt and tested with latest versions of eg Streamlit 39 | * Tidying up and moving functions to separate folder 40 | * Replaced the audio recorder with Streamlit's new audio recorder 41 | * v0.6.1 42 | * Windows users getting weird characters instead of å ä ö. Fixed it with utf-8 in transcribe.py 43 | * v0.6.0 44 | * I've split the code into several python files. 45 | * I've fixed the 'record audio' section and it works. 46 | * Added a dropdown menu for language selection of the source audio. By default it's set to automatic detection, which works most of the time. But if you have people speaking with heavy accents or such you can set the language here. 47 | * v0.5.0 48 | * Init upload - early beta 49 | 50 | ## Known bugs 51 | * (v0.5.0) ~~The "record audio" section is a bit wonky~~ 52 | 53 | ## License 54 | Some of you have asked why I haven't added a license to Ragnar. The truth is that I have no knowledge about licensing and open source. I've got some suggestions like MIT, and I'm looking into it. 55 | My point is that Ragnar is free to use, modify, distribute and do what you want with as long as you want to. It's just code that I wrote to solve an issue me and my collegues had. If it helps more people, great. 56 | 57 | ## Support 58 | Unfortunately, it's not possible for me to assist you guys with support. I just don't have the time. Report bugs and problems and hopefully we can try to solve them together. 59 | 60 | 61 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | 2 | # Python imports 3 | import os 4 | import streamlit as st 5 | import hashlib 6 | from os import environ 7 | import hmac 8 | 9 | # External imports 10 | from docx import Document 11 | from pydub import AudioSegment 12 | 13 | # Local imports 14 | from functions.functions import convert_to_mono_and_compress 15 | from functions.transcribe import transcribe_with_kb_whisper, transcribe_with_whisper 16 | import config as c 17 | 18 | 19 | ### INITIAL VARIABLES 20 | 21 | # Creates folder if they don't exist 22 | os.makedirs("audio", exist_ok=True) # Where audio/video files are stored for transcription 23 | os.makedirs("text", exist_ok=True) # Where transcribed document are beeing stored 24 | 25 | 26 | ### PASSWORD ######################## 27 | 28 | if c.run_mode == "streamlit": 29 | st.session_state["pwd_on"] = st.secrets.pwd_on 30 | else: 31 | st.session_state["pwd_on"] = environ.get("pwd_on") 32 | 33 | 34 | if st.session_state["pwd_on"] == "true": 35 | 36 | def check_password(): 37 | 38 | if c.run_mode == "streamlit": 39 | passwd = st.secrets["password"] 40 | else: 41 | passwd = environ.get("password") 42 | 43 | def password_entered(): 44 | 45 | if hmac.compare_digest(st.session_state["password"], passwd): 46 | st.session_state["password_correct"] = True 47 | del st.session_state["password"] # Don't store the password. 48 | else: 49 | st.session_state["password_correct"] = False 50 | 51 | if st.session_state.get("password_correct", False): 52 | return True 53 | 54 | st.text_input("Lösenord", type="password", on_change=password_entered, key="password") 55 | if "password_correct" in st.session_state: 56 | st.error("😕 Ooops. Fel lösenord.") 57 | return False 58 | 59 | 60 | if not check_password(): 61 | st.stop() 62 | 63 | ############ 64 | 65 | 66 | # Check and set default values if not set in session_state 67 | # of Streamlit 68 | if "translation" not in st.session_state: # If audio has been translated 69 | st.session_state["translation"] = False 70 | if "cpu_vs_gpu" not in st.session_state: # If user device has GPU support 71 | st.session_state["cpu_vs_gpu"] = False 72 | if "spoken_language" not in st.session_state: # What language source audio is in 73 | st.session_state["spoken_language"] = "Svenska" 74 | if "transcribe_model" not in st.session_state: # What model of Whisper to use 75 | st.session_state["transcribe_model"] = "KB Whisper Small" 76 | if "file_name_converted" not in st.session_state: # Audio file name 77 | st.session_state["file_name_converted"] = None 78 | 79 | 80 | # Checking if uploaded or recorded audio file has been transcribed 81 | def compute_file_hash(uploaded_file): 82 | 83 | # Compute the MD5 hash of a file 84 | hasher = hashlib.md5() 85 | 86 | for chunk in iter(lambda: uploaded_file.read(4096), b""): 87 | hasher.update(chunk) 88 | uploaded_file.seek(0) # Reset the file pointer to the beginning 89 | 90 | return hasher.hexdigest() 91 | 92 | 93 | ### MAIN APP ########################### 94 | 95 | # Page configuration 96 | st.set_page_config( 97 | page_title="Ragnar", 98 | page_icon=None, 99 | layout="centered", 100 | initial_sidebar_state="auto" 101 | ) 102 | 103 | 104 | def main(): 105 | 106 | global translation 107 | global model_map_transcribe_model 108 | 109 | ### SIDEBAR 110 | 111 | # Sidebar image of Ragnar 112 | st.sidebar.image("images/ragge3.png", width = 220) 113 | 114 | ###### SIDEBAR SETTINGS 115 | 116 | st.sidebar.header("Inställningar") 117 | st.sidebar.markdown("") 118 | 119 | # Dropdown menu - choose Whisper model 120 | transcribe_model = st.sidebar.selectbox( 121 | "Välj modell för transkribering", 122 | [ 123 | "KB Whisper Large", 124 | "KB Whisper Medium", 125 | "KB Whisper Small", 126 | "KB Whisper Base", 127 | "KB Whisper Tiny", 128 | "OpenAI Whisper Turbo", 129 | "OpenAI Whisper Large", 130 | "OpenAI Whisper Medium", 131 | "OpenAI Whisper Small", 132 | "OpenAI Whisper Base", 133 | "OpenAI Whisper Tiny" 134 | ], 135 | index=[ 136 | "KB Whisper Large", 137 | "KB Whisper Medium", 138 | "KB Whisper Small", 139 | "KB Whisper Base", 140 | "KB Whisper Tiny", 141 | "OpenAI Whisper Turbo", 142 | "OpenAI Whisper Large", 143 | "OpenAI Whisper Medium", 144 | "OpenAI Whisper Small", 145 | "OpenAI Whisper Base", 146 | "OpenAI Whisper Tiny" 147 | ].index(st.session_state["transcribe_model"]), 148 | ) 149 | 150 | model_map_transcribe_model = { 151 | "KB Whisper Large": "kb-whisper-large", 152 | "KB Whisper Medium": "kb-whisper-medium", 153 | "KB Whisper Small": "kb-whisper-small", 154 | "KB Whisper Base": "kb-whisper-base", 155 | "KB Whisper Tiny": "kb-whisper-tiny", 156 | "OpenAI Whisper Turbo": "turbo", 157 | "OpenAI Whisper Large": "large", 158 | "OpenAI Whisper Medium": "medium", 159 | "OpenAI Whisper Small": "small", 160 | "OpenAI Whisper Base": "base", 161 | "OpenAI Whisper Tiny": "tiny" 162 | } 163 | 164 | # Dropdown menu - choose source language of audio 165 | spoken_language = st.sidebar.selectbox( 166 | "Välj språk som talas", 167 | ["Automatiskt", "Svenska", "Engelska", "Franska", "Tyska", "Spanska"], 168 | index=["Automatiskt", "Svenska", "Engelska", "Franska", "Tyska", "Spanska"].index(st.session_state["spoken_language"]), 169 | ) 170 | 171 | model_map_spoken_language = { 172 | "Automatiskt": None, 173 | "Svenska": "sv", 174 | "Engelska": "en", 175 | "Franska": "fr", 176 | "Tyska": "de", 177 | "Spanska": "sp" 178 | 179 | } 180 | 181 | # Update the session_state directly 182 | st.session_state["transcribe_model"] = transcribe_model 183 | st.session_state["spoken_language"] = spoken_language 184 | 185 | print(model_map_transcribe_model[st.session_state["transcribe_model"]]) 186 | print(model_map_spoken_language[st.session_state["spoken_language"]]) 187 | 188 | st.sidebar.markdown( 189 | "#" 190 | ) 191 | 192 | st.sidebar.markdown(f""" 193 | Version: {c.app_version} 194 | """) 195 | 196 | 197 | ### MAIN PAGE 198 | 199 | # Title 200 | st.markdown("""# Ragnar 201 | ### Din GDPR- och sekretessäkrade transkriberare 202 | """) 203 | st.markdown(f"""**Vald AI-modell:** {st.session_state["transcribe_model"]} 204 | **Valt språk:** {st.session_state["spoken_language"]}""") 205 | 206 | 207 | # CREATE TWO TABS FOR FILE UPLOAD VS RECORDED AUDIO 208 | 209 | tab1, tab2 = st.tabs(["Ladda upp", "Spela in"]) 210 | 211 | 212 | # FILE UPLOADER 213 | 214 | with tab1: 215 | 216 | uploaded_file = st.file_uploader( 217 | "Ladda upp din ljud- eller videofil här", 218 | type=["mp3", "wav", "flac", "mp4", "m4a", "aifc"], 219 | help="Max 2GB stora filer", label_visibility="collapsed", 220 | ) 221 | 222 | 223 | if uploaded_file: 224 | 225 | # Checks if uploaded file has already been transcribed 226 | current_file_hash = compute_file_hash(uploaded_file) 227 | 228 | # If the uploaded file hash is different from the one in session state, reset the state 229 | if "file_hash" not in st.session_state or st.session_state.file_hash != current_file_hash: 230 | st.session_state.file_hash = current_file_hash 231 | 232 | if "transcribed" in st.session_state: 233 | del st.session_state.transcribed 234 | 235 | 236 | # If audio has not been transcribed 237 | if "transcribed" not in st.session_state: 238 | 239 | # Sends audio to be converted to mp3 and compressed 240 | with st.spinner('Din ljudfil är lite stor. Jag ska bara komprimera den lite först...', show_time=True): 241 | st.session_state.file_name_converted = convert_to_mono_and_compress(uploaded_file, uploaded_file.name) 242 | st.success('Inspelning komprimerad och klar. Startar transkribering.') 243 | 244 | # Transcribes audio with Whisper 245 | with st.spinner('Transkriberar. Det här kan ta ett litet tag beroende på hur lång inspelningen är...', show_time=True): 246 | 247 | if "KB" in st.session_state["transcribe_model"]: 248 | st.session_state.transcribed = transcribe_with_kb_whisper(st.session_state.file_name_converted, 249 | uploaded_file.name, 250 | model_map_transcribe_model[st.session_state["transcribe_model"]], 251 | model_map_spoken_language[st.session_state["spoken_language"]]) 252 | else: 253 | st.session_state.transcribed = transcribe_with_whisper(st.session_state.file_name_converted, 254 | uploaded_file.name, 255 | model_map_transcribe_model[st.session_state["transcribe_model"]], 256 | model_map_spoken_language[st.session_state["spoken_language"]]) 257 | 258 | st.success('Transkribering klar.') 259 | 260 | st.balloons() 261 | 262 | 263 | # Creates a Word document with the transcribed text 264 | document = Document() 265 | 266 | clean_text = st.session_state.transcribed.encode('utf-8', errors='replace').decode('utf-8') 267 | document.add_paragraph(clean_text) 268 | 269 | document.save('text/' + uploaded_file.name + '.docx') 270 | 271 | with open("text/" + uploaded_file.name + ".docx", "rb") as template_file: 272 | template_byte = template_file.read() 273 | 274 | 275 | # Creates a grid of four columns for the different transcribed document download buttons 276 | col1, col2, col3, col4 = st.columns(4) 277 | 278 | # Text 279 | with col1: 280 | with open('text/' + uploaded_file.name + '.txt', "rb") as file_txt: 281 | st.download_button( 282 | label = ":flag-se: Ladda ned text", 283 | data = file_txt, 284 | file_name = uploaded_file.name + '.txt', 285 | mime = 'text/plain', 286 | ) 287 | 288 | # Word 289 | with col2: 290 | st.download_button( 291 | label = ":flag-se: Ladda ned word", 292 | data = template_byte, 293 | file_name = uploaded_file.name + '.docx', 294 | mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 295 | ) 296 | 297 | st.markdown("### Transkribering") 298 | 299 | if st.session_state.file_name_converted is not None: 300 | st.audio(st.session_state.file_name_converted, format='audio/wav') 301 | 302 | st.write(st.session_state.transcribed) 303 | 304 | 305 | # AUDIO RECORDER ###### ###### ###### 306 | 307 | with tab2: 308 | 309 | audio = st.audio_input("Spela in") 310 | 311 | # The rest of the code in tab2 works the same way as in tab1, so it's not going to be 312 | # commented. 313 | if audio: 314 | 315 | # Open the saved audio file and compute its hash 316 | current_file_hash = compute_file_hash(audio) 317 | 318 | # If the uploaded file hash is different from the one in session state, reset the state 319 | if "file_hash" not in st.session_state or st.session_state.file_hash != current_file_hash: 320 | st.session_state.file_hash = current_file_hash 321 | 322 | if "transcribed" in st.session_state: 323 | del st.session_state.transcribed 324 | 325 | if "transcribed" not in st.session_state: 326 | 327 | audio_file = AudioSegment.from_file(audio) 328 | output_path = "audio/converted.mp3" 329 | audio_file.export(output_path, format="mp3", bitrate="16k") 330 | 331 | with st.spinner('Transkriberar. Det här kan ta ett litet tag beroende på hur lång inspelningen är...', show_time=True): 332 | 333 | if "KB" in st.session_state["transcribe_model"]: 334 | st.session_state.transcribed = transcribe_with_kb_whisper("audio/converted.mp3", 335 | "local_recording.mp3", 336 | model_map_transcribe_model[st.session_state["transcribe_model"]], 337 | model_map_spoken_language[st.session_state["spoken_language"]]) 338 | else: 339 | st.session_state.transcribed = transcribe_with_whisper("audio/converted.mp3", 340 | "local_recording.mp3", 341 | model_map_transcribe_model[st.session_state["transcribe_model"]], 342 | model_map_spoken_language[st.session_state["spoken_language"]]) 343 | 344 | st.success('Transkribering klar.') 345 | 346 | st.balloons() 347 | 348 | local_recording_name = "local_recording.mp3" 349 | document = Document() 350 | clean_text = st.session_state.transcribed.encode('utf-8', errors='replace').decode('utf-8') 351 | document.add_paragraph(clean_text) 352 | 353 | document.save('text/' + local_recording_name + '.docx') 354 | 355 | with open("text/local_recording.mp3.docx", "rb") as template_file: 356 | template_byte = template_file.read() 357 | 358 | col1, col2, col3, col4 = st.columns(4) 359 | 360 | with col1: 361 | with open('text/' + local_recording_name + '.txt', "rb") as file_txt: 362 | st.download_button( 363 | label = ":flag-se: Ladda ned text", 364 | data = file_txt, 365 | file_name = local_recording_name + '.txt', 366 | mime = 'text/plain', 367 | ) 368 | 369 | with col2: 370 | st.download_button( 371 | label = ":flag-se: Ladda ned word", 372 | data = template_byte, 373 | file_name = local_recording_name + '.docx', 374 | mime = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', 375 | ) 376 | 377 | 378 | st.markdown("### Transkribering") 379 | 380 | if st.session_state.file_name_converted is not None: 381 | st.audio(st.session_state.file_name_converted, format='audio/wav') 382 | 383 | st.write(st.session_state.transcribed) 384 | 385 | 386 | if __name__ == "__main__": 387 | main() --------------------------------------------------------------------------------