├── requirements.txt ├── dockerfile ├── prompt_dnd_2.txt ├── utils.py ├── prompt_dnd_1.txt ├── setup.py ├── LICENSE ├── recognize.py ├── summarize.py ├── .gitignore ├── configuration.py ├── tasmas.py ├── README.md └── assemble.py /requirements.txt: -------------------------------------------------------------------------------- 1 | whisper_timestamped==1.14.4 2 | auditok==0.2.0 3 | deepmultilingualpunctuation==1.0.1 4 | openai==1.23.6 -------------------------------------------------------------------------------- /dockerfile: -------------------------------------------------------------------------------- 1 | # you'll need to have already done: 2 | # docker build https://github.com/linto-ai/whisper-timestamped.git -t whisper_timestamped 3 | FROM whisper_timestamped 4 | 5 | # install packages it doesn't include 6 | RUN pip install --no-cache-dir deepmultilingualpunctuation openai 7 | 8 | WORKDIR /usr/src/tasmas 9 | 10 | COPY . /usr/src/tasmas 11 | 12 | RUN cd /usr/src/tasmas/ && pip3 install . -------------------------------------------------------------------------------- /prompt_dnd_2.txt: -------------------------------------------------------------------------------- 1 | Prompt: 2 | Given the following transcript of a D&D session, generate separate bulleted lists for each party member detailing specific events and experiences they encountered. 3 | Additionally, provide a comprehensive list of all items that were exchanged or obtained, specifying their name, origin, and recipient. 4 | Finally, include information about the scheduled upcoming sessions. 5 | 6 | Transcript: 7 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | def extract_speaker_name(file, extension): 5 | file_name = os.path.basename(file) # strip off the path 6 | regex = r"^\d*[-_]?(.+?)(?:_0)?(?:\.[a-zA-Z0-9]{2,4})?\." + re.escape(extension) + "$" 7 | match = re.match(regex, file_name) 8 | if match: 9 | return match.group(1) 10 | else: 11 | raise ValueError(f"Could not parse speaker name from filename '{file_name}'") -------------------------------------------------------------------------------- /prompt_dnd_1.txt: -------------------------------------------------------------------------------- 1 | Prompt: 2 | Generate a detailed summary of a Dungeons & Dragons session based on the provided transcript. Pay attention to dialogue attribution, changes in inventory or quest progress, significant events, and exclude any irrelevant banter or out-of-character jokes. Ensure that the summary accurately captures the narrative elements provided by the Gamemaster, distinguishing between narrative details and NPC dialogue. Produce a comprehensive summary that provides enough context for future sessions and potential plot developments. 3 | 4 | Length: Please generate a detailed summary of the session, ensuring that it captures all essential details while maintaining clarity and coherence. 
5 | 6 | Transcript: 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='tasmas', 4 | version='0.1', 5 | author='Kadda OK', 6 | description='TASMAS (Transcribe And Summarize Multiple Audio Stems) transcribes and interleaves per-speaker audio recordings into a single threaded transcript, which it can optionally then summarize.', 7 | py_modules=['tasmas', 'assemble', 'configuration', 'recognize', 'summarize', 'utils'], 8 | install_requires=[ 9 | 'whisper_timestamped', 10 | 'auditok', 11 | 'deepmultilingualpunctuation', 12 | 'openai' 13 | ], 14 | entry_points={ 15 | 'console_scripts': [ 16 | 'tasmas=tasmas:main' 17 | ], 18 | }) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kadda OK 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /recognize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | from typing import Dict 5 | import whisper_timestamped as whisper 6 | 7 | from utils import extract_speaker_name 8 | 9 | def recognize(input_dir: str, names: Dict[str, str], fast: bool = False, model_type: str = "small", device: str = "cuda", audio_ext: str = "ogg"): 10 | model_type = "tiny" if fast else model_type 11 | model = whisper.load_model(model_type, device=device) 12 | 13 | print() 14 | print("--------------------") 15 | print("RECOGNIZE") 16 | print("--------------------") 17 | print() 18 | 19 | files = glob.glob(os.path.join(input_dir, '*.' 
+ audio_ext)) 20 | 21 | if not files: 22 | print(f" No {audio_ext} files were found at {input_dir}.") 23 | print() 24 | return 25 | 26 | print(f" {len(files)} {audio_ext} files found at {input_dir}.") 27 | for audio_file in files: 28 | print(f" - {audio_file}...") 29 | speaker = extract_speaker_name(audio_file, audio_ext) 30 | if speaker in names and (names[speaker] is None or names[speaker] == ''): 31 | print(f" Skipping {audio_file} because '{speaker}' is specified as blank.") 32 | print() 33 | continue 34 | else: 35 | audio = whisper.load_audio(os.path.join(input_dir, audio_file)) 36 | if fast: 37 | results = whisper.transcribe(model, audio, detect_disfluencies=True, vad="auditok") 38 | else: 39 | results = whisper.transcribe(model, audio, detect_disfluencies=True, vad="auditok", beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)) 40 | 41 | json_file = os.path.join(input_dir, audio_file + '.words.json') 42 | with open(json_file, 'w') as f: 43 | f.write(json.dumps(results)) 44 | print(f" Saved to {json_file}") 45 | print() 46 | print("--------------------") -------------------------------------------------------------------------------- /summarize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import textwrap 5 | from openai import OpenAI 6 | 7 | def do_summary(transcript, client, prompt_file): 8 | with open(prompt_file, 'r') as file: 9 | prompt = file.read() 10 | 11 | completion = client.chat.completions.create(model="gpt-4-0125-preview", 12 | messages=[ 13 | {"role": "system", "content" : "You are a chatbot which can summarize long transcripts."}, 14 | {"role": "user", "content" : f'{prompt}{transcript}'}, 15 | ]) 16 | 17 | return completion.choices[0].message.content 18 | 19 | def summarize(input_dir, prompt_files, openai_api_key): 20 | 21 | if input_dir is None: 22 | print("Please provide an input directory.") 23 | return 24 | 25 | print() 26 | print("--------------------") 27 | print("SUMMARIZE") 28 | print("--------------------") 29 | print() 30 | 31 | transcript_path = os.path.join(input_dir, 'transcript.txt') 32 | if not os.path.exists(transcript_path): 33 | print("transcript.txt not found in the input directory.") 34 | sys.exit(1) 35 | 36 | with open(transcript_path, 'r') as file: 37 | transcript = file.read() 38 | 39 | if not prompt_files: 40 | print(" No prompts to use to summarize.") 41 | return 42 | 43 | client = OpenAI(api_key=openai_api_key) 44 | 45 | for prompt_file in prompt_files: 46 | # Call your command here 47 | print() 48 | print(f" - Prompt {prompt_file}...") 49 | 50 | summary = do_summary(transcript, client, prompt_file) 51 | 52 | filename = os.path.splitext(os.path.basename(prompt_file))[0].replace("prompt_", "") 53 | filename = f"summary_{filename}.txt" 54 | output_path = os.path.join(input_dir, filename) 55 | with open(output_path, 'w') as file: 56 | file.write(summary) 57 | print() 58 | print(" Result:") 59 | print(" ---------") 60 | terminal_width = os.get_terminal_size().columns 61 | # Split the summary into lines, then indent and wrap each line 62 | summary_lines = summary.split('\n') 63 | wrapped_summary = '\n'.join('\n'.join(textwrap.wrap(line, width=terminal_width, initial_indent=' ', subsequent_indent=' ')) for line in summary_lines) 64 | print(wrapped_summary) 65 | print(" ---------") 66 | print(f" Written to {output_path}.") 67 | print() 68 | print() 69 | 70 | 71 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | def get_configuration(args): 5 | parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, 6 | description='''Multi-Stem Conversational Transcriber 7 | ''') 8 | parser.add_argument('operationMode', type=str, 9 | choices=['recognize', 'assemble', 'summarize', 'semiauto', 'fullauto'], 10 | help='''Which step to perform: 11 | - recognize: Transcribes all audio files found at the 12 | path using whisper_timestamped and writes 13 | a .words.json file for each. 14 | - assemble: Arranges by timecode the contents of all 15 | .words.json files found at the path, 16 | switching speakers at punctuation, to 17 | produce a readable transcript.txt. 18 | - summarize: Calls OpenAI API to summarize the 19 | transcript.txt at the path using 20 | configurable prompts. 21 | - semiauto: Runs recognize followed immediately by 22 | assemble. (This is the recommended first 23 | pass mode, as it is common to iterate on 24 | assemble multiple times making manual 25 | tweaks to the .words.json files.) 26 | - fullauto: Performs all steps in succession. 27 | ''') 28 | parser.add_argument('inputDir', type=str, help='The path to the files to process.') 29 | 30 | recognizeConfigGroup = parser.add_argument_group('recognize mode options') 31 | recognizeConfigGroup.add_argument('--extension', type=str, help='''File extension of the audio files to transcribe. 32 | Defaults to "ogg", for use with Craig recordings, but 33 | I would think that things like "wav" or "flac" would 34 | work too. 35 | ''') 36 | recognizeConfigGroup.add_argument('--fast', action='store_true', 37 | help='''Prioritize recognition speed over accuracy. 38 | Results in the following changes: 39 | - Uses the "tiny" model instead of the "small" model 40 | - Uses "efficient" params rather than "accurate" ones 41 | Honestly this really doesn't work well at all and I 42 | do not recommend it. 43 | ''' 44 | ) 45 | 46 | assembleConfigGroup = parser.add_argument_group('assemble mode options') 47 | assembleConfigGroup.add_argument('--noEllipses', action='store_true', help='''This script normally inserts ellipses (...) 
into the 48 | transcript whenever a word is more than 5s after its 49 | predecessor, allowing a speaker change (which is done 50 | on punctuation). 51 | The --noEllipses switch suppresses this behavior. 52 | ''') 53 | assembleConfigGroup.add_argument('--disfluentComma', action='store_true', help='''Replace detected disfluencies (e.g. "um", "uh") with a 54 | comma in the transcript. 55 | This may help if you are using --noEllipses. 56 | ''') 57 | assembleConfigGroup.add_argument('--noAsterisks', action='store_true', help='''When this script inserts ellipses or disfluency commas 58 | into the transcript, it marks them with an asterisk (*) 59 | for reference. 60 | The --noAsterisks switch suppresses this behavior. 61 | ''') 62 | assembleConfigGroup.add_argument('--showTimestamps', action='store_true', help='''Include the start and end seconds of the phrase in 63 | front of each line in the transcript. 64 | i.e. [1905.39-1907.05] Joe: "Look a timestamp." 65 | ''') 66 | assembleConfigGroup.add_argument('--corrections', type=str, help='''A list of known incorrect values to replace in the 67 | transcript output. This is a quick way to correct 68 | frequently misinterpreted text such as unusual names. 69 | Each entry is the correct word or phrase with a list of 70 | incorrect ones. For example, 71 | '{"Elsalor":["Elcelor", "I'll solar", "else the Lord"], 72 | "A'Dhem" :["Adam"] }' 73 | This can be a path to a .json file or the actual JSON. 74 | ''') 75 | assembleConfigGroup.add_argument('--names', type=str, help='''Replacements for the speaker names as recorded in the 76 | filenames by discord/Craig. These should reflect the 77 | names used by speakers to refer to each other in the 78 | recordings. For example: 79 | '{ "joey__0": "Joe", 80 | "randointernet3000_0": "Bob" }' 81 | This can be a path to a .json file or the actual JSON. 82 | You will be prompted individually for any values not 83 | found here (and given the opportunity to skip that 84 | audio stem). 85 | ''') 86 | 87 | summarizeConfigGroup = parser.add_argument_group('summarize mode options') 88 | summarizeConfigGroup.add_argument('--promptType', type=str, help=''' 89 | This script will call OpenAI's GPT-4 API to summarize 90 | the transcript as many times as it is given prompts to 91 | do so. It will attempt to find text files with the name 92 | pattern "prompt_{promptType}_*.txt", in the following 93 | order: 94 | - in the `inputDir` 95 | - one level above the `inputDir` 96 | - in the location of this script 97 | ''') 98 | summarizeConfigGroup.add_argument('--openApiKey', type=str, help='''Due to current LLM token limits (Q1 2024) and the very 99 | large number of tokens needed to summarize transcripts 100 | of much length, the summarize operation calls ChatGPT 101 | 4 Turbo (128k tokens). As such, an OpenAI API key is 102 | required to run in summarize (or fullauto) mode. 103 | (It'll probably cost you about $0.10 USD per call.) 
104 | ''') 105 | 106 | config = vars(parser.parse_args(args)) 107 | 108 | return config -------------------------------------------------------------------------------- /tasmas.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import os 4 | import re 5 | import glob 6 | import readline 7 | from typing import Dict, Optional 8 | import torch 9 | import whisper_timestamped as whisper 10 | from configuration import get_configuration 11 | from recognize import recognize 12 | from assemble import assemble 13 | from summarize import summarize 14 | from utils import extract_speaker_name 15 | 16 | def json_string_or_path(json_string_or_path): 17 | if not json_string_or_path: 18 | return None 19 | 20 | deserialized_object = None 21 | try: 22 | deserialized_object = json.loads(json_string_or_path) 23 | except json.JSONDecodeError: 24 | # If it's not a valid JSON string, treat it as a file path 25 | if os.path.exists(json_string_or_path): 26 | with open(json_string_or_path, 'r') as f: 27 | deserialized_object = json.load(f) 28 | 29 | return deserialized_object 30 | 31 | def load_names(names_setting, input_dir): 32 | names = json_string_or_path(names_setting) 33 | if names is None: 34 | # Look for names.json in input_dir 35 | names_file_path = os.path.join(input_dir, 'names.json') 36 | if not os.path.exists(names_file_path): 37 | # If not found, look one folder up 38 | names_file_path = os.path.join(input_dir, '..', 'names.json') 39 | if os.path.exists(names_file_path): 40 | # If found, prompt the user whether to use it 41 | use_names_file = input(f" Found a names file at {names_file_path}. Do you want to use it? (y/n): ") 42 | if use_names_file.lower() == 'y': 43 | with open(names_file_path, 'r') as f: 44 | names = json.load(f) 45 | print(f" Loaded {len(names)} speaker name{'s' if len(names) > 1 else ''} from {names_file_path}.") 46 | else: 47 | print(f" Loaded {len(names)} speaker name{'s' if len(names) > 1 else ''}.") 48 | 49 | return names 50 | 51 | def load_corrections(corrections_setting, input_dir): 52 | corrections = None 53 | correction_setting_dic = json_string_or_path(corrections_setting) 54 | if correction_setting_dic is None: 55 | # Look for corrections.json in input_dir 56 | corrections_file_path = os.path.join(input_dir, 'corrections.json') 57 | if not os.path.exists(corrections_file_path): 58 | # If not found, look one folder up 59 | corrections_file_path = os.path.join(input_dir, '..', 'corrections.json') 60 | if os.path.exists(corrections_file_path): 61 | # If found, prompt the user whether to use it 62 | use_corrections_file = input(f" Found a corrections file at {corrections_file_path}. Do you want to use it? (y/n): ") 63 | if use_corrections_file.lower() == 'y': 64 | with open(corrections_file_path, 'r') as f: 65 | correction_setting_dic = json.load(f) 66 | print(f" Loaded corrections from {corrections_file_path}.") 67 | if correction_setting_dic is not None: 68 | # this was defined as "correct string": ["incorrect string", "incorrect string", ...] 
because that's 69 | # easier to write out multiple corrections to the same value, but now we need to flip it so that we 70 | # can actually use the dictionary to look up words and see if they need correcting 71 | corrections = {incorrect: correct for correct, incorrects in correction_setting_dic.items() for incorrect in incorrects} 72 | print(f" Loaded {len(corrections)} correction{'s' if len(corrections) > 1 else ''}.") 73 | print() 74 | 75 | return corrections 76 | 77 | def check_names(names: Optional[Dict[str, str]], files, extension): 78 | if names is None: 79 | names = {} 80 | for file in files: 81 | speaker_name = extract_speaker_name(file, extension) 82 | if speaker_name not in names: 83 | print() 84 | readline.set_startup_hook(lambda: readline.insert_text(speaker_name)) 85 | try: 86 | value = input(f" Enter the proper speaker name for '{speaker_name}' (press enter to accept, or backspace it all and enter nothing to skip this file): ") 87 | finally: 88 | readline.set_startup_hook() # remove hook again 89 | names[speaker_name] = value if value else None 90 | return names 91 | 92 | def load_prompt_files(input_dir, prompt_type): 93 | prompt_files = [] 94 | directories = [input_dir, os.path.dirname(input_dir), os.path.dirname(os.path.realpath(__file__))] 95 | 96 | for directory in directories: 97 | files = glob.glob(os.path.join(directory, f'prompt_{prompt_type}_*.txt')) 98 | if files: 99 | print() 100 | print(f" Found the following prompt files in {directory}:") 101 | print() 102 | for file in files: 103 | print(f" - {file}") 104 | print() 105 | use_files = input(" Use these files? (y/n): ") 106 | if use_files.lower() == 'y': 107 | prompt_files.extend(files) 108 | break 109 | print() 110 | 111 | if not prompt_files: 112 | print(" No prompt files found.") 113 | 114 | return prompt_files 115 | def check_cuda(): 116 | if not torch.cuda.is_available(): 117 | print("\033[93m WARNING: CUDA (gpu support) is not available!\n" 118 | "\n If you are in Docker, you may have forgotten to specify `--gpus all`." 119 | "\n Otherwise, this is a bit more of a rabbit hole than can be delved here " 120 | "\n (it depends on your operating system and environment, but it's quite " 121 | "\n googleable).\n" 122 | "\n You can try to continue without it, but:" 123 | "\n - RECOGNIZE may be excruciatingly slow, or just not work at all." 124 | "\n - ASSEMBLE may fail when trying to auto-repunctuate out of sync items.\n" 125 | "\n (SUMMARIZE workloads should be unaffected.)\n \033[0m") 126 | response = input("Do you want to continue running? 
(y/n): ") 127 | if response.lower() not in ["y", "yes"]: 128 | exit() 129 | else: 130 | print(" CUDA is available.") 131 | 132 | def main(): 133 | # sys.argv contains the command-line arguments 134 | # sys.argv[0] is the script name 135 | # sys.argv[1:] are the arguments passed to the script 136 | args = sys.argv[1:] 137 | config = get_configuration(args) 138 | inputDir = config['inputDir'] 139 | no_ellipses = config.get('noEllipses', False) 140 | disfluent_comma = config.get('disfluentComma', False) 141 | no_asterisks = config.get('noAsterisks', False) 142 | show_timestamps = config.get('showTimestamps', False) 143 | 144 | print() 145 | print("--------------------") 146 | print("PRE-CHECK") 147 | print("--------------------") 148 | print() 149 | 150 | check_cuda() 151 | corrections = load_corrections(config.get('corrections'), inputDir) 152 | 153 | operation = config['operationMode'] 154 | if operation in ['recognize', 'semiauto', 'fullauto']: 155 | check_names_extension = config.get('extension', 'ogg').strip() or 'ogg' 156 | else: 157 | check_names_extension = 'words.json' 158 | 159 | files = glob.glob(os.path.join(inputDir, f"*.{check_names_extension}")) 160 | 161 | if not files: 162 | print() 163 | print(f" No {check_names_extension} files were found at {inputDir}.") 164 | print() 165 | sys.exit() 166 | 167 | print(f" Found {len(files)} files to work on at {inputDir}:") 168 | print() 169 | for file in files: 170 | filename = os.path.basename(file) 171 | print(f' - {filename}') 172 | print() 173 | names = check_names(load_names(config.get('names'), inputDir), files, check_names_extension) 174 | 175 | openai_api_key = config.get('openApiKey') 176 | prompt_type = config.get('promptType') 177 | prompt_files = [] 178 | if operation in ['summarize', 'fullauto']: 179 | if (prompt_type is None) or (prompt_type == ''): 180 | print(" Prompt Type is required for summarize (or fullauto) operation mode.") 181 | sys.exit() 182 | prompt_files = load_prompt_files(inputDir, prompt_type) 183 | if not prompt_files: 184 | print(" At least one prompt file must be found for summarize (or fullauto) operation mode.") 185 | sys.exit() 186 | if (openai_api_key is None) or (openai_api_key == ''): 187 | print(" OpenAI API key is required for summarize (or fullauto) operation mode.") 188 | sys.exit() 189 | 190 | operation_modes = { 191 | 'recognize': lambda: recognize(inputDir, names, config['fast']), 192 | 'assemble': lambda: assemble(inputDir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps), 193 | 'summarize': lambda: summarize(inputDir, prompt_files, openai_api_key), 194 | 'semiauto': lambda: [recognize(inputDir, names, config['fast']), assemble(inputDir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps)], 195 | 'fullauto': lambda: [recognize(inputDir, names, config['fast']), assemble(inputDir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps), summarize(inputDir, prompt_files, openai_api_key)] 196 | } 197 | 198 | print("--------------------") 199 | 200 | if operation in operation_modes: 201 | operation_modes[operation]() 202 | else: 203 | print(f"Invalid operation: {operation}") 204 | 205 | if __name__ == '__main__': 206 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TASMAS (*Transcribe And Summarize Multiple Audio Stems*) 2 | 3 | This is an automatic interleaving 
transcriber and summarizer for file-per-speaker audio recordings, such as Discord calls recorded by [`Craig`](https://craig.chat/) or a similar bot. 4 | 5 | You point it at a folder that contains audio files, 6 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/1ce0e427-9670-4d2a-a877-d1175cd2c8d9) 7 | and it will generate transcripts of each file timestamped at the word level, 8 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/77d2e0b8-96bf-4b16-8c91-23f43e16d0bb)![image](https://github.com/KaddaOK/TASMAS/assets/151568451/3b5bf487-4a72-45e6-b5a9-b6fd784e0a16) 9 | then braid each phrase from the files into a single coherent attributed transcript, 10 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/d4add246-f1dc-4c9b-b098-c48ea3100cbb) 11 | and optionally get summaries of that transcript as well. 12 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/38d00f66-5400-42ec-a9c0-756766a2afee) 13 | (Okay yes that's not a "real" automatic summary output, but you get the point, I just wanted to highlight how the transcript looks when people are talking over each other) 14 | 15 | 16 | # Operating Modes 17 | TASMAS has 3 operating modes, each of which can be executed independently, 18 | as well as a `SEMIAUTO` mode which executes the first two modes in sequence (this is the recommended initial run), 19 | and a `FULLAUTO` mode which executes all 3 (not recommended, as manual fix-up after stage 2 is usually a good idea). 20 | 21 | ## `RECOGNIZE`: 22 | 23 | *Given a file path that contains a number of separate audio files, 24 | transcribe each file down to word-level timestamps, saving each as `{filename}.words.json`.* 25 | 26 | ### File formats 27 | As designed and tested, this operates on `.ogg` files recorded using the [`Craig`](https://craig.chat/) bot for Discord, but it could theoretically be any audio file that `whisper` can handle if you specify the `--extension` switch. 28 | 29 | ### Models and options 30 | This mode uses `whisper_timestamped` to transcribe the files, using the `small` model, with disfluency detection enabled and the Auditok VAD mode, and using the beam and temp etc parameters described as "accurate". 31 | 32 | (As of right now there's a configuration switch `--fast` which causes it to use the `"tiny"` model instead and not use the "accurate" parameters, but honestly it's not usable and I'm going to just take it out entirely in a subsequent release.) 33 | 34 | Personally I didn't get any more meaningful results out of using larger models, and in fact `small` seemed to work the best anyway, so I didn't follow through on model selection options. 35 | 36 | ## `ASSEMBLE`: 37 | 38 | *Given a file path that contains a bunch of separate `.words.json` files, sort and interleave these into one coherent human-readable transcript, saved as `transcript.txt`.* 39 | 40 | ### Speaker Identification 41 | As TASMAS is intended for recordings that have a separate file for each different speaker, the filename is used to identify the speaker. 42 | 43 | After any formatting idiomatic to `Craig` (a leading `n-` and trailing `_0`) is stripped out, the rest will be compared to the contents of `--names` if specified, or used directly as the speaker name if nothing is found. (See below under Usage) 44 | 45 | If a `names.json` file is found in the input path or its containing folder, it will be used for `--names` automatically, allowing you to set up this information once and have it be continually re-used for other recordings with the same speakers. 
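For example (the filename here is made up for illustration), a Craig-style stem named `1-JohnTheJester_0.ogg` yields the speaker key `JohnTheJester`, which a `names.json` entry can then map to the display name used in the transcript. A minimal sketch of that lookup, using this repo's own `extract_speaker_name` helper:

```python
# Illustrative sketch; the filename and names below are hypothetical examples.
import json
from utils import extract_speaker_name

key = extract_speaker_name("1-JohnTheJester_0.ogg", "ogg")  # -> "JohnTheJester"

with open("names.json") as f:   # e.g. {"JohnTheJester": "John"}
    names = json.load(f)

# An unmapped key is used as-is; a key mapped to blank/None tells TASMAS to skip that stem.
print(names.get(key, key))      # -> "John"
```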
46 | 47 | TASMAS will also interrogate the user during the PRE-CHECK phase to verify any speakers it encounters whose names were not specified by `--names` or a `names.json`. 48 | 49 | ### Punctuation-based Interleaving 50 | TASMAS sorts all the words by timestamp and assembles sentences sequentially, only allowing the current speaker to change when a word ends with a punctuation mark (`.`, `,`, `!`, `?`, `-`). 51 | This allows cross-talk to be inserted in as accurate and followable an order as possible without each word being split up. 52 | 53 | Note that this is therefore only possible when all of the audio files are synchronized to start at the same moment, even if that speaker was not yet present. `Craig` does this automatically, but if your source does not, you may need to edit your audio files accordingly. 54 | 55 | ### Anticipate Corrections/Replacements 56 | TASMAS will replace words and phrases that are likely mishears in the output if `--corrections` data is provided. (See below under Usage) 57 | This is particularly useful for TTRPG recordings, as many proper names and phrases will never be interpreted correctly. 58 | As with names, TASMAS will automatically pick up a `corrections.json` file if present in the input path or its parent folder, so you can build up these replacements over time. 59 | 60 | ### Out-of-sync Warning 61 | At the end of this operation, TASMAS will detect any phrases with start times that are more than 5 seconds out of sync with their neighbors, and will automatically run them through a punctuation model to try to improve results (as adding punctuation will allow these phrases to split at those words, which may allow other speakers to interject improving the overall sync). 62 | After doing so, remaining phrases that are still more than 5 seconds out of sync will be output to the screen and to `outOfSyncItems.txt`. 63 | Manually adding a punctuation mark directly to an individual word in the corresponding `.words.json` file and re-executing the ASSEMBLE operation will improve these results. 64 | 65 | 66 | ## `SUMMARIZE`: 67 | 68 | *Given an Open AI api key, appropriate prompts, and a file path that contains a `transcript.txt`, ask GPT-4 Turbo to summarize the transcript.* 69 | 70 | ### Summary Prompts 71 | When executing the SUMMARIZE operation, `--promptType` is required, which will be used to attempt to locate text files named in the format `prompt_{promptType}_*.txt`, in the input path, its parent folder, or with TASMAS itself. 72 | 73 | TASMAS was designed for summarizing Dungeons & Dragons sessions, and as examples, comes with two prompts that produce useful output, `prompt_dnd_1.txt` and `prompt_dnd_2.txt`, which will be used if `--promptType dnd` is specified. 74 | 75 | Also, `--openApiKey` is required in this mode, because: 76 | 77 | ### Why does summarize need to call a paid API? 78 | For each prompt file found, the OpenAI API is called. This is because of context token limits. 79 | A typical D&D session transcript will likely be anywhere between 30,000 and 60,000 tokens. As of this writing, most models will consider only 4096 or 8192 tokens, and very few models can handle more than 32K tokens of input, with GPT-4 Turbo's 128K limit being the only one practically available to me. 80 | So yes, it's not free, but it'll only cost you probably about $0.10 USD per prompt. 81 | (And you don't ever have to use the SUMMARIZE workload at all if you don't want anyway. 😁) 82 | 83 | # Usage 84 | 85 | To run TASMAS, you must provide at minimum: 86 | 1. 
an operation mode (`recognize`, `assemble`, `summarize`, `semiauto` which does the first 2, or `fullauto` which does all 3) and 87 | 2. a folder path to process. 88 | ```bash 89 | tasmas semiauto /mnt/c/recordings/2024-04-04 90 | ``` 91 | If your recordings aren't in `.ogg` (and to be fair, why would they be, unless you were using `Craig`, but that's the use case I wrote this for so it's the default), you'll have to add `--extension "wav"` or whatever they are. 92 | I haven't even tested that, I just assume it works; if not plz open a bug 🤣 93 | 94 | But yeah, here are some additional things you can add: 95 | 96 | ### Names 97 | Specifying a `--names` value allows you to set how the filenames should translate into speaker names in the transcript. 98 | It can either be a path to a `.json` file, or the JSON itself inline if you're feeling like making things harder for yourself. 99 | For example, to produce the transcript from our example at the top, this might be the contents of a `names.json` file: 100 | ```json 101 | { 102 | "JohnTheJester": "John", 103 | "EmiLovesCats": "Emily", 104 | "RoboBert": "Robert", 105 | "JessInWonderland": "Jessica", 106 | "SassySarah": "Sarah" 107 | } 108 | ``` 109 | We could pass that to tasmas like this: 110 | ```bash 111 | tasmas --names /mnt/c/recordings/names.json semiauto /mnt/c/recordings/2024-04-04 112 | ``` 113 | but in this case we don't even have to specify `--names`, because that `names.json` is in the parent folder of the folder we're processing, so if we say nothing about names it'll pick it up automatically. 114 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/7bc9cedc-8605-492b-a20e-d059380559f7) 115 | 116 | If you run TASMAS without any names input, it'll prompt you for the name for each speaker file it detects. (I'm realizing as I'm writing this that it'd be a good feature to ask you after doing so if you want to save a `names.json` for future use, so I'll add that to the backlog I guess.) 117 | It also gives you the option to skip a speaker entirely, which is useful for files that are music bots or whatnot. (Don't specify speaker names for such files, or you won't be prompted if you'd like to skip them!) 118 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/0ae8c997-16cf-4a32-b8d3-9807af20f407) 119 | 120 | ### Corrections 121 | Specifying a `--corrections` value allows you to replace all the occurrences of a word or phrase that you know is an incorrect interpretation with the correct value. 122 | 123 | Similarly to `names`, this can either be a path to a `.json` file, or the JSON itself inline. 124 | 125 | However, the format is the opposite: 126 | instead of the `"wrongValue":"correctValue"` of names, 127 | corrections are presented as 128 | `"correctValue": ["wrongValue", "wrongValue", "wrongValue"]` 129 | in order to allow you to list many incorrect possibilities for a single correct value. 130 | 131 | For example, here are just a few items from a real `corrections.json` file I use, in which you can see why it needs to be done this way: because of weird made-up names that get interpreted in many random ways.
132 | ```json 133 | { 134 | "Dagstorp": [ 135 | "Dexter", 136 | "Digstorpe", 137 | "Dagster", 138 | "Dagstrup", 139 | "Dagsorp", 140 | "Dag Swarp" 141 | ], 142 | "Elsalor": [ 143 | "El Soler", 144 | "El Solor", 145 | "Else Laura", 146 | "else the Lord", 147 | "I'll solar" 148 | ], 149 | "Jeltra": [ 150 | "Gelter", 151 | "Delta", 152 | "Geltro", 153 | "Gelja", 154 | "Jeldra", 155 | "Jelter" 156 | ] 157 | } 158 | ``` 159 | Anyway, yeah. They don't have to be a single word to replace, either, you could put anything you want in those quotes; another real world example is `"Shield of Faith": ["shield a faith"]`, which had the added bonus of capitalizing that spell name (corrections are case-insensitive for detection, but will insert the replacement value as capitalized). 160 | 161 | ### Other stuff 162 | There are some other finer-tuning options, but they're pretty well-summarized in the actual software if you do `tasmas --help`. 163 | You won't generally need to mess with them (other than `--showTimestamps`, which pretty self-explanatorily includes timestamps in the `transcript.txt` output), unless you feel like `assemble`ing numerous transcripts and comparing them line by line to see how they differ. As with all things, YMMV. 164 | 165 | 166 | # Installation 167 | 168 | ### Docker 169 | Maybe the easiest, or at least most foolproof, way to use TASMAS (especially on Windows, which always seems to make a mess of python stuff) is via [`Docker`](https://www.docker.com/), which creates a lightweight virtual container with everything already set up for you. 170 | 171 | A TASMAS image is available on Docker Hub tagged `kaddaok/tasmas`, 172 | or the `dockerfile` is a part of this repo if you want to build the image yourself. 173 | 174 | You just want to make sure that you include `--gpus all`, so that the model can use your GPU if present, 175 | and that you map something as a volume (easiest way is `-v {src}:{dest}` ) so you have access to what you want to process. 176 | 177 | For instance, I put all the recordings I need to transcribe on my N: drive, 178 | and my docker is running in linux so I can access N: from `/mnt/n` in my docker host, and I'll just put it in the same place in the container's file system, 179 | so my docker command looks like this: 180 | ```bash 181 | docker run -it -v /mnt/n:/mnt/n --gpus all kaddaok/tasmas:latest 182 | ``` 183 | Running that gives me a new prompt at `/usr/src/tasmas` in the running container and I can just say `tasmas` straight from there: 184 | ```bash 185 | tasmas semiauto /mnt/n/dnd/icespire/2024-03-17 186 | ``` 187 | and when I'm done using the Docker container, I just type 188 | ```bash 189 | exit 190 | ``` 191 | and I'm back at the regular prompt. 192 | 193 | ### Python 194 | If you're already comfortable with python environments (or optimistically think that it might be easier than setting up docker), you can just run it directly. 195 | 196 | I haven't put this on PyPI yet (and probably need to reorganize it a bit in order to do so) which means you can't yet just say `pip install tasmas`. ❌ 197 | 198 | What you can do, though, is clone or download the contents of this repo, cd to it and then say `pip install . `. That should allow you to use the `tasmas` command. 199 | 200 | You can also just say `python tasmas.py` instead though, if you feel like it. 
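For reference, a typical from-source setup might look like the following (the clone URL is assumed from this repo's GitHub location; adjust paths to your environment):

```bash
# Assumed repo URL; substitute your own clone or downloaded copy
git clone https://github.com/KaddaOK/TASMAS.git
cd TASMAS
pip install .
tasmas --help
```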
-------------------------------------------------------------------------------- /assemble.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import uuid 7 | import warnings 8 | from operator import attrgetter 9 | from typing import List, Dict, Optional 10 | from deepmultilingualpunctuation import PunctuationModel 11 | from tqdm import tqdm 12 | from utils import extract_speaker_name 13 | 14 | class WtWordEncoder(json.JSONEncoder): 15 | def default(self, o): 16 | if isinstance(o, (WtWord, WtWordList)): 17 | return o.to_dict() 18 | return super().default(o) 19 | 20 | class WtWordList: 21 | def __init__(self, words): 22 | if len(set(word.speaker for word in words)) > 1: 23 | raise ValueError("All words in a WtWordList must have the same speaker") 24 | self.words = words 25 | 26 | @property 27 | def speaker(self): 28 | return self.words[0].speaker if self.words else None 29 | 30 | @property 31 | def start(self): 32 | return min(word.start for word in self.words) 33 | 34 | @property 35 | def end(self): 36 | return max(word.end for word in self.words) 37 | 38 | @property 39 | def text(self): 40 | return " ".join(word.text for word in self.words) 41 | 42 | def to_dict(self): 43 | return { 44 | 'speaker': self.speaker, 45 | 'start': self.start, 46 | 'end': self.end, 47 | 'text': self.text, 48 | 'words': [word.to_dict() for word in self.words] 49 | } 50 | 51 | class WtWord: 52 | def __init__(self, speaker, text, start, end): 53 | self.id = uuid.uuid4() 54 | self.speaker = speaker 55 | self.text = text 56 | self.start = start 57 | self.end = end 58 | def to_dict(self): 59 | return { 60 | 'id': str(self.id), 61 | 'speaker': self.speaker, 62 | 'text': self.text, 63 | 'start': self.start, 64 | 'end': self.end 65 | } 66 | 67 | def assemble(input_dir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps): 68 | if input_dir is None: 69 | print("Please provide an input directory.") 70 | return 71 | 72 | print() 73 | print("--------------------") 74 | print("ASSEMBLE") 75 | print("--------------------") 76 | print() 77 | 78 | ext = "words.json" 79 | files = glob.glob(os.path.join(input_dir, '*.' 
+ ext)) 80 | 81 | if not files: 82 | print() 83 | print(f" ERROR: No .{ext} files were found at {input_dir}.") 84 | print() 85 | return 86 | 87 | print(f" Found {len(files)} .{ext} files at {input_dir}.") 88 | print() 89 | 90 | all_chunks = extract_all_chunks(names, no_ellipses, disfluent_comma, no_asterisks, files) 91 | 92 | # Sort all chunks by start timestamp 93 | print(" Sorting...") 94 | sorted_chunks = sorted(all_chunks, key=attrgetter('start', 'speaker', 'end')) 95 | print(" Normalizing...") 96 | normalized_items = normalize_items(sorted_chunks) 97 | """normaljson = os.path.join(input_dir, 'normalized.json') 98 | with open(normaljson, 'w') as f: 99 | f.write(json.dumps(normalized_items, cls=WtWordEncoder, indent=2))""" 100 | print(" Collapsing...") 101 | collapsed_items = collapse_adjacent(normalized_items) 102 | """collapsejson = os.path.join(input_dir, 'collapsed.json') 103 | with open(collapsejson, 'w') as f: 104 | f.write(json.dumps(collapsed_items, cls=WtWordEncoder, indent=2))""" 105 | print(" Formatting...") 106 | # max timestamp value for formatting 107 | max_timestamp = max(value for chunk in sorted_chunks for value in [chunk.start, chunk.end]) 108 | timestamp_digits = int(math.ceil(math.log10(max_timestamp))) 109 | format_string = "{:0{}.2f}".format(max_timestamp, timestamp_digits) 110 | 111 | # Find the maximum width of the speaker name 112 | max_speaker_width = max(len(chunk.speaker) for chunk in sorted_chunks) 113 | print(" Checking for problems...") 114 | out_of_sync_items = get_out_of_sync_items(input_dir, format_string, max_speaker_width, collapsed_items) 115 | if len(out_of_sync_items) > 0: 116 | #run them through the punctuation model and collapse again 117 | print(f" Found {len(out_of_sync_items)} out-of-sync items. Attempting to auto-punctuate and re-collapse...") 118 | 119 | with warnings.catch_warnings(): 120 | warnings.simplefilter("ignore") 121 | punctuationModel = PunctuationModel() 122 | changesMade = 0 123 | for item, _ in out_of_sync_items: 124 | repunctuated = punctuationModel.restore_punctuation(item.text) 125 | if len(repunctuated) > len(item.text): 126 | update_word_texts(item, repunctuated, all_chunks) 127 | changesMade += 1 128 | if changesMade > 0: 129 | print(f" Made changes to punctuation on {changesMade} lines.") 130 | print(" Re-sorting...") 131 | sorted_chunks = sorted(all_chunks, key=attrgetter('start', 'speaker', 'end')) 132 | print(" Re-normalizing...") 133 | normalized_items = normalize_items(sorted_chunks) 134 | print(" Re-collapsing...") 135 | collapsed_items = collapse_adjacent(normalized_items) 136 | print(" Checking for problems again...") 137 | out_of_sync_items = get_out_of_sync_items(input_dir, format_string, max_speaker_width, collapsed_items) 138 | if len(out_of_sync_items) > 0: 139 | write_out_of_sync_items(input_dir, max_speaker_width, out_of_sync_items) 140 | 141 | # option repunctuate everything 142 | """print(" Repunctuating all items...") 143 | punctuationModel = PunctuationModel() 144 | for item in tqdm(collapsed_items): 145 | repunctuated = punctuationModel.restore_punctuation(item.text) 146 | update_word_texts(item, repunctuated, item.words)""" 147 | print(" Writing Output...") 148 | output_items(input_dir, corrections, show_timestamps, format_string, max_speaker_width, collapsed_items) 149 | print("--------------------") 150 | 151 | def extract_all_chunks(names: Optional[Dict[str, str]], no_ellipses: bool, disfluent_comma: bool, no_asterisks: bool, file_names: List[str]) -> List[WtWord]: 152 | all_chunks = [] 153 | 154 
| for file_name in file_names: 155 | print(f" - {os.path.basename(file_name)}") 156 | original_speaker = extract_speaker_name(file_name, 'words.json') 157 | if names and original_speaker in names and (names[original_speaker] is None or names[original_speaker] == ''): 158 | print(f" Skipping because '{original_speaker}' is specified as blank.") 159 | print() 160 | continue 161 | 162 | speaker = names[original_speaker] if names and original_speaker in names else original_speaker 163 | print(f" Extracting chunks for {speaker} ({original_speaker})...") 164 | extracted_chunks = extract_chunks_from_file(file_name, speaker) 165 | if disfluent_comma: 166 | insert_commas_at_disfluencies(extracted_chunks, no_asterisks) 167 | undisfluent_chunks = [c for c in extracted_chunks if c.text.strip() != "[*]"] 168 | if not no_ellipses: 169 | insert_ellipses_at_likely_breaks(undisfluent_chunks, no_asterisks) 170 | all_chunks.extend(undisfluent_chunks) 171 | print() 172 | 173 | return all_chunks 174 | 175 | def normalize_items(sorted_chunks: List[WtWord]) -> List[WtWordList]: 176 | normalized_items = [] 177 | current_sentences = {} 178 | 179 | for chunk in sorted_chunks: 180 | if chunk.text == "[*]": 181 | # this is just a scrubbed disfluency, skip it 182 | continue 183 | 184 | if chunk.speaker not in current_sentences or current_sentences[chunk.speaker] is None: 185 | current_sentences[chunk.speaker] = WtWordList([chunk]) 186 | else: 187 | current_sentences[chunk.speaker].words.append(chunk) 188 | 189 | last_character = chunk.text.strip()[-1] 190 | if last_character in ['.', '!', '?', '-', ',', '~']: 191 | normalized_items.append(current_sentences[chunk.speaker]) 192 | current_sentences[chunk.speaker] = None 193 | 194 | # catch any leftovers 195 | leftovers = [v for k, v in current_sentences.items() if v is not None] 196 | leftovers.sort(key=lambda x: x.start) 197 | normalized_items.extend(leftovers) 198 | 199 | return normalized_items 200 | 201 | def collapse_adjacent(normalized_items: List[WtWordList]) -> List[WtWordList]: 202 | current_chunk = None 203 | 204 | collapsed_items = [] 205 | 206 | for nextchunk in normalized_items: 207 | if current_chunk is None: 208 | current_chunk = nextchunk 209 | continue 210 | elif nextchunk.speaker != current_chunk.speaker: 211 | collapsed_items.append(current_chunk) 212 | current_chunk = nextchunk 213 | else: 214 | current_chunk.words.extend(nextchunk.words) 215 | 216 | # Output the last chunk 217 | if current_chunk is not None: 218 | collapsed_items.append(current_chunk) 219 | 220 | return collapsed_items 221 | 222 | def get_out_of_sync_items(input_dir, format_string, max_speaker_width, collapsed_items): 223 | now_running_time = 0 224 | out_of_sync_items = [] 225 | 226 | for item in collapsed_items: 227 | if item.start < now_running_time - 5: 228 | out_of_sync_items.append((item, round(now_running_time - item.start, 2))) 229 | now_running_time = item.start 230 | 231 | return out_of_sync_items 232 | 233 | def write_out_of_sync_items(input_dir, max_speaker_width, potentially_needing_punctuation): 234 | print() 235 | print(f" NOTE: There are {len(potentially_needing_punctuation)} lines with significantly earlier start timestamps than the previous line, as shown.") 236 | print(" Manually adding a punctuation mark to the end of one of the words of these lines in the .words.json would allow them to split at that spot, that part of them appearing before the previous speaker's line:") 237 | print() 238 | potential_punctuation_string = "" 239 | max_item_offset_length = 
max(len(str(x[1])) for x in potentially_needing_punctuation) 240 | max_start_length = max(len(str(x[0].start)) for x in potentially_needing_punctuation) 241 | max_end_length = max(len(str(x[0].end)) for x in potentially_needing_punctuation) 242 | 243 | 244 | for item in potentially_needing_punctuation: 245 | item_string = f" [{f'{item[0].start:.2f}'.rjust(max_start_length)}-{f'{item[0].end:.2f}'.rjust(max_end_length)}] {f'-{item[1]:.2f}s'.rjust(max_item_offset_length +2)} {item[0].speaker.rjust(max_speaker_width)}: {item[0].text}" 246 | print(item_string) 247 | potential_punctuation_string += item_string + "\n" 248 | 249 | punctuation_review_path = os.path.join(input_dir, "outOfSyncItems.txt") 250 | with open(punctuation_review_path, 'w') as file: 251 | file.write(potential_punctuation_string) 252 | print() 253 | print(f" This list has been saved to {punctuation_review_path} for review.") 254 | print() 255 | 256 | def update_word_texts(word_list: WtWordList, new_text: str, original_words: List[WtWord]): 257 | new_words = new_text.split() 258 | 259 | if len(new_words) != len(word_list.words): 260 | raise ValueError(f"The new text does not have the same number of words as the original. New count: {len(new_words)}, Old count: {len(word_list.words)}, text: '{new_text}' Original text: '{word_list.text}'") 261 | 262 | for wt_word, new_word in zip(word_list.words, new_words): 263 | if wt_word.text != new_word: 264 | for original_word in original_words: 265 | if original_word.id == wt_word.id: 266 | original_word.text = new_word 267 | break 268 | 269 | def output_items(input_dir, corrections, show_timestamps, format_string, max_speaker_width, collapsed_items): 270 | output_builder = [] 271 | 272 | for item in collapsed_items: 273 | timestamp = f"[{item.start}-{item.end}] " if show_timestamps else "" 274 | out_string = f"{timestamp}{item.speaker.rjust(max_speaker_width)}: \"{item.text}\"" 275 | 276 | if corrections is not None: 277 | for key, value in corrections.items(): 278 | out_string = out_string.replace(key, value) 279 | 280 | output_builder.append(out_string) 281 | 282 | output_path = os.path.join(input_dir, "transcript.txt") 283 | with open(output_path, 'w') as file: 284 | file.write('\n'.join(output_builder)) 285 | print(f" {len(output_builder)} lines saved as {output_path}") 286 | 287 | def ends_with_break(chunk_text): 288 | last_character = chunk_text.strip()[-1] 289 | return last_character in ['.', '!', '?', '-', ',', '~'] 290 | 291 | def extract_chunks_from_file(file_path: str, speaker: str) -> List[WtWord]: 292 | chunks = [] 293 | 294 | try: 295 | with open(file_path, 'r') as file: 296 | json_content = file.read() 297 | jsonData = json.loads(json_content) 298 | 299 | for chunkData in jsonData['segments']: 300 | segment_chunks = [] 301 | for word in chunkData['words']: 302 | chunk = WtWord( 303 | speaker = speaker, 304 | text = word['text'].strip() if word['text'] else "", 305 | start = word['start'], 306 | end = word['end'] 307 | ) 308 | segment_chunks.append(chunk) 309 | 310 | chunks.extend(segment_chunks) 311 | except Exception as ex: 312 | print(f"Error processing file {file_path}: {str(ex)}") 313 | 314 | return chunks 315 | 316 | def insert_commas_at_disfluencies(segment_chunks: List[WtWord], no_asterisks: bool): 317 | if len(segment_chunks) > 1: 318 | for i in range(1, len(segment_chunks)): 319 | current_word = segment_chunks[i] 320 | prvs_word = segment_chunks[i - 1] 321 | if current_word.speaker != prvs_word.speaker: 322 | raise ValueError("Only meant to be used on 
single-speaker collection") 323 | if current_word.text == "[*]" and not ends_with_break(prvs_word.text): 324 | prvs_word.text = prvs_word.text.strip() + ("" if no_asterisks else "*") + "," 325 | 326 | def insert_ellipses_at_likely_breaks(segment_chunks: List[WtWord], no_asterisks: bool): 327 | if len(segment_chunks) > 1: 328 | for i in range(1, len(segment_chunks)): 329 | current_word = segment_chunks[i] 330 | prvs_word = segment_chunks[i - 1] 331 | if current_word.speaker != prvs_word.speaker: 332 | raise ValueError("Only meant to be used on single-speaker collection") 333 | if current_word.start - prvs_word.end > 10 and not ends_with_break(prvs_word.text): 334 | prvs_word.text = prvs_word.text.strip() + ("" if no_asterisks else "*") + "..." --------------------------------------------------------------------------------