├── requirements.txt ├── dockerfile ├── prompt_dnd_2.txt ├── utils.py ├── prompt_dnd_1.txt ├── setup.py ├── LICENSE ├── recognize.py ├── summarize.py ├── .gitignore ├── configuration.py ├── tasmas.py ├── README.md └── assemble.py /requirements.txt: -------------------------------------------------------------------------------- 1 | whisper_timestamped==1.14.4 2 | auditok==0.2.0 3 | deepmultilingualpunctuation==1.0.1 4 | openai==1.23.6 -------------------------------------------------------------------------------- /dockerfile: -------------------------------------------------------------------------------- 1 | # you'll need to have already done: 2 | # docker build https://github.com/linto-ai/whisper-timestamped.git -t whisper_timestamped 3 | FROM whisper_timestamped 4 | 5 | # install packages it doesn't include 6 | RUN pip install --no-cache-dir deepmultilingualpunctuation openai 7 | 8 | WORKDIR /usr/src/tasmas 9 | 10 | COPY . /usr/src/tasmas 11 | 12 | RUN cd /usr/src/tasmas/ && pip3 install . -------------------------------------------------------------------------------- /prompt_dnd_2.txt: -------------------------------------------------------------------------------- 1 | Prompt: 2 | Given the following transcript of a D&D session, generate separate bulleted lists for each party member detailing specific events and experiences they encountered. 3 | Additionally, provide a comprehensive list of all items that were exchanged or obtained, specifying their name, origin, and recipient. 4 | Finally, include information about the scheduled upcoming sessions. 5 | 6 | Transcript: 7 | -------------------------------------------------------------------------------- /utils.py: -------------------------------------------------------------------------------- 1 | import os 2 | import re 3 | 4 | def extract_speaker_name(file, extension): 5 | file_name = os.path.basename(file) # strip off the path 6 | regex = r"^\d*[-_]?(.+?)(?:_0)?(?:\.[a-zA-Z0-9]{2,4})?\." + re.escape(extension) + "$" 7 | match = re.match(regex, file_name) 8 | if match: 9 | return match.group(1) 10 | else: 11 | raise ValueError(f"Could not parse speaker name from filename '{file_name}'") -------------------------------------------------------------------------------- /prompt_dnd_1.txt: -------------------------------------------------------------------------------- 1 | Prompt: 2 | Generate a detailed summary of a Dungeons & Dragons session based on the provided transcript. Pay attention to dialogue attribution, changes in inventory or quest progress, significant events, and exclude any irrelevant banter or out-of-character jokes. Ensure that the summary accurately captures the narrative elements provided by the Gamemaster, distinguishing between narrative details and NPC dialogue. Produce a comprehensive summary that provides enough context for future sessions and potential plot developments. 3 | 4 | Length: Please generate a detailed summary of the session, ensuring that it captures all essential details while maintaining clarity and coherence. 
5 | 6 | Transcript: 7 | -------------------------------------------------------------------------------- /setup.py: -------------------------------------------------------------------------------- 1 | from setuptools import setup, find_packages 2 | 3 | setup(name='tasmas', 4 | version='0.1', 5 | author='Kadda OK', 6 | description='TASMAS (Transcribe And Summarize Multiple Audio Stems) transcribes and interleaves per-speaker audio recordings into a single threaded transcript, which it can optionally then summarize.', 7 | py_modules=['tasmas', 'assemble', 'configuration', 'recognize', 'summarize', 'utils'], 8 | install_requires=[ 9 | 'whisper_timestamped', 10 | 'auditok', 11 | 'deepmultilingualpunctuation', 12 | 'openai' 13 | ], 14 | entry_points={ 15 | 'console_scripts': [ 16 | 'tasmas=tasmas:main' 17 | ], 18 | }) -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kadda OK 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /recognize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import json 3 | import glob 4 | from typing import Dict 5 | import whisper_timestamped as whisper 6 | 7 | from utils import extract_speaker_name 8 | 9 | def recognize(input_dir: str, names: Dict[str, str], fast: bool = False, model_type: str = "small", device: str = "cuda", audio_ext: str = "ogg"): 10 | model_type = "tiny" if fast else model_type 11 | model = whisper.load_model(model_type, device=device) 12 | 13 | print() 14 | print("--------------------") 15 | print("RECOGNIZE") 16 | print("--------------------") 17 | print() 18 | 19 | files = glob.glob(os.path.join(input_dir, '*.' 
+ audio_ext)) 20 | 21 | if not files: 22 | print(f" No {audio_ext} files were found at {input_dir}.") 23 | print() 24 | return 25 | 26 | print(f" {len(files)} {audio_ext} files found at {input_dir}.") 27 | for audio_file in files: 28 | print(f" - {audio_file}...") 29 | speaker = extract_speaker_name(audio_file, audio_ext) 30 | if speaker in names and (names[speaker] is None or names[speaker] == ''): 31 | print(f" Skipping {audio_file} because '{speaker}' is specified as blank.") 32 | print() 33 | continue 34 | else: 35 | audio = whisper.load_audio(os.path.join(input_dir, audio_file)) 36 | if fast: 37 | results = whisper.transcribe(model, audio, detect_disfluencies=True, vad="auditok") 38 | else: 39 | results = whisper.transcribe(model, audio, detect_disfluencies=True, vad="auditok", beam_size=5, best_of=5, temperature=(0.0, 0.2, 0.4, 0.6, 0.8, 1.0)) 40 | 41 | json_file = os.path.join(input_dir, audio_file + '.words.json') 42 | with open(json_file, 'w') as f: 43 | f.write(json.dumps(results)) 44 | print(f" Saved to {json_file}") 45 | print() 46 | print("--------------------") -------------------------------------------------------------------------------- /summarize.py: -------------------------------------------------------------------------------- 1 | import os 2 | import sys 3 | import glob 4 | import textwrap 5 | from openai import OpenAI 6 | 7 | def do_summary(transcript, client, prompt_file): 8 | with open(prompt_file, 'r') as file: 9 | prompt = file.read() 10 | 11 | completion = client.chat.completions.create(model="gpt-4-0125-preview", 12 | messages=[ 13 | {"role": "system", "content" : "You are a chatbot which can summarize long transcripts."}, 14 | {"role": "user", "content" : f'{prompt}{transcript}'}, 15 | ]) 16 | 17 | return completion.choices[0].message.content 18 | 19 | def summarize(input_dir, prompt_files, openai_api_key): 20 | 21 | if input_dir is None: 22 | print("Please provide an input directory.") 23 | return 24 | 25 | print() 26 | print("--------------------") 27 | print("SUMMARIZE") 28 | print("--------------------") 29 | print() 30 | 31 | transcript_path = os.path.join(input_dir, 'transcript.txt') 32 | if not os.path.exists(transcript_path): 33 | print("transcript.txt not found in the input directory.") 34 | sys.exit(1) 35 | 36 | with open(transcript_path, 'r') as file: 37 | transcript = file.read() 38 | 39 | if not prompt_files: 40 | print(" No prompts to use to summarize.") 41 | return 42 | 43 | client = OpenAI(api_key=openai_api_key) 44 | 45 | for prompt_file in prompt_files: 46 | # Call your command here 47 | print() 48 | print(f" - Prompt {prompt_file}...") 49 | 50 | summary = do_summary(transcript, client, prompt_file) 51 | 52 | filename = os.path.splitext(os.path.basename(prompt_file))[0].replace("prompt_", "") 53 | filename = f"summary_{filename}.txt" 54 | output_path = os.path.join(input_dir, filename) 55 | with open(output_path, 'w') as file: 56 | file.write(summary) 57 | print() 58 | print(" Result:") 59 | print(" ---------") 60 | terminal_width = os.get_terminal_size().columns 61 | # Split the summary into lines, then indent and wrap each line 62 | summary_lines = summary.split('\n') 63 | wrapped_summary = '\n'.join('\n'.join(textwrap.wrap(line, width=terminal_width, initial_indent=' ', subsequent_indent=' ')) for line in summary_lines) 64 | print(wrapped_summary) 65 | print(" ---------") 66 | print(f" Written to {output_path}.") 67 | print() 68 | print() 69 | 70 | 71 | -------------------------------------------------------------------------------- 
/.gitignore: -------------------------------------------------------------------------------- 1 | # Byte-compiled / optimized / DLL files 2 | __pycache__/ 3 | *.py[cod] 4 | *$py.class 5 | 6 | # C extensions 7 | *.so 8 | 9 | # Distribution / packaging 10 | .Python 11 | build/ 12 | develop-eggs/ 13 | dist/ 14 | downloads/ 15 | eggs/ 16 | .eggs/ 17 | lib/ 18 | lib64/ 19 | parts/ 20 | sdist/ 21 | var/ 22 | wheels/ 23 | share/python-wheels/ 24 | *.egg-info/ 25 | .installed.cfg 26 | *.egg 27 | MANIFEST 28 | 29 | # PyInstaller 30 | # Usually these files are written by a python script from a template 31 | # before PyInstaller builds the exe, so as to inject date/other infos into it. 32 | *.manifest 33 | *.spec 34 | 35 | # Installer logs 36 | pip-log.txt 37 | pip-delete-this-directory.txt 38 | 39 | # Unit test / coverage reports 40 | htmlcov/ 41 | .tox/ 42 | .nox/ 43 | .coverage 44 | .coverage.* 45 | .cache 46 | nosetests.xml 47 | coverage.xml 48 | *.cover 49 | *.py,cover 50 | .hypothesis/ 51 | .pytest_cache/ 52 | cover/ 53 | 54 | # Translations 55 | *.mo 56 | *.pot 57 | 58 | # Django stuff: 59 | *.log 60 | local_settings.py 61 | db.sqlite3 62 | db.sqlite3-journal 63 | 64 | # Flask stuff: 65 | instance/ 66 | .webassets-cache 67 | 68 | # Scrapy stuff: 69 | .scrapy 70 | 71 | # Sphinx documentation 72 | docs/_build/ 73 | 74 | # PyBuilder 75 | .pybuilder/ 76 | target/ 77 | 78 | # Jupyter Notebook 79 | .ipynb_checkpoints 80 | 81 | # IPython 82 | profile_default/ 83 | ipython_config.py 84 | 85 | # pyenv 86 | # For a library or package, you might want to ignore these files since the code is 87 | # intended to run in multiple environments; otherwise, check them in: 88 | # .python-version 89 | 90 | # pipenv 91 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 92 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 93 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 94 | # install all needed dependencies. 95 | #Pipfile.lock 96 | 97 | # poetry 98 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 99 | # This is especially recommended for binary packages to ensure reproducibility, and is more 100 | # commonly ignored for libraries. 101 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 102 | #poetry.lock 103 | 104 | # pdm 105 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 106 | #pdm.lock 107 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 108 | # in version control. 109 | # https://pdm.fming.dev/#use-with-ide 110 | .pdm.toml 111 | 112 | # PEP 582; used by e.g. 
github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 113 | __pypackages__/ 114 | 115 | # Celery stuff 116 | celerybeat-schedule 117 | celerybeat.pid 118 | 119 | # SageMath parsed files 120 | *.sage.py 121 | 122 | # Environments 123 | .env 124 | .venv 125 | env/ 126 | venv/ 127 | ENV/ 128 | env.bak/ 129 | venv.bak/ 130 | 131 | # Spyder project settings 132 | .spyderproject 133 | .spyproject 134 | 135 | # Rope project settings 136 | .ropeproject 137 | 138 | # mkdocs documentation 139 | /site 140 | 141 | # mypy 142 | .mypy_cache/ 143 | .dmypy.json 144 | dmypy.json 145 | 146 | # Pyre type checker 147 | .pyre/ 148 | 149 | # pytype static type analyzer 150 | .pytype/ 151 | 152 | # Cython debug symbols 153 | cython_debug/ 154 | 155 | # PyCharm 156 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 157 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 158 | # and can be added to the global gitignore or merged into this file. For a more nuclear 159 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 160 | #.idea/ 161 | -------------------------------------------------------------------------------- /configuration.py: -------------------------------------------------------------------------------- 1 | import os 2 | import argparse 3 | 4 | def get_configuration(args): 5 | parser = argparse.ArgumentParser(formatter_class=argparse.RawTextHelpFormatter, 6 | description='''Multi-Stem Conversational Transcriber 7 | ''') 8 | parser.add_argument('operationMode', type=str, 9 | choices=['recognize', 'assemble', 'summarize', 'semiauto', 'fullauto'], 10 | help='''Which step to perform: 11 | - recognize: Transcribes all audio files found at the 12 | path using whisper_timestamped and writes 13 | a .words.json file for each. 14 | - assemble: Arranges by timecode the contents of all 15 | .words.json files found at the path, 16 | switching speakers at punctuation, to 17 | produce a readable transcript.txt. 18 | - summarize: Calls OpenAI API to summarize the 19 | transcript.txt at the path using 20 | configurable prompts. 21 | - semiauto: Runs recognize followed immediately by 22 | assemble. (This is the recommended first 23 | pass mode, as it is common to iterate on 24 | assemble multiple times making manual 25 | tweaks to the .words.json files.) 26 | - fullauto: Performs all steps in succession. 27 | ''') 28 | parser.add_argument('inputDir', type=str, help='The path to the files to process.') 29 | 30 | recognizeConfigGroup = parser.add_argument_group('recognize mode options') 31 | recognizeConfigGroup.add_argument('--extension', type=str, help='''File extension of the audio files to transcribe. 32 | Defaults to "ogg", for use with Craig recordings, but 33 | I would think that things like "wav" or "flac" would 34 | work too. 35 | ''') 36 | recognizeConfigGroup.add_argument('--fast', action='store_true', 37 | help='''Prioritize recognition speed over accuracy. 38 | Results in the following changes: 39 | - Uses the "tiny" model instead of the "small" model 40 | - Uses "efficient" params rather than "accurate" ones 41 | Honestly this really doesn't work well at all and I 42 | do not recommend it. 43 | ''' 44 | ) 45 | 46 | assembleConfigGroup = parser.add_argument_group('assemble mode options') 47 | assembleConfigGroup.add_argument('--noEllipses', action='store_true', help='''This script normally inserts ellipses (...) 
into the 48 | transcript whenever a word is more than 5s after its 49 | predecessor, allowing a speaker change (which is done 50 | on punctuation). 51 | The --noEllipses switch suppresses this behavior. 52 | ''') 53 | assembleConfigGroup.add_argument('--disfluentComma', action='store_true', help='''Replace detected disfluencies (e.g. "um", "uh") with a 54 | comma in the transcript. 55 | This may help if you are using --noEllipses. 56 | ''') 57 | assembleConfigGroup.add_argument('--noAsterisks', action='store_true', help='''When this script inserts ellipses or disfluency commas 58 | into the transcript, it marks them with an asterisk (*) 59 | for reference. 60 | The --noAsterisks switch suppresses this behavior. 61 | ''') 62 | assembleConfigGroup.add_argument('--showTimestamps', action='store_true', help='''Include the start and end seconds of the phrase in 63 | front of each line in the transcript. 64 | i.e. [1905.39-1907.05] Joe: "Look a timestamp." 65 | ''') 66 | assembleConfigGroup.add_argument('--corrections', type=str, help='''A list of known incorrect values to replace in the 67 | transcript output. This is a quick way to correct 68 | frequently misinterpreted text such as unusual names. 69 | Each entry is the correct word or phrase with a list of 70 | incorrect ones. For example, 71 | '{"Elsalor":["Elcelor", "I'll solar", "else the Lord"], 72 | "A'Dhem" :["Adam"] }' 73 | This can be a path to a .json file or the actual JSON. 74 | ''') 75 | assembleConfigGroup.add_argument('--names', type=str, help='''Replacements for the speaker names as recorded in the 76 | filenames by discord/Craig. These should reflect the 77 | names used by speakers to refer to each other in the 78 | recordings. For example: 79 | '{ "joey__0": "Joe", 80 | "randointernet3000_0": "Bob" }' 81 | This can be a path to a .json file or the actual JSON. 82 | You will be prompted individually for any values not 83 | found here (and given the opportunity to skip that 84 | audio stem). 85 | ''') 86 | 87 | summarizeConfigGroup = parser.add_argument_group('summarize mode options') 88 | summarizeConfigGroup.add_argument('--promptType', type=str, help=''' 89 | This script will call OpenAI's GPT-4 API to summarize 90 | the transcript as many times as it is given prompts to 91 | do so. It will attempt to find text files with the name 92 | pattern "prompt_{promptType}_*.txt", in the following 93 | order: 94 | - in the `inputDir` 95 | - one level above the `inputDir` 96 | - in the location of this script 97 | ''') 98 | summarizeConfigGroup.add_argument('--openApiKey', type=str, help='''Due to current LLM token limits (Q1 2024) and the very 99 | large number of tokens needed to summarize transcripts 100 | of much length, the summarize operation calls ChatGPT 101 | 4 Turbo (128k tokens). As such, an OpenAI API key is 102 | required to run in summarize (or fullauto) mode. 103 | (It'll probably cost you about $0.10 USD per call.) 
104 | ''') 105 | 106 | config = vars(parser.parse_args(args)) 107 | 108 | return config -------------------------------------------------------------------------------- /tasmas.py: -------------------------------------------------------------------------------- 1 | import sys 2 | import json 3 | import os 4 | import re 5 | import glob 6 | import readline 7 | from typing import Dict, Optional 8 | import torch 9 | import whisper_timestamped as whisper 10 | from configuration import get_configuration 11 | from recognize import recognize 12 | from assemble import assemble 13 | from summarize import summarize 14 | from utils import extract_speaker_name 15 | 16 | def json_string_or_path(json_string_or_path): 17 | if not json_string_or_path: 18 | return None 19 | 20 | deserialized_object = None 21 | try: 22 | deserialized_object = json.loads(json_string_or_path) 23 | except json.JSONDecodeError: 24 | # If it's not a valid JSON string, treat it as a file path 25 | if os.path.exists(json_string_or_path): 26 | with open(json_string_or_path, 'r') as f: 27 | deserialized_object = json.load(f) 28 | 29 | return deserialized_object 30 | 31 | def load_names(names_setting, input_dir): 32 | names = json_string_or_path(names_setting) 33 | if names is None: 34 | # Look for names.json in input_dir 35 | names_file_path = os.path.join(input_dir, 'names.json') 36 | if not os.path.exists(names_file_path): 37 | # If not found, look one folder up 38 | names_file_path = os.path.join(input_dir, '..', 'names.json') 39 | if os.path.exists(names_file_path): 40 | # If found, prompt the user whether to use it 41 | use_names_file = input(f" Found a names file at {names_file_path}. Do you want to use it? (y/n): ") 42 | if use_names_file.lower() == 'y': 43 | with open(names_file_path, 'r') as f: 44 | names = json.load(f) 45 | print(f" Loaded {len(names)} speaker name{'s' if len(names) > 1 else ''} from {names_file_path}.") 46 | else: 47 | print(f" Loaded {len(names)} speaker name{'s' if len(names) > 1 else ''}.") 48 | 49 | return names 50 | 51 | def load_corrections(corrections_setting, input_dir): 52 | corrections = None 53 | correction_setting_dic = json_string_or_path(corrections_setting) 54 | if correction_setting_dic is None: 55 | # Look for corrections.json in input_dir 56 | corrections_file_path = os.path.join(input_dir, 'corrections.json') 57 | if not os.path.exists(corrections_file_path): 58 | # If not found, look one folder up 59 | corrections_file_path = os.path.join(input_dir, '..', 'corrections.json') 60 | if os.path.exists(corrections_file_path): 61 | # If found, prompt the user whether to use it 62 | use_corrections_file = input(f" Found a corrections file at {corrections_file_path}. Do you want to use it? (y/n): ") 63 | if use_corrections_file.lower() == 'y': 64 | with open(corrections_file_path, 'r') as f: 65 | correction_setting_dic = json.load(f) 66 | print(f" Loaded corrections from {corrections_file_path}.") 67 | if correction_setting_dic is not None: 68 | # this was defined as "correct string": ["incorrect string", "incorrect string", ...] 
because that's 69 | # easier to write out multiple corrections to the same value, but now we need to flip it so that we 70 | # can actually use the dictionary to look up words and see if they need correcting 71 | corrections = {incorrect: correct for correct, incorrects in correction_setting_dic.items() for incorrect in incorrects} 72 | print(f" Loaded {len(corrections)} correction{'s' if len(corrections) > 1 else ''}.") 73 | print() 74 | 75 | return corrections 76 | 77 | def check_names(names: Optional[Dict[str, str]], files, extension): 78 | if names is None: 79 | names = {} 80 | for file in files: 81 | speaker_name = extract_speaker_name(file, extension) 82 | if speaker_name not in names: 83 | print() 84 | readline.set_startup_hook(lambda: readline.insert_text(speaker_name)) 85 | try: 86 | value = input(f" Enter the proper speaker name for '{speaker_name}' (press enter to accept, or backspace it all and enter nothing to skip this file): ") 87 | finally: 88 | readline.set_startup_hook() # remove hook again 89 | names[speaker_name] = value if value else None 90 | return names 91 | 92 | def load_prompt_files(input_dir, prompt_type): 93 | prompt_files = [] 94 | directories = [input_dir, os.path.dirname(input_dir), os.path.dirname(os.path.realpath(__file__))] 95 | 96 | for directory in directories: 97 | files = glob.glob(os.path.join(directory, f'prompt_{prompt_type}_*.txt')) 98 | if files: 99 | print() 100 | print(f" Found the following prompt files in {directory}:") 101 | print() 102 | for file in files: 103 | print(f" - {file}") 104 | print() 105 | use_files = input(" Use these files? (y/n): ") 106 | if use_files.lower() == 'y': 107 | prompt_files.extend(files) 108 | break 109 | print() 110 | 111 | if not prompt_files: 112 | print(" No prompt files found.") 113 | 114 | return prompt_files 115 | def check_cuda(): 116 | if not torch.cuda.is_available(): 117 | print("\033[93m WARNING: CUDA (gpu support) is not available!\n" 118 | "\n If you are in Docker, you may have forgotten to specify `--gpus all`." 119 | "\n Otherwise, this is a bit more of a rabbit hole than can be delved here " 120 | "\n (it depends on your operating system and environment, but it's quite " 121 | "\n googleable).\n" 122 | "\n You can try to continue without it, but:" 123 | "\n - RECOGNIZE may be excruciatingly slow, or just not work at all." 124 | "\n - ASSEMBLE may fail when trying to auto-repunctuate out of sync items.\n" 125 | "\n (SUMMARIZE workloads should be unaffected.)\n \033[0m") 126 | response = input("Do you want to continue running? 
(y/n): ") 127 | if response.lower() not in ["y", "yes"]: 128 | exit() 129 | else: 130 | print(" CUDA is available.") 131 | 132 | def main(): 133 | # sys.argv contains the command-line arguments 134 | # sys.argv[0] is the script name 135 | # sys.argv[1:] are the arguments passed to the script 136 | args = sys.argv[1:] 137 | config = get_configuration(args) 138 | inputDir = config['inputDir'] 139 | no_ellipses = config.get('noEllipses', False) 140 | disfluent_comma = config.get('disfluentComma', False) 141 | no_asterisks = config.get('noAsterisks', False) 142 | show_timestamps = config.get('showTimestamps', False) 143 | 144 | print() 145 | print("--------------------") 146 | print("PRE-CHECK") 147 | print("--------------------") 148 | print() 149 | 150 | check_cuda() 151 | corrections = load_corrections(config.get('corrections'), inputDir) 152 | 153 | operation = config['operationMode'] 154 | if operation in ['recognize', 'semiauto', 'fullauto']: 155 | check_names_extension = config.get('extension', 'ogg').strip() or 'ogg' 156 | else: 157 | check_names_extension = 'words.json' 158 | 159 | files = glob.glob(os.path.join(inputDir, f"*.{check_names_extension}")) 160 | 161 | if not files: 162 | print() 163 | print(f" No {check_names_extension} files were found at {inputDir}.") 164 | print() 165 | sys.exit() 166 | 167 | print(f" Found {len(files)} files to work on at {inputDir}:") 168 | print() 169 | for file in files: 170 | filename = os.path.basename(file) 171 | print(f' - {filename}') 172 | print() 173 | names = check_names(load_names(config.get('names'), inputDir), files, check_names_extension) 174 | 175 | openai_api_key = config.get('openApiKey') 176 | prompt_type = config.get('promptType') 177 | prompt_files = [] 178 | if operation in ['summarize', 'fullauto']: 179 | if (prompt_type is None) or (prompt_type == ''): 180 | print(" Prompt Type is required for summarize (or fullauto) operation mode.") 181 | sys.exit() 182 | prompt_files = load_prompt_files(inputDir, prompt_type) 183 | if not prompt_files: 184 | print(" At least one prompt file must be found for summarize (or fullauto) operation mode.") 185 | sys.exit() 186 | if (openai_api_key is None) or (openai_api_key == ''): 187 | print(" OpenAI API key is required for summarize (or fullauto) operation mode.") 188 | sys.exit() 189 | 190 | operation_modes = { 191 | 'recognize': lambda: recognize(inputDir, names, config['fast']), 192 | 'assemble': lambda: assemble(inputDir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps), 193 | 'summarize': lambda: summarize(inputDir, prompt_files, openai_api_key), 194 | 'semiauto': lambda: [recognize(inputDir, names, config['fast']), assemble(inputDir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps)], 195 | 'fullauto': lambda: [recognize(inputDir, names, config['fast']), assemble(inputDir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps), summarize(inputDir, prompt_files, openai_api_key)] 196 | } 197 | 198 | print("--------------------") 199 | 200 | if operation in operation_modes: 201 | operation_modes[operation]() 202 | else: 203 | print(f"Invalid operation: {operation}") 204 | 205 | if __name__ == '__main__': 206 | main(sys.argv[1:]) -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # TASMAS (*Transcribe And Summarize Multiple Audio Stems*) 2 | 3 | This is an automatic interleaving 
transcriber and summarizer for file-per-speaker audio recordings, such as Discord calls recorded by [`Craig`](https://craig.chat/) or a similar bot. 4 | 5 | You point it at a folder that contains audio files, 6 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/1ce0e427-9670-4d2a-a877-d1175cd2c8d9) 7 | and it will generate transcripts of each file timestamped at the word level, 8 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/77d2e0b8-96bf-4b16-8c91-23f43e16d0bb)![image](https://github.com/KaddaOK/TASMAS/assets/151568451/3b5bf487-4a72-45e6-b5a9-b6fd784e0a16) 9 | then braid each phrase from the files into a single coherent attributed transcript, 10 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/d4add246-f1dc-4c9b-b098-c48ea3100cbb) 11 | and optionally get summaries of that transcript as well. 12 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/38d00f66-5400-42ec-a9c0-756766a2afee) 13 | (Okay yes that's not a "real" automatic summary output, but you get the point, I just wanted to highlight how the transcript looks when people are talking over each other) 14 | 15 | 16 | # Operating Modes 17 | TASMAS has 3 operating modes, each of which can be executed independently, 18 | as well as a `SEMIAUTO` mode which executes the first two modes in sequence (this is the recommended initial run), 19 | and a `FULLAUTO` mode which executes all 3 (not recommended, as manual fix-up after stage 2 is usually a good idea). 20 | 21 | ## `RECOGNIZE`: 22 | 23 | *Given a file path that contains a number of separate audio files, 24 | transcribe each file down to word-level timestamps, saving each as `{filename}.words.json`.* 25 | 26 | ### File formats 27 | As designed and tested, this operates on `.ogg` files recorded using the [`Craig`](https://craig.chat/) bot for Discord, but it could theoretically be any audio file that `whisper` can handle if you specify the `--extension` switch. 28 | 29 | ### Models and options 30 | This mode uses `whisper_timestamped` to transcribe the files, using the `small` model, with disfluency detection enabled and the Auditok VAD mode, and using the beam and temp etc parameters described as "accurate". 31 | 32 | (As of right now there's a configuration switch `--fast` which causes it to use the `"tiny"` model instead and not use the "accurate" parameters, but honestly it's not usable and I'm going to just take it out entirely in a subsequent release.) 33 | 34 | Personally I didn't get any more meaningful results out of using larger models, and in fact `small` seemed to work the best anyway, so I didn't follow through on model selection options. 35 | 36 | ## `ASSEMBLE`: 37 | 38 | *Given a file path that contains a bunch of separate `.words.json` files, sort and interleave these into one coherent human-readable transcript, saved as `transcript.txt`.* 39 | 40 | ### Speaker Identification 41 | As TASMAS is intended for recordings that have a separate file for each different speaker, the filename is used to identify the speaker. 42 | 43 | After any formatting idiomatic to `Craig` (a leading `n-` and trailing `_0`) is stripped out, the rest will be compared to the contents of `--names` if specified, or used directly as the speaker name if nothing is found. (See below under Usage) 44 | 45 | If a `names.json` file is found in the input path or its containing folder, it will be used for `--names` automatically, allowing you to set up this information once and have it be continually re-used for other recordings with the same speakers. 
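For example (the filename here is made up for illustration), a Craig-style stem named `1-JohnTheJester_0.ogg` yields the speaker key `JohnTheJester`, which a `names.json` entry can then map to the display name used in the transcript. A minimal sketch of that lookup, using this repo's own `extract_speaker_name` helper:

```python
# Illustrative sketch; the filename and names below are hypothetical examples.
import json
from utils import extract_speaker_name

key = extract_speaker_name("1-JohnTheJester_0.ogg", "ogg")  # -> "JohnTheJester"

with open("names.json") as f:   # e.g. {"JohnTheJester": "John"}
    names = json.load(f)

# An unmapped key is used as-is; a key mapped to blank/None tells TASMAS to skip that stem.
print(names.get(key, key))      # -> "John"
```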
46 | 47 | TASMAS will also interrogate the user during the PRE-CHECK phase to verify any speakers it encounters whose names were not specified by `--names` or a `names.json`. 48 | 49 | ### Punctuation-based Interleaving 50 | TASMAS sorts all the words by timestamp and assembles sentences sequentially, only allowing the current speaker to change when a word ends with a punctuation mark (`.`, `,`, `!`, `?`, `-`). 51 | This allows cross-talk to be inserted in as accurate and followable an order as possible without each word being split up. 52 | 53 | Note that this is therefore only possible when all of the audio files are synchronized to start at the same moment, even if that speaker was not yet present. `Craig` does this automatically, but if your source does not, you may need to edit your audio files accordingly. 54 | 55 | ### Anticipate Corrections/Replacements 56 | TASMAS will replace words and phrases that are likely mishears in the output if `--corrections` data is provided. (See below under Usage) 57 | This is particularly useful for TTRPG recordings, as many proper names and phrases will never be interpreted correctly. 58 | As with names, TASMAS will automatically pick up a `corrections.json` file if present in the input path or its parent folder, so you can build up these replacements over time. 59 | 60 | ### Out-of-sync Warning 61 | At the end of this operation, TASMAS will detect any phrases with start times that are more than 5 seconds out of sync with their neighbors, and will automatically run them through a punctuation model to try to improve results (as adding punctuation will allow these phrases to split at those words, which may allow other speakers to interject improving the overall sync). 62 | After doing so, remaining phrases that are still more than 5 seconds out of sync will be output to the screen and to `outOfSyncItems.txt`. 63 | Manually adding a punctuation mark directly to an individual word in the corresponding `.words.json` file and re-executing the ASSEMBLE operation will improve these results. 64 | 65 | 66 | ## `SUMMARIZE`: 67 | 68 | *Given an Open AI api key, appropriate prompts, and a file path that contains a `transcript.txt`, ask GPT-4 Turbo to summarize the transcript.* 69 | 70 | ### Summary Prompts 71 | When executing the SUMMARIZE operation, `--promptType` is required, which will be used to attempt to locate text files named in the format `prompt_{promptType}_*.txt`, in the input path, its parent folder, or with TASMAS itself. 72 | 73 | TASMAS was designed for summarizing Dungeons & Dragons sessions, and as examples, comes with two prompts that produce useful output, `prompt_dnd_1.txt` and `prompt_dnd_2.txt`, which will be used if `--promptType dnd` is specified. 74 | 75 | Also, `--openApiKey` is required in this mode, because: 76 | 77 | ### Why does summarize need to call a paid API? 78 | For each prompt file found, the OpenAI API is called. This is because of context token limits. 79 | A typical D&D session transcript will likely be anywhere between 30,000 and 60,000 tokens. As of this writing, most models will consider only 4096 or 8192 tokens, and very few models can handle more than 32K tokens of input, with GPT-4 Turbo's 128K limit being the only one practically available to me. 80 | So yes, it's not free, but it'll only cost you probably about $0.10 USD per prompt. 81 | (And you don't ever have to use the SUMMARIZE workload at all if you don't want anyway. 😁) 82 | 83 | # Usage 84 | 85 | To run TASMAS, you must provide at minimum: 86 | 1. 
an operation mode (`recognize`, `assemble`, `summarize`, `semiauto` which does the first 2, or `fullauto` which does all 3) and 87 | 2. a folder path to process. 88 | ```bash 89 | tasmas semiauto /mnt/c/recordings/2024-04-04 90 | ``` 91 | If your recordings aren't in `.ogg` (and to be fair, why would they be, unless you were using `Craig`, but that's the use case I wrote this for so it's the default), you'll have to add `--extension "wav"` or whatever they are. 92 | I haven't even tested that, I just assume it works; if not plz open a bug 🤣 93 | 94 | But yeah, here are some additional things you can add: 95 | 96 | ### Names 97 | Specifying a `--names` value allows you to set how the filenames should translate into speaker names in the transcript. 98 | It can either be a path to a `.json` file, or the JSON itself inline if you're feeling like making things harder for yourself. 99 | For example, to produce the transcript from our example at the top, this might be the contents of a `names.json` file: 100 | ```json 101 | { 102 | "JohnTheJester": "John", 103 | "EmiLovesCats": "Emily", 104 | "RoboBert": "Robert", 105 | "JessInWonderland": "Jessica", 106 | "SassySarah": "Sarah" 107 | } 108 | ``` 109 | We could pass that to tasmas like this: 110 | ```bash 111 | tasmas --names /mnt/c/recordings/names.json semiauto /mnt/c/recordings/2024-04-04 112 | ``` 113 | but in this case we don't even have to specify `--names`, because that `names.json` is in the parent folder of the folder we're processing, so if we say nothing about names it'll pick it up automatically. 114 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/7bc9cedc-8605-492b-a20e-d059380559f7) 115 | 116 | If you run TASMAS without any names input, it'll prompt you for the name for each speaker file it detects. (I'm realizing as I'm writing this that it'd be a good feature to ask you after doing so if you want to save a `names.json` for future use, so I'll add that to the backlog I guess.) 117 | It also gives you the option to skip a speaker entirely, which is useful for files that are music bots or whatnot. (Don't specify speaker names for such files, or you won't be prompted if you'd like to skip them!) 118 | ![image](https://github.com/KaddaOK/TASMAS/assets/151568451/0ae8c997-16cf-4a32-b8d3-9807af20f407) 119 | 120 | ### Corrections 121 | Specifying a `--corrections` value allows you to replace all the occurrences of a word or phrase that you know is an incorrect interpretation with the correct value. 122 | 123 | Similarly to `names`, this can either be a path to a `.json` file, or the JSON itself inline. 124 | 125 | However, the format is the opposite: 126 | instead of the `"wrongValue":"correctValue"` of names, 127 | corrections are presented as 128 | `"correctValue": ["wrongValue", "wrongValue", "wrongValue"]` 129 | in order to allow you to list many incorrect possibilities for a single correct value. 130 | 131 | For example, here are just a few items from a real `corrections.json` file I use, in which you can see why it needs to be done this way: because of weird made-up names that get interpreted in many random ways.
132 | ```json 133 | { 134 | "Dagstorp": [ 135 | "Dexter", 136 | "Digstorpe", 137 | "Dagster", 138 | "Dagstrup", 139 | "Dagsorp", 140 | "Dag Swarp" 141 | ], 142 | "Elsalor": [ 143 | "El Soler", 144 | "El Solor", 145 | "Else Laura", 146 | "else the Lord", 147 | "I'll solar" 148 | ], 149 | "Jeltra": [ 150 | "Gelter", 151 | "Delta", 152 | "Geltro", 153 | "Gelja", 154 | "Jeldra", 155 | "Jelter" 156 | ] 157 | } 158 | ``` 159 | Anyway, yeah. They don't have to be a single word to replace, either, you could put anything you want in those quotes; another real world example is `"Shield of Faith": ["shield a faith"]`, which had the added bonus of capitalizing that spell name (corrections are case-insensitive for detection, but will insert the replacement value as capitalized). 160 | 161 | ### Other stuff 162 | There are some other finer-tuning options, but they're pretty well-summarized in the actual software if you do `tasmas --help`. 163 | You won't generally need to mess with them (other than `--showTimestamps`, which pretty self-explanatorily includes timestamps in the `transcript.txt` output), unless you feel like `assemble`ing numerous transcripts and comparing them line by line to see how they differ. As with all things, YMMV. 164 | 165 | 166 | # Installation 167 | 168 | ### Docker 169 | Maybe the easiest, or at least most foolproof, way to use TASMAS (especially on Windows, which always seems to make a mess of python stuff) is via [`Docker`](https://www.docker.com/), which creates a lightweight virtual container with everything already set up for you. 170 | 171 | A TASMAS image is available on Docker Hub tagged `kaddaok/tasmas`, 172 | or the `dockerfile` is a part of this repo if you want to build the image yourself. 173 | 174 | You just want to make sure that you include `--gpus all`, so that the model can use your GPU if present, 175 | and that you map something as a volume (easiest way is `-v {src}:{dest}` ) so you have access to what you want to process. 176 | 177 | For instance, I put all the recordings I need to transcribe on my N: drive, 178 | and my docker is running in linux so I can access N: from `/mnt/n` in my docker host, and I'll just put it in the same place in the container's file system, 179 | so my docker command looks like this: 180 | ```bash 181 | docker run -it -v /mnt/n:/mnt/n --gpus all kaddaok/tasmas:latest 182 | ``` 183 | Running that gives me a new prompt at `/usr/src/tasmas` in the running container and I can just say `tasmas` straight from there: 184 | ```bash 185 | tasmas semiauto /mnt/n/dnd/icespire/2024-03-17 186 | ``` 187 | and when I'm done using the Docker container, I just type 188 | ```bash 189 | exit 190 | ``` 191 | and I'm back at the regular prompt. 192 | 193 | ### Python 194 | If you're already comfortable with python environments (or optimistically think that it might be easier than setting up docker), you can just run it directly. 195 | 196 | I haven't put this on PyPI yet (and probably need to reorganize it a bit in order to do so) which means you can't yet just say `pip install tasmas`. ❌ 197 | 198 | What you can do, though, is clone or download the contents of this repo, cd to it and then say `pip install . `. That should allow you to use the `tasmas` command. 199 | 200 | You can also just say `python tasmas.py` instead though, if you feel like it. 
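For reference, a typical from-source setup might look like the following (the clone URL is assumed from this repo's GitHub location; adjust paths to your environment):

```bash
# Assumed repo URL; substitute your own clone or downloaded copy
git clone https://github.com/KaddaOK/TASMAS.git
cd TASMAS
pip install .
tasmas --help
```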
-------------------------------------------------------------------------------- /assemble.py: -------------------------------------------------------------------------------- 1 | import json 2 | import os 3 | import glob 4 | import re 5 | import math 6 | import uuid 7 | import warnings 8 | from operator import attrgetter 9 | from typing import List, Dict, Optional 10 | from deepmultilingualpunctuation import PunctuationModel 11 | from tqdm import tqdm 12 | from utils import extract_speaker_name 13 | 14 | class WtWordEncoder(json.JSONEncoder): 15 | def default(self, o): 16 | if isinstance(o, (WtWord, WtWordList)): 17 | return o.to_dict() 18 | return super().default(o) 19 | 20 | class WtWordList: 21 | def __init__(self, words): 22 | if len(set(word.speaker for word in words)) > 1: 23 | raise ValueError("All words in a WtWordList must have the same speaker") 24 | self.words = words 25 | 26 | @property 27 | def speaker(self): 28 | return self.words[0].speaker if self.words else None 29 | 30 | @property 31 | def start(self): 32 | return min(word.start for word in self.words) 33 | 34 | @property 35 | def end(self): 36 | return max(word.end for word in self.words) 37 | 38 | @property 39 | def text(self): 40 | return " ".join(word.text for word in self.words) 41 | 42 | def to_dict(self): 43 | return { 44 | 'speaker': self.speaker, 45 | 'start': self.start, 46 | 'end': self.end, 47 | 'text': self.text, 48 | 'words': [word.to_dict() for word in self.words] 49 | } 50 | 51 | class WtWord: 52 | def __init__(self, speaker, text, start, end): 53 | self.id = uuid.uuid4() 54 | self.speaker = speaker 55 | self.text = text 56 | self.start = start 57 | self.end = end 58 | def to_dict(self): 59 | return { 60 | 'id': str(self.id), 61 | 'speaker': self.speaker, 62 | 'text': self.text, 63 | 'start': self.start, 64 | 'end': self.end 65 | } 66 | 67 | def assemble(input_dir, corrections, names, no_ellipses, disfluent_comma, no_asterisks, show_timestamps): 68 | if input_dir is None: 69 | print("Please provide an input directory.") 70 | return 71 | 72 | print() 73 | print("--------------------") 74 | print("ASSEMBLE") 75 | print("--------------------") 76 | print() 77 | 78 | ext = "words.json" 79 | files = glob.glob(os.path.join(input_dir, '*.' 
+ ext)) 80 | 81 | if not files: 82 | print() 83 | print(f" ERROR: No .{ext} files were found at {input_dir}.") 84 | print() 85 | return 86 | 87 | print(f" Found {len(files)} .{ext} files at {input_dir}.") 88 | print() 89 | 90 | all_chunks = extract_all_chunks(names, no_ellipses, disfluent_comma, no_asterisks, files) 91 | 92 | # Sort all chunks by start timestamp 93 | print(" Sorting...") 94 | sorted_chunks = sorted(all_chunks, key=attrgetter('start', 'speaker', 'end')) 95 | print(" Normalizing...") 96 | normalized_items = normalize_items(sorted_chunks) 97 | """normaljson = os.path.join(input_dir, 'normalized.json') 98 | with open(normaljson, 'w') as f: 99 | f.write(json.dumps(normalized_items, cls=WtWordEncoder, indent=2))""" 100 | print(" Collapsing...") 101 | collapsed_items = collapse_adjacent(normalized_items) 102 | """collapsejson = os.path.join(input_dir, 'collapsed.json') 103 | with open(collapsejson, 'w') as f: 104 | f.write(json.dumps(collapsed_items, cls=WtWordEncoder, indent=2))""" 105 | print(" Formatting...") 106 | # max timestamp value for formatting 107 | max_timestamp = max(value for chunk in sorted_chunks for value in [chunk.start, chunk.end]) 108 | timestamp_digits = int(math.ceil(math.log10(max_timestamp))) 109 | format_string = "{:0{}.2f}".format(max_timestamp, timestamp_digits) 110 | 111 | # Find the maximum width of the speaker name 112 | max_speaker_width = max(len(chunk.speaker) for chunk in sorted_chunks) 113 | print(" Checking for problems...") 114 | out_of_sync_items = get_out_of_sync_items(input_dir, format_string, max_speaker_width, collapsed_items) 115 | if len(out_of_sync_items) > 0: 116 | #run them through the punctuation model and collapse again 117 | print(f" Found {len(out_of_sync_items)} out-of-sync items. Attempting to auto-punctuate and re-collapse...") 118 | 119 | with warnings.catch_warnings(): 120 | warnings.simplefilter("ignore") 121 | punctuationModel = PunctuationModel() 122 | changesMade = 0 123 | for item, _ in out_of_sync_items: 124 | repunctuated = punctuationModel.restore_punctuation(item.text) 125 | if len(repunctuated) > len(item.text): 126 | update_word_texts(item, repunctuated, all_chunks) 127 | changesMade += 1 128 | if changesMade > 0: 129 | print(f" Made changes to punctuation on {changesMade} lines.") 130 | print(" Re-sorting...") 131 | sorted_chunks = sorted(all_chunks, key=attrgetter('start', 'speaker', 'end')) 132 | print(" Re-normalizing...") 133 | normalized_items = normalize_items(sorted_chunks) 134 | print(" Re-collapsing...") 135 | collapsed_items = collapse_adjacent(normalized_items) 136 | print(" Checking for problems again...") 137 | out_of_sync_items = get_out_of_sync_items(input_dir, format_string, max_speaker_width, collapsed_items) 138 | if len(out_of_sync_items) > 0: 139 | write_out_of_sync_items(input_dir, max_speaker_width, out_of_sync_items) 140 | 141 | # option repunctuate everything 142 | """print(" Repunctuating all items...") 143 | punctuationModel = PunctuationModel() 144 | for item in tqdm(collapsed_items): 145 | repunctuated = punctuationModel.restore_punctuation(item.text) 146 | update_word_texts(item, repunctuated, item.words)""" 147 | print(" Writing Output...") 148 | output_items(input_dir, corrections, show_timestamps, format_string, max_speaker_width, collapsed_items) 149 | print("--------------------") 150 | 151 | def extract_all_chunks(names: Optional[Dict[str, str]], no_ellipses: bool, disfluent_comma: bool, no_asterisks: bool, file_names: List[str]) -> List[WtWord]: 152 | all_chunks = [] 153 | 154 
| for file_name in file_names: 155 | print(f" - {os.path.basename(file_name)}") 156 | original_speaker = extract_speaker_name(file_name, 'words.json') 157 | if names and original_speaker in names and (names[original_speaker] is None or names[original_speaker] == ''): 158 | print(f" Skipping because '{original_speaker}' is specified as blank.") 159 | print() 160 | continue 161 | 162 | speaker = names[original_speaker] if names and original_speaker in names else original_speaker 163 | print(f" Extracting chunks for {speaker} ({original_speaker})...") 164 | extracted_chunks = extract_chunks_from_file(file_name, speaker) 165 | if disfluent_comma: 166 | insert_commas_at_disfluencies(extracted_chunks, no_asterisks) 167 | undisfluent_chunks = [c for c in extracted_chunks if c.text.strip() != "[*]"] 168 | if not no_ellipses: 169 | insert_ellipses_at_likely_breaks(undisfluent_chunks, no_asterisks) 170 | all_chunks.extend(undisfluent_chunks) 171 | print() 172 | 173 | return all_chunks 174 | 175 | def normalize_items(sorted_chunks: List[WtWord]) -> List[WtWordList]: 176 | normalized_items = [] 177 | current_sentences = {} 178 | 179 | for chunk in sorted_chunks: 180 | if chunk.text == "[*]": 181 | # this is just a scrubbed disfluency, skip it 182 | continue 183 | 184 | if chunk.speaker not in current_sentences or current_sentences[chunk.speaker] is None: 185 | current_sentences[chunk.speaker] = WtWordList([chunk]) 186 | else: 187 | current_sentences[chunk.speaker].words.append(chunk) 188 | 189 | last_character = chunk.text.strip()[-1] 190 | if last_character in ['.', '!', '?', '-', ',', '~']: 191 | normalized_items.append(current_sentences[chunk.speaker]) 192 | current_sentences[chunk.speaker] = None 193 | 194 | # catch any leftovers 195 | leftovers = [v for k, v in current_sentences.items() if v is not None] 196 | leftovers.sort(key=lambda x: x.start) 197 | normalized_items.extend(leftovers) 198 | 199 | return normalized_items 200 | 201 | def collapse_adjacent(normalized_items: List[WtWordList]) -> List[WtWordList]: 202 | current_chunk = None 203 | 204 | collapsed_items = [] 205 | 206 | for nextchunk in normalized_items: 207 | if current_chunk is None: 208 | current_chunk = nextchunk 209 | continue 210 | elif nextchunk.speaker != current_chunk.speaker: 211 | collapsed_items.append(current_chunk) 212 | current_chunk = nextchunk 213 | else: 214 | current_chunk.words.extend(nextchunk.words) 215 | 216 | # Output the last chunk 217 | if current_chunk is not None: 218 | collapsed_items.append(current_chunk) 219 | 220 | return collapsed_items 221 | 222 | def get_out_of_sync_items(input_dir, format_string, max_speaker_width, collapsed_items): 223 | now_running_time = 0 224 | out_of_sync_items = [] 225 | 226 | for item in collapsed_items: 227 | if item.start < now_running_time - 5: 228 | out_of_sync_items.append((item, round(now_running_time - item.start, 2))) 229 | now_running_time = item.start 230 | 231 | return out_of_sync_items 232 | 233 | def write_out_of_sync_items(input_dir, max_speaker_width, potentially_needing_punctuation): 234 | print() 235 | print(f" NOTE: There are {len(potentially_needing_punctuation)} lines with significantly earlier start timestamps than the previous line, as shown.") 236 | print(" Manually adding a punctuation mark to the end of one of the words of these lines in the .words.json would allow them to split at that spot, that part of them appearing before the previous speaker's line:") 237 | print() 238 | potential_punctuation_string = "" 239 | max_item_offset_length = 
max(len(str(x[1])) for x in potentially_needing_punctuation) 240 | max_start_length = max(len(str(x[0].start)) for x in potentially_needing_punctuation) 241 | max_end_length = max(len(str(x[0].end)) for x in potentially_needing_punctuation) 242 | 243 | 244 | for item in potentially_needing_punctuation: 245 | item_string = f" [{f'{item[0].start:.2f}'.rjust(max_start_length)}-{f'{item[0].end:.2f}'.rjust(max_end_length)}] {f'-{item[1]:.2f}s'.rjust(max_item_offset_length +2)} {item[0].speaker.rjust(max_speaker_width)}: {item[0].text}" 246 | print(item_string) 247 | potential_punctuation_string += item_string + "\n" 248 | 249 | punctuation_review_path = os.path.join(input_dir, "outOfSyncItems.txt") 250 | with open(punctuation_review_path, 'w') as file: 251 | file.write(potential_punctuation_string) 252 | print() 253 | print(f" This list has been saved to {punctuation_review_path} for review.") 254 | print() 255 | 256 | def update_word_texts(word_list: WtWordList, new_text: str, original_words: List[WtWord]): 257 | new_words = new_text.split() 258 | 259 | if len(new_words) != len(word_list.words): 260 | raise ValueError(f"The new text does not have the same number of words as the original. New count: {len(new_words)}, Old count: {len(word_list.words)}, text: '{new_text}' Original text: '{word_list.text}'") 261 | 262 | for wt_word, new_word in zip(word_list.words, new_words): 263 | if wt_word.text != new_word: 264 | for original_word in original_words: 265 | if original_word.id == wt_word.id: 266 | original_word.text = new_word 267 | break 268 | 269 | def output_items(input_dir, corrections, show_timestamps, format_string, max_speaker_width, collapsed_items): 270 | output_builder = [] 271 | 272 | for item in collapsed_items: 273 | timestamp = f"[{item.start}-{item.end}] " if show_timestamps else "" 274 | out_string = f"{timestamp}{item.speaker.rjust(max_speaker_width)}: \"{item.text}\"" 275 | 276 | if corrections is not None: 277 | for key, value in corrections.items(): 278 | out_string = out_string.replace(key, value) 279 | 280 | output_builder.append(out_string) 281 | 282 | output_path = os.path.join(input_dir, "transcript.txt") 283 | with open(output_path, 'w') as file: 284 | file.write('\n'.join(output_builder)) 285 | print(f" {len(output_builder)} lines saved as {output_path}") 286 | 287 | def ends_with_break(chunk_text): 288 | last_character = chunk_text.strip()[-1] 289 | return last_character in ['.', '!', '?', '-', ',', '~'] 290 | 291 | def extract_chunks_from_file(file_path: str, speaker: str) -> List[WtWord]: 292 | chunks = [] 293 | 294 | try: 295 | with open(file_path, 'r') as file: 296 | json_content = file.read() 297 | jsonData = json.loads(json_content) 298 | 299 | for chunkData in jsonData['segments']: 300 | segment_chunks = [] 301 | for word in chunkData['words']: 302 | chunk = WtWord( 303 | speaker = speaker, 304 | text = word['text'].strip() if word['text'] else "", 305 | start = word['start'], 306 | end = word['end'] 307 | ) 308 | segment_chunks.append(chunk) 309 | 310 | chunks.extend(segment_chunks) 311 | except Exception as ex: 312 | print(f"Error processing file {file_path}: {str(ex)}") 313 | 314 | return chunks 315 | 316 | def insert_commas_at_disfluencies(segment_chunks: List[WtWord], no_asterisks: bool): 317 | if len(segment_chunks) > 1: 318 | for i in range(1, len(segment_chunks)): 319 | current_word = segment_chunks[i] 320 | prvs_word = segment_chunks[i - 1] 321 | if current_word.speaker != prvs_word.speaker: 322 | raise ValueError("Only meant to be used on 
single-speaker collection") 323 | if current_word.text == "[*]" and not ends_with_break(prvs_word.text): 324 | prvs_word.text = prvs_word.text.strip() + ("" if no_asterisks else "*") + "," 325 | 326 | def insert_ellipses_at_likely_breaks(segment_chunks: List[WtWord], no_asterisks: bool): 327 | if len(segment_chunks) > 1: 328 | for i in range(1, len(segment_chunks)): 329 | current_word = segment_chunks[i] 330 | prvs_word = segment_chunks[i - 1] 331 | if current_word.speaker != prvs_word.speaker: 332 | raise ValueError("Only meant to be used on single-speaker collection") 333 | if current_word.start - prvs_word.end > 10 and not ends_with_break(prvs_word.text): 334 | prvs_word.text = prvs_word.text.strip() + ("" if no_asterisks else "*") + "..." --------------------------------------------------------------------------------