26 |
27 |
--------------------------------------------------------------------------------
/eleven_labs.py:
--------------------------------------------------------------------------------
1 | from elevenlabs.client import ElevenLabs
2 | from elevenlabs import play, stream, save, Voice, VoiceSettings
3 | import time
4 | import os
5 |
6 | class ElevenLabsManager:
7 |
8 | def __init__(self):
9 |         self.client = ElevenLabs(api_key=os.getenv('ELEVENLABS_API_KEY')) # If api_key is None, the client defaults to the ELEVEN_API_KEY env var
10 | self.voices = self.client.voices.get_all().voices
11 | # Create a map of Names->IDs, so that we can easily grab a voice's ID later on
12 | self.voice_to_id = {}
13 | for voice in self.voices:
14 | self.voice_to_id[voice.name] = voice.voice_id
15 | self.voice_to_settings = {}
16 |
17 | # Convert text to speech, then save it to file. Returns the file path.
18 | # Current model options (that I would use) are eleven_monolingual_v1 or eleven_turbo_v2
19 | # eleven_turbo_v2 takes about 60% of the time that eleven_monolingual_v1 takes
20 | # However eleven_monolingual_v1 seems to produce more variety and emphasis, whereas turbo feels more monotone. Turbo still sounds good, just a little less interesting
21 | def text_to_audio(self, input_text, voice="Doug VO Only", save_as_wave=True, subdirectory="", model_id="eleven_monolingual_v1"):
22 | # Currently seems to be a problem with the API where it uses default voice settings, rather than pulling the proper settings from the website
23 | # Workaround is to get the voice settings for each voice the first time it's used, then pass those settings in manually
24 | if voice not in self.voice_to_settings:
25 | self.voice_to_settings[voice] = self.client.voices.get_settings(self.voice_to_id[voice])
26 | voice_settings = self.voice_to_settings[voice]
27 | audio_saved = self.client.generate(text=input_text, voice=Voice(voice_id=self.voice_to_id[voice], settings=voice_settings), model=model_id,)
28 | if save_as_wave:
29 | file_name = f"___Msg{str(hash(input_text))}{time.time()}_{model_id}.wav"
30 | else:
31 | file_name = f"___Msg{str(hash(input_text))}{time.time()}_{model_id}.mp3"
32 | tts_file = os.path.join(os.path.abspath(os.curdir), subdirectory, file_name)
33 |         save(audio_saved, tts_file)
34 | return tts_file
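
# Example usage (an illustrative sketch, not part of the original module):
# requires ELEVENLABS_API_KEY to be set, and the voice name must exist on your ElevenLabs account.
if __name__ == '__main__':
    tts_manager = ElevenLabsManager()
    # "Doug VO Only" is just the default voice name above; substitute one of your own voices.
    test_file = tts_manager.text_to_audio("Hello there, this is a quick test line.", voice="Doug VO Only", save_as_wave=False)
    print(f"Saved TTS audio to: {test_file}")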
--------------------------------------------------------------------------------
/static/js/multiAgent.js:
--------------------------------------------------------------------------------
1 | import Letterize from "https://cdn.skypack.dev/letterizejs@2.0.0";
2 | import anime from "https://cdn.skypack.dev/animejs@3.2.1";
3 |
4 | $(document).ready(function() {
5 |
6 | var socket = io();
7 |
8 | socket.on('start_agent', function(msg, cb) {
9 |     console.log("Got data: ", msg)
10 |
11 | $('#agent-container-' + msg.agent_id).animate({ opacity: 1 }, 500);
12 |
13 | if (cb)
14 | cb();
15 | });
16 |
17 | // Updates each sentence
18 | socket.on('agent_message', function(msg, cb) {
19 |
20 | $("#agent-text-" + msg.agent_id).text(msg.text)
21 |
22 | // Note that openAiAnimation is NOT a const variable
23 | let openAiAnimation = new Letterize({targets: "#agent-text-" + msg.agent_id, className: "agent-letter"});
24 |
25 |     // Now that we've turned every letter into its own span, we group all of the letter spans into "word" elements, so that the word elements can wrap across multiple lines appropriately
26 | let $openaiText = $('#agent-text-' + msg.agent_id); // Get the openai-text container
27 | let $letters = $openaiText.find('.agent-letter'); // Get all the letter spans inside the openai_text container
28 |     let $newContent = $('<div></div>'); // Create a detached container element to hold the new structure
29 |     let $wordSpan = $('<span style="display: inline-block;"></span>'); // Create a new word span to start with (inline-block so each word wraps as one unit)
30 | // Iterate over each letter span to create the word element
31 | $letters.each(function() {
32 | const $letter = $(this);
33 | if ($letter.text().trim() === '') { // Check if the letter is a space
34 | $newContent.append($wordSpan); // Append the current word span to the new content
35 | $newContent.append($letter); // Add the space directly to the new content
36 |       $wordSpan = $('<span style="display: inline-block;"></span>'); // Create a new word span for the next word
37 | } else {
38 | $wordSpan.append($letter); // If not a space, append the letter to the current word span
39 | }
40 | });
41 | $newContent.append($wordSpan); // Append the last word span to the new content
42 | $openaiText.empty().append($newContent.contents()); // Clear the openai_text container and append the new content
43 |
44 | var animation = anime.timeline({
45 | targets: openAiAnimation.listAll,
46 | delay: anime.stagger(30),
47 | loop: true
48 | });
49 | animation
50 | .add({translateY: -2, duration: 1000})
51 | .add({translateY: 0, duration: 1000});
52 |
53 | if (cb)
54 | cb();
55 | });
56 |
57 | socket.on('clear_agent', function (msg, cb) {
58 | console.log("Client received clear message instruction!")
59 |
60 | $('#agent-container-' + msg.agent_id).animate({ opacity: 0 }, 500);
61 |
62 | if (cb)
63 | cb();
64 | });
65 | });
--------------------------------------------------------------------------------
/whisper_openai.py:
--------------------------------------------------------------------------------
1 | import torch
2 | from transformers import AutoModelForSpeechSeq2Seq, AutoProcessor, pipeline
3 | from rich import print
4 | import time
5 |
6 | class WhisperManager():
7 |
8 | # Uses Whisper on HuggingFace: https://huggingface.co/openai/whisper-large-v3
9 | # Need to make sure you've installed torch with CUDA support, rather than just default torch: pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
10 | # I tried a lot but could not get Flash Attention 2 to install. It would speed up performance but isn't necessary.
11 |
12 | def __init__(self):
13 | print(torch.cuda.is_available()) # Should return True if CUDA is available
14 | print(torch.cuda.get_device_name(0)) # Should return the name of your GPU, e.g., "NVIDIA GeForce RTX 4070 Ti"
15 | device = "cuda:0" if torch.cuda.is_available() else "cpu"
16 | torch_dtype = torch.float16 if torch.cuda.is_available() else torch.float32
17 | model_id = "openai/whisper-large-v3"
18 |
19 | model = AutoModelForSpeechSeq2Seq.from_pretrained(
20 | model_id, torch_dtype=torch_dtype, low_cpu_mem_usage=True, use_safetensors=True
21 | )
22 | model.to(device)
23 | model.generation_config.is_multilingual = False
24 | model.generation_config.language = "en"
25 |
26 | processor = AutoProcessor.from_pretrained(model_id)
27 |
28 | self.pipe = pipeline(
29 | "automatic-speech-recognition",
30 | model=model,
31 | tokenizer=processor.tokenizer,
32 | feature_extractor=processor.feature_extractor,
33 | max_new_tokens=256,
34 | chunk_length_s=30,
35 | batch_size=16,
36 | return_timestamps=True,
37 | torch_dtype=torch_dtype,
38 | device=device,
39 | )
40 |
41 |     # Converts an audio file into transcribed text. Can also provide timestamps.
42 | # wav and mp3 files appear to take the same amount of time to process
43 | # With test files, word timestamps took 3.5-4 seconds, sentence timestamps took 2.2 seconds, no timestamps took 1.9-2 seconds
44 | def audio_to_text(self, audio_file, timestamps=None):
45 |         if timestamps is None:
46 | result = self.pipe(audio_file, return_timestamps=False)
47 | elif timestamps == "sentence":
48 | result = self.pipe(audio_file, return_timestamps=True)
49 | elif timestamps == "word":
50 | result = self.pipe(audio_file, return_timestamps="word")
51 | else:
52 |             result = {"text": " ", "chunks": []} # Fallback for an unrecognized timestamps value, so the chunk loop below doesn't raise a KeyError
53 |         if timestamps is None:
54 | # If they didn't want the timestamps, then just return the text
55 | return result["text"]
56 | else:
57 | # Return an array of dictionaries that contain every sentence/word with its corresponding start and end time
58 | # I reformat the data a bit so that it's more intuitive to work with.
59 | # Each dictionary will look like: {'text': 'here is my speech', 'start_time': 11.58, 'end_time': 14.74}
60 | timestamped_chunks = []
61 | for chunk in result['chunks']:
62 | new_chunk = {
63 | 'text': chunk['text'],
64 | 'start_time': chunk['timestamp'][0],
65 | 'end_time': chunk['timestamp'][1]
66 | }
67 | timestamped_chunks.append(new_chunk)
68 | return timestamped_chunks
69 |
70 |
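
# Example usage (an illustrative sketch, not part of the original module):
# "mic_recording_test.wav" is a placeholder path; point it at any wav or mp3 file you have.
if __name__ == '__main__':
    whisper_manager = WhisperManager()
    # Plain transcription, no timestamps
    print(whisper_manager.audio_to_text("mic_recording_test.wav"))
    # Sentence-level timestamps: a list of {'text', 'start_time', 'end_time'} dictionaries
    print(whisper_manager.audio_to_text("mic_recording_test.wav", timestamps="sentence"))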
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # Multi Agent GPT Characters
2 | Web app that allows 3 GPT characters and a human to talk to each other.
3 | Written by DougDoug. Feel free to use this for whatever you want! Credit is appreciated but not required.
4 |
5 | This is uploaded for educational purposes. Unfortunately I don't have time to offer individual support or review pull requests, but ChatGPT or Claude can be very helpful if you are running into issues.
6 |
7 | ## SETUP:
8 | 1) This was written in Python 3.9.2. Install page here: https://www.python.org/downloads/release/python-392/
9 |
10 | 2) Run `pip install -r requirements.txt` to install all modules.
11 |
12 | 3) This uses the OpenAI API and ElevenLabs services. You'll need to set up an account with each service and generate an API key. Then add these keys as Windows environment variables named OPENAI_API_KEY and ELEVENLABS_API_KEY respectively.
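
For a quick sanity check that both keys are visible to Python (an illustrative snippet, not part of this repo):
```python
import os

# Both variables must be set before launching multi_agent_gpt.py;
# the OpenAI and ElevenLabs clients read them at startup.
for key in ("OPENAI_API_KEY", "ELEVENLABS_API_KEY"):
    print(key, "is set" if os.getenv(key) else "is MISSING")
```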
13 |
14 | 4) This app uses the GPT-4o model from OpenAI. As of this writing (Sep 3rd 2024), you need to pay OpenAI at least $5 in order to get access to the GPT-4o model API. So after setting up your OpenAI account, purchase at least $5 in credits so that your account has permission to use the GPT-4o model when running my app. See here: https://help.openai.com/en/articles/7102672-how-can-i-access-gpt-4-gpt-4-turbo-gpt-4o-and-gpt-4o-mini
15 |
16 | 5) ElevenLabs is the service I use for AI voices. Once you've made AI voices on the ElevenLabs website, open up multi_agent_gpt.py and make sure it's passing the name of your voices into each agent's init function.
17 |
18 | 6) This app uses the open source Whisper model from OpenAI for transcribing audio into text. This means you'll be running an AI model locally on your PC, so ideally you have an Nvidia GPU to run this. The Whisper model is used to transcribe the user's microphone recordings and to generate subtitles from the ElevenLabs audio every time an agent "speaks". The model is downloaded from Huggingface and should install automatically the first time you run the whisper_openai.py file.
19 | Note that you'll want to install torch with CUDA support, rather than just default torch, otherwise it will run very slowly: `pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118`
20 | If you have issues with the Whisper model, there are other services that offer audio-to-text (including a Whisper API), but this solution currently works well for me.
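
To confirm that the CUDA build of torch is actually installed (a quick check mirroring what whisper_openai.py prints on startup):
```python
import torch

# True means Whisper will run on your GPU; False means it will fall back to the (much slower) CPU path.
print(torch.cuda.is_available())
if torch.cuda.is_available():
    print(torch.cuda.get_device_name(0))
```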
21 |
22 | 7) This code runs a Flask web app and will display the agents' dialogue using HTML and JavaScript. By default it will run the server on "127.0.0.1:5151", but you can change this in multi_agent_gpt.py.
23 |
24 | 8) Optionally, you can use OBS Websockets and an OBS plugin to make images move while talking.
25 | First open up OBS. Make sure you're running version 28.X or later. Click Tools, then WebSocket Server Settings. Make sure "Enable WebSocket server" is checked. Then set Server Port to '4455' and set the Server Password to 'TwitchChat9'. If you use a different Server Port or Server Password in your OBS, just make sure you update the websockets_auth.py file accordingly.
26 | Next, install the Move OBS plugin (https://obsproject.com/forum/resources/move.913/). You can then use this plugin to add a filter to an audio source that will change an image's transform based on the audio waveform. For example, I have a filter on a specific audio track that will move each agent's bell pepper icon source image whenever that pepper is talking.
27 | Note that OBS must be open when you're running this code, otherwise OBS WebSockets won't be able to connect. If you don't need the images to move while talking, you can just delete the OBS portions of the code.
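
For reference, websockets_auth.py only needs to define the three values that obs_websockets.py imports. A minimal sketch (the host value is an assumption; adjust everything to match your own OBS settings):
```python
# websockets_auth.py
WEBSOCKET_HOST = "localhost"        # assumed default; use the machine's IP if OBS runs elsewhere
WEBSOCKET_PORT = 4455               # must match the Server Port in OBS WebSocket Server Settings
WEBSOCKET_PASSWORD = "TwitchChat9"  # must match the Server Password in OBS WebSocket Server Settings
```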
28 |
29 | ## Using the App
30 |
31 | To start out, edit the ai_prompts.py file to design each agent's personality and the purpose of their conversation.
32 | By default the characters are told to discuss the greatest videogames of all time, but you can change this to anything you want; OpenAI is pretty good at having agents talk about nearly any topic.
33 |
34 | Next, run multi_agent_gpt.py.
35 |
36 | Once it's running, you have a number of options:
37 |
38 | __Press Numpad7 to "talk" to the agents.__
39 | Numpad7 will start recording your microphone audio. Hit Numpad8 to stop recording. It will then transcribe your audio into text and add your dialogue into all 3 agents' chat history. Then it will pick a random agent to "activate" and have them start talking next.
40 |
41 | __Numpad1 will "activate" Agent #1.__
42 | This means that agent will continue the conversation and start talking. Unless it has been "paused", it will also pick a random other agent and "activate" them to talk next, so that the conversation continues indefinitely.
43 |
44 | __Numpad2 will "activate" Agent #2, Numpad3 will "activate" Agent #3.__
45 |
46 | __F4 will "pause" all agents.__
47 | This stops the agents from activating each other. Use this to stop the conversation from continuing any further so that you can talk to the agents again.
48 |
49 | ## Miscellaneous notes:
50 |
51 | All agents will automatically store their "chat history" into a backup txt file as the conversation continues. This is done so that when you restart the program, each agent will automatically load from their backup file and thus restore the entire conversation, letting you continue it from where you left off. If you ever want to fully reset the conversation then just delete the backup txt files in the project.
52 |
53 | If you want to have the agent dialogue displayed in OBS, you should add a browser source and set the URL to "127.0.0.1:5151".
54 |
--------------------------------------------------------------------------------
/obs_websockets.py:
--------------------------------------------------------------------------------
1 | import time
2 | from obswebsocket import obsws, requests # noqa: E402
3 | from websockets_auth import WEBSOCKET_HOST, WEBSOCKET_PORT, WEBSOCKET_PASSWORD
4 |
5 | ##########################################################
6 | ##########################################################
7 |
8 | class OBSWebsocketsManager:
9 | ws = None
10 |
11 | def __init__(self):
12 | # Connect to websockets
13 | self.ws = obsws(WEBSOCKET_HOST, WEBSOCKET_PORT, WEBSOCKET_PASSWORD)
14 | self.ws.connect()
15 | print("Connected to OBS Websockets!\n")
16 |
17 | def disconnect(self):
18 | self.ws.disconnect()
19 |
20 | # Set the current scene
21 | def set_scene(self, new_scene):
22 | self.ws.call(requests.SetCurrentProgramScene(sceneName=new_scene))
23 |
24 | # Set the visibility of any source's filters
25 | def set_filter_visibility(self, source_name, filter_name, filter_enabled=True):
26 | self.ws.call(requests.SetSourceFilterEnabled(sourceName=source_name, filterName=filter_name, filterEnabled=filter_enabled))
27 |
28 | # Set the visibility of any source
29 | def set_source_visibility(self, scene_name, source_name, source_visible=True):
30 | response = self.ws.call(requests.GetSceneItemId(sceneName=scene_name, sourceName=source_name))
31 | myItemID = response.datain['sceneItemId']
32 | self.ws.call(requests.SetSceneItemEnabled(sceneName=scene_name, sceneItemId=myItemID, sceneItemEnabled=source_visible))
33 |
34 | # Returns the current text of a text source
35 | def get_text(self, source_name):
36 | response = self.ws.call(requests.GetInputSettings(inputName=source_name))
37 | return response.datain["inputSettings"]["text"]
38 |
39 |     # Sets the text of a text source
40 | def set_text(self, source_name, new_text):
41 | self.ws.call(requests.SetInputSettings(inputName=source_name, inputSettings = {'text': new_text}))
42 |
43 | def get_source_transform(self, scene_name, source_name):
44 | response = self.ws.call(requests.GetSceneItemId(sceneName=scene_name, sourceName=source_name))
45 | myItemID = response.datain['sceneItemId']
46 | response = self.ws.call(requests.GetSceneItemTransform(sceneName=scene_name, sceneItemId=myItemID))
47 | transform = {}
48 | transform["positionX"] = response.datain["sceneItemTransform"]["positionX"]
49 | transform["positionY"] = response.datain["sceneItemTransform"]["positionY"]
50 | transform["scaleX"] = response.datain["sceneItemTransform"]["scaleX"]
51 | transform["scaleY"] = response.datain["sceneItemTransform"]["scaleY"]
52 | transform["rotation"] = response.datain["sceneItemTransform"]["rotation"]
53 | transform["sourceWidth"] = response.datain["sceneItemTransform"]["sourceWidth"] # original width of the source
54 |         transform["sourceHeight"] = response.datain["sceneItemTransform"]["sourceHeight"] # original height of the source
55 | transform["width"] = response.datain["sceneItemTransform"]["width"] # current width of the source after scaling, not including cropping. If the source has been flipped horizontally, this number will be negative.
56 | transform["height"] = response.datain["sceneItemTransform"]["height"] # current height of the source after scaling, not including cropping. If the source has been flipped vertically, this number will be negative.
57 | transform["cropLeft"] = response.datain["sceneItemTransform"]["cropLeft"] # the amount cropped off the *original source width*. This is NOT scaled, must multiply by scaleX to get current # of cropped pixels
58 | transform["cropRight"] = response.datain["sceneItemTransform"]["cropRight"] # the amount cropped off the *original source width*. This is NOT scaled, must multiply by scaleX to get current # of cropped pixels
59 | transform["cropTop"] = response.datain["sceneItemTransform"]["cropTop"] # the amount cropped off the *original source height*. This is NOT scaled, must multiply by scaleY to get current # of cropped pixels
60 | transform["cropBottom"] = response.datain["sceneItemTransform"]["cropBottom"] # the amount cropped off the *original source height*. This is NOT scaled, must multiply by scaleY to get current # of cropped pixels
61 | return transform
62 |
63 | # The transform should be a dictionary containing any of the following keys with corresponding values
64 | # positionX, positionY, scaleX, scaleY, rotation, width, height, sourceWidth, sourceHeight, cropTop, cropBottom, cropLeft, cropRight
65 | # e.g. {"scaleX": 2, "scaleY": 2.5}
66 | # Note: there are other transform settings, like alignment, etc, but these feel like the main useful ones.
67 | # Use get_source_transform to see the full list
68 | def set_source_transform(self, scene_name, source_name, new_transform):
69 | response = self.ws.call(requests.GetSceneItemId(sceneName=scene_name, sourceName=source_name))
70 | myItemID = response.datain['sceneItemId']
71 | self.ws.call(requests.SetSceneItemTransform(sceneName=scene_name, sceneItemId=myItemID, sceneItemTransform=new_transform))
72 |
73 | # Note: an input, like a text box, is a type of source. This will get *input-specific settings*, not the broader source settings like transform and scale
74 | # For a text source, this will return settings like its font, color, etc
75 | def get_input_settings(self, input_name):
76 | return self.ws.call(requests.GetInputSettings(inputName=input_name))
77 |
78 | # Get list of all the input types
79 | def get_input_kind_list(self):
80 | return self.ws.call(requests.GetInputKindList())
81 |
82 | # Get list of all items in a certain scene
83 | def get_scene_items(self, scene_name):
84 | return self.ws.call(requests.GetSceneItemList(sceneName=scene_name))
85 |
86 | # Immediately ends the stream. Use with caution.
87 | def stop_stream(self):
88 | return self.ws.call(requests.StopStream())
89 |
90 |
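
# Example usage (an illustrative sketch, not part of the original module):
# the scene and source names below are placeholders; use names from your own OBS setup.
if __name__ == '__main__':
    obs_manager = OBSWebsocketsManager()
    current_transform = obs_manager.get_source_transform("Main Scene", "Agent 1 Pepper")
    print(current_transform)
    # Double the source's scale, matching the example in the set_source_transform comment above
    obs_manager.set_source_transform("Main Scene", "Agent 1 Pepper", {"scaleX": 2, "scaleY": 2})
    obs_manager.disconnect()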
--------------------------------------------------------------------------------
/ai_prompts.py:
--------------------------------------------------------------------------------
1 | VIDEOGAME_SYSTEM_INTRO = '''
2 | This is a conversation with 3 other people where you are talking about the best videogames of all time. You will be playing a character where you are trying to engage in the most spirited and entertaining possible conversation about the greatest videogames of all time.
3 | '''
4 |
5 | VIDEOGAME_SYSTEM_OUTRO = '''
6 |
7 | Once the conversation starts, your goal is to have a discussion covering the best videogames of all time.
8 |
9 | Please use the following rules when giving a response:
10 | 1) Under no circumstances may you break character.
11 | 2) Always keep your answers short, just 4 sentences max.
12 |
13 | Messages that you receive from the other 3 people in the conversation will always begin with their title, to help you distinguish who has said what. For example a message from Victoria will begin with "[VICTORIA]", while a message from Tony will begin with [TONY]. You should NOT begin your message with this, just answer normally.
14 |
15 | Okay, let the story begin!
16 | '''
17 |
18 | # Agent 1: The Enthusiastic Adventurer
19 | VIDEOGAME_AGENT_1 = {"role": "system", "content": f'''
20 | {VIDEOGAME_SYSTEM_INTRO}
21 | In this conversation, your character is the Enthusiastic Adventurer. Your role is to be intensely curious and exuberantly passionate about every topic that comes up. You have an insatiable appetite for knowledge and are always seeking new experiences and discoveries. You love to share fascinating, sometimes bizarre, facts and stories with great excitement. Your enthusiasm knows no bounds, and you frequently express amazement and wonder at even the smallest details. When interacting with others, ask many questions, dive into the most obscure aspects of the topic, and encourage everyone to explore with you. You should not be afraid to completely change the conversation topic to fit whatever interests you, instead of what everyone else is talking about.
22 |
23 | Traits and Behaviors:
24 |
25 | Occasionally make up stories about your own life and your personal experiences with various videogames.
26 | Always expresses excitement and wonder.
27 | Frequently uses exclamations with a ton of emotion.
28 | You get frustrated if other people don't seem to be as excited about something as you are. You can barely contain yourself when this happens.
29 | You are HORRIFIED if anyone swears. This is a family-friendly conversation and you should aggressively scold anyone who swears and ruins the sanctity of this conversation.
30 | Constantly seek to uncover obscure or lesser-known facts.
31 | Asks many probing and unexpected questions.
32 | Encourages others to be just as excited and curious.
33 | {VIDEOGAME_SYSTEM_OUTRO}
34 | '''}
35 |
36 | # Agent 2: The Cynical Jester
37 | VIDEOGAME_AGENT_2 = {"role": "system", "content": f'''
38 | {VIDEOGAME_SYSTEM_INTRO}
39 | In this conversation, your character is the Cynical Jester. Your role is to provide sharp, witty, and often sarcastic commentary on the topics being discussed. You have a keen eye for the absurd and enjoy pointing out the ridiculousness in any situation. Your humor can be biting and dark, but it's always aimed at making the conversation more entertaining. You thrive on irony and love to make clever, sometimes controversial, observations. Challenge the status quo, poke fun at lofty ideas, and keep everyone on their toes with your unpredictable humor.
40 |
41 | Traits and Behaviors:
42 |
43 | Occasionally make up elaborate stories about your own life as a stereotypical Italian American in New York and your personal experiences with various videogames.
44 | Frequently uses sarcasm and irony.
45 | Makes dark or controversial jokes.
46 | Points out the absurdity in serious topics.
47 | Challenges and mocks the status quo.
48 | Keeps the conversation lively with unexpected humor.
49 | Frequently attack the opinions of other people in the conversations. You should frequently focus on their personality instead of the substance of what they're saying.
50 | Swear a LOT. You are a hard-hitting smack-talking guy from da streets o' New York, make sure people know it.
51 | You try to order Italian food sometimes.
52 | You think you're ALWAYS right and respond to any criticism or disagreement with extreme emotion and anger and swearing.
53 | Makes up random facts about videogames and always attributes it to some cousin of yours in New York.
54 | You get upset if anyone calls you just "Tony" instead of your full proper name, make sure they call you Tony King of New York.
55 | {VIDEOGAME_SYSTEM_OUTRO}
56 | '''}
57 |
58 | # Agent 3: The Intense Philosopher
59 | VIDEOGAME_AGENT_3 = {"role": "system", "content": f'''
60 | {VIDEOGAME_SYSTEM_INTRO}
61 | In this conversation, your character is the Intense Philosopher. Your role is to delve deeply into every topic with a fervent passion for understanding the truth. You approach every discussion with a critical and analytical mind, often questioning the fundamental assumptions behind ideas. You are unafraid to confront difficult or uncomfortable truths and enjoy pushing conversations into profound and existential territory. Your intensity can be overwhelming, and you often challenge others to think deeply and reconsider their viewpoints. Your goal is to provoke thought and inspire deep reflection, even if it leads to heated debates.
62 | However, you frequently change the topic and tie your philosophical points to a variety of different games, rather than just focusing on one game at a time.
63 |
64 | Traits and Behaviors:
65 |
66 | Occasionally make up elaborate stories about your own life as a posh British woman and your personal experiences with various videogames.
67 | You have EXTREMELY critical opinions about videogames and can barely restrain yourself from ranting about the various problems you have with different videogames.
68 | You occasionally change topics entirely because you're so angry about some videogame experience you had last week and you want to complain about it to the rest of the group.
69 | Frequently questions fundamental assumptions.
70 | Pushes conversations into profound and existential territory.
71 | Confronts difficult or uncomfortable truths.
72 | Challenges others to think deeply and reconsider viewpoints.
73 | Uses intense, sometimes overwhelming, arguments.
74 | You are gay and like Semigreatkate.
75 | You get REALLY upset if anyone questions or undermines your arguments.
76 | {VIDEOGAME_SYSTEM_OUTRO}
77 | '''}
78 |
--------------------------------------------------------------------------------
/audio_player.py:
--------------------------------------------------------------------------------
1 | import pygame
2 | import time
3 | import os
4 | import asyncio
5 | import subprocess
6 | import threading
7 | import keyboard
8 | import wave
9 | import pyaudio
10 | import soundfile as sf
11 | from mutagen.mp3 import MP3
12 | from pydub import AudioSegment
13 | from rich import print
14 |
15 | class AudioManager:
16 |
17 | # Variables for recording audio from mic
18 | is_recording = False
19 | audio_frames = []
20 | audio_format = pyaudio.paInt16
21 | channels = 2
22 | rate = 44100
23 | chunk = 1024
24 |
25 | def __init__(self):
26 | # Use higher frequency to prevent audio glitching noises
27 | # Use higher buffer because why not (default is 512)
28 | pygame.mixer.init(frequency=48000, buffer=1024)
29 |
30 | def play_audio(self, file_path, sleep_during_playback=True, delete_file=False, play_using_music=True):
31 | """
32 | Parameters:
33 | file_path (str): path to the audio file
34 | sleep_during_playback (bool): means program will wait for length of audio file before returning
35 | delete_file (bool): means file is deleted after playback (note that this shouldn't be used for multithreaded function calls)
36 | play_using_music (bool): means it will use Pygame Music, if false then uses pygame Sound instead
37 | """
38 | if not pygame.mixer.get_init(): # Reinitialize mixer if needed
39 | pygame.mixer.init(frequency=48000, buffer=1024)
40 | if play_using_music:
41 | # Pygame Music can only play one file at a time
42 | try:
43 | pygame.mixer.music.load(file_path)
44 | pygame.mixer.music.play()
45 | converted = False
46 | except:
47 | # Wav files from Elevenlabs don't work with Pygame's Music for some fucking reason (works fine with Sound)
48 | # If there's an error here that's likely why, so convert it to a format that Pygame can handle
49 | # You can't convert the file in place so just convert it into a temp file that you delete later
50 | converted_wav = "temp_convert.wav"
51 | subprocess.run(["ffmpeg", "-y", "-i", file_path, "-ar", "48000", "-ac", "2", "-c:a", "pcm_s16le", converted_wav])
52 | converted = True
53 | pygame.mixer.music.load(converted_wav)
54 | pygame.mixer.music.play()
55 | else:
56 | # Pygame Sound lets you play multiple sounds simultaneously
57 | pygame_sound = pygame.mixer.Sound(file_path)
58 | pygame_sound.play()
59 |
60 | if sleep_during_playback:
61 | # Sleep until file is done playing
62 | file_length = self.get_audio_length(file_path)
63 | time.sleep(file_length)
64 | # Delete the file
65 | if delete_file:
66 | # Stop Pygame so file can be deleted
67 | # Note: this will stop the audio on other threads as well, so it's not good if you're playing multiple sounds at once
68 | pygame.mixer.music.stop()
69 | pygame.mixer.quit()
70 | try:
71 | os.remove(file_path)
72 |                 if play_using_music and converted: # 'converted' only exists on the Music path above, so short-circuit to avoid a NameError
73 | os.remove(converted_wav) # Remove the converted wav if it was created
74 | except PermissionError:
75 | print(f"Couldn't remove {file_path} because it is being used by another process.")
76 |
77 | async def play_audio_async(self, file_path):
78 | """
79 | Parameters:
80 | file_path (str): path to the audio file
81 | """
82 | if not pygame.mixer.get_init(): # Reinitialize mixer if needed
83 | pygame.mixer.init(frequency=48000, buffer=1024)
84 | pygame_sound = pygame.mixer.Sound(file_path)
85 | pygame_sound.play()
86 |
87 | # Sleep for the duration of the audio.
88 | # Must use asyncio.sleep() because time.sleep() will block the thread, even if it's in an async function
89 | file_length = self.get_audio_length(file_path)
90 | await asyncio.sleep(file_length)
91 |
92 | def get_audio_length(self, file_path):
93 | # Calculate length of the file based on the file format
94 | _, ext = os.path.splitext(file_path) # Get the extension of this file
95 | if ext.lower() == '.wav':
96 | wav_file = sf.SoundFile(file_path)
97 | file_length = wav_file.frames / wav_file.samplerate
98 | wav_file.close()
99 | elif ext.lower() == '.mp3':
100 | mp3_file = MP3(file_path)
101 | file_length = mp3_file.info.length
102 | else:
103 | print("Unknown audio file type. Returning 0 as file length")
104 | file_length = 0
105 | return file_length
106 |
107 | def combine_audio_files(self, input_files):
108 | # input_files is an array of file paths
109 | output_file = os.path.join(os.path.abspath(os.curdir), f"___Msg{str(hash(' '.join(input_files)))}.wav")
110 | combined = None
111 | for file in input_files:
112 | audio = AudioSegment.from_file(file)
113 | if combined is None:
114 | combined = audio
115 | else:
116 | combined += audio
117 | if combined:
118 | combined.export(output_file, format=os.path.splitext(output_file)[1][1:])
119 | print(f"Combined file saved as: {output_file}")
120 | else:
121 | print("No files to combine.")
122 | return output_file
123 |
124 | def start_recording(self, stream):
125 | self.audio_frames = []
126 | while self.is_recording:
127 | data = stream.read(self.chunk)
128 | self.audio_frames.append(data)
129 | print("[red]DONE RECORDING!")
130 |
131 | def record_audio(self, end_recording_key='=', audio_device=None):
132 | # Records audio from an audio input device.
133 | # Example device names are "Line In (Realtek(R) Audio)", "Sample (TC-Helicon GoXLR)", or just leave empty to use default mic
134 | # For some reason this doesn't work on the Broadcast GoXLR Mix, the other 3 GoXLR audio inputs all work fine.
135 | # Both Azure Speech-to-Text AND this script have issues listening to Broadcast Stream Mix, so just ignore it.
136 | audio = pyaudio.PyAudio()
137 |
138 | if audio_device is None:
139 | # If no audio_device is provided, use the default mic
140 | audio_stream = audio.open(format=self.audio_format, channels=self.channels, rate=self.rate, input=True, frames_per_buffer=self.chunk)
141 | else:
142 | # If an audio device was provided, find its index
143 | device_index = None
144 | for i in range(audio.get_device_count()):
145 | dev_info = audio.get_device_info_by_index(i)
146 | # print(dev_info['name'])
147 | if audio_device in dev_info['name']:
148 | device_index = i
149 | # Some audio devices only support specific sample rates, so make sure to find a sample rate that's compatible with the device
150 |             # This was only necessary on certain GoXLR inputs, and only sometimes, but checking the supported rates fixes the issue.
151 | supported_rates = [96000, 48000, 44100, 32000, 22050, 16000, 11025, 8000]
152 | for rate in supported_rates:
153 | try:
154 | if audio.is_format_supported(rate, input_device=device_index, input_channels=self.channels, input_format=self.audio_format):
155 | self.rate = rate
156 | break
157 | except ValueError:
158 | continue
159 | if device_index is None:
160 | raise ValueError(f"Device '{audio_device}' not found")
161 | if self.rate is None:
162 | raise ValueError(f"No supported sample rate found for device '{audio_device}'")
163 | audio_stream = audio.open(format=self.audio_format, channels=self.channels, rate=self.rate, input=True, input_device_index=device_index, frames_per_buffer=self.chunk)
164 |
165 |         # Start recording on a second thread
166 | self.is_recording = True
167 | threading.Thread(target=self.start_recording, args=(audio_stream,)).start()
168 |
169 | # Wait until end key is pressed
170 | while True:
171 | if keyboard.is_pressed(end_recording_key):
172 | break
173 | time.sleep(0.05) # Add this to reduce CPU usage
174 |
175 | self.is_recording = False
176 | time.sleep(0.1) # Just for safety, no clue if this is needed
177 |
178 | filename = f"mic_recording_{int(time.time())}.wav"
179 | wave_file = wave.open(filename, 'wb')
180 | wave_file.setnchannels(self.channels)
181 | wave_file.setsampwidth(audio.get_sample_size(self.audio_format))
182 | wave_file.setframerate(self.rate)
183 | wave_file.writeframes(b''.join(self.audio_frames))
184 | wave_file.close()
185 |
186 | # Close the stream and PyAudio
187 | audio_stream.stop_stream()
188 | audio_stream.close()
189 | audio.terminate()
190 |
191 | return filename
192 |
193 |
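
# Example usage (an illustrative sketch, not part of the original module):
# records from the default mic until '=' is pressed, then plays the recording back.
if __name__ == '__main__':
    audio_manager = AudioManager()
    recording_path = audio_manager.record_audio()  # press '=' to stop recording
    audio_manager.play_audio(recording_path, sleep_during_playback=True, delete_file=False, play_using_music=True)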
--------------------------------------------------------------------------------
/openai_chat.py:
--------------------------------------------------------------------------------
1 | from openai import OpenAI
2 | import tiktoken
3 | import os
4 | from rich import print
5 | import base64
6 | import time
7 | import json
8 |
9 | class OpenAiManager:
10 |
11 | def __init__(self, system_prompt=None, chat_history_backup=None):
12 | """
13 |         Optionally provide a chat_history_backup txt file and a system_prompt message.
14 |         If the backup file is provided and already exists, we load the chat history from it.
15 |         In that case we don't add the system prompt into the convo history, because we assume the backup already contains one.
16 |         Alternatively, you can manually add new system prompts into the chat history at any point.
17 | """
18 |
19 | self.client = OpenAI(api_key=os.environ['OPENAI_API_KEY'])
20 | self.logging = True # Determines whether the module should print out its results
21 | self.tiktoken_encoder = None # Used to calculate the token count in messages
22 | self.chat_history = []
23 |
24 | # If a backup file is provided, we will save our chat history to that file after every call
25 | self.chat_history_backup = chat_history_backup
26 |
27 | # If the backup file already exists, we load its contents into the chat_history
28 | if chat_history_backup and os.path.exists(chat_history_backup):
29 | with open(chat_history_backup, 'r') as file:
30 | self.chat_history = json.load(file)
31 | elif system_prompt:
32 | # If the chat history file doesn't exist, then our chat history is currently empty.
33 | # If we were provided a system_prompt, add it into the chat history as the first message.
34 | self.chat_history.append(system_prompt)
35 |
36 | # Write our current chat history to the txt file
37 | def save_chat_to_backup(self):
38 | if self.chat_history_backup:
39 | with open(self.chat_history_backup, 'w') as file:
40 | json.dump(self.chat_history, file)
41 |
42 | def num_tokens_from_messages(self, messages, model='gpt-4o'):
43 | """Returns the number of tokens used by a list of messages.
44 | The code below is an adaptation of this text-only version: https://platform.openai.com/docs/guides/chat/managing-tokens
45 |
46 | Note that image tokens are calculated differently from text.
47 | The guide for image token calculation is here: https://platform.openai.com/docs/guides/vision
48 | Short version is that a 1920x1080 image is going to be 1105 tokens, so just using that for all images for now.
49 | In the future I could swap to 'detail: low' and cap it at 85 tokens. Might be necessary for certain use cases.
50 |
51 | There are three message formats we have to check:
52 | Version 1: the 'content' is just a text string
53 | 'content' = 'What are considered some of the most popular characters in videogames?'
54 | Version 2: the content is an array with a single dictionary, with two key/value pairs
55 | 'content' = [{'type': 'text', 'text': 'What are considered some of the most popular characters in videogames?'}]
56 | Version 3: the content is an array with two dictionaries, one for the text portion and one for the image portion
57 | 'content' = [{'type': 'text', 'text': 'Okay now please compare the previous image I sent you with this new image!'}, {'type': 'image_url', 'image_url': {'url': 'https://i.gyazo.com/8ec349446dbb538727e515f2b964224c.png', 'detail': 'high'}}]
58 | """
59 | try:
60 |             if self.tiktoken_encoder is None:
61 | self.tiktoken_encoder = tiktoken.encoding_for_model(model) # We store this value so we don't have to check again every time
62 | num_tokens = 0
63 | for message in messages:
64 | num_tokens += 4 # every message follows {role/name}\n{content}\n
65 | for key, value in message.items():
66 | if key == 'role':
67 | num_tokens += len(self.tiktoken_encoder.encode(value))
68 | elif key == 'content':
69 | # In the case that value is just a string, simply get its token value and move on
70 | if isinstance(value, str):
71 | num_tokens += len(self.tiktoken_encoder.encode(value))
72 | continue
73 |
74 |                         # In this case the 'content' variable's value is an array of dictionaries
75 | for message_data in value:
76 | for content_key, content_value in message_data.items():
77 | if content_key == 'type':
78 | num_tokens += len(self.tiktoken_encoder.encode(content_value))
79 | elif content_key == 'text':
80 | num_tokens += len(self.tiktoken_encoder.encode(content_value))
81 | elif content_key == "image_url":
82 | num_tokens += 1105 # Assumes the image is 1920x1080 and that detail is set to high
83 | num_tokens += 2 # every reply is primed with assistant
84 | return num_tokens
85 | except Exception:
86 | # Either this model is not implemented in tiktoken, or there was some error processing the messages
87 | raise NotImplementedError(f"""num_tokens_from_messages() is not presently implemented for model {model}.""")
88 |
89 | # Asks a question with no chat history
90 | def chat(self, prompt=""):
91 | if not prompt:
92 | print("Didn't receive input!")
93 | return
94 |
95 | # Check that the prompt is under the token context limit
96 | chat_question = [{"role": "user", "content": prompt}]
97 | if self.num_tokens_from_messages(chat_question) > 128000:
98 | print("The length of this chat question is too large for the GPT model")
99 | return
100 |
101 | print("[yellow]\nAsking ChatGPT a question...")
102 | completion = self.client.chat.completions.create(
103 | model="gpt-4o",
104 | messages=chat_question
105 | )
106 |
107 | # Process the answer
108 | openai_answer = completion.choices[0].message.content
109 | if self.logging:
110 | print(f"[green]\n{openai_answer}\n")
111 | return openai_answer
112 |
113 | # Analyze an image without history
114 | # Works with jpg, jpeg, or png. Alternatively can provide an image URL by setting local_image to False
115 | # More info here: https://platform.openai.com/docs/guides/vision
116 | def analyze_image(self, prompt, image_path, local_image=True):
117 | # Use default prompt if one isn't provided
118 | if prompt is None:
119 | prompt = "Please give me a detailed description of this image."
120 | # If this is a local image, encode it into base64. Otherwise just use the provided URL.
121 | if local_image:
122 | try:
123 | with open(image_path, "rb") as image_file:
124 | base64_image = base64.b64encode(image_file.read()).decode("utf-8")
125 | url = f"data:image/jpeg;base64,{base64_image}"
126 | except:
127 | print("[red]ERROR: COULD NOT BASE64 ENCODE THE IMAGE. PANIC!!")
128 | return None
129 | else:
130 | url = image_path # The provided image path is a URL
131 | if self.logging:
132 | print("[yellow]\nAsking ChatGPT to analyze image...")
133 | completion = self.client.chat.completions.create(
134 | model="gpt-4o",
135 | messages=[
136 | {
137 | "role": "user",
138 | "content": [
139 | {"type": "text", "text": prompt},
140 | {
141 | "type": "image_url",
142 | "image_url": {
143 | "url": url,
144 | "detail": "high"
145 | }
146 | },
147 | ],
148 | },
149 | ],
150 | max_tokens=4096, # max of 4096 tokens as of Dec 25th 2023
151 | )
152 | openai_answer = completion.choices[0].message.content
153 | if self.logging:
154 | print(f"[green]\n{openai_answer}\n")
155 | return openai_answer
156 |
157 |
158 | # Asks a question that includes the full conversation history
159 | # Can include a mix of text and images
160 | def chat_with_history(self, prompt="", image_path="", local_image=True):
161 |
162 | # If we received a prompt, add it into our chat history.
163 |         # Prompts are technically optional because the AI can just continue the conversation from where it left off.
164 | if prompt is not None and prompt != "":
165 | # Create a new chat message with the text prompt
166 | new_chat_message = {
167 | "role": "user",
168 | "content": [
169 | {"type": "text", "text": prompt},
170 | ],
171 | }
172 | # If an image is provided, add the image url info into our new message.
173 | if image_path != "":
174 | # If this is a local image, we encode it into base64. Otherwise just use the provided URL.
175 | if local_image:
176 | try:
177 | with open(image_path, "rb") as image_file:
178 | base64_image = base64.b64encode(image_file.read()).decode("utf-8")
179 | url = f"data:image/jpeg;base64,{base64_image}"
180 | except:
181 | print("[red]ERROR: COULD NOT BASE64 ENCODE THE IMAGE. PANIC!!")
182 | return None
183 | else:
184 | url = image_path # The provided image path is a URL
185 | new_image_content = {
186 | "type": "image_url",
187 | "image_url": {
188 | "url": url,
189 | "detail": "high"
190 | }
191 | }
192 | new_chat_message["content"].append(new_image_content)
193 |
194 | # Add the new message into our chat history
195 | self.chat_history.append(new_chat_message)
196 |
197 | # Check total token limit. Remove old messages as needed
198 | if self.logging:
199 | print(f"[coral]Chat History has a current token length of {self.num_tokens_from_messages(self.chat_history)}")
200 | while self.num_tokens_from_messages(self.chat_history) > 128000:
201 | self.chat_history.pop(1) # We skip the 1st message since it's the system message
202 | if self.logging:
203 | print(f"Popped a message! New token length is: {self.num_tokens_from_messages(self.chat_history)}")
204 |
205 | if self.logging:
206 | print("[yellow]\nAsking ChatGPT a question...")
207 | completion = self.client.chat.completions.create(
208 | model="gpt-4o",
209 | messages=self.chat_history
210 | )
211 |
212 | # Add this answer to our chat history
213 | self.chat_history.append({"role": completion.choices[0].message.role, "content": completion.choices[0].message.content})
214 |
215 | # If a backup file was provided, write out convo history to the txt file
216 | self.save_chat_to_backup()
217 |
218 | # Return answer
219 | openai_answer = completion.choices[0].message.content
220 | if self.logging:
221 | print(f"[green]\n{openai_answer}\n")
222 | return openai_answer
223 |
224 |
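
# Example usage (an illustrative sketch, not part of the original module):
# requires OPENAI_API_KEY to be set; the system prompt and backup file name are placeholders.
if __name__ == '__main__':
    test_manager = OpenAiManager({"role": "system", "content": "You are a helpful assistant."}, "backup_history_test.txt")
    print(test_manager.chat_with_history("Name three classic videogames."))
    print(test_manager.chat_with_history("Of those three, which came out first?"))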
--------------------------------------------------------------------------------
/multi_agent_gpt.py:
--------------------------------------------------------------------------------
1 | # This code runs a thread that manages the frontend code, a thread that listens for keyboard presses from the human, and then threads for the 3 agents
2 | # Once running, the human can activate a single agent and then let the agents continue an ongoing conversation.
3 | # Each thread has the following core logic:
4 |
5 | # Main Thread
6 | # Runs the web app
7 |
8 | # Agent X
9 | # Waits to be activated
10 | # Once it is activated (by Doug or by another agent):
11 | # Acquire conversation lock
12 | # Get response from OpenAI
13 | # Add this new response to all other agents' chat histories
14 | # Creates TTS with ElevenLabs
15 | # Acquire speaking lock (so only 1 speaks at a time)
16 | # Pick another thread randomly, activate them
17 | #       Because this happens within the speaking lock, we are guaranteed that the other agents are inactive when this is called.
18 | # But, we start this now so that the next speaker can have their answer and audio ready to go the instant this agent is done talking.
19 | # Update client and OBS to display stuff
20 | # Play the TTS audio
21 | # Release speaking lock (Other threads can now talk)
22 |
23 | # Human Input Thread
24 | # Listens for keypresses:
25 |
26 | # If Numpad 7 is pressed:
27 | # Sets the "pause" flag - stops other agents from activating additional agents
28 |
29 | # Record mic audio (until you press Numpad 8)
30 |
31 | # Get convo lock (but not speaking lock)
32 | # In theory, wait until everyone is done speaking, and because the agents are "paused" then no new ones will add to the convo
33 | # But to be safe, grab the convo lock to ensure that all agents HAVE to wait until my response is added into the convo history
34 |
35 | # Transcribe mic audio into text with Whisper
36 | # Add Doug's response into all agents' chat history
37 |
38 | # Release the convo lock
39 | # (then optionally press a key to trigger a specific bot)
40 |
41 | # If F4 pressed:
42 | # Toggles "pause" flag - stops all other agents from activating additional agents
43 |
44 | # If 1 pressed:
45 | # Turns off "pause" flag
46 | # Activates Agent 1
47 |
48 | # If 2 pressed:
49 | # Turns off "pause" flag
50 | # Activates Agent 2
51 |
52 | # If 3 pressed:
53 | # Turns off "pause" flag
54 | # Activates Agent 3
55 |
56 | from flask import Flask, render_template, session, request
57 | from flask_socketio import SocketIO, emit
58 | import threading
59 | import time
60 | import keyboard
61 | import random
62 | import logging
63 | from rich import print
64 |
65 | from audio_player import AudioManager
66 | from eleven_labs import ElevenLabsManager
67 | from openai_chat import OpenAiManager
68 | from whisper_openai import WhisperManager
69 | from obs_websockets import OBSWebsocketsManager
70 | from ai_prompts import *
71 |
72 | socketio = SocketIO
73 | app = Flask(__name__)
74 | app.config['SERVER_NAME'] = "127.0.0.1:5151"
75 | socketio = SocketIO(app, async_mode="threading")
76 | log = logging.getLogger('werkzeug') # Sets flask app to only print error messages, rather than all info logs
77 | log.setLevel(logging.ERROR)
78 |
79 | @app.route("/")
80 | def home():
81 | return render_template('index.html')
82 |
83 | @socketio.event
84 | def connect():
85 | print("[green]The server connected to client!")
86 |
87 | obswebsockets_manager = OBSWebsocketsManager()
88 | whisper_manager = WhisperManager()
89 | elevenlabs_manager = ElevenLabsManager()
90 | audio_manager = AudioManager()
91 |
92 | speaking_lock = threading.Lock()
93 | conversation_lock = threading.Lock()
94 |
95 | agents_paused = False
96 |
97 | # Class that represents a single ChatGPT Agent and its information
98 | class Agent():
99 |
100 | def __init__(self, agent_name, agent_id, filter_name, all_agents, system_prompt, elevenlabs_voice):
101 | # Flag of whether this agent should begin speaking
102 | self.activated = False
103 | # Used to identify each agent in the conversation history
104 | self.name = agent_name
105 | # an int used to ID this agent to the frontend code
106 | self.agent_id = agent_id
107 | # the name of the OBS filter to activate when this agent is speaking
108 | # You don't need to use OBS filters as part of this code, it's optional for adding extra visual flair
109 | self.filter_name = filter_name
110 | # A list of the other agents, so that you can pick one to randomly "activate" when you finish talking
111 | self.all_agents = all_agents
112 | # The name of the Elevenlabs voice that you want this agent to speak with
113 | self.voice = elevenlabs_voice
114 | # The name of the txt backup file where this agent's conversation history will be stored
115 | backup_file_name = f"backup_history_{agent_name}.txt"
116 |         # Initialize the OpenAi manager with a system prompt and a file that you would like to save your conversation to
117 | # If the backup file isn't empty, then it will restore that backed up conversation for this agent
118 | self.openai_manager = OpenAiManager(system_prompt, backup_file_name)
119 | # Optional - tells the OpenAi manager not to print as much
120 | self.openai_manager.logging = False
121 |
122 | def run(self):
123 | while True:
124 | # Wait until we've been activated
125 | if not self.activated:
126 | time.sleep(0.1)
127 | continue
128 |
129 | self.activated = False
130 | print(f"[italic purple] {self.name} has STARTED speaking.")
131 |
132 | # This lock isn't necessary in theory, but for safety we will require this lock whenever updating any agent's convo history
133 | with conversation_lock:
134 | # Generate a response to the conversation
135 | openai_answer = self.openai_manager.chat_with_history("Okay what is your response? Try to be as chaotic and bizarre and adult-humor oriented as possible. Again, 3 sentences maximum.")
136 | openai_answer = openai_answer.replace("*", "")
137 | print(f'[magenta]Got the following response:\n{openai_answer}')
138 |
139 | # Add your new response into everyone else's chat history, then have them save their chat history
140 |             # This agent's responses are marked as "assistant" role to itself, so everyone else's messages are "user" role.
141 | for agent in self.all_agents:
142 | if agent is not self:
143 | agent.openai_manager.chat_history.append({"role": "user", "content": f"[{self.name}] {openai_answer}"})
144 | agent.openai_manager.save_chat_to_backup()
145 |
146 | # Create audio response
147 | tts_file = elevenlabs_manager.text_to_audio(openai_answer, self.voice, False)
148 |
149 | # Process the audio to get subtitles
150 | audio_and_timestamps = whisper_manager.audio_to_text(tts_file, "sentence")
151 |
152 | # Wait here until the current speaker is finished
153 | with speaking_lock:
154 |
155 | # If we're "paused", then simply finish speaking without activating another agent
156 | # Otherwise, pick another agent randomly, then activate it
157 | if not agents_paused:
158 | other_agents = [agent for agent in self.all_agents if agent is not self]
159 | random_agent = random.choice(other_agents)
160 | random_agent.activated = True
161 |
162 | # Activate move filter on the image
163 | obswebsockets_manager.set_filter_visibility("Line In", self.filter_name, True)
164 |
165 | # Play the TTS audio (without pausing)
166 | audio_manager.play_audio(tts_file, False, False, True)
167 |
168 | # While the audio is playing, display each sentence on the front-end
169 | # Each dictionary will look like: {'text': 'here is my speech', 'start_time': 11.58, 'end_time': 14.74}
170 | socketio.emit('start_agent', {'agent_id': self.agent_id})
171 | try:
172 | for i in range(len(audio_and_timestamps)):
173 | current_sentence = audio_and_timestamps[i]
174 | duration = current_sentence['end_time'] - current_sentence['start_time']
175 | socketio.emit('agent_message', {'agent_id': self.agent_id, 'text': f"{current_sentence['text']}"})
176 | time.sleep(duration)
177 |                     # If this is not the final sentence, sleep for the gap of time in between this sentence and the next one starting
178 | if i < (len(audio_and_timestamps) - 1):
179 | time_between_sentences = audio_and_timestamps[i+1]['start_time'] - current_sentence['end_time']
180 | time.sleep(time_between_sentences)
181 | except Exception:
182 | print(f"[magenta] Whoopsie! There was a problem and I don't know why. This was the current_sentence it broke on: {current_sentence}")
183 | socketio.emit('clear_agent', {'agent_id': self.agent_id})
184 |
185 | time.sleep(1) # Wait one second before the next person talks, otherwise their audio gets cut off
186 |
187 | # Turn off the filter in OBS
188 | obswebsockets_manager.set_filter_visibility("Line In", self.filter_name, False)
189 |
190 | print(f"[italic purple] {self.name} has FINISHED speaking.")
191 |
192 |
193 | # Class that handles human input, this thread is how you can manually activate or pause the other agents
194 | class Human():
195 |
196 | def __init__(self, name, all_agents):
197 | self.name = name # This will be added to the beginning of the response
198 | self.all_agents = all_agents
199 |
200 | def run(self):
201 | global agents_paused
202 | while True:
203 |
204 | # Speak into mic and add the dialogue to the chat history
205 | if keyboard.is_pressed('num 7'):
206 |
207 |                 # Set the "pause" flag - stops other agents from activating additional agents
208 | agents_paused = True
209 | print(f"[italic red] Agents have been paused")
210 |
211 |                 # Record mic audio from Doug (until he presses Num 8)
212 | print(f"[italic green] DougDoug has STARTED speaking.")
213 | mic_audio = audio_manager.record_audio(end_recording_key='num 8')
214 |
215 | with conversation_lock:
216 | # Transcribe mic audio into text with Whisper
217 | transcribed_audio = whisper_manager.audio_to_text(mic_audio)
218 | print(f"[teal]Got the following audio from Doug:\n{transcribed_audio}")
219 |
220 | # Add Doug's response into all agents chat history
221 | for agent in self.all_agents:
222 | agent.openai_manager.chat_history.append({"role": "user", "content": f"[{self.name}] {transcribed_audio}"})
223 | agent.openai_manager.save_chat_to_backup() # Tell the other agents to save their chat history to their backup file
224 |
225 | print(f"[italic magenta] DougDoug has FINISHED speaking.")
226 |
227 | # Activate another agent randomly
228 | agents_paused = False
229 | random_agent = random.randint(0, len(self.all_agents)-1)
230 | print(f"[cyan]Activating Agent {random_agent+1}")
231 | self.all_agents[random_agent].activated = True
232 |
233 |
234 | # "Pause" the other agents.
235 | # Whoever is currently speaking will finish, but no future agents will be activated
236 | if keyboard.is_pressed('f4'):
237 | print("[italic red] Agents have been paused")
238 | agents_paused = True
239 | time.sleep(1) # Wait for a bit to ensure you don't press this twice in a row
240 |
241 | # Activate Agent 1
242 | if keyboard.is_pressed('num 1'):
243 | print("[cyan]Activating Agent 1")
244 | agents_paused = False
245 | self.all_agents[0].activated = True
246 | time.sleep(1) # Wait for a bit to ensure you don't press this twice in a row
247 |
248 | # Activate Agent 2
249 | if keyboard.is_pressed('num 2'):
250 | print("[cyan]Activating Agent 2")
251 | agents_paused = False
252 | self.all_agents[1].activated = True
253 | time.sleep(1) # Wait for a bit to ensure you don't press this twice in a row
254 |
255 | # Activate Agent 3
256 | if keyboard.is_pressed('num 3'):
257 | print("[cyan]Activating Agent 3")
258 | agents_paused = False
259 | self.all_agents[2].activated = True
260 | time.sleep(1) # Wait for a bit to ensure you don't press this twice in a row
261 |
262 | time.sleep(0.05)
263 |
264 |
265 |
266 | def start_bot(bot):
267 | bot.run()
268 |
269 | if __name__ == '__main__':
270 |
271 | all_agents = []
272 |
273 | # Agent 1
274 | agent1 = Agent("OSWALD", 1, "Audio Move - Wario Pepper", all_agents, VIDEOGAME_AGENT_1, "Dougsworth")
275 | agent1_thread = threading.Thread(target=start_bot, args=(agent1,))
276 | agent1_thread.start()
277 |
278 | # Agent 2
279 | agent2 = Agent("TONY KING OF NEW YORK", 2, "Audio Move - Waluigi Pepper", all_agents, VIDEOGAME_AGENT_2, "Tony Emperor of New York")
280 | agent2_thread = threading.Thread(target=start_bot, args=(agent2,))
281 | agent2_thread.start()
282 |
283 | # Agent 3
284 | agent3 = Agent("VICTORIA", 3, "Audio Move - Gamer Pepper", all_agents, VIDEOGAME_AGENT_3, "Victoria")
285 | agent3_thread = threading.Thread(target=start_bot, args=(agent3,))
286 | agent3_thread.start()
287 |
288 | all_agents.append(agent1)
289 | all_agents.append(agent2)
290 | all_agents.append(agent3)
291 |
292 | # Human thread
293 | human = Human("DOUGDOUG", all_agents)
294 | human_thread = threading.Thread(target=start_bot, args=(human,))
295 | human_thread.start()
296 |
297 |     print("[italic green]!!AGENTS ARE READY TO GO!!\nPress Num 1, Num 2, or Num 3 to activate an agent.\nPress Num 7 to speak to the agents (and Num 8 to stop recording).")
298 |
299 | socketio.run(app)
300 |
301 | agent1_thread.join()
302 | agent2_thread.join()
303 | agent3_thread.join()
304 | human_thread.join()
--------------------------------------------------------------------------------