├── output └── placeholder.txt ├── story_book_agents ├── tools │ ├── __init__.py │ ├── voice.py │ ├── pptx.py │ ├── video.py │ ├── image.py │ └── utils.py ├── agent_manager.py ├── producer_agent.py ├── it_assistant_agent.py ├── image_generation_agent.py ├── story_critic_agent.py ├── story_editor_agent.py ├── storyboard_critic_agent.py ├── image_critic_agent.py ├── __init__.py ├── story_draft_groupchat.py ├── receptionist_agent.py ├── storyboard_editor_agent.py ├── text_to_image_prompt_critic_agent.py ├── image_creator_agent.py ├── storyboard_groupchat.py └── text_to_image_prompt_editor_agent.py ├── images └── MultiAgents.jpg ├── requirements.txt ├── DEMO-Results.md ├── pyproject.toml ├── LICENSE ├── .env.example ├── app.py ├── generate.py ├── README.zh-cn.md ├── README.ja-jp.md ├── README.md └── .gitignore /output/placeholder.txt: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /story_book_agents/tools/__init__.py: -------------------------------------------------------------------------------- 1 | -------------------------------------------------------------------------------- /images/MultiAgents.jpg: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/breakstring/Agentic_Story_Book_Workflow/HEAD/images/MultiAgents.jpg -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | agentops==0.2.6 2 | anthropic==0.32.0 3 | azure-cognitiveservices-speech==1.38.0 4 | llmlingua==0.2.2 5 | moviepy==1.0.3 6 | pip-chill==1.0.3 7 | pyautogen==0.2.33 8 | python-pptx==0.6.23 9 | replicate==0.27.0 10 | tinydb==4.8.0 11 | -------------------------------------------------------------------------------- /DEMO-Results.md: -------------------------------------------------------------------------------- 1 | 2 | ## result - 橘猫 3 | 4 | 5 | https://github.com/user-attachments/assets/83899004-3c00-4f80-b477-24c2b1e2da72 6 | 7 | https://github.com/user-attachments/assets/2cf59a5a-142a-4cb9-8046-45248c10ddf6 8 | 9 | https://github.com/user-attachments/assets/96b93c26-f7cb-4703-aff4-5993cd8026b8 10 | 11 | 12 | 13 | 14 | ## result - 小马过河 15 | 16 | https://github.com/user-attachments/assets/9a4ceaac-d730-492a-8e66-46a4af7e5af6 17 | 18 | https://github.com/user-attachments/assets/b7e36844-5859-46f3-85f6-cab6bbdf771c 19 | 20 | https://github.com/user-attachments/assets/6b884ec1-39a8-4113-aa43-071eaa7d128a 21 | 22 | -------------------------------------------------------------------------------- /story_book_agents/agent_manager.py: -------------------------------------------------------------------------------- 1 | """This module contains the AgentManager class which is used to manage all the default agent instance in the story book system""" 2 | 3 | class AgentManager: 4 | """Agent Manager used to manage all the default agent instance in the story book system""" 5 | def __init__(self) -> None: 6 | self.default_receptionist_agent = None 7 | self.default_story_editor_agent = None 8 | self.default_it_assistant_agent = None 9 | self.default_story_critic_agent = None 10 | self.default_producer_agent = None 11 | self.default_storyboard_editor_agent = None 12 | self.defualt_storyboard_critic_agent = None 13 | self.default_text_to_image_prompt_editor_agent = None 14 | self.default_text_to_image_prompt_critic_agent = None 15 | 
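# The attributes above start as None and are populated once by
# story_book_agents.init_agents() (see story_book_agents/__init__.py); the
# module-level singleton created below is then shared across the workflow.
# Typical access pattern (illustrative sketch; the LLM config values are assumptions):
#
#     import story_book_agents
#     from story_book_agents import agent_manager_instance
#
#     story_book_agents.init_agents([{"model": "...", "api_key": "...", "base_url": "..."}])
#     receptionist = agent_manager_instance.default_receptionist_agent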
16 | agent_manager_instance = AgentManager() 17 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [tool.pyright] 2 | include = ["app.py","story_book_agents"] 3 | exclude = [".cache","**/__pycache__"] 4 | [tool.poetry] 5 | name = "agentic-story-book-workflow" 6 | version = "0.1.0" 7 | description = "A multi-agent workflow framework for creating children's picture books based on AutoGen." 8 | authors = ["Kenn Zhang "] 9 | license = "MIT" 10 | readme = "README.md" 11 | 12 | [tool.poetry.dependencies] 13 | python = "^3.11" 14 | pyautogen = "0.2.33" 15 | agentops = "0.2.6" 16 | anthropic="0.32.0" 17 | azure-cognitiveservices-speech="1.38.0" 18 | llmlingua="0.2.2" 19 | moviepy="1.0.3" 20 | pip-chill="1.0.3" 21 | python-pptx="0.6.23" 22 | replicate="0.27.0" 23 | tinydb="4.8.0" 24 | 25 | [tool.poetry.group.dev.dependencies] 26 | pip-chill = "1.0.3" 27 | 28 | [build-system] 29 | requires = ["poetry-core"] 30 | build-backend = "poetry.core.masonry.api" 31 | -------------------------------------------------------------------------------- /story_book_agents/producer_agent.py: -------------------------------------------------------------------------------- 1 | """ This module contains the ProducerAgent class.""" 2 | from autogen import AssistantAgent 3 | 4 | 5 | PRODUCER_AGENT_NAME = "Producer" 6 | 7 | PRODUCER_AGENT_SYSTEM_MESSAGE = """ 8 | You are a producer for a children's storybook creation team. Your task is to assist team members in completing the picture book using only the tools provided to you. 9 | """ 10 | PRODUCER_AGENT_DESCRIPTION = "Producer, assisting team members in completing the picture book using the provided tools." 11 | 12 | class ProducerAgent(AssistantAgent): 13 | """ Producer for the children's storybook creation team; assists team members in completing the picture book. """ 14 | 15 | def __init__(self, gpt_config): 16 | super().__init__( 17 | name=PRODUCER_AGENT_NAME, 18 | llm_config=gpt_config, 19 | system_message=PRODUCER_AGENT_SYSTEM_MESSAGE, 20 | human_input_mode="NEVER", 21 | code_execution_config=False, 22 | description=PRODUCER_AGENT_DESCRIPTION 23 | ) 24 | -------------------------------------------------------------------------------- /story_book_agents/it_assistant_agent.py: -------------------------------------------------------------------------------- 1 | ''' IT assistant agent''' 2 | 3 | from autogen import AssistantAgent 4 | 5 | 6 | IT_ASSISTANT_AGENT_NAME="IT_Assistant" 7 | IT_ASSISTANT_AGENT_SYSTEM_MESSAGE=""" 8 | You are the IT assistant in the team. Your task is to assist team members in completing their tasks. 9 | Your main responsibilities are: 10 | - Generate a story ID for the story content. 11 | - Save the story content. 12 | 13 | """ 14 | IT_ASSISTANT_AGENT_DESCRIPTION = "IT assistant, generates the story ID and saves story content."
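# Division of labor (see story_book_agents/__init__.py): the Producer registers the
# storage tools for LLM proposal via register_for_llm, while this IT assistant
# registers the same functions for execution via register_for_execution, e.g.:
#
#     producer.register_for_llm(
#         name="save_story_content", description="save story content")(save_story_content)
#     it_assistant.register_for_execution(name="save_story_content")(save_story_content)
#
# `producer` and `it_assistant` stand for the instances created in init_agents().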
15 | 16 | class ITAssistantAgent(AssistantAgent): 17 | ''' IT assistant agent''' 18 | def __init__(self,gpt_config): 19 | super().__init__( 20 | name=IT_ASSISTANT_AGENT_NAME, 21 | system_message=IT_ASSISTANT_AGENT_SYSTEM_MESSAGE, 22 | description=IT_ASSISTANT_AGENT_DESCRIPTION, 23 | llm_config=gpt_config, 24 | code_execution_config=False, 25 | human_input_mode="NEVER", 26 | ) 27 | -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Kenn Zhang 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /.env.example: -------------------------------------------------------------------------------- 1 | # AGENTOPS 2 | AGENTOPS_API_KEY=agentops_api_key 3 | # LLM 4 | MODEL=model_deployment_name_or_modelname 5 | API_VERSION=azure_api_version 6 | API_TYPE=azure 7 | API_KEY=api_key 8 | BASE_URL=azure_base_url 9 | # Image Generation 10 | IMAGE_GENERATION_TYPE=azure # azure, openai, replicate... 
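# Note: the LLM settings above (MODEL, API_KEY, BASE_URL, API_TYPE, API_VERSION) are read by
# app.py and generate.py into an AutoGen config entry of the form
# {"model": MODEL, "api_key": API_KEY, "base_url": BASE_URL, "api_type": API_TYPE, "api_version": API_VERSION}.
# The IMAGE_* settings below are assumed to be consumed by story_book_agents/tools/image.py
# (the generate_image_by_prompt helper) when illustrations are generated.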
11 | IMAGE_SHAPE=landscape #landscape, portrait, square 12 | # Dall-E 3 13 | DALLE_MODEL=model_deployment_name_or_modelname 14 | DALLE_API_VERSION=azure_api_version 15 | DALLE_API_KEY=api_key 16 | DALLE_BASE_URL=azure_base_url 17 | DALLE_IMAGE_QUALITY=hd # hd, standard 18 | DALLE_IMAGE_STYLE=vivid #natural,vivid 19 | # replicate 20 | REPLICATE_API_TOKEN=repilicate_api_key 21 | REPLICATE_MODEL_NAME=replicate_model_name #black-forest-labs/flux-schnell,black-forest-labs/flux-dev,black-forest-labs/flux-pro 22 | # image generation policy 23 | IMAGE_GENERATION_RETRIES=3 24 | IMAGE_CRITICISM_RETRIES=2 25 | IMAGE_SAVE_FAILURED_IMAGES=True 26 | IMAGE_STYLE_KEYWORD=Pixar #Pixar,DreamWorks,CGSociety,Japanese comics,Hayao Miyazaki,Makoto Shinkai,Disney,cartoon illustration,anime 27 | # voice 28 | AZURE_SPEECH_KEY=azure_speech_key 29 | AZURE_SPEECH_REGION=azure_speech_region 30 | AZURE_SPEECH_VOICE_NAME=azure_speech_voice_name 31 | 32 | -------------------------------------------------------------------------------- /story_book_agents/image_generation_agent.py: -------------------------------------------------------------------------------- 1 | """ Image generation agent """ 2 | 3 | from typing import Dict, List, Optional, Tuple, Union 4 | from autogen import ConversableAgent, Agent 5 | from .tools.image import generate_image_by_prompt 6 | 7 | IMAGE_GENERATION_AGENT_NAME = "Image_Generator" 8 | 9 | 10 | class ImageGenerationAgent(ConversableAgent): 11 | """ This agent is responsible for generating images based on storyboard scripts for children's storybooks. """ 12 | 13 | def __init__(self, gpt_config, *args, **kwargs): 14 | super().__init__( 15 | name=IMAGE_GENERATION_AGENT_NAME, 16 | llm_config=gpt_config, 17 | *args, 18 | **kwargs) 19 | 20 | self.register_reply( 21 | [Agent, None], ImageGenerationAgent._generate_dalle_reply, position=0) 22 | 23 | def send( 24 | self, 25 | message: Union[Dict, str], 26 | recipient: Agent, 27 | request_reply: Optional[bool] = None, 28 | silent: Optional[bool] = False, 29 | ): 30 | # override and always "silent" the send out message; 31 | # otherwise, the print log would be super long! 
32 | super().send(message, recipient, request_reply, silent=True) 33 | 34 | def _generate_dalle_reply(self, messages: Optional[List[Dict]], sender: "Agent", config) -> Tuple[bool, Union[str, Dict, None]]: # pylint: disable=unused-argument 35 | 36 | if messages is None: 37 | messages = self._oai_messages[sender] 38 | 39 | prompt = messages[-1]["content"] 40 | img_url, revised_prompt = generate_image_by_prompt(prompt) 41 | 42 | # Return the OpenAI message format 43 | return True, {"content": [{"type": "image_url", "image_url": {"url": img_url}, "prompt": revised_prompt}]} 44 | -------------------------------------------------------------------------------- /story_book_agents/tools/voice.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | from typing import Annotated 4 | import azure.cognitiveservices.speech as speechsdk 5 | 6 | 7 | 8 | def text_to_speech(text: Annotated[str, "text to speech"], filename: Annotated[str, "filename to save"]) -> Annotated[bool, "result"]: 9 | """ 10 | TTS function use azure service 11 | 12 | Args: 13 | text (Annotated[str,"text to speech"]): text to speech 14 | filename (Annotated[str,"filename to save"]): filename to save 15 | 16 | Returns: 17 | Annotated[bool,"result"]: result 18 | 19 | """ 20 | speech_key = os.environ.get("AZURE_SPEECH_KEY") 21 | speech_region = os.environ.get("AZURE_SPEECH_REGION") 22 | speech_voice_name = os.environ.get( 23 | "AZURE_SPEECH_VOICE_NAME", "zh-CN-XiaoxiaoMultilingualNeural") 24 | speech_config = speechsdk.SpeechConfig(subscription=speech_key, 25 | region=speech_region) 26 | speech_config.speech_synthesis_voice_name = speech_voice_name 27 | 28 | audio_config = speechsdk.audio.AudioOutputConfig(filename=filename) 29 | 30 | speech_synthesizer = speechsdk.SpeechSynthesizer( 31 | speech_config=speech_config, audio_config=audio_config) 32 | result = speech_synthesizer.speak_text_async(text).get() 33 | 34 | if result.reason == speechsdk.ResultReason.SynthesizingAudioCompleted: 35 | print(f"Speech synthesized for text [{text}]") 36 | return True 37 | elif result.reason == speechsdk.ResultReason.Canceled: 38 | cancellation_details = result.cancellation_details 39 | print(f"Speech synthesis canceled: {cancellation_details.reason}") 40 | if cancellation_details.reason == speechsdk.CancellationReason.Error: 41 | print(f"Error details: {cancellation_details.error_details}") 42 | 43 | return False 44 | -------------------------------------------------------------------------------- /story_book_agents/tools/pptx.py: -------------------------------------------------------------------------------- 1 | from typing import Annotated 2 | from pptx import Presentation 3 | from pptx.util import Inches 4 | import os 5 | from PIL import Image 6 | from .utils import get_storyboard_by_story_id 7 | 8 | def create_pptx(story_id: Annotated[str,'story id']): 9 | 10 | output_directory= "./output/"+story_id 11 | storyboard = get_storyboard_by_story_id(story_id) 12 | 13 | prs = Presentation() 14 | 15 | # 获取所有子目录 16 | subdirs = sorted([d for d in os.listdir(output_directory) if os.path.isdir(os.path.join(output_directory, d))], key=int) 17 | 18 | for subdir in subdirs: 19 | subdir_path = os.path.join(output_directory, subdir) 20 | image_path = os.path.join(subdir_path, 'image.jpg') 21 | audio_path = os.path.join(subdir_path, 'voice.mp3') 22 | 23 | # 添加新的幻灯片 24 | slide_layout = prs.slide_layouts[5] # 使用空白布局 25 | slide = prs.slides.add_slide(slide_layout) 26 | 27 | # 获取当前索引的 storyboard 内容 28 | storyboard_item 
= next((item for item in storyboard if item["Index"] == subdir), None) 29 | if storyboard_item: 30 | notes_slide = slide.notes_slide 31 | text_frame = notes_slide.notes_text_frame 32 | text_frame.text = storyboard_item["StoryContent"] 33 | 34 | # 打开图片并获取尺寸 35 | img = Image.open(image_path) 36 | img_width, img_height = img.size 37 | slide_width = prs.slide_width 38 | slide_height = prs.slide_height 39 | 40 | # 计算缩放比例 41 | scale = min((slide_width - Inches(1)) / img_width, (slide_height - Inches(1)) / img_height) 42 | 43 | # 计算新的图片尺寸 44 | new_width = int(img_width * scale) 45 | new_height = int(img_height * scale) 46 | 47 | # 计算图片位置(居中) 48 | left = int((slide_width - new_width) / 2) 49 | top = int((slide_height - new_height) / 2) 50 | 51 | # 添加图片 52 | pic = slide.shapes.add_picture(image_path, left, top, new_width, new_height) 53 | 54 | # 添加音频 55 | audio_left = Inches(0) 56 | audio_top = Inches(0) 57 | audio_width = Inches(1) 58 | audio_height = Inches(1) 59 | slide.shapes.add_movie(audio_path, audio_left, audio_top, audio_width, audio_height) 60 | 61 | # 保存 PPTX 文件 62 | prs.save(os.path.join(output_directory, 'output.pptx')) 63 | -------------------------------------------------------------------------------- /story_book_agents/story_critic_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | This agent's role is to review the content of children's stories created by story editors and provide critical feedback. 3 | """ 4 | from autogen import AssistantAgent 5 | 6 | STORY_CRITIC_AGENT_NAME="Story_Critic" 7 | STORY_CRITIC_AGENT_SYSTEM_MESSAGE = """ 8 | You are an expert content reviewer for children's picture books, tasked with evaluating and providing feedback on stories created by other children's content creators. 9 | 10 | Your task is to review the story content based on the user's requirements and provide modification suggestions. When evaluating the story, consider the following criteria: 11 | 12 | 1. Content Safety: Ensure the story is appropriate for children, free from violence, sexual content, or other unsuitable themes. The story should convey positive and uplifting messages. 13 | 14 | 2. Engagement: Verify that the story is simple, easy for children to understand, and interesting enough to capture their attention. 15 | 16 | 3. Completeness: Check that the story is complete, without missing elements, and has educational value. 17 | 18 | 4. User Requirements: Ensure the story aligns with any specific requirements provided by the user. 19 | 20 | If you believe the story structure is already excellent and requires no modifications, simply respond with "CRITIC_DONE". 21 | 22 | If you think the story can be improved, simply provide your feedback using the following format: 23 | 24 | 25 | [Your modification suggestions here] 26 | 27 | 28 | Important notes: 29 | 1. Unless otherwise specified in the user requirements, provide your feedback in the same language as the story content. 30 | 2. When giving feedback, focus on general suggestions and directions for improvement rather than specific sentence-level rewrites. Avoid providing detailed descriptions or exact phrasings for the content creator to use. 31 | 32 | Remember to carefully review the story content and user requirements before formulating your feedback. 33 | 34 | """ 35 | STORY_CRITIC_AGENT_DESCRIPTION = "This agent's role is to review the content of children's stories created by story editors and provide critical feedback." 
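# Review protocol: the draft group's speaker-selection prompt (story_draft_groupchat.py)
# treats a critic reply containing "CRITIC_DONE" as approval, after which the Producer
# saves the story content.
#
# Illustrative sketch (not part of the pipeline) of exercising the critic on its own;
# `gpt_config` is an assumed AutoGen llm_config such as {"config_list": [...], "temperature": 0}:
#
#     from story_book_agents.story_editor_agent import StoryEditorAgent
#     editor = StoryEditorAgent(gpt_config)
#     critic = StoryCriticAgent(gpt_config)
#     critic.initiate_chat(editor, message="Please write a short story about a brave little kitten.", max_turns=4)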
36 | 37 | 38 | class StoryCriticAgent(AssistantAgent): 39 | """ This agent's role is to review the content of children's stories created by story editors and provide critical feedback. """ 40 | 41 | def __init__(self, gpt_config): 42 | super().__init__( 43 | name=STORY_CRITIC_AGENT_NAME, 44 | description=STORY_CRITIC_AGENT_DESCRIPTION, 45 | system_message=STORY_CRITIC_AGENT_SYSTEM_MESSAGE, 46 | max_consecutive_auto_reply=None, 47 | human_input_mode="NEVER", 48 | llm_config=gpt_config, 49 | code_execution_config=False, 50 | ) 51 | -------------------------------------------------------------------------------- /story_book_agents/story_editor_agent.py: -------------------------------------------------------------------------------- 1 | """ 2 | This agent is responsible for creating a kid's story book based on the user's input. 3 | """ 4 | from autogen import AssistantAgent 5 | 6 | STORY_EDITOR_AGENT_NAME="Story_Editor" 7 | STORY_EDITOR_AGENT_SYSTEM_MESSAGE=""" 8 | You are an AI assistant designed to help create text content for children's picture books. Your task is to create an engaging story outline based on the information provided by the user. Follow these instructions carefully: 9 | 10 | 1. Read the user's request carefully. 11 | 12 | 2. Create a story that meets the following content requirements: 13 | - Ensure the story is interesting and captivating for children. 14 | - Keep the content simple and easy to understand. 15 | - Make sure the story is positive and conveys good values. 16 | - Ensure the content is appropriate for children, avoiding any unsuitable themes or language. 17 | - Try to limit the number of characters in the story to three or fewer. Too many characters might be difficult for young children to remember. 18 | 19 | 3. Structure your story as follows: 20 | - Create a title for the story. 21 | - Write the story content as a continuous narrative. Do not divide it into pages or sections. 22 | 23 | 4. Language requirements: 24 | - If the user explicitly requests a specific language in their request, use that language for both the title and content. 25 | - If no language is specified, use the same language as the user's request for both the title and content. 26 | 27 | 5. Format your output strictly as follows, without any additional content: 28 | 29 | Story Title 30 | Story Content 31 | 32 | 33 | 6. If review feedback is provided, 34 | - Carefully read and consider the feedback. 35 | - Revise your story based on the feedback while maintaining all previous requirements (content, structure, language, etc.). 36 | - Provide your revised story using the same output format as before. 37 | 38 | Remember, your goal is to create an engaging, age-appropriate story that children will enjoy and learn from. Do not include any explanations or comments outside of the specified XML tags. 39 | """ 40 | 41 | STORY_EDITOR_AGENT_DESCRIPTION = "This agent is responsible for creating a kid's story book based on the user's input." 42 | 43 | 44 | class StoryEditorAgent(AssistantAgent): 45 | """ This agent is responsible for creating a kid's story book based on the user's input. 
""" 46 | 47 | def __init__(self, gpt_config): 48 | super().__init__( 49 | name=STORY_EDITOR_AGENT_NAME, 50 | description=STORY_EDITOR_AGENT_DESCRIPTION, 51 | system_message=STORY_EDITOR_AGENT_SYSTEM_MESSAGE, 52 | max_consecutive_auto_reply=None, 53 | human_input_mode="NEVER", 54 | llm_config=gpt_config, 55 | code_execution_config=False, 56 | ) 57 | 58 | -------------------------------------------------------------------------------- /story_book_agents/storyboard_critic_agent.py: -------------------------------------------------------------------------------- 1 | ''' storyboard critic agent ''' 2 | 3 | from autogen import AssistantAgent 4 | 5 | 6 | STORYBOARD_CRITIC_AGENT_NAME = "Storyboard_Critic" 7 | STORYBOARD_CRITIC_AGENT_SYSTEM_MESSAGE = """ 8 | You are an expert storyboard reviewer for a children's picture book creation team. Your task is to analyze the storyboard created by the previous storyboard artist based on the story content, and identify any issues with the format and content of this storyboard. 9 | 10 | The storyboard should follow this structure: 11 | 12 | [Frame number] 13 | [Frame content] 14 | [Visual content] 15 | 16 | 17 | Your task is to review and check the following aspects: 18 | 19 | 1. Does the storyboard conform to the specified XML format? 20 | 2. Are the sections numbered sequentially, starting from 1? 21 | 3. Do the sections maintain coherence and adhere to the original story's main plot and themes? (Note: minor story details are acceptable if they don't affect the overall narrative) 22 | 4. Do the sections accurately describe the visual representation of each frame? 23 | 24 | If you believe the storyboard content is satisfactory and can be approved, simply output "CRITIC_DONE" without any additional content. 25 | 26 | If you think the storyboard needs modifications, please output your critique strictly following this XML format: 27 | 28 | 29 | 30 | [Frame number] 31 | 32 | [Your modification suggestions here] 33 | 34 | 35 | ... 36 | 37 | 38 | Here are two examples of correct output formats: 39 | 40 | Example 1 (when no modifications are needed): 41 | STORYBOARD_CRITIC_DONE 42 | 43 | Example 2 (when modifications are needed): 44 | 45 | 46 | 2 47 | 48 | The in this frame deviates from the main plot. Consider revising to maintain story coherence. 49 | 50 | 51 | 52 | 5 53 | 54 | The lacks detail. Please provide a more specific description of the visual elements in this frame. 55 | 56 | 57 | 58 | 59 | Remember to be thorough in your review and provide clear, constructive feedback when necessary. 60 | """ 61 | 62 | STORYBOARD_CRITIC_AGENT_DESCRIPTION="""Storyboard critic agent, review the storyboard created by storyboard editors and provide critical feedback. 
""" 63 | 64 | class StoryboardCriticAgent(AssistantAgent): 65 | """Storyboard critic agent.""" 66 | def __init__(self, gpt_config): 67 | super().__init__( 68 | name=STORYBOARD_CRITIC_AGENT_NAME, 69 | system_message=STORYBOARD_CRITIC_AGENT_SYSTEM_MESSAGE, 70 | description=STORYBOARD_CRITIC_AGENT_DESCRIPTION, 71 | llm_config=gpt_config 72 | ) 73 | -------------------------------------------------------------------------------- /app.py: -------------------------------------------------------------------------------- 1 | """app.py""" 2 | import os 3 | import autogen 4 | import autogen.runtime_logging 5 | import autogen.types 6 | import dotenv 7 | import agentops 8 | 9 | from autogen import UserProxyAgent 10 | from story_book_agents import agent_manager_instance, story_draft_groupchat, storyboard_groupchat 11 | 12 | import story_book_agents 13 | 14 | 15 | # prepare the LLM configurations 16 | dotenv.load_dotenv(override=True) 17 | 18 | gpt_config_list_default = [{ 19 | "model": os.environ.get("MODEL"), 20 | "api_key": os.environ.get("API_KEY"), 21 | "base_url": os.environ.get("BASE_URL"), 22 | "api_type": os.environ.get("API_TYPE","azure"), 23 | "api_version": os.environ.get("API_VERSION","2024-06-01"), 24 | }] 25 | 26 | dalle_config = { 27 | "config_list": [{ 28 | "model": os.environ.get("DALLE_MODEL"), 29 | "api_key": os.environ.get("DALLE_API_KEY"), 30 | "base_url": os.environ.get("DALLE_BASE_URL"), 31 | "api_type": os.environ.get("DALLE_API_TYPE"), 32 | "api_version": os.environ.get("DALLE_API_VERSION"), 33 | }], 34 | "timeout": 120, 35 | "temperature": 0.7, 36 | "max_tokens": 2000 37 | 38 | } 39 | 40 | # toggle this line if you don't use agentops 41 | #agentops.init(api_key=os.environ.get("AGENTOPS_API_KEY")) 42 | 43 | story_book_agents.init_agents(gpt_config_list_default) 44 | # create a UserProxyAgent instance 45 | user_agent = UserProxyAgent( 46 | name="User", 47 | llm_config=False, 48 | human_input_mode="ALWAYS", 49 | code_execution_config=False, 50 | ) 51 | 52 | 53 | def main(): 54 | """Main function.""" 55 | preliminary_story_requirements = "给我讲一个故事吧" 56 | chat_results = autogen.agentchat.initiate_chats([ 57 | # Obtain preliminary requirements through the conversation with the Receptionist agent. 58 | { 59 | "sender": user_agent, 60 | "recipient": agent_manager_instance.default_receptionist_agent, 61 | "message": preliminary_story_requirements, 62 | "max_turns": 10, 63 | "summary_method": "reflection_with_llm", 64 | "summary_args": { 65 | "summary_prompt": """ 66 | Summarize the key points from the conversation without any introductory phrases. 67 | 68 | 1. The summary must be in the same language as the user's specified language for the picture book if explicitly mentioned. 69 | 2. If the user did not specify a language, use the main language used by the user throughout the conversation. 70 | 3. Ensure that the summary reflects the language used predominantly in the conversation, regardless of any predefined commands like "exit." 71 | 4. The summary should clearly reflect the user's requirements and preferences discussed during the conversation. 72 | """ 73 | } 74 | }, 75 | # The Story Draft Group includes agents such as the Story Editor, Story Critic, Producer, and IT Assistant. 76 | # They communicate through a group chat to create, review, and store the story content. 
77 | story_draft_groupchat.set_story_draft_chat() 78 | ]) 79 | story_id = chat_results[1].summary 80 | print("Story ID: " + story_id) 81 | chat_results2 = autogen.agentchat.initiate_chats( 82 | [ 83 | # in this storyboard chatgroup, the agents create/critic/save the storyboard base on the story content. 84 | storyboard_groupchat.set_storyboard_chat(story_id) 85 | ] 86 | ) 87 | print(chat_results2[0].summary) 88 | 89 | 90 | # entry point 91 | if __name__ == '__main__': 92 | main() 93 | -------------------------------------------------------------------------------- /story_book_agents/image_critic_agent.py: -------------------------------------------------------------------------------- 1 | """ Image critic agent """ 2 | 3 | from autogen import AssistantAgent 4 | 5 | 6 | IMAGE_CRITIC_AGENT_NAME = "Image_Critic" 7 | IMAGE_CRITIC_AGENT_SYSTEM_MESSAGE_TEMPLATE = """ 8 | You are a visual review expert for a children's storybook creation team. Your task is to review an image based on a provided storyboard script to determine if it meets the requirements for the storybook. 9 | 10 | First, carefully read and understand the following storyboard content: 11 | ```json 12 | {STORYBOARD} 13 | ``` 14 | 15 | The current frame number being reviewed is {FRAME_NUMBER}, and the prompt used to generate the image is: 16 | 17 | 18 | {PROMPT} 19 | 20 | 21 | Now, carefully examine the provided image. 22 | 23 | Analyze the image based on the following criteria: 24 | 25 | 1. Overall story coherence: Does the image align with the overall story content? 26 | 2. Current frame requirements: Does the image meet the specific needs of the current frame as described in the storyboard? 27 | 3. Children's storybook visual standards: Does the image adhere to typical visual requirements for children's storybooks? 28 | 4. Logical consistency: Are there any logical errors in the image that might hinder story comprehension? 29 | 5. Visual element improvements: Are there any details that could be enhanced, such as color scheme, environment, shapes, lighting, character layout, or camera angle? 30 | 31 | After your analysis, respond in one of two ways: 32 | 33 | 1. If you believe the image is satisfactory and doesn't require improvements, JUST SIMPLY OUTPUT: 34 | CRITIC_DONE 35 | 36 | 2. If you think the image needs improvement, JUST provide a new prompt in the following format only: 37 | PROMPT:[Your improved prompt here] 38 | 39 | 40 | Here are two examples of correct output formats: 41 | 42 | Example 1 (when no modifications are needed): 43 | CRITIC_DONE 44 | 45 | Example 2 (when modifications are needed): 46 | PROMPT:A rabbit named Xiao Bai, perched on a branch of a tall apple tree brimming with bright red apples. He carefully picks a large ripe apple and enthusiastically takes big bites, showing clear enjoyment and contentment on his face. The environment is vibrant with lush green leaves and an intoxicating shade of red apples. The scene is rich with color and depicts Xiao Bai savoring the flavor of the succulent apple while sitting on the branch. 47 | 48 | 49 | Ensure your improved prompt addresses the specific areas that need enhancement while maintaining the core elements of the original prompt and story requirements. 50 | And also please note that your prompt should avoid words and expressions related to violence, pornography, politics, and similar topics as much as possible. 
51 | """ 52 | 53 | IMAGE_CRITIC_AGENT_DESCRIPTION = "This agent is responsible for reviewing images based on storyboard scripts for children's storybooks." 54 | 55 | 56 | class ImageCriticAgent(AssistantAgent): 57 | """ This agent is responsible for reviewing images based on storyboard scripts for children's storybooks. """ 58 | 59 | def __init__(self, gpt_config, storyboard: str, frame_number: int, prompt: str, *args, **kwargs,): 60 | super().__init__( 61 | name=IMAGE_CRITIC_AGENT_NAME, 62 | description=IMAGE_CRITIC_AGENT_DESCRIPTION, 63 | system_message=IMAGE_CRITIC_AGENT_SYSTEM_MESSAGE_TEMPLATE.format( 64 | STORYBOARD=storyboard, FRAME_NUMBER=frame_number, PROMPT=prompt), 65 | max_consecutive_auto_reply=None, 66 | human_input_mode="NEVER", 67 | llm_config=gpt_config, 68 | code_execution_config=False, 69 | *args, 70 | **kwargs, 71 | ) 72 | -------------------------------------------------------------------------------- /story_book_agents/__init__.py: -------------------------------------------------------------------------------- 1 | """ agents modules """ 2 | 3 | from .story_editor_agent import StoryEditorAgent 4 | from .receptionist_agent import ReceptionistAgent 5 | from .it_assistant_agent import ITAssistantAgent 6 | from .story_critic_agent import StoryCriticAgent 7 | from .producer_agent import ProducerAgent 8 | from .storyboard_editor_agent import StoryboardEditorAgent 9 | from .storyboard_critic_agent import StoryboardCriticAgent 10 | from .text_to_image_prompt_editor_agent import TextToImagePromptEditorAgent 11 | from .text_to_image_prompt_critic_agent import TextToImagePromptCriticAgent 12 | from .tools.utils import save_story_content, load_story_content_by_id,save_storyboard_by_story_id,save_prompts_by_story_id 13 | 14 | 15 | from .agent_manager import agent_manager_instance 16 | 17 | 18 | gpt_config_high_temperature={ 19 | "config_list": [], 20 | "temperature": 0.7, 21 | "cache_seed": None, 22 | "max_tokens": 4096 23 | } 24 | gpt_config_low_temperature={ 25 | "config_list": [], 26 | "temperature": 0, 27 | "cache_seed": None, 28 | "max_tokens": 4096 29 | } 30 | 31 | def init_agents(default_gpt_config): 32 | """ init all agents """ 33 | gpt_config_high_temperature["config_list"]=default_gpt_config 34 | gpt_config_low_temperature["config_list"]=default_gpt_config 35 | 36 | # init agents 37 | agent_manager_instance.default_receptionist_agent = ReceptionistAgent( 38 | gpt_config_high_temperature) 39 | agent_manager_instance.default_story_editor_agent = StoryEditorAgent( 40 | gpt_config_high_temperature) 41 | agent_manager_instance.default_it_assistant_agent = ITAssistantAgent( 42 | gpt_config_low_temperature) 43 | agent_manager_instance.default_story_critic_agent = StoryCriticAgent( 44 | gpt_config_low_temperature) 45 | agent_manager_instance.default_producer_agent = ProducerAgent( 46 | gpt_config_low_temperature) 47 | agent_manager_instance.default_storyboard_editor_agent = StoryboardEditorAgent( 48 | gpt_config_high_temperature) 49 | agent_manager_instance.defualt_storyboard_critic_agent = StoryboardCriticAgent( 50 | gpt_config_low_temperature) 51 | agent_manager_instance.default_text_to_image_prompt_editor_agent = TextToImagePromptEditorAgent(gpt_config_high_temperature) 52 | agent_manager_instance.default_text_to_image_prompt_critic_agent = TextToImagePromptCriticAgent(gpt_config_low_temperature) 53 | 54 | agent_manager_instance.default_producer_agent.register_for_llm(name="save_story_content", description="save story content")(save_story_content) 55 | 
agent_manager_instance.default_producer_agent.register_for_llm(name="load_story_content_by_id", description="load story content by id")(load_story_content_by_id) 56 | agent_manager_instance.default_producer_agent.register_for_llm(name="save_storyboard_by_story_id", description="save storyboard by story id")(save_storyboard_by_story_id) 57 | agent_manager_instance.default_producer_agent.register_for_llm(name="save_prompts_by_story_id", description="save prompts by story id")(save_prompts_by_story_id) 58 | 59 | agent_manager_instance.default_it_assistant_agent.register_for_execution(name="save_story_content")(save_story_content) 60 | agent_manager_instance.default_it_assistant_agent.register_for_execution(name="load_story_content_by_id")(load_story_content_by_id) 61 | agent_manager_instance.default_it_assistant_agent.register_for_execution(name="save_storyboard_by_story_id")(save_storyboard_by_story_id) 62 | agent_manager_instance.default_it_assistant_agent.register_for_execution(name="save_prompts_by_story_id")(save_prompts_by_story_id) 63 | 64 | __all__ = ['init_agents', 'agent_manager_instance',"gpt_config_high_temperature","gpt_config_low_temperature"] 65 | -------------------------------------------------------------------------------- /generate.py: -------------------------------------------------------------------------------- 1 | 2 | import os 3 | import dotenv 4 | from autogen import UserProxyAgent 5 | from story_book_agents.tools.pptx import create_pptx 6 | from story_book_agents.tools.utils import get_storyboard_by_story_id, update_prompt_by_story_id_and_frame_number, get_last_story_id 7 | from story_book_agents.tools.image import save_image_from_url 8 | from story_book_agents.tools.video import create_video 9 | from story_book_agents.tools.voice import text_to_speech 10 | from story_book_agents.image_creator_agent import ImageCreatorAgent 11 | dotenv.load_dotenv(override=True) 12 | 13 | # get the last story id. if you have a specific story id, you can replace it with your own story id. 
14 | story_id = get_last_story_id() 15 | 16 | storyboard = get_storyboard_by_story_id(story_id=story_id) 17 | 18 | user_proxy = UserProxyAgent("UserProxyAgent", 19 | llm_config={"config_list": [{ 20 | "model": os.environ.get("MODEL"), 21 | "api_key": os.environ.get("API_KEY"), 22 | "base_url": os.environ.get("BASE_URL"), 23 | "api_type": os.environ.get("API_TYPE"), 24 | "api_version": os.environ.get("API_VERSION"), 25 | }]}, 26 | human_input_mode="NEVER", 27 | system_message="A human admin", 28 | code_execution_config=False, 29 | max_consecutive_auto_reply=0) 30 | 31 | 32 | 33 | def generate_image_by_frame_number(frame_number): 34 | image_creator = ImageCreatorAgent( 35 | gpt_config={"config_list": [{ 36 | "model": os.environ.get("MODEL"), 37 | "api_key": os.environ.get("API_KEY"), 38 | "base_url": os.environ.get("BASE_URL"), 39 | "api_type": os.environ.get("API_TYPE"), 40 | "api_version": os.environ.get("API_VERSION"), 41 | }]}, 42 | story_id=story_id, 43 | frame_number=int(frame_number), 44 | max_consecutive_auto_reply=0, 45 | ) 46 | 47 | result = user_proxy.initiate_chat( 48 | image_creator, 49 | message=f"This is frame {frame_number}", 50 | ) 51 | img_url = result.chat_history[-1]['content'][-1]['image_url']['url'] 52 | prompt = result.chat_history[-1]['content'][-1]['prompt'] 53 | update_prompt_by_story_id_and_frame_number( 54 | story_id=story_id, frame_number=int(frame_number), prompt=prompt) 55 | save_image_from_url(story_id=story_id, frame_index=int( 56 | frame_number), image_url=img_url, is_final=True) 57 | 58 | # generate images 59 | def generate_images(): 60 | for frame in storyboard: 61 | frame_number = frame["Index"] 62 | generate_image_by_frame_number(frame_number) 63 | 64 | # generate audio 65 | def generate_voice(): 66 | for frame in storyboard: 67 | frame_number = frame["Index"] 68 | story_content = frame["StoryContent"] 69 | 70 | voice_filename = "./output/" + story_id + "/" + frame_number + "/voice.mp3" 71 | text_to_speech(story_content, voice_filename) 72 | 73 | 74 | # entry point 75 | if __name__ == '__main__': 76 | # generate image by frame number, you can use this function to re-generate image by frame number 77 | #generate_image_by_frame_number(2) 78 | 79 | # generate images, if you have images, you can comment this line 80 | generate_images() 81 | 82 | # generate voice, if you have voice, you can comment this line 83 | generate_voice() 84 | # just a basic blank pptx template with image and audio, you can comment this line if you donot need the pptx. 85 | create_pptx(story_id=story_id) 86 | # create video 87 | create_video(story_id=story_id) 88 | 89 | -------------------------------------------------------------------------------- /story_book_agents/story_draft_groupchat.py: -------------------------------------------------------------------------------- 1 | """ Story Draft Group Chat """ 2 | 3 | from typing import Union 4 | from autogen.agentchat import Agent, GroupChat, GroupChatManager 5 | 6 | import story_book_agents 7 | from .agent_manager import agent_manager_instance 8 | 9 | 10 | STORY_DRAFT_GROUP_SELECT_SPEAKER_MESSAGE_TEMPLATE = """ 11 | You are an AI assistant acting as the administrator of a story draft group. Your task is to determine who should speak next in the story creation process based on the current conversation context and the established workflow. 12 | 13 | The story draft group follows this workflow: 14 | 1. The Story_Editor writes the story content. 15 | 2. Once the story content is written, the Story_Critic reviews it and provides feedback. 
16 | 3. If the story needs revisions, the Story_Editor makes the necessary changes. 17 | 4. If no further revisions are needed (indicated by "CRITIC_DONE" in the response), the Producer saves the story content. 18 | 19 | Your job is to analyze the conversation context and determine which role should speak next according to this workflow. 20 | 21 | To make your determination: 22 | 1. Examine the last speaker and their contribution in the conversation context. 23 | 2. Consider the current stage of the story creation process based on the workflow. 24 | 3. Identify the appropriate next step and the corresponding role that should speak. 25 | 26 | Provide your response by simply stating the role of the next speaker. Do not include any additional explanation or content. Your response should be one of the following: 27 | - Story_Editor 28 | - Story_Critic 29 | - Producer 30 | """ 31 | 32 | def story_draft_group_speaker_selection_func( 33 | last_speaker: Agent, groupchat: GroupChat 34 | ) -> Union[Agent, str, None]: 35 | """ 36 | Determine the next speaker in the story draft group based on the conversation context and the established workflow. 37 | """ 38 | _groupchat = groupchat 39 | if last_speaker == agent_manager_instance.default_it_assistant_agent: 40 | return None 41 | else: 42 | return "auto" 43 | 44 | def init_story_draft_groupchat()->GroupChat: 45 | story_draft_groupchat = GroupChat( 46 | agents=[ 47 | agent_manager_instance.default_producer_agent, 48 | agent_manager_instance.default_story_editor_agent, 49 | agent_manager_instance.default_story_critic_agent, 50 | agent_manager_instance.default_it_assistant_agent, 51 | ], 52 | messages=[], 53 | max_round=10, 54 | speaker_selection_method=story_draft_group_speaker_selection_func, 55 | select_speaker_message_template=STORY_DRAFT_GROUP_SELECT_SPEAKER_MESSAGE_TEMPLATE, 56 | select_speaker_prompt_template=None, 57 | ) 58 | return story_draft_groupchat 59 | 60 | def init_story_draft_group_manager()->GroupChatManager: 61 | story_draft_group_manager = GroupChatManager( 62 | name="StoryDraftGroup", 63 | groupchat=init_story_draft_groupchat(), 64 | llm_config=story_book_agents.gpt_config_low_temperature, 65 | human_input_mode="NEVER", 66 | code_execution_config=False, 67 | silent=False, 68 | ) 69 | return story_draft_group_manager 70 | 71 | def set_story_draft_chat(): 72 | story_draft_chat = { 73 | "sender": agent_manager_instance.default_receptionist_agent, 74 | "recipient": init_story_draft_group_manager(), 75 | "message": "User requirements have been compiled. Please create the story content and save it.", 76 | "max_turns": 1, 77 | "summary_method": "reflection_with_llm", 78 | "summary_args": { 79 | "summary_prompt": """ 80 | Please summarize the story ID from the above conversation and output the story ID directly. Do not output any other content. 81 | """ 82 | } 83 | } 84 | return story_draft_chat 85 | -------------------------------------------------------------------------------- /story_book_agents/receptionist_agent.py: -------------------------------------------------------------------------------- 1 | ''' This module defines the ReceptionistAgent class, which is responsible for collecting user requirements.''' 2 | 3 | from autogen import AssistantAgent 4 | 5 | RECEPTIONIST_AGENT_NAME = "Receptionist" 6 | RECEPTIONIST_AGENT_SYSTEM_MESSAGE = """ 7 | You are an AI assistant acting as a receptionist for a children's storybook creation team. 8 | Your primary task is to collect user requirements for creating engaging picture books for children. 
9 | Follow these instructions carefully: 10 | 11 | 1. Introduction and Role: 12 | - Introduce yourself as the receptionist for a children's storybook creation team. 13 | - Explain that your goal is to gather information about the user's storybook requirements. 14 | 2. Guidelines for Interaction: 15 | - Use a step-by-step, question-and-answer approach when communicating with the user. 16 | - Ask one question at a time to avoid overwhelming the user. 17 | - Keep your questions and responses concise and easy to understand. 18 | - Do not provide multiple options or too much information in a single message. 19 | 3. Collecting User Requirements: 20 | - Be flexible in your questioning approach. Adapt your questions based on the user's responses and needs. 21 | - Cover various aspects of storybook creation, such as plot, characters, setting, theme, style, and any specific elements the user wants to include. 22 | - If the user requests a known story (e.g., "The Little Match Girl"), accept this request and ask if they want any modifications or adaptations to the original story. 23 | - Be open to both original story ideas and adaptations of existing stories. 24 | 4. Handling Off-Topic Conversations: 25 | - If the user provides information unrelated to storybook creation, politely acknowledge it and redirect the conversation back to the task at hand. 26 | - Use phrases like "That's interesting, but let's focus on your storybook requirements. Can you tell me more about [relevant aspect of the story]?" 27 | - There is no need to inquire about the target age group of the story unless the user brings it up; otherwise, we will assume that the story is aimed at the preschool age group of 3 to 6 years old. 28 | 5. Ending the Conversation: 29 | - Once you have gathered sufficient information, inform the user that their requirements have been collected. 30 | - Summarize the key points of the user's requirements without creating or suggesting any story content. 31 | - Explain that the storybook creation team will use this information to create the story. 32 | - Instruct the user to type "exit" to end the current session and wait for the team's creation. 33 | 6. Important Restrictions: 34 | - Do not create, write, or suggest any story content, plot details, or character developments. 35 | - Your role is strictly to collect and clarify requirements, not to produce any part of the story. 36 | - If the user asks for a sample or example of the story, politely explain that you're not able to provide story content and that the creation team will handle that aspect. 37 | 38 | 39 | Remember to maintain control of the conversation, keep the user focused on providing storybook requirements, and gather all necessary information in a simple, clear manner. 40 | Be adaptable and responsive to the user's specific needs and requests, but do not engage in story creation or content production. 
41 | """ 42 | 43 | RECEPTIONIST_AGENT_DESCRIPTION = """This agent is responsible for collecting user requirements.""" 44 | 45 | 46 | class ReceptionistAgent(AssistantAgent): 47 | """ Rreceptionist agent""" 48 | def __init__(self, gpt_config): 49 | super().__init__( 50 | name=RECEPTIONIST_AGENT_NAME, 51 | description=RECEPTIONIST_AGENT_DESCRIPTION, 52 | system_message=RECEPTIONIST_AGENT_SYSTEM_MESSAGE, 53 | max_consecutive_auto_reply=None, 54 | human_input_mode="NEVER", 55 | llm_config=gpt_config, 56 | code_execution_config=False, 57 | ) 58 | -------------------------------------------------------------------------------- /README.zh-cn.md: -------------------------------------------------------------------------------- 1 | [English](README.md) 2 | [日本語](README.ja-jp.md) 3 | 4 | ## Agentic Story Book Workflow 5 | 基于 [AutoGen](https://microsoft.github.io/autogen/) 的一个儿童绘本制作多智能体工作流框架。 6 | 7 | https://github.com/user-attachments/assets/323d055a-27d9-487f-b8c4-2fad2df649cc 8 | 9 | ## Agentic workflow 10 | ![MultiAgent](./images/MultiAgents.jpg) 11 | 在代码中涉及到多种基于 AutoGen 的不同的多智能体协作方式。例如: 12 | - 在一开始,由 User_Proxy 代表用户和 Receptionist 来进行交流,从而采集用户的需求。 13 | - 在后继的两个环节中,均采用了 GroupChat 的机制,每个 GroupChat 中又分别设置了一个 GroupChat Manager 用来协调当前的 GroupChat 中的发言人角色。 14 | - 在两个 GroupChat 中,内容的创作角色(例如 Story Editor,Storboard Editor, Prompt Editor)均伴随着有一个进行该环节的评审的Agent。当他们的评审没有通过的话,由 GroupManger 发回内容创作 Editor 进行重新修改。 15 | - 最后的生成图片/视频/PPT 的环节,目前我将其放到了独立的代码中(generate.py),一是便于我目前的使用,二是后继对于 GroupChat 的组织可能还会有所调整。所以这部分暂时由一个 Image Creator Agent 来负责,和前面所不同的是这是一个独立的 Agent,但是它自己内部又包含了两个 Sub-Agents,一个 Image Generation Agent 负责进行文生图的 AI 的调用。另外一个负责对于生成的图片的审查。 16 | 17 | ## 系统需求 18 | - **LLM**: 建议使用 ChatGPT-4o,目前代码基于 Azure OpenAI 服务中的 ChatGPT-4o 进行测试,理论上对于 OpenAI 的原生服务也应该可以支持,最多需要对于 Config 做微调。尽管 AutoGen 支持多种 LLM,但是经过实际测试使用 Claude 3.5 sonnet 时也无法100%严格的遵循 Prompt 中的指令,所以不建议使用其他 LLM。 19 | - **Text2Image**: 支持 DALL-E 3 以及 Replicate 中的 Flux schnell。但是从成本和速度上考虑的话我最终选用了 Replicate 中的 Flux Schnell API 端点。因为 20 | - 在使用 Landscpae 或者 Portrait 模式的图片,HD 模式下DALL-E 3 的价格是 12$/100 张图,意味着每张图 0.12$,而且每张图要十多秒以上才能绘制完毕并得到结果。 21 | - 但是采用 Flux Schnell 的 API 服务每张图的成本只有 0.003$,绘图时间一般在 1 ~2 秒。从成本和时间调度来说 Flux Schnell 似乎更加合适,哪怕你觉得 Schnell 版本的质量不高,要使用 Flux Dev 版本的 API 成本也只有 0.03$而已(Replicate 上的 pro 版本成本为 0.055$,但是由于似乎它在 CPU 上,绘图速度很慢我就没有尝试),您也可以根据自己的需求来调整。 22 | - **Azure 账号**,并开通 Speech 服务资源。 23 | 24 | ## 如何使用 25 | - 创建 python 虚拟环境(我这里是在 Python 3.11 上进行的测试),并安装依赖包 26 | ``` 27 | pip install -r requirements.txt 28 | ``` 29 | - 创建.env 文件,并复制 .env.example 中的内容过来,修改为您的对应的设置值。执行下面的脚本来创作故事: 30 | ``` 31 | python app.py 32 | ``` 33 | - 生成图片/视频/PPTX:首先修改 generate.py 中的 story_id 为你想生成的故事 ID(从 app.py 的输出中得到)。然后运行: 34 | ``` 35 | python generate.py 36 | ``` 37 | 38 | ## .env 环境变量 39 | |变量名|描述 |默认值| 40 | |:-----|:----|:-----:| 41 | |AGENTOPS_API_KEY| [AgentOps](https://app.agentops.ai/) API Key| | 42 | |MODEL|Azure 上的模型部署名或者 OpenAI 上的模型名 | | 43 | |API_VERSION|API Version|'2024-06-01'| 44 | |API_TYPE|'azure' 或者 'openai'|azure| 45 | |API_KEY|API Key| | 46 | |BASE_URL|API base url, Azure 应该形如 'https://{region_name}.openai.azure.com/'|| 47 | |IMAGE_GENERATION_TYPE|'azure', 'openai' 或者 'replicate'|| 48 | |IMAGE_SHAPE|'landscape', 'portrait' 或者 'square'|landscape| 49 | |DALLE_MODEL|Azure 上的模型部署名或者 OpenAI 上的模型名 | | 50 | |DALLE_API_VERSION|API Version|'2024-06-01'| 51 | |DALLE_API_KEY|API Key| | 52 | |DALLE_BASE_URL|API base url, Azure 上应该形如 'https://{region_name}.openai.azure.com/'|| 53 | |DALLE_IMAGE_QUALITY|'hd' 或者 'standard'|'hd'| 54 | |DALLE_IMAGE_STYLE|'vivid' 或者 
'natural'|'vivid'| 55 | |REPLICATE_API_TOKEN|[repilicate](https://replicate.com/) api key| | 56 | |REPLICATE_MODEL_NAME| 'black-forest-labs/flux-schnell', 'black-forest-labs/flux-dev' or 'black-forest-labs/flux-pro'|'black-forest-labs/flux-schnell'| 57 | |IMAGE_GENERATION_RETRIES|生成每张图片时的重试次数|3| 58 | |IMAGE_CRITICISM_RETRIES|每张图片的最大审核次数|2| 59 | |IMAGE_SAVE_FAILURED_IMAGES|是否保存生成后未采用的图片:True or False|False| 60 | |AZURE_SPEECH_KEY|Azure 语音的 API Key|| 61 | |AZURE_SPEECH_REGION|Azure 服务语音部署区域|| 62 | |AZURE_SPEECH_VOICE_NAME|Azure 语音发音人|'zh-CN-XiaoxiaoMultilingualNeural'| 63 | 64 | 65 | ## 路线图 66 | - [ ]增加更多 FLUX 模型版本和渠道 67 | - [ ]完善内容生成部分的逻辑 68 | - [ ]在故事内容创作和内容生成的过程中增加“人在回路”的逻辑 69 | - [ ]背景音乐 70 | 71 | ## 常见问题 72 | - **我看到你的 Demo 的故事内容是中文,它支持其他语言么?** 73 | 支持的,在内容创作的 Prompt 部分已经有指令要求遵循用户的要求或者用户输入时采用的语言。 74 | - **语音的多语言呢?** 75 | Azure的 TTS 支持上百种语言,您只需要将.env中的AZURE_SPEECH_VOICE_NAME指定为您所需要的语言的发音人即可(有的发音人本身就支持几十种不同国家的语言) 76 | - **那为啥你的 Prompt 都用英文写的?** 77 | 毋容置疑,英文的 Prompt 的效果要比中文好一点。一个很有用的小技巧,在 Anthropic 的 Portal 里有一个帮你生成提示词的工具,你可以在那边输入你初步想法然后他帮你生成提示词,你只需要做少量的修改就可以用到你的程序里面。 78 | - **视觉质量看起来不高** 79 | 这里有两方面的因素: 80 | - 一是目前我所展示的测试内容里采用的 Flux 的 Schnell 模型,为的是速度快和成本低。采用 dev 或者 pro 必然图片的视觉质量会有提高,目前代码中还未支持这两种不同的模型,未来会加入。 81 | - 二是现有的图片评审逻辑还不够,还有改善的余地。 82 | 83 | ## 其他 84 | [部分生成的内容演示参见这里](DEMO-Results.md) 85 | -------------------------------------------------------------------------------- /README.ja-jp.md: -------------------------------------------------------------------------------- 1 | [English](README.md) 2 | [中文版](README.zh-cn.md) 3 | 4 | ## エージェンティックストーリーブックワークフロー 5 | [AutoGen](https://microsoft.github.io/autogen/) に基づく子供向け絵本作成のためのマルチエージェントワークフローフレームワーク。 6 | 7 | https://github.com/user-attachments/assets/323d055a-27d9-487f-b8c4-2fad2df649cc 8 | 9 | ## エージェンティックワークフロー 10 | ![MultiAgent](./images/MultiAgents.jpg) 11 | コードには、AutoGen に基づくさまざまなマルチエージェント コラボレーション方法が含まれています。例えば: 12 | - 最初に、User_Proxy がユーザーを表し、Receptionist と通信してユーザーの要件を収集します。 13 | - 次の 2 つの段階では、GroupChat メカニズムが使用され、各 GroupChat には現在の GroupChat のスピーカーを調整する GroupChat マネージャーがいます。 14 | - 2 つのグループチャットでは、コンテンツ作成の役割 (ストーリー エディター、ストーリーボード エディター、プロンプト エディターなど) に、コンテンツを確認するエージェントが伴います。レビューが承認されない場合、GroupManager はコンテンツ作成エディターに送信して修正を行います。 15 | - 画像/ビデオ/PPT の生成の最終段階は、現在のところ、使用の容易さと将来のグループチャットの編成の潜在的な調整のために、別のコード (generate.py) に配置されています。この部分は一時的に Image Creator Agent によって処理されます。これは独立したエージェントですが、内部には 2 つのサブエージェントが含まれています。1 つは AI ベースの画像生成を担当する画像生成エージェントで、もう 1 つは生成された画像を確認するエージェントです。 16 | 17 | ## システム要件 18 | - **LLM**: ChatGPT-4o の使用をお勧めします。現在のコードは、Azure OpenAI の ChatGPT-4o サービスに基づいてテストされています。理論的には、構成を少し調整するだけで OpenAI のネイティブ サービスもサポートするはずです。AutoGen は複数の LLM をサポートしていますが、Claude 3.5 ソネットを使用した実際のテストでは、プロンプトの指示に 100% 厳密に従うことができないことがわかったため、他の LLM はお勧めしません。 19 | - **Text2Image**: DALL-E 3 と Replicate の Flux Schnell をサポートします。コストと速度を考慮して、最終的に Replicate の Flux Schnell API エンドポイントを選択しました。なぜなら 20 | - HD モードの DALL-E 3 を使用すると、100 枚の画像あたり 12 ドルの費用がかかり、つまり 1 枚の画像あたり 0.12 ドルの費用がかかり、各画像の生成には 10 秒以上かかります。 21 | - Flux Schnell API サービスを使用すると、1 枚の画像のコストはわずか 0.003 ドルで、描画時間は 1 ~ 2 秒です。コストとスケジューリングの観点から、Flux Schnell がより適しているようです。たとえ Schnell バージョンの品質が低いと感じても、Flux Dev バージョンの API を使用するコストはわずか 0.03 ドルです (Replicate の pro バージョンのコストは 0.055 ドルですが、CPU で実行されているようで非常に遅いため、試していません)。ニーズに応じて調整できます。 22 | - スピーチ サービス リソースが有効になっている Azure アカウント。 23 | 24 | ## 使用方法 25 | - Python 仮想環境を作成し (Python 3.11 でテスト済み)、依存関係をインストールします。 26 | ``` 27 | pip install -r requirements.txt 28 | ``` 29 | - .env ファイルを作成し、.env.example の内容をコピーして、設定に合わせて変更します。ストーリーを作成する 30 | ``` 31 | python app.py 32 | ``` 33 | - 
画像/ビデオ/PPTX を生成します。まず、generate.py の story_id を生成したいストーリー ID に変更します (app.py の出力から取得します)。次に実行します。 34 | ``` 35 | python generate.py 36 | ``` 37 | 38 | ## .env 構成 39 | |環境名|説明 |デフォルト値| 40 | |:-----|:----|:-----:| 41 | |AGENTOPS_API_KEY| [AgentOps](https://app.agentops.ai/) API キー| | 42 | |MODEL|Azure のデプロイ名または OpenAI のモデル名 | | 43 | |API_VERSION|API バージョン|'2024-06-01'| 44 | |API_TYPE|'azure' または 'openai'|azure| 45 | |API_KEY|API キー| | 46 | |BASE_URL|API ベース URL、Azure の場合は 'https://{region_name}.openai.azure.com/' のようになります|| 47 | |IMAGE_GENERATION_TYPE|'azure'、'openai' または 'replicate'|| 48 | |IMAGE_SHAPE|'landscape'、'portrait' または 'square'|landscape| 49 | |DALLE_MODEL|Azure のデプロイ名または OpenAI のモデル名 | | 50 | |DALLE_API_VERSION|API バージョン|'2024-06-01'| 51 | |DALLE_API_KEY|API キー| | 52 | |DALLE_BASE_URL|API ベース URL、Azure の場合は 'https://{region_name}.openai.azure.com/' のようになります|| 53 | |DALLE_IMAGE_QUALITY|'hd' または 'standard'|'hd'| 54 | |DALLE_IMAGE_STYLE|'vivid' または 'natural'|'vivid'| 55 | |REPLICATE_API_TOKEN|[repilicate](https://replicate.com/) API キー| | 56 | |REPLICATE_MODEL_NAME| 'black-forest-labs/flux-schnell'、'black-forest-labs/flux-dev' または 'black-forest-labs/flux-pro'|'black-forest-labs/flux-schnell'| 57 | |IMAGE_GENERATION_RETRIES|画像ごとの最大再試行回数|3| 58 | |IMAGE_CRITICISM_RETRIES|画像ごとの最大批評回数|2| 59 | |IMAGE_SAVE_FAILURED_IMAGES|批評に失敗した画像を保存する:True または False|False| 60 | |AZURE_SPEECH_KEY|Azure 音声 API キー|| 61 | |AZURE_SPEECH_REGION|Azure 音声デプロイ リージョン|| 62 | |AZURE_SPEECH_VOICE_NAME|Azure 音声スピーカー名|'zh-CN-XiaoxiaoMultilingualNeural'| 63 | 64 | ## ロードマップ 65 | - [ ] さらに多くの FLUX モデルとチャネルを追加 66 | - [ ] コンテンツ生成のロジックを改善する 67 | - [ ] ストーリー コンテンツの作成と生成中に「ループ内の人間」ロジックを追加する 68 | - [ ] バックグラウンドミュージック 69 | 70 | ## FAQ 71 | - **デモのストーリーコンテンツが中国語ですが、他の言語に対応していますか?** 72 | はい、対応しています。コンテンツ作成のプロンプトセクションには、ユーザーの要件またはユーザーが使用する言語に従うように指示があります。 73 | - **多言語音声サポートはどうですか?** 74 | Azure の TTS は数百の言語をサポートしています。必要な言語の音声名を .env ファイルの AZURE_SPEECH_VOICE_NAME フィールドに指定するだけです (一部の音声は数十の異なる言語をサポートしています)。 75 | - **なぜプロンプトが英語で書かれているのですか?** 76 | 間違いなく、英語のプロンプトは中国語のプロンプトよりも効果的です。非常に便利なヒントとして、Anthropic のポータルにはプロンプトを生成するツールがあります。そこに初期のアイデアを入力すると、プロンプトを生成するのに役立ちます。少し修正するだけでプログラムで使用できます。 77 | - **視覚品質が低いようです** 78 | ここには 2 つの要因があります。 79 | - まず、現在表示されているテストコンテンツは、速度が速くコストが低い Flux の Schnell モデルを使用しています。dev または pro モデルを使用すると、画像の視覚品質が向上することは間違いありません。これらのモデルは現在のコードではサポートされていませんが、将来的には追加される予定です。 80 | - 第二に、既存の画像レビュー ロジックは不十分であり、改善の余地があります。 81 | 82 | ## その他 83 | [ここで生成されたコンテンツのデモをいくつか参照してください](DEMO-Results.md) 84 | -------------------------------------------------------------------------------- /story_book_agents/tools/video.py: -------------------------------------------------------------------------------- 1 | import os 2 | import random 3 | from typing import Annotated 4 | from moviepy.editor import ImageClip,AudioFileClip,concatenate_videoclips,CompositeVideoClip 5 | 6 | 7 | def apply_effect(clip, effect, duration, **kwargs): 8 | original_size = clip.size 9 | w, h = original_size 10 | move_distance = kwargs.get('move_distance', 0.15) 11 | scale_factor = 1 + move_distance 12 | 13 | if effect == "zoom_in": 14 | return clip.resize(lambda t: 1 + 0.15*t/duration) 15 | elif effect == "zoom_out": 16 | return clip.resize(lambda t: 1 + 0.15*(1-t/duration)) 17 | elif effect.startswith("move"): 18 | enlarged_clip = clip.resize(scale_factor) 19 | w_new, h_new = enlarged_clip.size 20 | 21 | if effect == "move_left": 22 | start_pos = (0, 'center') 23 | end_pos = (-(w_new - w), 'center') 24 | elif effect == "move_right": 25 | start_pos = (-(w_new - w), 
'center') 26 | end_pos = (0, 'center') 27 | elif effect == "move_up": 28 | start_pos = ('center', 0) 29 | end_pos = ('center', -(h_new - h)) 30 | elif effect == "move_down": 31 | start_pos = ('center', -(h_new - h)) 32 | end_pos = ('center', 0) 33 | else: 34 | raise ValueError(f"Unknown move effect: {effect}") 35 | 36 | def move_position(t): 37 | progress = t / duration 38 | x1, y1 = start_pos 39 | x2, y2 = end_pos 40 | if x1 == 'center': 41 | x = 'center' 42 | else: 43 | x = x1 + (x2 - x1) * progress 44 | if y1 == 'center': 45 | y = 'center' 46 | else: 47 | y = y1 + (y2 - y1) * progress 48 | return (x, y) 49 | 50 | moving_clip = enlarged_clip.set_position(move_position) 51 | return CompositeVideoClip([moving_clip], size=original_size).set_duration(duration) 52 | elif effect == "none": 53 | return clip 54 | else: 55 | raise ValueError(f"Unknown effect: {effect}") 56 | 57 | 58 | effects_config = [ 59 | {"effect": "none"}, 60 | {"effect": "move_left"}, 61 | {"effect": "random"}, 62 | {"effect": "zoom_in"}, 63 | # Add more configurations for additional shots... 64 | ] 65 | 66 | def create_video(story_id: Annotated[str,'story id'] ): 67 | """ 68 | Create video by story id 69 | 70 | Args: 71 | story_id (Annotated[str,'story id']): story id 72 | 73 | """ 74 | output_directory= "./output/"+story_id 75 | # 获取所有子目录 76 | subdirs = sorted([d for d in os.listdir(output_directory) if os.path.isdir(os.path.join(output_directory, d))], key=int) 77 | available_effects = ["zoom_in", "zoom_out", "move_left", "move_right", "move_up", "move_down", "none"] 78 | 79 | clips = [] 80 | original_size = None 81 | for subdir in subdirs: 82 | subdir_path = os.path.join(output_directory, subdir) 83 | image_path = os.path.join(subdir_path, 'image.jpg') 84 | audio_path = os.path.join(subdir_path, 'voice.mp3') 85 | 86 | # 创建图像clip 87 | image_clip = ImageClip(image_path) 88 | if original_size is None: 89 | original_size = image_clip.size 90 | 91 | # 获取音频时长 92 | audio = AudioFileClip(audio_path) 93 | audio_duration = audio.duration 94 | 95 | chosen_effect =random.choice(available_effects) 96 | print(f"Applying effect: {chosen_effect} to clip in {subdir}") 97 | 98 | image_clip = apply_effect(image_clip, chosen_effect, audio_duration) 99 | # 设置图像clip时长(比音频长0.5秒) 100 | image_clip = image_clip.set_duration(audio_duration + 0.5) 101 | if image_clip.size != original_size: 102 | image_clip = image_clip.resize(original_size) 103 | # 添加音频 104 | video_clip = image_clip.set_audio(audio) 105 | 106 | # 添加过渡效果(淡入淡出) 107 | video_clip = video_clip.crossfadein(0.5).crossfadeout(0.5) 108 | 109 | clips.append(video_clip) 110 | 111 | # 合并所有clips 112 | final_clip = concatenate_videoclips(clips, method="compose") 113 | 114 | # 输出最终视频 115 | final_clip.write_videofile(os.path.join(output_directory,'output.mp4') , fps=24) 116 | 117 | # 输出最终音频 118 | final_clip.audio.write_audiofile(os.path.join(output_directory,'output.mp3'),fps=44100) 119 | 120 | -------------------------------------------------------------------------------- /story_book_agents/storyboard_editor_agent.py: -------------------------------------------------------------------------------- 1 | ''' Story editor agent ''' 2 | 3 | from autogen import AssistantAgent 4 | 5 | STORYBOARD_EDITOR_AGENT_NAME = "Storyboard_Editor" 6 | STORYBOARD_EDITOR_AGENT_SYSTEM_MESSAGE = """ 7 | You are a storyboard editor for a children's picture book creation team. 8 | You have strong reading comprehension, understanding, and innovative abilities. 
9 | Your task is to create a storyboard for a children's picture book based on a given story draft. 10 | 11 | Using this story as a reference, create a storyboard for the picture book. 12 | A storyboard is a graphical tool used to describe the plot of a story, breaking down the main plot points into several frames, each containing a text description and an image description. 13 | 14 | When creating the storyboard, consider the following points: 15 | 1. Break the entire story into 10-15 storyboard scenes. Too few scenes may not reflect the main plot of the story, while too many may make the storyboard overly complex. 16 | 2. The content of each frame should reflect the main plot of the original story, ensuring that when the frame contents are strung together, they remain consistent with the original story. 17 | 3. Write the content for each frame, including the frame number, the story description from the original text, and the frame's visual content. 18 | 19 | Output your storyboard strictly in the following XML format, without any additional content: 20 | 21 | <Storyboard> 22 | <StoryboardItem> 23 | <Index>[Frame number]</Index> 24 | <StoryContent>[Frame content]</StoryContent> 25 | <ImageDescription>[Visual content]</ImageDescription> 26 | </StoryboardItem> 27 | ... 28 | </Storyboard> 29 | 30 | If you receive feedback on the Storyboard, it may be provided in XML format as follows: 31 | 32 | 33 | [Frame number] 34 | 35 | [Modification suggestions here] 36 | 37 | 38 | ... 39 | 40 | 41 | You will need to revise and improve your storyboard based on this feedback. Please remember that after incorporating the feedback, your output must also follow the aforementioned storyboard output format. 42 | 43 | 44 | 45 | Important notes: 46 | - Your output must only contain the XML format specified above. Do not include any other content. 47 | - In the <ImageDescription> section, provide detailed visual descriptions to prevent misunderstandings. For character names/roles, maintain consistency by inferring age, gender, appearance, and clothing from the story's context. If these details can't be inferred, supplement them based on your understanding of the story content. Remember, character/role names are not useful for visual descriptions. 48 | - Maintain consistency in visual traits and characteristics for each character across all frame shots in the ImageDescription. For example, if you've inferred that a character is a 15-year-old male student wearing glasses, always describe these visual traits rather than using the character's name. 49 | - Avoid using character names in the ImageDescription. Instead, use descriptive terms like "a teenage boy with glasses" or "a middle-aged woman with curly hair" to maintain visual consistency throughout the storyboard. 50 | - Based on your understanding of the story content and the current storyboard, please include visual descriptions of the environment in the ImageDescription as much as possible. 51 | - The language used in the StoryContent and ImageDescription sections within the Storyboard must be consistent with the language of the story content. 52 | 53 | Remember, your goal is to create a visually coherent and narratively faithful storyboard based on the given story. Focus on translating the written content into clear, consistent visual descriptions that could guide an illustrator in creating the picture book. 54 | """ 55 | STORYBOARD_EDITOR_AGENT_DESCRIPTION = "This agent is responsible for creating a kid's storyboard based on the story content" 56 | 57 | 58 | class StoryboardEditorAgent(AssistantAgent): 59 | """ This agent is responsible for creating a kid's storyboard based on the user's input.
""" 60 | 61 | def __init__(self, gpt_config): 62 | super().__init__( 63 | name=STORYBOARD_EDITOR_AGENT_NAME, 64 | description=STORYBOARD_EDITOR_AGENT_DESCRIPTION, 65 | system_message=STORYBOARD_EDITOR_AGENT_SYSTEM_MESSAGE, 66 | max_consecutive_auto_reply=None, 67 | human_input_mode="NEVER", 68 | llm_config=gpt_config, 69 | code_execution_config=False, 70 | ) 71 | -------------------------------------------------------------------------------- /story_book_agents/text_to_image_prompt_critic_agent.py: -------------------------------------------------------------------------------- 1 | ''' 2 | text to image prompt critic agent 3 | ''' 4 | 5 | from autogen import AssistantAgent 6 | 7 | 8 | TEXT_TO_IMAGE_PROMPT_CRITIC_AGENT_NAME = "Text_To_Image_Prompt_Critic" 9 | TEXT_TO_IMAGE_PROMPT_CRITIC_AGENT_SYSTEM_MESSAGE = """ 10 | You are a prompt critic for a children's storybook creation team. Your task is to review the text-to-image generation prompts for each storyboard frame and determine if they are usable. 11 | 12 | The text-to-image prompts based on the storyboard script will be provided to you in the following format: 13 | 14 | 15 | 16 | [Frame number] 17 | [Frame text-to-image generation prompt] 18 | 19 | ... 20 | 21 | 22 | Your job is to carefully review these prompts, focusing on the following criteria: 23 | 24 | 1. Consistency of visual descriptions for the same character in different frames, including species-specific features, size, fur/skin color, eye color, and distinguishing marks. Determine if additional visual descriptions are necessary to maintain consistency. 25 | 2. Consistency of scene descriptions across frames, including setting, time of day, and environmental details that contribute to the story's atmosphere. 26 | 3. Presence and consistency of emotional atmosphere and mood descriptions across frames. 27 | 4. Presence of composition suggestions that help create depth and visual interest in the image. 28 | 5. Presence of ambiguous references, such as non-generic character/person names that may confuse the text-to-image AI and lead to inaccurate drawings. 29 | 6. Whether the prompts are sufficiently detailed and long enough (more than 3 sentences) to generate a comprehensive image. 30 | 7. Ensure that character and scene descriptions are culturally appropriate and avoid stereotypes. 31 | 8. Check if characters' expressions, body language, and actions are described in sufficient detail and accurately reflect the emotions and interactions described in the story content. 32 | 9. Whether the prompts are in English. 33 | 10. Whether the scene descriptions are consistent with the overall story progression. 34 | 35 | If there are no modifications needed, simply output: PROMPT_CRITIC_DONE 36 | 37 | If modifications are needed, **provide your feedback for the frames that require changes** using the following XML format: 38 | 39 | 40 | 41 | [Frame number] 42 | 43 | [Your modification suggestions here] 44 | 45 | 46 | ... 
47 | 48 | 49 | Use the Priority attribute to indicate the severity of the issue: 50 | - High: Major issues that significantly impact the quality or accuracy of the generated image 51 | - Medium: Important issues that should be addressed but don't critically impact the overall image 52 | - Low: Minor suggestions or improvements 53 | 54 | Here are two examples of correct output formats: 55 | 56 | Example 1 (No modifications needed): 57 | PROMPT_CRITIC_DONE 58 | 59 | Example 2 (Modifications needed): 60 | 61 | 62 | 2 63 | 64 | The character description for the little fox lacks details about its fur color and eye color. Consider adding these details for consistency in future frames. 65 | 66 | 67 | 68 | 4 69 | 70 | The prompt lacks a clear description of the emotional atmosphere of the scene. Consider adding details about the mood and feelings evoked by the environment and character expressions. 71 | 72 | 73 | 74 | 75 | Remember to provide your feedback in English, even though the task description is in Chinese. Begin your review now. 76 | """ 77 | 78 | TEXT_TO_IMAGE_PROMPT_CRITIC_AGENT_DESCRIPTION = """"This agent reviews text-to-image generation prompts for children's picture book frames.""" 79 | 80 | class TextToImagePromptCriticAgent(AssistantAgent): 81 | """ This agent reviews text-to-image generation prompts for children's picture book frames. """ 82 | 83 | def __init__(self, gpt_config): 84 | super().__init__( 85 | name=TEXT_TO_IMAGE_PROMPT_CRITIC_AGENT_NAME, 86 | description=TEXT_TO_IMAGE_PROMPT_CRITIC_AGENT_DESCRIPTION, 87 | system_message=TEXT_TO_IMAGE_PROMPT_CRITIC_AGENT_SYSTEM_MESSAGE, 88 | llm_config=gpt_config, 89 | human_input_mode="NEVER", 90 | max_consecutive_auto_reply=None, 91 | code_execution_config=False, 92 | ) 93 | -------------------------------------------------------------------------------- /story_book_agents/image_creator_agent.py: -------------------------------------------------------------------------------- 1 | """ This module contains the ImageCreatorAgent class. """ 2 | 3 | import os 4 | from autogen import AssistantAgent, Agent 5 | 6 | from .tools.image import save_image_from_url 7 | from .image_generation_agent import ImageGenerationAgent 8 | from .image_critic_agent import ImageCriticAgent 9 | from .tools.utils import get_storyboard_by_story_id, get_prompt_by_story_id_and_frame_number 10 | 11 | IMAGE_CREATOR_AGENT_NAME = "Image_Creator" 12 | IMAGE_CREATOR_AGENT_DESCRIPTION = "This agent is responsible for generating images based on storyboard scripts for children's storybooks." 13 | 14 | 15 | class ImageCreatorAgent(AssistantAgent): 16 | """ This agent is responsible for generating images based on storyboard scripts for children's storybooks. 
""" 17 | 18 | def __init__(self, gpt_config, story_id: str, frame_number: int, *args, **kwargs): 19 | super().__init__( 20 | name=IMAGE_CREATOR_AGENT_NAME, 21 | description=IMAGE_CREATOR_AGENT_DESCRIPTION, 22 | llm_config=gpt_config, 23 | *args, 24 | **kwargs) 25 | 26 | self._story_id = story_id 27 | self._n_iters = int(os.environ.get("IMAGE_CRITICISM_RETRIES", 2)) 28 | self.register_reply( 29 | [Agent, None], reply_func=ImageCreatorAgent._reply_user, position=0) 30 | self._image_generator = None 31 | self._image_critic = None 32 | self._storyboard = get_storyboard_by_story_id(self._story_id) 33 | self._frame_number = frame_number 34 | 35 | def _generate_image(self, img_prompt): 36 | self.send(message=img_prompt, 37 | recipient=self._image_generator, request_reply=True) 38 | last_generation_result = self._image_generator.last_message() 39 | 40 | if not isinstance(last_generation_result, dict): 41 | raise TypeError("Expected last_message to be a dictionary") 42 | 43 | img_url = last_generation_result["content"][-1]["image_url"]["url"] 44 | if os.environ.get("IMAGE_GENERATION_TYPE") == "azure" or os.environ.get("IMAGE_GENERATION_TYPE") == "openai": 45 | real_prompt = last_generation_result["content"][-1]["prompt"] 46 | else: 47 | real_prompt = img_prompt 48 | 49 | print(f"Generated image: {img_url}") 50 | # if os environ IMAGE_SAVE_FAILURED_IMAGES is set to True, save the image 51 | if bool(os.environ.get("IMAGE_SAVE_FAILURED_IMAGES", "False")): 52 | save_image_from_url(story_id=self._story_id,frame_index=self._frame_number,image_url=img_url,is_final=False) 53 | return img_url, real_prompt 54 | 55 | def _reply_user(self, messages=None, sender=None, config=None): # pylint: disable=unused-argument 56 | if all((messages is None, sender is None)): 57 | error_msg = f"Either {messages=} or {sender=} must be provided." 
58 | raise AssertionError(error_msg) 59 | 60 | if messages is None: 61 | messages = self._oai_messages[sender] 62 | 63 | img_prompt = get_prompt_by_story_id_and_frame_number( 64 | self._story_id, self._frame_number) 65 | 66 | self._image_generator = ImageGenerationAgent(gpt_config=self.llm_config, 67 | max_consecutive_auto_reply=0) 68 | 69 | img_url, real_prompt = self._generate_image(img_prompt) 70 | 71 | if self._n_iters > 0: 72 | self._image_critic = ImageCriticAgent(gpt_config=self.llm_config, 73 | storyboard=self._storyboard, 74 | frame_number=self._frame_number, 75 | prompt=real_prompt, 76 | ) 77 | 78 | for _ in range(self._n_iters): 79 | self.send(message={ 80 | "content": [ 81 | { 82 | "type": "image_url", 83 | "image_url": {"url": img_url}, 84 | } 85 | ]}, 86 | recipient=self._image_critic, 87 | request_reply=True) 88 | last_critic_result = self._image_critic.last_message() 89 | if not isinstance(last_critic_result, dict): 90 | raise TypeError("Expected last_message to be a dictionary") 91 | 92 | if "CRITIC_DONE" in last_critic_result["content"]: 93 | break 94 | 95 | img_prompt = last_critic_result["content"].split("PROMPT:")[1] 96 | 97 | img_url, real_prompt = self._generate_image(img_prompt) 98 | 99 | return True, {"content": [{"type": "image_url", "image_url": {"url": img_url}, "prompt": real_prompt}]} 100 | -------------------------------------------------------------------------------- /story_book_agents/storyboard_groupchat.py: -------------------------------------------------------------------------------- 1 | ''' Storyboard group chat module''' 2 | 3 | from typing import Union 4 | from autogen import GroupChat, Agent, GroupChatManager 5 | 6 | import story_book_agents 7 | from .agent_manager import agent_manager_instance 8 | 9 | STORYBOARD_GROUP_SELECT_SPEAKER_MESSAGE_TEMPLATE = """ 10 | You are an AI assistant acting as the administrator of a storyboard group. 11 | Your task is to determine who should speak next in the storyboard creation process based on the current conversation context and the established workflow. 12 | 13 | The storyboard group follows this workflow: 14 | 1. The Producer loads the story content based on the story ID. 15 | 2. The Storyboard_Editor creates the storyboard based on the story content. 16 | 3. Once the storyboard is created, the Storyboard_Critic reviews it and provides feedback. 17 | 3.1 If the storyboard needs revisions, the Storyboard_Editor makes the necessary changes. 18 | 3.2 If no further revisions are needed (indicated by "STORYBOARD_CRITIC_DONE" in the response), the Producer saves the storyboard. 19 | 4. Once the storyboard is saved, the Text_To_Image_Prompt_Editor creates text-to-image generation prompts. 20 | 5. Then the Text_To_Image_Prompt_Critic reviews the prompts and provides feedback. 21 | 5.1 If the prompts need revisions, the Text_To_Image_Prompt_Editor makes the necessary changes. 22 | 5.2 If no further revisions are needed (indicated by "PROMPT_CRITIC_DONE" in the response), the Producer saves the prompts. 23 | 24 | Your job is to analyze the conversation context and determine which role should speak next according to this workflow. 25 | 26 | To make your determination: 27 | 1. Examine the last speaker and their contribution in the conversation context. 28 | 2. Consider the current stage of the storyboard creation process based on the workflow. 29 | 3. Identify the appropriate next step and the corresponding role that should speak. 
30 | 31 | Remember, the storyboard group is responsible for creating a coherent and engaging storyboard based on the user's input. 32 | 33 | Provide your response by simply stating the role of the next speaker. Do not include any additional explanation or content. Your response should be one of the following: 34 | - Producer 35 | - Storyboard_Editor 36 | - Storyboard_Critic 37 | - Text_To_Image_Prompt_Editor 38 | - Text_To_Image_Prompt_Critic 39 | 40 | """ 41 | 42 | 43 | def storyboard_group_speaker_selection_func( 44 | last_speaker: Agent, groupchat: GroupChat 45 | ) -> Union[Agent, str, None]: 46 | """ 47 | Determine the next speaker in the story draft group based on the conversation context and the established workflow. 48 | """ 49 | _groupchat = groupchat 50 | if last_speaker == agent_manager_instance.default_it_assistant_agent: 51 | if "PROMPTS_SAVED" == _groupchat.messages[-1]["content"]: 52 | return None 53 | else: 54 | return "auto" 55 | else: 56 | return "auto" 57 | 58 | 59 | def init_storyboard_groupchat() -> GroupChat: 60 | '''Initialize the storyboard group chat''' 61 | storyboard_groupchat = GroupChat( 62 | agents=[ 63 | agent_manager_instance.default_receptionist_agent, 64 | agent_manager_instance.default_producer_agent, 65 | agent_manager_instance.default_storyboard_editor_agent, 66 | agent_manager_instance.defualt_storyboard_critic_agent, 67 | agent_manager_instance.default_it_assistant_agent, 68 | agent_manager_instance.default_text_to_image_prompt_editor_agent, 69 | agent_manager_instance.default_text_to_image_prompt_critic_agent, 70 | ], 71 | messages=[], 72 | max_round=20, 73 | speaker_selection_method=storyboard_group_speaker_selection_func, 74 | select_speaker_message_template=STORYBOARD_GROUP_SELECT_SPEAKER_MESSAGE_TEMPLATE, 75 | select_speaker_prompt_template=None, 76 | ) 77 | return storyboard_groupchat 78 | 79 | 80 | def init_storyboard_group_manager() -> GroupChatManager: 81 | '''Initialize the storyboard group manager''' 82 | storyboard_group_manager = GroupChatManager( 83 | name="StoryboardGroup", 84 | groupchat=init_storyboard_groupchat(), 85 | llm_config=story_book_agents.gpt_config_low_temperature, 86 | human_input_mode="NEVER", 87 | code_execution_config=False, 88 | silent=False, 89 | ) 90 | return storyboard_group_manager 91 | 92 | 93 | def set_storyboard_chat(story_id: str): 94 | '''Set the storyboard chat message''' 95 | storyboard_chat = { 96 | "sender": agent_manager_instance.default_receptionist_agent, 97 | "recipient": init_storyboard_group_manager(), 98 | "message": f"""The story ID is: {story_id}. 99 | Please do the following tasks in order: 100 | - Load the story content, 101 | - Create the storyboard. 102 | - Save the storyboard. 103 | - Create the text-to-image generation prompts. 104 | - Save the prompts. 105 | """, 106 | #"max_turns": 1, 107 | "summary_method": "reflection_with_llm", 108 | } 109 | return storyboard_chat 110 | -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | [中文版](README.zh-cn.md) 2 | [日本語](README.ja-jp.md) 3 | 4 | ## Agentic Story Book Workflow 5 | A multi-agent workflow framework for creating children's picture books based on [AutoGen](https://microsoft.github.io/autogen/). 
6 | 7 | 8 | https://github.com/user-attachments/assets/323d055a-27d9-487f-b8c4-2fad2df649cc 9 | 10 | ## Agentic workflow 11 | ![MultiAgent](./images/MultiAgents.jpg) 12 | The code involves various multi-agent collaboration methods based on AutoGen. For example: 13 | - Initially, the User_Proxy represents the user and communicates with the Receptionist to gather user requirements. 14 | - In the subsequent two stages, the GroupChat mechanism is used, with each GroupChat having a GroupChat Manager to coordinate the speakers in the current GroupChat. 15 | - In the two GroupChats, the content creation roles (e.g., Story Editor, Storyboard Editor, Prompt Editor) are accompanied by an Agent responsible for reviewing the content. If the review is not approved, the GroupManager sends it back to the content creation Editor for revision. 16 | - The final stage of generating images/videos/PPTs is currently placed in separate code (generate.py) for ease of use and potential future adjustments to the GroupChat organization. This part is temporarily handled by an Image Creator Agent, which is an independent Agent but contains two Sub-Agents internally: an Image Generation Agent responsible for AI-based image generation and another for reviewing the generated images. 17 | 18 | ## System Requirements 19 | - **LLM**: It is recommended to use ChatGPT-4o. The current code is tested based on the ChatGPT-4o service in Azure OpenAI. In theory, it should also support OpenAI's native services with minor configuration adjustments. Although AutoGen supports multiple LLMs, practical tests with Claude 3.5 sonnet showed that it could not strictly follow the instructions in the Prompt 100% of the time, so other LLMs are not recommended. 20 | - **Text2Image**: Supports DALL-E 3 and Flux Schnell from Replicate. Considering cost and speed, I ultimately chose the Flux Schnell API endpoint from Replicate because: 21 | - Using DALL-E 3 in HD mode costs $12/100 images, meaning $0.12 per image, and each image takes more than ten seconds to generate. 22 | - Using the Flux Schnell API service costs only $0.003 per image, with a drawing time of 1-2 seconds. From a cost and scheduling perspective, Flux Schnell seems more suitable. Even if you find the quality of the Schnell version low, using the Flux Dev version API costs only $0.03 per image (the pro version on Replicate costs $0.055, but it seems to run on CPU and is very slow, so I didn't try it). You can adjust according to your needs. 23 | - Azure account with Speech service resources enabled. 24 | 25 | ## How to use 26 | - Create a Python virtual environment (tested on Python 3.11) and install dependencies: 27 | ``` 28 | pip install -r requirements.txt 29 | ``` 30 | - Create a .env file, copy the contents from .env.example, and modify it with your settings. Create a story 31 | ``` 32 | python app.py 33 | ``` 34 | - Generate images/videos/PPTX: First, modify the story_id in generate.py to the story ID you want to generate (obtained from the output of app.py). 
Then run: 35 | ``` 36 | python generate.py 37 | ``` 38 | 39 | 40 | ## .env configurations 41 | |Environment Name|Description |Default Value| 42 | |:-----|:----|:-----:| 43 | |AGENTOPS_API_KEY| [AgentOps](https://app.agentops.ai/) API key| | 44 | |MODEL|Deployment name on Azure or model name on OpenAI | | 45 | |API_VERSION|API version|'2024-06-01'| 46 | |API_TYPE|'azure' or 'openai'|azure| 47 | |API_KEY|API key| | 48 | |BASE_URL|API base URL; for Azure it looks like 'https://{region_name}.openai.azure.com/'|| 49 | |IMAGE_GENERATION_TYPE|'azure', 'openai' or 'replicate'|| 50 | |IMAGE_SHAPE|'landscape', 'portrait' or 'square'|landscape| 51 | |DALLE_MODEL|Deployment name on Azure or model name on OpenAI | | 52 | |DALLE_API_VERSION|API version|'2024-06-01'| 53 | |DALLE_API_KEY|API key| | 54 | |DALLE_BASE_URL|API base URL; for Azure it looks like 'https://{region_name}.openai.azure.com/'|| 55 | |DALLE_IMAGE_QUALITY|'hd' or 'standard'|'hd'| 56 | |DALLE_IMAGE_STYLE|'vivid' or 'natural'|'vivid'| 57 | |REPLICATE_API_TOKEN|[Replicate](https://replicate.com/) API key| | 58 | |REPLICATE_MODEL_NAME| 'black-forest-labs/flux-schnell', 'black-forest-labs/flux-dev' or 'black-forest-labs/flux-pro'|'black-forest-labs/flux-schnell'| 59 | |IMAGE_GENERATION_RETRIES|Max generation retries per image|3| 60 | |IMAGE_CRITICISM_RETRIES|Max criticism retries per image|2| 61 | |IMAGE_SAVE_FAILURED_IMAGES|Save images that fail criticism: True or False|False| 62 | |AZURE_SPEECH_KEY|Azure Speech API key|| 63 | |AZURE_SPEECH_REGION|Azure Speech deployment region|| 64 | |AZURE_SPEECH_VOICE_NAME|Azure Speech voice name|'zh-CN-XiaoxiaoMultilingualNeural'| 65 | 66 | 67 | 68 | ## Roadmap 69 | - [ ] Add more FLUX models and channels 70 | - [ ] Improve the logic of content generation 71 | - [ ] Add "human-in-the-loop" logic during story content creation and generation 72 | - [ ] Background music 73 | 74 | 75 | ## FAQ 76 | - **I see that the story content in your demo is in Chinese. Does it support other languages?** 77 | Yes, it does. In the prompt section for content creation, there are instructions to follow the user's requirements or the language used by the user. 78 | - **What about multilingual voice support?** 79 | Azure's TTS supports hundreds of languages. You just need to specify the desired language's voice name in the AZURE_SPEECH_VOICE_NAME field in the .env file (some voices support dozens of different languages). 80 | - **Why are your prompts written in English?** 81 | Undoubtedly, English prompts are slightly more effective than Chinese ones. A very useful tip is that there is a tool in Anthropic's Portal that helps you generate prompts. You can input your initial ideas there, and it will help you generate prompts that you only need to modify slightly before using them in your program. 82 | - **The visual quality seems low** 83 | There are two factors here: 84 | - First, the test content I currently display uses the Schnell model from Flux, which is fast and cost-effective. Using the dev or pro models will undoubtedly improve the visual quality of the images. These models are not yet supported in the current code but will be added in the future. 85 | - Second, the existing image review logic is not sufficient and has room for improvement.
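As a convenience for the usage steps above, the story_id does not have to be hand-edited: the helpers under story_book_agents/tools already expose the most recently created story. Below is a minimal, hypothetical driver sketch (it is not the repo's actual generate.py); it only looks up the last story and renders the final video, assuming each frame directory already contains its image and voice files.

```
# Hypothetical sketch - not the repo's generate.py. It looks up the most
# recently created story and renders the final video, assuming each frame
# directory under output/<story_id>/ already contains image.jpg and voice.mp3.
from story_book_agents.tools.utils import get_last_story_id
from story_book_agents.tools.video import create_video

if __name__ == "__main__":
    story_id = get_last_story_id()  # or paste a story ID printed by app.py
    print(f"Rendering video for story {story_id}")
    create_video(story_id)
```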
86 | 87 | ## Others 88 | [See some generated content demos here](DEMO-Results.md) 89 | -------------------------------------------------------------------------------- /story_book_agents/tools/image.py: -------------------------------------------------------------------------------- 1 | ''' 2 | image related tools 3 | ''' 4 | import os 5 | import uuid 6 | import replicate 7 | from typing import Annotated, Union 8 | from io import BytesIO 9 | 10 | from openai import OpenAI, AzureOpenAI 11 | 12 | import requests 13 | from PIL import Image 14 | 15 | 16 | def dalle_client_factory() -> Union[OpenAI, AzureOpenAI]: 17 | """ 18 | Dalle client factory 19 | """ 20 | if os.environ.get("IMAGE_GENERATION_TYPE") == "azure": # Azure DallE 21 | return AzureOpenAI(api_key=os.environ.get("DALLE_API_KEY"), 22 | azure_deployment=os.environ.get("DALLE_MODEL"), 23 | api_version=os.environ.get( 24 | "DALLE_API_VERSION", "2024-06-01"), 25 | azure_endpoint=os.environ.get("DALLE_BASE_URL")) 26 | else: 27 | # TODO: if env set the parameters, should also change this. 28 | return OpenAI(api_key=os.environ.get("DALLE_API_KEY")) 29 | 30 | 31 | def save_image_from_url(story_id: Annotated[str, "Story ID"], 32 | frame_index: Annotated[int, "Frame index"], 33 | image_url: Annotated[str, "Image URL"], 34 | is_final: Annotated[bool, "Is final image"] = False) -> Annotated[str, "Image filename"]: 35 | """ 36 | Save image from URL 37 | 38 | Args: 39 | story_id (Annotated[str, "Story ID"]): Story ID 40 | frame_index (Annotated[int, "Frame index"]): Frame index 41 | image_url (Annotated[str, "Image URL"]): Image URL 42 | is_final (Annotated[bool,"Is final image"]): Is final image 43 | 44 | Returns: 45 | Annotated[str, "Image filename"]: Image filename 46 | """ 47 | 48 | response = requests.get(image_url, timeout=60) 49 | image = Image.open(BytesIO(response.content)) 50 | output_dir = f"output/{story_id}/{frame_index}" 51 | os.makedirs(output_dir, exist_ok=True) 52 | image_name = "" 53 | if is_final: 54 | image_name = "image.jpg" 55 | else: 56 | image_id = str(uuid.uuid4()) 57 | image_name = f"{image_id}.jpg" 58 | 59 | image.save(f"{output_dir}/{image_name}") 60 | return image_name 61 | 62 | 63 | def generate_image_by_prompt(prompt_content: Annotated[str, "Prompt Content"]) -> Annotated[tuple[str, str], "Image URL & revised prompt"]: 64 | """ 65 | Generate image by prompt 66 | 67 | Args: 68 | prompt_content (Annotated[str, "Prompt Content"]): Prompt content 69 | 70 | Returns: 71 | Annotated[tuple[str,str], "Image URL & revised prompt"] 72 | """ 73 | max_retries = int(os.environ.get("IMAGE_GENERATION_RETRIES", 3)) 74 | for attempt in range(max_retries): 75 | try: 76 | image_shape = os.environ.get("IMAGE_SHAPE", "landscape").lower() 77 | # switch to different image generation service base the IMAGE_GENERATION_TYPE enviroment 78 | if os.environ.get("IMAGE_GENERATION_TYPE") == "azure" or os.environ.get("IMAGE_GENERATION_TYPE") == "openai": 79 | dalle_client = dalle_client_factory() 80 | # set image_size base on image_shape:landscape, portrait, square 81 | image_size = "1792x1024" 82 | if image_shape == "portrait": 83 | image_size = "1024x1792" 84 | elif image_shape == "square": 85 | image_size = "1024x1024" 86 | else: 87 | image_size = "1792x1024" 88 | # TODO: It's just a temp approach, will fix later 89 | dalle_result = dalle_client.images.generate(prompt=prompt_content + f" The scene is depicted in a {os.environ.get('IMAGE_STYLE_KEYWORD')} style.", 90 | n=1, 91 | quality=os.environ.get( 92 | "DALLE_IMAGE_QUALITY", 'hd'), 93 | 
size=image_size, 94 | style=os.environ.get( 95 | "DALLE_IMAGE_STYLE", "vivid"), 96 | response_format="url", 97 | timeout=60) 98 | 99 | if dalle_result.data is not None: 100 | image_url = dalle_result.data[0].url 101 | revised_prompt = dalle_result.data[0].revised_prompt 102 | return image_url, revised_prompt 103 | else: 104 | print(f"Attempt {attempt + 1} failed: No data in response") 105 | elif os.environ.get("IMAGE_GENERATION_TYPE") == "replicate": 106 | aspect_ratio = "16:9" 107 | if image_shape == "portrait": 108 | aspect_ratio = "9:16" 109 | elif image_shape == "square": 110 | aspect_ratio = "1:1" 111 | else: 112 | aspect_ratio = "16:9" 113 | replicate_input = { 114 | # TODO: It's just a temp approach, will fix later 115 | "prompt": prompt_content + f" The scene is depicted in a {os.environ.get('IMAGE_STYLE_KEYWORD')} style.", 116 | "aspect_ratio": aspect_ratio, 117 | "output_quality": 90 118 | } 119 | replicate_output = replicate.run( 120 | os.environ.get("REPLICATE_MODEL_NAME", 121 | "black-forest-labs/flux-schnell"), 122 | input=replicate_input 123 | ) 124 | if isinstance(replicate_output, list): 125 | return str(replicate_output[0]), prompt_content + f" The scene is depicted in a {os.environ.get('IMAGE_STYLE_KEYWORD')} style." 126 | return str(replicate_output), prompt_content + f" The scene is depicted in a {os.environ.get('IMAGE_STYLE_KEYWORD')} style." 127 | 128 | else: 129 | raise NotImplementedError( 130 | f"IMAGE_GENERATION_TYPE:{os.environ.get('IMAGE_GENERATION_TYPE')} not implemented") 131 | except Exception as e: # pylint: disable=broad-except 132 | print(f"Attempt {attempt + 1} failed: {e}") 133 | 134 | raise RuntimeError( 135 | f"Failed to generate image after {max_retries} attempts") 136 | -------------------------------------------------------------------------------- /story_book_agents/text_to_image_prompt_editor_agent.py: -------------------------------------------------------------------------------- 1 | ''' 2 | text to image prompt editor agent 3 | ''' 4 | 5 | from autogen import AssistantAgent 6 | 7 | TEXT_TO_IMAGE_PROMPT_EDITOR_AGENT_NAME = "Text_To_Image_Prompt_Editor" 8 | TEXT_TO_IMAGE_PROMPT_EDITOR_AGENT_SYSTEM_MESSAGE = """ 9 | You are a visual design member of a children's picture book creation team. 10 | Your task is to create text-to-image generation prompts for each frame of a storyboard based on the story content and visual descriptions provided by other team members. 11 | 12 | Here is the storyboard format you will be working with: 13 | 14 | 15 | [Frame number] 16 | [Frame content] 17 | [Visual content] 18 | 19 | ... 20 | 21 | 22 | For each frame in the storyboard, you will create a text-to-image generation prompt. Your output should be in the following XML format: 23 | 24 | 25 | 26 | [Frame number] 27 | [Frame text-to-image generation prompt] 28 | 29 | ... 30 | 31 | 32 | When creating these prompts, keep the following points in mind: 33 | 34 | 1. All prompts must be written in English. 35 | 2. Do not include any descriptions related to the overall visual style or art style in your prompts. The visual style will be determined in a separate process. 36 | 3. Provide detailed visual descriptions for all characters (human, animal, or mythical), including species, age characteristics, clothing, and appearance. Use specific attributes to define these details, such as fur/skin color, eye color, size, and any distinguishing marks or accessories. Ensure these descriptions remain consistent across all frames. 37 | 4. 
Maintain consistency in character descriptions across all frames. This includes race, species, color, physical features, clothing, facial features, and any other distinctive characteristics. If these details are not provided in the original story, create appropriate descriptions based on the story context and maintain them throughout all frames. 38 | 5. Repeat character details in each frame where they appear, as the image generation process cannot reference previous frames. 39 | 6. Describe visual elements in detail for each frame, considering the story context. Include character expressions, actions, and environmental descriptions. 40 | 7. For each scene, provide a clear description of the setting, including time of day, weather conditions, and any relevant environmental details. Ensure these details contribute to the story's atmosphere and remain consistent throughout the narrative. 41 | 8. Include descriptions of the emotional atmosphere and mood of each scene, using appropriate adjectives and metaphors. 42 | 9. Consider suggesting composition elements like foreground, middle ground, and background to create depth in the image. 43 | 10. Avoid potentially offensive character descriptions. If the story mentions specific religious figures, mythical beings, or public figures, focus on visual descriptions only. 44 | 11. Ensure your prompts match the provided storyboard frames exactly, without omissions or additions. 45 | 12. Avoid detailed descriptions of the presentation of certain strings in the image, such as specific slogans, dialogue box content, and so on. 46 | 13. Do not include character names in the prompts unless they are universally recognizable (e.g., Santa Claus, Easter Bunny). Instead, refer to characters by their roles or distinguishing features. 47 | 48 | For each in the storyboard: 49 | 50 | 1. Read the and carefully. 51 | 2. Consider the overall story context and how this frame fits into it. 52 | 3. Think about what visual elements would be most appealing and appropriate for a children's picture book. 53 | 4. Craft a detailed prompt that captures all necessary visual elements, character details, and the mood of the scene. 54 | 5. Ensure the prompt aligns with the guidelines provided above. 55 | 56 | Before finalizing each prompt, consider: 57 | 1. Does this prompt effectively convey the story elements of this frame? 58 | 2. Will the resulting image be engaging and appropriate for children? 59 | 3. Have I included all necessary details about characters, setting, and action? 60 | 4. Are character descriptions consistent across all frames? 61 | 5. Have I avoided mentioning non-essential character names? 62 | 6. Is the scene description consistent with the overall story progression and previous frames? 63 | 7. Have I included sufficient details about the setting, emotional atmosphere, and composition? 64 | 8. Are the characters' expressions, body language, and actions described in detail to reflect their emotions and interactions? 65 | 66 | The prompt must intricately describe every part of the image in concrete, objective detail. 67 | THINK about what the end goal of the description is, and extrapolate that to what would make satisfying images. 68 | All descriptions in prompt should be a paragraph of text that is extremely descriptive and detailed. Each should be more than 3 sentences long. 69 | 70 | Please note that your output will be reviewed by another AI for quality assurance. Strive to create high-quality, consistent prompts that meet all the criteria mentioned above. 
71 | 72 | If you receive feedback on the prompts, it may be provided in XML format as follows: 73 | 74 | 75 | [Frame number] 76 | 77 | [Modification suggestions here] 78 | 79 | 80 | ... 81 | 82 | 83 | You will need to revise and improve your prompts based on this feedback. Please remember that after incorporating the feedback, your output must also follow the aforementioned prompts output format. 84 | 85 | Once you have carefully considered and crafted each prompt, output them in the specified XML format. Ensure that you provide a prompt for each frame in the storyboard, maintaining the correct order and frame numbers. 86 | 87 | Begin processing the storyboard and creating your prompts now. 88 | """ 89 | TEXT_TO_IMAGE_PROMPT_EDITOR_AGENT_DESCRIPTION = "Designs text-to-image generation prompts" 90 | 91 | 92 | class TextToImagePromptEditorAgent(AssistantAgent): 93 | """ This agent designs text-to-image generation prompts for children's picture book frames. """ 94 | 95 | def __init__(self, gpt_config): 96 | super().__init__( 97 | name=TEXT_TO_IMAGE_PROMPT_EDITOR_AGENT_NAME, 98 | description=TEXT_TO_IMAGE_PROMPT_EDITOR_AGENT_DESCRIPTION, 99 | system_message=TEXT_TO_IMAGE_PROMPT_EDITOR_AGENT_SYSTEM_MESSAGE, 100 | llm_config=gpt_config, 101 | human_input_mode="NEVER", 102 | max_consecutive_auto_reply=None, 103 | code_execution_config=False, 104 | ) 105 | -------------------------------------------------------------------------------- /story_book_agents/tools/utils.py: -------------------------------------------------------------------------------- 1 | """ 2 | utils tools for story_book_agents 3 | """ 4 | import copy 5 | import datetime 6 | from typing import Annotated 7 | import uuid 8 | import xml.etree.ElementTree as ET 9 | from tinydb import TinyDB, Query 10 | from tinydb.storages import JSONStorage 11 | from tinydb.middlewares import CachingMiddleware 12 | 13 | 14 | class MyJSONStorage(JSONStorage): 15 | """ 16 | Custom JSON storage class that sets ensure_ascii=False and indent=4. 
17 | """ 18 | 19 | def __init__(self, *args, **kwargs): 20 | super().__init__(*args, **kwargs) 21 | self.kwargs['ensure_ascii'] = False 22 | self.kwargs['indent'] = 4 23 | 24 | 25 | def save_story_content(story_title: Annotated[str, "Story Title"], story_content: Annotated[str, "Story Content"]) -> Annotated[str, " 保存的故事的ID"]: 26 | """ 27 | Save story content 28 | 29 | Args: 30 | story_title (Annotated[str, "Story Title"]): Story title 31 | story_draft (Annotated[str, "Story Content"]): Story content 32 | 33 | Returns: 34 | Annotated[str, "Story ID"]: Story ID 35 | 36 | """ 37 | story_id = str(uuid.uuid4()) 38 | db = TinyDB('output/stories.json', 39 | storage=CachingMiddleware(MyJSONStorage)) 40 | 41 | # story_table = db.table('stories') 42 | story = {'story_id': story_id, 43 | 'story_title': story_title, 44 | 'story_content': story_content, 45 | 'created_at': datetime.datetime.now().timestamp()} 46 | db.insert(story) 47 | # story_table.insert(story) 48 | db.close() 49 | return story_id 50 | 51 | 52 | def load_story_content_by_id(story_id: Annotated[str, "Story ID"]) -> Annotated[str, "Story Content"]: 53 | """ 54 | Load story content by ID 55 | 56 | Args: 57 | story_id (Annotated[str, "Story ID"]):Story ID 58 | 59 | Returns: 60 | Annotated[str, "Story Content"]: Story content 61 | 62 | """ 63 | db = TinyDB('output/stories.json', 64 | storage=CachingMiddleware(MyJSONStorage)) 65 | stories = Query() 66 | story = db.search(stories.story_id == story_id)[0] 67 | db.close() 68 | return story['story_content'] 69 | 70 | 71 | def save_storyboard_by_story_id(story_id: Annotated[str, "Story ID"], storyboard_content: Annotated[str, "Storyboard Content"]) -> Annotated[str, "Result"]: 72 | """ 73 | Save storyboard by story ID 74 | 75 | Args: 76 | story_id (Annotated[str, "Story ID"]): Story ID 77 | storyboard_content (Annotated[str, "Storyboard Content"]): Storyboard content 78 | 79 | Returns: 80 | Annotated[str, "Result"]: Result 81 | 82 | """ 83 | root = ET.fromstring(storyboard_content) 84 | storyboard_items = [{child.tag: child.text for child in item} 85 | for item in root.findall('StoryboardItem')] 86 | 87 | db = TinyDB('output/storyboards.json', 88 | storage=CachingMiddleware(MyJSONStorage)) 89 | storyboard = {'story_id': story_id, 90 | 'storyboard_content': storyboard_items, 91 | 'created_at': datetime.datetime.now().timestamp()} 92 | db.insert(storyboard) 93 | db.close() 94 | return "STORYBOARD_SAVED" 95 | 96 | 97 | def get_storyboard_by_story_id(story_id: Annotated[str, "Story ID"]) -> Annotated[list[dict], "Storyboard Content"]: 98 | """ 99 | Load storyboard by story ID 100 | 101 | Args: 102 | story_id (Annotated[str, "Story ID"]): Story ID 103 | 104 | Returns: 105 | Annotated[list[dict], "Storyboard Content"]: Storyboard content 106 | 107 | """ 108 | db = TinyDB('output/storyboards.json', 109 | storage=CachingMiddleware(MyJSONStorage)) 110 | storyboards = Query() 111 | storyboard_content = db.search(storyboards.story_id == story_id)[0] 112 | db.close() 113 | return storyboard_content['storyboard_content'] 114 | 115 | 116 | def save_prompts_by_story_id(story_id: Annotated[str, "Story ID"], prompts_content: Annotated[str, "Prompts Content"]) -> Annotated[str, "Result"]: 117 | """ 118 | Save prompts by story ID 119 | 120 | Args: 121 | story_id (Annotated[str, "Story ID"]): Story ID 122 | prompts_content (Annotated[str, "Prompts Content"]): Prompts content 123 | 124 | Returns: 125 | Annotated[str, "Result"]: Result 126 | 127 | """ 128 | root = ET.fromstring(prompts_content) 129 | prompts_items = 
[{child.tag: child.text for child in item} 130 | for item in root.findall('StoryboardItem')] 131 | 132 | db = TinyDB('output/prompts.json', 133 | storage=CachingMiddleware(MyJSONStorage)) 134 | prompts = {'story_id': story_id, 135 | 'prompts_content': prompts_items, 136 | 'created_at': datetime.datetime.now().timestamp()} 137 | db.insert(prompts) 138 | db.close() 139 | return "PROMPTS_SAVED" 140 | 141 | 142 | def get_prompts_by_story_id(story_id: Annotated[str, "Story ID"]) -> Annotated[list[dict], "Prompts Content"]: 143 | """ 144 | Load prompts by story ID 145 | 146 | Args: 147 | story_id (Annotated[str, "Story ID"]): Story ID 148 | 149 | Returns: 150 | Annotated[list[dict], "Prompts Content"]: Prompts content 151 | 152 | """ 153 | db = TinyDB('output/prompts.json', 154 | storage=CachingMiddleware(MyJSONStorage)) 155 | prompts = Query() 156 | prompts_content = db.search(prompts.story_id == story_id)[0] 157 | db.close() 158 | return prompts_content['prompts_content'] 159 | 160 | 161 | def get_prompt_by_story_id_and_frame_number(story_id: Annotated[str, "Story ID"], frame_number: Annotated[int, "Frame number"]) -> Annotated[str, "Prompt Content"]: 162 | """ 163 | Get prompt by story ID and frame number 164 | 165 | Args: 166 | story_id (Annotated[str, "Story ID"]): Story ID 167 | frame_number (Annotated[int, "Frame number"]): Frame number 168 | 169 | Returns: 170 | Annotated[str, "Prompt Content"]: Prompt content 171 | 172 | """ 173 | prompts_content = get_prompts_by_story_id(story_id) 174 | for frame in prompts_content: 175 | if frame["Index"] == str(frame_number): 176 | return frame["Prompt"] 177 | raise ValueError( 178 | f"Frame number {frame_number} not found in prompts for story ID {story_id}") 179 | 180 | 181 | def update_prompt_by_story_id_and_frame_number(story_id: Annotated[str, "Story ID"], 182 | frame_number: Annotated[int, "Frame number"], 183 | prompt: Annotated[str, "Prompt Content"]): 184 | """ 185 | Update prompt by story ID and frame number 186 | 187 | Args: 188 | story_id (Annotated[str, "Story ID"]): Story ID 189 | frame_number (Annotated[int, "Frame number"]): Frame number 190 | prompt (Annotated[str, "Prompt Content"]): Prompt content 191 | 192 | Returns: 193 | Annotated[bool, "Result"]: Result 194 | 195 | """ 196 | db = TinyDB('output/prompts.json', 197 | storage=CachingMiddleware(MyJSONStorage)) 198 | prompts = Query() 199 | story_prompts = copy.deepcopy(db.search(prompts.story_id == story_id)[0]) 200 | 201 | for frame in story_prompts['prompts_content']: 202 | if frame["Index"] == str(frame_number): 203 | frame["Prompt"] = prompt 204 | break 205 | 206 | db.update(story_prompts, prompts.story_id == story_id) 207 | db.close() 208 | 209 | def get_last_story_id() -> Annotated[str, "Story ID"]: 210 | """ 211 | Get last story ID 212 | 213 | Returns: 214 | Annotated[str, "Story ID"]: Story ID 215 | 216 | """ 217 | db = TinyDB('output/stories.json', 218 | storage=CachingMiddleware(MyJSONStorage)) 219 | stories = db.all() 220 | db.close() 221 | return stories[-1]['story_id'] -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | # Created by https://www.toptal.com/developers/gitignore/api/visualstudiocode,pycharm,python 2 | # Edit at https://www.toptal.com/developers/gitignore?templates=visualstudiocode,pycharm,python 3 | 4 | ### PyCharm ### 5 | # Covers JetBrains IDEs: IntelliJ, RubyMine, PhpStorm, AppCode, PyCharm, CLion, Android Studio, WebStorm and 
Rider 6 | # Reference: https://intellij-support.jetbrains.com/hc/en-us/articles/206544839 7 | 8 | # User-specific stuff 9 | .idea/**/workspace.xml 10 | .idea/**/tasks.xml 11 | .idea/**/usage.statistics.xml 12 | .idea/**/dictionaries 13 | .idea/**/shelf 14 | 15 | # AWS User-specific 16 | .idea/**/aws.xml 17 | 18 | # Generated files 19 | .idea/**/contentModel.xml 20 | 21 | # Sensitive or high-churn files 22 | .idea/**/dataSources/ 23 | .idea/**/dataSources.ids 24 | .idea/**/dataSources.local.xml 25 | .idea/**/sqlDataSources.xml 26 | .idea/**/dynamic.xml 27 | .idea/**/uiDesigner.xml 28 | .idea/**/dbnavigator.xml 29 | 30 | # Gradle 31 | .idea/**/gradle.xml 32 | .idea/**/libraries 33 | 34 | # Gradle and Maven with auto-import 35 | # When using Gradle or Maven with auto-import, you should exclude module files, 36 | # since they will be recreated, and may cause churn. Uncomment if using 37 | # auto-import. 38 | # .idea/artifacts 39 | # .idea/compiler.xml 40 | # .idea/jarRepositories.xml 41 | # .idea/modules.xml 42 | # .idea/*.iml 43 | # .idea/modules 44 | # *.iml 45 | # *.ipr 46 | 47 | # CMake 48 | cmake-build-*/ 49 | 50 | # Mongo Explorer plugin 51 | .idea/**/mongoSettings.xml 52 | 53 | # File-based project format 54 | *.iws 55 | 56 | # IntelliJ 57 | out/ 58 | 59 | # mpeltonen/sbt-idea plugin 60 | .idea_modules/ 61 | 62 | # JIRA plugin 63 | atlassian-ide-plugin.xml 64 | 65 | # Cursive Clojure plugin 66 | .idea/replstate.xml 67 | 68 | # SonarLint plugin 69 | .idea/sonarlint/ 70 | 71 | # Crashlytics plugin (for Android Studio and IntelliJ) 72 | com_crashlytics_export_strings.xml 73 | crashlytics.properties 74 | crashlytics-build.properties 75 | fabric.properties 76 | 77 | # Editor-based Rest Client 78 | .idea/httpRequests 79 | 80 | # Android studio 3.1+ serialized cache file 81 | .idea/caches/build_file_checksums.ser 82 | 83 | ### PyCharm Patch ### 84 | # Comment Reason: https://github.com/joeblau/gitignore.io/issues/186#issuecomment-215987721 85 | 86 | # *.iml 87 | # modules.xml 88 | # .idea/misc.xml 89 | # *.ipr 90 | 91 | # Sonarlint plugin 92 | # https://plugins.jetbrains.com/plugin/7973-sonarlint 93 | .idea/**/sonarlint/ 94 | 95 | # SonarQube Plugin 96 | # https://plugins.jetbrains.com/plugin/7238-sonarqube-community-plugin 97 | .idea/**/sonarIssues.xml 98 | 99 | # Markdown Navigator plugin 100 | # https://plugins.jetbrains.com/plugin/7896-markdown-navigator-enhanced 101 | .idea/**/markdown-navigator.xml 102 | .idea/**/markdown-navigator-enh.xml 103 | .idea/**/markdown-navigator/ 104 | 105 | # Cache file creation bug 106 | # See https://youtrack.jetbrains.com/issue/JBR-2257 107 | .idea/$CACHE_FILE$ 108 | 109 | # CodeStream plugin 110 | # https://plugins.jetbrains.com/plugin/12206-codestream 111 | .idea/codestream.xml 112 | 113 | # Azure Toolkit for IntelliJ plugin 114 | # https://plugins.jetbrains.com/plugin/8053-azure-toolkit-for-intellij 115 | .idea/**/azureSettings.xml 116 | 117 | ### Python ### 118 | # Byte-compiled / optimized / DLL files 119 | __pycache__/ 120 | *.py[cod] 121 | *$py.class 122 | 123 | # C extensions 124 | *.so 125 | 126 | # Distribution / packaging 127 | .Python 128 | build/ 129 | develop-eggs/ 130 | dist/ 131 | downloads/ 132 | eggs/ 133 | .eggs/ 134 | lib/ 135 | lib64/ 136 | parts/ 137 | sdist/ 138 | var/ 139 | wheels/ 140 | share/python-wheels/ 141 | *.egg-info/ 142 | .installed.cfg 143 | *.egg 144 | MANIFEST 145 | 146 | # PyInstaller 147 | # Usually these files are written by a python script from a template 148 | # before PyInstaller builds the exe, so as to inject 
date/other infos into it. 149 | *.manifest 150 | *.spec 151 | 152 | # Installer logs 153 | pip-log.txt 154 | pip-delete-this-directory.txt 155 | 156 | # Unit test / coverage reports 157 | htmlcov/ 158 | .tox/ 159 | .nox/ 160 | .coverage 161 | .coverage.* 162 | .cache 163 | nosetests.xml 164 | coverage.xml 165 | *.cover 166 | *.py,cover 167 | .hypothesis/ 168 | .pytest_cache/ 169 | cover/ 170 | 171 | # Translations 172 | *.mo 173 | *.pot 174 | 175 | # Django stuff: 176 | *.log 177 | local_settings.py 178 | db.sqlite3 179 | db.sqlite3-journal 180 | 181 | # Flask stuff: 182 | instance/ 183 | .webassets-cache 184 | 185 | # Scrapy stuff: 186 | .scrapy 187 | 188 | # Sphinx documentation 189 | docs/_build/ 190 | 191 | # PyBuilder 192 | .pybuilder/ 193 | target/ 194 | 195 | # Jupyter Notebook 196 | .ipynb_checkpoints 197 | 198 | # IPython 199 | profile_default/ 200 | ipython_config.py 201 | 202 | # pyenv 203 | # For a library or package, you might want to ignore these files since the code is 204 | # intended to run in multiple environments; otherwise, check them in: 205 | # .python-version 206 | 207 | # pipenv 208 | # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. 209 | # However, in case of collaboration, if having platform-specific dependencies or dependencies 210 | # having no cross-platform support, pipenv may install dependencies that don't work, or not 211 | # install all needed dependencies. 212 | #Pipfile.lock 213 | 214 | # poetry 215 | # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control. 216 | # This is especially recommended for binary packages to ensure reproducibility, and is more 217 | # commonly ignored for libraries. 218 | # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control 219 | #poetry.lock 220 | 221 | # pdm 222 | # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control. 223 | #pdm.lock 224 | # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it 225 | # in version control. 226 | # https://pdm.fming.dev/#use-with-ide 227 | .pdm.toml 228 | 229 | # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm 230 | __pypackages__/ 231 | 232 | # Celery stuff 233 | celerybeat-schedule 234 | celerybeat.pid 235 | 236 | # SageMath parsed files 237 | *.sage.py 238 | 239 | # Environments 240 | .env 241 | .venv 242 | env/ 243 | venv/ 244 | ENV/ 245 | env.bak/ 246 | venv.bak/ 247 | 248 | # Spyder project settings 249 | .spyderproject 250 | .spyproject 251 | 252 | # Rope project settings 253 | .ropeproject 254 | 255 | # mkdocs documentation 256 | /site 257 | 258 | # mypy 259 | .mypy_cache/ 260 | .dmypy.json 261 | dmypy.json 262 | 263 | # Pyre type checker 264 | .pyre/ 265 | 266 | # pytype static type analyzer 267 | .pytype/ 268 | 269 | # Cython debug symbols 270 | cython_debug/ 271 | 272 | # PyCharm 273 | # JetBrains specific template is maintained in a separate JetBrains.gitignore that can 274 | # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore 275 | # and can be added to the global gitignore or merged into this file. For a more nuclear 276 | # option (not recommended) you can uncomment the following to ignore the entire idea folder. 
277 | #.idea/ 278 | 279 | ### Python Patch ### 280 | # Poetry local configuration file - https://python-poetry.org/docs/configuration/#local-configuration 281 | poetry.toml 282 | 283 | # ruff 284 | .ruff_cache/ 285 | 286 | # LSP config files 287 | pyrightconfig.json 288 | 289 | ### VisualStudioCode ### 290 | .vscode/* 291 | !.vscode/settings.json 292 | !.vscode/tasks.json 293 | !.vscode/launch.json 294 | !.vscode/extensions.json 295 | !.vscode/*.code-snippets 296 | 297 | # Local History for Visual Studio Code 298 | .history/ 299 | 300 | # Built Visual Studio Code Extensions 301 | *.vsix 302 | 303 | ### VisualStudioCode Patch ### 304 | # Ignore all local history of files 305 | .history 306 | .ionide 307 | 308 | # End of https://www.toptal.com/developers/gitignore/api/visualstudiocode,pycharm,python 309 | .idea 310 | output/* 311 | autogen_logs 312 | .env 313 | AOAI_CONFIG_LIST.json 314 | 315 | !output/placeholder.txt 316 | .DS_Store --------------------------------------------------------------------------------