├── .env.local.template ├── .github └── workflows │ └── publish.yaml ├── .gitignore ├── LICENSE ├── Makefile ├── README.md ├── examples ├── openai_compatible_server │ ├── README.md │ ├── moa_config.yaml │ ├── moa_config_chat.yaml │ └── server.py ├── simple_example.py └── streamlit_chat │ ├── README.md │ ├── assets │ ├── chat_screenshot.png │ └── config_screenshot.png │ ├── streamlit_chat.py │ └── streamlit_chat_local_server.py ├── moa_llm ├── __init__.py ├── aggregation_layer.py ├── config_loader.py ├── layer.py ├── mixture_of_agents.py ├── neuron.py ├── user_query_annotator.py └── version.py └── pyproject.toml /.env.local.template: -------------------------------------------------------------------------------- 1 | PROVIDER_API_KEY= 2 | -------------------------------------------------------------------------------- /.github/workflows/publish.yaml: -------------------------------------------------------------------------------- 1 | name: Publish Python Package 2 | 3 | on: 4 | release: 5 | types: [created] 6 | 7 | jobs: 8 | deploy: 9 | runs-on: ubuntu-latest 10 | steps: 11 | - uses: actions/checkout@v3 12 | - name: Set up Python 13 | uses: actions/setup-python@v4 14 | with: 15 | python-version: "3.x" 16 | - name: Install dependencies 17 | run: | 18 | python -m pip install --upgrade pip 19 | pip install build twine 20 | - name: Build and publish 21 | env: 22 | TWINE_USERNAME: __token__ 23 | TWINE_PASSWORD: ${{ secrets.PYPI_API_TOKEN }} 24 | run: | 25 | python -m build 26 | twine upload dist/* 27 | -------------------------------------------------------------------------------- /.gitignore: -------------------------------------------------------------------------------- 1 | .venv 2 | pyrightconfig.json 3 | __pycache__ 4 | *.egg-info 5 | 6 | .env.local 7 | 8 | examples/openai_compatible_server/outputs 9 | outputs/ -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | MIT License 2 | 3 | Copyright (c) 2024 Catena Labs 4 | 5 | Permission is hereby granted, free of charge, to any person obtaining a copy 6 | of this software and associated documentation files (the "Software"), to deal 7 | in the Software without restriction, including without limitation the rights 8 | to use, copy, modify, merge, publish, distribute, sublicense, and/or sell 9 | copies of the Software, and to permit persons to whom the Software is 10 | furnished to do so, subject to the following conditions: 11 | 12 | The above copyright notice and this permission notice shall be included in all 13 | copies or substantial portions of the Software. 14 | 15 | THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR 16 | IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, 17 | FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE 18 | AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER 19 | LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, 20 | OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE 21 | SOFTWARE. 22 | -------------------------------------------------------------------------------- /Makefile: -------------------------------------------------------------------------------- 1 | .PHONY : run-checks 2 | run-checks : 3 | isort --check . 4 | black --check . 5 | ruff check . 
6 |
7 | .PHONY : build
8 | build :
9 | 	rm -rf *.egg-info/
10 | 	python -m build
11 |
--------------------------------------------------------------------------------
/README.md:
--------------------------------------------------------------------------------
1 | # moa-llm: Mixture of Agents for LLMs
2 |
3 | ## Overview
4 |
5 | moa-llm is a Python library that orchestrates Large Language Models (LLMs) in a neural network-inspired structure. This approach enables sophisticated, multi-step processing of queries by leveraging various LLMs as "neurons" within the network. moa-llm is designed to harness the unique strengths of different models, combining their outputs to produce more comprehensive, accurate, and nuanced results than any single model could achieve alone.
6 |
7 | By emulating the layered structure of neural networks, moa-llm allows for complex information processing workflows. It can handle tasks that require diverse knowledge domains, multiple perspectives, or step-by-step reasoning. This makes it particularly suitable for applications such as advanced question-answering systems, multi-faceted analysis, creative content generation, and complex problem solving.
8 |
9 | moa-llm is inspired by the work done by Together AI in their research blog [Together MoA — collective intelligence of open-source models pushing the frontier of LLM capabilities](https://www.together.ai/blog/together-moa). The Mixture of Agents (MoA) approach adopts a layered architecture where each layer comprises several LLM agents. These agents take the outputs from the previous layer as auxiliary information to generate refined responses, effectively integrating diverse capabilities and insights from various models.
10 |
11 | For more technical details, refer to the arXiv paper: [Mixture-of-Agents Enhances Large Language Model Capabilities](https://arxiv.org/abs/2406.04692).
12 |
13 | ## Key Features
14 |
15 | - **Flexible Multi-Layer Architecture**: Supports an arbitrary number of layers, including multiple proposal layers and a final aggregation layer, allowing for deep and complex query processing pipelines.
16 | - **Weighted Model Inputs**: Each LLM "neuron" can be assigned a customizable weight, enabling fine-tuned control over the influence of different models on the final output. This allows more reliable or task-appropriate models to be prioritized.
17 | - **Intelligent Query Annotation**: An optional pre-processing step that can reformulate, expand, or contextualize user queries to optimize them for the subsequent layers. This can significantly improve the relevance and quality of the final output.
18 | - **Asynchronous Processing**: Uses asynchronous calls for improved performance and concurrency, allowing multiple LLMs to process information simultaneously and reducing overall response times.
19 | - **Broad Model Support**: Compatible with a wide range of LLM providers and models, offering flexibility in choosing the most suitable models for specific tasks or domains.
20 | - **Customizable Prompts**: System prompts for individual neurons and aggregation prompts can be tailored for specific use cases, allowing for task-specific optimization and consistent output formatting.
21 | - **Dynamic Response Handling**: Features like shuffling and dropout in the aggregation layer introduce controlled randomness, potentially improving output diversity and robustness.
22 | - **Detailed Performance Metrics**: Provides comprehensive timing information for each step of the process, enabling performance analysis and optimization of the model architecture.
23 |
24 | ## System Architecture
25 |
26 | moa-llm's architecture consists of several key components that work together to process queries:
27 |
28 | 1. **UserQueryAnnotator**: An optional component that pre-processes and optimizes user queries. It can expand, reformulate, or add context to the original query to improve the performance of subsequent layers.
29 | 2. **Neuron**: An abstract base class representing the fundamental processing units in the network. It defines the interface for all types of neurons in the system.
30 | 3. **LLMNeuron**: A concrete implementation of a Neuron that encapsulates an LLM. It handles the interaction with the LLM API, including sending prompts and receiving responses.
31 | 4. **Layer**: A collection of Neurons that process input in parallel. Layers can be composed of multiple LLMNeurons, potentially using different models or configurations.
32 | 5. **AggregationLayer**: A specialized Layer that combines outputs from previous layers. It supports advanced features like response shuffling and dropout to introduce controlled variability in the aggregation process.
33 | 6. **MixtureOfAgents**: The main orchestrator class that manages the entire query processing pipeline. It coordinates the flow of information through the UserQueryAnnotator, multiple Layer instances, and the final AggregationLayer.
34 | 7. **ConfigLoader**: A utility for loading and parsing configuration files, enabling easy setup and customization of the moa-llm architecture without code changes.
35 |
36 | ## Installation
37 |
38 | To install from a local clone of the repository:
39 | ```bash
40 | git clone https://github.com/catena-labs/moa-llm.git
41 | pip install -e .
42 | ```
43 |
44 | ## Setup
45 |
46 | 1. Create a `.env.local` file in your project root with your API key and an OpenAI-compatible `base_url`. If you do not specify a `base_url`, Together AI will be used. For example:
47 |
48 | ```
49 | PROVIDER_API_KEY=your_api_key_here
50 | BASE_URL=https://api.together.xyz/v1
51 | ```
52 |
53 | Replace `your_api_key_here` with your provider's API key (for example, your Together AI key).
54 |
55 | 2. Ensure `.env.local` is added to your `.gitignore` file to prevent accidentally committing sensitive information.
56 |
57 | ## Usage
58 |
59 | Here's a basic example of how to set up and run moa-llm using a configuration file, demonstrating both single-query and messages-array inputs:
60 |
61 | ```python
62 | import asyncio
63 | import json
64 |
65 | from moa_llm import create_moa_from_config
66 |
67 | async def run_moa():
68 |     # Create MixtureOfAgents from a config file
69 |     moa = create_moa_from_config('moa_config.yaml', is_file_path=True)
70 |
71 |     # Process input
72 |     user_query = "Explain the concept of quantum entanglement and its potential applications in quantum computing."
73 |     result = await moa.process(user_query)
74 |
75 |     # Print results
76 |     print(result['content'])
77 |     print(json.dumps(result['response_times'], indent=2))
78 |     print(f"Total Completion Time: {result['total_completion_time']:.2f} seconds")
79 |
80 |     # Example with a messages array (multi-turn conversation)
81 |     messages = [
82 |         {"role": "user", "content": "What is the current state of quantum computing?"},
83 |         {"role": "assistant", "content": "Quantum computers are still experimental, with small, noisy devices available today."},
84 |         {"role": "user", "content": "How does quantum computing differ from classical computing?"}
85 |     ]
86 |     result = await moa.process(messages)
87 |     print(result['content'])
88 |     print(json.dumps(result['response_times'], indent=2))
89 |     print(f"Total Completion Time: {result['total_completion_time']:.2f} seconds")
90 |
91 | # Run moa-llm
92 | asyncio.run(run_moa())
93 | ```
94 |
95 | This example uses a configuration file (`moa_config.yaml`) to set up the moa-llm architecture. The configuration file allows for easy customization of the model structure, including the number and types of neurons, layer configurations, and aggregation settings.
96 |
97 | ## YAML Configuration
98 |
99 | moa-llm uses YAML configuration files for easy setup and customization of the model architecture. This allows users to define complex multi-layer structures without modifying the code. Here's a detailed breakdown of the configuration options:
100 |
101 | ### Basic Structure
102 |
103 | The configuration file consists of several main sections:
104 |
105 | 1. `use_annotator`: A boolean flag to enable or disable the query annotator.
106 | 2. `annotator`: Settings for the query annotator.
107 | 3. `proposal_layers`: An array of one or more proposal layers.
108 | 4. `aggregation_layer`: Settings for the final aggregation layer.
109 |
110 | ### Annotator Configuration
111 |
112 | The annotator is an optional component that can preprocess and optimize user queries:
113 |
114 | ```yaml
115 | use_annotator: true
116 | annotator:
117 |   model: "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
118 |   temperature: 0.7
119 | ```
120 |
121 | - `model`: Specifies the LLM to use for annotation.
122 | - `temperature`: Controls the randomness of the model's output (0.0 to 1.0).
123 |
124 | ### Proposal Layers
125 |
126 | You can define multiple proposal layers, each containing one or more neurons:
127 |
128 | ```yaml
129 | proposal_layers:
130 |   - neurons:
131 |       - model: "meta-llama/Meta-Llama-3.1-8B-Instruct-Turbo"
132 |         system_prompt: "You are a helpful assistant. Provide a concise response."
133 |         temperature: 0.7
134 |         weight: 8
135 |       - model: "mistralai/Mixtral-8x22B-Instruct-v0.1"
136 |         system_prompt: "You are a helpful assistant. Provide a concise response."
137 |         temperature: 0.7
138 |         weight: 7
139 |   - neurons:
140 |       - model: "Qwen/Qwen2-72B-Instruct"
141 |         system_prompt: "You are an expert in the field. Provide a detailed analysis."
142 |         temperature: 0.5
143 |         weight: 9
144 | ```
145 |
146 | For each neuron:
147 | - `model`: Specifies the LLM to use.
148 | - `system_prompt`: The system prompt for the model.
149 | - `temperature`: Controls output randomness.
150 | - `weight`: Determines the neuron's influence on the final result.
151 |
152 | ### Aggregation Layer
153 |
154 | The aggregation layer combines outputs from the proposal layers:
155 |
156 | ```yaml
157 | aggregation_layer:
158 |   model: "meta-llama/Meta-Llama-3.1-70B-Instruct-Turbo"
159 |   temperature: 0.7
160 |   max_tokens: 2048
161 |   system_prompt: |
162 |     You are an advanced AI aggregator tasked with synthesizing multiple responses into a single, high-quality answer.
163 |     USER QUERY: {user_query}
164 |     RESPONSES:
165 |     {responses}
166 |   shuffle: true
167 |   dropout_rate: 0.2
168 |   use_weights: false
169 | ```
170 |
171 | - `model`: The LLM used for aggregation.
172 | - `temperature`: Controls randomness of the aggregator's output.
173 | - `max_tokens`: Maximum number of tokens the aggregator may generate.
174 | - `use_weights`: When true, each response passed to the aggregator is annotated with its neuron's weight.
175 | - `system_prompt`: The aggregation prompt template used to combine responses. It can include the placeholders `{user_query}` and `{responses}`.
176 | - `shuffle`: When true, randomizes the order of input responses.
177 | - `dropout_rate`: Probability (0.0 to 1.0) of dropping each input response.
178 |
179 | ### Advanced Features
180 |
181 | 1. **Multiple Proposal Layers**: You can define any number of proposal layers, allowing for complex, multi-step processing pipelines.
182 |
183 | 2. **Weighted Responses**: By assigning weights to neurons, you can control their influence on the final output.
184 |
185 | 3. **Customizable Prompts**: Each neuron and the aggregator can have tailored prompts, allowing for specialized roles within the network.
186 |
187 | 4. **Randomization**: The `shuffle` and `dropout_rate` options in the aggregation layer introduce controlled randomness, potentially improving output diversity and robustness.
188 |
189 | ### Example Usage
190 |
191 | To use a YAML configuration file with moa-llm:
192 | ```python
193 | from moa_llm import create_moa_from_config
194 | moa = create_moa_from_config('moa_config.yaml', is_file_path=True)
195 | result = await moa.process("Your query here")
196 | ```
197 |
198 | This flexibility allows users to experiment with different architectures, model combinations, and processing strategies without changing the underlying code.
199 |
200 | ## Performance Considerations
201 |
202 | - moa-llm uses asynchronous processing to improve performance, especially when dealing with multiple LLMs.
203 | - Response times for each neuron and layer are logged, allowing for performance analysis and optimization.
204 | - The total completion time for each query is calculated and reported.
205 | - Shuffling and dropout in the aggregation layer can be used to introduce randomness and potentially improve the diversity of outputs.
206 |
207 | ## Extending the System
208 |
209 | moa-llm is designed to be flexible and extensible. Some possible enhancements include:
210 |
211 | - Adding support for more LLM providers and models.
212 | - Implementing dynamic weight adjustment based on performance.
213 | - Creating specialized layers for specific tasks (e.g., fact-checking, creativity enhancement).
214 | - Integrating with other AI systems or data sources for enhanced capabilities.
215 |
216 | ## Limitations and Considerations
217 |
218 | - The system's performance and output quality depend on the chosen LLMs and their respective capabilities.
219 | - Proper prompt engineering is crucial for achieving optimal results.
220 | - API costs may be a consideration when using multiple commercial LLM services.
221 |
222 | ## Contributing
223 |
224 | Contributions to moa-llm are welcome! Please submit pull requests or open issues on the project's GitHub repository.
225 |
226 | ## License
227 |
228 | MIT
--------------------------------------------------------------------------------
/examples/openai_compatible_server/README.md:
--------------------------------------------------------------------------------
1 | # OpenAI-Compatible Server with MoA
2 |
3 | This directory contains an example of how to use the MoA (Mixture of Agents) framework to create a local server that provides an OpenAI-compatible API for chat completions. The server is built using FastAPI and can be configured to use different models and settings for the proposal and aggregation layers of the MoA.
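Once the server is running (see the sections below), it exposes a `/chat/completions` endpoint that any OpenAI-style client can call. A minimal client sketch using `requests`, assuming the default host and port from `server.py` (`http://0.0.0.0:8000`):

```python
import requests

# Chat-completion request against the local MoA server; the payload mirrors the OpenAI format
payload = {
    "model": "moa-model",
    "messages": [
        {"role": "user", "content": "Hello, can you help me with a coding problem?"}
    ],
    "stream": False,
}

response = requests.post("http://0.0.0.0:8000/chat/completions", json=payload)
print(response.json()["choices"][0]["message"]["content"])
```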
4 |
5 | ## Prerequisites
6 |
7 | Before running the server, make sure you have the following dependencies installed:
8 |
9 | - Python 3.10 or higher
10 | - FastAPI
11 | - Uvicorn
12 | - Pydantic
13 | - MoA-LLM
14 |
15 | You can install the required Python packages using pip:
16 |
17 | ```bash
18 | pip install fastapi uvicorn pydantic moa-llm
19 | ```
20 |
21 | ## Configuration
22 |
23 | The server configuration is defined in the `moa_config.yaml` file. This file specifies the models and settings for the proposal and aggregation layers of the MoA. You can modify this file to use different models or adjust the temperature and other settings.
24 |
25 | ## Running the Server
26 |
27 | To run the server, execute the following command:
28 |
29 | ```bash
30 | python server.py [path/to/moa_config.yaml]
31 | ```
32 |
33 | Replace `[path/to/moa_config.yaml]` with the path to your MoA configuration file.
34 |
35 | If you don't provide a path to the configuration file, the server will use the default `moa_config.yaml` file in the same directory.
36 |
37 | The server will start running on `http://0.0.0.0:8000`. You can send POST requests to the `/chat/completions` endpoint with a JSON payload containing the chat messages and other parameters, as specified by the OpenAI API.
38 |
39 | Here's an example of a valid request payload:
40 |
41 | ```json
42 | {
43 |   "model": "moa-model",
44 |   "messages": [
45 |     {"role": "user", "content": "Hello, can you help me with a coding problem?"}
46 |   ],
47 |   "max_tokens": 512
48 | }
49 | ```
50 |
51 | The server will respond with a JSON payload containing the generated chat completion, following the OpenAI API format.
52 |
53 | ## Streaming Responses
54 |
55 | If you set the `stream` parameter to `true` in the request payload, the server will stream the generated response token by token, allowing you to display the response as it's being generated.
56 |
57 | ## Customization
58 |
59 | You can customize the server by modifying the `moa_config.yaml` file or the `server.py` script. For example, you can change the models used in the proposal and aggregation layers, adjust the temperature and other settings, or modify the server's behavior.
60 |
61 | ## License
62 |
63 | This example is part of the MoA-LLM project and is licensed under the [MIT License](https://opensource.org/licenses/MIT).
--------------------------------------------------------------------------------
/examples/openai_compatible_server/moa_config.yaml:
--------------------------------------------------------------------------------
1 | use_annotator: false
2 |
3 | proposal_layers:
4 |   - neurons:
5 |       - model: "claude-3-5-sonnet-20240620"
6 |         temperature: 0.7
7 |       - model: "gpt-4-turbo-2024-04-09"
8 |         temperature: 0.7
9 |
10 | aggregation_layer:
11 |   model: "gpt-4o-2024-08-06"
12 |   system_prompt: |
13 |     You are an advanced AI programming assistant tasked with outputting high-quality answers to difficult coding tasks.
14 |     You may be provided with responses from previous attempts to the same task, correct or incorrect, and you should use them to improve your answer.
15 |
16 |     Guidelines for LLM
17 |     - Context: Before responding, carefully consider the full context of the request. Refer back to this prompt frequently.
18 |     - Task Breakdown: If the task is complex, break it down into smaller, manageable steps. Address each step sequentially.
19 |     - Precision: Aim for precise, specific responses rather than vague or general statements.
20 |     - Knowledge Limits: If you're unsure about something, clearly state your uncertainty.
Do not invent information. 21 | - Structured Output: When appropriate, use structured formats like lists, tables, or markdown to organize information clearly. 22 | - Examples: If helpful, provide brief examples to illustrate your points. 23 | - Conciseness: Balance thoroughness with brevity. Provide complete information without unnecessary verbosity. 24 | - Clarity Check: Before finalizing your response, review it to ensure it directly and clearly addresses the user's request. 25 | - Always write correct, up to date, bug free, fully functional and working, secure, performant and efficient code. 26 | - Focus on readability over being performant. 27 | - Fully implement all requested functionality. 28 | - Leave NO todos, placeholders or missing pieces. 29 | - Be concise. Minimize any other prose. 30 | 31 | Remember: Deliver a refined, enhanced answer that appears as a single, authoritative response to the user's query, including correct code and any necessary explanation. 32 | 33 | # Context 34 | {responses} 35 | 36 | temperature: 0.7 37 | max_tokens: 4000 38 | 39 | use_weights: false 40 | base_url: https://api.crosshatch.app 41 | -------------------------------------------------------------------------------- /examples/openai_compatible_server/moa_config_chat.yaml: -------------------------------------------------------------------------------- 1 | use_annotator: false 2 | 3 | proposal_layers: 4 | - neurons: 5 | - model: "meta-llama/Meta-Llama-3.1-70B-Instruct" 6 | temperature: 0.7 7 | - model: "deepseek-chat" 8 | temperature: 0.7 9 | 10 | aggregation_layer: 11 | model: "meta-llama/Meta-Llama-3.1-70B-Instruct" 12 | system_prompt: | 13 | You are an advanced AI assistant in a chat environment, synthesizing responses from multiple Language Models (LLMs) into natural, conversational replies. Your goal is to create cohesive, informative, and engaging responses that feel like a seamless part of an ongoing dialogue. 14 | 15 | # Core Approach 16 | 17 | Analyze and blend insights from multiple LLM responses. 18 | Craft replies that sound natural and conversational. 19 | Adapt your tone and style to match the user's communication style. 20 | Maintain context and continuity throughout the chat. 21 | 22 | # Key Attributes 23 | 24 | Knowledgeable yet approachable across all topics 25 | Adaptive to the flow and context of the conversation 26 | Clear and concise, avoiding unnecessary complexity 27 | Open-minded and willing to discuss any subject 28 | Emotionally intelligent, picking up on user's tone and mood 29 | 30 | # Response Guidelines 31 | 32 | Start responses naturally, as you would in a real conversation. 33 | Blend insights from multiple sources seamlessly into your replies. 34 | Use a mix of shorter and longer messages to maintain a natural rhythm. 35 | Address the user's points directly, maintaining relevance. 36 | Include conversational elements like questions or light observations when appropriate. 37 | Use simple language and explain complex concepts in accessible terms. 38 | Transition smoothly between topics if the conversation shifts. 39 | Offer to elaborate or provide examples if it feels natural to the flow. 40 | 41 | # Quality Focus 42 | 43 | Accuracy: Provide correct information while acknowledging uncertainties. 44 | Relevance: Keep responses focused on the user's current interests or questions. 45 | Clarity: Explain ideas in simple, easy-to-understand terms. 46 | Engagement: Maintain an interesting and dynamic conversation. 
47 | Balance: Present multiple perspectives on complex topics when relevant. 48 | 49 | # Conversational Considerations 50 | 51 | Approach all topics openly, including controversial ones. 52 | Maintain a casual, friendly tone unless the user sets a different tone. 53 | Use conversational phrases and interjections naturally (e.g., "Well," "You know," "Actually"). 54 | Feel free to use humor or light-hearted comments when appropriate. 55 | Show empathy and understanding in your responses. 56 | Ask follow-up questions to clarify or deepen the conversation. 57 | Acknowledge and build upon the user's input in your responses. 58 | Be willing to admit uncertainty or limitations in knowledge. 59 | 60 | # Natural Response Structure 61 | 62 | Open with a direct address to the user's point or question. 63 | Present the main synthesized information in a conversational manner. 64 | Weave in additional context or perspectives naturally. 65 | Close with an element that invites further engagement (e.g., a question, an observation). 66 | 67 | # Adaptability 68 | 69 | Adjust your language complexity based on the user's communication style. 70 | Shift tone and formality to match the user's approach. 71 | Be responsive to changes in conversation direction or user mood. 72 | 73 | Remember: Your goal is to create a natural, engaging chat experience. Synthesize information from multiple sources into conversational, context-appropriate responses that feel like they're coming from a knowledgeable, friendly chat partner. 74 | 75 | Use the following context to create a response. Remember the previous responses may not be correct. Think step-by-step and reason through the best response before providing a final response. 76 | {responses} 77 | 78 | temperature: 0.7 79 | max_tokens: 8000 80 | 81 | use_weights: false 82 | base_url: https://api.crosshatch.app 83 | -------------------------------------------------------------------------------- /examples/openai_compatible_server/server.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import time 4 | from typing import List, Optional 5 | import sys 6 | 7 | from fastapi import FastAPI 8 | from pydantic import BaseModel 9 | from starlette.responses import StreamingResponse 10 | 11 | from moa_llm import create_moa_from_config 12 | 13 | app = FastAPI(title="MoA-powered Chat API") 14 | 15 | 16 | class Message(BaseModel): 17 | role: str 18 | content: str 19 | 20 | 21 | class ChatCompletionRequest(BaseModel): 22 | model: Optional[str] = "moa-model" 23 | messages: List[Message] 24 | max_tokens: Optional[int] = 512 25 | temperature: Optional[float] = 0.7 # This is set up in the moa_config.yaml file 26 | stream: Optional[bool] = False 27 | 28 | if len(sys.argv) > 1: 29 | config_path = sys.argv[1] 30 | else: 31 | config_path = "examples/openai_compatible_server/moa_config.yaml" 32 | 33 | moa = create_moa_from_config(config_path, is_file_path=True) 34 | 35 | async def stream_response(content: str): 36 | tokens = content.split() 37 | for token in tokens: 38 | chunk = { 39 | "choices": [{"delta": {"content": token + " "}}], 40 | } 41 | yield f"data: {json.dumps(chunk)}\n\n" 42 | await asyncio.sleep(0.1) 43 | yield "data: [DONE]\n\n" 44 | 45 | 46 | @app.post("/chat/completions") 47 | async def chat_completions(request: ChatCompletionRequest): 48 | messages = [{"role": msg.role, "content": msg.content} for msg in request.messages] 49 | 50 | result = await moa.process(messages) 51 | 52 | if request.stream: 53 | return 
StreamingResponse(stream_response(result["content"]), media_type="text/event-stream") 54 | 55 | return { 56 | "id": "moa-1", 57 | "object": "chat.completion", 58 | "created": int(time.time()), 59 | "model": request.model, 60 | "choices": [{"message": {"role": "assistant", "content": result["content"]}}], 61 | } 62 | 63 | 64 | if __name__ == "__main__": 65 | import uvicorn 66 | 67 | uvicorn.run(app, host="0.0.0.0", port=8000) -------------------------------------------------------------------------------- /examples/simple_example.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | 3 | from moa_llm import create_moa_from_config 4 | 5 | async def main(): 6 | moa = create_moa_from_config('examples/openai_compatible_server/moa_config.yaml', is_file_path=True) 7 | result = await moa.process("Write a function that takes a list of numbers and returns the sum of the squares of the numbers.") 8 | print(result) 9 | 10 | asyncio.run(main()) -------------------------------------------------------------------------------- /examples/streamlit_chat/README.md: -------------------------------------------------------------------------------- 1 | # Streamlit Chat Apps 2 | 3 | This directory contains two Streamlit applications for interacting with Mixture of Agents (MoA) models: 4 | 5 | ## streamlit_chat_local_server.py 6 | 7 | This application is designed to connect to a server (see `server.py`) and interact with a pre-configured MoA model. It provides a simple chat interface where users can send messages and receive responses from the MoA model. 8 | 9 | To run this application, you need to have the `server.py` (see `examples/openai_compatible_server/README.md`) running and accessible. Then, execute the following command: 10 | 11 | ```bash 12 | streamlit run streamlit_chat_local_server.py 13 | ``` 14 | 15 | ## streamlit_chat.py 16 | 17 | This application allows users to experiment with MoA configurations directly within the Streamlit interface. Users can specify the proposer models, their temperatures, and the aggregator model with its temperature and system prompt. 18 | 19 | ### Screenshots 20 | 21 | ![Configuration Screenshot](assets/config_screenshot.png?raw=true) 22 | 23 | ![Chat Screenshot](assets/chat_screenshot.png?raw=true) 24 | 25 | The application dynamically creates the MoA configuration based on the user's input and enables interactive chat with the configured MoA model. 26 | 27 | To run this application, execute the following command: 28 | 29 | ```bash 30 | streamlit run streamlit_chat.py 31 | ``` 32 | 33 | Both applications provide a user-friendly chat interface for interacting with MoA models, either by connecting to a pre-configured server or by dynamically configuring the MoA within the Streamlit app. 
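For reference, the kind of configuration that `streamlit_chat.py` assembles interactively can also be written as a plain dictionary and passed directly to `create_moa_from_config`, since the loader accepts dictionaries as well as YAML files. A minimal sketch (the model identifiers are placeholders for whatever your provider offers):

```python
import asyncio

from moa_llm import create_moa_from_config

# Two proposer neurons feeding one aggregator; assumes PROVIDER_API_KEY (and optionally
# BASE_URL) are set in .env.local as described in the top-level README.
config = {
    "use_annotator": False,
    "proposal_layers": [
        {
            "neurons": [
                {"model": "provider/proposer-model-a", "temperature": 0.7},
                {"model": "provider/proposer-model-b", "temperature": 0.7},
            ]
        }
    ],
    "aggregation_layer": {
        "model": "provider/aggregator-model",
        "temperature": 0.7,
        "system_prompt": "Synthesize the responses below into a single answer:\n{responses}",
    },
}

async def main():
    moa = create_moa_from_config(config)  # dict input; no YAML file needed
    result = await moa.process([{"role": "user", "content": "Hello!"}])
    print(result["content"])

asyncio.run(main())
```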
-------------------------------------------------------------------------------- /examples/streamlit_chat/assets/chat_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catena-labs/moa-llm/5f1a43a59c8e7311a92a07b348153111d4dcd3ad/examples/streamlit_chat/assets/chat_screenshot.png -------------------------------------------------------------------------------- /examples/streamlit_chat/assets/config_screenshot.png: -------------------------------------------------------------------------------- https://raw.githubusercontent.com/catena-labs/moa-llm/5f1a43a59c8e7311a92a07b348153111d4dcd3ad/examples/streamlit_chat/assets/config_screenshot.png -------------------------------------------------------------------------------- /examples/streamlit_chat/streamlit_chat.py: -------------------------------------------------------------------------------- 1 | import streamlit as st 2 | import asyncio 3 | import yaml 4 | 5 | from typing import List, Dict 6 | from moa_llm import MixtureOfAgents, LLMNeuron, AggregationLayer, Layer 7 | from dotenv import load_dotenv 8 | 9 | # Load environment variables from .env.local file 10 | load_dotenv(".env.local") 11 | 12 | DEFAULT_AGGREGATION_LAYER_SYSTEM_PROMPT = """You have been provided with a set of responses from various open-source models to the latest user query. Your task is to synthesize these responses into a single, high-quality response. It is crucial to critically evaluate the information provided in these responses, recognizing that some of it may be biased or incorrect. Your response should not simply replicate the given answers but should offer a refined, accurate, and comprehensive reply to the instruction. Ensure your response is well-structured, coherent, and adheres to the highest standards of accuracy and reliability. 13 | 14 | Responses from models: 15 | {responses} 16 | """ 17 | 18 | def convert_to_yaml(config): 19 | proposal_layers = [] 20 | for layer in config["proposal_layers"]: 21 | neurons = [] 22 | for neuron in layer["neurons"]: 23 | neuron_config = { 24 | "model": neuron["model_id"], 25 | "temperature": neuron["temperature"], 26 | } 27 | if neuron["weight"] != 1.0: 28 | neuron_config["weight"] = neuron["weight"] 29 | if neuron["system_prompt"]: 30 | neuron_config["system_prompt"] = neuron["system_prompt"] 31 | neurons.append(neuron_config) 32 | proposal_layers.append({"neurons": neurons}) 33 | 34 | aggregation_layer = { 35 | "model": config["aggregation_layer"]["model_id"], 36 | "system_prompt": config["aggregation_layer"]["system_prompt"], 37 | "temperature": config["aggregation_layer"]["temperature"], 38 | "max_tokens": config["aggregation_layer"].get("max_tokens", 8000), 39 | } 40 | 41 | yaml_config = { 42 | "use_annotator": config["use_annotator"], 43 | "proposal_layers": proposal_layers, 44 | "aggregation_layer": aggregation_layer, 45 | "use_weights": config["use_weights"], 46 | } 47 | 48 | return yaml.dump(yaml_config) 49 | 50 | def create_moa_config(config: Dict) -> MixtureOfAgents: 51 | """ 52 | Create a Mixture of Agents (MOA) configuration based on the given configuration. 53 | 54 | Args: 55 | config (Dict): A dictionary containing the MOA configuration. 56 | 57 | Returns: 58 | MixtureOfAgents: A configured MOA instance. 
59 | """ 60 | proposal_layers = [] 61 | for layer_config in config.get("proposal_layers", []): 62 | layer_neurons = [] 63 | for neuron_config in layer_config.get("neurons", []): 64 | layer_neurons.append( 65 | LLMNeuron( 66 | model=neuron_config["model_id"], 67 | temperature=neuron_config["temperature"], 68 | weight=neuron_config.get("weight", 1.0), 69 | max_tokens=2048, 70 | neuron_type=f"proposer_layer_{len(proposal_layers)+1}_neuron_{len(layer_neurons)+1}", 71 | system_prompt=neuron_config.get("system_prompt", ""), 72 | ) 73 | ) 74 | proposal_layers.append(Layer(layer_neurons)) 75 | 76 | agg_neuron = LLMNeuron( 77 | model=config["aggregation_layer"]["model_id"], 78 | temperature=config["aggregation_layer"]["temperature"], 79 | max_tokens=2048, 80 | neuron_type="aggregator", 81 | system_prompt=config["aggregation_layer"]["system_prompt"], 82 | ) 83 | aggregation_layer = AggregationLayer( 84 | agg_neuron, 85 | shuffle=False, 86 | dropout_rate=0.0, 87 | use_weights=config.get("use_weights", False), 88 | ) 89 | 90 | annotator = None 91 | if config.get("use_annotator", False): 92 | annotator = LLMNeuron( 93 | model=config["annotator"]["model_id"], 94 | temperature=config["annotator"]["temperature"], 95 | max_tokens=2048, 96 | neuron_type="annotator", 97 | system_prompt=config["annotator"]["system_prompt"], 98 | ) 99 | 100 | return MixtureOfAgents( 101 | proposal_layers=proposal_layers, 102 | aggregator_layer=aggregation_layer, 103 | annotator=annotator, 104 | use_annotator=config.get("use_annotator", False), 105 | max_workers=4, 106 | pass_corresponding_results=config.get("pass_corresponding_results", False), 107 | messages=None, 108 | ) 109 | 110 | async def chat_page(): 111 | """ 112 | Chat page for interacting with the MOA. 113 | """ 114 | st.title("MOA Chat") 115 | 116 | if "messages" not in st.session_state: 117 | st.session_state["messages"] = [] 118 | 119 | # Display chat history 120 | for message in st.session_state["messages"]: 121 | with st.chat_message(message["role"]): 122 | st.markdown(message["content"]) 123 | 124 | # Handle user input 125 | user_input = st.chat_input("Enter your message") 126 | if user_input: 127 | st.session_state["messages"].append({"role": "user", "content": user_input}) 128 | with st.chat_message("user"): 129 | st.markdown(user_input) 130 | 131 | if "moa" in st.session_state: 132 | moa = st.session_state["moa"] 133 | # Process user input using MOA 134 | result = await moa.process(st.session_state["messages"]) 135 | st.session_state["messages"].append({"role": "assistant", "content": result["content"]}) 136 | with st.chat_message("assistant"): 137 | st.markdown(result["content"]) 138 | 139 | async def app(): 140 | """ 141 | Main Streamlit application function. 142 | Handles the UI layout, user interactions, and MOA processing. 143 | """ 144 | st.set_page_config(page_title="MOA Chat", layout="wide") 145 | 146 | # Sidebar navigation 147 | pages = { 148 | "Configuration": config_page, 149 | "Chat": chat_page, 150 | } 151 | 152 | selection = st.sidebar.radio("Go to", list(pages.keys())) 153 | 154 | if selection == "Chat": 155 | await chat_page() 156 | else: 157 | pages[selection]() 158 | 159 | def config_page(): 160 | """ 161 | Configuration page for setting up the MOA. 
162 | """ 163 | st.title("MOA Configuration") 164 | 165 | if "config" in st.session_state: 166 | print("config in session state") 167 | config = st.session_state["config"] 168 | else: 169 | print("config not in session state") 170 | config = { 171 | "annotator": None, 172 | "proposal_layers": [], 173 | "aggregation_layer": { 174 | "model_id": "", 175 | "temperature": 0.7, 176 | "system_prompt": DEFAULT_AGGREGATION_LAYER_SYSTEM_PROMPT, 177 | }, 178 | "use_annotator": False, 179 | "use_weights": False, 180 | "pass_corresponding_results": False, 181 | } 182 | 183 | use_annotator = st.sidebar.checkbox("Use Annotator", value=config["use_annotator"], key="use_annotator") 184 | if use_annotator: 185 | st.subheader("Annotator") 186 | annotator_model_id = st.text_input("Annotator Model ID", value=config["annotator"]["model_id"] if config["annotator"] else "", key="annotator_model_id") 187 | annotator_temperature = st.slider("Annotator Temperature", 0.0, 1.0, value=config["annotator"]["temperature"] if config["annotator"] else 0.7, key="annotator_temperature") 188 | annotator_system_prompt = st.text_area("Annotator System Prompt", value=config["annotator"]["system_prompt"] if config["annotator"] else "", key="annotator_system_prompt") 189 | 190 | # Proposal layers configuration 191 | st.subheader("Proposal Layers") 192 | num_proposal_layers = st.sidebar.number_input( 193 | "Number of Proposal Layers", 194 | min_value=1, 195 | max_value=3, 196 | value=len(config["proposal_layers"]) if config["proposal_layers"] else 1, # Set default value to 1 if empty 197 | step=1, 198 | key="num_proposal_layers", 199 | ) 200 | proposal_layers = [] 201 | for layer_idx in range(num_proposal_layers): 202 | with st.expander(f"Proposal Layer {layer_idx+1}"): 203 | existing_layer_config = config["proposal_layers"][layer_idx] if layer_idx < len(config["proposal_layers"]) else {"neurons": []} 204 | num_neurons = st.sidebar.number_input(f"Number of Neurons in Layer {layer_idx+1}", min_value=1, max_value=5, value=len(existing_layer_config["neurons"]) if existing_layer_config["neurons"] else 1, step=1, key=f"num_neurons_layer_{layer_idx+1}") 205 | layer_neurons = [] 206 | cols = st.columns(num_neurons) 207 | for neuron_idx, col in enumerate(cols): 208 | existing_neuron_config = existing_layer_config["neurons"][neuron_idx] if neuron_idx < len(existing_layer_config["neurons"]) else {} 209 | with col: 210 | model_id = st.text_input(f"Model ID for Neuron {neuron_idx+1}", value=existing_neuron_config.get("model_id", ""), key=f"neuron_{layer_idx+1}_{neuron_idx+1}_model_id") 211 | temperature = st.slider(f"Temperature for Neuron {neuron_idx+1}", 0.0, 1.0, value=existing_neuron_config.get("temperature", 0.7), key=f"neuron_{layer_idx+1}_{neuron_idx+1}_temperature") 212 | weight = st.number_input(f"Weight for Neuron {neuron_idx+1}", min_value=0.0, value=existing_neuron_config.get("weight", 1.0), step=0.1, key=f"neuron_{layer_idx+1}_{neuron_idx+1}_weight") 213 | system_prompt = st.text_area(f"System Prompt for Neuron {neuron_idx+1}", value=existing_neuron_config.get("system_prompt", ""), key=f"neuron_{layer_idx+1}_{neuron_idx+1}_system_prompt") 214 | layer_neurons.append({"model_id": model_id, "temperature": temperature, "weight": weight, "system_prompt": system_prompt}) 215 | proposal_layers.append({"neurons": layer_neurons}) 216 | 217 | # Aggregation layer configuration 218 | st.subheader("Aggregation Layer") 219 | agg_model_id = st.text_input("Aggregator Model ID", value=config["aggregation_layer"]["model_id"], key="agg_model_id") 220 
| agg_temperature = st.slider("Aggregator Temperature", 0.0, 1.0, value=config["aggregation_layer"]["temperature"], key="agg_temperature") 221 | agg_system_prompt = st.text_area("Aggregator System Prompt", value=config["aggregation_layer"]["system_prompt"], key="agg_system_prompt") 222 | 223 | # Other configurations 224 | use_weights = st.checkbox("Use Weights", value=config["use_weights"], key="use_weights") 225 | pass_corresponding_results = st.checkbox("Pass Corresponding Results", value=config["pass_corresponding_results"], key="pass_corresponding_results") 226 | show_json = st.checkbox("Show JSON", key="show_json") 227 | 228 | if st.button("Save Configuration"): 229 | config = { 230 | "annotator": { 231 | "model_id": annotator_model_id, 232 | "temperature": annotator_temperature, 233 | "system_prompt": annotator_system_prompt, 234 | } if use_annotator else None, 235 | "proposal_layers": proposal_layers, 236 | "aggregation_layer": { 237 | "model_id": agg_model_id, 238 | "temperature": agg_temperature, 239 | "system_prompt": agg_system_prompt, 240 | }, 241 | "use_annotator": use_annotator, 242 | "use_weights": use_weights, 243 | "pass_corresponding_results": pass_corresponding_results, 244 | } 245 | st.session_state["config"] = config 246 | st.session_state["moa"] = create_moa_config(config) 247 | 248 | if show_json: 249 | with st.expander("Configuration JSON"): 250 | st.subheader("Configuration JSON") 251 | st.json(config) 252 | yaml_config = convert_to_yaml(config) 253 | st.code(yaml_config, language="yaml") 254 | 255 | 256 | if __name__ == "__main__": 257 | asyncio.run(app()) -------------------------------------------------------------------------------- /examples/streamlit_chat/streamlit_chat_local_server.py: -------------------------------------------------------------------------------- 1 | """ 2 | This script implements a simple chat interface using Streamlit and communicates with a MoA (Mixture of Agents) model. 3 | It allows users to interact with the AI model in a conversational manner. 
4 | """ 5 | 6 | import requests 7 | import streamlit as st 8 | 9 | # Set the title of the Streamlit app 10 | st.title("MoA Chat") 11 | 12 | # Initialize the chat history in the session state if it doesn't exist 13 | if "messages" not in st.session_state: 14 | st.session_state.messages = [] 15 | 16 | # Get the endpoint URL from the user 17 | endpoint_url = st.text_input("Enter the endpoint URL", value="http://0.0.0.0:8000/chat/completions") 18 | 19 | # Display all previous messages in the chat history 20 | for message in st.session_state.messages: 21 | with st.chat_message(message["role"]): 22 | st.markdown(message["content"]) 23 | 24 | # Get user input and process it 25 | if prompt := st.chat_input("Ask a question"): 26 | # Add user message to the chat history 27 | st.session_state.messages.append({"role": "user", "content": prompt}) 28 | with st.chat_message("user"): 29 | st.markdown(prompt) 30 | 31 | # Prepare the payload for the API request 32 | payload = { 33 | "model": "moa-model", 34 | "messages": [ 35 | {"role": msg["role"], "content": msg["content"]} 36 | for msg in st.session_state.messages 37 | ], 38 | "stream": False, 39 | } 40 | 41 | # Send a POST request to the MoA model API 42 | response = requests.post(endpoint_url, json=payload) 43 | data = response.json() 44 | 45 | # Extract the assistant's response from the API response 46 | assistant_response = data["choices"][0]["message"]["content"] 47 | 48 | # Add assistant's response to the chat history 49 | st.session_state.messages.append({"role": "assistant", "content": assistant_response}) 50 | 51 | # Display the assistant's response in the chat interface 52 | with st.chat_message("assistant"): 53 | st.markdown(assistant_response) -------------------------------------------------------------------------------- /moa_llm/__init__.py: -------------------------------------------------------------------------------- 1 | import logging 2 | 3 | from .aggregation_layer import AggregationLayer 4 | from .config_loader import create_moa_from_config 5 | from .layer import Layer 6 | from .mixture_of_agents import MixtureOfAgents 7 | from .neuron import LLMNeuron, Neuron 8 | from .user_query_annotator import UserQueryAnnotator 9 | from .version import VERSION, VERSION_SHORT 10 | 11 | logging.getLogger(__name__).addHandler(logging.NullHandler()) 12 | 13 | # Set up basic logging configuration 14 | logging.basicConfig( 15 | level=logging.INFO, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s" 16 | ) 17 | __all__ = [ 18 | "Neuron", 19 | "LLMNeuron", 20 | "Layer", 21 | "AggregationLayer", 22 | "MixtureOfAgents", 23 | "UserQueryAnnotator", 24 | "create_moa_from_config", 25 | ] 26 | -------------------------------------------------------------------------------- /moa_llm/aggregation_layer.py: -------------------------------------------------------------------------------- 1 | import random 2 | import string 3 | from typing import Any, Dict, List, Optional 4 | 5 | from .layer import Layer 6 | from .neuron import Neuron 7 | 8 | 9 | class AggregationLayer(Layer): 10 | """ 11 | A layer that aggregates responses from previous layers and processes them through a single neuron. 12 | 13 | This layer can optionally shuffle the responses, apply dropout, and use a custom aggregation prompt template. 14 | 15 | Attributes: 16 | neuron (Neuron): The neuron used for processing the aggregated input. 17 | aggregation_prompt_template (str | None): Template for aggregating responses. 18 | shuffle (bool): Whether to shuffle the responses before processing. 
19 |         dropout_rate (float): The rate at which to randomly drop responses.
20 |         use_weights (bool, optional): Whether to use weights for responses. Defaults to False.
21 |     """
22 |
23 |     def __init__(
24 |         self,
25 |         neuron: Neuron,
26 |         aggregation_prompt_template: str | None = None,
27 |         shuffle: bool = False,
28 |         dropout_rate: float = 0.0,
29 |         use_weights: bool = False,
30 |     ):
31 |         """
32 |         Initialize the AggregationLayer.
33 |
34 |         Args:
35 |             neuron (Neuron): The neuron to use for processing.
36 |             aggregation_prompt_template (str | None, optional): Template for aggregating responses. Defaults to None.
37 |             shuffle (bool, optional): Whether to shuffle responses. Defaults to False.
38 |             dropout_rate (float, optional): Rate for randomly dropping responses. Defaults to 0.0.
39 |             use_weights (bool, optional): Whether to use weights for responses. Defaults to False.
40 |         """
41 |         super().__init__([neuron])
42 |         self.neuron = neuron
43 |         self.aggregation_prompt_template = aggregation_prompt_template
44 |         self.shuffle = shuffle
45 |         self.dropout_rate = dropout_rate
46 |         self.use_weights = use_weights
47 |
48 |     async def process(
49 |         self, input_data: Any, prev_response: Optional[List[Dict[str, Any]]] = None
50 |     ) -> List[Dict[str, Any]]:
51 |         """
52 |         Process the input data and previous responses through the aggregation layer.
53 |
54 |         Args:
55 |             input_data (Any): The input data to process.
56 |             prev_response (Optional[List[Dict[str, Any]]], optional): Previous responses to aggregate. Defaults to None.
57 |
58 |         Returns:
59 |             List[Dict[str, Any]]: A list containing the result of processing.
60 |         """
61 |         if prev_response is None:
62 |             prev_response = []
63 |
64 |         if self.shuffle:
65 |             random.shuffle(prev_response)
66 |
67 |         # Apply dropout
68 |         prev_response = [r for r in prev_response if random.random() > self.dropout_rate]
69 |
70 |         if self.aggregation_prompt_template is not None:
71 |             aggregated_input = self.collect_responses(
72 |                 self.aggregation_prompt_template, prev_response, input_data, self.use_weights
73 |             )
74 |             # Use the filled-in template as the aggregator neuron's system prompt
75 |             self.neuron.system_prompt = aggregated_input
76 |
77 |         result = await self.neuron.process(input_data)
78 |
79 |         return [result]
80 |
81 |     @staticmethod
82 |     def collect_responses(
83 |         aggregation_prompt_template: str,
84 |         results: List[Dict[str, Any]],
85 |         user_query: str,
86 |         use_weights: bool,
87 |     ) -> str:
88 |         """
89 |         Collect and format responses from multiple models for aggregation.
90 |
91 |         Args:
92 |             aggregation_prompt_template (str): The template for formatting the aggregated prompt.
93 |             results (List[Dict[str, Any]]): List of results from different models.
94 |             user_query (str): The original user query.
95 |             use_weights (bool): Whether to use weights for the responses.
96 |
97 |         Returns:
98 |             str: Formatted string containing aggregated responses and weights.
99 | """ 100 | # Combine responses from all models 101 | responses = "\n".join( 102 | [ 103 | f"Model {i+1}: {str(result['content'])} {f'(Weight: {result.get('weight', 1.0)})' if use_weights else ''}" 104 | for i, result in enumerate(results) 105 | ] 106 | ) 107 | 108 | # Prepare arguments for formatting 109 | format_args = { 110 | "user_query": user_query, 111 | "responses": responses, 112 | } 113 | 114 | # Ensure all required keys are present in the template 115 | required_keys = [ 116 | key[1] 117 | for key in string.Formatter().parse(aggregation_prompt_template) 118 | if key[1] is not None 119 | ] 120 | for key in required_keys: 121 | if key not in format_args: 122 | format_args[key] = f"[{key} not provided]" 123 | 124 | # Format and return the final aggregated prompt 125 | return aggregation_prompt_template.format(**format_args) -------------------------------------------------------------------------------- /moa_llm/config_loader.py: -------------------------------------------------------------------------------- 1 | """ 2 | config_loader.py 3 | 4 | This module is responsible for loading and parsing configuration files for the Mixture of Agents (MoA) system. 5 | It provides functionality to: 6 | 7 | 1. Load configurations from YAML files or strings 8 | 2. Parse and validate the configuration structure 9 | 3. Create MixtureOfAgents instances based on the loaded configuration 10 | 4. Set up layers, neurons, and other components of the MoA system 11 | 12 | The main functions in this module are: 13 | - load_config: Loads and parses a configuration from various input types 14 | - create_moa_from_config: Creates a MixtureOfAgents instance from a parsed configuration 15 | 16 | This module plays a crucial role in setting up the MoA system by interpreting user-defined 17 | configurations and instantiating the necessary components accordingly. 18 | """ 19 | 20 | import os 21 | from typing import Any, Dict, List, Optional, Union 22 | 23 | import yaml 24 | from dotenv import load_dotenv 25 | 26 | from .aggregation_layer import AggregationLayer 27 | from .layer import Layer 28 | from .mixture_of_agents import MixtureOfAgents 29 | from .neuron import LLMNeuron 30 | from .user_query_annotator import UserQueryAnnotator 31 | 32 | load_dotenv(".env.local") 33 | load_dotenv(".env") 34 | 35 | BASE_URL = os.getenv("BASE_URL", "https://api.together.xyz/v1") 36 | 37 | 38 | def load_config( 39 | config_input: Union[str, Dict[str, Any]], is_file_path: bool = False 40 | ) -> Dict[str, Any]: 41 | """ 42 | Load and parse a configuration from various input types. 43 | 44 | This function can handle configurations provided as YAML strings, file paths to YAML files, 45 | or pre-loaded dictionaries. 46 | 47 | Args: 48 | config_input (Union[str, Dict[str, Any]]): The configuration input. Can be a YAML string, 49 | a file path to a YAML file, or a pre-loaded dictionary. 50 | is_file_path (bool, optional): If True, treats a string input as a file path. 51 | Defaults to False. 52 | 53 | Returns: 54 | Dict[str, Any]: The parsed configuration as a dictionary. 55 | 56 | Raises: 57 | ValueError: If the input is invalid or cannot be parsed. 58 | FileNotFoundError: If is_file_path is True and the specified file doesn't exist. 
59 | """ 60 | if isinstance(config_input, str): 61 | if is_file_path: 62 | try: 63 | with open(config_input, "r") as file: 64 | return yaml.safe_load(file) 65 | except FileNotFoundError: 66 | raise ValueError(f"Config file not found: {config_input}") 67 | else: 68 | try: 69 | return yaml.safe_load(config_input) 70 | except yaml.YAMLError: 71 | raise ValueError("Invalid YAML string provided") 72 | elif isinstance(config_input, dict): 73 | return config_input 74 | else: 75 | raise ValueError("config_input must be either a YAML string, a file path, or a dictionary") 76 | 77 | 78 | def create_moa_from_config( 79 | config_input: Union[str, Dict[str, Any]], 80 | is_file_path: bool = False, 81 | max_workers: int = 4, 82 | messages: Optional[List[Dict[str, str]]] = None, 83 | ) -> MixtureOfAgents: 84 | """ 85 | Create a MixtureOfAgents instance from a configuration. 86 | 87 | Args: 88 | config_input (Union[str, Dict[str, Any]]): Configuration input as YAML string, file path, or dictionary. 89 | is_file_path (bool, optional): If True, treats config_input as a file path. Defaults to False. 90 | max_workers (int, optional): Maximum number of concurrent workers. Defaults to 4. 91 | messages (Optional[List[Dict[str, str]]], optional): Initial messages for the MoA. Defaults to None. 92 | 93 | Returns: 94 | MixtureOfAgents: An initialized MixtureOfAgents instance. 95 | """ 96 | # Load and parse the configuration 97 | config = load_config(config_input, is_file_path) 98 | base_url = config.get("base_url", "https://api.together.xyz/v1") 99 | 100 | # Create UserQueryAnnotator if specified in the config 101 | annotator = None 102 | if config.get("use_annotator", False): 103 | annotator_config = config.get("annotator", {}) 104 | annotator = UserQueryAnnotator( 105 | model=annotator_config.get("model"), 106 | temperature=annotator_config.get("temperature", 0.7), 107 | system_prompt=annotator_config.get("system_prompt"), 108 | base_url=annotator_config.get("base_url", BASE_URL), 109 | moa_id=None, # Will be set later 110 | neuron_type="annotator", 111 | ) 112 | 113 | # Create proposal layers 114 | proposal_layers = [] 115 | for layer_index, layer_config in enumerate(config.get("proposal_layers", [])): 116 | neurons = [ 117 | LLMNeuron( 118 | model=neuron["model"], 119 | system_prompt=neuron.get("system_prompt", ""), 120 | temperature=neuron.get("temperature", 0.7), 121 | weight=neuron.get("weight", 1.0), 122 | max_tokens=neuron.get("max_tokens", 2048), 123 | base_url=base_url, 124 | moa_id=None, 125 | neuron_type=f"proposer_layer_{layer_index + 1}", 126 | ) 127 | for neuron in layer_config["neurons"] 128 | ] 129 | proposal_layers.append(Layer(neurons)) 130 | 131 | # Create aggregation layer 132 | agg_config = config["aggregation_layer"] 133 | agg_neuron = LLMNeuron( 134 | model=agg_config["model"], 135 | temperature=agg_config.get("temperature", 0.7), 136 | max_tokens=agg_config.get("max_tokens", 2048), 137 | base_url=base_url, 138 | moa_id=None, 139 | neuron_type="aggregator", 140 | ) 141 | aggregation_layer = AggregationLayer( 142 | agg_neuron, 143 | agg_config.get("system_prompt", None), 144 | shuffle=agg_config.get("shuffle", False), 145 | dropout_rate=agg_config.get("dropout_rate", 0.0), 146 | use_weights=agg_config.get("use_weights", False), 147 | ) 148 | 149 | # Create MixtureOfAgents instance 150 | moa = MixtureOfAgents( 151 | proposal_layers=proposal_layers, 152 | aggregator_layer=aggregation_layer, 153 | annotator=annotator, 154 | use_annotator=config.get("use_annotator", False), 155 | 
max_workers=max_workers, 156 | pass_corresponding_results=config.get("pass_corresponding_results", False), 157 | messages=messages, 158 | ) 159 | 160 | # Set moa_id for all neurons 161 | for layer in proposal_layers: 162 | for neuron in layer.neurons: 163 | neuron.moa_id = moa.moa_id 164 | aggregation_layer.neuron.moa_id = moa.moa_id 165 | if annotator: 166 | annotator.moa_id = moa.moa_id 167 | 168 | return moa 169 | -------------------------------------------------------------------------------- /moa_llm/layer.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | from typing import Any, Dict, List, Optional, Sequence, Union 3 | 4 | from .neuron import Neuron 5 | 6 | 7 | class Layer: 8 | """ 9 | Represents a layer of neurons in the mixture of agents model. 10 | 11 | This class manages a sequence of neurons and handles their concurrent processing. 12 | """ 13 | 14 | def __init__(self, neurons: Sequence[Neuron], max_workers: int = 4): 15 | """ 16 | Initialize a Layer instance. 17 | 18 | Args: 19 | neurons (Sequence[Neuron]): A sequence of Neuron objects that make up this layer. 20 | max_workers (int, optional): The maximum number of concurrent workers. Defaults to 4. 21 | """ 22 | self.neurons = neurons 23 | self.semaphore = asyncio.Semaphore(max_workers) 24 | 25 | async def process( 26 | self, input_data: Union[str, List[Dict[str, str]]], prev_response: Optional[List[Dict[str, Any]]] = None 27 | ) -> List[Dict[str, Any]]: 28 | """ 29 | Process the input data through all neurons in the layer concurrently. 30 | 31 | Args: 32 | input_data (str): The input data to be processed. 33 | prev_response (Optional[List[Dict[str, Any]]], optional): The response from the previous layer, if any. Defaults to None. 34 | 35 | Returns: 36 | List[Dict[str, Any]]: A list of dictionaries containing the results from each neuron. 37 | """ 38 | prev_response_content = [r["content"] for r in prev_response] if prev_response else None 39 | 40 | async def process_neuron(neuron): 41 | async with self.semaphore: 42 | return await neuron.process(input_data, prev_response_content) 43 | 44 | tasks = [process_neuron(neuron) for neuron in self.neurons] 45 | results = await asyncio.gather(*tasks) 46 | 47 | return results 48 | 49 | @property 50 | def max_workers(self) -> int: 51 | """ 52 | Get the maximum number of concurrent workers. 53 | 54 | Returns: 55 | int: The maximum number of workers. 56 | """ 57 | return self._max_workers 58 | 59 | @max_workers.setter 60 | def max_workers(self, value: int): 61 | """ 62 | Set the maximum number of concurrent workers and update the semaphore. 63 | 64 | Args: 65 | value (int): The new maximum number of workers. 66 | """ 67 | self._max_workers = value 68 | self.semaphore = asyncio.Semaphore(value) 69 | -------------------------------------------------------------------------------- /moa_llm/mixture_of_agents.py: -------------------------------------------------------------------------------- 1 | """ 2 | This Python file defines the MixtureOfAgents class, which implements a multi-layer approach for processing queries using various AI models. The class manages multiple proposal layers, an aggregation layer, and an optional query annotator to process input and produce a final output. Key features include: 3 | 4 | 1. Initialization of proposal layers, aggregation layer, and optional annotator 5 | 2. Configuration of processing parameters like max workers and result passing strategy 6 | 3. 
Methods for processing single queries or message arrays 7 | 4. Asynchronous processing of layers and aggregation of results 8 | 5. Timing and logging of processing steps 9 | 6. Handling of both single-turn queries and multi-turn conversations 10 | 11 | """ 12 | 13 | import asyncio 14 | import logging 15 | import time 16 | import uuid 17 | from typing import Any, Dict, List, Optional, Union 18 | 19 | from .aggregation_layer import AggregationLayer 20 | from .layer import Layer 21 | from .user_query_annotator import UserQueryAnnotator 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | 26 | class MixtureOfAgents: 27 | """ 28 | A class that implements a mixture of agents approach for processing queries. 29 | 30 | This class manages multiple layers of neural networks and an aggregation layer 31 | to process input queries and produce a final output. 32 | """ 33 | 34 | def __init__( 35 | self, 36 | proposal_layers: List[Layer], 37 | aggregator_layer: AggregationLayer, 38 | annotator: Optional[UserQueryAnnotator] = None, 39 | use_annotator: bool = False, 40 | max_workers: int = 4, 41 | pass_corresponding_results: bool = False, 42 | messages: Optional[List[Dict[str, str]]] = None, 43 | ): 44 | """ 45 | Initialize the MixtureOfAgents. 46 | 47 | Args: 48 | proposal_layers (List[Layer]): List of proposal layers for processing. 49 | aggregator_layer (AggregationLayer): The final aggregation layer. 50 | annotator (Optional[UserQueryAnnotator]): An optional query annotator. 51 | use_annotator (bool): Flag to determine if the annotator should be used. 52 | max_workers (int): Maximum number of concurrent workers. 53 | pass_corresponding_results (bool): Flag to pass corresponding results between layers, instead of fully connected layers. 54 | messages (Optional[List[Dict[str, str]]]): Optional list of messages to process. 55 | """ 56 | self.proposal_layers = proposal_layers 57 | self.aggregator_layer = aggregator_layer 58 | self.annotator = annotator if isinstance(annotator, UserQueryAnnotator) else None 59 | self.use_annotator = use_annotator 60 | self.max_workers = max_workers 61 | for layer in self.proposal_layers: 62 | layer.max_workers = max_workers 63 | self.aggregator_layer.max_workers = max_workers 64 | self.moa_id = str(uuid.uuid4()) 65 | self.pass_corresponding_results = pass_corresponding_results 66 | self.messages = messages 67 | 68 | async def process(self, input_data: Union[str, List[Dict[str, str]]]) -> Dict[str, Any]: 69 | """ 70 | Process the input data through the mixture of agents. 71 | 72 | Args: 73 | input_data (Union[str, List[Dict[str, str]]]): The input query or messages to process. 74 | 75 | Returns: 76 | Dict[str, Any]: A dictionary containing the processed result, including 77 | content, response times, total completion time, and annotated query (if used). 
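        Example (a minimal sketch, assuming a MoA built with ``create_moa_from_config`` from
        ``moa_llm.config_loader``; the question below is a placeholder):

            result = await moa.process("What are the trade-offs of mixture-of-agents inference?")
            print(result["content"])                # final aggregated answer
            print(result["response_times"])         # per-neuron timings, keyed by layer and neuron
            print(result["total_completion_time"])  # wall-clock seconds for the full pipeline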
78 | """ 79 | start_time = time.time() 80 | results: List[Dict[str, Any]] = [] 81 | response_times = {} 82 | 83 | # Use input_data directly if it's a list of messages, otherwise create a single message 84 | messages = ( 85 | input_data 86 | if isinstance(input_data, list) 87 | else [{"role": "user", "content": input_data}] 88 | ) 89 | 90 | # Layer 0: Process with the first proposal layer 91 | if len(self.proposal_layers) > 0: 92 | layer_results = await self.proposal_layers[0].process(messages) 93 | results.extend(layer_results) 94 | for i, result in enumerate(layer_results): 95 | response_times[f"Layer 1 - Neuron {i+1}"] = result["time"] 96 | 97 | # Layers 1 to N-1: Process with remaining proposal layers 98 | for layer_num in range(1, len(self.proposal_layers)): 99 | layer_results = [] 100 | layer = self.proposal_layers[layer_num] 101 | 102 | if self.pass_corresponding_results: 103 | tasks = [ 104 | process_neuron(i, neuron, messages, results) 105 | for i, neuron in enumerate(layer.neurons) 106 | ] 107 | layer_results = await asyncio.gather(*tasks) 108 | for i, neuron_output in enumerate(layer_results): 109 | response_times[f"Layer {layer_num+1} - Neuron {i+1}"] = neuron_output["time"] 110 | else: 111 | layer_output = await layer.process(messages, prev_response=results) 112 | layer_results.extend(layer_output) 113 | for j, result in enumerate(layer_output): 114 | response_times[f"Layer {layer_num+1} - Neuron {j+1}"] = result["time"] 115 | 116 | results.extend(layer_results) 117 | 118 | # Layer N: Final aggregation 119 | final_result = await self.aggregator_layer.process(messages, results) 120 | response_times["Aggregation Layer"] = final_result[0]["time"] 121 | 122 | end_time = time.time() 123 | total_completion_time = end_time - start_time 124 | 125 | logger.info(f"Total completion time: {total_completion_time:.2f} seconds") 126 | 127 | return { 128 | "content": final_result[0]["content"], 129 | "response_times": response_times, 130 | "total_completion_time": total_completion_time, 131 | "annotated_query": messages if self.use_annotator and self.annotator else None, 132 | } 133 | 134 | 135 | async def process_neuron(i, neuron, messages, results): 136 | prev_result = [results[i]] if i < len(results) else [] 137 | return await neuron.process(messages, prev_response=prev_result) 138 | -------------------------------------------------------------------------------- /moa_llm/neuron.py: -------------------------------------------------------------------------------- 1 | import asyncio 2 | import json 3 | import logging 4 | import os 5 | import time 6 | from abc import ABC, abstractmethod 7 | from datetime import datetime 8 | from pathlib import Path 9 | from typing import Any, Dict, List, Optional, Union 10 | 11 | from dotenv import load_dotenv 12 | from openai import AsyncOpenAI 13 | 14 | # Load environment variables from .env.local file 15 | load_dotenv(".env.local") 16 | load_dotenv(".env") 17 | 18 | BASE_URL = os.getenv("BASE_URL", "https://api.together.xyz/v1") 19 | 20 | logger = logging.getLogger(__name__) 21 | 22 | 23 | class Neuron(ABC): 24 | """ 25 | Abstract base class for a neuron in the mixture of agents model. 26 | """ 27 | 28 | def __init__(self, system_prompt: str, temperature: float, weight: float = 1.0): 29 | """ 30 | Initialize a Neuron. 31 | 32 | Args: 33 | system_prompt (str): The system prompt for the neuron. 34 | temperature (float): The temperature parameter for response generation. 35 | weight (float, optional): The weight of this neuron in the layer. Defaults to 1.0. 
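        Example (hypothetical subclass sketch; ``EchoNeuron`` is not part of the library and only
        illustrates the abstract interface that concrete neurons implement):

            class EchoNeuron(Neuron):
                async def process(self, input_data, prev_response=None):
                    # Echo the input instead of calling an LLM; real neurons return the same keys.
                    return {"content": str(input_data), "time": 0.0, "weight": self.weight}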
36 | """ 37 | self.system_prompt = system_prompt 38 | self.temperature = temperature 39 | self.weight = weight 40 | 41 | @abstractmethod 42 | async def process(self, input_data: Any, prev_response: List[str] | None = None) -> Any: 43 | """ 44 | Process the input data. 45 | 46 | Args: 47 | input_data (Any): The input data to process. 48 | prev_response (List[str] | None, optional): Previous responses. Defaults to None. 49 | 50 | Returns: 51 | Any: The processed output. 52 | """ 53 | pass 54 | 55 | 56 | class LLMNeuron(Neuron): 57 | def __init__( 58 | self, 59 | model: str, 60 | system_prompt: str = "", 61 | temperature: float = 0.7, 62 | weight: float = 1.0, 63 | max_tokens: int = 2048, 64 | base_url: str = BASE_URL, 65 | moa_id: Optional[str] = None, 66 | num_previous_responses: int = int(os.getenv("NUM_PREVIOUS_RESPONSES", "1")), 67 | neuron_type: str = "", 68 | ): 69 | super().__init__(system_prompt, temperature, weight) 70 | self.model = model 71 | self.max_tokens = max_tokens 72 | api_key = os.getenv("PROVIDER_API_KEY") 73 | if not api_key: 74 | raise ValueError("PROVIDER_API_KEY not found in .env.local file") 75 | self.client = AsyncOpenAI(api_key=api_key, base_url=base_url) 76 | self.stream = os.getenv("STREAM", "false").lower() == "true" 77 | self.moa_id = moa_id 78 | self.num_previous_responses = num_previous_responses 79 | self.neuron_type = neuron_type 80 | 81 | def log_result(self, messages, result, response_time): 82 | if os.environ.get("LOG_RESULTS", "true").lower() == "true": 83 | output_dir = Path("outputs") 84 | output_dir.mkdir(exist_ok=True) 85 | 86 | date_str = datetime.now().strftime("%Y_%m_%d") 87 | moa_id = self.moa_id or "unknown" 88 | output_file = output_dir / f"{date_str}_moa_{moa_id}.jsonl" 89 | 90 | log_entry = { 91 | "input": messages, 92 | "output": result, 93 | "generator": self.model, 94 | "moa_id": moa_id, 95 | "temperature": self.temperature, 96 | "response_time": response_time, 97 | "neuron_type": self.neuron_type, 98 | } 99 | 100 | with output_file.open("a") as f: 101 | json.dump(log_entry, f) 102 | f.write("\n") 103 | 104 | async def process( 105 | self, input_data: Union[str, List[Dict[str, str]]], prev_response: List[str] | None = None 106 | ) -> Dict[str, Any]: 107 | """ 108 | Process the input data through the LLMNeuron. 109 | 110 | Args: 111 | input_data (Union[str, List[Dict[str, str]]]): The input data to be processed. It can be a string or a list of dictionaries with 'role' and 'content' keys. 112 | prev_response (List[str] | None, optional): The previous responses to be included in the system prompt. Defaults to None. 113 | 114 | Returns: 115 | Dict[str, Any]: A dictionary containing the response content, response time, and weight. 
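        Example (illustrative only; the model name is a placeholder, and a valid PROVIDER_API_KEY
        must be available when the neuron is constructed):

            neuron = LLMNeuron(model="some-provider/some-chat-model", temperature=0.2)
            result = await neuron.process("Summarize the mixture-of-agents idea in one sentence.")
            print(result["content"], result["time"], result["weight"])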
116 |         """
117 |         start_time = time.time()
118 | 
119 |         messages = (
120 |             list(input_data)  # copy, so the system message inserted below does not mutate the caller's list
121 |             if isinstance(input_data, list)
122 |             else [{"role": "user", "content": input_data}]
123 |         )
124 | 
125 |         if prev_response:
126 |             last_n_responses = prev_response[-self.num_previous_responses :]
127 |             system_content = self.collect_responses(
128 |                 self.system_prompt if self.system_prompt else "", last_n_responses
129 |             )
130 |             messages.insert(0, {"role": "system", "content": system_content})
131 | 
132 |         for sleep_time in [1, 2, 4, 8, 16]:
133 |             try:
134 |                 if self.stream:
135 |                     response_content = await self.process_stream(messages)
136 |                 else:
137 |                     response = await self.client.chat.completions.create(
138 |                         model=self.model,
139 |                         messages=messages,
140 |                         temperature=self.temperature,
141 |                         max_tokens=self.max_tokens,
142 |                     )
143 |                     response_content = response.choices[0].message.content
144 | 
145 |                 end_time = time.time()
146 |                 response_time = end_time - start_time
147 | 
148 |                 logger.debug(f"Model: {self.model}")
149 |                 logger.debug(f"Prompt: {messages}")
150 |                 logger.debug(f"Response: {response_content}")
151 |                 logger.info(f"Response Time: {response_time:.2f} seconds")
152 | 
153 |                 self.log_result(messages, response_content, response_time)
154 | 
155 |                 return {"content": response_content, "time": response_time, "weight": self.weight}
156 | 
157 |             except Exception as e:
158 |                 logger.error(f"Error requesting {self.model}; prompt: {input_data}")
159 |                 logger.error(f"Error: {e}. Retrying in {sleep_time} seconds...")
160 |                 await asyncio.sleep(sleep_time)
161 | 
162 |         # If all retries fail, return the default response
163 |         end_time = time.time()
164 |         response_time = end_time - start_time
165 |         logger.error(f"Failed to get response from model {self.model} after multiple retries")
166 |         return {
167 |             "content": "Model did not generate a response. Ignore this.",
168 |             "time": response_time,
169 |             "weight": self.weight,
170 |         }
171 | 
172 |     async def process_stream(self, messages: List[Dict[str, str]]) -> str:
173 |         chat_completion = await self.client.chat.completions.create(
174 |             model=self.model,
175 |             messages=messages,
176 |             temperature=self.temperature,
177 |             max_tokens=self.max_tokens,
178 |             stream=True,
179 |         )
180 | 
181 |         response_content = ""
182 |         async for chunk in chat_completion:
183 |             if chunk.choices[0].delta.content is not None:
184 |                 response_content += chunk.choices[0].delta.content
185 | 
186 |         return response_content
187 | 
188 |     @staticmethod
189 |     def collect_responses(prompt: str, prev_responses: List[str]) -> str:
190 |         """
191 |         Collect and format previous responses.
192 | 
193 |         This method currently uses a simple concatenation approach to aggregate responses.
194 |         In the future, we could explore more sophisticated aggregation methods.
195 | 
196 |         Args:
197 |             prompt (str): The original prompt.
198 |             prev_responses (List[str]): List of previous responses.
199 | 
200 |         Returns:
201 |             str: Formatted string containing the original prompt and previous responses.
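        Example (showing the exact concatenation produced by the current implementation):

            >>> LLMNeuron.collect_responses("Answer concisely.", ["Paris", "Lyon"])
            'Answer concisely.\n\n# Previous responses:\n## Model 1\n\nResponse:\n\nParis\n## Model 2\n\nResponse:\n\nLyon'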
202 | """ 203 | responses = "\n".join( 204 | [ 205 | f"## Model {i+1}\n\nResponse:\n\n{str(response)}" 206 | for i, response in enumerate(prev_responses) 207 | ] 208 | ) 209 | return f"{prompt}\n\n# Previous responses:\n{responses}" 210 | -------------------------------------------------------------------------------- /moa_llm/user_query_annotator.py: -------------------------------------------------------------------------------- 1 | import os 2 | from typing import Optional 3 | 4 | from .neuron import LLMNeuron 5 | 6 | 7 | class UserQueryAnnotator(LLMNeuron): 8 | def __init__( 9 | self, 10 | model: str, 11 | temperature: float = 0.7, 12 | system_prompt: Optional[str] = None, 13 | base_url: str = os.getenv("BASE_URL", "https://api.together.xyz/v1"), 14 | moa_id: Optional[str] = None, 15 | neuron_type: str = "annotator", 16 | ): 17 | default_system_prompt = """ 18 | You are a User Query Annotator. Your task is to process and optimize the user's query, breaking it down into clear, specific steps and add context if necessary. 19 | 20 | Only return the rephrased prompt and nothing else. 21 | """ 22 | super().__init__( 23 | model, 24 | system_prompt or default_system_prompt, 25 | temperature, 26 | weight=1.0, 27 | base_url=base_url, 28 | moa_id=moa_id, 29 | neuron_type=neuron_type, 30 | ) 31 | 32 | async def annotate(self, user_query: str) -> str: 33 | result = await self.process(user_query) 34 | return result["content"] 35 | -------------------------------------------------------------------------------- /moa_llm/version.py: -------------------------------------------------------------------------------- 1 | _MAJOR = "0" 2 | _MINOR = "1" 3 | # On main and in a nightly release the patch should be one ahead of the last 4 | # released build. 5 | _PATCH = "0" 6 | # This is mainly for nightly builds which have the suffix ".dev$DATE". See 7 | # https://semver.org/#is-v123-a-semantic-version for the semantics. 8 | _SUFFIX = "" 9 | 10 | VERSION_SHORT = "{0}.{1}".format(_MAJOR, _MINOR) 11 | VERSION = "{0}.{1}.{2}{3}".format(_MAJOR, _MINOR, _PATCH, _SUFFIX) 12 | -------------------------------------------------------------------------------- /pyproject.toml: -------------------------------------------------------------------------------- 1 | [build-system] 2 | requires = ["setuptools", "wheel"] 3 | build-backend = "setuptools.build_meta" 4 | 5 | [project] 6 | # See https://setuptools.pypa.io/en/latest/userguide/quickstart.html for more project configuration options. 
7 | name = "moa-llm" 8 | dynamic = ["version"] 9 | readme = "README.md" 10 | classifiers = ["Topic :: Scientific/Engineering :: Artificial Intelligence"] 11 | authors = [ 12 | { name = "Joao Fiadeiro", email = "joao@catena.xyz" }, 13 | { name = "Anjor Kanekar", email = "anjor@catena.xyz" }, 14 | ] 15 | requires-python = ">=3.8" 16 | dependencies = ["fastapi", "openai", "python-dotenv", "PyYAML"] 17 | license = { file = "LICENSE" } 18 | 19 | [project.urls] 20 | Homepage = "https://github.com/catena-labs/moa-llm" 21 | Repository = "https://github.com/catena-labs/moa-llm" 22 | 23 | [project.optional-dependencies] 24 | dev = [ 25 | "ruff", 26 | "black>=23.0,<24.0", 27 | "isort>=5.12,<5.14", 28 | "pytest", 29 | "pytest-sphinx", 30 | "pytest-cov", 31 | "twine>=1.11.0", 32 | "build", 33 | "setuptools", 34 | "wheel", 35 | "packaging", 36 | ] 37 | 38 | [tool.setuptools.packages.find] 39 | exclude = ["*.tests", "*.tests.*", "tests.*", "tests", "docs*", "scripts*"] 40 | 41 | [tool.setuptools] 42 | include-package-data = true 43 | 44 | [tool.setuptools.package-data] 45 | moa_llm = ["py.typed"] 46 | 47 | [tool.setuptools.dynamic] 48 | version = { attr = "moa_llm.version.VERSION" } 49 | 50 | [tool.black] 51 | line-length = 100 52 | include = '\.pyi?$' 53 | exclude = ''' 54 | ( 55 | __pycache__ 56 | | \.git 57 | | \.pytest_cache 58 | | \.vscode 59 | | \.venv 60 | | \bdist\b 61 | | \bdoc\b 62 | ) 63 | ''' 64 | 65 | [tool.isort] 66 | profile = "black" 67 | multi_line_output = 3 68 | 69 | # You can override these pyright settings by adding a personal pyrightconfig.json file. 70 | [tool.pyright] 71 | reportPrivateImportUsage = false 72 | 73 | [tool.ruff] 74 | line-length = 115 75 | target-version = "py39" 76 | 77 | [tool.ruff.lint.per-file-ignores] 78 | "__init__.py" = ["F401"] 79 | --------------------------------------------------------------------------------
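Usage sketch (not a file in the repository; the config path exists in the tree above, but the models it references and the PROVIDER_API_KEY value are assumptions):

    import asyncio
    from moa_llm.config_loader import create_moa_from_config

    # Building the MoA requires PROVIDER_API_KEY (e.g. in .env.local), since each neuron
    # constructs an AsyncOpenAI client pointed at the configured base_url.
    moa = create_moa_from_config(
        "examples/openai_compatible_server/moa_config.yaml", is_file_path=True
    )

    async def main() -> None:
        result = await moa.process("Compare two approaches to ensembling LLM outputs.")
        print(result["content"])
        print(result["response_times"])

    asyncio.run(main())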