├── .gitignore ├── LICENSE ├── README.md ├── application ├── amqp.py ├── base_handler.py ├── download.py ├── llm_handler.py ├── progress_streamer.py ├── system_info.py └── thread.py ├── modules ├── facebook │ └── convnext │ │ ├── convnext.py │ │ └── golem.json ├── haotian-liu │ └── llava │ │ ├── golem-generator.py │ │ └── golem.json ├── hf-pipeline │ ├── asr │ │ ├── asr.py │ │ └── golem.json │ ├── image-class │ │ ├── golem.json │ │ └── image-class.py │ ├── image-to-text │ │ ├── golem.json │ │ └── image-to-text.py │ ├── object-detection │ │ ├── golem.json │ │ └── object-detection.py │ ├── visual-question-answering │ │ ├── golem.json │ │ └── visual-question-answering.py │ ├── zero-shot-image-class │ │ ├── golem.json │ │ └── zero-shot-image-class.py │ └── zero-shot-object-detection │ │ ├── golem.json │ │ └── zero-shot-object-detection.py ├── hkunlp │ └── instructor │ │ ├── golem.json │ │ └── instructor.py ├── intfloat │ └── e5-v2 │ │ ├── e5-v2.py │ │ └── golem.json ├── microsoft │ └── git-textcaps │ │ ├── git-textcaps.py │ │ └── golem.json ├── noco-ai │ ├── bark-tts │ │ ├── golem.json │ │ └── handler.py │ ├── image-generator │ │ ├── golem.json │ │ └── handler.py │ ├── llama-cpp │ │ ├── golem.json │ │ └── llama-cpp.py │ ├── llm-api │ │ ├── golem.json │ │ └── handler.py │ ├── music-generator │ │ ├── golem.json │ │ └── handler.py │ ├── sd-xl │ │ ├── golem.json │ │ └── handler.py │ ├── transformers-stream │ │ ├── golem.json │ │ └── handler.py │ └── tts-api │ │ ├── golem.json │ │ └── handler.py ├── openai │ ├── chat-api │ │ ├── chat-api.py │ │ └── golem.json │ └── dalle │ │ ├── golem.json │ │ └── handler.py ├── salesforce │ └── blip2-opt │ │ ├── blip2-opt.py │ │ └── golem.json └── turboderp │ ├── exllama │ ├── golem-generator.py │ └── golem.json │ └── exllamav2 │ ├── golem.json │ └── handler.py ├── requirements-nogpu.txt ├── requirements.txt ├── schema ├── audio-gen.jsonschema ├── audio-url.jsonschema ├── img-gen.jsonschema ├── img-url.jsonschema ├── instructor.jsonschema ├── llm.jsonschema ├── visual-qa.jsonschema ├── voice-gen.jsonschema └── zero-shot-img.jsonschema └── server.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/enabled_skills.json 2 | data/models/ 3 | data/ 4 | repos/ 5 | *.pyc 6 | vault-token 7 | .vscode/ 8 | 9 | # core modules w/ external reps 10 | modules/turboderp/exllama/* 11 | modules/turboderp/exllama/.* 12 | !modules/turboderp/exllama/golem.json 13 | !modules/turboderp/exllama/golem-generator.py 14 | modules/turboderp/exllama2/* 15 | modules/turboderp/exllama2/.* 16 | modules/haotian-liu/llava/* 17 | modules/haotian-liu/llava/.* 18 | !modules/haotian-liu/llava/golem.json 19 | !modules/haotian-liu/llava/golem-generator.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Open Software License ("OSL") v. 3.0 2 | 3 | This Open Software License (the "License") applies to any original work of authorship (the "Original Work") whose owner (the "Licensor") has placed the following licensing notice adjacent to the copyright notice for the Original Work: 4 | 5 | Licensed under the Open Software License version 3.0 6 | 7 | 1. Grant of Copyright License. Licensor grants You a worldwide, royalty-free, non-exclusive, sublicensable license, for the duration of the copyright, to do the following: 8 | 9 | 1. to reproduce the Original Work in copies, either alone or as part of a collective work; 10 | 11 | 2. 
to translate, adapt, alter, transform, modify, or arrange the Original Work, thereby creating derivative works ("Derivative Works") based upon the Original Work; 12 | 13 | 3. to distribute or communicate copies of the Original Work and Derivative Works to the public, with the proviso that copies of Original Work or Derivative Works that You distribute or communicate shall be licensed under this Open Software License; 14 | 15 | 4. to perform the Original Work publicly; and 16 | 17 | 5. to display the Original Work publicly. 18 | 19 | 2. Grant of Patent License. Licensor grants You a worldwide, royalty-free, non-exclusive, sublicensable license, under patent claims owned or controlled by the Licensor that are embodied in the Original Work as furnished by the Licensor, for the duration of the patents, to make, use, sell, offer for sale, have made, and import the Original Work and Derivative Works. 20 | 21 | 3. Grant of Source Code License. The term "Source Code" means the preferred form of the Original Work for making modifications to it and all available documentation describing how to modify the Original Work. Licensor agrees to provide a machine-readable copy of the Source Code of the Original Work along with each copy of the Original Work that Licensor distributes. Licensor reserves the right to satisfy this obligation by placing a machine-readable copy of the Source Code in an information repository reasonably calculated to permit inexpensive and convenient access by You for as long as Licensor continues to distribute the Original Work. 22 | 23 | 4. Exclusions From License Grant. Neither the names of Licensor, nor the names of any contributors to the Original Work, nor any of their trademarks or service marks, may be used to endorse or promote products derived from this Original Work without express prior permission of the Licensor. Except as expressly stated herein, nothing in this License grants any license to Licensor's trademarks, copyrights, patents, trade secrets or any other intellectual property. No patent license is granted to make, use, sell, offer for sale, have made, or import embodiments of any patent claims other than the licensed claims defined in Section 2. No license is granted to the trademarks of Licensor even if such marks are included in the Original Work. Nothing in this License shall be interpreted to prohibit Licensor from licensing under terms different from this License any Original Work that Licensor otherwise would have a right to license. 24 | 25 | 5. External Deployment. The term "External Deployment" means the use, distribution, or communication of the Original Work or Derivative Works in any way such that the Original Work or Derivative Works may be used by anyone other than You, whether those works are distributed or communicated to those persons or made available as an application intended for use over a network. As an express condition for the grants of license hereunder, You must treat any External Deployment by You of the Original Work or a Derivative Work as a distribution under section 1(c). 26 | 27 | 6. Attribution Rights. You must retain, in the Source Code of any Derivative Works that You create, all copyright, patent, or trademark notices from the Source Code of the Original Work, as well as any notices of licensing and any descriptive text identified therein as an "Attribution Notice." 
You must cause the Source Code for any Derivative Works that You create to carry a prominent Attribution Notice reasonably calculated to inform recipients that You have modified the Original Work. 28 | 29 | 7. Warranty of Provenance and Disclaimer of Warranty. Licensor warrants that the copyright in and to the Original Work and the patent rights granted herein by Licensor are owned by the Licensor or are sublicensed to You under the terms of this License with the permission of the contributor(s) of those copyrights and patent rights. Except as expressly stated in the immediately preceding sentence, the Original Work is provided under this License on an "AS IS" BASIS and WITHOUT WARRANTY, either express or implied, including, without limitation, the warranties of non-infringement, merchantability or fitness for a particular purpose. THE ENTIRE RISK AS TO THE QUALITY OF THE ORIGINAL WORK IS WITH YOU. This DISCLAIMER OF WARRANTY constitutes an essential part of this License. No license to the Original Work is granted by this License except under this disclaimer. 30 | 31 | 8. Limitation of Liability. Under no circumstances and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall the Licensor be liable to anyone for any indirect, special, incidental, or consequential damages of any character arising as a result of this License or the use of the Original Work including, without limitation, damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses. This limitation of liability shall not apply to the extent applicable law prohibits such limitation. 32 | 33 | 9. Acceptance and Termination. If, at any time, You expressly assented to this License, that assent indicates your clear and irrevocable acceptance of this License and all of its terms and conditions. If You distribute or communicate copies of the Original Work or a Derivative Work, You must make a reasonable effort under the circumstances to obtain the express assent of recipients to the terms of this License. This License conditions your rights to undertake the activities listed in Section 1, including your right to create Derivative Works based upon the Original Work, and doing so without honoring these terms and conditions is prohibited by copyright law and international treaty. Nothing in this License is intended to affect copyright exceptions and limitations (including 'fair use' or 'fair dealing'). This License shall terminate immediately and You may no longer exercise any of the rights granted to You by this License upon your failure to honor the conditions in Section 1(c). 34 | 35 | 10. Termination for Patent Action. This License shall terminate automatically and You may no longer exercise any of the rights granted to You by this License as of the date You commence an action, including a cross-claim or counterclaim, against Licensor or any licensee alleging that the Original Work infringes a patent. This termination provision shall not apply for an action alleging patent infringement by combinations of the Original Work with other software or hardware. 36 | 37 | 11. Jurisdiction, Venue and Governing Law. Any action or suit relating to this License may be brought only in the courts of a jurisdiction wherein the Licensor resides or in which Licensor conducts its primary business, and under the laws of that jurisdiction excluding its conflict-of-law provisions. 
The application of the United Nations Convention on Contracts for the International Sale of Goods is expressly excluded. Any use of the Original Work outside the scope of this License or after its termination shall be subject to the requirements and penalties of copyright or patent law in the appropriate jurisdiction. This section shall survive the termination of this License. 38 | 39 | 12. Attorneys' Fees. In any action to enforce the terms of this License or seeking damages relating thereto, the prevailing party shall be entitled to recover its costs and expenses, including, without limitation, reasonable attorneys' fees and costs incurred in connection with such action, including any appeal of such action. This section shall survive the termination of this License. 40 | 41 | 13. Miscellaneous. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. 42 | 43 | 14. Definition of "You" in This License. "You" throughout this License, whether in upper or lower case, means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with you. For purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 44 | 45 | 15. Right to Use. You may use the Original Work in all ways not otherwise restricted or conditioned by this License or by law, and Licensor promises not to interfere with or be responsible for such uses by You. 46 | 47 | 16. Modification of This License. This License is Copyright (C) 2005 Lawrence Rosen. Permission is granted to copy, distribute, or communicate this License without modification. Nothing in this License permits You to modify this License as applied to the Original Work or to Derivative Works. However, You may modify the text of this License and copy, distribute or communicate your modified version (the "Modified License") and apply it to other original works of authorship subject to the following conditions: (i) You may not indicate in any way that your Modified License is the "Open Software License" or "OSL" and you may not use those names in the name of your Modified License; (ii) You must replace the notice specified in the first paragraph above with the notice "Licensed under <insert your license name here>" or with a notice of your own that is not confusingly similar to the notice in this License; and (iii) You may not claim that your original works are open source software unless your Modified License has been approved by Open Source Initiative (OSI) and You comply with its license review and certification process. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elemental Golem 2 | 3 | Elemental Golem is a project that defines and serves AI models using a modular system with a `golem.json` configuration file and a handler that implements the call and response from the model, utilizing AMQP as the message broker. It is the backend used by Arcane Bridge and Spell Book for interacting with AI models based on PyTorch and similar libraries. It currently focuses solely on inference tasks.
4 | 5 | 6 | ## Stack Documentation 7 | 8 | - https://github.com/noco-ai/spellbook-docker/wiki 9 | - The wiki for the docker project contains comprehensive documentation for the UI that uses Elemental Golem to serve AI models. 10 | 11 | ## Stack Architecture 12 | 13 | ![Software stack diagram](https://github.com/noco-ai/spellbook-docker/blob/master/stack.png) 14 | 15 | ## Dependencies 16 | 17 | - Hashicorp Vault >= 1.1 18 | - RabbitMQ >= 3.6.10 19 | 20 | ### Required Vault Keys 21 | 22 | In order to function, Elemental Golem needs to connect to a Vault server to retrieve secrets and configuration data. 23 | The following information needs to be stored in Vault for Elemental Golem to start. 24 | 25 | ### **core/amqp** 26 | 27 | ```json 28 | { 29 | "host": "127.0.0.1", 30 | "password": "securepass", 31 | "username": "spellbook-user", 32 | "vhost": "spellbook" 33 | } 34 | ``` 35 | 36 | ## Install Guide 37 | 38 | ### Docker Install 39 | 40 | See https://github.com/noco-ai/spellbook-docker for installing the entire Spell Book stack with Docker Compose. 41 | 42 | ### Ubuntu Server 22 Install (no GPU) 43 | 44 | These steps can be taken to install Elemental Golem on an Ubuntu 22 server with no GPU installed. 45 | 46 | ```bash 47 | sudo apt-get update 48 | sudo apt-get upgrade 49 | sudo apt install build-essential 50 | 51 | curl https://repo.anaconda.com/archive/Anaconda3-2021.11-Linux-x86_64.sh --output anaconda.sh 52 | bash anaconda.sh 53 | conda create -n golem python=3.10.9 54 | conda activate golem 55 | conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia 56 | CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python 57 | 58 | # clone repo 59 | mkdir elemental-golem 60 | cd elemental-golem 61 | git clone https://github.com/noco-ai/elemental-golem.git . 62 | pip install -r requirements-nogpu.txt 63 | python server.py --server-id golemX --vault-host https://vault.example.com --gpu-type=nogpu 64 | ``` 65 | 66 | ### requirements.txt 67 | 68 | When installing Elemental Golem on a system with an Nvidia graphics card you will need to install from the appropriate requirements.txt file. Use 69 | requirements-nogpu.txt if the system has no GPU present. 70 | 71 | ### CLI Parameters and Server Commands 72 | 73 | Elemental Golem provides several CLI parameters and server commands for controlling the software. Below is a detailed explanation of them. 74 | 75 | ### **Command-line Interface (CLI) Parameters**: 76 | 77 | - `--server-id`: A unique identifier for the server. Required parameter. 78 | - `--vault-host`: The address of the Vault server host. Required parameter. 79 | - `--vault-token-file`: The path to the file containing the Vault token. Defaults to './vault-token' if not specified. 80 | - `--vault-root`: The root path in the Vault server. Defaults to 'arcane-bridge' if not specified. 81 | - `--shared-models`: If set to true, all HuggingFace downloads go to the data/cache/ folder. This is useful for shared drives and Docker. 82 | - `--amqp-ip`: Overrides the IP stored in Vault for connecting to the AMQP server. Useful when running instances of Elemental Golem on additional servers while the primary node is running the stack with Docker Compose. 83 | - `--gpu-type`: The type of GPU running on the worker server. Valid choices are nvidia or nogpu. 84 | 85 | ### **Commands to Control the Server over AMQP**: 86 | 87 | - `system_info`: Returns details about the system, such as server ID, system status (ONLINE or STARTING), installed skills, and running skills.
88 | - `run_skill`: Adds and runs a skill on the server based on the `skill_details` provided in the message body. 89 | - `stop_skill`: Stops a running skill based on the `skill_details` provided in the message body. 90 | - `install_skill`: Installs a skill on the server based on the `skill_details` provided in the message body. 91 | - `stop_generation`: Stops generation on a particular thread based on the `stop_details` provided in the message body. 92 | - `update_configuration`: Updates the configuration of the system based on the details provided in the message body. 93 | 94 | Each command request should contain a `command`, `return_routing_key`, and `return_exchange` in the message `headers`. Based on the command executed, appropriate responses are provided through the `AMQP` channel. 95 | 96 | > Note: It is crucial to reject the message correctly if any error occurs during command execution to prevent the message broker from requeueing the message. 97 | 98 | ### LLM Payload Validation 99 | 100 | The LLM handlers check the AMQP payload for the following data: 101 | 102 | - **max_new_tokens** (Number, Required): The maximum number of tokens to generate. 103 | - **top_p** (Number, Required): The desired randomness of the response (0.0 to 1.0). 104 | - **temperature** (Number, Required): The desired "temperature" of the output (0.0 to 1.0). 105 | - **stream** (Boolean, Required): If set to true, the output is streamed. 106 | - **debug** (Boolean, Optional): Enables debug mode. If enabled, model output is streamed to the console. 107 | - **stop_key** (String, Optional): The key string to stop generation. 108 | - **lora** (String, Optional): Specifies a LoRA to use with the request. Only ExLlama supports this at this point. 109 | - **ai_role** (String, Optional): Specifies the role of the AI in the conversation. 110 | - **user_role** (String, Optional): Specifies the role of the user in the conversation. 111 | - **start_response** (String, Optional): Specifies the response to start with. 112 | - **raw** (String, Optional): Raw content to use for generating the prompt. 113 | - **messages** (Array, Required): An array of message objects with these properties: 114 | - **role** (String, Required): Role in the message. 115 | - **content** (String, Required): Content of the message. 116 | 117 | ## golem.json 118 | 119 | The golem.json file defines the handlers and models/skills available for loading and inference. Here is a high-level overview of the fields found in the file. 120 | The best reference for this at the moment is to look in modules/noco-ai/... and find a handler that implements a similar type of model. 121 | If your model uses transformers or 🤗 pipelines you can add a new definition for the model to an existing handler. 122 | 123 | The configuration for Elemental Golem is stored in a JSON file. Below is a breakdown of each field in the JSON file: 124 | 125 | - `label`: Name of the module. 126 | - `description`: Purpose of the module, what the skill does. 127 | - `script`: Python script to use in running the project. 128 | - `multi_gpu_support`: Boolean indicating multi-gpu support. 129 | - `repository`: Stores information about the code repository. 130 | - `url`: URL to the project repository. 131 | - `folder`: Specific directory within the repository URL. 132 | - `skills`: An array containing model definitions. Each model has the following properties: 133 | - `label`: A readable name for the model. 134 | - `routing_key`: Routing key for the message broker.
135 | - `use`: Use case(s) for this skill. 136 | - `available_precision`: Array describing what devices and precisions the skill can be loaded with. 137 | - `memory_usage`: The memory capacity requirement of the model. 138 | - `model`: A model with a `name` and `provider`. 139 | - `shortcut`: A symbol representing the model; this allows LLM models to be accessed directly via the Spellbook UI. 140 | - `configuration`: Model-specific configurations, like `max_seq_len`, `user_role`, `ai_role`, `stop_on`, `system_message`. 141 | - `configuration`: Module-wide configuration involving secrets management and system-specific parameters. 142 | - `vault_path`: The path to secrets storage for sensitive data like API tokens. 143 | - `options`: An array of global options. 144 | 145 | ### Model Configuration 146 | 147 | Each model/skill can define configuration information that is available to the handler. If these have keys that match the global configuration keys 148 | for the module, they are merged, with the user-set values overriding the defaults. Here is an example of the configuration values an LLM handler expects. 149 | 150 | - `max_seq_len`: Specifies the maximum sequence length for model input. 151 | - `user_role`: Defines the user's assumed role in the interaction. 152 | - `ai_role`: Defines the AI's assumed role in the interaction. 153 | - `stop_on`: The signals that, when received, will trigger the model to stop execution. 154 | - `system_message`: Describes the nature of the interaction between a user and the AI. 155 | 156 | ### Global Configuration 157 | 158 | The global configuration is read by the frontend, which allows the user to override the system defaults. Which configuration options are available will vary by the type 159 | of handler. Module-wide configuration options include: 160 | 161 | - `vault_path`: Secure storage path for sensitive data like API keys. 162 | - `options`: An array of global parameters, each with: 163 | - `label`: A readable field name displayed in a settings UI. 164 | - `name`: Identifier for the option/field. 165 | - `editable`: Boolean determining if the user can manually edit the value. 166 | - `type`: Data type of the parameter. 167 | - `default`: The default value if none is provided. 168 | 169 | ### Repository 170 | 171 | Some modules require that another repo be installed for a skill handler to work correctly. These are defined at a global level for the handler. 172 | 173 | - `url`: URL to the project repository. 174 | - `folder`: The path to the folder within the repository. 175 | 176 | ## handler.py 177 | 178 | The handler is a Python class inheriting from `BaseHandler` or `LlmHandler` that is responsible for handling messages. Each handler must implement the following functions: 179 | 180 | - `__init__`: Initialize the handler. 181 | - `validate`: Validates a request. It should return a boolean indicating whether the request is valid and a list of errors (if any). 182 | - `execute`: Executes the model. It receives the model and request. This method is responsible for getting the request data, making the API call, and returning the API response. 183 | - `load`: Loads the model. Receives three parameters: the model, model options, and the local path to the model. Be sure to set up the API key using `model["secrets"]["token"]`.
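For a quick illustration of these contracts, here is a minimal, hypothetical handler (it is not part of the repository) that validates its own input and returns the `{"content": ...}` shape used by the text handlers. Bundled modules usually delegate validation to `BaseHandler.validate_request` with one of the JSON schemas in `schema/` rather than checking fields by hand.

```python
from application.base_handler import BaseHandler

class EchoHandler(BaseHandler):
    """Hypothetical minimal handler used only to illustrate the contracts above."""

    def __init__(self):
        super().__init__()

    def validate(self, request):
        # must return (is_valid, errors); bundled handlers typically call
        # self.validate_request(request, "<schema-name>") to validate against
        # one of the files in schema/ instead of hand-rolling checks
        if "text" not in request or not len(request["text"]):
            return False, ["'text' is a required field"]
        return True, []

    def execute(self, model, request):
        # `model` is whatever load() returned; the response is a dict,
        # with "content" holding the text to send back
        return {"content": request["text"]}

    def load(self, model, model_options, local_path):
        # anything returned here is handed back to execute() as `model`
        return {"device": model_options["device"]}
```

The generic skeleton for an API-backed handler looks like this: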
184 | 185 | ```python 186 | class ExampleHandler(BaseHandler): 187 | def __init__(self): 188 | super().__init__() 189 | 190 | def validate(self, request): 191 | return self.validate_fields(request, [("text", len)]) 192 | 193 | def execute(self, model, request): 194 | #... 195 | 196 | def load(self, model, model_options, local_path): 197 | openai.api_key = model["secrets"]["token"] 198 | return {"model_name": model["configuration"]["model"]} 199 | ``` 200 | 201 | Remember to replace `#...` in `execute` with the correct implementation that fits your scenario. 202 | The response must return `{ "content": response}` where `response` is the content you wish to send back. 203 | 204 | _Configuration, requests, and responses vary based on how the handler is implemented._ 205 | -------------------------------------------------------------------------------- /application/amqp.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | def connect_to_amqp(amqp_ip, amqp_user, amqp_password, amqp_vhost): 7 | 8 | # Otherwise, establish a new connection for this process 9 | connection_successful = True 10 | try: 11 | credentials = pika.PlainCredentials(amqp_user, amqp_password) 12 | connection = pika.BlockingConnection( 13 | pika.ConnectionParameters( 14 | host=amqp_ip, 15 | virtual_host=amqp_vhost, 16 | credentials=credentials, 17 | connection_attempts=5, 18 | retry_delay=5, 19 | socket_timeout=600, 20 | heartbeat=300 21 | ) 22 | ) 23 | channel = connection.channel() 24 | 25 | except Exception as e: 26 | connection_successful = False 27 | logger.error(f"failed to connect", e) 28 | 29 | return connection_successful, connection, channel 30 | 31 | def create_queue(channel, queue_name, dlx=None, dlx_queue='deadletters', is_exclusive=False, is_auto_delete=False): 32 | 33 | # Declare the queue with 'dlx' as the DLX if provided 34 | if dlx: 35 | result = channel.queue_declare(queue=queue_name, exclusive=is_exclusive, auto_delete=is_auto_delete, arguments={ 36 | 'x-dead-letter-exchange': dlx, 37 | 'x-dead-letter-routing-key': dlx_queue 38 | }) 39 | else: 40 | result = channel.queue_declare(queue=queue_name, exclusive=is_exclusive, auto_delete=is_auto_delete) 41 | 42 | return result.method.queue 43 | 44 | def create_exchange(channel, exchange_name, exchange_type='direct'): 45 | channel.exchange_declare(exchange=exchange_name, exchange_type=exchange_type) 46 | 47 | def bind_queue_to_exchange(channel, queue_name, exchange_name, routing_key=None): 48 | channel.queue_bind(exchange=exchange_name, queue=queue_name, routing_key=routing_key) 49 | 50 | def become_consumer(channel, queue_name, callback_function): 51 | channel.basic_consume(queue=queue_name, on_message_callback=callback_function, auto_ack=False) 52 | channel.start_consuming() 53 | 54 | def send_message_to_exchange(channel, exchange_name, routing_key, message, headers=None): 55 | properties = pika.BasicProperties(delivery_mode=2) # make message persistent 56 | if headers is not None: 57 | properties.headers = headers 58 | 59 | channel.basic_publish(exchange=exchange_name, 60 | routing_key=routing_key, 61 | body=message, 62 | properties=properties) 63 | -------------------------------------------------------------------------------- /application/base_handler.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | from pika import BasicProperties 3 | import hvac 4 | import json 
5 | import jsonschema 6 | from jsonschema import validate 7 | from typing import Union, List 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class BaseHandler: 13 | 14 | def __init__(self): 15 | self.cached_schemas = {} 16 | 17 | def execute(self, model, request) -> dict: 18 | raise NotImplementedError("The `execute` method should be implemented in the derived class.") 19 | 20 | def validate(self, request) -> dict: 21 | raise NotImplementedError("The `validate` method should be implemented in the derived class.") 22 | 23 | def load(self, model, model_options) -> dict: 24 | return {} 25 | 26 | def copy_queue_headers(self, incoming_headers, override_command = None): 27 | # copy amqp headers 28 | outgoing_headers = {} 29 | stream_override = None 30 | for incoming_header in incoming_headers: 31 | if incoming_header in ["x-delay", "return_exchange", "return_routing_key"]: 32 | continue 33 | if incoming_header == "stream_to_override": 34 | stream_override = incoming_headers[incoming_header] 35 | 36 | outgoing_headers[incoming_header] = incoming_headers[incoming_header] 37 | 38 | stream_to = "prompt_fragment" if stream_override == None else stream_override 39 | if override_command != None: 40 | outgoing_headers["original_command"] = incoming_headers["command"] 41 | 42 | outgoing_headers["command"] = override_command if override_command is not None else stream_to 43 | return BasicProperties(headers=outgoing_headers) 44 | 45 | def load_schema_file(self, schema_file): 46 | # Check if schema is in cache 47 | if schema_file in self.cached_schemas: 48 | schema = self.cached_schemas[schema_file] 49 | else: 50 | # Load the schema at the path 51 | try: 52 | with open(f"schema/{schema_file}.jsonschema", 'r') as file: 53 | schema = json.load(file) 54 | except Exception as e: 55 | logger.error(e) 56 | return None 57 | # Cache the schema 58 | self.cached_schemas[schema_file] = schema 59 | 60 | return schema 61 | 62 | # A dictionary to hold the cached schemas 63 | def validate_request(self, json_data: dict, schema_file: str) -> Union[bool, List[str]]: 64 | 65 | schema = self.load_schema_file(schema_file) 66 | if schema is None: 67 | return False, ["Invalid schema file for handler"] 68 | 69 | json_data = self.apply_schema_defaults(json_data, schema_file) 70 | try: 71 | validate(instance=json_data, schema=schema) 72 | except jsonschema.exceptions.ValidationError as err: 73 | # If there is a validation error, return a list containing the error message 74 | logger.warn("validation failed for incoming request") 75 | return False, [str(err)] 76 | else: 77 | # If the data is valid, return True 78 | return True, [] 79 | 80 | def apply_schema_defaults(self, raw_data: dict, schema_file: str) -> dict: 81 | 82 | schema = self.load_schema_file(schema_file) 83 | if schema is None: 84 | logger.error("could not load schema file") 85 | return raw_data 86 | 87 | # Fill default values 88 | for property, attributes in schema['properties'].items(): 89 | if "default" in attributes and property not in raw_data: 90 | raw_data[property] = attributes["default"] 91 | 92 | return raw_data 93 | 94 | 95 | def check_stop_generation(self, counter, stop_generation_event, stop_generation_filter, socket_id): 96 | counter += 1 97 | if counter >= 5: 98 | counter = 0 99 | if stop_generation_event.is_set(): 100 | stop_generation_event.clear() 101 | if socket_id == None: 102 | return False, counter 103 | 104 | stop_socket = bytes(stop_generation_filter.raw).rstrip(b'\x00').decode("utf-8") 105 | if stop_socket == socket_id: 
106 | return True, counter 107 | 108 | return False, counter 109 | -------------------------------------------------------------------------------- /application/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import shutil 4 | import requests 5 | import multiprocessing 6 | import hashlib 7 | from huggingface_hub import snapshot_download, hf_hub_download 8 | from application.thread import send_ui_update 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | BUFFER_SIZE = 64 * 1024 * 1024 # 64 MB 13 | 14 | # Usage 15 | def install_skill(all_skills, install_skill_data, shared_models, server_id, channel): 16 | # Create a list to hold all the processes 17 | processes = [] 18 | for skill in all_skills: 19 | if skill["routing_key"] != install_skill_data["routing_key"]: 20 | continue 21 | 22 | if "model" in skill: 23 | for model in skill["model"]: 24 | process = multiprocessing.Process(target=download_model, args=(model, install_skill_data, shared_models, server_id, channel)) 25 | processes.append(process) 26 | process.start() 27 | 28 | if "repository" in skill: 29 | for repo in skill["repository"]: 30 | # Create and start a new process for each download 31 | process = multiprocessing.Process(target=download_repo, args=(repo["url"], repo["folder"], repo["module_path"])) 32 | processes.append(process) 33 | process.start() 34 | 35 | def download_repo(url, repo_folder, target_folder): 36 | repo_folder = f'data/repos/{repo_folder}' 37 | 38 | if os.path.exists(repo_folder) and os.path.isdir(repo_folder): 39 | os.system(f"cd {repo_folder} && git pull") 40 | else: 41 | # Create the directory if it doesn't exist 42 | os.makedirs(repo_folder, exist_ok=True) 43 | os.system(f"git clone {url} {repo_folder}") 44 | 45 | logger.info(f"done downloading repository {repo_folder}") 46 | 47 | # Make sure the target folder exists 48 | os.makedirs(target_folder, exist_ok=True) 49 | 50 | # Copy all files from repo_folder to target_folder 51 | for file_name in os.listdir(repo_folder): 52 | source = os.path.join(repo_folder, file_name) 53 | destination = os.path.join(target_folder, file_name) 54 | 55 | if os.path.isfile(source): 56 | shutil.copy2(source, destination) 57 | elif os.path.isdir(source): 58 | shutil.copytree(source, destination, dirs_exist_ok=True) 59 | 60 | logger.info(f"done copying repository files to {target_folder}") 61 | 62 | def combine_files(base_file, split_files): 63 | 64 | logger.info(f"opening {base_file} to combing splits") 65 | with open(base_file, 'wb') as dest_file: 66 | for split_name in split_files: 67 | filename = base_file + split_name 68 | logger.info(f"combining split file {filename}") 69 | with open(filename, 'rb') as src_file: 70 | while True: 71 | chunk = src_file.read(BUFFER_SIZE) 72 | if not chunk: 73 | break 74 | dest_file.write(chunk) 75 | 76 | logger.info(f"cleaning up split file {split_name}") 77 | os.remove(filename) 78 | logging.info("done combining split files") 79 | 80 | def download_model(model, install_skill, shared_models, server_id, channel): 81 | 82 | name = model["name"] 83 | provider = model["provider"] 84 | is_branch = False 85 | single_file = False 86 | model_full_path = model["name"] 87 | if "files" in model and install_skill["precision"] in model["files"]: 88 | model_full_path = os.path.join(model["name"], model["files"][install_skill["precision"]]) 89 | single_file = True 90 | elif "branch" in model and install_skill["precision"] in model["branch"]: 91 | model_full_path = 
os.path.join(model["name"], model["branch"][install_skill["precision"]]) 92 | is_branch = True 93 | 94 | lock_file_path = "data/models/" + hashlib.sha256(model_full_path.encode()).hexdigest()[:10] + ".lock" 95 | 96 | # Check if a download is already in progress for the given model name 97 | if os.path.exists(lock_file_path): 98 | logger.info(f"download already in progress for model: {name}") 99 | return 100 | 101 | # Create a lock file to signal that a download is in progress 102 | with open(lock_file_path, 'w') as lock_file: 103 | lock_file.write("download in progress") 104 | 105 | logger.info(f"downloading skill model {name}") 106 | 107 | try: 108 | if provider == 'huggingface': 109 | 110 | if single_file: 111 | if os.path.exists(f'data/models/{name}/{model["files"][install_skill["precision"]]}'): 112 | logger.info(f"already downloaded model: {name}") 113 | return 114 | elif os.path.exists(f'data/models/{name}'): 115 | logger.info(f"already downloaded model: {name}") 116 | return 117 | 118 | os.makedirs(f'data/models/{name}', exist_ok=True) 119 | use_symlinks = False if shared_models else "auto" 120 | cache_dir = "data/cache" if shared_models else None 121 | download_args = { 122 | "repo_id": name, 123 | "local_dir": f'data/models/{model_full_path}' 124 | } 125 | 126 | # Conditionally adding arguments to the dictionary if they are not None 127 | if cache_dir is not None: 128 | download_args["cache_dir"] = cache_dir 129 | 130 | if use_symlinks is not None: 131 | download_args["local_dir_use_symlinks"] = use_symlinks 132 | 133 | if single_file: 134 | if "split" in model and install_skill["precision"] in model["split"]: 135 | for split_name in model["split"][install_skill["precision"]]: 136 | download_args["filename"] = model["files"][install_skill["precision"]] + split_name 137 | download_args["local_dir"] = f'data/models/{name}' 138 | logger.info(f'downlading split file {model["files"][install_skill["precision"]]}{split_name} from hf hub, shared: {shared_models}') 139 | hf_hub_download(**download_args) 140 | 141 | base_file = os.path.join('data', 'models', model_full_path) 142 | combine_files(base_file, model["split"][install_skill["precision"]]) 143 | else: 144 | logger.info(f'downlading single file {model["files"][install_skill["precision"]]} from hf hub, shared: {shared_models}') 145 | download_args["local_dir"] = f'data/models/{name}' 146 | download_args["filename"] = model["files"][install_skill["precision"]] 147 | hf_hub_download(**download_args) 148 | elif is_branch: 149 | logger.info(f'downlading branch {model_full_path} from hf hub, shared: {shared_models}') 150 | os.makedirs(f'data/models/{model_full_path}', exist_ok=True) 151 | download_args["revision"] = model["branch"][install_skill["precision"]] 152 | snapshot_download(**download_args) 153 | else: 154 | logger.info(f'downlading repo {name} from hf hub, shared: {shared_models}') 155 | snapshot_download(**download_args) 156 | elif provider == 'civitai': 157 | url = "" if "url" not in model else model["url"] 158 | if 'api' not in url: 159 | logger.error("invalid url provided for civit.ai") 160 | return 161 | 162 | headers = { 163 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 164 | } 165 | 166 | base_name = name.rsplit('/', 1)[0] 167 | base_dir = f'data/models/{base_name}' 168 | os.makedirs(base_dir, exist_ok=True) 169 | 170 | logger.info(f"downloading model from {url}, please wait...") 171 | response = requests.get(url, headers=headers, 
stream=True) 172 | with open(os.path.join(base_dir, name.split('/')[-1]), 'wb') as f: 173 | for chunk in response.iter_content(chunk_size=8192): 174 | f.write(chunk) 175 | 176 | logger.info("model downloaded successfully!") 177 | else: 178 | logger.error(f"{provider} is not supported") 179 | 180 | finally: 181 | # Remove the lock file 182 | if os.path.exists(lock_file_path): 183 | os.remove(lock_file_path) 184 | 185 | send_ui_update("skill_downloaded", name, server_id, channel) 186 | logger.info(f"finished downloading skill model {name}") -------------------------------------------------------------------------------- /application/llm_handler.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | from application.base_handler import BaseHandler 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | class LlmHandler(BaseHandler): 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def load(self, model, model_options, local_path): 13 | pass 14 | 15 | def load_config_settings(self, num_input_tokens, request): 16 | config = self.model_config 17 | max_new_tokens_config = int(request.get("max_new_tokens", 1024)) 18 | max_seq_len = config.get("max_seq_len", 2048) 19 | max_new_tokens = min(max_new_tokens_config, max_seq_len - num_input_tokens) 20 | top_p = request.get("top_p", 0.9) 21 | top_k = request.get("top_k", 50) 22 | seed = request.get("seed", -1) 23 | min_p = request.get("min_p", 0.05) 24 | mirostat = request.get("mirostat", 0) 25 | mirostat_eta = request.get("mirostat_eta", 0.01) 26 | mirostat_tau = request.get("mirostat_tau", 5) 27 | temperature = request.get("temperature", 1) 28 | stream_output = True if "stream" in request and request["stream"] == True else False 29 | debug = "debug" in request 30 | stop_key = request.get("stop_key", "") 31 | 32 | logger.info(f"prompt tokens: {num_input_tokens}, max completion tokens: {max_new_tokens}, context length: {max_seq_len}") 33 | logger.info(f"temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, seed: {seed}, stream output: {stream_output}") 34 | logger.info(f"min_p: {min_p}, mirostat: {mirostat}, mirostat_eta: {mirostat_eta}, mirostat_tau: {mirostat_tau}") 35 | return max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, min_p, mirostat, mirostat_eta, mirostat_tau 36 | 37 | def build_stop_conditions(self, stops, to_lower = True): 38 | check_stop_token = False 39 | stop_conditions = [] 40 | for stop_text in stops: 41 | if stop_text == "": 42 | check_stop_token = True 43 | continue 44 | add_condition = stop_text.lower() if to_lower == True else stop_text 45 | stop_conditions.append(add_condition) 46 | 47 | return check_stop_token, stop_conditions 48 | 49 | def check_stop_conditions(self, token, res_line, eos_token, check_stop_token, stop_conditions): 50 | if check_stop_token and token == eos_token: 51 | return True 52 | 53 | for stop_string in stop_conditions: 54 | if res_line.lower().endswith(stop_string): 55 | return True 56 | 57 | return False 58 | 59 | def finish_response(self, stop_key, response, request, stream_output, 60 | finish_reason, tokens_per_second, new_tokens, input_tokens, model, elapsed, debug): 61 | if debug and stream_output == False: 62 | print('\033[92m' + response + '\033[0m') 63 | 64 | send_content = "" 65 | if stream_output: 66 | send_content = stop_key 67 | elif "start_response" in request: 68 | send_content = f"{request['start_response']}{response}" 69 | else: 70 | send_content = response 
71 | 72 | llm_response = {"content": send_content, "finish_reason": finish_reason, 73 | "tokens_per_second": round(tokens_per_second, 2), "completion_tokens": new_tokens, "prompt_tokens": input_tokens, "model": model } 74 | 75 | if debug: 76 | print(llm_response) 77 | 78 | logger.info(f"prompt processed in {elapsed:.2f} seconds, new tokens: {new_tokens}, tokens/second: {tokens_per_second:.2f}") 79 | return llm_response 80 | 81 | def get_token_count(self, input_text): 82 | return 100000 83 | 84 | def _get_system_prompt(self, request, config): 85 | system_prompt = "" 86 | in_request = False 87 | contains_user_message = False 88 | 89 | if "system_message" in config and len(config["system_message"]): 90 | system_prompt = config['system_message'] 91 | 92 | # override with system prompt provided by request 93 | messages_len = len(request["messages"]) 94 | if messages_len and request["messages"][0]["role"] == "system": 95 | system_prompt = request['messages'][0]['content'] 96 | in_request = True 97 | 98 | if "system_prompt_format" in config: 99 | template = config["system_prompt_format"] 100 | ai_role = request["ai_role"] if "ai_role" in request else config["ai_role"] 101 | user_role = request["user_role"] if "user_role" in request else config["user_role"] 102 | if "{prompt}" in template: 103 | check_index = 1 if in_request else 0 104 | check_len = 2 if in_request else 1 105 | prompt = request["messages"][check_index]["content"] if messages_len >= check_len and request["messages"][check_index]["role"] == "user" else "" 106 | response = request["messages"][check_index + 1]["content"] if check_index + 1 < messages_len and request["messages"][check_index + 1]["role"] == "assistant" else "" 107 | system_prompt = template.format(user_role=user_role, system_prompt=system_prompt.strip(), ai_role=ai_role, prompt=prompt, response=response) + "\n" 108 | contains_user_message = True 109 | else: 110 | system_prompt = template.format(user_role=user_role, system_prompt=system_prompt.strip(), ai_role=ai_role) 111 | 112 | return system_prompt, in_request, contains_user_message 113 | 114 | def _prep_prompt(self, request, config): 115 | request_system_message = None 116 | max_new_tokens = request.get("max_new_tokens", 1024) 117 | max_seq_length = config["max_seq_len"] 118 | max_input_tokens = max(max_seq_length - max_new_tokens, 0) 119 | 120 | if max_input_tokens == 0: 121 | logger.error("error with configuration of models context limits") 122 | raise ValueError('error with configuration of models context limits') 123 | 124 | # give a little wiggle room for the way the prompt is built 125 | max_input_tokens -= 64 126 | 127 | system_prompt, sys_prompt_in_request, clip_first_user_message = self._get_system_prompt(request, config) 128 | system_prompt_tokens = self.get_token_count(system_prompt) 129 | if system_prompt_tokens >= max_input_tokens: 130 | logger.error("system prompt excceds max input tokens") 131 | raise ValueError("system prompt excceds max input tokens") 132 | 133 | if sys_prompt_in_request: 134 | request_system_message = request["messages"][0] 135 | request["messages"].pop(0) 136 | 137 | if clip_first_user_message: 138 | request["messages"].pop(0) 139 | 140 | # clip all but last message if this is an instruct model 141 | if len(request["messages"]) == 0: 142 | messages = [] 143 | if "model_type" in config and config["model_type"] == "instruct": 144 | messages = [request["messages"][-1]] 145 | else: 146 | messages = request["messages"][::-1] 147 | 148 | return messages, system_prompt_tokens, 
request_system_message, system_prompt, sys_prompt_in_request, max_input_tokens 149 | 150 | def build_prompt(self, request, config, model): 151 | prompt = "" 152 | 153 | # raw prompt 154 | if "raw" in request: 155 | prompt = request["raw"] 156 | if "start_response" in request: 157 | prompt += request["start_response"] 158 | return prompt 159 | 160 | messages, system_prompt_tokens, request_system_message, system_prompt, sys_prompt_in_request, max_input_tokens = self._prep_prompt(request, config) 161 | max_input_tokens -= 64 162 | 163 | # get delimiter in-between user and prompt and get roles 164 | ai_role = request["ai_role"] if "ai_role" in request else config["ai_role"] 165 | user_role = request["user_role"] if "user_role" in request else config["user_role"] 166 | template = config["prompt_format"] 167 | 168 | prompt_parts = [] 169 | input_token_count = system_prompt_tokens 170 | 171 | for index, message in enumerate(messages): 172 | 173 | if message["role"] == "assistant": 174 | continue 175 | 176 | ai_response = "" if index == 0 else messages[index - 1]["content"].strip() 177 | formatted_string = template.format(user_role=user_role, prompt=message['content'].strip(), ai_role=ai_role, response=ai_response) 178 | token_count = self.get_token_count(formatted_string) 179 | if input_token_count + token_count > max_input_tokens: 180 | break 181 | 182 | input_token_count += token_count 183 | prompt_parts.append(formatted_string) 184 | 185 | prompt_parts = prompt_parts[::-1] 186 | prompt = system_prompt + "\n".join(prompt_parts) 187 | if "start_response" in request: 188 | prompt += request["start_response"] 189 | 190 | return prompt -------------------------------------------------------------------------------- /application/progress_streamer.py: -------------------------------------------------------------------------------- 1 | from transformers.generation.streamers import BaseStreamer 2 | from tqdm import tqdm 3 | import json 4 | 5 | class ProgressStreamer(BaseStreamer): 6 | def __init__(self): 7 | self.token_count = 0 8 | self.max_new_tokens = 0 9 | self.show_bar = False 10 | self.amqp_config = None 11 | self.label = "" 12 | 13 | def put(self, value): 14 | self.token_count += 1 15 | if self.show_bar: 16 | self.progress_bar.update(1) 17 | 18 | if self.amqp_config: 19 | send_body = { 20 | "total": self.max_new_tokens, 21 | "current": self.token_count, 22 | "label": self.label, 23 | "model": self.model 24 | } 25 | 26 | self.amqp_config["channel"].basic_publish( 27 | exchange=self.amqp_config["headers"]['return_exchange'], 28 | routing_key=self.amqp_config["headers"]['return_routing_key'], 29 | body=json.dumps(send_body), properties=self.amqp_config["outgoing_properties"]) 30 | 31 | def end(self): 32 | if self.show_bar: 33 | self.progress_bar.close() 34 | 35 | def configure(self, max_new_tokens, label, model, amqp_config = None, show_bar = True): 36 | self.max_new_tokens = max_new_tokens 37 | self.show_bar = show_bar 38 | self.amqp_config = amqp_config 39 | self.token_count = 0 40 | self.label = label 41 | self.model = model 42 | if show_bar: 43 | self.progress_bar = tqdm(total=max_new_tokens) -------------------------------------------------------------------------------- /modules/facebook/convnext/convnext.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from transformers import ConvNextImageProcessor, ConvNextForImageClassification 4 | import torch 5 | 6 | 
class FacebookConvNet(BaseHandler): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def validate(self, request): 11 | is_valid, errors = self.validate_request(request, 'img-url') 12 | return is_valid, errors 13 | 14 | def execute(self, model, request): 15 | raw_image = load_image(request["img_url"]).convert('RGB') 16 | inputs = model["feature_extractor"](raw_image, return_tensors="pt") 17 | 18 | with torch.no_grad(): 19 | inputs = {key: value.to(model["device"]) for key, value in inputs.items()} 20 | logits = model["model"](**inputs).logits 21 | 22 | predicted_label = logits.argmax(-1).item() 23 | return {"classes": [{"label": model["model"].config.id2label[predicted_label], "score": 1}]} 24 | 25 | def load(self, model, model_options, local_path): 26 | feature_extractor = ConvNextImageProcessor.from_pretrained(local_path) 27 | conv_model = ConvNextForImageClassification.from_pretrained(local_path) 28 | return {"model": conv_model, "feature_extractor": feature_extractor, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/facebook/convnext/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Facebook Convnext", 3 | "description": "Handler for Facebook's Convnext ML models for image classification.", 4 | "unique_key": "facebook_convnext", 5 | "script": "convnext.py", 6 | "skills": [ 7 | { 8 | "name": "facebook/convnext-large-224", 9 | "label": "Facebook Convnext Large 224", 10 | "routing_key": "facebook_convnext_large_224", 11 | "use": ["image_classification"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2300 }, 14 | "model": [{ 15 | "name": "facebook/convnext-large-224", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "facebook/convnext-base-224", 21 | "label": "Facebook Convnext Base 224", 22 | "routing_key": "facebook_convnext_base_224", 23 | "use": ["image_classification"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2000 }, 26 | "model": [{ 27 | "name": "facebook/convnext-base-224", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/haotian-liu/llava/golem-generator.py: -------------------------------------------------------------------------------- 1 | #from transformers_stream_generator import init_stream_support 2 | #init_stream_support() 3 | from application.llm_handler import LlmHandler 4 | import requests 5 | import torch 6 | import json 7 | import logging 8 | from PIL import Image 9 | from io import BytesIO 10 | import os 11 | import sys 12 | import time 13 | from transformers.generation.streamers import BaseStreamer 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 18 | from llava.model.builder import load_pretrained_model 19 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 20 | from llava.constants import IMAGE_TOKEN_INDEX 21 | 22 | class AmqpStreamer(BaseStreamer): 23 | def __init__(self, tokenizer, channel, incoming_headers, outgoing_properties, model_data, check_function, debug): 24 | self.tokenizer = tokenizer 25 | self.all_tokens = [] 26 | self.all_text = "" 27 | self.new_tokens = 0 28 | self.debug = debug 29 | 
self.channel = channel 30 | self.outgoing_properties = outgoing_properties 31 | self.incoming_headers = incoming_headers 32 | self.model_data = model_data 33 | self.finish_reason = "stop" 34 | self.check_stop_generation = check_function 35 | self.stop_generation_counter = 0 36 | self.socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 37 | 38 | def get_new_tokens(self): 39 | return self.new_tokens 40 | 41 | def get_response(self): 42 | return self.all_text 43 | 44 | def get_finish_reason(self): 45 | return self.finish_reason 46 | 47 | def put(self, value): 48 | 49 | stop_generation, self.stop_generation_counter = self.check_stop_generation(self.stop_generation_counter, 50 | self.model_data["stop_generation_event"], self.model_data["stop_generation_filter"], self.socket_id) 51 | 52 | if stop_generation: 53 | self.finish_reason = "abort" 54 | logger.info("stopping generation of text") 55 | raise ValueError("stopping generation of text") 56 | 57 | self.all_tokens.extend(value.tolist()) 58 | new_text = self.tokenizer.decode(self.all_tokens) 59 | new_chunk = new_text[len(self.all_text):] 60 | self.all_text += new_chunk 61 | self.new_tokens += 1 62 | 63 | if self.debug: 64 | print('\033[96m' + new_chunk, end="") 65 | 66 | self.channel.basic_publish( 67 | exchange=self.incoming_headers['return_exchange'], 68 | routing_key=self.incoming_headers['return_routing_key'], 69 | body=new_chunk, properties=self.outgoing_properties) 70 | 71 | def end(self): 72 | pass 73 | 74 | class LLaVA(LlmHandler): 75 | def __init__(self): 76 | super().__init__() 77 | 78 | def load_image(self, image_file): 79 | if image_file.startswith('http') or image_file.startswith('https'): 80 | response = requests.get(image_file) 81 | image = Image.open(BytesIO(response.content)).convert('RGB') 82 | else: 83 | image = Image.open(image_file).convert('RGB') 84 | return image 85 | 86 | def validate(self, request): 87 | is_valid, errors = self.validate_request(request, 'llm') 88 | return is_valid, errors 89 | 90 | def get_token_count(self, input_text): 91 | input_ids = tokenizer_image_token(input_text, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 92 | input_token_count = input_ids.shape[1] 93 | print(f"INPUT: {input_text}\nTOKEN COUNT: {input_token_count}\n\n") 94 | return input_token_count 95 | 96 | def execute(self, model, request): 97 | 98 | stream_output = request.get("stream", False) 99 | image_found = request.get("img_url", None) 100 | if image_found and "messages" in request and request["messages"][-1]["role"] == "user": 101 | logger.info(f"loading image {image_found}") 102 | new_message = f"\n{request['messages'][-1]['content']}" 103 | request["messages"][-1]["content"] = new_message 104 | image = self.load_image(request["img_url"]) 105 | image_tensor = model["image_processor"].preprocess(image, return_tensors='pt')['pixel_values'].half().cuda() 106 | else: 107 | image_tensor = None 108 | 109 | config = self.model_config 110 | prompt = self.build_prompt(request, config, model) 111 | input_ids = tokenizer_image_token(prompt, model["tokenizer"], IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 112 | input_token_count = input_ids.shape[1] 113 | 114 | finish_reason = "stop" 115 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 116 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 117 | check_stop_token, stop_conditions = self.build_stop_conditions(config["stop_on"]) 118 
| stopping_criteria = KeywordsStoppingCriteria(stop_conditions, model["tokenizer"], input_ids) 119 | 120 | if debug: 121 | print('\033[94m') 122 | print(json.dumps(request, indent=2)) 123 | print(prompt) 124 | print('\033[0m') 125 | 126 | begin_time = time.time() 127 | with torch.inference_mode(): 128 | 129 | do_sample = True if seed == -1 else False 130 | incoming_headers = model["amqp_headers"] 131 | outgoing_properties = self.copy_queue_headers(incoming_headers) 132 | 133 | if stream_output: 134 | streamer = AmqpStreamer(model["tokenizer"], model["amqp_channel"], incoming_headers, outgoing_properties, model, self.check_stop_generation, debug) 135 | try: 136 | output_ids = model["model_loaded"].generate( 137 | input_ids, 138 | images=image_tensor, 139 | do_sample=do_sample, 140 | temperature=temperature, 141 | max_new_tokens=max_new_tokens, 142 | streamer=streamer, 143 | top_k=int(top_k * 100), 144 | top_p=top_p, 145 | use_cache=True, 146 | stopping_criteria=[stopping_criteria]) 147 | except Exception as e: 148 | print(e) 149 | pass 150 | else: 151 | output_ids = model["model_loaded"].generate( 152 | input_ids, 153 | images=image_tensor, 154 | do_sample=do_sample, 155 | top_k=int(top_k * 100), 156 | top_p=top_p, 157 | temperature=temperature, 158 | max_new_tokens=max_new_tokens, 159 | use_cache=True, 160 | stopping_criteria=[stopping_criteria]) 161 | 162 | new_tokens = streamer.get_new_tokens() if stream_output == True else output_ids.shape[1] - input_token_count 163 | end_time = time.time() 164 | elapsed = end_time - begin_time 165 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 166 | 167 | response = model["tokenizer"].decode(output_ids[0, input_ids.shape[1]:]) if stream_output == False else "" 168 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 169 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 170 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 171 | 172 | def load(self, model, model_options, local_path): 173 | self.config = model 174 | self.model_config = model["configuration"] 175 | # tried to used both of these but they do not work, 4-bit fails to load and 8-bit outputs random shit 176 | load_4bit = False 177 | load_8bit = True if model_options["use_precision"] == "8-bit" else False 178 | 179 | try: 180 | model_name = get_model_name_from_path(local_path) 181 | base_model = f"data/models/{model['model'][1]['name']}" 182 | tokenizer, model, image_processor, context_len = load_pretrained_model(local_path, base_model, model_name, load_8bit, load_4bit, device=model_options["device"]) 183 | self.tokenizer = tokenizer 184 | return {"model_loaded":model, "tokenizer": tokenizer, "image_processor": image_processor, "device": model_options["device"]} 185 | except Exception as e: 186 | logger.error(f"error loading model") 187 | load_error = True 188 | print(e) 189 | return { "error": load_error } 190 | -------------------------------------------------------------------------------- /modules/haotian-liu/llava/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "LLaVA 1.5", 3 | "description": "Handler for loading LLaVA 1.5 visual LLM models.", 4 | "script": "golem-generator.py", 5 | "unique_key": "llava_15", 6 | "supported_gpu": ["nvidia"], 7 | "repository": [ 8 | { 9 | "url": "https://github.com/haotian-liu/LLaVA", 10 | "folder": "haotian-liu/llava" 11 | } 12 | ], 13 | "skills": [ 14 | { 15 | "label": 
"LLaVA 7b v1.5", 16 | "shortcut": "👀", 17 | "moe_function": [ 18 | "This function takes an image as input and provides a detailed description of it.", 19 | "This function provides a description of an image." 20 | ], 21 | "routing_key": "llava_7b_v1_5", 22 | "use": ["visual_language_model"], 23 | "available_precision": { "cuda": ["full"] }, 24 | "memory_usage": { "full": 16000 }, 25 | "model": [ 26 | { 27 | "name": "liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5", 28 | "provider": "huggingface" 29 | }, 30 | { 31 | "name": "liuhaotian/llava-v1.5-7b", 32 | "provider": "huggingface" 33 | } 34 | ], 35 | "configuration": { 36 | "max_seq_len": 4096, 37 | "user_role": "USER:", 38 | "ai_role": "ASSISTANT:", 39 | "stop_on": ["", ""], 40 | "system_message": "ASSISTANT: A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n", 41 | "prompt_format": "{user_role} {prompt} {ai_role} {response}\n" 42 | } 43 | }, 44 | { 45 | "label": "LLaVA 13b v1.5", 46 | "shortcut": "👀", 47 | "moe_function": [ 48 | "This function takes an image as input and provides a detailed description of it.", 49 | "This function provides a description of an image." 50 | ], 51 | "routing_key": "llava_13b_v1_5", 52 | "use": ["visual_language_model"], 53 | "available_precision": { "cuda": ["full"] }, 54 | "memory_usage": { "full": 26000 }, 55 | "model": [ 56 | { 57 | "name": "liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-13b-v1.5", 58 | "provider": "huggingface" 59 | }, 60 | { 61 | "name": "liuhaotian/llava-v1.5-13b", 62 | "provider": "huggingface" 63 | } 64 | ], 65 | "configuration": { 66 | "max_seq_len": 4096, 67 | "user_role": "USER:", 68 | "ai_role": "ASSISTANT:", 69 | "stop_on": ["", ""], 70 | "system_message": "ASSISTANT: A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n", 71 | "prompt_format": "{user_role} {prompt} {ai_role} {response}\n" 72 | } 73 | } 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /modules/hf-pipeline/asr/asr.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | class AsrPipeline(BaseHandler): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def validate(self, request): 12 | is_valid, errors = self.validate_request(request, 'audio-url') 13 | return is_valid, errors 14 | 15 | def execute(self, model, request): 16 | audio_url = request["audio_url"] 17 | result = { "text": model["model"](audio_url, max_new_tokens=1024)["text"] } 18 | logger.info(f"asr extracted text: {result['text']}") 19 | return result 20 | 21 | def load(self, model, model_options, local_path): 22 | asr_model = pipeline("automatic-speech-recognition", model=local_path) 23 | return {"model": asr_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/asr/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Automatic Speech Recognition", 3 | "description": "Handler for loading any ASR model that is compatible with HuggingFace ASR pipeline.", 4 | "script": "asr.py", 5 | "unique_key": "hf_asr", 6 | "skills": [ 7 | { 8 | "name": "openai/whisper-tiny", 9 | "label": "Whisper Tiny", 10 | "routing_key": "whisper_tiny", 11 | "use": ["automatic_speech_recognition"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 1700 }, 14 | "model": [{ 15 | "name": "openai/whisper-tiny", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "openai/whisper-tiny.en", 21 | "label": "Whisper Tiny En", 22 | "routing_key": "whisper_tiny_en", 23 | "use": ["automatic_speech_recognition"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1700 }, 26 | "model": [{ 27 | "name": "openai/whisper-tiny.en", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "openai/whisper-base", 33 | "label": "Whisper Base", 34 | "routing_key": "whisper_base", 35 | "use": ["automatic_speech_recognition"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 1850 }, 38 | "model": [{ 39 | "name": "openai/whisper-base", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "openai/whisper-base.en", 45 | "label": "Whisper Base En", 46 | "routing_key": "whisper_base_en", 47 | "use": ["automatic_speech_recognition"], 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 1850 }, 50 | "model": [{ 51 | "name": "openai/whisper-base.en", 52 | "provider": "huggingface" 53 | }] 54 | }, 55 | { 56 | "name": "openai/whisper-small", 57 | "label": "Whisper Small", 58 | "routing_key": "whisper_small", 59 | "use": ["automatic_speech_recognition"], 60 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 61 | "memory_usage": { "full": 2700 }, 62 | "model": [{ 63 | "name": "openai/whisper-small", 64 | "provider": "huggingface" 65 | }] 66 | }, 67 | { 68 | "name": 
"openai/whisper-small.en", 69 | "label": "Whisper Small En", 70 | "routing_key": "whisper_small_en", 71 | "use": ["automatic_speech_recognition"], 72 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 73 | "memory_usage": { "full": 2700 }, 74 | "model": [{ 75 | "name": "openai/whisper-small.en", 76 | "provider": "huggingface" 77 | }] 78 | }, 79 | { 80 | "name": "openai/whisper-medium", 81 | "label": "Whisper Medium", 82 | "routing_key": "whisper_medium", 83 | "use": ["automatic_speech_recognition"], 84 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 85 | "memory_usage": { "full": 4800 }, 86 | "model": [{ 87 | "name": "openai/whisper-medium", 88 | "provider": "huggingface" 89 | }] 90 | }, 91 | { 92 | "name": "openai/whisper-medium.en", 93 | "label": "Whisper Medium En", 94 | "routing_key": "whisper_medium_en", 95 | "use": ["automatic_speech_recognition"], 96 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 97 | "memory_usage": { "full": 4800 }, 98 | "model": [{ 99 | "name": "openai/whisper-medium.en", 100 | "provider": "huggingface" 101 | }] 102 | }, 103 | { 104 | "name": "openai/whisper-large-v2", 105 | "label": "Whisper Large v2", 106 | "routing_key": "whisper_large_v2", 107 | "use": ["automatic_speech_recognition"], 108 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 109 | "memory_usage": { "full": 8100 }, 110 | "model": [{ 111 | "name": "openai/whisper-large-v2", 112 | "provider": "huggingface" 113 | }] 114 | }, 115 | { 116 | "name": "microsoft/speecht5_asr", 117 | "label": "Microsoft Speech T5", 118 | "routing_key": "speecht5_asr", 119 | "use": ["automatic_speech_recognition"], 120 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 121 | "memory_usage": { "full": 2300 }, 122 | "model": [{ 123 | "name": "microsoft/speecht5_asr", 124 | "provider": "huggingface" 125 | }] 126 | } 127 | ] 128 | } 129 | -------------------------------------------------------------------------------- /modules/hf-pipeline/image-class/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Image Classification", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Image Classification pipeline.", 4 | "script": "image-class.py", 5 | "unique_key": "hf_image_class", 6 | "skills": [ 7 | { 8 | "name": "microsoft/resnet-50", 9 | "label": "Resnet 50", 10 | "routing_key": "resnet_50", 11 | "use": ["image_classification"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 1800 }, 14 | "model": [{ 15 | "name": "microsoft/resnet-50", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "microsoft/resnet-18", 21 | "label": "Resnet 18", 22 | "routing_key": "resnet_18", 23 | "use": ["image_classification"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1500 }, 26 | "model": [{ 27 | "name": "microsoft/resnet-18", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "google/vit-base-patch16-224", 33 | "label": "Vit Base Patch16 224", 34 | "routing_key": "vit_base_patch16_224", 35 | "use": ["image_classification"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 2000 }, 38 | "model": [{ 39 | "name": "google/vit-base-patch16-224", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "google/efficientnet-b0", 45 | "label": "Efficientnet B0", 46 | "routing_key": "efficientnet_b0", 
47 | "use": ["image_classification"], 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 1800 }, 50 | "model": [{ 51 | "name": "google/efficientnet-b0", 52 | "provider": "huggingface" 53 | }] 54 | }, 55 | { 56 | "name": "google/efficientnet-b7", 57 | "label": "Efficientnet B7", 58 | "routing_key": "efficientnet_b7", 59 | "use": ["image_classification"], 60 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 61 | "memory_usage": { "full": 2000 }, 62 | "model": [{ 63 | "name": "google/efficientnet-b7", 64 | "provider": "huggingface" 65 | }] 66 | }, 67 | { 68 | "name": "microsoft/beit-base-patch16-224-pt22k-ft22k", 69 | "label": "Beit Base Patch16", 70 | "routing_key": "beit_base_patch16", 71 | "use": ["image_classification"], 72 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 73 | "memory_usage": { "full": 1800 }, 74 | "model": [{ 75 | "name": "microsoft/beit-base-patch16-224-pt22k-ft22k", 76 | "provider": "huggingface" 77 | }] 78 | } 79 | ] 80 | } 81 | -------------------------------------------------------------------------------- /modules/hf-pipeline/image-class/image-class.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | 4 | class ImageClassificationPipeline(BaseHandler): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | def validate(self, request): 9 | is_valid, errors = self.validate_request(request, 'img-url') 10 | return is_valid, errors 11 | 12 | def execute(self, model, request): 13 | img_url = request["img_url"] 14 | result = { "classes": model["model"](img_url) } 15 | return result 16 | 17 | def load(self, model, model_options, local_path): 18 | img_model = pipeline("image-classification", model=local_path) 19 | return {"model": img_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/image-to-text/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Image to Text", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Image to Text pipeline.", 4 | "unique_key": "hf_image_to_text", 5 | "script": "image-to-text.py", 6 | "skills": [ 7 | { 8 | "name": "nlpconnect/vit-gpt2-image-captioning", 9 | "use": ["image_captioning"], 10 | "label": "ViT GPT2 Image Captioning", 11 | "routing_key": "vit_gpt2_image_captioning", 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2400 }, 14 | "model": [{ 15 | "name": "nlpconnect/vit-gpt2-image-captioning", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "ydshieh/vit-gpt2-coco-en", 21 | "use": ["image_captioning"], 22 | "label": "ViT GPT2 CoCo En", 23 | "routing_key": "vit_gpt2_coco_en", 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2400 }, 26 | "model": [{ 27 | "name": "ydshieh/vit-gpt2-coco-en", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "Salesforce/blip-image-captioning-base", 33 | "use": ["image_captioning"], 34 | "label": "Blip Image Captioning Base", 35 | "routing_key": "blip_image_captioning_base", 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 2500 }, 38 | "model": [{ 39 | "name": 
"Salesforce/blip-image-captioning-base", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "Salesforce/blip-image-captioning-large", 45 | "use": ["image_captioning"], 46 | "label": "Blip Image Captioning Large", 47 | "routing_key": "blip_image_captioning_large", 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 3300 }, 50 | "model": [{ 51 | "name": "Salesforce/blip-image-captioning-large", 52 | "provider": "huggingface" 53 | }] 54 | } 55 | ] 56 | } -------------------------------------------------------------------------------- /modules/hf-pipeline/image-to-text/image-to-text.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from transformers import pipeline 4 | 5 | class ImageToTextPipeline(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def validate(self, request): 10 | is_valid, errors = self.validate_request(request, 'img-url') 11 | return is_valid, errors 12 | 13 | def execute(self, model, request): 14 | predict = model["model"](request["img_url"]) 15 | return predict 16 | 17 | def load(self, model, model_options, local_path): 18 | vit_model = pipeline("image-to-text", model=local_path) 19 | return {"model": vit_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/object-detection/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Object Detection", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Object Detection pipeline.", 4 | "unique_key": "hf_object_detection", 5 | "script": "object-detection.py", 6 | "skills": [ 7 | { 8 | "name": "facebook/detr-resnet-101", 9 | "label": "Detr Resnet 101", 10 | "routing_key": "detr_resnet_101", 11 | "use": ["object_detection"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2100 }, 14 | "model": [{ 15 | "name": "facebook/detr-resnet-101", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "facebook/detr-resnet-50", 21 | "label": "Detr Resnet 50", 22 | "routing_key": "detr_resnet_50", 23 | "use": ["object_detection"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1900 }, 26 | "model": [{ 27 | "name": "facebook/detr-resnet-50", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "hustvl/yolos-tiny", 33 | "label": "Yolos Tiny", 34 | "routing_key": "yolos_tiny", 35 | "use": ["object_detection"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 1600 }, 38 | "model": [{ 39 | "name": "hustvl/yolos-tiny", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "hustvl/yolos-small", 45 | "label": "Yolos Small", 46 | "routing_key": "yolos_small", 47 | "use": ["object_detection"], 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 2200 }, 50 | "model": [{ 51 | "name": "hustvl/yolos-small", 52 | "provider": "huggingface" 53 | }] 54 | } 55 | ] 56 | } 57 | -------------------------------------------------------------------------------- /modules/hf-pipeline/object-detection/object-detection.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image, ImageDraw, ImageFont 3 | import io 4 | import base64 5 | from diffusers.utils import load_image 6 | from application.base_handler import BaseHandler 7 | from transformers import pipeline 8 | 9 | class ObjectDetectionPipeline(BaseHandler): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def validate(self, request): 14 | is_valid, errors = self.validate_request(request, 'img-url') 15 | return is_valid, errors 16 | 17 | def image_with_boxes(self, img_url, detections): 18 | # Fetch the image 19 | response = requests.get(img_url) 20 | img = Image.open(io.BytesIO(response.content)) 21 | 22 | # Prepare for drawing on the image 23 | draw = ImageDraw.Draw(img) 24 | font = ImageFont.load_default() 25 | #font = ImageFont.truetype("arial.ttf", 15) 26 | 27 | # Draw boxes and labels 28 | for detection in detections: 29 | label = detection['label'] 30 | box = detection['box'] 31 | score = detection['score'] 32 | xmin = box['xmin'] 33 | ymin = box['ymin'] 34 | xmax = box['xmax'] 35 | ymax = box['ymax'] 36 | draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red") 37 | 38 | # Draw the label with its score 39 | text = f"{label} {score:.2f}" 40 | draw.text((xmin, ymin - 20), text, font=font, fill="red") 41 | 42 | # Convert the modified image to base64 43 | buffered = io.BytesIO() 44 | img.save(buffered, format="PNG") 45 | img_base64 = base64.b64encode(buffered.getvalue()).decode() 46 | 47 | return img_base64 48 | 49 | def execute(self, model, request): 50 | detections = model["model"](request["img_url"]) 51 | img_base64 = self.image_with_boxes(request["img_url"], detections) 52 | return { "objects": detections, "image": img_base64 } 53 | 54 | def load(self, model, model_options, local_path): 55 | detr_model = pipeline("object-detection", model=local_path) 56 | return {"model": detr_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/visual-question-answering/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Visual QA Handler", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Visual QA pipeline.", 4 | "unique_key": "hf_vqa", 5 | "script": "visual-question-answering.py", 6 | "skills": [ 7 | { 8 | "name": "dandelin/vilt-b32-finetuned-vqa", 9 | "label": "Vilt B32 Finetuned VQA", 10 | "routing_key": "vilt_b32_finetuned_vqa", 11 | "use": ["visual_qa"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 1900 }, 14 | "model": [{ 15 | "name": "dandelin/vilt-b32-finetuned-vqa", 16 | "provider": "huggingface" 17 | }] 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /modules/hf-pipeline/visual-question-answering/visual-question-answering.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | 4 | class VisualQaPipeline(BaseHandler): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | def validate(self, request): 9 | is_valid, errors = self.validate_request(request, 'visual-qa') 10 | return is_valid, errors 11 | 12 | def execute(self, model, request): 13 | text = request["text"] 14 | predict = 
model["model"](question=text, image=request["img_url"]) 15 | return predict 16 | 17 | def load(self, model, model_options, local_path): 18 | vit_model = pipeline("visual-question-answering", model=local_path) 19 | return {"model": vit_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-image-class/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Zero Shot Image Classification", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Zero Short Image Classification pipeline.", 4 | "unique_key": "hf_zeroshot_image_class", 5 | "script": "zero-shot-image-class.py", 6 | "skills": [ 7 | { 8 | "name": "openai/clip-vit-large-patch14", 9 | "label": "Clip ViT Large Patch14", 10 | "routing_key": "clip_vit_large_patch14", 11 | "use": ["zero_shot_image_classification"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 3100 }, 14 | "model": [{ 15 | "name": "openai/clip-vit-large-patch14", 16 | "provider": "huggingface" 17 | }] 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-image-class/zero-shot-image-class.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | 4 | class ZeroShotImageClassPipeline(BaseHandler): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | def validate(self, request): 9 | is_valid, errors = self.validate_request(request, 'zero-shot-img') 10 | return is_valid, errors 11 | 12 | def execute(self, model, request): 13 | labels = request["labels"] 14 | predict = model["model"](request["img_url"], candidate_labels = labels) 15 | return predict 16 | 17 | def load(self, model, model_options, local_path): 18 | clip_model = pipeline("zero-shot-image-classification", model=local_path) 19 | return {"model": clip_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-object-detection/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Zero Shot Object Detection", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Zero Short Object Detection pipeline.", 4 | "unique_key": "hf_zeroshot_object_detection", 5 | "script": "zero-shot-object-detection.py", 6 | "skills": [ 7 | { 8 | "name": "google/owlvit-base-patch32", 9 | "label": "Owlvit Base Patch32", 10 | "routing_key": "owlvit_base_patch32", 11 | "use": ["zero_shot_object_detection"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2100 }, 14 | "model": [{ 15 | "name": "google/owlvit-base-patch32", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "google/owlvit-base-patch16", 21 | "label": "Owlvit Base Patch16", 22 | "routing_key": "owlvit_base_patch16", 23 | "use": ["zero_shot_object_detection"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2600 }, 26 | "model": [{ 27 | "name": "google/owlvit-base-patch16", 28 | "provider": "huggingface" 
29 | }] 30 | }, 31 | { 32 | "name": "google/owlvit-large-patch14", 33 | "label": "Owlvit Large Patch14", 34 | "routing_key": "owlvit_large_patch14", 35 | "use": ["zero_shot_object_detection"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 5600 }, 38 | "model": [{ 39 | "name": "google/owlvit-large-patch14", 40 | "provider": "huggingface" 41 | }] 42 | } 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-object-detection/zero-shot-object-detection.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from transformers import pipeline 4 | 5 | class ZeroShotObjectDetectionPipeline(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def validate(self, request): 10 | is_valid, errors = self.validate_request(request, 'zero-shot-img') 11 | return is_valid, errors 12 | 13 | def execute(self, model, request): 14 | labels = request["labels"] 15 | predict = model["model"](request["img_url"], candidate_labels = labels) 16 | return predict 17 | 18 | def load(self, model, model_options, local_path): 19 | object_model = pipeline("zero-shot-object-detection", model=local_path) 20 | return {"model": object_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hkunlp/instructor/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Instructor Embeddings", 3 | "description": "Handler for loading instructor embedding models.", 4 | "unique_key": "instructor", 5 | "script": "instructor.py", 6 | "skills": [ 7 | { 8 | "name": "hkunlp/instructor-xl", 9 | "label": "Instructor Xl", 10 | "routing_key": "instructor_xl", 11 | "use": ["text_embedding"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 5900 }, 14 | "model": [{ 15 | "name": "hkunlp/instructor-xl", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "hkunlp/instructor-large", 21 | "label": "Instructor Large", 22 | "routing_key": "instructor_large", 23 | "use": ["text_embedding"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2500 }, 26 | "model": [{ 27 | "name": "hkunlp/instructor-large", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/hkunlp/instructor/instructor.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from InstructorEmbedding import INSTRUCTOR 4 | 5 | class Instructor(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def validate(self, request): 10 | is_valid, errors = self.validate_request(request, 'instructor') 11 | return is_valid, errors 12 | 13 | def execute(self, model, request): 14 | text = request["text"] 15 | instruction = request["instruction"] 16 | embedding = model["model"].encode([[instruction,text]]) 17 | result = {"embedding": embedding.tolist()} 18 | return result 19 | 20 | def load(self, model, model_options, local_path): 21 | return {"model": INSTRUCTOR(local_path), "device": 
model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/intfloat/e5-v2/e5-v2.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import AutoTokenizer, AutoModel 3 | from torch import Tensor 4 | 5 | class E5V2(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: 10 | last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) 11 | return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] 12 | 13 | def validate(self, request): 14 | return True, [] 15 | 16 | def execute(self, model, request): 17 | text = request["text"] 18 | if type(text) is str: 19 | text = [text] 20 | 21 | ret_embed = {} 22 | for embed in text: 23 | batch_dict = model["tokenizer"](f"query: {embed}", max_length=512, padding=True, truncation=True, return_tensors='pt') 24 | for key in batch_dict: 25 | batch_dict[key] = batch_dict[key].to(model["device"]) 26 | 27 | outputs = model["model"](**batch_dict) 28 | embeddings = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 29 | ret_embed[embed] = embeddings.tolist() 30 | 31 | return {"embeddings": ret_embed} 32 | 33 | def load(self, model, model_options, local_path): 34 | tokenizer = AutoTokenizer.from_pretrained(local_path) 35 | e5_model = AutoModel.from_pretrained(local_path) 36 | return {"model": e5_model, "tokenizer": tokenizer, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/intfloat/e5-v2/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "E5 v2 Embeddings", 3 | "description": "Handler for loading the E5 v2 embedding models.", 4 | "unique_key": "e5_v2", 5 | "script": "e5-v2.py", 6 | "skills": [ 7 | { 8 | "name": "intfloat/e5-large-v2", 9 | "label": "E5 Large v2", 10 | "routing_key": "e5_large_v2", 11 | "use": ["text_embedding"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2500 }, 14 | "model": [{ 15 | "name": "intfloat/e5-large-v2", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "intfloat/e5-base-v2", 21 | "label": "E5 Base v2", 22 | "routing_key": "e5_base_v2", 23 | "use": ["text_embedding"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1600 }, 26 | "model": [{ 27 | "name": "intfloat/e5-base-v2", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/microsoft/git-textcaps/git-textcaps.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import AutoProcessor, AutoModelForCausalLM 3 | from PIL import Image 4 | import requests 5 | 6 | class GitTextCaption(BaseHandler): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def validate(self, request): 11 | is_valid, errors = self.validate_request(request, 'img-url') 12 | return is_valid, errors 13 | 14 | def execute(self, model, request): 15 | image = Image.open(requests.get(request["img_url"], 
stream=True).raw) 16 | pixel_values = model["processor"](images=image, return_tensors="pt").to(model["device"]).pixel_values 17 | generated_ids = model["model"].generate(pixel_values=pixel_values, max_length=256) 18 | generated_caption = model["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0] 19 | return {"text": generated_caption} 20 | 21 | def load(self, model, model_options, local_path): 22 | processor = AutoProcessor.from_pretrained(local_path) 23 | git_model = AutoModelForCausalLM.from_pretrained(local_path) 24 | return {"model": git_model, "processor": processor, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/microsoft/git-textcaps/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "GIT Text Captions", 3 | "description": "Handler for loading Microsoft GIT text caption models.", 4 | "unique_key": "git_textcaps", 5 | "script": "git-textcaps.py", 6 | "skills": [ 7 | { 8 | "name": "microsoft/git-base-textcaps", 9 | "label": "GiT Base Textcaps", 10 | "routing_key": "git_base_textcaps", 11 | "use": ["image_captioning"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2200 }, 14 | "model": [{ 15 | "name": "microsoft/git-base-textcaps", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "microsoft/git-large-textcaps", 21 | "label": "GiT Large Textcaps", 22 | "routing_key": "git_large_textcaps", 23 | "use": ["image_captioning"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 3000 }, 26 | "model": [{ 27 | "name": "microsoft/git-large-textcaps", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/noco-ai/bark-tts/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Bark Text to Speech", 3 | "description": "Handler for loading Bark text to speech models.", 4 | "unique_key": "bark", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "Bark Small", 9 | "routing_key": "bark_small", 10 | "use": ["text_to_speech"], 11 | "available_precision": { "cuda": ["full"] }, 12 | "memory_usage": { "full": 2500 }, 13 | "model": [{ 14 | "name": "suno/bark-small", 15 | "provider": "huggingface" 16 | }], 17 | "configuration": { 18 | "progress_label": "Generating Speech" 19 | } 20 | }, 21 | { 22 | "label": "Bark Large", 23 | "routing_key": "bark_large", 24 | "use": ["text_to_speech"], 25 | "available_precision": { "cuda": ["full"] }, 26 | "memory_usage": { "full": 5200 }, 27 | "model": [{ 28 | "name": "suno/bark", 29 | "provider": "huggingface" 30 | }], 31 | "configuration": { 32 | "progress_label": "Generating Speech" 33 | } 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /modules/noco-ai/bark-tts/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import AutoProcessor, AutoModel 3 | import torch 4 | import base64 5 | from io import BytesIO 6 | import scipy 7 | import copy 8 | from application.progress_streamer import ProgressStreamer 9 | import logging 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class 
BarkHandler(BaseHandler): 14 | def __init__(self): 15 | self.progress_streamer = ProgressStreamer() 16 | super().__init__() 17 | 18 | def validate(self, request): 19 | is_valid, errors = self.validate_request(request, 'voice-gen') 20 | return is_valid, errors 21 | 22 | def execute(self, model, request): 23 | prompt = request.get("prompt", "") 24 | send_progress = request.get("progress", True) 25 | voice_preset = request.get("voice", "v2/en_speaker_1") 26 | prompt_length = len(prompt) 27 | 28 | if voice_preset == "default": 29 | voice_preset = "v2/en_speaker_1" 30 | 31 | if send_progress: 32 | progress_headers = copy.deepcopy(model["amqp_headers"]) 33 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 34 | amqp_config = { 35 | "headers": progress_headers, 36 | "outgoing_properties": outgoing_properties, 37 | "channel": model["amqp_channel"] 38 | } 39 | self.progress_streamer.configure(prompt_length * 25, self.model_config["progress_label"], self.routing_key, amqp_config, False) 40 | else: 41 | self.progress_streamer.configure(prompt_length * 25, self.model_config["progress_label"], self.routing_key, None, False) 42 | 43 | # Assuming the model function can take these parameters: 44 | logger.info(f"prompt: {prompt}, voice: {voice_preset}, length: {prompt_length}") 45 | inputs = model["processor"]( 46 | text=[prompt], 47 | voice_preset=voice_preset, 48 | return_tensors="pt", 49 | ).to(model["device"]) 50 | speech_values = model["model"].generate(**inputs, do_sample=True, streamer=self.progress_streamer) 51 | 52 | # Save image to an in-memory bytes buffer 53 | buffered = BytesIO() 54 | sampling_rate = model["model"].generation_config.sample_rate 55 | scipy.io.wavfile.write(buffered, rate=sampling_rate, data=speech_values.cpu().numpy().squeeze()) 56 | 57 | # Convert bytes buffer to a base64-encoded string 58 | wav_str = base64.b64encode(buffered.getvalue()).decode() 59 | return {"wav": wav_str} 60 | 61 | def load(self, model, model_options, local_path): 62 | self.model_config = model["configuration"] 63 | self.routing_key = model["routing_key"] 64 | processor = AutoProcessor.from_pretrained(local_path) 65 | load_model = AutoModel.from_pretrained(local_path) 66 | 67 | return { 68 | "model": load_model, 69 | "processor": processor, 70 | "device": model_options["device"], 71 | "device_memory": model["memory_usage"][model_options["use_precision"]] 72 | } 73 | -------------------------------------------------------------------------------- /modules/noco-ai/image-generator/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Stable Diffusion v1.5", 3 | "description": "Handler for loading Stable Diffusion v1.5 models.", 4 | "unique_key": "sd_15", 5 | "script": "handler.py", 6 | "supported_gpu": ["nvidia"], 7 | "skills": [ 8 | { 9 | "label": "Stable Diffusion v1.5", 10 | "routing_key": "stable_diffusion_v15", 11 | "use": ["image_generation"], 12 | "available_precision": { "cuda": ["full"] }, 13 | "memory_usage": { "full": 2400 }, 14 | "model": [{ 15 | "name": "runwayml/stable-diffusion-v1-5", 16 | "provider": "huggingface" 17 | }], 18 | "configuration": { 19 | "progress_label": "Generating Image" 20 | } 21 | }, 22 | { 23 | "label": "DreamShaper", 24 | "routing_key": "dream_shaper_image_gen", 25 | "use": ["image_generation"], 26 | "available_precision": { "cuda": ["full"] }, 27 | "memory_usage": { "full": 2400 }, 28 | "model": [{ 29 | "name": "civitai/dreamshaper/128713.safetensors", 30 | "provider": "civitai", 
31 | "url": "https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16" 32 | }], 33 | "configuration": { 34 | "progress_label": "Generating Image" 35 | } 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /modules/noco-ai/image-generator/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from diffusers import StableDiffusionPipeline, KDPM2DiscreteScheduler 3 | import torch 4 | import base64 5 | from io import BytesIO 6 | import logging 7 | import json 8 | import copy 9 | from compel import Compel 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class StableDiffusion(BaseHandler): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def validate(self, request): 18 | is_valid, errors = self.validate_request(request, 'img-gen') 19 | return is_valid, errors 20 | 21 | def step_callback(self, pipeline: StableDiffusionPipeline, step: int, timestep: int, callback_kwargs): 22 | if self.stream_progress == False: 23 | return callback_kwargs 24 | 25 | self.current_step = self.current_step + 1 26 | label = self.model_configuration["progress_label"] if "progress_label" in self.model_configuration else self.routing_key 27 | send_body = { 28 | "total": self.total_steps, 29 | "current": self.current_step, 30 | "label": label, 31 | "model": self.routing_key 32 | } 33 | 34 | self.amqp_progress_config["channel"].basic_publish( 35 | exchange=self.amqp_progress_config["headers"]['return_exchange'], 36 | routing_key=self.amqp_progress_config["headers"]['return_routing_key'], 37 | body=json.dumps(send_body), properties=self.amqp_progress_config["outgoing_properties"]) 38 | 39 | return callback_kwargs 40 | 41 | def get_latents(self, num_images=1, height=512, width=512, user_seed=-1, device="cuda:0", model=None): 42 | latents = None 43 | generator = torch.Generator(device=device) 44 | if user_seed == -1: 45 | seed = generator.seed() 46 | else: 47 | seed = user_seed 48 | generator = generator.manual_seed(seed) 49 | 50 | latents = torch.randn( 51 | (num_images, model.unet.in_channels, height // 8, width // 8), 52 | generator = generator, 53 | device = device, 54 | dtype = torch.float16 55 | ) 56 | return { "seed": seed, "latents": latents } 57 | 58 | def execute(self, model, request): 59 | prompt = request.get("prompt", "") 60 | height = request.get("height", 512) 61 | width = request.get("width", 512) 62 | steps = request.get("steps", 50) 63 | seed = request.get("seed", -1) 64 | self.stream_progress = request.get("progress", False) 65 | negative_prompt = request.get("negative_prompt", "") 66 | guidance_scale = request.get("guidance_scale", 7.5) 67 | num_images_per_prompt = 1 68 | 69 | if self.stream_progress == True: 70 | progress_headers = copy.deepcopy(model["amqp_headers"]) 71 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 72 | self.amqp_progress_config = { 73 | "headers": progress_headers, 74 | "outgoing_properties": outgoing_properties, 75 | "channel": model["amqp_channel"] 76 | } 77 | self.current_step = 0 78 | self.total_steps = steps * 2 79 | 80 | latent_data = self.get_latents(num_images_per_prompt, height, width, seed, self.model_options["device"], model["model"]) 81 | logger.info(f"prompt: {prompt}, height: {height}, width: {width}, steps: {steps}, guidance scale: {guidance_scale}, seed: {latent_data['seed']}") 82 | 83 | prompt_embeds = model["compel"](prompt) 84 | 
negative_prompt_embeds = model["compel"](negative_prompt) 85 | image = model["model"](prompt_embeds=prompt_embeds, height=height, width=width, num_inference_steps=steps, latents=latent_data["latents"], callback_on_step_end=self.step_callback, 86 | negative_prompt_embeds=negative_prompt_embeds, guidance_scale=guidance_scale, num_images_per_prompt=num_images_per_prompt).images[0] 87 | 88 | buffered = BytesIO() 89 | image.save(buffered, format="PNG") 90 | 91 | # Convert bytes buffer to a base64-encoded string 92 | img_str = base64.b64encode(buffered.getvalue()).decode() 93 | return {"image": img_str, "seed": latent_data["seed"], "guidance_scale": guidance_scale, "steps": steps } 94 | 95 | def load(self, model, model_options, local_path): 96 | self.model_options = model_options 97 | self.routing_key = model["routing_key"] 98 | self.model_configuration = model["configuration"] 99 | 100 | try: 101 | if "civitai" not in local_path: 102 | logger.info("loading standard sd model") 103 | load_model = StableDiffusionPipeline.from_pretrained(local_path, torch_dtype=torch.float16, safety_checker=None) 104 | else: 105 | logger.info("loading civit sd model") 106 | load_model = StableDiffusionPipeline.from_single_file(local_path, load_safety_checker=False, torch_dtype=torch.float16) 107 | 108 | load_model.scheduler = KDPM2DiscreteScheduler.from_config(load_model.scheduler.config) 109 | compel = Compel(tokenizer=load_model.tokenizer, text_encoder=load_model.text_encoder) 110 | 111 | return { 112 | "model": load_model, 113 | "device": model_options["device"], 114 | "device_memory": model["memory_usage"][model_options["use_precision"]], 115 | "compel": compel 116 | } 117 | 118 | except Exception as e: 119 | print(f"error loading sd model") 120 | print(e) 121 | return { "error": True } 122 | -------------------------------------------------------------------------------- /modules/noco-ai/llama-cpp/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Llama CPP", 3 | "description": "Handler for loading Llama CPP models.", 4 | "unique_key": "llama_cpp", 5 | "script": "llama-cpp.py", 6 | "skills": [ 7 | { 8 | "label": "Llama2 7B Chat (GGUF)", 9 | "routing_key": "llama_7b_chat_gguf", 10 | "use": ["language_model"], 11 | "available_precision": { "cpu": ["4-bit", "6-bit"], "cuda": ["4-bit", "6-bit"] }, 12 | "memory_usage": { "4-bit": 6000, "6-bit": 8000 }, 13 | "model": [{ 14 | "name": "TheBloke/Llama-2-7B-chat-GGUF", 15 | "provider": "huggingface", 16 | "files": { 17 | "4-bit": "llama-2-7b-chat.Q4_0.gguf" 18 | } 19 | }], 20 | "configuration": { 21 | "model_layers": 43, 22 | "num_threads": -1, 23 | "model_type": "chat", 24 | "max_seq_len": 4096, 25 | "user_role": "[INST]", 26 | "ai_role": "[/INST]", 27 | "stop_on": ["", "[INST]", ""], 28 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 29 | "system_message": "You are an helpful assistant.", 30 | "prompt_format": "{user_role} {prompt} {ai_role} {response}" 31 | } 32 | }, 33 | { 34 | "label": "Llama2 13B Chat (GGUF)", 35 | "routing_key": "llama_13b_chat_gguf", 36 | "use": ["language_model"], 37 | "available_precision": { "cpu": ["4-bit", "6-bit"], "cuda": ["4-bit", "6-bit"] }, 38 | "memory_usage": { "4-bit": 11000, "6-bit": 13800 }, 39 | "model": [{ 40 | "name": "TheBloke/Llama-2-13B-chat-GGUF", 41 | "provider": "huggingface", 42 | "files": { 43 | "4-bit": "llama-2-13b-chat.Q4_0.gguf" 44 | } 45 | }], 46 | "configuration": { 47 | "model_layers": 43, 48 | "num_threads": 
-1, 49 | "model_type": "chat", 50 | "max_seq_len": 4096, 51 | "user_role": "[INST]", 52 | "ai_role": "[/INST]", 53 | "stop_on": ["", "[INST]", ""], 54 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 55 | "system_message": "You are an helpful assistant.", 56 | "prompt_format": "{user_role} {prompt} {ai_role} {response}" 57 | } 58 | }, 59 | { 60 | "label": "CodeLlama 34B Instruct (GGUF)", 61 | "routing_key": "llama_34b_instruct_gguf", 62 | "use": ["language_model"], 63 | "available_precision": { "cpu": ["4-bit", "6-bit"], "cuda": ["4-bit", "6-bit"] }, 64 | "memory_usage": { "4-bit": 22000, "6-bit": 31500 }, 65 | "model": [{ 66 | "name": "TheBloke/CodeLlama-34B-Instruct-GGUF", 67 | "provider": "huggingface", 68 | "files": { 69 | "4-bit": "codellama-34b-instruct.Q4_K_M.gguf", 70 | "6-bit": "codellama-34b-instruct.Q6_K.gguf" 71 | } 72 | }], 73 | "configuration": { 74 | "model_layers": 51, 75 | "num_threads": -1, 76 | "model_type": "instruct", 77 | "max_seq_len": 16384, 78 | "user_role": "[INST]", 79 | "ai_role": "[/INST]", 80 | "stop_on": ["", "[INST]", ""], 81 | "system_message": "", 82 | "prompt_format": "{user_role} {prompt} {ai_role} {response}\n\n" 83 | } 84 | }, 85 | { 86 | "label": "Mistral 7B Instruct (GGUF)", 87 | "routing_key": "mistral_7b_instruct_gguf", 88 | "use": ["language_model"], 89 | "available_precision": { "cuda": ["4-bit", "6-bit"] }, 90 | "memory_usage": { "4-bit": 6900, "6-bit": 11000 }, 91 | "model": [{ 92 | "name": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF", 93 | "provider": "huggingface", 94 | "files": { 95 | "4-bit": "mistral-7b-instruct-v0.2.Q4_K_M.gguf", 96 | "6-bit": "mistral-7b-instruct-v0.2.Q6_K.gguf" 97 | } 98 | }], 99 | "configuration": { 100 | "model_layers": 35, 101 | "max_seq_len": 8192, 102 | "user_role": "[INST]", 103 | "ai_role": "[/INST]", 104 | "stop_on": ["", "### USER:"], 105 | "system_prompt_format": "[INST] {system_prompt} [/INST] ", 106 | "system_message": "", 107 | "prompt_format": "{user_role} {prompt} {ai_role} {response} " 108 | } 109 | }, 110 | { 111 | "label": "Llama2 70B Chat (GGUF)", 112 | "routing_key": "llama_70b_chat_gguf", 113 | "use": ["language_model"], 114 | "available_precision": { "cpu": ["4-bit", "5-bit", "6-bit"], "cuda": ["4-bit", "5-bit", "6-bit"] }, 115 | "memory_usage": { "4-bit": 41500, "5-bit": 48000, "6-bit": 60000 }, 116 | "model": [{ 117 | "name": "TheBloke/Llama-2-70B-chat-GGUF", 118 | "provider": "huggingface", 119 | "files": { 120 | "4-bit": "llama-2-70b-chat.Q4_K_M.gguf", 121 | "5-bit": "llama-2-70b-chat.Q5_K_M.gguf", 122 | "6-bit": "llama-2-70b-chat.Q6_K.gguf" 123 | }, 124 | "split": { 125 | "6-bit": ["-split-a", "-split-b"] 126 | } 127 | }], 128 | "configuration": { 129 | "model_layers": 83, 130 | "num_threads": -1, 131 | "model_type": "chat", 132 | "max_seq_len": 4096, 133 | "user_role": "[INST]", 134 | "ai_role": "[/INST]", 135 | "stop_on": ["", "[INST]", ""], 136 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 137 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 138 | "system_message": "You are an helpful assistant." 
139 | } 140 | }, 141 | { 142 | "label": "CodeLlama 70B Instruct (GGUF)", 143 | "routing_key": "codellama_70b_instruct_gguf", 144 | "use": ["language_model"], 145 | "available_precision": { "cpu": ["4-bit", "5-bit"], "cuda": ["4-bit", "5-bit"] }, 146 | "memory_usage": { "4-bit": 41500, "5-bit": 48000 }, 147 | "model": [{ 148 | "name": "LoneStriker/CodeLlama-70b-Instruct-hf-GGUF", 149 | "provider": "huggingface", 150 | "files": { 151 | "4-bit": "CodeLlama-70b-Instruct-hf-Q4_K_M.gguf" 152 | } 153 | }], 154 | "configuration": { 155 | "model_layers": 83, 156 | "num_threads": -1, 157 | "model_type": "chat", 158 | "max_seq_len": 4096, 159 | "user_role": "Source: user", 160 | "ai_role": "Source: assistant", 161 | "stop_on": ["", "Source:", "Source:"], 162 | "system_prompt_format": "Source: system\n\n{system_prompt} ", 163 | "prompt_format": "{user_role}\n\n{prompt} {ai_role} Destination: user\n{response}", 164 | "system_message": "You are an expert in coding Magento 2" 165 | } 166 | }, 167 | { 168 | "label": "Mixtral 8x7B Instruct (GGUF)", 169 | "routing_key": "mixtral_8x7b_instruct", 170 | "use": ["language_model", "reasoning_agent"], 171 | "available_precision": { "cpu": ["4-bit", "5-bit", "6-bit"], "cuda": ["4-bit", "5-bit", "6-bit"] }, 172 | "memory_usage": { "4-bit": 28000, "5-bit": 33500, "6-bit": 39000 }, 173 | "model": [{ 174 | "name": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF", 175 | "provider": "huggingface", 176 | "files": { 177 | "4-bit": "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", 178 | "5-bit": "mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf", 179 | "6-bit": "mixtral-8x7b-instruct-v0.1.Q6_K.gguf" 180 | } 181 | }], 182 | "configuration": { 183 | "model_layers": 63, 184 | "num_threads": -1, 185 | "max_seq_len": 16384, 186 | "user_role": "[INST]", 187 | "ai_role": "[/INST]", 188 | "stop_on": ["", "[/INST]"], 189 | "system_prompt_format": "[INST] {system_prompt}\nRespond with OK if you understand. 
[/INST] OK ", 190 | "system_message": "", 191 | "prompt_format": "{user_role} {prompt} {ai_role} {response} " 192 | } 193 | } 194 | ] 195 | } -------------------------------------------------------------------------------- /modules/noco-ai/llama-cpp/llama-cpp.py: -------------------------------------------------------------------------------- 1 | from llama_cpp import Llama 2 | from application.llm_handler import LlmHandler 3 | import torch 4 | import time 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class GGUFGenerator(LlmHandler): 10 | def __init__(self): 11 | super().__init__() 12 | self.loras = {} 13 | 14 | def update_config(self, config_data): 15 | current_config = self.model_config 16 | merged_config = {**current_config, **config_data} 17 | self.model_config = merged_config 18 | 19 | def get_token_count(self, input_text): 20 | inputs = self.loaded_model.tokenize(bytes(input_text, 'utf-8')) 21 | return len(inputs) 22 | 23 | def validate(self, request): 24 | is_valid, errors = self.validate_request(request, 'llm') 25 | return is_valid, errors 26 | 27 | def stream(self, model, prompt, channel, incoming_headers, 28 | outgoing_properties, stops, request, model_data): 29 | 30 | # setup stop conditions 31 | check_stop_token, stop_conditions = self.build_stop_conditions(stops, False) 32 | 33 | # get starting time 34 | begin_time = time.time() 35 | 36 | # set max new tokens and other params 37 | prompt_tokens = model.tokenize(bytes(prompt, 'utf-8')) 38 | input_token_count = len(prompt_tokens) 39 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 40 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 41 | if debug: 42 | print('\033[94m') 43 | print(request) 44 | print(prompt) 45 | print('\033[0m') 46 | 47 | response = "" 48 | new_tokens = 0 49 | finish_reason = 'stop' 50 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 51 | stop_generation_counter = 0 52 | model_args = {} 53 | 54 | # sampler settings 55 | if mirostat != 0: 56 | model_args["mirostat_mode"] = mirostat 57 | model_args["mirostat_eta"] = mirostat_eta 58 | model_args["mirostat_tau"] = mirostat_tau 59 | if seed != -1: 60 | model_args["seed"] = seed 61 | 62 | if "start_response" in request and stream_output: 63 | channel.basic_publish( 64 | exchange=incoming_headers['return_exchange'], 65 | routing_key=incoming_headers['return_routing_key'], 66 | body=request["start_response"], properties=outgoing_properties) 67 | 68 | for model_stream in model(prompt, stream=True, max_tokens=max_new_tokens, min_p=min_p, 69 | temperature=temperature, stop=stop_conditions, top_k=top_k, top_p=top_p, **model_args): 70 | text = model_stream["choices"][0]["text"] 71 | 72 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 73 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 74 | 75 | if stop_generation: 76 | finish_reason = "abort" 77 | break 78 | 79 | new_tokens += 1 80 | if new_tokens >= max_new_tokens: 81 | finish_reason = 'length' 82 | break 83 | 84 | if debug: 85 | print('\033[96m' + text, end="") 86 | 87 | # send chunk to front end 88 | if stream_output: 89 | channel.basic_publish( 90 | exchange=incoming_headers['return_exchange'], 91 | routing_key=incoming_headers['return_routing_key'], 92 | body=text, properties=outgoing_properties) 93 | else: 94 | response += text 95 | 96 | if debug: 97 | print('\033[0m' + "") 98 | 99 
| end_time = time.time() 100 | elapsed = end_time - begin_time 101 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 102 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 103 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 104 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 105 | 106 | def execute(self, model, request): 107 | config = self.model_config 108 | 109 | # build the prompt 110 | prompt = self.build_prompt(request, config, model) 111 | incoming_headers = model["amqp_headers"] 112 | outgoing_properties = self.copy_queue_headers(incoming_headers) 113 | 114 | # last string to send after done streaming output 115 | stream_resp = self.stream( 116 | model["model_loaded"], 117 | prompt, 118 | model["amqp_channel"], 119 | incoming_headers, 120 | outgoing_properties, 121 | config["stop_on"], 122 | request, 123 | model) 124 | 125 | return stream_resp 126 | 127 | def load(self, model, model_options, local_path): 128 | self.model_config = model["configuration"] 129 | 130 | try: 131 | if not model["model"][0]["files"][model_options["use_precision"]]: 132 | return { "error": True } 133 | 134 | lora_name = self.model_config["default_lora"] if "default_lora" in self.model_config else None 135 | model_file = model["model"][0]["files"][model_options["use_precision"]] 136 | model_path = f"{local_path}/{model_file}" 137 | config_threads = model["configuration"].get("num_threads", -1) 138 | num_threads = None if config_threads == -1 else config_threads 139 | max_seq_len = model["configuration"].get("max_seq_len", 2048) 140 | model_args = { 141 | "model_path": model_path, 142 | "n_gpu_layers": 0, 143 | "n_ctx": max_seq_len, 144 | "n_threads":num_threads 145 | } 146 | 147 | if lora_name != None: 148 | model_args["lora_path"] = f"data/loras/{lora_name}/" 149 | 150 | if model_options["device"].startswith("cuda"): 151 | model_args["n_gpu_layers"] = model["configuration"].get("model_layers", 0) 152 | model_args["main_gpu"] = int(model_options["device"].split(":")[1]) 153 | #gpu_device = int(model_options["device"].split(":")[1]) 154 | #tensor_map = [0] * gpu_device + [1] 155 | #tensor_map[0] = 0.01 156 | #tensor_split=tensor_map, 157 | #tensor_map[gpu_device] = 0.99 158 | #print(tensor_map) 159 | #gpu_device = 0 160 | if "70b" in model_file: 161 | model_args["n_gqa"] = 8 162 | 163 | load_model = Llama(**model_args) 164 | self.loaded_model = load_model 165 | 166 | print(f'skill {model["routing_key"]} loaded to {model_options["device"]}') 167 | return { "model_loaded": load_model, "error": False } 168 | except Exception as e: 169 | print(f"error loading model") 170 | print(e) 171 | return { "error": True } -------------------------------------------------------------------------------- /modules/noco-ai/llm-api/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "LLM API", 3 | "description": "Handler for accessing LLMs running on remote systems.", 4 | "unique_key": "llm_api", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "OpenAI Compatible Endpoint", 9 | "routing_key": "custom_llm_endpoint", 10 | "use": ["language_model"], 11 | "available_precision": { "cpu": ["full"] }, 12 | "memory_usage": { "full": 20 }, 13 | "configuration": { 14 | "model": "none", 15 | "max_seq_len": 4096, 16 | "stop_on": [] 17 | } 18 | }, 19 | { 20 | "label": "Claude Haiku API", 21 | "routing_key": "claude_haiku_api", 22 | "use": 
["language_model"], 23 | "available_precision": { "cpu": ["full"] }, 24 | "memory_usage": { "full": 20 }, 25 | "configuration": { 26 | "model": "claude-3-haiku-20240307", 27 | "max_seq_len": 16384, 28 | "api_path": "https://api.anthropic.com/v1/messages", 29 | "stop_on": [] 30 | } 31 | }, 32 | { 33 | "label": "Claude Opus API", 34 | "routing_key": "claude_opus_api", 35 | "use": ["language_model"], 36 | "available_precision": { "cpu": ["full"] }, 37 | "memory_usage": { "full": 20 }, 38 | "configuration": { 39 | "model": "claude-3-opus-20240229", 40 | "max_seq_len": 16384, 41 | "api_path": "https://api.anthropic.com/v1/messages", 42 | "stop_on": [] 43 | } 44 | }, 45 | { 46 | "label": "Mistral Small API", 47 | "routing_key": "mistral_small_api", 48 | "use": ["language_model"], 49 | "available_precision": { "cpu": ["full"] }, 50 | "memory_usage": { "full": 20 }, 51 | "configuration": { 52 | "model": "mistral-small-latest", 53 | "max_seq_len": 8192, 54 | "api_path": "https://api.mistral.ai/v1/chat/completions", 55 | "stop_on": [] 56 | } 57 | }, 58 | { 59 | "label": "Mistral Medium API", 60 | "routing_key": "mistral_medium_api", 61 | "use": ["language_model"], 62 | "available_precision": { "cpu": ["full"] }, 63 | "memory_usage": { "full": 20 }, 64 | "configuration": { 65 | "model": "mistral-medium-latest", 66 | "max_seq_len": 8192, 67 | "api_path": "https://api.mistral.ai/v1/chat/completions", 68 | "stop_on": [] 69 | } 70 | }, 71 | { 72 | "label": "Mistral Large API", 73 | "routing_key": "mistral_large_api", 74 | "use": ["language_model"], 75 | "available_precision": { "cpu": ["full"] }, 76 | "memory_usage": { "full": 20 }, 77 | "configuration": { 78 | "model": "mistral-large-latest", 79 | "max_seq_len": 8192, 80 | "api_path": "https://api.mistral.ai/v1/chat/completions", 81 | "stop_on": [] 82 | } 83 | } 84 | ], 85 | "configuration": { 86 | "vault_path": "golem/llm_api", 87 | "options": [ 88 | { 89 | "name": "api_path", 90 | "label": "API Path", 91 | "editable": true, 92 | "type": "string", 93 | "default": "http://127.0.0.1:5000/v1/chat/completions" 94 | }, 95 | { 96 | "name": "api_key", 97 | "label": "API Key", 98 | "editable": true, 99 | "type": "secret", 100 | "default": "none" 101 | } 102 | ] 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /modules/noco-ai/llm-api/handler.py: -------------------------------------------------------------------------------- 1 | from application.llm_handler import LlmHandler 2 | from pika import BasicProperties 3 | import logging 4 | import time 5 | import json 6 | import requests 7 | import sseclient 8 | import tiktoken 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class ServerSideEventLlm(LlmHandler): 13 | def __init__(self): 14 | super().__init__() 15 | 16 | def validate(self, request): 17 | is_valid, errors = self.validate_request(request, 'llm') 18 | return is_valid, errors 19 | 20 | def get_token_count(self, input_text): 21 | enc = self.token_counter.encode(input_text) 22 | return len(enc) 23 | 24 | def update_config(self, config_data): 25 | current_config = self.model_config 26 | merged_config = {**current_config, **config_data} 27 | self.model_config = merged_config 28 | 29 | def clip_messages(self, request, config): 30 | clipped_messages = [] 31 | max_seq_len = config.get("max_seq_len", 2048) 32 | max_input_tokens = int(request.get("max_input_tokens", config.get("max_input_len", int(max_seq_len / 2)))) 33 | system_prompt_tokens = 0 34 | messages = request.get("messages", []) 35 | 
sys_prompt_in_request = False 36 | if len(messages) and messages[0]["role"] == "system": 37 | system_prompt_tokens = self.get_token_count(messages[0]["content"]) 38 | sys_prompt_in_request = True 39 | request_system_message = messages[0] 40 | del messages[0] 41 | 42 | input_token_count = system_prompt_tokens 43 | messages = messages[::-1] 44 | for index, message in enumerate(messages): 45 | token_count = self.get_token_count(message["content"]) 46 | if token_count + input_token_count > max_input_tokens: 47 | break 48 | 49 | input_token_count += token_count 50 | clipped_messages.append(message) 51 | 52 | clipped_messages = clipped_messages[::-1] 53 | if sys_prompt_in_request: 54 | clipped_messages.insert(0, request_system_message) 55 | 56 | return clipped_messages, input_token_count 57 | 58 | def execute(self, model, request): 59 | # this is not the correct tokenizer but will give a rough guess, will need to fix this at some point... 60 | model_name = "gpt-3.5-turbo" 61 | self.token_counter = tiktoken.encoding_for_model(model_name) 62 | clipped_messages, input_token_count = self.clip_messages(request, self.model_config) 63 | 64 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 65 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 66 | 67 | if debug: 68 | print('\033[94m') 69 | print(request) 70 | print('\033[0m') 71 | 72 | # make API request to OpenAI 73 | begin_time = time.time() 74 | config = self.model_config 75 | #check_stop_token, stop_conditions = self.build_stop_conditions(config["stop_on"]) 76 | url = self.model_config["api_path"] 77 | api_key = self.model_config["api_key"] 78 | model_name = self.model_config["model"] 79 | 80 | data = { 81 | "messages": clipped_messages, 82 | "max_tokens": max_new_tokens, 83 | "temperature": temperature, 84 | "top_p": top_p, 85 | "top_k": top_k, 86 | "min_p": min_p, 87 | "stream": True, 88 | } 89 | if seed != -1: 90 | data["seed"] = seed 91 | 92 | headers = { 93 | "Content-Type": "application/json" 94 | } 95 | 96 | verify_ssl = False 97 | is_claude = True if model_name.find("claude") != -1 else False 98 | is_mistral = True if model_name.find("mistral") != -1 else False 99 | if is_mistral: 100 | verify_ssl = True 101 | accept_header = "text/event-stream" if stream_output else "application/json" 102 | headers["Accept"] = accept_header 103 | headers["Authorization"] = f"Bearer {api_key}" 104 | headers["User-Agent"] = "elemental-golem/v3" 105 | data["model"] = self.model_config["model"] 106 | if "seed" in data: 107 | data["random_seed"] = seed 108 | del data["seed"] 109 | del data["min_p"] 110 | del data["top_k"] 111 | elif is_claude: 112 | verify_ssl = True 113 | if data["messages"][0]["role"] == "system": 114 | data["system"] = data["messages"][0]["content"] 115 | del data["messages"][0] 116 | del data["min_p"] 117 | data["model"] = self.model_config["model"] 118 | headers["x-api-key"] = api_key 119 | headers["anthropic-version"] = "2023-06-01" 120 | 121 | stream_response = requests.post(url, headers=headers, json=data, verify=verify_ssl, stream=True) 122 | if stream_response.status_code != 200: 123 | if stream_response.status_code == 401: 124 | raise Exception("Invalid API key") 125 | else: 126 | raise Exception("Failed to get response from API") 127 | 128 | client = sseclient.SSEClient(stream_response) 129 | channel = model["amqp_channel"] 130 | incoming_headers = model["amqp_headers"] 131 | 132 | # copy amqp headers 133 | response_str = "" 134 | finish_reason = 
"stop" 135 | new_tokens = 0 136 | outgoing_headers = {} 137 | for incoming_header in incoming_headers: 138 | if incoming_header in ["x-delay", "return_exchange", "return_routing_key"]: 139 | continue 140 | outgoing_headers[incoming_header] = incoming_headers[incoming_header] 141 | 142 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 143 | outgoing_headers["command"] = "prompt_fragment" if "stream_to_override" not in incoming_headers else incoming_headers["stream_to_override"] 144 | outgoing_properties = BasicProperties(headers=outgoing_headers) 145 | stop_generation_counter = 0 146 | 147 | for event in client.events(): 148 | 149 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 150 | model["stop_generation_event"], model["stop_generation_filter"], socket_id) 151 | 152 | if stop_generation: 153 | finish_reason = "abort" 154 | break 155 | 156 | if is_claude and event.event != "content_block_delta": 157 | continue 158 | 159 | chunk = "" 160 | try: 161 | payload = json.loads(event.data) 162 | if is_mistral: 163 | chunk = payload['choices'][0]['delta']['content'] 164 | elif is_claude: 165 | chunk = payload['delta']['text'] 166 | else: 167 | chunk = payload['choices'][0]['message']['content'] 168 | except: 169 | continue 170 | 171 | response_str += chunk 172 | new_tokens += 1 173 | if debug: 174 | print('\033[96m' + chunk, end="") 175 | 176 | if stream_output: 177 | channel.basic_publish( 178 | exchange=incoming_headers['return_exchange'], 179 | routing_key=incoming_headers['return_routing_key'], 180 | body=chunk, properties=outgoing_properties) 181 | 182 | end_time = time.time() 183 | elapsed = end_time - begin_time 184 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 185 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 186 | resp = self.finish_response(stop_key, response_str, request, stream_output, finish_reason, 187 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 188 | return resp 189 | 190 | 191 | def load(self, model, model_options, local_path): 192 | self.model_config = model["configuration"] 193 | self.model_config["api_key"] = model["secrets"]["api_key"] 194 | return { "model_name": "" } 195 | -------------------------------------------------------------------------------- /modules/noco-ai/music-generator/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "MusicGen", 3 | "description": "Handler for loading Meta MusicGen models.", 4 | "unique_key": "musicgen", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "MusicGen Small", 9 | "routing_key": "musicgen_small", 10 | "use": ["music_generation"], 11 | "available_precision": { "cpu": ["full"], "cuda": ["full"] }, 12 | "memory_usage": { "full": 3600 }, 13 | "model": [{ 14 | "name": "facebook/musicgen-small", 15 | "provider": "huggingface" 16 | }], 17 | "configuration": { 18 | "progress_label": "Generating Music" 19 | } 20 | }, 21 | { 22 | "label": "MusicGen Medium", 23 | "routing_key": "musicgen_medium", 24 | "use": ["music_generation"], 25 | "available_precision": { "cpu": ["full"], "cuda": ["full"] }, 26 | "memory_usage": { "full": 8500 }, 27 | "model": [{ 28 | "name": "facebook/musicgen-medium", 29 | "provider": "huggingface" 30 | }], 31 | "configuration": { 32 | "progress_label": "Generating Music" 33 | } 34 | }, 35 | { 36 | "label": "MusicGen Large", 37 | "routing_key": "musicgen_large", 
38 | "use": ["music_generation"], 39 | "available_precision": { "cpu": ["full"], "cuda": ["full"] }, 40 | "memory_usage": { "full": 13500 }, 41 | "model": [{ 42 | "name": "facebook/musicgen-large", 43 | "provider": "huggingface" 44 | }] , 45 | "configuration": { 46 | "progress_label": "Generating Music" 47 | } 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /modules/noco-ai/music-generator/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from application.progress_streamer import ProgressStreamer 3 | from transformers import AutoProcessor, MusicgenForConditionalGeneration 4 | import base64 5 | from io import BytesIO 6 | import scipy 7 | import copy 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class MusicGen(BaseHandler): 13 | def __init__(self): 14 | self.progress_streamer = ProgressStreamer() 15 | super().__init__() 16 | 17 | def validate(self, request): 18 | is_valid, errors = self.validate_request(request, 'audio-gen') 19 | return is_valid, errors 20 | 21 | def execute(self, model, request): 22 | prompt = request.get("prompt", "") # defaults to an empty string if "prompt" is not in request 23 | seconds = int(request.get("seconds", 5)) 24 | guidance_scale = int(request.get("guidance_scale", 3)) 25 | send_progress = request.get("progress", True) 26 | max_new_tokens = seconds * 52 27 | 28 | # prep headers for sending progress data 29 | if send_progress: 30 | progress_headers = copy.deepcopy(model["amqp_headers"]) 31 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 32 | amqp_config = { 33 | "headers": progress_headers, 34 | "outgoing_properties": outgoing_properties, 35 | "channel": model["amqp_channel"] 36 | } 37 | self.progress_streamer.configure(max_new_tokens, self.model_config["progress_label"], self.routing_key, amqp_config) 38 | else: 39 | self.progress_streamer.configure(max_new_tokens, self.model_config["progress_label"], self.routing_key) 40 | 41 | # Assuming the model function can take these parameters: 42 | logger.info(f"prompt: {prompt}, seconds: {seconds}, max new tokens: {max_new_tokens}, guidance scale: {guidance_scale}") 43 | inputs = model["processor"]( 44 | text=[prompt], 45 | padding=True, 46 | return_tensors="pt", 47 | ).to(model["device"]) 48 | audio_values = model["model"].generate(**inputs, do_sample=True, streamer=self.progress_streamer, guidance_scale=guidance_scale, max_new_tokens=max_new_tokens) 49 | 50 | # Save image to an in-memory bytes buffer 51 | buffered = BytesIO() 52 | sampling_rate = model["model"].config.audio_encoder.sampling_rate 53 | scipy.io.wavfile.write(buffered, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()) 54 | 55 | # Convert bytes buffer to a base64-encoded string 56 | wav_str = base64.b64encode(buffered.getvalue()).decode() 57 | return {"wav": wav_str} 58 | 59 | def load(self, model, model_options, local_path): 60 | self.model_config = model["configuration"] 61 | self.routing_key = model["routing_key"] 62 | processor = AutoProcessor.from_pretrained(local_path) 63 | load_model = MusicgenForConditionalGeneration.from_pretrained(local_path) 64 | 65 | return { 66 | "model": load_model, 67 | "processor": processor, 68 | "device": model_options["device"], 69 | "device_memory": model["memory_usage"][model_options["use_precision"]] 70 | } 71 | -------------------------------------------------------------------------------- 
/modules/noco-ai/sd-xl/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Stable Diffusion XL v1.0", 3 | "description": "Handler for loading Stable Diffusion XL v1.0 models.", 4 | "unique_key": "sd_xl_10", 5 | "script": "handler.py", 6 | "supported_gpu": ["nvidia"], 7 | "skills": [ 8 | { 9 | "label": "Stable Diffusion XL v1.0", 10 | "routing_key": "stable_diffusion_xl_v10", 11 | "use": ["image_generation"], 12 | "available_precision": { "cuda": ["full"] }, 13 | "memory_usage": { "full": 19000 }, 14 | "model": [{ 15 | "name": "stabilityai/stable-diffusion-xl-base-1.0", 16 | "provider": "huggingface" 17 | }, 18 | { 19 | "name": "stabilityai/stable-diffusion-xl-refiner-1.0", 20 | "provider": "huggingface" 21 | }], 22 | "configuration": { 23 | "is_turbo": false, 24 | "progress_label": "Generating Image" 25 | } 26 | }, 27 | { 28 | "label": "DreamShaper XL v1.0", 29 | "routing_key": "dream_shaper_xl_image_gen", 30 | "use": ["image_generation"], 31 | "available_precision": { "cuda": ["full"] }, 32 | "memory_usage": { "full": 19000 }, 33 | "model": [{ 34 | "name": "civitai/dreamshaper_xl/126688.safetensors", 35 | "provider": "civitai", 36 | "url": "https://civitai.com/api/download/models/126688?type=Model&format=SafeTensor&size=full&fp=fp16" 37 | }, 38 | { 39 | "name": "stabilityai/stable-diffusion-xl-refiner-1.0", 40 | "provider": "huggingface" 41 | }], 42 | "configuration": { 43 | "is_turbo": false, 44 | "progress_label": "Generating Image" 45 | } 46 | }, 47 | { 48 | "label": "Juggernaut XL", 49 | "routing_key": "juggernaut_xl_image_gen", 50 | "use": ["image_generation"], 51 | "available_precision": { "cuda": ["full"] }, 52 | "memory_usage": { "full": 19000 }, 53 | "model": [{ 54 | "name": "civitai/juggernaut_xl/240840.safetensors", 55 | "provider": "civitai", 56 | "url": "https://civitai.com/api/download/models/240840?type=Model&format=SafeTensor&size=full&fp=fp16" 57 | }, 58 | { 59 | "name": "stabilityai/stable-diffusion-xl-refiner-1.0", 60 | "provider": "huggingface" 61 | }], 62 | "configuration": { 63 | "is_turbo": false, 64 | "progress_label": "Generating Image" 65 | } 66 | }, 67 | { 68 | "label": "Stable Diffusion XL Turbo v1.0", 69 | "routing_key": "stable_diffusion_xl_trubo_v10", 70 | "use": ["image_generation"], 71 | "available_precision": { "cuda": ["full"] }, 72 | "memory_usage": { "full": 7800 }, 73 | "model": [{ 74 | "name": "stabilityai/sdxl-turbo", 75 | "provider": "huggingface" 76 | }], 77 | "configuration": { 78 | "is_turbo": true, 79 | "progress_label": "Generating Image" 80 | } 81 | }, 82 | { 83 | "label": "SD XL Turbo Unstable", 84 | "routing_key": "sd_xl_trubo_unstable", 85 | "use": ["image_generation"], 86 | "available_precision": { "cuda": ["full"] }, 87 | "memory_usage": { "full": 7800 }, 88 | "model": [{ 89 | "name": "civitai/sd_xl_trubo_unstable/247214.safetensors", 90 | "provider": "civitai", 91 | "url": "https://civitai.com/api/download/models/247214?type=Model&format=SafeTensor&size=full&fp=fp16" 92 | }], 93 | "configuration": { 94 | "is_turbo": true, 95 | "progress_label": "Generating Image" 96 | } 97 | } 98 | ] 99 | } -------------------------------------------------------------------------------- /modules/noco-ai/sd-xl/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | import torch 3 | import base64 4 | from io import BytesIO 5 | import logging 6 | import torch 7 | from diffusers import 
StableDiffusionXLPipeline, KDPM2DiscreteScheduler, DiffusionPipeline 8 | from compel import Compel, ReturnedEmbeddingsType 9 | import copy 10 | import json 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class StableDiffusionXl(BaseHandler): 15 | def __init__(self): 16 | super().__init__() 17 | 18 | def validate(self, request): 19 | is_valid, errors = self.validate_request(request, 'img-gen') 20 | return is_valid, errors 21 | 22 | def get_latents(self, num_images=1, height=1024, width=1024, user_seed=-1, device="cuda:0", model=None): 23 | latents = None 24 | generator = torch.Generator(device=device) 25 | if user_seed == -1: 26 | seed = generator.seed() 27 | else: 28 | seed = user_seed 29 | generator = generator.manual_seed(seed) 30 | 31 | latents = torch.randn( 32 | (num_images, model.unet.in_channels, height // 8, width // 8), 33 | generator = generator, 34 | device = device, 35 | dtype = torch.float16 36 | ) 37 | return { "seed": seed, "latents": latents } 38 | 39 | def step_callback(self, pipeline: DiffusionPipeline, step: int, timestep: int, callback_kwargs): 40 | if self.stream_progress == False: 41 | return callback_kwargs 42 | 43 | self.current_step = self.current_step + 1 44 | label = self.model_config["progress_label"] if "progress_label" in self.model_config else self.routing_key 45 | send_body = { 46 | "total": self.total_steps, 47 | "current": self.current_step, 48 | "label": label, 49 | "model": self.routing_key 50 | } 51 | 52 | self.amqp_progress_config["channel"].basic_publish( 53 | exchange=self.amqp_progress_config["headers"]['return_exchange'], 54 | routing_key=self.amqp_progress_config["headers"]['return_routing_key'], 55 | body=json.dumps(send_body), properties=self.amqp_progress_config["outgoing_properties"]) 56 | 57 | return callback_kwargs 58 | 59 | def execute(self, model, request): 60 | prompt = request.get("prompt", "") 61 | height = request.get("height", 1024) 62 | width = request.get("width", 1024) 63 | steps = request.get("steps", 50) 64 | seed = request.get("seed", -1) 65 | self.stream_progress = request.get("progress", False) 66 | negative_prompt = request.get("negative_prompt", "") 67 | guidance_scale = request.get("guidance_scale", 7.5) 68 | num_images_per_prompt = 1 69 | 70 | if self.model_config["is_turbo"] == True and steps > 4: 71 | guidance_scale = 0.0 72 | steps = 4 73 | 74 | high_noise_frac = 0.8 75 | if self.stream_progress == True: 76 | progress_headers = copy.deepcopy(model["amqp_headers"]) 77 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 78 | self.amqp_progress_config = { 79 | "headers": progress_headers, 80 | "outgoing_properties": outgoing_properties, 81 | "channel": model["amqp_channel"] 82 | } 83 | self.current_step = 0 84 | if self.model_config["is_turbo"] == False: 85 | self.total_steps = ((steps * high_noise_frac) * 2) + (steps * (1 - high_noise_frac)) 86 | else: 87 | self.total_steps = steps 88 | 89 | latent_data = self.get_latents(num_images_per_prompt, height, width, seed, self.model_options["device"], model["model"]) 90 | logger.info(f"prompt: {prompt}, height: {height}, width: {width}, steps: {steps}, guidance scale: {guidance_scale}, seed: {latent_data['seed']}") 91 | conditioning, pooled = model["compel"](prompt) 92 | negative_conditioning, negative_pooled = model["compel"](negative_prompt) 93 | 94 | if self.model_config["is_turbo"] == False: 95 | conditioning_refiner, pooled_refiner = model["compel_refiner"](prompt) 96 | negative_conditioning_refiner, negative_pooled_refiner = 
model["compel_refiner"](negative_prompt) 97 | base_image = model["model"](prompt_embeds=conditioning, pooled_prompt_embeds=pooled, height=height, width=width, num_inference_steps=steps, callback_on_step_end=self.step_callback, latents=latent_data["latents"], denoising_end=high_noise_frac, 98 | negative_prompt_embeds=negative_conditioning, negative_pooled_prompt_embeds=negative_pooled, guidance_scale=guidance_scale, num_images_per_prompt=num_images_per_prompt, output_type="latent").images 99 | image = model["refiner"](prompt_embeds=conditioning_refiner, pooled_prompt_embeds=pooled_refiner, negative_prompt_embeds=negative_conditioning_refiner, 100 | negative_pooled_prompt_embeds=negative_pooled_refiner, num_inference_steps=steps, denoising_start=high_noise_frac, image=base_image, callback_on_step_end=self.step_callback).images[0] 101 | else: 102 | image = model["model"](prompt_embeds=conditioning, pooled_prompt_embeds=pooled, height=height, width=width, num_inference_steps=steps, latents=latent_data["latents"], 103 | guidance_scale=guidance_scale, num_images_per_prompt=num_images_per_prompt, callback_on_step_end=self.step_callback).images[0] 104 | 105 | buffered = BytesIO() 106 | image.save(buffered, format="PNG") 107 | 108 | # Convert bytes buffer to a base64-encoded string 109 | img_str = base64.b64encode(buffered.getvalue()).decode() 110 | return {"image": img_str, "seed": latent_data["seed"], "guidance_scale": guidance_scale, "steps": steps } 111 | 112 | def load(self, model, model_options, local_path): 113 | self.model_options = model_options 114 | self.model_config = model["configuration"] 115 | self.routing_key = model["routing_key"] 116 | 117 | try: 118 | is_turbo = model["configuration"]["is_turbo"] 119 | if "civitai" not in local_path: 120 | logger.info("loading sd xl model") 121 | load_model = StableDiffusionXLPipeline.from_pretrained(local_path, torch_dtype=torch.float16, use_safetensors=True, variant="fp16") 122 | else: 123 | logger.info("loading civit sd xl model") 124 | load_model = StableDiffusionXLPipeline.from_single_file(local_path, torch_dtype=torch.float16, variant="fp16") 125 | 126 | compel = Compel( 127 | tokenizer=[load_model.tokenizer, load_model.tokenizer_2] , 128 | text_encoder=[load_model.text_encoder, load_model.text_encoder_2], 129 | returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, 130 | requires_pooled=[False, True] 131 | ) 132 | 133 | ret = { 134 | "model": load_model, 135 | "device": model_options["device"], 136 | "device_memory": model["memory_usage"][model_options["use_precision"]], 137 | "compel": compel 138 | } 139 | 140 | # load the refiner model 141 | if is_turbo == False: 142 | load_model.scheduler = KDPM2DiscreteScheduler.from_config(load_model.scheduler.config) 143 | logger.info("loading sd xl refiner") 144 | load_refiner = DiffusionPipeline.from_pretrained( 145 | "./data/models/stabilityai/stable-diffusion-xl-refiner-1.0", 146 | text_encoder_2=load_model.text_encoder_2, 147 | vae=load_model.vae, 148 | torch_dtype=torch.float16, 149 | use_safetensors=True, 150 | variant="fp16" 151 | ) 152 | load_refiner.to(model_options["device"]) 153 | ret["refiner"] = load_refiner 154 | 155 | compel_refiner = Compel( 156 | tokenizer=[load_refiner.tokenizer_2], 157 | text_encoder=[load_refiner.text_encoder_2], 158 | returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, 159 | requires_pooled=[True], 160 | ) 161 | ret["compel_refiner"] = compel_refiner 162 | 163 | return ret 164 | except 
Exception as e: 165 | print("error loading sdxl model") 166 | print(e) 167 | return { "error": True } 168 | -------------------------------------------------------------------------------- /modules/noco-ai/transformers-stream/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Transformers LLM", 3 | "description": "Handler for loading any models that are compatible with HuggingFace transformers. It has only been tested with Llama 2.", 4 | "unique_key": "transformers_llm", 5 | "script": "handler.py", 6 | "multi_gpu_support": true, 7 | "multi_gpu_configurable": false, 8 | "supported_gpu": ["nvidia"], 9 | "skills": [ 10 | { 11 | "label": "CodeLlama 7B Instruct 🤗", 12 | "routing_key": "llama_v2_code_instruct_7b", 13 | "use": ["language_model"], 14 | "available_precision": { "cuda": ["4-bit", "8-bit", "full"] }, 15 | "memory_usage": { "4-bit": 5500, "8-bit": 8500, "full": 27000 }, 16 | "shortcut": "💻", 17 | "moe_domain": [ 18 | "Systems Programming: Development of computer systems software.", 19 | "Computer Networking: Study of computer systems that are interconnected via network." 20 | ], 21 | "lora": [{ 22 | "name": "nocoai/function-hul-lora", 23 | "moe_domain": [ 24 | "Systems Programming: Development of computer systems software." 25 | ], 26 | "chat_history": -1 27 | }], 28 | "model": [{ 29 | "name": "codellama/CodeLlama-7B-Instruct-hf", 30 | "provider": "huggingface" 31 | }], 32 | "configuration": { 33 | "max_seq_len": 16384, 34 | "stop_on": ["", "[INST]"], 35 | "user_role": "[INST]", 36 | "ai_role": "[/INST]", 37 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 38 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 39 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 40 | } 41 | }, 42 | { 43 | "label": "CodeLlama 13B Instruct 🤗", 44 | "routing_key": "llama_v2_code_instruct_13b", 45 | "use": ["language_model"], 46 | "available_precision": { "cuda": ["4-bit"] }, 47 | "memory_usage": { "4-bit": 16100 }, 48 | "shortcut": "💻", 49 | "moe_domain": [ 50 | "Systems Programming: Development of computer systems software.", 51 | "Computer Networking: Study of computer systems that are interconnected via network." 52 | ], 53 | "model": [{ 54 | "name": "codellama/CodeLlama-13B-Instruct-hf", 55 | "provider": "huggingface" 56 | }], 57 | "configuration": { 58 | "max_seq_len": 16384, 59 | "stop_on": ["", "[INST]"], 60 | "user_role": "[INST]", 61 | "ai_role": "[/INST]", 62 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 63 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 64 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 
65 | } 66 | }, 67 | { 68 | "label": "CodeLlama 34B Instruct 🤗", 69 | "routing_key": "llama_v2_code_instruct_34b", 70 | "use": ["language_model"], 71 | "shortcut": "💻", 72 | "special_ability": ["coding"], 73 | "available_precision": { "cuda": ["4-bit"] }, 74 | "memory_usage": { "4-bit": 22000 }, 75 | "model": [{ 76 | "name": "codellama/CodeLlama-34b-Instruct-hf", 77 | "provider": "huggingface" 78 | }], 79 | "configuration": { 80 | "max_seq_len": 16384, 81 | "user_role": "[INST]", 82 | "ai_role": "[/INST]", 83 | "stop_on": ["", "[INST]"], 84 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 85 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 86 | } 87 | }, 88 | { 89 | "label": "CodeLlama 34B Python 🤗", 90 | "routing_key": "llama_v2_code_python_34b", 91 | "use": ["language_model"], 92 | "shortcut": "🐍", 93 | "special_ability": ["coding"], 94 | "available_precision": { "cuda": ["4-bit"] }, 95 | "memory_usage": { "4-bit": 22000 }, 96 | "model": [{ 97 | "name": "codellama/CodeLlama-34B-Python-hf", 98 | "provider": "huggingface" 99 | }], 100 | "configuration": { 101 | "max_seq_len": 16384, 102 | "stop_on": ["[INST]"], 103 | "user_role": "[INST]", 104 | "ai_role": "[/INST]", 105 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 106 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 107 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 108 | } 109 | }, 110 | { 111 | "label": "CodeLlama 34B Phind v2 🤗", 112 | "routing_key": "llama_v2_code_phind_v2", 113 | "use": ["language_model"], 114 | "available_precision": { "cuda": ["4-bit"] }, 115 | "memory_usage": { "4-bit": 22500 }, 116 | "shortcut": "💻", 117 | "special_ability": ["coding"], 118 | "moe_domain": [ 119 | "Systems Programming: Development of computer systems software" 120 | ], 121 | "model": [{ 122 | "name": "Phind/Phind-CodeLlama-34B-v2", 123 | "provider": "huggingface" 124 | }], 125 | "configuration": { 126 | "max_seq_len": 16384, 127 | "user_role": "### User Message\n", 128 | "ai_role": "### Assistant\n", 129 | "stop_on": ["", ""], 130 | "prompt_format": "{user_role}{prompt}\n\n{ai_role}{response}", 131 | "system_prompt_format": "### System Prompt:\n{system_prompt}\n\n", 132 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 133 | } 134 | } 135 | ], 136 | "configuration": { 137 | "vault_path": "golem/transformers_llm", 138 | "options": [ 139 | { 140 | "label": "System Message", 141 | "name": "system_message", 142 | "editable": true, 143 | "type": "textarea", 144 | "default": "A chat between a human and an assistant." 
145 | }, 146 | { 147 | "label": "Stop On", 148 | "name": "stop_on", 149 | "editable": true, 150 | "type": "multistring", 151 | "default": [""] 152 | } 153 | ] 154 | } 155 | } -------------------------------------------------------------------------------- /modules/noco-ai/transformers-stream/handler.py: -------------------------------------------------------------------------------- 1 | from transformers_stream_generator import init_stream_support 2 | init_stream_support() 3 | from application.llm_handler import LlmHandler 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig 5 | import torch 6 | import time 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class TransformersGenerator(LlmHandler): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def update_config(self, config_data): 16 | current_config = self.model_config 17 | merged_config = {**current_config, **config_data} 18 | self.model_config = merged_config 19 | 20 | def validate(self, request): 21 | is_valid, errors = self.validate_request(request, 'llm') 22 | return is_valid, errors 23 | 24 | def get_token_count(self, input_text): 25 | inputs = self.tokenizer(input_text, return_tensors="pt", add_special_tokens=False).to("cuda") 26 | return inputs["input_ids"].shape[1] 27 | 28 | def stream(self, generator, tokenizer, model, prompt, channel, incoming_headers, 29 | outgoing_properties, stops, request, model_data): 30 | 31 | # setup stop conditions 32 | check_stop_token, stop_conditions = self.build_stop_conditions(stops) 33 | 34 | # force this to false, token passed to check_stop_conditions not same format as other handlers 35 | check_stop_token = False 36 | 37 | # get starting time 38 | begin_time = time.time() 39 | 40 | # tokenize the prompt 41 | inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda") 42 | input_token_count = inputs["input_ids"].shape[1] 43 | 44 | # set max new tokens and other params 45 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 46 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 47 | if debug: 48 | print('\033[94m') 49 | print(request) 50 | print(prompt) 51 | print('\033[0m') 52 | 53 | generator = model.generate( 54 | inputs["input_ids"], 55 | max_new_tokens=max_new_tokens, 56 | do_sample=True, 57 | do_stream=True, 58 | top_p=top_p, 59 | top_k=top_k, 60 | eos_token_id=tokenizer.eos_token_id, 61 | temperature=temperature, 62 | ) 63 | 64 | # vars used in generation loop 65 | all_tokens = [] 66 | all_text = "" 67 | response = "" 68 | held_text = "" 69 | new_tokens = 0 70 | finish_reason = 'stop' 71 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 72 | stop_generation_counter = 0 73 | 74 | for token in generator: 75 | all_tokens.extend(token.tolist()) 76 | new_text = tokenizer.decode(all_tokens) 77 | new_chuck = new_text[len(all_text):] 78 | all_text += new_chuck 79 | new_tokens += 1 80 | 81 | if new_tokens >= max_new_tokens: 82 | finish_reason = 'length' 83 | break 84 | 85 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 86 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 87 | 88 | if stop_generation: 89 | finish_reason = "abort" 90 | break 91 | 92 | # check if we should hold off on streaming this text 93 | hold_text = False 94 | for stop_string in stop_conditions: 95 | if len(held_text) and 
stop_string.startswith(held_text.lower() + new_chuck.lower()): hold_text = True 96 | elif stop_string.startswith(new_chuck.lower()): hold_text = True 97 | 98 | if not hold_text: 99 | 100 | # send chunk to front end 101 | if stream_output: 102 | if debug: 103 | print('\033[96m' + new_chuck, end="") 104 | 105 | channel.basic_publish( 106 | exchange=incoming_headers['return_exchange'], 107 | routing_key=incoming_headers['return_routing_key'], 108 | body=new_chuck, properties=outgoing_properties) 109 | else: 110 | response += new_chuck 111 | 112 | held_text = "" 113 | else: 114 | held_text += new_chuck 115 | 116 | stop_condition = self.check_stop_conditions(token, held_text, tokenizer.eos_token_id, 117 | check_stop_token, stop_conditions) 118 | if stop_condition: break 119 | 120 | if debug and stream_output: 121 | print('\033[0m' + "") 122 | 123 | end_time = time.time() 124 | elapsed = end_time - begin_time 125 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 126 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 127 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 128 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 129 | 130 | def execute(self, model, request): 131 | config = self.model_config 132 | 133 | # build the prompt 134 | prompt = self.build_prompt(request, config, model) 135 | incoming_headers = model["amqp_headers"] 136 | outgoing_properties = self.copy_queue_headers(incoming_headers) 137 | 138 | # last string to send after done streaming output 139 | stream_resp = self.stream( 140 | model["generator"], 141 | model["tokenizer"], 142 | model["model_loaded"], 143 | prompt, 144 | model["amqp_channel"], 145 | incoming_headers, 146 | outgoing_properties, 147 | config["stop_on"], 148 | request, 149 | model) 150 | 151 | return stream_resp 152 | 153 | def load(self, model, model_options, local_path): 154 | self.model_config = model["configuration"] 155 | 156 | # get paths 157 | logger.info(f"starting module {local_path}") 158 | load_error = False 159 | try: 160 | tokenizer = AutoTokenizer.from_pretrained(local_path) 161 | quantization_config = None 162 | if model_options["use_precision"] != "full": 163 | if model_options["use_precision"] == "4-bit": 164 | quantization_config = BitsAndBytesConfig( 165 | load_in_4bit=True, 166 | bnb_4bit_compute_dtype=torch.float16 167 | ) 168 | else: 169 | quantization_config = BitsAndBytesConfig( 170 | load_in_8bit=True 171 | ) 172 | 173 | # this is not fully impelemented and should but a device map based off the split not auto 174 | device_map = "auto" if model_options["device"].startswith("split") else model_options["device"] 175 | load_model = AutoModelForCausalLM.from_pretrained( 176 | local_path, 177 | quantization_config=quantization_config, 178 | device_map=device_map 179 | ) 180 | self.tokenizer = tokenizer 181 | 182 | logger.info(f'skill {model["routing_key"]} loaded to {model_options["device"]}, precision: {model_options["use_precision"]}') 183 | return { "model_loaded": load_model, "generator": load_model, "tokenizer": tokenizer, "error": load_error } 184 | except Exception as e: 185 | logger.error(f"error loading model") 186 | print(e) 187 | load_error = True 188 | return { "error": load_error } -------------------------------------------------------------------------------- /modules/noco-ai/tts-api/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "xTTS Text 
to Speech", 3 | "description": "Handler for loading xTTS text to speech models.", 4 | "unique_key": "coqui_tts", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "XTTS v2", 9 | "routing_key": "xtts_v2_speech", 10 | "use": ["text_to_speech"], 11 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 12 | "memory_usage": { "full": 2800 }, 13 | "model": [{ 14 | "name": "coqui/XTTS-v2", 15 | "provider": "huggingface" 16 | }], 17 | "configuration": { 18 | "progress_label": "Generating Speech" 19 | } 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /modules/noco-ai/tts-api/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | import logging 3 | from TTS.tts.configs.xtts_config import XttsConfig 4 | from TTS.tts.models.xtts import Xtts 5 | import soundfile as sf 6 | import base64 7 | from io import BytesIO 8 | import requests 9 | import tempfile 10 | import os 11 | from urllib.parse import urlparse 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class XTTSHandler(BaseHandler): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def validate(self, request): 20 | is_valid, errors = self.validate_request(request, 'voice-gen') 21 | return is_valid, errors 22 | 23 | def is_valid_url(self, url): 24 | try: 25 | result = urlparse(url) 26 | return all([result.scheme, result.netloc]) 27 | except ValueError: 28 | return False 29 | 30 | def download_temp_file(self, url): 31 | response = requests.get(url) 32 | if response.status_code == 200: 33 | temp_file = tempfile.NamedTemporaryFile(delete=False) 34 | temp_file.write(response.content) 35 | temp_file.close() 36 | return temp_file.name 37 | else: 38 | return None 39 | 40 | def execute(self, model, request): 41 | prompt = request.get("prompt", "") 42 | voice_preset = request.get("voice", "default") 43 | prompt_length = len(prompt) 44 | 45 | temp_file_path = None 46 | if self.is_valid_url(voice_preset): 47 | temp_file_path = self.download_temp_file(voice_preset) 48 | if temp_file_path: 49 | voice_preset = temp_file_path 50 | else: 51 | voice_preset = model["default_wav"] 52 | else: 53 | voice_preset = model["default_wav"] 54 | 55 | logger.info(f"prompt: {prompt}, voice: {voice_preset}, length: {prompt_length}") 56 | 57 | outputs = model["loaded_model"].synthesize( 58 | prompt, 59 | model["config"], 60 | speaker_wav=voice_preset, 61 | gpt_cond_len=3, 62 | language="en", 63 | ) 64 | if temp_file_path: 65 | os.remove(temp_file_path) 66 | 67 | base64_encoded_wav = None 68 | with BytesIO() as wav_file: 69 | sf.write(wav_file, outputs["wav"], samplerate=22050, format='WAV') 70 | wav_file.seek(0) 71 | binary_wav = wav_file.read() 72 | base64_encoded_wav = base64.b64encode(binary_wav).decode() 73 | 74 | return {"wav": base64_encoded_wav } 75 | 76 | def load(self, model, model_options, local_path): 77 | self.model_config = model["configuration"] 78 | 79 | try: 80 | config = XttsConfig() 81 | config.load_json(f"{local_path}/config.json") 82 | loaded_model = Xtts.init_from_config(config) 83 | loaded_model.load_checkpoint(config, checkpoint_dir=local_path, eval=True) 84 | if model_options["device"] != "cpu": 85 | loaded_model.cuda(model_options["device"]) 86 | 87 | logger.setLevel(logging.INFO) 88 | return { 89 | "loaded_model": loaded_model, 90 | "config": config, 91 | "default_wav": f"{local_path}/samples/en_sample.wav" 92 | } 93 | except Exception as e: 94 | print(f"error loading xtts 
model") 95 | print(e) 96 | return { "error": True } 97 | -------------------------------------------------------------------------------- /modules/openai/chat-api/chat-api.py: -------------------------------------------------------------------------------- 1 | from application.llm_handler import LlmHandler 2 | from pika import BasicProperties 3 | from openai import OpenAI 4 | import logging 5 | import tiktoken 6 | import time 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class OpenAIChatApi(LlmHandler): 11 | def __init__(self): 12 | super().__init__() 13 | 14 | def validate(self, request): 15 | is_valid, errors = self.validate_request(request, 'llm') 16 | return is_valid, errors 17 | 18 | def get_token_count(self, input_text): 19 | enc = self.token_counter.encode(input_text) 20 | return len(enc) 21 | 22 | def update_config(self, config_data): 23 | current_config = self.model_config 24 | merged_config = {**current_config, **config_data} 25 | client = OpenAI( 26 | api_key=merged_config["token"] 27 | ) 28 | self.client = client 29 | self.model_config = merged_config 30 | 31 | 32 | def clip_messages(self, request, config): 33 | clipped_messages = [] 34 | messages, system_prompt_tokens, request_system_message, system_prompt, sys_prompt_in_request, max_input_tokens = self._prep_prompt(request, config) 35 | input_token_count = system_prompt_tokens 36 | 37 | for index, message in enumerate(messages): 38 | token_count = self.get_token_count(message["content"]) 39 | if token_count + input_token_count > max_input_tokens: 40 | break 41 | 42 | input_token_count += token_count 43 | clipped_messages.append(message) 44 | 45 | clipped_messages = clipped_messages[::-1] 46 | if sys_prompt_in_request: 47 | clipped_messages.insert(0, request_system_message) 48 | 49 | return clipped_messages, input_token_count 50 | 51 | def execute(self, model, request): 52 | 53 | self.token_counter = tiktoken.encoding_for_model(model["model_name"]) 54 | clipped_messages, input_token_count = self.clip_messages(request, self.model_config) 55 | if clipped_messages == None: 56 | return None 57 | 58 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 59 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 60 | if debug: 61 | print('\033[94m') 62 | print(request) 63 | print('\033[0m') 64 | 65 | # make API request to OpenAI 66 | begin_time = time.time() 67 | config = self.model_config 68 | 69 | print(f'sending request to openai api') 70 | check_stop_token, stop_conditions = self.build_stop_conditions(config["stop_on"]) 71 | response = self.client.chat.completions.create( 72 | model=model["model_name"], 73 | stream=stream_output, 74 | messages=clipped_messages, 75 | temperature=temperature, 76 | max_tokens=max_new_tokens, 77 | stop=stop_conditions, 78 | presence_penalty=config.get("presence_penalty", 0), 79 | frequency_penalty=config.get("frequency_penalty", 0), 80 | top_p=top_p 81 | ) 82 | 83 | channel = model["amqp_channel"] 84 | incoming_headers = model["amqp_headers"] 85 | 86 | # copy amqp headers 87 | outgoing_headers = {} 88 | for incoming_header in incoming_headers: 89 | if incoming_header in ["x-delay", "return_exchange", "return_routing_key"]: 90 | continue 91 | outgoing_headers[incoming_header] = incoming_headers[incoming_header] 92 | 93 | response_str = "" 94 | finish_reason = "stop" 95 | new_tokens = 0 96 | if stream_output: 97 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 98 | 
outgoing_headers["command"] = "prompt_fragment" if "stream_to_override" not in incoming_headers else incoming_headers["stream_to_override"] 99 | outgoing_properties = BasicProperties(headers=outgoing_headers) 100 | stop_generation_counter = 0 101 | 102 | for chunk in response: 103 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 104 | model["stop_generation_event"], model["stop_generation_filter"], socket_id) 105 | 106 | if stop_generation: 107 | finish_reason = "abort" 108 | break 109 | 110 | new_tokens += 1 111 | if chunk.choices[0].delta.content == None: 112 | continue 113 | 114 | if debug: 115 | print('\033[96m' + chunk.choices[0].delta.content, end="") 116 | 117 | response_str += chunk.choices[0].delta.content 118 | channel.basic_publish( 119 | exchange=incoming_headers['return_exchange'], 120 | routing_key=incoming_headers['return_routing_key'], 121 | body=chunk.choices[0].delta.content, properties=outgoing_properties) 122 | 123 | if debug: 124 | print('\033[0m' + "") 125 | else: 126 | response_str = response.choices[0].message.content 127 | new_tokens = response.usage.completion_tokens 128 | 129 | end_time = time.time() 130 | elapsed = end_time - begin_time 131 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 132 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 133 | request["start_response"] = "" 134 | resp = self.finish_response(stop_key, response_str, request, stream_output, finish_reason, 135 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 136 | return resp 137 | 138 | def load(self, model, model_options, local_path): 139 | self.model_config = model["configuration"] 140 | client = OpenAI( 141 | api_key=model["secrets"]["token"] 142 | ) 143 | self.client = client 144 | return { "model_name": model["configuration"]["model"], "client": client } 145 | -------------------------------------------------------------------------------- /modules/openai/chat-api/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Open AI Chat", 3 | "description": "Handler for running OpenAI models using their API.", 4 | "unique_key": "openai_chat", 5 | "script": "chat-api.py", 6 | "skills": [ 7 | { 8 | "label": "OpenAI GPT 3.5", 9 | "routing_key": "openai_gpt_35", 10 | "use": ["reasoning_agent"], 11 | "available_precision": { "cpu": ["full"] }, 12 | "memory_usage": { "full": 20 }, 13 | "configuration": { 14 | "model": "gpt-3.5-turbo", 15 | "max_seq_len": 4096, 16 | "stop_on": [] 17 | }, 18 | "shortcut": "⚡" 19 | }, 20 | { 21 | "label": "OpenAI GPT 4", 22 | "routing_key": "openai_gpt_4", 23 | "use": ["reasoning_agent"], 24 | "available_precision": { "cpu": ["full"] }, 25 | "memory_usage": { "full": 20 }, 26 | "configuration": { 27 | "model": "gpt-4", 28 | "max_seq_len": 8192, 29 | "stop_on": [] 30 | }, 31 | "shortcut": "✨" 32 | } 33 | ], 34 | "configuration": { 35 | "vault_path": "golem/openai", 36 | "options": [ 37 | { 38 | "name": "token", 39 | "label": "API Token", 40 | "editable": true, 41 | "type": "secret", 42 | "default": "" 43 | }, 44 | { 45 | "name": "max_seq_len", 46 | "label": "Max Context Length", 47 | "type": "slider", 48 | "min": 512, 49 | "max": 16384, 50 | "default": 4096 51 | }, 52 | { 53 | "name": "frequency_penalty", 54 | "label": "Frequency Penalty", 55 | "type": "slider", 56 | "min": -2, 57 | "max": 2, 58 | "step": 0.01, 59 | "default": 0 60 | }, 61 | { 62 | "name": "presence_penalty", 63 | 
"label": "Presence Penalty", 64 | "type": "slider", 65 | "min": -2, 66 | "max": 2, 67 | "step": 0.01, 68 | "default": 0 69 | } 70 | ] 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /modules/openai/dalle/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Open AI DALL-E", 3 | "description": "Handler for running OpenAI image generation models using their API.", 4 | "unique_key": "openai_dalle", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "OpenAI DALL-E 3", 9 | "routing_key": "openai_dalle_3", 10 | "use": ["image_generation"], 11 | "available_precision": { "cpu": ["full"] }, 12 | "memory_usage": { "full": 20 }, 13 | "configuration": { 14 | "model": "dall-e-3" 15 | } 16 | }, 17 | { 18 | "label": "OpenAI DALL-E 2", 19 | "routing_key": "openai_dalle_2", 20 | "use": ["image_generation"], 21 | "available_precision": { "cpu": ["full"] }, 22 | "memory_usage": { "full": 20 }, 23 | "configuration": { 24 | "model": "dall-e-2" 25 | } 26 | } 27 | ], 28 | "configuration": { 29 | "vault_path": "golem/openai", 30 | "options": [ 31 | { 32 | "name": "token", 33 | "label": "API Token", 34 | "editable": true, 35 | "type": "secret", 36 | "default": "" 37 | } 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /modules/openai/dalle/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from pika import BasicProperties 3 | from openai import OpenAI 4 | import logging 5 | import time 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class OpenAIImageGeneration(BaseHandler): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def validate(self, request): 14 | is_valid, errors = self.validate_request(request, 'img-gen') 15 | return is_valid, errors 16 | 17 | def update_config(self, config_data): 18 | current_config = self.model_config 19 | merged_config = {**current_config, **config_data} 20 | client = OpenAI( 21 | api_key=merged_config["token"] 22 | ) 23 | self.client = client 24 | self.model_config = merged_config 25 | 26 | def execute(self, model, request): 27 | prompt = request.get("prompt", "") 28 | height = request.get("height", 1024) 29 | width = request.get("width", 1024) 30 | 31 | if height == 512 or width == 512: 32 | size = "512x512" 33 | else: 34 | size = "1024x1024" 35 | 36 | if self.model_config["model"] == "dall-e-2": 37 | size = "512x512" 38 | 39 | logger.info(f"generating image using {self.model_config['model']}") 40 | response = self.client.images.generate( 41 | model=self.model_config["model"], 42 | prompt=prompt, 43 | size=size, 44 | quality="standard", 45 | response_format="b64_json", 46 | n=1, 47 | ) 48 | return {"image": response.data[0].b64_json, "seed": 0, "guidance_scale": 0, "steps": 1 } 49 | 50 | def load(self, model, model_options, local_path): 51 | self.model_config = model["configuration"] 52 | client = OpenAI( 53 | api_key=model["secrets"]["token"] 54 | ) 55 | self.client = client 56 | return { "model_name": model["configuration"]["model"], "client": client } 57 | -------------------------------------------------------------------------------- /modules/salesforce/blip2-opt/blip2-opt.py: -------------------------------------------------------------------------------- 1 | from transformers import Blip2Processor, Blip2ForConditionalGeneration 2 | from application.base_handler import BaseHandler 3 | 
from PIL import Image 4 | import requests 5 | import torch 6 | 7 | class Blip2Opt27b(BaseHandler): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def validate(self, request): 12 | is_valid, errors = self.validate_request(request, 'visual-qa') 13 | return is_valid, errors 14 | 15 | def execute(self, model, request): 16 | img_url = request["img_url"] 17 | image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 18 | prompt = request["text"] 19 | 20 | # build the input tensor 21 | if self.use_precision == 'half': 22 | inputs = model["processor"](images=image, text=prompt, return_tensors="pt").to(model["device"], torch.float16) 23 | else: 24 | inputs = model["processor"](images=image, text=prompt, return_tensors="pt").to(model["device"]) 25 | 26 | generated_ids = model["model"].generate(**inputs, max_new_tokens=256) 27 | generated_text = model["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0].strip() 28 | return {"text":generated_text} 29 | 30 | def load(self, model, model_options, local_path): 31 | 32 | # load the processor 33 | processor = Blip2Processor.from_pretrained(local_path) 34 | self.use_precision = model_options["use_precision"] 35 | 36 | # load the model 37 | if self.use_precision == "full": 38 | blip2_model = Blip2ForConditionalGeneration.from_pretrained(local_path) 39 | elif model_options["use_precision"] == "half": 40 | blip2_model = Blip2ForConditionalGeneration.from_pretrained(local_path, torch_dtype=torch.float16) 41 | 42 | return {"model": blip2_model, "device": model_options["device"], "processor": processor, "device_memory": model["memory_usage"][self.use_precision]} -------------------------------------------------------------------------------- /modules/salesforce/blip2-opt/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Salesforce Blip2", 3 | "description": "Handler for loading Salesforce Blip 2 models.", 4 | "unique_key": "salesforce_blip2", 5 | "script": "blip2-opt.py", 6 | "skills": [ 7 | { 8 | "name": "Salesforce/blip2-opt-2.7b", 9 | "label": "BLIP v2 OPT 2.7b", 10 | "routing_key": "blip2_opt_27b", 11 | "use": ["visual_qa"], 12 | "available_precision": { "cuda": ["full", "half"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 16100, "half": 8900 }, 14 | "model": [{ 15 | "name": "Salesforce/blip2-opt-2.7b", 16 | "provider": "huggingface" 17 | }] 18 | } 19 | ] 20 | } -------------------------------------------------------------------------------- /modules/turboderp/exllama/golem-generator.py: -------------------------------------------------------------------------------- 1 | from application.llm_handler import LlmHandler 2 | import sys 3 | import os 4 | import glob 5 | import time 6 | import logging 7 | import math 8 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 9 | from model import ExLlama, ExLlamaCache, ExLlamaConfig 10 | from tokenizer import ExLlamaTokenizer 11 | from generator import ExLlamaGenerator 12 | from lora import ExLlamaLora 13 | from application.system_info import get_gpu_memory_usage 14 | from huggingface_hub import snapshot_download 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class GolemExLlamaGenerator(LlmHandler): 19 | def __init__(self): 20 | super().__init__() 21 | self.loras = {} 22 | 23 | def update_config(self, config_data): 24 | current_config = self.model_config 25 | merged_config = {**current_config, **config_data} 26 | self.model_config = merged_config 27 | 28 | def validate(self, request): 29 | 
is_valid, errors = self.validate_request(request, 'llm') 30 | return is_valid, errors 31 | 32 | def get_token_count(self, input_text): 33 | ids = self.generator.tokenizer.encode(input_text) 34 | input_token_count = len(ids[0]) 35 | return input_token_count 36 | 37 | def stream(self, generator, tokenizer, model, prompt, channel, incoming_headers, 38 | outgoing_properties, stops, model_data, request): 39 | 40 | # setup stop conditions 41 | check_stop_token, stop_conditions = self.build_stop_conditions(stops) 42 | 43 | res_line = "" 44 | held_text = "" 45 | response = "" 46 | unicode_hold = False 47 | finish_reason = "stop" 48 | stop_condition = False 49 | new_tokens = 0 50 | stop_generation_counter = 0 51 | ids = generator.tokenizer.encode(prompt) 52 | input_token_count = len(ids[0]) 53 | 54 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 55 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 56 | 57 | if debug: 58 | print('\033[94m') 59 | print(request) 60 | print(prompt) 61 | print('\033[0m') 62 | 63 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 64 | generator.settings.temperature = temperature 65 | generator.settings.top_p = top_p 66 | begin_time = time.time() 67 | 68 | if "start_response" in request and stream_output: 69 | channel.basic_publish( 70 | exchange=incoming_headers['return_exchange'], 71 | routing_key=incoming_headers['return_routing_key'], 72 | body=request["start_response"], properties=outgoing_properties) 73 | 74 | generator.gen_begin(ids) 75 | generator.begin_beam_search() 76 | for i in range(max_new_tokens): 77 | new_tokens += 1 78 | 79 | # check if stop generation was requested 80 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 81 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 82 | 83 | if stop_generation: 84 | finish_reason = "abort" 85 | break 86 | 87 | token = generator.beam_search() 88 | prev_res_line = res_line 89 | res_line = tokenizer.decode(generator.sequence_actual[0, -new_tokens:]) 90 | new_text = res_line[len(prev_res_line):] 91 | 92 | # new text 93 | chunk = held_text + new_text 94 | 95 | # check if we should hold off on streaming this text 96 | hold_text = False 97 | for stop_string in stop_conditions: 98 | if stop_string.startswith(chunk.lower()): hold_text = True 99 | 100 | if len(res_line): 101 | check_ord = ord(res_line[-1]) 102 | if check_ord == 65533 or check_ord == 55356 or check_ord == 55357: 103 | hold_text = True 104 | unicode_hold = True 105 | 106 | if not hold_text: 107 | if unicode_hold is True: 108 | unicode_hold = False 109 | chunk = res_line[-1:] 110 | 111 | # send chunk to front end 112 | if stream_output: 113 | if debug: 114 | print('\033[96m' + chunk, end="") 115 | 116 | channel.basic_publish( 117 | exchange=incoming_headers['return_exchange'], 118 | routing_key=incoming_headers['return_routing_key'], 119 | body=chunk, properties=outgoing_properties) 120 | else: 121 | response += chunk 122 | 123 | prompt += chunk 124 | held_text = "" 125 | else: 126 | held_text += new_text 127 | 128 | # check stop conditions 129 | stop_condition = self.check_stop_conditions(token, res_line, tokenizer.eos_token_id, 130 | check_stop_token, stop_conditions) 131 | if stop_condition: break 132 | 133 | end_time = time.time() 134 | elapsed = end_time - begin_time 135 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 136 | 
generator.end_beam_search() 137 | 138 | if debug and stream_output: 139 | print('\033[0m' + "") 140 | 141 | if new_tokens == max_new_tokens: 142 | finish_reason = "length" 143 | 144 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 145 | resp = self.finish_response(stop_key, response, request, stream_output, finish_reason, 146 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 147 | return resp 148 | 149 | def load_lora(self, request, model, config): 150 | # load lora from config and override w/ request if present 151 | lora_name = config["default_lora"] if "default_lora" in config else None 152 | if "lora" in request: 153 | lora_name = request["lora"] 154 | 155 | if lora_name != None: 156 | if lora_name not in self.loras: 157 | logger.info(f"loading lora {lora_name}") 158 | lora_dir = os.path.join(f"data/loras/", lora_name) 159 | if not os.path.exists(lora_dir): 160 | logger.info("downloading lora {lora_name} from huggingface") 161 | snapshot_download(repo_id=lora_name, local_dir=lora_dir, cache_dir='data/cache', local_dir_use_symlinks=False) 162 | 163 | lora_path = os.path.join(f"data/loras/", lora_name, "adapter_model.bin") 164 | lora_config_path = os.path.join(f"data/loras/{lora_name}", "adapter_config.json") 165 | 166 | lora = ExLlamaLora(model["model_loaded"], lora_config_path, lora_path) 167 | self.loras[lora_name] = lora 168 | else: 169 | logger.info(f"using lora {lora_name}") 170 | 171 | model["generator"].lora = self.loras[lora_name] 172 | else: 173 | model["generator"].lora = None 174 | 175 | def execute(self, model, request): 176 | # load lora 177 | config = self.model_config 178 | self.load_lora(request, model, config) 179 | 180 | # build prompt 181 | prompt = self.build_prompt(request, config, model) 182 | 183 | # copy amqp headers 184 | incoming_headers = model["amqp_headers"] 185 | outgoing_properties = self.copy_queue_headers(incoming_headers) 186 | 187 | stream_resp = self.stream( 188 | model["generator"], 189 | model["tokenizer"], 190 | model["model_loaded"], 191 | prompt, 192 | model["amqp_channel"], 193 | incoming_headers, 194 | outgoing_properties, 195 | config["stop_on"], 196 | model, 197 | request) 198 | 199 | return stream_resp 200 | 201 | def load(self, model, model_options, local_path): 202 | self.model_config = model["configuration"] 203 | 204 | # get paths 205 | logger.info(f"starting module {local_path}") 206 | tokenizer_path = os.path.join(local_path, "tokenizer.model") 207 | model_config_path = os.path.join(local_path, "config.json") 208 | st_pattern = os.path.join(local_path, "*.safetensors") 209 | model_path = glob.glob(st_pattern)[0] 210 | 211 | # Create config, model, tokenizer and generator 212 | config = ExLlamaConfig(model_config_path) 213 | config.model_path = model_path 214 | config.compress_pos_emb = model["configuration"].get("compress_pos_emb", 1.0) 215 | config.max_seq_len = model["configuration"].get("max_seq_len", 2048) 216 | config.matmul_recons_thd = 8 217 | config.fused_mlp_thd = 2 218 | config.sdp_thd = 8 219 | 220 | # set model device 221 | if model_options["device"].startswith("split"): 222 | device_map = model_options["device"].split(':')[1] 223 | config.set_auto_map(device_map) 224 | elif model_options["device"].startswith("cuda"): 225 | device_number = int(model_options["device"].split(':')[1]) 226 | device_array = [0]*12 227 | used_memory, free_memory, total_memory = get_gpu_memory_usage(device_number) 228 | device_array[device_number] = math.floor(total_memory 
/ 1024) 229 | last_non_zero = len(device_array) - 1 230 | while last_non_zero > 0 and device_array[last_non_zero] == 0: 231 | last_non_zero -= 1 232 | device_array = device_array[:last_non_zero + 1] 233 | device_map = ','.join(map(str, device_array)) 234 | config.set_auto_map(device_map) 235 | 236 | load_error = False 237 | try: 238 | load_model = ExLlama(config) 239 | tokenizer = ExLlamaTokenizer(tokenizer_path) 240 | cache = ExLlamaCache(load_model) 241 | generator = ExLlamaGenerator(load_model, tokenizer, cache) 242 | 243 | # Configure generator 244 | self.generator = generator 245 | generator.settings.min_p = 0.0 246 | generator.settings.top_k = 0 247 | generator.settings.typical = 0.25 248 | generator.settings.token_repetition_penalty_max = 1.15 249 | generator.settings.token_repetition_penalty_sustain = 2048 250 | generator.settings.token_repetition_penalty_decay = 512 251 | 252 | logger.info(f'skill {model["routing_key"]} loaded to {model_options["device"]}') 253 | return { "model_loaded": load_model, "generator": generator, "tokenizer": tokenizer, "error": load_error } 254 | except Exception as e: 255 | logger.error(f"error loading model") 256 | load_error = True 257 | print(e) 258 | return { "error": load_error } -------------------------------------------------------------------------------- /modules/turboderp/exllamav2/handler.py: -------------------------------------------------------------------------------- 1 | from exllamav2 import ( 2 | ExLlamaV2, 3 | ExLlamaV2Config, 4 | ExLlamaV2Cache, 5 | ExLlamaV2Tokenizer, 6 | ExLlamaV2Lora 7 | ) 8 | from exllamav2.generator import ( 9 | ExLlamaV2StreamingGenerator, 10 | ExLlamaV2Sampler 11 | ) 12 | from application.system_info import get_gpu_memory_usage 13 | from huggingface_hub import snapshot_download 14 | from application.llm_handler import LlmHandler 15 | import torch 16 | import time 17 | import logging 18 | import sys 19 | import os 20 | import math 21 | import random 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | class ExllamaV2Generator(LlmHandler): 26 | def __init__(self): 27 | super().__init__() 28 | self.loras = {} 29 | 30 | def update_config(self, config_data): 31 | current_config = self.model_config 32 | merged_config = {**current_config, **config_data} 33 | self.model_config = merged_config 34 | 35 | def validate(self, request): 36 | is_valid, errors = self.validate_request(request, 'llm') 37 | return is_valid, errors 38 | 39 | def get_token_count(self, input_text): 40 | input_ids = self.tokenizer.encode(input_text) 41 | return input_ids.shape[-1] 42 | 43 | def stream(self, generator, tokenizer, model, prompt, channel, incoming_headers, 44 | outgoing_properties, stops, request, model_data, lora): 45 | 46 | # setup stop conditions 47 | check_stop_token, stop_conditions = self.build_stop_conditions(stops) 48 | 49 | # get starting time 50 | begin_time = time.time() 51 | 52 | # tokenize the prompt 53 | input_ids = tokenizer.encode(prompt) 54 | input_token_count = input_ids.shape[-1] 55 | 56 | # set max new tokens and other params 57 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 58 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 59 | 60 | if debug: 61 | print('\033[94m') 62 | print(request) 63 | print(prompt) 64 | print('\033[0m') 65 | 66 | if check_stop_token: 67 | stop_conditions.append(tokenizer.eos_token_id) 68 | 69 | if seed != -1: random.seed(seed) 70 | generator.warmup() 71 | 
generator.set_stop_conditions(stop_conditions) 72 | settings = ExLlamaV2Sampler.Settings() 73 | settings.temperature = temperature 74 | settings.top_k = top_k 75 | settings.top_p = top_p 76 | settings.min_p = min_p 77 | if mirostat != 0: 78 | settings.mirostat = True 79 | settings.mirostat_tau = mirostat_tau 80 | settings.mirostat_eta = mirostat_eta 81 | 82 | #settings.token_repetition_penalty = 1.05 83 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 84 | 85 | if "start_response" in request and stream_output: 86 | channel.basic_publish( 87 | exchange=incoming_headers['return_exchange'], 88 | routing_key=incoming_headers['return_routing_key'], 89 | body=request["start_response"], properties=outgoing_properties) 90 | finish_reason = "stop" 91 | generated_tokens = 0 92 | stop_generation_counter = 0 93 | generator.begin_stream(input_ids, settings, loras = lora) 94 | response = "" 95 | while True: 96 | chunk, eos, _ = generator.stream() 97 | if eos: break 98 | 99 | generated_tokens += 1 100 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 101 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 102 | if stop_generation: 103 | finish_reason = "abort" 104 | break 105 | 106 | if generated_tokens >= max_new_tokens: 107 | finish_reason = 'length' 108 | break 109 | 110 | # send chunk to front end 111 | if stream_output: 112 | if debug: 113 | print('\033[96m' + chunk, end="") 114 | sys.stdout.flush() 115 | 116 | channel.basic_publish( 117 | exchange=incoming_headers['return_exchange'], 118 | routing_key=incoming_headers['return_routing_key'], 119 | body=chunk, properties=outgoing_properties) 120 | else: 121 | response += chunk 122 | 123 | if debug and stream_output: 124 | print('\033[0m' + "") 125 | 126 | # finish_reason keeps "abort" or "length" if the loop exited early 127 | end_time = time.time() 128 | elapsed = end_time - begin_time 129 | token_rate = 0 if elapsed == 0 else (generated_tokens / elapsed) 130 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 131 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 132 | token_rate, generated_tokens, input_token_count, model_name, elapsed, debug) 133 | 134 | 135 | def load_lora(self, request, model, config): 136 | 137 | # load lora from config and override w/ request if present 138 | lora_name = config["default_lora"] if "default_lora" in config else None 139 | if "lora" in request: 140 | lora_name = request["lora"] 141 | 142 | if lora_name is not None: 143 | if lora_name not in self.loras: 144 | 145 | logger.info(f"loading lora {lora_name}") 146 | lora_dir = os.path.join(f"data/loras/", lora_name) 147 | if not os.path.exists(lora_dir): 148 | logger.info(f"downloading lora {lora_name} from huggingface") 149 | snapshot_download(repo_id=lora_name, local_dir=lora_dir, cache_dir='data/cache', local_dir_use_symlinks=False) 150 | 151 | lora = ExLlamaV2Lora.from_directory(model["model_loaded"], lora_dir) 152 | self.loras[lora_name] = lora 153 | else: 154 | logger.info(f"using lora {lora_name}") 155 | 156 | return self.loras[lora_name] 157 | 158 | return None 159 | 160 | def execute(self, model, request): 161 | config = self.model_config 162 | 163 | # build the prompt 164 | prompt = self.build_prompt(request, config, model) 165 | 166 | incoming_headers = model["amqp_headers"] 166 | outgoing_properties = self.copy_queue_headers(incoming_headers) 167 | 168 | # lora code 169 | lora = self.load_lora(request, model,
self.model_config) 170 | 171 | # last string to send after done streaming output 172 | stream_resp = self.stream( 173 | model["generator"], 174 | model["tokenizer"], 175 | model["model_loaded"], 176 | prompt, 177 | model["amqp_channel"], 178 | incoming_headers, 179 | outgoing_properties, 180 | config["stop_on"], 181 | request, 182 | model, 183 | lora) 184 | 185 | return stream_resp 186 | 187 | def load(self, model, model_options, local_path): 188 | self.model_config = model["configuration"] 189 | load_error = False 190 | try: 191 | model_path = local_path 192 | if "branch" in model["model"][0] and model_options["use_precision"] in model["model"][0]["branch"]: 193 | branch_path = model["model"][0]["branch"][model_options["use_precision"]] 194 | model_path = f"{local_path}/{branch_path}" 195 | 196 | config = ExLlamaV2Config() 197 | config.model_dir = model_path 198 | config.prepare() 199 | 200 | if model_options["device"].startswith("split"): 201 | device_map = model_options["device"].split(':')[1].split(",") 202 | device_map = list(map(int, device_map)) 203 | elif model_options["device"].startswith("cuda"): 204 | device_number = int(model_options["device"].split(':')[1]) 205 | device_array = [0]*12 206 | used_memory, free_memory, total_memory = get_gpu_memory_usage(device_number) 207 | device_array[device_number] = math.floor(total_memory / 1024) 208 | last_non_zero = len(device_array) - 1 209 | while last_non_zero > 0 and device_array[last_non_zero] == 0: 210 | last_non_zero -= 1 211 | device_array = device_array[:last_non_zero + 1] 212 | device_map = device_array 213 | 214 | logger.info(f"starting module {model_path}") 215 | model_loaded = ExLlamaV2(config) 216 | model_loaded.load(gpu_split=device_map) 217 | cache = ExLlamaV2Cache(model_loaded) 218 | tokenizer = ExLlamaV2Tokenizer(config) 219 | generator = ExLlamaV2StreamingGenerator(model_loaded, cache, tokenizer) 220 | self.tokenizer = tokenizer 221 | 222 | logger.info(f'skill {model["routing_key"]} loaded to {model_options["device"]}, precision: {model_options["use_precision"]}') 223 | return { "model_loaded": model_loaded, "generator": generator, "tokenizer": tokenizer, "error": load_error } 224 | except Exception as e: 225 | logger.error(f"error loading model") 226 | print(e) 227 | load_error = True 228 | return { "error": load_error } -------------------------------------------------------------------------------- /requirements-nogpu.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.0 2 | torchaudio==2.1.0 3 | safetensors==0.3.2 4 | sentencepiece>=0.1.97 5 | ninja==1.11.1 6 | tiktoken==0.3.3 7 | numpy==1.22 8 | ninja 9 | hvac 10 | pynvml 11 | psutil 12 | pika 13 | transformers 14 | bitsandbytes 15 | scipy 16 | transformers-stream-generator 17 | jsonschema 18 | omegaconf 19 | Pillow 20 | einops 21 | protobuf 22 | accelerate 23 | diffusers 24 | timm 25 | openai 26 | sseclient-py 27 | TTS==0.22.0 28 | soundfile 29 | llama-cpp-python 30 | compel -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.0 2 | torchaudio==2.1.0 3 | safetensors==0.3.2 4 | sentencepiece>=0.1.97 5 | ninja==1.11.1 6 | tiktoken==0.3.3 7 | numpy==1.22 8 | ninja 9 | hvac 10 | pynvml 11 | psutil 12 | pika 13 | transformers 14 | bitsandbytes 15 | scipy 16 | transformers-stream-generator 17 | jsonschema 18 | omegaconf 19 | Pillow 20 | einops 21 | protobuf 22 | accelerate 23 | 
diffusers 24 | timm 25 | openai 26 | sseclient-py 27 | TTS==0.22.0 28 | soundfile 29 | exllamav2 30 | compel 31 | chardet -------------------------------------------------------------------------------- /schema/audio-gen.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "prompt": { 5 | "type": "string" 6 | }, 7 | "seconds": { 8 | "type": "number" 9 | }, 10 | "guidance_scale": { 11 | "type": "number" 12 | }, 13 | "progress": { 14 | "type": "boolean", 15 | "default": false 16 | } 17 | }, 18 | "required": ["prompt"] 19 | } -------------------------------------------------------------------------------- /schema/audio-url.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "audio_url": { "type": "string" } 5 | }, 6 | "required": ["audio_url"] 7 | } -------------------------------------------------------------------------------- /schema/img-gen.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "prompt": { 5 | "type": "string" 6 | }, 7 | "height": { 8 | "type": "integer", 9 | "default": 512 10 | }, 11 | "width": { 12 | "type": "integer", 13 | "default": 512 14 | }, 15 | "steps": { 16 | "type": "integer", 17 | "default": 50 18 | }, 19 | "seed": { 20 | "type": "integer", 21 | "default": -1 22 | }, 23 | "progress": { 24 | "type": "boolean", 25 | "default": false 26 | }, 27 | "negative_prompt": { 28 | "type": "string", 29 | "default": "" 30 | }, 31 | "guidance_scale": { 32 | "type": "number", 33 | "default": 7.5 34 | } 35 | }, 36 | "required": ["prompt"] 37 | } 38 | -------------------------------------------------------------------------------- /schema/img-url.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "img_url": { "type": "string" } 5 | }, 6 | "required": ["img_url"] 7 | } -------------------------------------------------------------------------------- /schema/instructor.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "text": { "type": "string" }, 5 | "instruction": { "type": "string" } 6 | }, 7 | "required": ["text", "instruction"] 8 | } -------------------------------------------------------------------------------- /schema/llm.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "max_new_tokens": { "type": "number", "default": 512 }, 5 | "top_p": { "type": "number", "default": 0.9 }, 6 | "top_k": { "type": "number", "default": 50 }, 7 | "min_p": { "type": "number", "default": 0.05 }, 8 | "mirostat": { "type": "number", "default": 0 }, 9 | "mirostat_eta": { "type": "number", "default": 0.1 }, 10 | "mirostat_tau": { "type": "number", "default": 5 }, 11 | "temperature": { "type": "number", "default": 0.9 }, 12 | "seed": { "type": "number", "default": -1 }, 13 | "stream": { "type": "boolean", "default": true }, 14 | "debug": { "type": "boolean" }, 15 | "stop_key": { "type": "string" }, 16 | "lora": { "type": "string" }, 17 | "ai_role": { "type": "string" }, 18 | "user_role": { "type": "string" }, 19 | "start_response": { "type": "string"}, 20 | "raw": { "type": "string"}, 21 | "messages": { 22 | "type": "array", 23 | "items": { 24 | 
"type": "object", 25 | "properties": { 26 | "role": { "type": "string" }, 27 | "content": { "type": "string"} 28 | }, 29 | "required": ["role", "content"] 30 | } 31 | } 32 | }, 33 | "required": ["messages"] 34 | } -------------------------------------------------------------------------------- /schema/visual-qa.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "img_url": { "type": "string" }, 5 | "text": { "type": "string" } 6 | }, 7 | "required": ["img_url", "text"] 8 | } -------------------------------------------------------------------------------- /schema/voice-gen.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "prompt": { 5 | "type": "string" 6 | }, 7 | "progress": { 8 | "type": "boolean", 9 | "default": false 10 | }, 11 | "voice": { 12 | "type": "string" 13 | } 14 | }, 15 | "required": ["prompt"] 16 | } 17 | -------------------------------------------------------------------------------- /schema/zero-shot-img.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "img_url": { "type": "string" }, 5 | "labels": { 6 | "type": "array", 7 | "items": { 8 | "type": "string" 9 | } 10 | } 11 | }, 12 | "required": ["img_url", "labels"] 13 | } --------------------------------------------------------------------------------