├── .gitignore ├── LICENSE ├── README.md ├── application ├── amqp.py ├── base_handler.py ├── download.py ├── llm_handler.py ├── progress_streamer.py ├── system_info.py └── thread.py ├── modules ├── facebook │ └── convnext │ │ ├── convnext.py │ │ └── golem.json ├── haotian-liu │ └── llava │ │ ├── golem-generator.py │ │ └── golem.json ├── hf-pipeline │ ├── asr │ │ ├── asr.py │ │ └── golem.json │ ├── image-class │ │ ├── golem.json │ │ └── image-class.py │ ├── image-to-text │ │ ├── golem.json │ │ └── image-to-text.py │ ├── object-detection │ │ ├── golem.json │ │ └── object-detection.py │ ├── visual-question-answering │ │ ├── golem.json │ │ └── visual-question-answering.py │ ├── zero-shot-image-class │ │ ├── golem.json │ │ └── zero-shot-image-class.py │ └── zero-shot-object-detection │ │ ├── golem.json │ │ └── zero-shot-object-detection.py ├── hkunlp │ └── instructor │ │ ├── golem.json │ │ └── instructor.py ├── intfloat │ └── e5-v2 │ │ ├── e5-v2.py │ │ └── golem.json ├── microsoft │ └── git-textcaps │ │ ├── git-textcaps.py │ │ └── golem.json ├── noco-ai │ ├── bark-tts │ │ ├── golem.json │ │ └── handler.py │ ├── image-generator │ │ ├── golem.json │ │ └── handler.py │ ├── llama-cpp │ │ ├── golem.json │ │ └── llama-cpp.py │ ├── llm-api │ │ ├── golem.json │ │ └── handler.py │ ├── music-generator │ │ ├── golem.json │ │ └── handler.py │ ├── sd-xl │ │ ├── golem.json │ │ └── handler.py │ ├── transformers-stream │ │ ├── golem.json │ │ └── handler.py │ └── tts-api │ │ ├── golem.json │ │ └── handler.py ├── openai │ ├── chat-api │ │ ├── chat-api.py │ │ └── golem.json │ └── dalle │ │ ├── golem.json │ │ └── handler.py ├── salesforce │ └── blip2-opt │ │ ├── blip2-opt.py │ │ └── golem.json └── turboderp │ ├── exllama │ ├── golem-generator.py │ └── golem.json │ └── exllamav2 │ ├── golem.json │ └── handler.py ├── requirements-nogpu.txt ├── requirements.txt ├── schema ├── audio-gen.jsonschema ├── audio-url.jsonschema ├── img-gen.jsonschema ├── img-url.jsonschema ├── instructor.jsonschema ├── llm.jsonschema ├── visual-qa.jsonschema ├── voice-gen.jsonschema └── zero-shot-img.jsonschema └── server.py /.gitignore: -------------------------------------------------------------------------------- 1 | data/enabled_skills.json 2 | data/models/ 3 | data/ 4 | repos/ 5 | *.pyc 6 | vault-token 7 | .vscode/ 8 | 9 | # core modules w/ external reps 10 | modules/turboderp/exllama/* 11 | modules/turboderp/exllama/.* 12 | !modules/turboderp/exllama/golem.json 13 | !modules/turboderp/exllama/golem-generator.py 14 | modules/turboderp/exllama2/* 15 | modules/turboderp/exllama2/.* 16 | modules/haotian-liu/llava/* 17 | modules/haotian-liu/llava/.* 18 | !modules/haotian-liu/llava/golem.json 19 | !modules/haotian-liu/llava/golem-generator.py -------------------------------------------------------------------------------- /LICENSE: -------------------------------------------------------------------------------- 1 | Open Software License ("OSL") v. 3.0 2 | 3 | This Open Software License (the "License") applies to any original work of authorship (the "Original Work") whose owner (the "Licensor") has placed the following licensing notice adjacent to the copyright notice for the Original Work: 4 | 5 | Licensed under the Open Software License version 3.0 6 | 7 | 1. Grant of Copyright License. Licensor grants You a worldwide, royalty-free, non-exclusive, sublicensable license, for the duration of the copyright, to do the following: 8 | 9 | 1. to reproduce the Original Work in copies, either alone or as part of a collective work; 10 | 11 | 2. 
to translate, adapt, alter, transform, modify, or arrange the Original Work, thereby creating derivative works ("Derivative Works") based upon the Original Work; 12 | 13 | 3. to distribute or communicate copies of the Original Work and Derivative Works to the public, with the proviso that copies of Original Work or Derivative Works that You distribute or communicate shall be licensed under this Open Software License; 14 | 15 | 4. to perform the Original Work publicly; and 16 | 17 | 5. to display the Original Work publicly. 18 | 19 | 2. Grant of Patent License. Licensor grants You a worldwide, royalty-free, non-exclusive, sublicensable license, under patent claims owned or controlled by the Licensor that are embodied in the Original Work as furnished by the Licensor, for the duration of the patents, to make, use, sell, offer for sale, have made, and import the Original Work and Derivative Works. 20 | 21 | 3. Grant of Source Code License. The term "Source Code" means the preferred form of the Original Work for making modifications to it and all available documentation describing how to modify the Original Work. Licensor agrees to provide a machine-readable copy of the Source Code of the Original Work along with each copy of the Original Work that Licensor distributes. Licensor reserves the right to satisfy this obligation by placing a machine-readable copy of the Source Code in an information repository reasonably calculated to permit inexpensive and convenient access by You for as long as Licensor continues to distribute the Original Work. 22 | 23 | 4. Exclusions From License Grant. Neither the names of Licensor, nor the names of any contributors to the Original Work, nor any of their trademarks or service marks, may be used to endorse or promote products derived from this Original Work without express prior permission of the Licensor. Except as expressly stated herein, nothing in this License grants any license to Licensor's trademarks, copyrights, patents, trade secrets or any other intellectual property. No patent license is granted to make, use, sell, offer for sale, have made, or import embodiments of any patent claims other than the licensed claims defined in Section 2. No license is granted to the trademarks of Licensor even if such marks are included in the Original Work. Nothing in this License shall be interpreted to prohibit Licensor from licensing under terms different from this License any Original Work that Licensor otherwise would have a right to license. 24 | 25 | 5. External Deployment. The term "External Deployment" means the use, distribution, or communication of the Original Work or Derivative Works in any way such that the Original Work or Derivative Works may be used by anyone other than You, whether those works are distributed or communicated to those persons or made available as an application intended for use over a network. As an express condition for the grants of license hereunder, You must treat any External Deployment by You of the Original Work or a Derivative Work as a distribution under section 1(c). 26 | 27 | 6. Attribution Rights. You must retain, in the Source Code of any Derivative Works that You create, all copyright, patent, or trademark notices from the Source Code of the Original Work, as well as any notices of licensing and any descriptive text identified therein as an "Attribution Notice." 
You must cause the Source Code for any Derivative Works that You create to carry a prominent Attribution Notice reasonably calculated to inform recipients that You have modified the Original Work. 28 | 29 | 7. Warranty of Provenance and Disclaimer of Warranty. Licensor warrants that the copyright in and to the Original Work and the patent rights granted herein by Licensor are owned by the Licensor or are sublicensed to You under the terms of this License with the permission of the contributor(s) of those copyrights and patent rights. Except as expressly stated in the immediately preceding sentence, the Original Work is provided under this License on an "AS IS" BASIS and WITHOUT WARRANTY, either express or implied, including, without limitation, the warranties of non-infringement, merchantability or fitness for a particular purpose. THE ENTIRE RISK AS TO THE QUALITY OF THE ORIGINAL WORK IS WITH YOU. This DISCLAIMER OF WARRANTY constitutes an essential part of this License. No license to the Original Work is granted by this License except under this disclaimer. 30 | 31 | 8. Limitation of Liability. Under no circumstances and under no legal theory, whether in tort (including negligence), contract, or otherwise, shall the Licensor be liable to anyone for any indirect, special, incidental, or consequential damages of any character arising as a result of this License or the use of the Original Work including, without limitation, damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses. This limitation of liability shall not apply to the extent applicable law prohibits such limitation. 32 | 33 | 9. Acceptance and Termination. If, at any time, You expressly assented to this License, that assent indicates your clear and irrevocable acceptance of this License and all of its terms and conditions. If You distribute or communicate copies of the Original Work or a Derivative Work, You must make a reasonable effort under the circumstances to obtain the express assent of recipients to the terms of this License. This License conditions your rights to undertake the activities listed in Section 1, including your right to create Derivative Works based upon the Original Work, and doing so without honoring these terms and conditions is prohibited by copyright law and international treaty. Nothing in this License is intended to affect copyright exceptions and limitations (including 'fair use' or 'fair dealing'). This License shall terminate immediately and You may no longer exercise any of the rights granted to You by this License upon your failure to honor the conditions in Section 1(c). 34 | 35 | 10. Termination for Patent Action. This License shall terminate automatically and You may no longer exercise any of the rights granted to You by this License as of the date You commence an action, including a cross-claim or counterclaim, against Licensor or any licensee alleging that the Original Work infringes a patent. This termination provision shall not apply for an action alleging patent infringement by combinations of the Original Work with other software or hardware. 36 | 37 | 11. Jurisdiction, Venue and Governing Law. Any action or suit relating to this License may be brought only in the courts of a jurisdiction wherein the Licensor resides or in which Licensor conducts its primary business, and under the laws of that jurisdiction excluding its conflict-of-law provisions. 
The application of the United Nations Convention on Contracts for the International Sale of Goods is expressly excluded. Any use of the Original Work outside the scope of this License or after its termination shall be subject to the requirements and penalties of copyright or patent law in the appropriate jurisdiction. This section shall survive the termination of this License. 38 | 39 | 12. Attorneys' Fees. In any action to enforce the terms of this License or seeking damages relating thereto, the prevailing party shall be entitled to recover its costs and expenses, including, without limitation, reasonable attorneys' fees and costs incurred in connection with such action, including any appeal of such action. This section shall survive the termination of this License. 40 | 41 | 13. Miscellaneous. If any provision of this License is held to be unenforceable, such provision shall be reformed only to the extent necessary to make it enforceable. 42 | 43 | 14. Definition of "You" in This License. "You" throughout this License, whether in upper or lower case, means an individual or a legal entity exercising rights under, and complying with all of the terms of, this License. For legal entities, "You" includes any entity that controls, is controlled by, or is under common control with you. For purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. 44 | 45 | 15. Right to Use. You may use the Original Work in all ways not otherwise restricted or conditioned by this License or by law, and Licensor promises not to interfere with or be responsible for such uses by You. 46 | 47 | 16. Modification of This License. This License is Copyright (C) 2005 Lawrence Rosen. Permission is granted to copy, distribute, or communicate this License without modification. Nothing in this License permits You to modify this License as applied to the Original Work or to Derivative Works. However, You may modify the text of this License and copy, distribute or communicate your modified version (the "Modified License") and apply it to other original works of authorship subject to the following conditions: (i) You may not indicate in any way that your Modified License is the "Open Software License" or "OSL" and you may not use those names in the name of your Modified License; (ii) You must replace the notice specified in the first paragraph above with the notice "Licensed under <insert your license name here>" or with a notice of your own that is not confusingly similar to the notice in this License; and (iii) You may not claim that your original works are open source software unless your Modified License has been approved by Open Source Initiative (OSI) and You comply with its license review and certification process. -------------------------------------------------------------------------------- /README.md: -------------------------------------------------------------------------------- 1 | # Elemental Golem 2 | 3 | Elemental Golem is a project that defines and serves AI models using a modular system with a `golem.json` configuration file and a handler that implements the call and response from the model, utilizing AMQP as the message broker. It is the backend used by Arcane Bridge and Spell Book for interacting with AI models based on PyTorch and similar libraries. It currently focuses solely on inference tasks.
4 | 5 | 6 | ## Stack Documentation 7 | 8 | - https://github.com/noco-ai/spellbook-docker/wiki 9 | - The wiki for the docker project contains comprehensive documentation for the UI that uses Elemental Golem to serve AI models. 10 | 11 | ## Stack Architecture 12 | 13 | ![Software stack diagram](https://github.com/noco-ai/spellbook-docker/blob/master/stack.png) 14 | 15 | ## Dependencies 16 | 17 | - Hashicorp Vault >= 1.1 18 | - RabbitMQ >= 3.6.10 19 | 20 | ### Required Vault Keys 21 | 22 | In order to function, Elemental Golem needs to connect to a Vault server to retrieve secrets and configuration data. 23 | The following information needs to be stored in Vault for Elemental Golem to start. 24 | 25 | ### **core/amqp** 26 | 27 | ```json 28 | { 29 | "host": "127.0.0.1", 30 | "password": "securepass", 31 | "username": "spellbook-user", 32 | "vhost": "spellbook" 33 | } 34 | ``` 35 | 36 | ## Install Guide 37 | 38 | ### Docker Install 39 | 40 | See https://github.com/noco-ai/spellbook-docker for installing the entire Spell Book stack with Docker Compose. 41 | 42 | ### Ubuntu Server 22 Install (no GPU) 43 | 44 | These steps can be taken to install Elemental Golem on an Ubuntu 22 server with no GPU installed. 45 | 46 | ```bash 47 | sudo apt-get update 48 | sudo apt-get upgrade 49 | sudo apt install build-essential 50 | 51 | curl https://repo.anaconda.com/archive/Anaconda3-2021.11-Linux-x86_64.sh --output anaconda.sh 52 | bash anaconda.sh 53 | conda create -n golem python=3.10.9 54 | conda activate golem 55 | conda install pytorch torchvision torchaudio pytorch-cuda=12.1 -c pytorch -c nvidia 56 | CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python 57 | 58 | # clone repo 59 | mkdir elemental-golem 60 | cd elemental-golem 61 | git clone https://github.com/noco-ai/elemental-golem.git . 62 | pip install -r requirements-nogpu.txt 63 | python server.py --server-id golemX --vault-host https://vault.example.com --gpu-type=nogpu 64 | ``` 65 | 66 | ### requirements.txt 67 | 68 | When installing Elemental Golem on a system with an Nvidia graphics card you will need to install from the appropriate requirements.txt file. Use 69 | requirements-nogpu.txt if the system has no GPU present. 70 | 71 | ### CLI Parameters and Server Commands 72 | 73 | Elemental Golem provides several CLI parameters and server commands for controlling the software. Below is a detailed explanation of them. 74 | 75 | ### **Command-line Interface (CLI) Parameters**: 76 | 77 | - `--server-id`: A unique identifier for the server. Required parameter. 78 | - `--vault-host`: The address of the Vault server host. Required parameter. 79 | - `--vault-token-file`: The path to the file containing the Vault token. Defaults to './vault-token' if not specified. 80 | - `--vault-root`: The root path in the Vault server. Defaults to 'arcane-bridge' if not specified. 81 | - `--shared-models`: If set to true, all HuggingFace downloads go to the data/cache/ folder. This is useful for shared drives and Docker. 82 | - `--amqp-ip`: Overrides the IP stored in Vault for connecting to the AMQP server. Useful when running instances of Elemental Golem on additional servers while the primary node is running the stack with Docker Compose. 83 | - `--gpu-type`: The type of GPU running on the worker server. Valid choices are nvidia or nogpu. 84 | 85 | ### **Commands to Control the Server over AMQP**: 86 | 87 | - `system_info`: Returns details about the system, such as server ID, system status (ONLINE or STARTING), installed skills, and running skills.
88 | - `run_skill`: Adds and runs a skill on the server based on the `skill_details` provided in the message body. 89 | - `stop_skill`: Stops a running skill based on the `skill_details` provided in the message body. 90 | - `install_skill`: Installs a skill on the server based on the `skill_details` provided in the message body. 91 | - `stop_generation`: Stops generation on a particular thread based on the `stop_details` provided in the message body. 92 | - `update_configuration`: Updates the configuration of the system based on the details provided in the message body. 93 | 94 | Each command request should contain a `command`, `return_routing_key`, and `return_exchange` in the message `headers`. Based on the command executed, appropriate responses are provided through the `AMQP` channel. 95 | 96 | > Note: It is crucial to reject the message correctly if any error occurs during command execution to prevent the message broker from requeueing the message. 97 | 98 | ### LLM Payload Validation 99 | 100 | The LLM handlers check the AMQP payload for the following data: 101 | 102 | - **max_new_tokens** (Number, Required): The maximum number of tokens to generate. 103 | - **top_p** (Number, Required): The desired randomness of the response (0.0 to 1.0). 104 | - **temperature** (Number, Required): The desired "temperature" of the output (0.0 to 1.0). 105 | - **stream** (Boolean, Required): If set to true, the output is streamed. 106 | - **debug** (Boolean, Optional): Enables debug mode. If enabled, model output is streamed to the console. 107 | - **stop_key** (String, Optional): The key string to stop generation. 108 | - **lora** (String, Optional): Specifies a LoRA to use with the request. Only ExLlama supports this at this point. 109 | - **ai_role** (String, Optional): Specifies the role of the AI in the conversation. 110 | - **user_role** (String, Optional): Specifies the role of the user in the conversation. 111 | - **start_response** (String, Optional): Specifies the response to start with. 112 | - **raw** (String, Optional): Raw content to use for generating the prompt. 113 | - **messages** (Array, Required): An array of message objects with these properties: 114 | - **role** (String, Required): Role in the message. 115 | - **content** (String, Required): Content of the message. 116 | 117 | ## golem.json 118 | 119 | The golem.json file defines the handlers and models/skills available for loading and inference. Here is a high-level overview of the fields found in the file. 120 | The best reference for this at the moment is to look in modules/noco-ai/... and find a handler that implements a similar type of model. 121 | If your model uses transformers or 🤗 pipelines you can add a new definition for the model to an existing handler. 122 | 123 | The configuration for Elemental Golem is stored in a JSON file. Below is a breakdown of each field in the JSON file: 124 | 125 | - `label`: Name of the module. 126 | - `description`: Purpose of the module, what the skill does. 127 | - `script`: Python script to use in running the project. 128 | - `multi_gpu_support`: Boolean indicating multi-gpu support. 129 | - `repository`: Stores information about the code repository. 130 | - `url`: URL to the project repository. 131 | - `folder`: Specific directory within the repository URL. 132 | - `skills`: An array containing model definitions. Each model has the following properties: 133 | - `label`: A readable name for the model. 134 | - `routing_key`: Routing key for the message broker.
135 | - `use`: Use case(s) for this skill. 136 | - `available_precision`: Array describing what devices and precisions the skill can be loaded with. 137 | - `memory_usage`: The memory capacity requirement of the model. 138 | - `model`: A model with a `name` and `provider`. 139 | - `shortcut`: A symbol representing the model; this allows LLM models to be accessed directly via the Spellbook UI. 140 | - `configuration`: Model-specific configurations, like `max_seq_len`, `user_role`, `ai_role`, `stop_on`, `system_message`. 141 | - `configuration`: Module-wide configuration involving secrets management and system-specific parameters. 142 | - `vault_path`: The path to secrets storage for sensitive data like API tokens. 143 | - `options`: An array of global options. 144 | 145 | ### Model Configuration 146 | 147 | Each model/skill can define configuration information that is available to the handler. If these have keys that match the global configuration keys 148 | for the module, they are merged, with the user-set values overriding the defaults. Here is an example of the configuration values an LLM handler expects. 149 | 150 | - `max_seq_len`: Specifies the maximum sequence length for model input. 151 | - `user_role`: Defines the user's assumed role in the interaction. 152 | - `ai_role`: Defines the AI's assumed role in the interaction. 153 | - `stop_on`: The signals that, when received, will trigger the model to stop execution. 154 | - `system_message`: Describes the nature of the interaction between a user and the AI. 155 | 156 | ### Global Configuration 157 | 158 | The global configuration is read by the frontend, which allows the user to override the system defaults. Which configuration options are available will vary by the type 159 | of handler. Module-wide configuration options include: 160 | 161 | - `vault_path`: Secure storage path for sensitive data like API keys. 162 | - `options`: An array of global parameters, each with: 163 | - `label`: A readable field name displayed in a settings UI. 164 | - `name`: Identifier for the option/field. 165 | - `editable`: Boolean determining if the user can manually edit the value. 166 | - `type`: Data type of the parameter. 167 | - `default`: The default value if none is provided. 168 | 169 | ### Repository 170 | 171 | Some modules require that another repo be installed for a skill handler to work correctly. These are defined at a global level for the handler. 172 | 173 | - `url`: URL to the project repository. 174 | - `folder`: The path to the folder within the repository. 175 | 176 | ## handler.py 177 | 178 | The handler is a Python class inheriting from `BaseHandler` or `LlmHandler` that is responsible for handling messages. Each handler must implement the following functions: 179 | 180 | - `__init__`: Initialize the handler. 181 | - `validate`: Validates a request. It should return a boolean indicating whether the request is valid and a list of errors (if any). 182 | - `execute`: Executes the model. It receives the model and request. This method is responsible for getting the request data, making the API call, and returning the API response. 183 | - `load`: Loads the model. Receives three parameters: the model, model options, and the local path to the model. Be sure to set up the API key using `model["secrets"]["token"]`.
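For a quick illustration of these contracts, here is a minimal, hypothetical handler (it is not part of the repository) that validates its own input and returns the `{"content": ...}` shape used by the text handlers. Bundled modules usually delegate validation to `BaseHandler.validate_request` with one of the JSON schemas in `schema/` rather than checking fields by hand.

```python
from application.base_handler import BaseHandler

class EchoHandler(BaseHandler):
    """Hypothetical minimal handler used only to illustrate the contracts above."""

    def __init__(self):
        super().__init__()

    def validate(self, request):
        # must return (is_valid, errors); bundled handlers typically call
        # self.validate_request(request, "<schema-name>") to validate against
        # one of the files in schema/ instead of hand-rolling checks
        if "text" not in request or not len(request["text"]):
            return False, ["'text' is a required field"]
        return True, []

    def execute(self, model, request):
        # `model` is whatever load() returned; the response is a dict,
        # with "content" holding the text to send back
        return {"content": request["text"]}

    def load(self, model, model_options, local_path):
        # anything returned here is handed back to execute() as `model`
        return {"device": model_options["device"]}
```

The generic skeleton for an API-backed handler looks like this: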
184 | 185 | ```python 186 | class ExampleHandler(BaseHandler): 187 | def __init__(self): 188 | super().__init__() 189 | 190 | def validate(self, request): 191 | return self.validate_fields(request, [("text", len)]) 192 | 193 | def execute(self, model, request): 194 | #... 195 | 196 | def load(self, model, model_options, local_path): 197 | openai.api_key = model["secrets"]["token"] 198 | return {"model_name": model["configuration"]["model"]} 199 | ``` 200 | 201 | Remember to replace `#...` in `execute` with the correct implementation that fits your scenario. 202 | The response must return `{ "content": response}` where `response` is the content you wish to send back. 203 | 204 | _Configuration, requests, and responses vary based on how the handler is implemented._ 205 | -------------------------------------------------------------------------------- /application/amqp.py: -------------------------------------------------------------------------------- 1 | import pika 2 | import logging 3 | 4 | logger = logging.getLogger(__name__) 5 | 6 | def connect_to_amqp(amqp_ip, amqp_user, amqp_password, amqp_vhost): 7 | 8 | # Otherwise, establish a new connection for this process 9 | connection_successful = True 10 | try: 11 | credentials = pika.PlainCredentials(amqp_user, amqp_password) 12 | connection = pika.BlockingConnection( 13 | pika.ConnectionParameters( 14 | host=amqp_ip, 15 | virtual_host=amqp_vhost, 16 | credentials=credentials, 17 | connection_attempts=5, 18 | retry_delay=5, 19 | socket_timeout=600, 20 | heartbeat=300 21 | ) 22 | ) 23 | channel = connection.channel() 24 | 25 | except Exception as e: 26 | connection_successful = False 27 | logger.error(f"failed to connect", e) 28 | 29 | return connection_successful, connection, channel 30 | 31 | def create_queue(channel, queue_name, dlx=None, dlx_queue='deadletters', is_exclusive=False, is_auto_delete=False): 32 | 33 | # Declare the queue with 'dlx' as the DLX if provided 34 | if dlx: 35 | result = channel.queue_declare(queue=queue_name, exclusive=is_exclusive, auto_delete=is_auto_delete, arguments={ 36 | 'x-dead-letter-exchange': dlx, 37 | 'x-dead-letter-routing-key': dlx_queue 38 | }) 39 | else: 40 | result = channel.queue_declare(queue=queue_name, exclusive=is_exclusive, auto_delete=is_auto_delete) 41 | 42 | return result.method.queue 43 | 44 | def create_exchange(channel, exchange_name, exchange_type='direct'): 45 | channel.exchange_declare(exchange=exchange_name, exchange_type=exchange_type) 46 | 47 | def bind_queue_to_exchange(channel, queue_name, exchange_name, routing_key=None): 48 | channel.queue_bind(exchange=exchange_name, queue=queue_name, routing_key=routing_key) 49 | 50 | def become_consumer(channel, queue_name, callback_function): 51 | channel.basic_consume(queue=queue_name, on_message_callback=callback_function, auto_ack=False) 52 | channel.start_consuming() 53 | 54 | def send_message_to_exchange(channel, exchange_name, routing_key, message, headers=None): 55 | properties = pika.BasicProperties(delivery_mode=2) # make message persistent 56 | if headers is not None: 57 | properties.headers = headers 58 | 59 | channel.basic_publish(exchange=exchange_name, 60 | routing_key=routing_key, 61 | body=message, 62 | properties=properties) 63 | -------------------------------------------------------------------------------- /application/base_handler.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | from pika import BasicProperties 3 | import hvac 4 | import json 
5 | import jsonschema 6 | from jsonschema import validate 7 | from typing import Union, List 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class BaseHandler: 13 | 14 | def __init__(self): 15 | self.cached_schemas = {} 16 | 17 | def execute(self, model, request) -> dict: 18 | raise NotImplementedError("The `execute` method should be implemented in the derived class.") 19 | 20 | def validate(self, request) -> dict: 21 | raise NotImplementedError("The `validate` method should be implemented in the derived class.") 22 | 23 | def load(self, model, model_options) -> dict: 24 | return {} 25 | 26 | def copy_queue_headers(self, incoming_headers, override_command = None): 27 | # copy amqp headers 28 | outgoing_headers = {} 29 | stream_override = None 30 | for incoming_header in incoming_headers: 31 | if incoming_header in ["x-delay", "return_exchange", "return_routing_key"]: 32 | continue 33 | if incoming_header == "stream_to_override": 34 | stream_override = incoming_headers[incoming_header] 35 | 36 | outgoing_headers[incoming_header] = incoming_headers[incoming_header] 37 | 38 | stream_to = "prompt_fragment" if stream_override == None else stream_override 39 | if override_command != None: 40 | outgoing_headers["original_command"] = incoming_headers["command"] 41 | 42 | outgoing_headers["command"] = override_command if override_command is not None else stream_to 43 | return BasicProperties(headers=outgoing_headers) 44 | 45 | def load_schema_file(self, schema_file): 46 | # Check if schema is in cache 47 | if schema_file in self.cached_schemas: 48 | schema = self.cached_schemas[schema_file] 49 | else: 50 | # Load the schema at the path 51 | try: 52 | with open(f"schema/{schema_file}.jsonschema", 'r') as file: 53 | schema = json.load(file) 54 | except Exception as e: 55 | logger.error(e) 56 | return None 57 | # Cache the schema 58 | self.cached_schemas[schema_file] = schema 59 | 60 | return schema 61 | 62 | # A dictionary to hold the cached schemas 63 | def validate_request(self, json_data: dict, schema_file: str) -> Union[bool, List[str]]: 64 | 65 | schema = self.load_schema_file(schema_file) 66 | if schema is None: 67 | return False, ["Invalid schema file for handler"] 68 | 69 | json_data = self.apply_schema_defaults(json_data, schema_file) 70 | try: 71 | validate(instance=json_data, schema=schema) 72 | except jsonschema.exceptions.ValidationError as err: 73 | # If there is a validation error, return a list containing the error message 74 | logger.warn("validation failed for incoming request") 75 | return False, [str(err)] 76 | else: 77 | # If the data is valid, return True 78 | return True, [] 79 | 80 | def apply_schema_defaults(self, raw_data: dict, schema_file: str) -> dict: 81 | 82 | schema = self.load_schema_file(schema_file) 83 | if schema is None: 84 | logger.error("could not load schema file") 85 | return raw_data 86 | 87 | # Fill default values 88 | for property, attributes in schema['properties'].items(): 89 | if "default" in attributes and property not in raw_data: 90 | raw_data[property] = attributes["default"] 91 | 92 | return raw_data 93 | 94 | 95 | def check_stop_generation(self, counter, stop_generation_event, stop_generation_filter, socket_id): 96 | counter += 1 97 | if counter >= 5: 98 | counter = 0 99 | if stop_generation_event.is_set(): 100 | stop_generation_event.clear() 101 | if socket_id == None: 102 | return False, counter 103 | 104 | stop_socket = bytes(stop_generation_filter.raw).rstrip(b'\x00').decode("utf-8") 105 | if stop_socket == socket_id: 
106 | return True, counter 107 | 108 | return False, counter 109 | -------------------------------------------------------------------------------- /application/download.py: -------------------------------------------------------------------------------- 1 | import os 2 | import logging 3 | import shutil 4 | import requests 5 | import multiprocessing 6 | import hashlib 7 | from huggingface_hub import snapshot_download, hf_hub_download 8 | from application.thread import send_ui_update 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | BUFFER_SIZE = 64 * 1024 * 1024 # 64 MB 13 | 14 | # Usage 15 | def install_skill(all_skills, install_skill_data, shared_models, server_id, channel): 16 | # Create a list to hold all the processes 17 | processes = [] 18 | for skill in all_skills: 19 | if skill["routing_key"] != install_skill_data["routing_key"]: 20 | continue 21 | 22 | if "model" in skill: 23 | for model in skill["model"]: 24 | process = multiprocessing.Process(target=download_model, args=(model, install_skill_data, shared_models, server_id, channel)) 25 | processes.append(process) 26 | process.start() 27 | 28 | if "repository" in skill: 29 | for repo in skill["repository"]: 30 | # Create and start a new process for each download 31 | process = multiprocessing.Process(target=download_repo, args=(repo["url"], repo["folder"], repo["module_path"])) 32 | processes.append(process) 33 | process.start() 34 | 35 | def download_repo(url, repo_folder, target_folder): 36 | repo_folder = f'data/repos/{repo_folder}' 37 | 38 | if os.path.exists(repo_folder) and os.path.isdir(repo_folder): 39 | os.system(f"cd {repo_folder} && git pull") 40 | else: 41 | # Create the directory if it doesn't exist 42 | os.makedirs(repo_folder, exist_ok=True) 43 | os.system(f"git clone {url} {repo_folder}") 44 | 45 | logger.info(f"done downloading repository {repo_folder}") 46 | 47 | # Make sure the target folder exists 48 | os.makedirs(target_folder, exist_ok=True) 49 | 50 | # Copy all files from repo_folder to target_folder 51 | for file_name in os.listdir(repo_folder): 52 | source = os.path.join(repo_folder, file_name) 53 | destination = os.path.join(target_folder, file_name) 54 | 55 | if os.path.isfile(source): 56 | shutil.copy2(source, destination) 57 | elif os.path.isdir(source): 58 | shutil.copytree(source, destination, dirs_exist_ok=True) 59 | 60 | logger.info(f"done copying repository files to {target_folder}") 61 | 62 | def combine_files(base_file, split_files): 63 | 64 | logger.info(f"opening {base_file} to combing splits") 65 | with open(base_file, 'wb') as dest_file: 66 | for split_name in split_files: 67 | filename = base_file + split_name 68 | logger.info(f"combining split file {filename}") 69 | with open(filename, 'rb') as src_file: 70 | while True: 71 | chunk = src_file.read(BUFFER_SIZE) 72 | if not chunk: 73 | break 74 | dest_file.write(chunk) 75 | 76 | logger.info(f"cleaning up split file {split_name}") 77 | os.remove(filename) 78 | logging.info("done combining split files") 79 | 80 | def download_model(model, install_skill, shared_models, server_id, channel): 81 | 82 | name = model["name"] 83 | provider = model["provider"] 84 | is_branch = False 85 | single_file = False 86 | model_full_path = model["name"] 87 | if "files" in model and install_skill["precision"] in model["files"]: 88 | model_full_path = os.path.join(model["name"], model["files"][install_skill["precision"]]) 89 | single_file = True 90 | elif "branch" in model and install_skill["precision"] in model["branch"]: 91 | model_full_path = 
os.path.join(model["name"], model["branch"][install_skill["precision"]]) 92 | is_branch = True 93 | 94 | lock_file_path = "data/models/" + hashlib.sha256(model_full_path.encode()).hexdigest()[:10] + ".lock" 95 | 96 | # Check if a download is already in progress for the given model name 97 | if os.path.exists(lock_file_path): 98 | logger.info(f"download already in progress for model: {name}") 99 | return 100 | 101 | # Create a lock file to signal that a download is in progress 102 | with open(lock_file_path, 'w') as lock_file: 103 | lock_file.write("download in progress") 104 | 105 | logger.info(f"downloading skill model {name}") 106 | 107 | try: 108 | if provider == 'huggingface': 109 | 110 | if single_file: 111 | if os.path.exists(f'data/models/{name}/{model["files"][install_skill["precision"]]}'): 112 | logger.info(f"already downloaded model: {name}") 113 | return 114 | elif os.path.exists(f'data/models/{name}'): 115 | logger.info(f"already downloaded model: {name}") 116 | return 117 | 118 | os.makedirs(f'data/models/{name}', exist_ok=True) 119 | use_symlinks = False if shared_models else "auto" 120 | cache_dir = "data/cache" if shared_models else None 121 | download_args = { 122 | "repo_id": name, 123 | "local_dir": f'data/models/{model_full_path}' 124 | } 125 | 126 | # Conditionally adding arguments to the dictionary if they are not None 127 | if cache_dir is not None: 128 | download_args["cache_dir"] = cache_dir 129 | 130 | if use_symlinks is not None: 131 | download_args["local_dir_use_symlinks"] = use_symlinks 132 | 133 | if single_file: 134 | if "split" in model and install_skill["precision"] in model["split"]: 135 | for split_name in model["split"][install_skill["precision"]]: 136 | download_args["filename"] = model["files"][install_skill["precision"]] + split_name 137 | download_args["local_dir"] = f'data/models/{name}' 138 | logger.info(f'downlading split file {model["files"][install_skill["precision"]]}{split_name} from hf hub, shared: {shared_models}') 139 | hf_hub_download(**download_args) 140 | 141 | base_file = os.path.join('data', 'models', model_full_path) 142 | combine_files(base_file, model["split"][install_skill["precision"]]) 143 | else: 144 | logger.info(f'downlading single file {model["files"][install_skill["precision"]]} from hf hub, shared: {shared_models}') 145 | download_args["local_dir"] = f'data/models/{name}' 146 | download_args["filename"] = model["files"][install_skill["precision"]] 147 | hf_hub_download(**download_args) 148 | elif is_branch: 149 | logger.info(f'downlading branch {model_full_path} from hf hub, shared: {shared_models}') 150 | os.makedirs(f'data/models/{model_full_path}', exist_ok=True) 151 | download_args["revision"] = model["branch"][install_skill["precision"]] 152 | snapshot_download(**download_args) 153 | else: 154 | logger.info(f'downlading repo {name} from hf hub, shared: {shared_models}') 155 | snapshot_download(**download_args) 156 | elif provider == 'civitai': 157 | url = "" if "url" not in model else model["url"] 158 | if 'api' not in url: 159 | logger.error("invalid url provided for civit.ai") 160 | return 161 | 162 | headers = { 163 | "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36" 164 | } 165 | 166 | base_name = name.rsplit('/', 1)[0] 167 | base_dir = f'data/models/{base_name}' 168 | os.makedirs(base_dir, exist_ok=True) 169 | 170 | logger.info(f"downloading model from {url}, please wait...") 171 | response = requests.get(url, headers=headers, 
stream=True) 172 | with open(os.path.join(base_dir, name.split('/')[-1]), 'wb') as f: 173 | for chunk in response.iter_content(chunk_size=8192): 174 | f.write(chunk) 175 | 176 | logger.info("model downloaded successfully!") 177 | else: 178 | logger.error(f"{provider} is not supported") 179 | 180 | finally: 181 | # Remove the lock file 182 | if os.path.exists(lock_file_path): 183 | os.remove(lock_file_path) 184 | 185 | send_ui_update("skill_downloaded", name, server_id, channel) 186 | logger.info(f"finished downloading skill model {name}") -------------------------------------------------------------------------------- /application/llm_handler.py: -------------------------------------------------------------------------------- 1 | from urllib.parse import urlparse 2 | from application.base_handler import BaseHandler 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | class LlmHandler(BaseHandler): 8 | 9 | def __init__(self): 10 | super().__init__() 11 | 12 | def load(self, model, model_options, local_path): 13 | pass 14 | 15 | def load_config_settings(self, num_input_tokens, request): 16 | config = self.model_config 17 | max_new_tokens_config = int(request.get("max_new_tokens", 1024)) 18 | max_seq_len = config.get("max_seq_len", 2048) 19 | max_new_tokens = min(max_new_tokens_config, max_seq_len - num_input_tokens) 20 | top_p = request.get("top_p", 0.9) 21 | top_k = request.get("top_k", 50) 22 | seed = request.get("seed", -1) 23 | min_p = request.get("min_p", 0.05) 24 | mirostat = request.get("mirostat", 0) 25 | mirostat_eta = request.get("mirostat_eta", 0.01) 26 | mirostat_tau = request.get("mirostat_tau", 5) 27 | temperature = request.get("temperature", 1) 28 | stream_output = True if "stream" in request and request["stream"] == True else False 29 | debug = "debug" in request 30 | stop_key = request.get("stop_key", "") 31 | 32 | logger.info(f"prompt tokens: {num_input_tokens}, max completion tokens: {max_new_tokens}, context length: {max_seq_len}") 33 | logger.info(f"temperature: {temperature}, top_p: {top_p}, top_k: {top_k}, seed: {seed}, stream output: {stream_output}") 34 | logger.info(f"min_p: {min_p}, mirostat: {mirostat}, mirostat_eta: {mirostat_eta}, mirostat_tau: {mirostat_tau}") 35 | return max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, min_p, mirostat, mirostat_eta, mirostat_tau 36 | 37 | def build_stop_conditions(self, stops, to_lower = True): 38 | check_stop_token = False 39 | stop_conditions = [] 40 | for stop_text in stops: 41 | if stop_text == "": 42 | check_stop_token = True 43 | continue 44 | add_condition = stop_text.lower() if to_lower == True else stop_text 45 | stop_conditions.append(add_condition) 46 | 47 | return check_stop_token, stop_conditions 48 | 49 | def check_stop_conditions(self, token, res_line, eos_token, check_stop_token, stop_conditions): 50 | if check_stop_token and token == eos_token: 51 | return True 52 | 53 | for stop_string in stop_conditions: 54 | if res_line.lower().endswith(stop_string): 55 | return True 56 | 57 | return False 58 | 59 | def finish_response(self, stop_key, response, request, stream_output, 60 | finish_reason, tokens_per_second, new_tokens, input_tokens, model, elapsed, debug): 61 | if debug and stream_output == False: 62 | print('\033[92m' + response + '\033[0m') 63 | 64 | send_content = "" 65 | if stream_output: 66 | send_content = stop_key 67 | elif "start_response" in request: 68 | send_content = f"{request['start_response']}{response}" 69 | else: 70 | send_content = response 
71 | 72 | llm_response = {"content": send_content, "finish_reason": finish_reason, 73 | "tokens_per_second": round(tokens_per_second, 2), "completion_tokens": new_tokens, "prompt_tokens": input_tokens, "model": model } 74 | 75 | if debug: 76 | print(llm_response) 77 | 78 | logger.info(f"prompt processed in {elapsed:.2f} seconds, new tokens: {new_tokens}, tokens/second: {tokens_per_second:.2f}") 79 | return llm_response 80 | 81 | def get_token_count(self, input_text): 82 | return 100000 83 | 84 | def _get_system_prompt(self, request, config): 85 | system_prompt = "" 86 | in_request = False 87 | contains_user_message = False 88 | 89 | if "system_message" in config and len(config["system_message"]): 90 | system_prompt = config['system_message'] 91 | 92 | # override with system prompt provided by request 93 | messages_len = len(request["messages"]) 94 | if messages_len and request["messages"][0]["role"] == "system": 95 | system_prompt = request['messages'][0]['content'] 96 | in_request = True 97 | 98 | if "system_prompt_format" in config: 99 | template = config["system_prompt_format"] 100 | ai_role = request["ai_role"] if "ai_role" in request else config["ai_role"] 101 | user_role = request["user_role"] if "user_role" in request else config["user_role"] 102 | if "{prompt}" in template: 103 | check_index = 1 if in_request else 0 104 | check_len = 2 if in_request else 1 105 | prompt = request["messages"][check_index]["content"] if messages_len >= check_len and request["messages"][check_index]["role"] == "user" else "" 106 | response = request["messages"][check_index + 1]["content"] if check_index + 1 < messages_len and request["messages"][check_index + 1]["role"] == "assistant" else "" 107 | system_prompt = template.format(user_role=user_role, system_prompt=system_prompt.strip(), ai_role=ai_role, prompt=prompt, response=response) + "\n" 108 | contains_user_message = True 109 | else: 110 | system_prompt = template.format(user_role=user_role, system_prompt=system_prompt.strip(), ai_role=ai_role) 111 | 112 | return system_prompt, in_request, contains_user_message 113 | 114 | def _prep_prompt(self, request, config): 115 | request_system_message = None 116 | max_new_tokens = request.get("max_new_tokens", 1024) 117 | max_seq_length = config["max_seq_len"] 118 | max_input_tokens = max(max_seq_length - max_new_tokens, 0) 119 | 120 | if max_input_tokens == 0: 121 | logger.error("error with configuration of models context limits") 122 | raise ValueError('error with configuration of models context limits') 123 | 124 | # give a little wiggle room for the way the prompt is built 125 | max_input_tokens -= 64 126 | 127 | system_prompt, sys_prompt_in_request, clip_first_user_message = self._get_system_prompt(request, config) 128 | system_prompt_tokens = self.get_token_count(system_prompt) 129 | if system_prompt_tokens >= max_input_tokens: 130 | logger.error("system prompt excceds max input tokens") 131 | raise ValueError("system prompt excceds max input tokens") 132 | 133 | if sys_prompt_in_request: 134 | request_system_message = request["messages"][0] 135 | request["messages"].pop(0) 136 | 137 | if clip_first_user_message: 138 | request["messages"].pop(0) 139 | 140 | # clip all but last message if this is an instruct model 141 | if len(request["messages"]) == 0: 142 | messages = [] 143 | if "model_type" in config and config["model_type"] == "instruct": 144 | messages = [request["messages"][-1]] 145 | else: 146 | messages = request["messages"][::-1] 147 | 148 | return messages, system_prompt_tokens, 
request_system_message, system_prompt, sys_prompt_in_request, max_input_tokens 149 | 150 | def build_prompt(self, request, config, model): 151 | prompt = "" 152 | 153 | # raw prompt 154 | if "raw" in request: 155 | prompt = request["raw"] 156 | if "start_response" in request: 157 | prompt += request["start_response"] 158 | return prompt 159 | 160 | messages, system_prompt_tokens, request_system_message, system_prompt, sys_prompt_in_request, max_input_tokens = self._prep_prompt(request, config) 161 | max_input_tokens -= 64 162 | 163 | # get delimiter in-between user and prompt and get roles 164 | ai_role = request["ai_role"] if "ai_role" in request else config["ai_role"] 165 | user_role = request["user_role"] if "user_role" in request else config["user_role"] 166 | template = config["prompt_format"] 167 | 168 | prompt_parts = [] 169 | input_token_count = system_prompt_tokens 170 | 171 | for index, message in enumerate(messages): 172 | 173 | if message["role"] == "assistant": 174 | continue 175 | 176 | ai_response = "" if index == 0 else messages[index - 1]["content"].strip() 177 | formatted_string = template.format(user_role=user_role, prompt=message['content'].strip(), ai_role=ai_role, response=ai_response) 178 | token_count = self.get_token_count(formatted_string) 179 | if input_token_count + token_count > max_input_tokens: 180 | break 181 | 182 | input_token_count += token_count 183 | prompt_parts.append(formatted_string) 184 | 185 | prompt_parts = prompt_parts[::-1] 186 | prompt = system_prompt + "\n".join(prompt_parts) 187 | if "start_response" in request: 188 | prompt += request["start_response"] 189 | 190 | return prompt -------------------------------------------------------------------------------- /application/progress_streamer.py: -------------------------------------------------------------------------------- 1 | from transformers.generation.streamers import BaseStreamer 2 | from tqdm import tqdm 3 | import json 4 | 5 | class ProgressStreamer(BaseStreamer): 6 | def __init__(self): 7 | self.token_count = 0 8 | self.max_new_tokens = 0 9 | self.show_bar = False 10 | self.amqp_config = None 11 | self.label = "" 12 | 13 | def put(self, value): 14 | self.token_count += 1 15 | if self.show_bar: 16 | self.progress_bar.update(1) 17 | 18 | if self.amqp_config: 19 | send_body = { 20 | "total": self.max_new_tokens, 21 | "current": self.token_count, 22 | "label": self.label, 23 | "model": self.model 24 | } 25 | 26 | self.amqp_config["channel"].basic_publish( 27 | exchange=self.amqp_config["headers"]['return_exchange'], 28 | routing_key=self.amqp_config["headers"]['return_routing_key'], 29 | body=json.dumps(send_body), properties=self.amqp_config["outgoing_properties"]) 30 | 31 | def end(self): 32 | if self.show_bar: 33 | self.progress_bar.close() 34 | 35 | def configure(self, max_new_tokens, label, model, amqp_config = None, show_bar = True): 36 | self.max_new_tokens = max_new_tokens 37 | self.show_bar = show_bar 38 | self.amqp_config = amqp_config 39 | self.token_count = 0 40 | self.label = label 41 | self.model = model 42 | if show_bar: 43 | self.progress_bar = tqdm(total=max_new_tokens) -------------------------------------------------------------------------------- /modules/facebook/convnext/convnext.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from transformers import ConvNextImageProcessor, ConvNextForImageClassification 4 | import torch 5 | 6 | 
class FacebookConvNet(BaseHandler): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def validate(self, request): 11 | is_valid, errors = self.validate_request(request, 'img-url') 12 | return is_valid, errors 13 | 14 | def execute(self, model, request): 15 | raw_image = load_image(request["img_url"]).convert('RGB') 16 | inputs = model["feature_extractor"](raw_image, return_tensors="pt") 17 | 18 | with torch.no_grad(): 19 | inputs = {key: value.to(model["device"]) for key, value in inputs.items()} 20 | logits = model["model"](**inputs).logits 21 | 22 | predicted_label = logits.argmax(-1).item() 23 | return {"classes": [{"label": model["model"].config.id2label[predicted_label], "score": 1}]} 24 | 25 | def load(self, model, model_options, local_path): 26 | feature_extractor = ConvNextImageProcessor.from_pretrained(local_path) 27 | conv_model = ConvNextForImageClassification.from_pretrained(local_path) 28 | return {"model": conv_model, "feature_extractor": feature_extractor, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/facebook/convnext/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Facebook Convnext", 3 | "description": "Handler for Facebook's Convnext ML models for image classification.", 4 | "unique_key": "facebook_convnext", 5 | "script": "convnext.py", 6 | "skills": [ 7 | { 8 | "name": "facebook/convnext-large-224", 9 | "label": "Facebook Convnext Large 224", 10 | "routing_key": "facebook_convnext_large_224", 11 | "use": ["image_classification"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2300 }, 14 | "model": [{ 15 | "name": "facebook/convnext-large-224", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "facebook/convnext-base-224", 21 | "label": "Facebook Convnext Base 224", 22 | "routing_key": "facebook_convnext_base_224", 23 | "use": ["image_classification"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2000 }, 26 | "model": [{ 27 | "name": "facebook/convnext-base-224", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/haotian-liu/llava/golem-generator.py: -------------------------------------------------------------------------------- 1 | #from transformers_stream_generator import init_stream_support 2 | #init_stream_support() 3 | from application.llm_handler import LlmHandler 4 | import requests 5 | import torch 6 | import json 7 | import logging 8 | from PIL import Image 9 | from io import BytesIO 10 | import os 11 | import sys 12 | import time 13 | from transformers.generation.streamers import BaseStreamer 14 | 15 | logger = logging.getLogger(__name__) 16 | 17 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 18 | from llava.model.builder import load_pretrained_model 19 | from llava.mm_utils import tokenizer_image_token, get_model_name_from_path, KeywordsStoppingCriteria 20 | from llava.constants import IMAGE_TOKEN_INDEX 21 | 22 | class AmqpStreamer(BaseStreamer): 23 | def __init__(self, tokenizer, channel, incoming_headers, outgoing_properties, model_data, check_function, debug): 24 | self.tokenizer = tokenizer 25 | self.all_tokens = [] 26 | self.all_text = "" 27 | self.new_tokens = 0 28 | self.debug = debug 29 | 
self.channel = channel 30 | self.outgoing_properties = outgoing_properties 31 | self.incoming_headers = incoming_headers 32 | self.model_data = model_data 33 | self.finish_reason = "stop" 34 | self.check_stop_generation = check_function 35 | self.stop_generation_counter = 0 36 | self.socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 37 | 38 | def get_new_tokens(self): 39 | return self.new_tokens 40 | 41 | def get_response(self): 42 | return self.all_text 43 | 44 | def get_finish_reason(self): 45 | return self.finish_reason 46 | 47 | def put(self, value): 48 | 49 | stop_generation, self.stop_generation_counter = self.check_stop_generation(self.stop_generation_counter, 50 | self.model_data["stop_generation_event"], self.model_data["stop_generation_filter"], self.socket_id) 51 | 52 | if stop_generation: 53 | self.finish_reason = "abort" 54 | logger.info("stopping generation of text") 55 | raise ValueError("stopping generation of text") 56 | 57 | self.all_tokens.extend(value.tolist()) 58 | new_text = self.tokenizer.decode(self.all_tokens) 59 | new_chunk = new_text[len(self.all_text):] 60 | self.all_text += new_chunk 61 | self.new_tokens += 1 62 | 63 | if self.debug: 64 | print('\033[96m' + new_chunk, end="") 65 | 66 | self.channel.basic_publish( 67 | exchange=self.incoming_headers['return_exchange'], 68 | routing_key=self.incoming_headers['return_routing_key'], 69 | body=new_chunk, properties=self.outgoing_properties) 70 | 71 | def end(self): 72 | pass 73 | 74 | class LLaVA(LlmHandler): 75 | def __init__(self): 76 | super().__init__() 77 | 78 | def load_image(self, image_file): 79 | if image_file.startswith('http') or image_file.startswith('https'): 80 | response = requests.get(image_file) 81 | image = Image.open(BytesIO(response.content)).convert('RGB') 82 | else: 83 | image = Image.open(image_file).convert('RGB') 84 | return image 85 | 86 | def validate(self, request): 87 | is_valid, errors = self.validate_request(request, 'llm') 88 | return is_valid, errors 89 | 90 | def get_token_count(self, input_text): 91 | input_ids = tokenizer_image_token(input_text, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 92 | input_token_count = input_ids.shape[1] 93 | print(f"INPUT: {input_text}\nTOKEN COUNT: {input_token_count}\n\n") 94 | return input_token_count 95 | 96 | def execute(self, model, request): 97 | 98 | stream_output = request.get("stream", False) 99 | image_found = request.get("img_url", None) 100 | if image_found and "messages" in request and request["messages"][-1]["role"] == "user": 101 | logger.info(f"loading image {image_found}") 102 | new_message = f"\n{request['messages'][-1]['content']}" 103 | request["messages"][-1]["content"] = new_message 104 | image = self.load_image(request["img_url"]) 105 | image_tensor = model["image_processor"].preprocess(image, return_tensors='pt')['pixel_values'].half().cuda() 106 | else: 107 | image_tensor = None 108 | 109 | config = self.model_config 110 | prompt = self.build_prompt(request, config, model) 111 | input_ids = tokenizer_image_token(prompt, model["tokenizer"], IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).cuda() 112 | input_token_count = input_ids.shape[1] 113 | 114 | finish_reason = "stop" 115 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 116 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 117 | check_stop_token, stop_conditions = self.build_stop_conditions(config["stop_on"]) 118 
| stopping_criteria = KeywordsStoppingCriteria(stop_conditions, model["tokenizer"], input_ids) 119 | 120 | if debug: 121 | print('\033[94m') 122 | print(json.dumps(request, indent=2)) 123 | print(prompt) 124 | print('\033[0m') 125 | 126 | begin_time = time.time() 127 | with torch.inference_mode(): 128 | 129 | do_sample = True if seed == -1 else False 130 | incoming_headers = model["amqp_headers"] 131 | outgoing_properties = self.copy_queue_headers(incoming_headers) 132 | 133 | if stream_output: 134 | streamer = AmqpStreamer(model["tokenizer"], model["amqp_channel"], incoming_headers, outgoing_properties, model, self.check_stop_generation, debug) 135 | try: 136 | output_ids = model["model_loaded"].generate( 137 | input_ids, 138 | images=image_tensor, 139 | do_sample=do_sample, 140 | temperature=temperature, 141 | max_new_tokens=max_new_tokens, 142 | streamer=streamer, 143 | top_k=int(top_k * 100), 144 | top_p=top_p, 145 | use_cache=True, 146 | stopping_criteria=[stopping_criteria]) 147 | except Exception as e: 148 | print(e) 149 | pass 150 | else: 151 | output_ids = model["model_loaded"].generate( 152 | input_ids, 153 | images=image_tensor, 154 | do_sample=do_sample, 155 | top_k=int(top_k * 100), 156 | top_p=top_p, 157 | temperature=temperature, 158 | max_new_tokens=max_new_tokens, 159 | use_cache=True, 160 | stopping_criteria=[stopping_criteria]) 161 | 162 | new_tokens = streamer.get_new_tokens() if stream_output == True else output_ids.shape[1] - input_token_count 163 | end_time = time.time() 164 | elapsed = end_time - begin_time 165 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 166 | 167 | response = model["tokenizer"].decode(output_ids[0, input_ids.shape[1]:]) if stream_output == False else "" 168 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 169 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 170 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 171 | 172 | def load(self, model, model_options, local_path): 173 | self.config = model 174 | self.model_config = model["configuration"] 175 | # tried to used both of these but they do not work, 4-bit fails to load and 8-bit outputs random shit 176 | load_4bit = False 177 | load_8bit = True if model_options["use_precision"] == "8-bit" else False 178 | 179 | try: 180 | model_name = get_model_name_from_path(local_path) 181 | base_model = f"data/models/{model['model'][1]['name']}" 182 | tokenizer, model, image_processor, context_len = load_pretrained_model(local_path, base_model, model_name, load_8bit, load_4bit, device=model_options["device"]) 183 | self.tokenizer = tokenizer 184 | return {"model_loaded":model, "tokenizer": tokenizer, "image_processor": image_processor, "device": model_options["device"]} 185 | except Exception as e: 186 | logger.error(f"error loading model") 187 | load_error = True 188 | print(e) 189 | return { "error": load_error } 190 | -------------------------------------------------------------------------------- /modules/haotian-liu/llava/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "LLaVA 1.5", 3 | "description": "Handler for loading LLaVA 1.5 visual LLM models.", 4 | "script": "golem-generator.py", 5 | "unique_key": "llava_15", 6 | "supported_gpu": ["nvidia"], 7 | "repository": [ 8 | { 9 | "url": "https://github.com/haotian-liu/LLaVA", 10 | "folder": "haotian-liu/llava" 11 | } 12 | ], 13 | "skills": [ 14 | { 15 | "label": 
"LLaVA 7b v1.5", 16 | "shortcut": "👀", 17 | "moe_function": [ 18 | "This function takes an image as input and provides a detailed description of it.", 19 | "This function provides a description of an image." 20 | ], 21 | "routing_key": "llava_7b_v1_5", 22 | "use": ["visual_language_model"], 23 | "available_precision": { "cuda": ["full"] }, 24 | "memory_usage": { "full": 16000 }, 25 | "model": [ 26 | { 27 | "name": "liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-7b-v1.5", 28 | "provider": "huggingface" 29 | }, 30 | { 31 | "name": "liuhaotian/llava-v1.5-7b", 32 | "provider": "huggingface" 33 | } 34 | ], 35 | "configuration": { 36 | "max_seq_len": 4096, 37 | "user_role": "USER:", 38 | "ai_role": "ASSISTANT:", 39 | "stop_on": ["", ""], 40 | "system_message": "ASSISTANT: A chat between a curious human and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the human's questions.\n", 41 | "prompt_format": "{user_role} {prompt} {ai_role} {response}\n" 42 | } 43 | }, 44 | { 45 | "label": "LLaVA 13b v1.5", 46 | "shortcut": "👀", 47 | "moe_function": [ 48 | "This function takes an image as input and provides a detailed description of it.", 49 | "This function provides a description of an image." 50 | ], 51 | "routing_key": "llava_13b_v1_5", 52 | "use": ["visual_language_model"], 53 | "available_precision": { "cuda": ["full"] }, 54 | "memory_usage": { "full": 26000 }, 55 | "model": [ 56 | { 57 | "name": "liuhaotian/llava-v1.5-mlp2x-336px-pretrain-vicuna-13b-v1.5", 58 | "provider": "huggingface" 59 | }, 60 | { 61 | "name": "liuhaotian/llava-v1.5-13b", 62 | "provider": "huggingface" 63 | } 64 | ], 65 | "configuration": { 66 | "max_seq_len": 4096, 67 | "user_role": "USER:", 68 | "ai_role": "ASSISTANT:", 69 | "stop_on": ["", ""], 70 | "system_message": "ASSISTANT: A chat between a curious human and an artificial intelligence assistant. 
The assistant gives helpful, detailed, and polite answers to the human's questions.\n", 71 | "prompt_format": "{user_role} {prompt} {ai_role} {response}\n" 72 | } 73 | } 74 | ] 75 | } 76 | -------------------------------------------------------------------------------- /modules/hf-pipeline/asr/asr.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | import logging 4 | 5 | logger = logging.getLogger(__name__) 6 | 7 | class AsrPipeline(BaseHandler): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def validate(self, request): 12 | is_valid, errors = self.validate_request(request, 'audio-url') 13 | return is_valid, errors 14 | 15 | def execute(self, model, request): 16 | audio_url = request["audio_url"] 17 | result = { "text": model["model"](audio_url, max_new_tokens=1024)["text"] } 18 | logger.info(f"asr extracted text: {result['text']}") 19 | return result 20 | 21 | def load(self, model, model_options, local_path): 22 | asr_model = pipeline("automatic-speech-recognition", model=local_path) 23 | return {"model": asr_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/asr/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Automatic Speech Recognition", 3 | "description": "Handler for loading any ASR model that is compatible with HuggingFace ASR pipeline.", 4 | "script": "asr.py", 5 | "unique_key": "hf_asr", 6 | "skills": [ 7 | { 8 | "name": "openai/whisper-tiny", 9 | "label": "Whisper Tiny", 10 | "routing_key": "whisper_tiny", 11 | "use": ["automatic_speech_recognition"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 1700 }, 14 | "model": [{ 15 | "name": "openai/whisper-tiny", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "openai/whisper-tiny.en", 21 | "label": "Whisper Tiny En", 22 | "routing_key": "whisper_tiny_en", 23 | "use": ["automatic_speech_recognition"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1700 }, 26 | "model": [{ 27 | "name": "openai/whisper-tiny.en", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "openai/whisper-base", 33 | "label": "Whisper Base", 34 | "routing_key": "whisper_base", 35 | "use": ["automatic_speech_recognition"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 1850 }, 38 | "model": [{ 39 | "name": "openai/whisper-base", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "openai/whisper-base.en", 45 | "label": "Whisper Base En", 46 | "routing_key": "whisper_base_en", 47 | "use": ["automatic_speech_recognition"], 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 1850 }, 50 | "model": [{ 51 | "name": "openai/whisper-base.en", 52 | "provider": "huggingface" 53 | }] 54 | }, 55 | { 56 | "name": "openai/whisper-small", 57 | "label": "Whisper Small", 58 | "routing_key": "whisper_small", 59 | "use": ["automatic_speech_recognition"], 60 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 61 | "memory_usage": { "full": 2700 }, 62 | "model": [{ 63 | "name": "openai/whisper-small", 64 | "provider": "huggingface" 65 | }] 66 | }, 67 | { 68 | "name": 
"openai/whisper-small.en", 69 | "label": "Whisper Small En", 70 | "routing_key": "whisper_small_en", 71 | "use": ["automatic_speech_recognition"], 72 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 73 | "memory_usage": { "full": 2700 }, 74 | "model": [{ 75 | "name": "openai/whisper-small.en", 76 | "provider": "huggingface" 77 | }] 78 | }, 79 | { 80 | "name": "openai/whisper-medium", 81 | "label": "Whisper Medium", 82 | "routing_key": "whisper_medium", 83 | "use": ["automatic_speech_recognition"], 84 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 85 | "memory_usage": { "full": 4800 }, 86 | "model": [{ 87 | "name": "openai/whisper-medium", 88 | "provider": "huggingface" 89 | }] 90 | }, 91 | { 92 | "name": "openai/whisper-medium.en", 93 | "label": "Whisper Medium En", 94 | "routing_key": "whisper_medium_en", 95 | "use": ["automatic_speech_recognition"], 96 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 97 | "memory_usage": { "full": 4800 }, 98 | "model": [{ 99 | "name": "openai/whisper-medium.en", 100 | "provider": "huggingface" 101 | }] 102 | }, 103 | { 104 | "name": "openai/whisper-large-v2", 105 | "label": "Whisper Large v2", 106 | "routing_key": "whisper_large_v2", 107 | "use": ["automatic_speech_recognition"], 108 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 109 | "memory_usage": { "full": 8100 }, 110 | "model": [{ 111 | "name": "openai/whisper-large-v2", 112 | "provider": "huggingface" 113 | }] 114 | }, 115 | { 116 | "name": "microsoft/speecht5_asr", 117 | "label": "Microsoft Speech T5", 118 | "routing_key": "speecht5_asr", 119 | "use": ["automatic_speech_recognition"], 120 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 121 | "memory_usage": { "full": 2300 }, 122 | "model": [{ 123 | "name": "microsoft/speecht5_asr", 124 | "provider": "huggingface" 125 | }] 126 | } 127 | ] 128 | } 129 | -------------------------------------------------------------------------------- /modules/hf-pipeline/image-class/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Image Classification", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Image Classification pipeline.", 4 | "script": "image-class.py", 5 | "unique_key": "hf_image_class", 6 | "skills": [ 7 | { 8 | "name": "microsoft/resnet-50", 9 | "label": "Resnet 50", 10 | "routing_key": "resnet_50", 11 | "use": ["image_classification"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 1800 }, 14 | "model": [{ 15 | "name": "microsoft/resnet-50", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "microsoft/resnet-18", 21 | "label": "Resnet 18", 22 | "routing_key": "resnet_18", 23 | "use": ["image_classification"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1500 }, 26 | "model": [{ 27 | "name": "microsoft/resnet-18", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "google/vit-base-patch16-224", 33 | "label": "Vit Base Patch16 224", 34 | "routing_key": "vit_base_patch16_224", 35 | "use": ["image_classification"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 2000 }, 38 | "model": [{ 39 | "name": "google/vit-base-patch16-224", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "google/efficientnet-b0", 45 | "label": "Efficientnet B0", 46 | "routing_key": "efficientnet_b0", 
47 | "use": ["image_classification"], 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 1800 }, 50 | "model": [{ 51 | "name": "google/efficientnet-b0", 52 | "provider": "huggingface" 53 | }] 54 | }, 55 | { 56 | "name": "google/efficientnet-b7", 57 | "label": "Efficientnet B7", 58 | "routing_key": "efficientnet_b7", 59 | "use": ["image_classification"], 60 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 61 | "memory_usage": { "full": 2000 }, 62 | "model": [{ 63 | "name": "google/efficientnet-b7", 64 | "provider": "huggingface" 65 | }] 66 | }, 67 | { 68 | "name": "microsoft/beit-base-patch16-224-pt22k-ft22k", 69 | "label": "Beit Base Patch16", 70 | "routing_key": "beit_base_patch16", 71 | "use": ["image_classification"], 72 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 73 | "memory_usage": { "full": 1800 }, 74 | "model": [{ 75 | "name": "microsoft/beit-base-patch16-224-pt22k-ft22k", 76 | "provider": "huggingface" 77 | }] 78 | } 79 | ] 80 | } 81 | -------------------------------------------------------------------------------- /modules/hf-pipeline/image-class/image-class.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | 4 | class ImageClassificationPipeline(BaseHandler): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | def validate(self, request): 9 | is_valid, errors = self.validate_request(request, 'img-url') 10 | return is_valid, errors 11 | 12 | def execute(self, model, request): 13 | img_url = request["img_url"] 14 | result = { "classes": model["model"](img_url) } 15 | return result 16 | 17 | def load(self, model, model_options, local_path): 18 | img_model = pipeline("image-classification", model=local_path) 19 | return {"model": img_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/image-to-text/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Image to Text", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Image to Text pipeline.", 4 | "unique_key": "hf_image_to_text", 5 | "script": "image-to-text.py", 6 | "skills": [ 7 | { 8 | "name": "nlpconnect/vit-gpt2-image-captioning", 9 | "use": ["image_captioning"], 10 | "label": "ViT GPT2 Image Captioning", 11 | "routing_key": "vit_gpt2_image_captioning", 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2400 }, 14 | "model": [{ 15 | "name": "nlpconnect/vit-gpt2-image-captioning", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "ydshieh/vit-gpt2-coco-en", 21 | "use": ["image_captioning"], 22 | "label": "ViT GPT2 CoCo En", 23 | "routing_key": "vit_gpt2_coco_en", 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2400 }, 26 | "model": [{ 27 | "name": "ydshieh/vit-gpt2-coco-en", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "Salesforce/blip-image-captioning-base", 33 | "use": ["image_captioning"], 34 | "label": "Blip Image Captioning Base", 35 | "routing_key": "blip_image_captioning_base", 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 2500 }, 38 | "model": [{ 39 | "name": 
"Salesforce/blip-image-captioning-base", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "Salesforce/blip-image-captioning-large", 45 | "use": ["image_captioning"], 46 | "label": "Blip Image Captioning Large", 47 | "routing_key": "blip_image_captioning_large", 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 3300 }, 50 | "model": [{ 51 | "name": "Salesforce/blip-image-captioning-large", 52 | "provider": "huggingface" 53 | }] 54 | } 55 | ] 56 | } -------------------------------------------------------------------------------- /modules/hf-pipeline/image-to-text/image-to-text.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from transformers import pipeline 4 | 5 | class ImageToTextPipeline(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def validate(self, request): 10 | is_valid, errors = self.validate_request(request, 'img-url') 11 | return is_valid, errors 12 | 13 | def execute(self, model, request): 14 | predict = model["model"](request["img_url"]) 15 | return predict 16 | 17 | def load(self, model, model_options, local_path): 18 | vit_model = pipeline("image-to-text", model=local_path) 19 | return {"model": vit_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/object-detection/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Object Detection", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Object Detection pipeline.", 4 | "unique_key": "hf_object_detection", 5 | "script": "object-detection.py", 6 | "skills": [ 7 | { 8 | "name": "facebook/detr-resnet-101", 9 | "label": "Detr Resnet 101", 10 | "routing_key": "detr_resnet_101", 11 | "use": ["object_detection"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2100 }, 14 | "model": [{ 15 | "name": "facebook/detr-resnet-101", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "facebook/detr-resnet-50", 21 | "label": "Detr Resnet 50", 22 | "routing_key": "detr_resnet_50", 23 | "use": ["object_detection"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1900 }, 26 | "model": [{ 27 | "name": "facebook/detr-resnet-50", 28 | "provider": "huggingface" 29 | }] 30 | }, 31 | { 32 | "name": "hustvl/yolos-tiny", 33 | "label": "Yolos Tiny", 34 | "routing_key": "yolos_tiny", 35 | "use": ["object_detection"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 1600 }, 38 | "model": [{ 39 | "name": "hustvl/yolos-tiny", 40 | "provider": "huggingface" 41 | }] 42 | }, 43 | { 44 | "name": "hustvl/yolos-small", 45 | "label": "Yolos Small", 46 | "routing_key": "yolos_small", 47 | "use": ["object_detection"], 48 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 49 | "memory_usage": { "full": 2200 }, 50 | "model": [{ 51 | "name": "hustvl/yolos-small", 52 | "provider": "huggingface" 53 | }] 54 | } 55 | ] 56 | } 57 | -------------------------------------------------------------------------------- /modules/hf-pipeline/object-detection/object-detection.py: 
-------------------------------------------------------------------------------- 1 | import requests 2 | from PIL import Image, ImageDraw, ImageFont 3 | import io 4 | import base64 5 | from diffusers.utils import load_image 6 | from application.base_handler import BaseHandler 7 | from transformers import pipeline 8 | 9 | class ObjectDetectionPipeline(BaseHandler): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def validate(self, request): 14 | is_valid, errors = self.validate_request(request, 'img-url') 15 | return is_valid, errors 16 | 17 | def image_with_boxes(self, img_url, detections): 18 | # Fetch the image 19 | response = requests.get(img_url) 20 | img = Image.open(io.BytesIO(response.content)) 21 | 22 | # Prepare for drawing on the image 23 | draw = ImageDraw.Draw(img) 24 | font = ImageFont.load_default() 25 | #font = ImageFont.truetype("arial.ttf", 15) 26 | 27 | # Draw boxes and labels 28 | for detection in detections: 29 | label = detection['label'] 30 | box = detection['box'] 31 | score = detection['score'] 32 | xmin = box['xmin'] 33 | ymin = box['ymin'] 34 | xmax = box['xmax'] 35 | ymax = box['ymax'] 36 | draw.rectangle([(xmin, ymin), (xmax, ymax)], outline="red") 37 | 38 | # Draw the label with its score 39 | text = f"{label} {score:.2f}" 40 | draw.text((xmin, ymin - 20), text, font=font, fill="red") 41 | 42 | # Convert the modified image to base64 43 | buffered = io.BytesIO() 44 | img.save(buffered, format="PNG") 45 | img_base64 = base64.b64encode(buffered.getvalue()).decode() 46 | 47 | return img_base64 48 | 49 | def execute(self, model, request): 50 | detections = model["model"](request["img_url"]) 51 | img_base64 = self.image_with_boxes(request["img_url"], detections) 52 | return { "objects": detections, "image": img_base64 } 53 | 54 | def load(self, model, model_options, local_path): 55 | detr_model = pipeline("object-detection", model=local_path) 56 | return {"model": detr_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/visual-question-answering/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Visual QA Handler", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Visual QA pipeline.", 4 | "unique_key": "hf_vqa", 5 | "script": "visual-question-answering.py", 6 | "skills": [ 7 | { 8 | "name": "dandelin/vilt-b32-finetuned-vqa", 9 | "label": "Vilt B32 Finetuned VQA", 10 | "routing_key": "vilt_b32_finetuned_vqa", 11 | "use": ["visual_qa"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 1900 }, 14 | "model": [{ 15 | "name": "dandelin/vilt-b32-finetuned-vqa", 16 | "provider": "huggingface" 17 | }] 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /modules/hf-pipeline/visual-question-answering/visual-question-answering.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | 4 | class VisualQaPipeline(BaseHandler): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | def validate(self, request): 9 | is_valid, errors = self.validate_request(request, 'visual-qa') 10 | return is_valid, errors 11 | 12 | def execute(self, model, request): 13 | text = request["text"] 14 | predict = 
model["model"](question=text, image=request["img_url"]) 15 | return predict 16 | 17 | def load(self, model, model_options, local_path): 18 | vit_model = pipeline("visual-question-answering", model=local_path) 19 | return {"model": vit_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-image-class/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Zero Shot Image Classification", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Zero Short Image Classification pipeline.", 4 | "unique_key": "hf_zeroshot_image_class", 5 | "script": "zero-shot-image-class.py", 6 | "skills": [ 7 | { 8 | "name": "openai/clip-vit-large-patch14", 9 | "label": "Clip ViT Large Patch14", 10 | "routing_key": "clip_vit_large_patch14", 11 | "use": ["zero_shot_image_classification"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 3100 }, 14 | "model": [{ 15 | "name": "openai/clip-vit-large-patch14", 16 | "provider": "huggingface" 17 | }] 18 | } 19 | ] 20 | } 21 | -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-image-class/zero-shot-image-class.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import pipeline 3 | 4 | class ZeroShotImageClassPipeline(BaseHandler): 5 | def __init__(self): 6 | super().__init__() 7 | 8 | def validate(self, request): 9 | is_valid, errors = self.validate_request(request, 'zero-shot-img') 10 | return is_valid, errors 11 | 12 | def execute(self, model, request): 13 | labels = request["labels"] 14 | predict = model["model"](request["img_url"], candidate_labels = labels) 15 | return predict 16 | 17 | def load(self, model, model_options, local_path): 18 | clip_model = pipeline("zero-shot-image-classification", model=local_path) 19 | return {"model": clip_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-object-detection/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Zero Shot Object Detection", 3 | "description": "Handler for loading any model that is compatible with HuggingFace Zero Short Object Detection pipeline.", 4 | "unique_key": "hf_zeroshot_object_detection", 5 | "script": "zero-shot-object-detection.py", 6 | "skills": [ 7 | { 8 | "name": "google/owlvit-base-patch32", 9 | "label": "Owlvit Base Patch32", 10 | "routing_key": "owlvit_base_patch32", 11 | "use": ["zero_shot_object_detection"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2100 }, 14 | "model": [{ 15 | "name": "google/owlvit-base-patch32", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "google/owlvit-base-patch16", 21 | "label": "Owlvit Base Patch16", 22 | "routing_key": "owlvit_base_patch16", 23 | "use": ["zero_shot_object_detection"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2600 }, 26 | "model": [{ 27 | "name": "google/owlvit-base-patch16", 28 | "provider": "huggingface" 
29 | }] 30 | }, 31 | { 32 | "name": "google/owlvit-large-patch14", 33 | "label": "Owlvit Large Patch14", 34 | "routing_key": "owlvit_large_patch14", 35 | "use": ["zero_shot_object_detection"], 36 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 37 | "memory_usage": { "full": 5600 }, 38 | "model": [{ 39 | "name": "google/owlvit-large-patch14", 40 | "provider": "huggingface" 41 | }] 42 | } 43 | ] 44 | } 45 | -------------------------------------------------------------------------------- /modules/hf-pipeline/zero-shot-object-detection/zero-shot-object-detection.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from transformers import pipeline 4 | 5 | class ZeroShotObjectDetectionPipeline(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def validate(self, request): 10 | is_valid, errors = self.validate_request(request, 'zero-shot-img') 11 | return is_valid, errors 12 | 13 | def execute(self, model, request): 14 | labels = request["labels"] 15 | predict = model["model"](request["img_url"], candidate_labels = labels) 16 | return predict 17 | 18 | def load(self, model, model_options, local_path): 19 | object_model = pipeline("zero-shot-object-detection", model=local_path) 20 | return {"model": object_model, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/hkunlp/instructor/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Instructor Embeddings", 3 | "description": "Handler for loading instructor embedding models.", 4 | "unique_key": "instructor", 5 | "script": "instructor.py", 6 | "skills": [ 7 | { 8 | "name": "hkunlp/instructor-xl", 9 | "label": "Instructor Xl", 10 | "routing_key": "instructor_xl", 11 | "use": ["text_embedding"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 5900 }, 14 | "model": [{ 15 | "name": "hkunlp/instructor-xl", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "hkunlp/instructor-large", 21 | "label": "Instructor Large", 22 | "routing_key": "instructor_large", 23 | "use": ["text_embedding"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 2500 }, 26 | "model": [{ 27 | "name": "hkunlp/instructor-large", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/hkunlp/instructor/instructor.py: -------------------------------------------------------------------------------- 1 | from diffusers.utils import load_image 2 | from application.base_handler import BaseHandler 3 | from InstructorEmbedding import INSTRUCTOR 4 | 5 | class Instructor(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def validate(self, request): 10 | is_valid, errors = self.validate_request(request, 'instructor') 11 | return is_valid, errors 12 | 13 | def execute(self, model, request): 14 | text = request["text"] 15 | instruction = request["instruction"] 16 | embedding = model["model"].encode([[instruction,text]]) 17 | result = {"embedding": embedding.tolist()} 18 | return result 19 | 20 | def load(self, model, model_options, local_path): 21 | return {"model": INSTRUCTOR(local_path), "device": 
model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/intfloat/e5-v2/e5-v2.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import AutoTokenizer, AutoModel 3 | from torch import Tensor 4 | 5 | class E5V2(BaseHandler): 6 | def __init__(self): 7 | super().__init__() 8 | 9 | def average_pool(self, last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor: 10 | last_hidden = last_hidden_states.masked_fill(~attention_mask[..., None].bool(), 0.0) 11 | return last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None] 12 | 13 | def validate(self, request): 14 | return True, [] 15 | 16 | def execute(self, model, request): 17 | text = request["text"] 18 | if type(text) is str: 19 | text = [text] 20 | 21 | ret_embed = {} 22 | for embed in text: 23 | batch_dict = model["tokenizer"](f"query: {embed}", max_length=512, padding=True, truncation=True, return_tensors='pt') 24 | for key in batch_dict: 25 | batch_dict[key] = batch_dict[key].to(model["device"]) 26 | 27 | outputs = model["model"](**batch_dict) 28 | embeddings = self.average_pool(outputs.last_hidden_state, batch_dict['attention_mask']) 29 | ret_embed[embed] = embeddings.tolist() 30 | 31 | return {"embeddings": ret_embed} 32 | 33 | def load(self, model, model_options, local_path): 34 | tokenizer = AutoTokenizer.from_pretrained(local_path) 35 | e5_model = AutoModel.from_pretrained(local_path) 36 | return {"model": e5_model, "tokenizer": tokenizer, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/intfloat/e5-v2/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "E5 v2 Embeddings", 3 | "description": "Handler for loading the E5 v2 embedding models.", 4 | "unique_key": "e5_v2", 5 | "script": "e5-v2.py", 6 | "skills": [ 7 | { 8 | "name": "intfloat/e5-large-v2", 9 | "label": "E5 Large v2", 10 | "routing_key": "e5_large_v2", 11 | "use": ["text_embedding"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2500 }, 14 | "model": [{ 15 | "name": "intfloat/e5-large-v2", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "intfloat/e5-base-v2", 21 | "label": "E5 Base v2", 22 | "routing_key": "e5_base_v2", 23 | "use": ["text_embedding"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 1600 }, 26 | "model": [{ 27 | "name": "intfloat/e5-base-v2", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/microsoft/git-textcaps/git-textcaps.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import AutoProcessor, AutoModelForCausalLM 3 | from PIL import Image 4 | import requests 5 | 6 | class GitTextCaption(BaseHandler): 7 | def __init__(self): 8 | super().__init__() 9 | 10 | def validate(self, request): 11 | is_valid, errors = self.validate_request(request, 'img-url') 12 | return is_valid, errors 13 | 14 | def execute(self, model, request): 15 | image = Image.open(requests.get(request["img_url"], 
stream=True).raw) 16 | pixel_values = model["processor"](images=image, return_tensors="pt").to(model["device"]).pixel_values 17 | generated_ids = model["model"].generate(pixel_values=pixel_values, max_length=256) 18 | generated_caption = model["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0] 19 | return {"text": generated_caption} 20 | 21 | def load(self, model, model_options, local_path): 22 | processor = AutoProcessor.from_pretrained(local_path) 23 | git_model = AutoModelForCausalLM.from_pretrained(local_path) 24 | return {"model": git_model, "processor": processor, "device": model_options["device"], "device_memory": model["memory_usage"][model_options["use_precision"]]} -------------------------------------------------------------------------------- /modules/microsoft/git-textcaps/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "GIT Text Captions", 3 | "description": "Handler for loading Microsoft GIT text caption models.", 4 | "unique_key": "git_textcaps", 5 | "script": "git-textcaps.py", 6 | "skills": [ 7 | { 8 | "name": "microsoft/git-base-textcaps", 9 | "label": "GiT Base Textcaps", 10 | "routing_key": "git_base_textcaps", 11 | "use": ["image_captioning"], 12 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 2200 }, 14 | "model": [{ 15 | "name": "microsoft/git-base-textcaps", 16 | "provider": "huggingface" 17 | }] 18 | }, 19 | { 20 | "name": "microsoft/git-large-textcaps", 21 | "label": "GiT Large Textcaps", 22 | "routing_key": "git_large_textcaps", 23 | "use": ["image_captioning"], 24 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 25 | "memory_usage": { "full": 3000 }, 26 | "model": [{ 27 | "name": "microsoft/git-large-textcaps", 28 | "provider": "huggingface" 29 | }] 30 | } 31 | ] 32 | } 33 | -------------------------------------------------------------------------------- /modules/noco-ai/bark-tts/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Bark Text to Speech", 3 | "description": "Handler for loading Bark text to speech models.", 4 | "unique_key": "bark", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "Bark Small", 9 | "routing_key": "bark_small", 10 | "use": ["text_to_speech"], 11 | "available_precision": { "cuda": ["full"] }, 12 | "memory_usage": { "full": 2500 }, 13 | "model": [{ 14 | "name": "suno/bark-small", 15 | "provider": "huggingface" 16 | }], 17 | "configuration": { 18 | "progress_label": "Generating Speech" 19 | } 20 | }, 21 | { 22 | "label": "Bark Large", 23 | "routing_key": "bark_large", 24 | "use": ["text_to_speech"], 25 | "available_precision": { "cuda": ["full"] }, 26 | "memory_usage": { "full": 5200 }, 27 | "model": [{ 28 | "name": "suno/bark", 29 | "provider": "huggingface" 30 | }], 31 | "configuration": { 32 | "progress_label": "Generating Speech" 33 | } 34 | } 35 | ] 36 | } -------------------------------------------------------------------------------- /modules/noco-ai/bark-tts/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from transformers import AutoProcessor, AutoModel 3 | import torch 4 | import base64 5 | from io import BytesIO 6 | import scipy 7 | import copy 8 | from application.progress_streamer import ProgressStreamer 9 | import logging 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class 
BarkHandler(BaseHandler): 14 | def __init__(self): 15 | self.progress_streamer = ProgressStreamer() 16 | super().__init__() 17 | 18 | def validate(self, request): 19 | is_valid, errors = self.validate_request(request, 'voice-gen') 20 | return is_valid, errors 21 | 22 | def execute(self, model, request): 23 | prompt = request.get("prompt", "") 24 | send_progress = request.get("progress", True) 25 | voice_preset = request.get("voice", "v2/en_speaker_1") 26 | prompt_length = len(prompt) 27 | 28 | if voice_preset == "default": 29 | voice_preset = "v2/en_speaker_1" 30 | 31 | if send_progress: 32 | progress_headers = copy.deepcopy(model["amqp_headers"]) 33 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 34 | amqp_config = { 35 | "headers": progress_headers, 36 | "outgoing_properties": outgoing_properties, 37 | "channel": model["amqp_channel"] 38 | } 39 | self.progress_streamer.configure(prompt_length * 25, self.model_config["progress_label"], self.routing_key, amqp_config, False) 40 | else: 41 | self.progress_streamer.configure(prompt_length * 25, self.model_config["progress_label"], self.routing_key, None, False) 42 | 43 | # Assuming the model function can take these parameters: 44 | logger.info(f"prompt: {prompt}, voice: {voice_preset}, length: {prompt_length}") 45 | inputs = model["processor"]( 46 | text=[prompt], 47 | voice_preset=voice_preset, 48 | return_tensors="pt", 49 | ).to(model["device"]) 50 | speech_values = model["model"].generate(**inputs, do_sample=True, streamer=self.progress_streamer) 51 | 52 | # Save image to an in-memory bytes buffer 53 | buffered = BytesIO() 54 | sampling_rate = model["model"].generation_config.sample_rate 55 | scipy.io.wavfile.write(buffered, rate=sampling_rate, data=speech_values.cpu().numpy().squeeze()) 56 | 57 | # Convert bytes buffer to a base64-encoded string 58 | wav_str = base64.b64encode(buffered.getvalue()).decode() 59 | return {"wav": wav_str} 60 | 61 | def load(self, model, model_options, local_path): 62 | self.model_config = model["configuration"] 63 | self.routing_key = model["routing_key"] 64 | processor = AutoProcessor.from_pretrained(local_path) 65 | load_model = AutoModel.from_pretrained(local_path) 66 | 67 | return { 68 | "model": load_model, 69 | "processor": processor, 70 | "device": model_options["device"], 71 | "device_memory": model["memory_usage"][model_options["use_precision"]] 72 | } 73 | -------------------------------------------------------------------------------- /modules/noco-ai/image-generator/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Stable Diffusion v1.5", 3 | "description": "Handler for loading Stable Diffusion v1.5 models.", 4 | "unique_key": "sd_15", 5 | "script": "handler.py", 6 | "supported_gpu": ["nvidia"], 7 | "skills": [ 8 | { 9 | "label": "Stable Diffusion v1.5", 10 | "routing_key": "stable_diffusion_v15", 11 | "use": ["image_generation"], 12 | "available_precision": { "cuda": ["full"] }, 13 | "memory_usage": { "full": 2400 }, 14 | "model": [{ 15 | "name": "runwayml/stable-diffusion-v1-5", 16 | "provider": "huggingface" 17 | }], 18 | "configuration": { 19 | "progress_label": "Generating Image" 20 | } 21 | }, 22 | { 23 | "label": "DreamShaper", 24 | "routing_key": "dream_shaper_image_gen", 25 | "use": ["image_generation"], 26 | "available_precision": { "cuda": ["full"] }, 27 | "memory_usage": { "full": 2400 }, 28 | "model": [{ 29 | "name": "civitai/dreamshaper/128713.safetensors", 30 | "provider": "civitai", 
31 | "url": "https://civitai.com/api/download/models/128713?type=Model&format=SafeTensor&size=pruned&fp=fp16" 32 | }], 33 | "configuration": { 34 | "progress_label": "Generating Image" 35 | } 36 | } 37 | ] 38 | } -------------------------------------------------------------------------------- /modules/noco-ai/image-generator/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from diffusers import StableDiffusionPipeline, KDPM2DiscreteScheduler 3 | import torch 4 | import base64 5 | from io import BytesIO 6 | import logging 7 | import json 8 | import copy 9 | from compel import Compel 10 | 11 | logger = logging.getLogger(__name__) 12 | 13 | class StableDiffusion(BaseHandler): 14 | def __init__(self): 15 | super().__init__() 16 | 17 | def validate(self, request): 18 | is_valid, errors = self.validate_request(request, 'img-gen') 19 | return is_valid, errors 20 | 21 | def step_callback(self, pipeline: StableDiffusionPipeline, step: int, timestep: int, callback_kwargs): 22 | if self.stream_progress == False: 23 | return callback_kwargs 24 | 25 | self.current_step = self.current_step + 1 26 | label = self.model_configuration["progress_label"] if "progress_label" in self.model_configuration else self.routing_key 27 | send_body = { 28 | "total": self.total_steps, 29 | "current": self.current_step, 30 | "label": label, 31 | "model": self.routing_key 32 | } 33 | 34 | self.amqp_progress_config["channel"].basic_publish( 35 | exchange=self.amqp_progress_config["headers"]['return_exchange'], 36 | routing_key=self.amqp_progress_config["headers"]['return_routing_key'], 37 | body=json.dumps(send_body), properties=self.amqp_progress_config["outgoing_properties"]) 38 | 39 | return callback_kwargs 40 | 41 | def get_latents(self, num_images=1, height=512, width=512, user_seed=-1, device="cuda:0", model=None): 42 | latents = None 43 | generator = torch.Generator(device=device) 44 | if user_seed == -1: 45 | seed = generator.seed() 46 | else: 47 | seed = user_seed 48 | generator = generator.manual_seed(seed) 49 | 50 | latents = torch.randn( 51 | (num_images, model.unet.in_channels, height // 8, width // 8), 52 | generator = generator, 53 | device = device, 54 | dtype = torch.float16 55 | ) 56 | return { "seed": seed, "latents": latents } 57 | 58 | def execute(self, model, request): 59 | prompt = request.get("prompt", "") 60 | height = request.get("height", 512) 61 | width = request.get("width", 512) 62 | steps = request.get("steps", 50) 63 | seed = request.get("seed", -1) 64 | self.stream_progress = request.get("progress", False) 65 | negative_prompt = request.get("negative_prompt", "") 66 | guidance_scale = request.get("guidance_scale", 7.5) 67 | num_images_per_prompt = 1 68 | 69 | if self.stream_progress == True: 70 | progress_headers = copy.deepcopy(model["amqp_headers"]) 71 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 72 | self.amqp_progress_config = { 73 | "headers": progress_headers, 74 | "outgoing_properties": outgoing_properties, 75 | "channel": model["amqp_channel"] 76 | } 77 | self.current_step = 0 78 | self.total_steps = steps * 2 79 | 80 | latent_data = self.get_latents(num_images_per_prompt, height, width, seed, self.model_options["device"], model["model"]) 81 | logger.info(f"prompt: {prompt}, height: {height}, width: {width}, steps: {steps}, guidance scale: {guidance_scale}, seed: {latent_data['seed']}") 82 | 83 | prompt_embeds = model["compel"](prompt) 84 | 
negative_prompt_embeds = model["compel"](negative_prompt) 85 | image = model["model"](prompt_embeds=prompt_embeds, height=height, width=width, num_inference_steps=steps, latents=latent_data["latents"], callback_on_step_end=self.step_callback, 86 | negative_prompt_embeds=negative_prompt_embeds, guidance_scale=guidance_scale, num_images_per_prompt=num_images_per_prompt).images[0] 87 | 88 | buffered = BytesIO() 89 | image.save(buffered, format="PNG") 90 | 91 | # Convert bytes buffer to a base64-encoded string 92 | img_str = base64.b64encode(buffered.getvalue()).decode() 93 | return {"image": img_str, "seed": latent_data["seed"], "guidance_scale": guidance_scale, "steps": steps } 94 | 95 | def load(self, model, model_options, local_path): 96 | self.model_options = model_options 97 | self.routing_key = model["routing_key"] 98 | self.model_configuration = model["configuration"] 99 | 100 | try: 101 | if "civitai" not in local_path: 102 | logger.info("loading standard sd model") 103 | load_model = StableDiffusionPipeline.from_pretrained(local_path, torch_dtype=torch.float16, safety_checker=None) 104 | else: 105 | logger.info("loading civit sd model") 106 | load_model = StableDiffusionPipeline.from_single_file(local_path, load_safety_checker=False, torch_dtype=torch.float16) 107 | 108 | load_model.scheduler = KDPM2DiscreteScheduler.from_config(load_model.scheduler.config) 109 | compel = Compel(tokenizer=load_model.tokenizer, text_encoder=load_model.text_encoder) 110 | 111 | return { 112 | "model": load_model, 113 | "device": model_options["device"], 114 | "device_memory": model["memory_usage"][model_options["use_precision"]], 115 | "compel": compel 116 | } 117 | 118 | except Exception as e: 119 | print(f"error loading sd model") 120 | print(e) 121 | return { "error": True } 122 | -------------------------------------------------------------------------------- /modules/noco-ai/llama-cpp/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Llama CPP", 3 | "description": "Handler for loading Llama CPP models.", 4 | "unique_key": "llama_cpp", 5 | "script": "llama-cpp.py", 6 | "skills": [ 7 | { 8 | "label": "Llama2 7B Chat (GGUF)", 9 | "routing_key": "llama_7b_chat_gguf", 10 | "use": ["language_model"], 11 | "available_precision": { "cpu": ["4-bit", "6-bit"], "cuda": ["4-bit", "6-bit"] }, 12 | "memory_usage": { "4-bit": 6000, "6-bit": 8000 }, 13 | "model": [{ 14 | "name": "TheBloke/Llama-2-7B-chat-GGUF", 15 | "provider": "huggingface", 16 | "files": { 17 | "4-bit": "llama-2-7b-chat.Q4_0.gguf" 18 | } 19 | }], 20 | "configuration": { 21 | "model_layers": 43, 22 | "num_threads": -1, 23 | "model_type": "chat", 24 | "max_seq_len": 4096, 25 | "user_role": "[INST]", 26 | "ai_role": "[/INST]", 27 | "stop_on": ["", "[INST]", ""], 28 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 29 | "system_message": "You are an helpful assistant.", 30 | "prompt_format": "{user_role} {prompt} {ai_role} {response}" 31 | } 32 | }, 33 | { 34 | "label": "Llama2 13B Chat (GGUF)", 35 | "routing_key": "llama_13b_chat_gguf", 36 | "use": ["language_model"], 37 | "available_precision": { "cpu": ["4-bit", "6-bit"], "cuda": ["4-bit", "6-bit"] }, 38 | "memory_usage": { "4-bit": 11000, "6-bit": 13800 }, 39 | "model": [{ 40 | "name": "TheBloke/Llama-2-13B-chat-GGUF", 41 | "provider": "huggingface", 42 | "files": { 43 | "4-bit": "llama-2-13b-chat.Q4_0.gguf" 44 | } 45 | }], 46 | "configuration": { 47 | "model_layers": 43, 48 | "num_threads": 
-1, 49 | "model_type": "chat", 50 | "max_seq_len": 4096, 51 | "user_role": "[INST]", 52 | "ai_role": "[/INST]", 53 | "stop_on": ["", "[INST]", ""], 54 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 55 | "system_message": "You are an helpful assistant.", 56 | "prompt_format": "{user_role} {prompt} {ai_role} {response}" 57 | } 58 | }, 59 | { 60 | "label": "CodeLlama 34B Instruct (GGUF)", 61 | "routing_key": "llama_34b_instruct_gguf", 62 | "use": ["language_model"], 63 | "available_precision": { "cpu": ["4-bit", "6-bit"], "cuda": ["4-bit", "6-bit"] }, 64 | "memory_usage": { "4-bit": 22000, "6-bit": 31500 }, 65 | "model": [{ 66 | "name": "TheBloke/CodeLlama-34B-Instruct-GGUF", 67 | "provider": "huggingface", 68 | "files": { 69 | "4-bit": "codellama-34b-instruct.Q4_K_M.gguf", 70 | "6-bit": "codellama-34b-instruct.Q6_K.gguf" 71 | } 72 | }], 73 | "configuration": { 74 | "model_layers": 51, 75 | "num_threads": -1, 76 | "model_type": "instruct", 77 | "max_seq_len": 16384, 78 | "user_role": "[INST]", 79 | "ai_role": "[/INST]", 80 | "stop_on": ["", "[INST]", ""], 81 | "system_message": "", 82 | "prompt_format": "{user_role} {prompt} {ai_role} {response}\n\n" 83 | } 84 | }, 85 | { 86 | "label": "Mistral 7B Instruct (GGUF)", 87 | "routing_key": "mistral_7b_instruct_gguf", 88 | "use": ["language_model"], 89 | "available_precision": { "cuda": ["4-bit", "6-bit"] }, 90 | "memory_usage": { "4-bit": 6900, "6-bit": 11000 }, 91 | "model": [{ 92 | "name": "TheBloke/Mistral-7B-Instruct-v0.2-GGUF", 93 | "provider": "huggingface", 94 | "files": { 95 | "4-bit": "mistral-7b-instruct-v0.2.Q4_K_M.gguf", 96 | "6-bit": "mistral-7b-instruct-v0.2.Q6_K.gguf" 97 | } 98 | }], 99 | "configuration": { 100 | "model_layers": 35, 101 | "max_seq_len": 8192, 102 | "user_role": "[INST]", 103 | "ai_role": "[/INST]", 104 | "stop_on": ["", "### USER:"], 105 | "system_prompt_format": "[INST] {system_prompt} [/INST] ", 106 | "system_message": "", 107 | "prompt_format": "{user_role} {prompt} {ai_role} {response} " 108 | } 109 | }, 110 | { 111 | "label": "Llama2 70B Chat (GGUF)", 112 | "routing_key": "llama_70b_chat_gguf", 113 | "use": ["language_model"], 114 | "available_precision": { "cpu": ["4-bit", "5-bit", "6-bit"], "cuda": ["4-bit", "5-bit", "6-bit"] }, 115 | "memory_usage": { "4-bit": 41500, "5-bit": 48000, "6-bit": 60000 }, 116 | "model": [{ 117 | "name": "TheBloke/Llama-2-70B-chat-GGUF", 118 | "provider": "huggingface", 119 | "files": { 120 | "4-bit": "llama-2-70b-chat.Q4_K_M.gguf", 121 | "5-bit": "llama-2-70b-chat.Q5_K_M.gguf", 122 | "6-bit": "llama-2-70b-chat.Q6_K.gguf" 123 | }, 124 | "split": { 125 | "6-bit": ["-split-a", "-split-b"] 126 | } 127 | }], 128 | "configuration": { 129 | "model_layers": 83, 130 | "num_threads": -1, 131 | "model_type": "chat", 132 | "max_seq_len": 4096, 133 | "user_role": "[INST]", 134 | "ai_role": "[/INST]", 135 | "stop_on": ["", "[INST]", ""], 136 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 137 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 138 | "system_message": "You are an helpful assistant." 
139 | } 140 | }, 141 | { 142 | "label": "CodeLlama 70B Instruct (GGUF)", 143 | "routing_key": "codellama_70b_instruct_gguf", 144 | "use": ["language_model"], 145 | "available_precision": { "cpu": ["4-bit", "5-bit"], "cuda": ["4-bit", "5-bit"] }, 146 | "memory_usage": { "4-bit": 41500, "5-bit": 48000 }, 147 | "model": [{ 148 | "name": "LoneStriker/CodeLlama-70b-Instruct-hf-GGUF", 149 | "provider": "huggingface", 150 | "files": { 151 | "4-bit": "CodeLlama-70b-Instruct-hf-Q4_K_M.gguf" 152 | } 153 | }], 154 | "configuration": { 155 | "model_layers": 83, 156 | "num_threads": -1, 157 | "model_type": "chat", 158 | "max_seq_len": 4096, 159 | "user_role": "Source: user", 160 | "ai_role": "Source: assistant", 161 | "stop_on": ["", "Source:", "Source:"], 162 | "system_prompt_format": "Source: system\n\n{system_prompt} ", 163 | "prompt_format": "{user_role}\n\n{prompt} {ai_role} Destination: user\n{response}", 164 | "system_message": "You are an expert in coding Magento 2" 165 | } 166 | }, 167 | { 168 | "label": "Mixtral 8x7B Instruct (GGUF)", 169 | "routing_key": "mixtral_8x7b_instruct", 170 | "use": ["language_model", "reasoning_agent"], 171 | "available_precision": { "cpu": ["4-bit", "5-bit", "6-bit"], "cuda": ["4-bit", "5-bit", "6-bit"] }, 172 | "memory_usage": { "4-bit": 28000, "5-bit": 33500, "6-bit": 39000 }, 173 | "model": [{ 174 | "name": "TheBloke/Mixtral-8x7B-Instruct-v0.1-GGUF", 175 | "provider": "huggingface", 176 | "files": { 177 | "4-bit": "mixtral-8x7b-instruct-v0.1.Q4_K_M.gguf", 178 | "5-bit": "mixtral-8x7b-instruct-v0.1.Q5_K_M.gguf", 179 | "6-bit": "mixtral-8x7b-instruct-v0.1.Q6_K.gguf" 180 | } 181 | }], 182 | "configuration": { 183 | "model_layers": 63, 184 | "num_threads": -1, 185 | "max_seq_len": 16384, 186 | "user_role": "[INST]", 187 | "ai_role": "[/INST]", 188 | "stop_on": ["", "[/INST]"], 189 | "system_prompt_format": "[INST] {system_prompt}\nRespond with OK if you understand. 
[/INST] OK ", 190 | "system_message": "", 191 | "prompt_format": "{user_role} {prompt} {ai_role} {response} " 192 | } 193 | } 194 | ] 195 | } -------------------------------------------------------------------------------- /modules/noco-ai/llama-cpp/llama-cpp.py: -------------------------------------------------------------------------------- 1 | from llama_cpp import Llama 2 | from application.llm_handler import LlmHandler 3 | import torch 4 | import time 5 | import logging 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class GGUFGenerator(LlmHandler): 10 | def __init__(self): 11 | super().__init__() 12 | self.loras = {} 13 | 14 | def update_config(self, config_data): 15 | current_config = self.model_config 16 | merged_config = {**current_config, **config_data} 17 | self.model_config = merged_config 18 | 19 | def get_token_count(self, input_text): 20 | inputs = self.loaded_model.tokenize(bytes(input_text, 'utf-8')) 21 | return len(inputs) 22 | 23 | def validate(self, request): 24 | is_valid, errors = self.validate_request(request, 'llm') 25 | return is_valid, errors 26 | 27 | def stream(self, model, prompt, channel, incoming_headers, 28 | outgoing_properties, stops, request, model_data): 29 | 30 | # setup stop conditions 31 | check_stop_token, stop_conditions = self.build_stop_conditions(stops, False) 32 | 33 | # get starting time 34 | begin_time = time.time() 35 | 36 | # set max new tokens and other params 37 | prompt_tokens = model.tokenize(bytes(prompt, 'utf-8')) 38 | input_token_count = len(prompt_tokens) 39 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 40 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 41 | if debug: 42 | print('\033[94m') 43 | print(request) 44 | print(prompt) 45 | print('\033[0m') 46 | 47 | response = "" 48 | new_tokens = 0 49 | finish_reason = 'stop' 50 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 51 | stop_generation_counter = 0 52 | model_args = {} 53 | 54 | # sampler settings 55 | if mirostat != 0: 56 | model_args["mirostat_mode"] = mirostat 57 | model_args["mirostat_eta"] = mirostat_eta 58 | model_args["mirostat_tau"] = mirostat_tau 59 | if seed != -1: 60 | model_args["seed"] = seed 61 | 62 | if "start_response" in request and stream_output: 63 | channel.basic_publish( 64 | exchange=incoming_headers['return_exchange'], 65 | routing_key=incoming_headers['return_routing_key'], 66 | body=request["start_response"], properties=outgoing_properties) 67 | 68 | for model_stream in model(prompt, stream=True, max_tokens=max_new_tokens, min_p=min_p, 69 | temperature=temperature, stop=stop_conditions, top_k=top_k, top_p=top_p, **model_args): 70 | text = model_stream["choices"][0]["text"] 71 | 72 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 73 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 74 | 75 | if stop_generation: 76 | finish_reason = "abort" 77 | break 78 | 79 | new_tokens += 1 80 | if new_tokens >= max_new_tokens: 81 | finish_reason = 'length' 82 | break 83 | 84 | if debug: 85 | print('\033[96m' + text, end="") 86 | 87 | # send chunk to front end 88 | if stream_output: 89 | channel.basic_publish( 90 | exchange=incoming_headers['return_exchange'], 91 | routing_key=incoming_headers['return_routing_key'], 92 | body=text, properties=outgoing_properties) 93 | else: 94 | response += text 95 | 96 | if debug: 97 | print('\033[0m' + "") 98 | 99 
| end_time = time.time() 100 | elapsed = end_time - begin_time 101 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 102 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 103 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 104 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 105 | 106 | def execute(self, model, request): 107 | config = self.model_config 108 | 109 | # build the prompt 110 | prompt = self.build_prompt(request, config, model) 111 | incoming_headers = model["amqp_headers"] 112 | outgoing_properties = self.copy_queue_headers(incoming_headers) 113 | 114 | # last string to send after done streaming output 115 | stream_resp = self.stream( 116 | model["model_loaded"], 117 | prompt, 118 | model["amqp_channel"], 119 | incoming_headers, 120 | outgoing_properties, 121 | config["stop_on"], 122 | request, 123 | model) 124 | 125 | return stream_resp 126 | 127 | def load(self, model, model_options, local_path): 128 | self.model_config = model["configuration"] 129 | 130 | try: 131 | if not model["model"][0]["files"][model_options["use_precision"]]: 132 | return { "error": True } 133 | 134 | lora_name = self.model_config["default_lora"] if "default_lora" in self.model_config else None 135 | model_file = model["model"][0]["files"][model_options["use_precision"]] 136 | model_path = f"{local_path}/{model_file}" 137 | config_threads = model["configuration"].get("num_threads", -1) 138 | num_threads = None if config_threads == -1 else config_threads 139 | max_seq_len = model["configuration"].get("max_seq_len", 2048) 140 | model_args = { 141 | "model_path": model_path, 142 | "n_gpu_layers": 0, 143 | "n_ctx": max_seq_len, 144 | "n_threads":num_threads 145 | } 146 | 147 | if lora_name != None: 148 | model_args["lora_path"] = f"data/loras/{lora_name}/" 149 | 150 | if model_options["device"].startswith("cuda"): 151 | model_args["n_gpu_layers"] = model["configuration"].get("model_layers", 0) 152 | model_args["main_gpu"] = int(model_options["device"].split(":")[1]) 153 | #gpu_device = int(model_options["device"].split(":")[1]) 154 | #tensor_map = [0] * gpu_device + [1] 155 | #tensor_map[0] = 0.01 156 | #tensor_split=tensor_map, 157 | #tensor_map[gpu_device] = 0.99 158 | #print(tensor_map) 159 | #gpu_device = 0 160 | if "70b" in model_file: 161 | model_args["n_gqa"] = 8 162 | 163 | load_model = Llama(**model_args) 164 | self.loaded_model = load_model 165 | 166 | print(f'skill {model["routing_key"]} loaded to {model_options["device"]}') 167 | return { "model_loaded": load_model, "error": False } 168 | except Exception as e: 169 | print(f"error loading model") 170 | print(e) 171 | return { "error": True } -------------------------------------------------------------------------------- /modules/noco-ai/llm-api/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "LLM API", 3 | "description": "Handler for accessing LLMs running on remote systems.", 4 | "unique_key": "llm_api", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "OpenAI Compatible Endpoint", 9 | "routing_key": "custom_llm_endpoint", 10 | "use": ["language_model"], 11 | "available_precision": { "cpu": ["full"] }, 12 | "memory_usage": { "full": 20 }, 13 | "configuration": { 14 | "model": "none", 15 | "max_seq_len": 4096, 16 | "stop_on": [] 17 | } 18 | }, 19 | { 20 | "label": "Claude Haiku API", 21 | "routing_key": "claude_haiku_api", 22 | "use": 
["language_model"], 23 | "available_precision": { "cpu": ["full"] }, 24 | "memory_usage": { "full": 20 }, 25 | "configuration": { 26 | "model": "claude-3-haiku-20240307", 27 | "max_seq_len": 16384, 28 | "api_path": "https://api.anthropic.com/v1/messages", 29 | "stop_on": [] 30 | } 31 | }, 32 | { 33 | "label": "Claude Opus API", 34 | "routing_key": "claude_opus_api", 35 | "use": ["language_model"], 36 | "available_precision": { "cpu": ["full"] }, 37 | "memory_usage": { "full": 20 }, 38 | "configuration": { 39 | "model": "claude-3-opus-20240229", 40 | "max_seq_len": 16384, 41 | "api_path": "https://api.anthropic.com/v1/messages", 42 | "stop_on": [] 43 | } 44 | }, 45 | { 46 | "label": "Mistral Small API", 47 | "routing_key": "mistral_small_api", 48 | "use": ["language_model"], 49 | "available_precision": { "cpu": ["full"] }, 50 | "memory_usage": { "full": 20 }, 51 | "configuration": { 52 | "model": "mistral-small-latest", 53 | "max_seq_len": 8192, 54 | "api_path": "https://api.mistral.ai/v1/chat/completions", 55 | "stop_on": [] 56 | } 57 | }, 58 | { 59 | "label": "Mistral Medium API", 60 | "routing_key": "mistral_medium_api", 61 | "use": ["language_model"], 62 | "available_precision": { "cpu": ["full"] }, 63 | "memory_usage": { "full": 20 }, 64 | "configuration": { 65 | "model": "mistral-medium-latest", 66 | "max_seq_len": 8192, 67 | "api_path": "https://api.mistral.ai/v1/chat/completions", 68 | "stop_on": [] 69 | } 70 | }, 71 | { 72 | "label": "Mistral Large API", 73 | "routing_key": "mistral_large_api", 74 | "use": ["language_model"], 75 | "available_precision": { "cpu": ["full"] }, 76 | "memory_usage": { "full": 20 }, 77 | "configuration": { 78 | "model": "mistral-large-latest", 79 | "max_seq_len": 8192, 80 | "api_path": "https://api.mistral.ai/v1/chat/completions", 81 | "stop_on": [] 82 | } 83 | } 84 | ], 85 | "configuration": { 86 | "vault_path": "golem/llm_api", 87 | "options": [ 88 | { 89 | "name": "api_path", 90 | "label": "API Path", 91 | "editable": true, 92 | "type": "string", 93 | "default": "http://127.0.0.1:5000/v1/chat/completions" 94 | }, 95 | { 96 | "name": "api_key", 97 | "label": "API Key", 98 | "editable": true, 99 | "type": "secret", 100 | "default": "none" 101 | } 102 | ] 103 | } 104 | } 105 | -------------------------------------------------------------------------------- /modules/noco-ai/llm-api/handler.py: -------------------------------------------------------------------------------- 1 | from application.llm_handler import LlmHandler 2 | from pika import BasicProperties 3 | import logging 4 | import time 5 | import json 6 | import requests 7 | import sseclient 8 | import tiktoken 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class ServerSideEventLlm(LlmHandler): 13 | def __init__(self): 14 | super().__init__() 15 | 16 | def validate(self, request): 17 | is_valid, errors = self.validate_request(request, 'llm') 18 | return is_valid, errors 19 | 20 | def get_token_count(self, input_text): 21 | enc = self.token_counter.encode(input_text) 22 | return len(enc) 23 | 24 | def update_config(self, config_data): 25 | current_config = self.model_config 26 | merged_config = {**current_config, **config_data} 27 | self.model_config = merged_config 28 | 29 | def clip_messages(self, request, config): 30 | clipped_messages = [] 31 | max_seq_len = config.get("max_seq_len", 2048) 32 | max_input_tokens = int(request.get("max_input_tokens", config.get("max_input_len", int(max_seq_len / 2)))) 33 | system_prompt_tokens = 0 34 | messages = request.get("messages", []) 35 | 
sys_prompt_in_request = False 36 | if len(messages) and messages[0]["role"] == "system": 37 | system_prompt_tokens = self.get_token_count(messages[0]["content"]) 38 | sys_prompt_in_request = True 39 | request_system_message = messages[0] 40 | del messages[0] 41 | 42 | input_token_count = system_prompt_tokens 43 | messages = messages[::-1] 44 | for index, message in enumerate(messages): 45 | token_count = self.get_token_count(message["content"]) 46 | if token_count + input_token_count > max_input_tokens: 47 | break 48 | 49 | input_token_count += token_count 50 | clipped_messages.append(message) 51 | 52 | clipped_messages = clipped_messages[::-1] 53 | if sys_prompt_in_request: 54 | clipped_messages.insert(0, request_system_message) 55 | 56 | return clipped_messages, input_token_count 57 | 58 | def execute(self, model, request): 59 | # this is not the correct tokenizer but will give a rough guess, will need to fix this at some point... 60 | model_name = "gpt-3.5-turbo" 61 | self.token_counter = tiktoken.encoding_for_model(model_name) 62 | clipped_messages, input_token_count = self.clip_messages(request, self.model_config) 63 | 64 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 65 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 66 | 67 | if debug: 68 | print('\033[94m') 69 | print(request) 70 | print('\033[0m') 71 | 72 | # make API request to OpenAI 73 | begin_time = time.time() 74 | config = self.model_config 75 | #check_stop_token, stop_conditions = self.build_stop_conditions(config["stop_on"]) 76 | url = self.model_config["api_path"] 77 | api_key = self.model_config["api_key"] 78 | model_name = self.model_config["model"] 79 | 80 | data = { 81 | "messages": clipped_messages, 82 | "max_tokens": max_new_tokens, 83 | "temperature": temperature, 84 | "top_p": top_p, 85 | "top_k": top_k, 86 | "min_p": min_p, 87 | "stream": True, 88 | } 89 | if seed != -1: 90 | data["seed"] = seed 91 | 92 | headers = { 93 | "Content-Type": "application/json" 94 | } 95 | 96 | verify_ssl = False 97 | is_claude = True if model_name.find("claude") != -1 else False 98 | is_mistral = True if model_name.find("mistral") != -1 else False 99 | if is_mistral: 100 | verify_ssl = True 101 | accept_header = "text/event-stream" if stream_output else "application/json" 102 | headers["Accept"] = accept_header 103 | headers["Authorization"] = f"Bearer {api_key}" 104 | headers["User-Agent"] = "elemental-golem/v3" 105 | data["model"] = self.model_config["model"] 106 | if "seed" in data: 107 | data["random_seed"] = seed 108 | del data["seed"] 109 | del data["min_p"] 110 | del data["top_k"] 111 | elif is_claude: 112 | verify_ssl = True 113 | if data["messages"][0]["role"] == "system": 114 | data["system"] = data["messages"][0]["content"] 115 | del data["messages"][0] 116 | del data["min_p"] 117 | data["model"] = self.model_config["model"] 118 | headers["x-api-key"] = api_key 119 | headers["anthropic-version"] = "2023-06-01" 120 | 121 | stream_response = requests.post(url, headers=headers, json=data, verify=verify_ssl, stream=True) 122 | if stream_response.status_code != 200: 123 | if stream_response.status_code == 401: 124 | raise Exception("Invalid API key") 125 | else: 126 | raise Exception("Failed to get response from API") 127 | 128 | client = sseclient.SSEClient(stream_response) 129 | channel = model["amqp_channel"] 130 | incoming_headers = model["amqp_headers"] 131 | 132 | # copy amqp headers 133 | response_str = "" 134 | finish_reason = 
"stop" 135 | new_tokens = 0 136 | outgoing_headers = {} 137 | for incoming_header in incoming_headers: 138 | if incoming_header in ["x-delay", "return_exchange", "return_routing_key"]: 139 | continue 140 | outgoing_headers[incoming_header] = incoming_headers[incoming_header] 141 | 142 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 143 | outgoing_headers["command"] = "prompt_fragment" if "stream_to_override" not in incoming_headers else incoming_headers["stream_to_override"] 144 | outgoing_properties = BasicProperties(headers=outgoing_headers) 145 | stop_generation_counter = 0 146 | 147 | for event in client.events(): 148 | 149 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 150 | model["stop_generation_event"], model["stop_generation_filter"], socket_id) 151 | 152 | if stop_generation: 153 | finish_reason = "abort" 154 | break 155 | 156 | if is_claude and event.event != "content_block_delta": 157 | continue 158 | 159 | chunk = "" 160 | try: 161 | payload = json.loads(event.data) 162 | if is_mistral: 163 | chunk = payload['choices'][0]['delta']['content'] 164 | elif is_claude: 165 | chunk = payload['delta']['text'] 166 | else: 167 | chunk = payload['choices'][0]['message']['content'] 168 | except: 169 | continue 170 | 171 | response_str += chunk 172 | new_tokens += 1 173 | if debug: 174 | print('\033[96m' + chunk, end="") 175 | 176 | if stream_output: 177 | channel.basic_publish( 178 | exchange=incoming_headers['return_exchange'], 179 | routing_key=incoming_headers['return_routing_key'], 180 | body=chunk, properties=outgoing_properties) 181 | 182 | end_time = time.time() 183 | elapsed = end_time - begin_time 184 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 185 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 186 | resp = self.finish_response(stop_key, response_str, request, stream_output, finish_reason, 187 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 188 | return resp 189 | 190 | 191 | def load(self, model, model_options, local_path): 192 | self.model_config = model["configuration"] 193 | self.model_config["api_key"] = model["secrets"]["api_key"] 194 | return { "model_name": "" } 195 | -------------------------------------------------------------------------------- /modules/noco-ai/music-generator/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "MusicGen", 3 | "description": "Handler for loading Meta MusicGen models.", 4 | "unique_key": "musicgen", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "MusicGen Small", 9 | "routing_key": "musicgen_small", 10 | "use": ["music_generation"], 11 | "available_precision": { "cpu": ["full"], "cuda": ["full"] }, 12 | "memory_usage": { "full": 3600 }, 13 | "model": [{ 14 | "name": "facebook/musicgen-small", 15 | "provider": "huggingface" 16 | }], 17 | "configuration": { 18 | "progress_label": "Generating Music" 19 | } 20 | }, 21 | { 22 | "label": "MusicGen Medium", 23 | "routing_key": "musicgen_medium", 24 | "use": ["music_generation"], 25 | "available_precision": { "cpu": ["full"], "cuda": ["full"] }, 26 | "memory_usage": { "full": 8500 }, 27 | "model": [{ 28 | "name": "facebook/musicgen-medium", 29 | "provider": "huggingface" 30 | }], 31 | "configuration": { 32 | "progress_label": "Generating Music" 33 | } 34 | }, 35 | { 36 | "label": "MusicGen Large", 37 | "routing_key": "musicgen_large", 
38 | "use": ["music_generation"], 39 | "available_precision": { "cpu": ["full"], "cuda": ["full"] }, 40 | "memory_usage": { "full": 13500 }, 41 | "model": [{ 42 | "name": "facebook/musicgen-large", 43 | "provider": "huggingface" 44 | }] , 45 | "configuration": { 46 | "progress_label": "Generating Music" 47 | } 48 | } 49 | ] 50 | } -------------------------------------------------------------------------------- /modules/noco-ai/music-generator/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from application.progress_streamer import ProgressStreamer 3 | from transformers import AutoProcessor, MusicgenForConditionalGeneration 4 | import base64 5 | from io import BytesIO 6 | import scipy 7 | import copy 8 | import logging 9 | 10 | logger = logging.getLogger(__name__) 11 | 12 | class MusicGen(BaseHandler): 13 | def __init__(self): 14 | self.progress_streamer = ProgressStreamer() 15 | super().__init__() 16 | 17 | def validate(self, request): 18 | is_valid, errors = self.validate_request(request, 'audio-gen') 19 | return is_valid, errors 20 | 21 | def execute(self, model, request): 22 | prompt = request.get("prompt", "") # defaults to an empty string if "prompt" is not in request 23 | seconds = int(request.get("seconds", 5)) 24 | guidance_scale = int(request.get("guidance_scale", 3)) 25 | send_progress = request.get("progress", True) 26 | max_new_tokens = seconds * 52 27 | 28 | # prep headers for sending progress data 29 | if send_progress: 30 | progress_headers = copy.deepcopy(model["amqp_headers"]) 31 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 32 | amqp_config = { 33 | "headers": progress_headers, 34 | "outgoing_properties": outgoing_properties, 35 | "channel": model["amqp_channel"] 36 | } 37 | self.progress_streamer.configure(max_new_tokens, self.model_config["progress_label"], self.routing_key, amqp_config) 38 | else: 39 | self.progress_streamer.configure(max_new_tokens, self.model_config["progress_label"], self.routing_key) 40 | 41 | # Assuming the model function can take these parameters: 42 | logger.info(f"prompt: {prompt}, seconds: {seconds}, max new tokens: {max_new_tokens}, guidance scale: {guidance_scale}") 43 | inputs = model["processor"]( 44 | text=[prompt], 45 | padding=True, 46 | return_tensors="pt", 47 | ).to(model["device"]) 48 | audio_values = model["model"].generate(**inputs, do_sample=True, streamer=self.progress_streamer, guidance_scale=guidance_scale, max_new_tokens=max_new_tokens) 49 | 50 | # Save image to an in-memory bytes buffer 51 | buffered = BytesIO() 52 | sampling_rate = model["model"].config.audio_encoder.sampling_rate 53 | scipy.io.wavfile.write(buffered, rate=sampling_rate, data=audio_values[0, 0].cpu().numpy()) 54 | 55 | # Convert bytes buffer to a base64-encoded string 56 | wav_str = base64.b64encode(buffered.getvalue()).decode() 57 | return {"wav": wav_str} 58 | 59 | def load(self, model, model_options, local_path): 60 | self.model_config = model["configuration"] 61 | self.routing_key = model["routing_key"] 62 | processor = AutoProcessor.from_pretrained(local_path) 63 | load_model = MusicgenForConditionalGeneration.from_pretrained(local_path) 64 | 65 | return { 66 | "model": load_model, 67 | "processor": processor, 68 | "device": model_options["device"], 69 | "device_memory": model["memory_usage"][model_options["use_precision"]] 70 | } 71 | -------------------------------------------------------------------------------- 
/modules/noco-ai/sd-xl/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Stable Diffusion XL v1.0", 3 | "description": "Handler for loading Stable Diffusion XL v1.0 models.", 4 | "unique_key": "sd_xl_10", 5 | "script": "handler.py", 6 | "supported_gpu": ["nvidia"], 7 | "skills": [ 8 | { 9 | "label": "Stable Diffusion XL v1.0", 10 | "routing_key": "stable_diffusion_xl_v10", 11 | "use": ["image_generation"], 12 | "available_precision": { "cuda": ["full"] }, 13 | "memory_usage": { "full": 19000 }, 14 | "model": [{ 15 | "name": "stabilityai/stable-diffusion-xl-base-1.0", 16 | "provider": "huggingface" 17 | }, 18 | { 19 | "name": "stabilityai/stable-diffusion-xl-refiner-1.0", 20 | "provider": "huggingface" 21 | }], 22 | "configuration": { 23 | "is_turbo": false, 24 | "progress_label": "Generating Image" 25 | } 26 | }, 27 | { 28 | "label": "DreamShaper XL v1.0", 29 | "routing_key": "dream_shaper_xl_image_gen", 30 | "use": ["image_generation"], 31 | "available_precision": { "cuda": ["full"] }, 32 | "memory_usage": { "full": 19000 }, 33 | "model": [{ 34 | "name": "civitai/dreamshaper_xl/126688.safetensors", 35 | "provider": "civitai", 36 | "url": "https://civitai.com/api/download/models/126688?type=Model&format=SafeTensor&size=full&fp=fp16" 37 | }, 38 | { 39 | "name": "stabilityai/stable-diffusion-xl-refiner-1.0", 40 | "provider": "huggingface" 41 | }], 42 | "configuration": { 43 | "is_turbo": false, 44 | "progress_label": "Generating Image" 45 | } 46 | }, 47 | { 48 | "label": "Juggernaut XL", 49 | "routing_key": "juggernaut_xl_image_gen", 50 | "use": ["image_generation"], 51 | "available_precision": { "cuda": ["full"] }, 52 | "memory_usage": { "full": 19000 }, 53 | "model": [{ 54 | "name": "civitai/juggernaut_xl/240840.safetensors", 55 | "provider": "civitai", 56 | "url": "https://civitai.com/api/download/models/240840?type=Model&format=SafeTensor&size=full&fp=fp16" 57 | }, 58 | { 59 | "name": "stabilityai/stable-diffusion-xl-refiner-1.0", 60 | "provider": "huggingface" 61 | }], 62 | "configuration": { 63 | "is_turbo": false, 64 | "progress_label": "Generating Image" 65 | } 66 | }, 67 | { 68 | "label": "Stable Diffusion XL Turbo v1.0", 69 | "routing_key": "stable_diffusion_xl_trubo_v10", 70 | "use": ["image_generation"], 71 | "available_precision": { "cuda": ["full"] }, 72 | "memory_usage": { "full": 7800 }, 73 | "model": [{ 74 | "name": "stabilityai/sdxl-turbo", 75 | "provider": "huggingface" 76 | }], 77 | "configuration": { 78 | "is_turbo": true, 79 | "progress_label": "Generating Image" 80 | } 81 | }, 82 | { 83 | "label": "SD XL Turbo Unstable", 84 | "routing_key": "sd_xl_trubo_unstable", 85 | "use": ["image_generation"], 86 | "available_precision": { "cuda": ["full"] }, 87 | "memory_usage": { "full": 7800 }, 88 | "model": [{ 89 | "name": "civitai/sd_xl_trubo_unstable/247214.safetensors", 90 | "provider": "civitai", 91 | "url": "https://civitai.com/api/download/models/247214?type=Model&format=SafeTensor&size=full&fp=fp16" 92 | }], 93 | "configuration": { 94 | "is_turbo": true, 95 | "progress_label": "Generating Image" 96 | } 97 | } 98 | ] 99 | } -------------------------------------------------------------------------------- /modules/noco-ai/sd-xl/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | import torch 3 | import base64 4 | from io import BytesIO 5 | import logging 6 | import torch 7 | from diffusers import 
StableDiffusionXLPipeline, KDPM2DiscreteScheduler, DiffusionPipeline 8 | from compel import Compel, ReturnedEmbeddingsType 9 | import copy 10 | import json 11 | 12 | logger = logging.getLogger(__name__) 13 | 14 | class StableDiffusionXl(BaseHandler): 15 | def __init__(self): 16 | super().__init__() 17 | 18 | def validate(self, request): 19 | is_valid, errors = self.validate_request(request, 'img-gen') 20 | return is_valid, errors 21 | 22 | def get_latents(self, num_images=1, height=1024, width=1024, user_seed=-1, device="cuda:0", model=None): 23 | latents = None 24 | generator = torch.Generator(device=device) 25 | if user_seed == -1: 26 | seed = generator.seed() 27 | else: 28 | seed = user_seed 29 | generator = generator.manual_seed(seed) 30 | 31 | latents = torch.randn( 32 | (num_images, model.unet.in_channels, height // 8, width // 8), 33 | generator = generator, 34 | device = device, 35 | dtype = torch.float16 36 | ) 37 | return { "seed": seed, "latents": latents } 38 | 39 | def step_callback(self, pipeline: DiffusionPipeline, step: int, timestep: int, callback_kwargs): 40 | if self.stream_progress == False: 41 | return callback_kwargs 42 | 43 | self.current_step = self.current_step + 1 44 | label = self.model_config["progress_label"] if "progress_label" in self.model_config else self.routing_key 45 | send_body = { 46 | "total": self.total_steps, 47 | "current": self.current_step, 48 | "label": label, 49 | "model": self.routing_key 50 | } 51 | 52 | self.amqp_progress_config["channel"].basic_publish( 53 | exchange=self.amqp_progress_config["headers"]['return_exchange'], 54 | routing_key=self.amqp_progress_config["headers"]['return_routing_key'], 55 | body=json.dumps(send_body), properties=self.amqp_progress_config["outgoing_properties"]) 56 | 57 | return callback_kwargs 58 | 59 | def execute(self, model, request): 60 | prompt = request.get("prompt", "") 61 | height = request.get("height", 1024) 62 | width = request.get("width", 1024) 63 | steps = request.get("steps", 50) 64 | seed = request.get("seed", -1) 65 | self.stream_progress = request.get("progress", False) 66 | negative_prompt = request.get("negative_prompt", "") 67 | guidance_scale = request.get("guidance_scale", 7.5) 68 | num_images_per_prompt = 1 69 | 70 | if self.model_config["is_turbo"] == True and steps > 4: 71 | guidance_scale = 0.0 72 | steps = 4 73 | 74 | high_noise_frac = 0.8 75 | if self.stream_progress == True: 76 | progress_headers = copy.deepcopy(model["amqp_headers"]) 77 | outgoing_properties = self.copy_queue_headers(progress_headers, "update_progress") 78 | self.amqp_progress_config = { 79 | "headers": progress_headers, 80 | "outgoing_properties": outgoing_properties, 81 | "channel": model["amqp_channel"] 82 | } 83 | self.current_step = 0 84 | if self.model_config["is_turbo"] == False: 85 | self.total_steps = ((steps * high_noise_frac) * 2) + (steps * (1 - high_noise_frac)) 86 | else: 87 | self.total_steps = steps 88 | 89 | latent_data = self.get_latents(num_images_per_prompt, height, width, seed, self.model_options["device"], model["model"]) 90 | logger.info(f"prompt: {prompt}, height: {height}, width: {width}, steps: {steps}, guidance scale: {guidance_scale}, seed: {latent_data['seed']}") 91 | conditioning, pooled = model["compel"](prompt) 92 | negative_conditioning, negative_pooled = model["compel"](negative_prompt) 93 | 94 | if self.model_config["is_turbo"] == False: 95 | conditioning_refiner, pooled_refiner = model["compel_refiner"](prompt) 96 | negative_conditioning_refiner, negative_pooled_refiner = 
model["compel_refiner"](negative_prompt) 97 | base_image = model["model"](prompt_embeds=conditioning, pooled_prompt_embeds=pooled, height=height, width=width, num_inference_steps=steps, callback_on_step_end=self.step_callback, latents=latent_data["latents"], denoising_end=high_noise_frac, 98 | negative_prompt_embeds=negative_conditioning, negative_pooled_prompt_embeds=negative_pooled, guidance_scale=guidance_scale, num_images_per_prompt=num_images_per_prompt, output_type="latent").images 99 | image = model["refiner"](prompt_embeds=conditioning_refiner, pooled_prompt_embeds=pooled_refiner, negative_prompt_embeds=negative_conditioning_refiner, 100 | negative_pooled_prompt_embeds=negative_pooled_refiner, num_inference_steps=steps, denoising_start=high_noise_frac, image=base_image, callback_on_step_end=self.step_callback).images[0] 101 | else: 102 | image = model["model"](prompt_embeds=conditioning, pooled_prompt_embeds=pooled, height=height, width=width, num_inference_steps=steps, latents=latent_data["latents"], 103 | guidance_scale=guidance_scale, num_images_per_prompt=num_images_per_prompt, callback_on_step_end=self.step_callback).images[0] 104 | 105 | buffered = BytesIO() 106 | image.save(buffered, format="PNG") 107 | 108 | # Convert bytes buffer to a base64-encoded string 109 | img_str = base64.b64encode(buffered.getvalue()).decode() 110 | return {"image": img_str, "seed": latent_data["seed"], "guidance_scale": guidance_scale, "steps": steps } 111 | 112 | def load(self, model, model_options, local_path): 113 | self.model_options = model_options 114 | self.model_config = model["configuration"] 115 | self.routing_key = model["routing_key"] 116 | 117 | try: 118 | is_turbo = model["configuration"]["is_turbo"] 119 | if "civitai" not in local_path: 120 | logger.info("loading sd xl model") 121 | load_model = StableDiffusionXLPipeline.from_pretrained(local_path, torch_dtype=torch.float16, use_safetensors=True, variant="fp16") 122 | else: 123 | logger.info("loading civit sd xl model") 124 | load_model = StableDiffusionXLPipeline.from_single_file(local_path, torch_dtype=torch.float16, variant="fp16") 125 | 126 | compel = Compel( 127 | tokenizer=[load_model.tokenizer, load_model.tokenizer_2] , 128 | text_encoder=[load_model.text_encoder, load_model.text_encoder_2], 129 | returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, 130 | requires_pooled=[False, True] 131 | ) 132 | 133 | ret = { 134 | "model": load_model, 135 | "device": model_options["device"], 136 | "device_memory": model["memory_usage"][model_options["use_precision"]], 137 | "compel": compel 138 | } 139 | 140 | # load the refiner model 141 | if is_turbo == False: 142 | load_model.scheduler = KDPM2DiscreteScheduler.from_config(load_model.scheduler.config) 143 | logger.info("loading sd xl refiner") 144 | load_refiner = DiffusionPipeline.from_pretrained( 145 | "./data/models/stabilityai/stable-diffusion-xl-refiner-1.0", 146 | text_encoder_2=load_model.text_encoder_2, 147 | vae=load_model.vae, 148 | torch_dtype=torch.float16, 149 | use_safetensors=True, 150 | variant="fp16" 151 | ) 152 | load_refiner.to(model_options["device"]) 153 | ret["refiner"] = load_refiner 154 | 155 | compel_refiner = Compel( 156 | tokenizer=[load_refiner.tokenizer_2], 157 | text_encoder=[load_refiner.text_encoder_2], 158 | returned_embeddings_type=ReturnedEmbeddingsType.PENULTIMATE_HIDDEN_STATES_NON_NORMALIZED, 159 | requires_pooled=[True], 160 | ) 161 | ret["compel_refiner"] = compel_refiner 162 | 163 | return ret 164 | except 
Exception as e: 165 | print("error loading sdxl model") 166 | print(e) 167 | return { "error": True } 168 | -------------------------------------------------------------------------------- /modules/noco-ai/transformers-stream/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "🤗 Transformers LLM", 3 | "description": "Handler for loading any models that are compatible with HuggingFace transformers. It has only been tested with Llama 2.", 4 | "unique_key": "transformers_llm", 5 | "script": "handler.py", 6 | "multi_gpu_support": true, 7 | "multi_gpu_configurable": false, 8 | "supported_gpu": ["nvidia"], 9 | "skills": [ 10 | { 11 | "label": "CodeLlama 7B Instruct 🤗", 12 | "routing_key": "llama_v2_code_instruct_7b", 13 | "use": ["language_model"], 14 | "available_precision": { "cuda": ["4-bit", "8-bit", "full"] }, 15 | "memory_usage": { "4-bit": 5500, "8-bit": 8500, "full": 27000 }, 16 | "shortcut": "💻", 17 | "moe_domain": [ 18 | "Systems Programming: Development of computer systems software.", 19 | "Computer Networking: Study of computer systems that are interconnected via network." 20 | ], 21 | "lora": [{ 22 | "name": "nocoai/function-hul-lora", 23 | "moe_domain": [ 24 | "Systems Programming: Development of computer systems software." 25 | ], 26 | "chat_history": -1 27 | }], 28 | "model": [{ 29 | "name": "codellama/CodeLlama-7B-Instruct-hf", 30 | "provider": "huggingface" 31 | }], 32 | "configuration": { 33 | "max_seq_len": 16384, 34 | "stop_on": ["", "[INST]"], 35 | "user_role": "[INST]", 36 | "ai_role": "[/INST]", 37 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 38 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 39 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 40 | } 41 | }, 42 | { 43 | "label": "CodeLlama 13B Instruct 🤗", 44 | "routing_key": "llama_v2_code_instruct_13b", 45 | "use": ["language_model"], 46 | "available_precision": { "cuda": ["4-bit"] }, 47 | "memory_usage": { "4-bit": 16100 }, 48 | "shortcut": "💻", 49 | "moe_domain": [ 50 | "Systems Programming: Development of computer systems software.", 51 | "Computer Networking: Study of computer systems that are interconnected via network." 52 | ], 53 | "model": [{ 54 | "name": "codellama/CodeLlama-13B-Instruct-hf", 55 | "provider": "huggingface" 56 | }], 57 | "configuration": { 58 | "max_seq_len": 16384, 59 | "stop_on": ["", "[INST]"], 60 | "user_role": "[INST]", 61 | "ai_role": "[/INST]", 62 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 63 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 64 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 
65 | } 66 | }, 67 | { 68 | "label": "CodeLlama 34B Instruct 🤗", 69 | "routing_key": "llama_v2_code_instruct_34b", 70 | "use": ["language_model"], 71 | "shortcut": "💻", 72 | "special_ability": ["coding"], 73 | "available_precision": { "cuda": ["4-bit"] }, 74 | "memory_usage": { "4-bit": 22000 }, 75 | "model": [{ 76 | "name": "codellama/CodeLlama-34b-Instruct-hf", 77 | "provider": "huggingface" 78 | }], 79 | "configuration": { 80 | "max_seq_len": 16384, 81 | "user_role": "[INST]", 82 | "ai_role": "[/INST]", 83 | "stop_on": ["", "[INST]"], 84 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 85 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 86 | } 87 | }, 88 | { 89 | "label": "CodeLlama 34B Python 🤗", 90 | "routing_key": "llama_v2_code_python_34b", 91 | "use": ["language_model"], 92 | "shortcut": "🐍", 93 | "special_ability": ["coding"], 94 | "available_precision": { "cuda": ["4-bit"] }, 95 | "memory_usage": { "4-bit": 22000 }, 96 | "model": [{ 97 | "name": "codellama/CodeLlama-34B-Python-hf", 98 | "provider": "huggingface" 99 | }], 100 | "configuration": { 101 | "max_seq_len": 16384, 102 | "stop_on": ["[INST]"], 103 | "user_role": "[INST]", 104 | "ai_role": "[/INST]", 105 | "prompt_format": "{user_role} {prompt} {ai_role} {response}", 106 | "system_prompt_format": "{user_role} <>{system_prompt}<>\n{prompt} {ai_role} {response}", 107 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 108 | } 109 | }, 110 | { 111 | "label": "CodeLlama 34B Phind v2 🤗", 112 | "routing_key": "llama_v2_code_phind_v2", 113 | "use": ["language_model"], 114 | "available_precision": { "cuda": ["4-bit"] }, 115 | "memory_usage": { "4-bit": 22500 }, 116 | "shortcut": "💻", 117 | "special_ability": ["coding"], 118 | "moe_domain": [ 119 | "Systems Programming: Development of computer systems software" 120 | ], 121 | "model": [{ 122 | "name": "Phind/Phind-CodeLlama-34B-v2", 123 | "provider": "huggingface" 124 | }], 125 | "configuration": { 126 | "max_seq_len": 16384, 127 | "user_role": "### User Message\n", 128 | "ai_role": "### Assistant\n", 129 | "stop_on": ["", ""], 130 | "prompt_format": "{user_role}{prompt}\n\n{ai_role}{response}", 131 | "system_prompt_format": "### System Prompt:\n{system_prompt}\n\n", 132 | "system_message": "You are an expert software development coding assistant. Wrap all code you output in ```." 133 | } 134 | } 135 | ], 136 | "configuration": { 137 | "vault_path": "golem/transformers_llm", 138 | "options": [ 139 | { 140 | "label": "System Message", 141 | "name": "system_message", 142 | "editable": true, 143 | "type": "textarea", 144 | "default": "A chat between a human and an assistant." 
145 | }, 146 | { 147 | "label": "Stop On", 148 | "name": "stop_on", 149 | "editable": true, 150 | "type": "multistring", 151 | "default": [""] 152 | } 153 | ] 154 | } 155 | } -------------------------------------------------------------------------------- /modules/noco-ai/transformers-stream/handler.py: -------------------------------------------------------------------------------- 1 | from transformers_stream_generator import init_stream_support 2 | init_stream_support() 3 | from application.llm_handler import LlmHandler 4 | from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig 5 | import torch 6 | import time 7 | import logging 8 | 9 | logger = logging.getLogger(__name__) 10 | 11 | class TransformersGenerator(LlmHandler): 12 | def __init__(self): 13 | super().__init__() 14 | 15 | def update_config(self, config_data): 16 | current_config = self.model_config 17 | merged_config = {**current_config, **config_data} 18 | self.model_config = merged_config 19 | 20 | def validate(self, request): 21 | is_valid, errors = self.validate_request(request, 'llm') 22 | return is_valid, errors 23 | 24 | def get_token_count(self, input_text): 25 | inputs = self.tokenizer(input_text, return_tensors="pt", add_special_tokens=False).to("cuda") 26 | return inputs["input_ids"].shape[1] 27 | 28 | def stream(self, generator, tokenizer, model, prompt, channel, incoming_headers, 29 | outgoing_properties, stops, request, model_data): 30 | 31 | # setup stop conditions 32 | check_stop_token, stop_conditions = self.build_stop_conditions(stops) 33 | 34 | # force this to false, token passed to check_stop_conditions not same format as other handlers 35 | check_stop_token = False 36 | 37 | # get starting time 38 | begin_time = time.time() 39 | 40 | # tokenize the prompt 41 | inputs = tokenizer(prompt, return_tensors="pt", add_special_tokens=False).to("cuda") 42 | input_token_count = inputs["input_ids"].shape[1] 43 | 44 | # set max new tokens and other params 45 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 46 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 47 | if debug: 48 | print('\033[94m') 49 | print(request) 50 | print(prompt) 51 | print('\033[0m') 52 | 53 | generator = model.generate( 54 | inputs["input_ids"], 55 | max_new_tokens=max_new_tokens, 56 | do_sample=True, 57 | do_stream=True, 58 | top_p=top_p, 59 | top_k=top_k, 60 | eos_token_id=tokenizer.eos_token_id, 61 | temperature=temperature, 62 | ) 63 | 64 | # vars used in generation loop 65 | all_tokens = [] 66 | all_text = "" 67 | response = "" 68 | held_text = "" 69 | new_tokens = 0 70 | finish_reason = 'stop' 71 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 72 | stop_generation_counter = 0 73 | 74 | for token in generator: 75 | all_tokens.extend(token.tolist()) 76 | new_text = tokenizer.decode(all_tokens) 77 | new_chuck = new_text[len(all_text):] 78 | all_text += new_chuck 79 | new_tokens += 1 80 | 81 | if new_tokens >= max_new_tokens: 82 | finish_reason = 'length' 83 | break 84 | 85 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 86 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 87 | 88 | if stop_generation: 89 | finish_reason = "abort" 90 | break 91 | 92 | # check if we should hold off on streaming this text 93 | hold_text = False 94 | for stop_string in stop_conditions: 95 | if len(held_text) and 
stop_string.startswith(held_text.lower() + new_chuck.lower()): hold_text = True 96 | elif stop_string.startswith(new_chuck.lower()): hold_text = True 97 | 98 | if not hold_text: 99 | 100 | # send chunk to front end 101 | if stream_output: 102 | if debug: 103 | print('\033[96m' + new_chuck, end="") 104 | 105 | channel.basic_publish( 106 | exchange=incoming_headers['return_exchange'], 107 | routing_key=incoming_headers['return_routing_key'], 108 | body=new_chuck, properties=outgoing_properties) 109 | else: 110 | response += new_chuck 111 | 112 | held_text = "" 113 | else: 114 | held_text += new_chuck 115 | 116 | stop_condition = self.check_stop_conditions(token, held_text, tokenizer.eos_token_id, 117 | check_stop_token, stop_conditions) 118 | if stop_condition: break 119 | 120 | if debug and stream_output: 121 | print('\033[0m' + "") 122 | 123 | end_time = time.time() 124 | elapsed = end_time - begin_time 125 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 126 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 127 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 128 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 129 | 130 | def execute(self, model, request): 131 | config = self.model_config 132 | 133 | # build the prompt 134 | prompt = self.build_prompt(request, config, model) 135 | incoming_headers = model["amqp_headers"] 136 | outgoing_properties = self.copy_queue_headers(incoming_headers) 137 | 138 | # last string to send after done streaming output 139 | stream_resp = self.stream( 140 | model["generator"], 141 | model["tokenizer"], 142 | model["model_loaded"], 143 | prompt, 144 | model["amqp_channel"], 145 | incoming_headers, 146 | outgoing_properties, 147 | config["stop_on"], 148 | request, 149 | model) 150 | 151 | return stream_resp 152 | 153 | def load(self, model, model_options, local_path): 154 | self.model_config = model["configuration"] 155 | 156 | # get paths 157 | logger.info(f"starting module {local_path}") 158 | load_error = False 159 | try: 160 | tokenizer = AutoTokenizer.from_pretrained(local_path) 161 | quantization_config = None 162 | if model_options["use_precision"] != "full": 163 | if model_options["use_precision"] == "4-bit": 164 | quantization_config = BitsAndBytesConfig( 165 | load_in_4bit=True, 166 | bnb_4bit_compute_dtype=torch.float16 167 | ) 168 | else: 169 | quantization_config = BitsAndBytesConfig( 170 | load_in_8bit=True 171 | ) 172 | 173 | # this is not fully impelemented and should but a device map based off the split not auto 174 | device_map = "auto" if model_options["device"].startswith("split") else model_options["device"] 175 | load_model = AutoModelForCausalLM.from_pretrained( 176 | local_path, 177 | quantization_config=quantization_config, 178 | device_map=device_map 179 | ) 180 | self.tokenizer = tokenizer 181 | 182 | logger.info(f'skill {model["routing_key"]} loaded to {model_options["device"]}, precision: {model_options["use_precision"]}') 183 | return { "model_loaded": load_model, "generator": load_model, "tokenizer": tokenizer, "error": load_error } 184 | except Exception as e: 185 | logger.error(f"error loading model") 186 | print(e) 187 | load_error = True 188 | return { "error": load_error } -------------------------------------------------------------------------------- /modules/noco-ai/tts-api/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "xTTS Text 
to Speech", 3 | "description": "Handler for loading xTTS text to speech models.", 4 | "unique_key": "coqui_tts", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "XTTS v2", 9 | "routing_key": "xtts_v2_speech", 10 | "use": ["text_to_speech"], 11 | "available_precision": { "cuda": ["full"], "cpu": ["full"] }, 12 | "memory_usage": { "full": 2800 }, 13 | "model": [{ 14 | "name": "coqui/XTTS-v2", 15 | "provider": "huggingface" 16 | }], 17 | "configuration": { 18 | "progress_label": "Generating Speech" 19 | } 20 | } 21 | ] 22 | } -------------------------------------------------------------------------------- /modules/noco-ai/tts-api/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | import logging 3 | from TTS.tts.configs.xtts_config import XttsConfig 4 | from TTS.tts.models.xtts import Xtts 5 | import soundfile as sf 6 | import base64 7 | from io import BytesIO 8 | import requests 9 | import tempfile 10 | import os 11 | from urllib.parse import urlparse 12 | 13 | logger = logging.getLogger(__name__) 14 | 15 | class XTTSHandler(BaseHandler): 16 | def __init__(self): 17 | super().__init__() 18 | 19 | def validate(self, request): 20 | is_valid, errors = self.validate_request(request, 'voice-gen') 21 | return is_valid, errors 22 | 23 | def is_valid_url(self, url): 24 | try: 25 | result = urlparse(url) 26 | return all([result.scheme, result.netloc]) 27 | except ValueError: 28 | return False 29 | 30 | def download_temp_file(self, url): 31 | response = requests.get(url) 32 | if response.status_code == 200: 33 | temp_file = tempfile.NamedTemporaryFile(delete=False) 34 | temp_file.write(response.content) 35 | temp_file.close() 36 | return temp_file.name 37 | else: 38 | return None 39 | 40 | def execute(self, model, request): 41 | prompt = request.get("prompt", "") 42 | voice_preset = request.get("voice", "default") 43 | prompt_length = len(prompt) 44 | 45 | temp_file_path = None 46 | if self.is_valid_url(voice_preset): 47 | temp_file_path = self.download_temp_file(voice_preset) 48 | if temp_file_path: 49 | voice_preset = temp_file_path 50 | else: 51 | voice_preset = model["default_wav"] 52 | else: 53 | voice_preset = model["default_wav"] 54 | 55 | logger.info(f"prompt: {prompt}, voice: {voice_preset}, length: {prompt_length}") 56 | 57 | outputs = model["loaded_model"].synthesize( 58 | prompt, 59 | model["config"], 60 | speaker_wav=voice_preset, 61 | gpt_cond_len=3, 62 | language="en", 63 | ) 64 | if temp_file_path: 65 | os.remove(temp_file_path) 66 | 67 | base64_encoded_wav = None 68 | with BytesIO() as wav_file: 69 | sf.write(wav_file, outputs["wav"], samplerate=22050, format='WAV') 70 | wav_file.seek(0) 71 | binary_wav = wav_file.read() 72 | base64_encoded_wav = base64.b64encode(binary_wav).decode() 73 | 74 | return {"wav": base64_encoded_wav } 75 | 76 | def load(self, model, model_options, local_path): 77 | self.model_config = model["configuration"] 78 | 79 | try: 80 | config = XttsConfig() 81 | config.load_json(f"{local_path}/config.json") 82 | loaded_model = Xtts.init_from_config(config) 83 | loaded_model.load_checkpoint(config, checkpoint_dir=local_path, eval=True) 84 | if model_options["device"] != "cpu": 85 | loaded_model.cuda(model_options["device"]) 86 | 87 | logger.setLevel(logging.INFO) 88 | return { 89 | "loaded_model": loaded_model, 90 | "config": config, 91 | "default_wav": f"{local_path}/samples/en_sample.wav" 92 | } 93 | except Exception as e: 94 | print(f"error loading xtts 
model") 95 | print(e) 96 | return { "error": True } 97 | -------------------------------------------------------------------------------- /modules/openai/chat-api/chat-api.py: -------------------------------------------------------------------------------- 1 | from application.llm_handler import LlmHandler 2 | from pika import BasicProperties 3 | from openai import OpenAI 4 | import logging 5 | import tiktoken 6 | import time 7 | 8 | logger = logging.getLogger(__name__) 9 | 10 | class OpenAIChatApi(LlmHandler): 11 | def __init__(self): 12 | super().__init__() 13 | 14 | def validate(self, request): 15 | is_valid, errors = self.validate_request(request, 'llm') 16 | return is_valid, errors 17 | 18 | def get_token_count(self, input_text): 19 | enc = self.token_counter.encode(input_text) 20 | return len(enc) 21 | 22 | def update_config(self, config_data): 23 | current_config = self.model_config 24 | merged_config = {**current_config, **config_data} 25 | client = OpenAI( 26 | api_key=merged_config["token"] 27 | ) 28 | self.client = client 29 | self.model_config = merged_config 30 | 31 | 32 | def clip_messages(self, request, config): 33 | clipped_messages = [] 34 | messages, system_prompt_tokens, request_system_message, system_prompt, sys_prompt_in_request, max_input_tokens = self._prep_prompt(request, config) 35 | input_token_count = system_prompt_tokens 36 | 37 | for index, message in enumerate(messages): 38 | token_count = self.get_token_count(message["content"]) 39 | if token_count + input_token_count > max_input_tokens: 40 | break 41 | 42 | input_token_count += token_count 43 | clipped_messages.append(message) 44 | 45 | clipped_messages = clipped_messages[::-1] 46 | if sys_prompt_in_request: 47 | clipped_messages.insert(0, request_system_message) 48 | 49 | return clipped_messages, input_token_count 50 | 51 | def execute(self, model, request): 52 | 53 | self.token_counter = tiktoken.encoding_for_model(model["model_name"]) 54 | clipped_messages, input_token_count = self.clip_messages(request, self.model_config) 55 | if clipped_messages == None: 56 | return None 57 | 58 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 59 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 60 | if debug: 61 | print('\033[94m') 62 | print(request) 63 | print('\033[0m') 64 | 65 | # make API request to OpenAI 66 | begin_time = time.time() 67 | config = self.model_config 68 | 69 | print(f'sending request to openai api') 70 | check_stop_token, stop_conditions = self.build_stop_conditions(config["stop_on"]) 71 | response = self.client.chat.completions.create( 72 | model=model["model_name"], 73 | stream=stream_output, 74 | messages=clipped_messages, 75 | temperature=temperature, 76 | max_tokens=max_new_tokens, 77 | stop=stop_conditions, 78 | presence_penalty=config.get("presence_penalty", 0), 79 | frequency_penalty=config.get("frequency_penalty", 0), 80 | top_p=top_p 81 | ) 82 | 83 | channel = model["amqp_channel"] 84 | incoming_headers = model["amqp_headers"] 85 | 86 | # copy amqp headers 87 | outgoing_headers = {} 88 | for incoming_header in incoming_headers: 89 | if incoming_header in ["x-delay", "return_exchange", "return_routing_key"]: 90 | continue 91 | outgoing_headers[incoming_header] = incoming_headers[incoming_header] 92 | 93 | response_str = "" 94 | finish_reason = "stop" 95 | new_tokens = 0 96 | if stream_output: 97 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 98 | 
outgoing_headers["command"] = "prompt_fragment" if "stream_to_override" not in incoming_headers else incoming_headers["stream_to_override"] 99 | outgoing_properties = BasicProperties(headers=outgoing_headers) 100 | stop_generation_counter = 0 101 | 102 | for chunk in response: 103 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 104 | model["stop_generation_event"], model["stop_generation_filter"], socket_id) 105 | 106 | if stop_generation: 107 | finish_reason = "abort" 108 | break 109 | 110 | new_tokens += 1 111 | if chunk.choices[0].delta.content == None: 112 | continue 113 | 114 | if debug: 115 | print('\033[96m' + chunk.choices[0].delta.content, end="") 116 | 117 | response_str += chunk.choices[0].delta.content 118 | channel.basic_publish( 119 | exchange=incoming_headers['return_exchange'], 120 | routing_key=incoming_headers['return_routing_key'], 121 | body=chunk.choices[0].delta.content, properties=outgoing_properties) 122 | 123 | if debug: 124 | print('\033[0m' + "") 125 | else: 126 | response_str = response.choices[0].message.content 127 | new_tokens = response.usage.completion_tokens 128 | 129 | end_time = time.time() 130 | elapsed = end_time - begin_time 131 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 132 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 133 | request["start_response"] = "" 134 | resp = self.finish_response(stop_key, response_str, request, stream_output, finish_reason, 135 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 136 | return resp 137 | 138 | def load(self, model, model_options, local_path): 139 | self.model_config = model["configuration"] 140 | client = OpenAI( 141 | api_key=model["secrets"]["token"] 142 | ) 143 | self.client = client 144 | return { "model_name": model["configuration"]["model"], "client": client } 145 | -------------------------------------------------------------------------------- /modules/openai/chat-api/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Open AI Chat", 3 | "description": "Handler for running OpenAI models using their API.", 4 | "unique_key": "openai_chat", 5 | "script": "chat-api.py", 6 | "skills": [ 7 | { 8 | "label": "OpenAI GPT 3.5", 9 | "routing_key": "openai_gpt_35", 10 | "use": ["reasoning_agent"], 11 | "available_precision": { "cpu": ["full"] }, 12 | "memory_usage": { "full": 20 }, 13 | "configuration": { 14 | "model": "gpt-3.5-turbo", 15 | "max_seq_len": 4096, 16 | "stop_on": [] 17 | }, 18 | "shortcut": "⚡" 19 | }, 20 | { 21 | "label": "OpenAI GPT 4", 22 | "routing_key": "openai_gpt_4", 23 | "use": ["reasoning_agent"], 24 | "available_precision": { "cpu": ["full"] }, 25 | "memory_usage": { "full": 20 }, 26 | "configuration": { 27 | "model": "gpt-4", 28 | "max_seq_len": 8192, 29 | "stop_on": [] 30 | }, 31 | "shortcut": "✨" 32 | } 33 | ], 34 | "configuration": { 35 | "vault_path": "golem/openai", 36 | "options": [ 37 | { 38 | "name": "token", 39 | "label": "API Token", 40 | "editable": true, 41 | "type": "secret", 42 | "default": "" 43 | }, 44 | { 45 | "name": "max_seq_len", 46 | "label": "Max Context Length", 47 | "type": "slider", 48 | "min": 512, 49 | "max": 16384, 50 | "default": 4096 51 | }, 52 | { 53 | "name": "frequency_penalty", 54 | "label": "Frequency Penalty", 55 | "type": "slider", 56 | "min": -2, 57 | "max": 2, 58 | "step": 0.01, 59 | "default": 0 60 | }, 61 | { 62 | "name": "presence_penalty", 63 | 
"label": "Presence Penalty", 64 | "type": "slider", 65 | "min": -2, 66 | "max": 2, 67 | "step": 0.01, 68 | "default": 0 69 | } 70 | ] 71 | } 72 | } 73 | -------------------------------------------------------------------------------- /modules/openai/dalle/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Open AI DALL-E", 3 | "description": "Handler for running OpenAI image generation models using their API.", 4 | "unique_key": "openai_dalle", 5 | "script": "handler.py", 6 | "skills": [ 7 | { 8 | "label": "OpenAI DALL-E 3", 9 | "routing_key": "openai_dalle_3", 10 | "use": ["image_generation"], 11 | "available_precision": { "cpu": ["full"] }, 12 | "memory_usage": { "full": 20 }, 13 | "configuration": { 14 | "model": "dall-e-3" 15 | } 16 | }, 17 | { 18 | "label": "OpenAI DALL-E 2", 19 | "routing_key": "openai_dalle_2", 20 | "use": ["image_generation"], 21 | "available_precision": { "cpu": ["full"] }, 22 | "memory_usage": { "full": 20 }, 23 | "configuration": { 24 | "model": "dall-e-2" 25 | } 26 | } 27 | ], 28 | "configuration": { 29 | "vault_path": "golem/openai", 30 | "options": [ 31 | { 32 | "name": "token", 33 | "label": "API Token", 34 | "editable": true, 35 | "type": "secret", 36 | "default": "" 37 | } 38 | ] 39 | } 40 | } 41 | -------------------------------------------------------------------------------- /modules/openai/dalle/handler.py: -------------------------------------------------------------------------------- 1 | from application.base_handler import BaseHandler 2 | from pika import BasicProperties 3 | from openai import OpenAI 4 | import logging 5 | import time 6 | 7 | logger = logging.getLogger(__name__) 8 | 9 | class OpenAIImageGeneration(BaseHandler): 10 | def __init__(self): 11 | super().__init__() 12 | 13 | def validate(self, request): 14 | is_valid, errors = self.validate_request(request, 'img-gen') 15 | return is_valid, errors 16 | 17 | def update_config(self, config_data): 18 | current_config = self.model_config 19 | merged_config = {**current_config, **config_data} 20 | client = OpenAI( 21 | api_key=merged_config["token"] 22 | ) 23 | self.client = client 24 | self.model_config = merged_config 25 | 26 | def execute(self, model, request): 27 | prompt = request.get("prompt", "") 28 | height = request.get("height", 1024) 29 | width = request.get("width", 1024) 30 | 31 | if height == 512 or width == 512: 32 | size = "512x512" 33 | else: 34 | size = "1024x1024" 35 | 36 | if self.model_config["model"] == "dall-e-2": 37 | size = "512x512" 38 | 39 | logger.info(f"generating image using {self.model_config['model']}") 40 | response = self.client.images.generate( 41 | model=self.model_config["model"], 42 | prompt=prompt, 43 | size=size, 44 | quality="standard", 45 | response_format="b64_json", 46 | n=1, 47 | ) 48 | return {"image": response.data[0].b64_json, "seed": 0, "guidance_scale": 0, "steps": 1 } 49 | 50 | def load(self, model, model_options, local_path): 51 | self.model_config = model["configuration"] 52 | client = OpenAI( 53 | api_key=model["secrets"]["token"] 54 | ) 55 | self.client = client 56 | return { "model_name": model["configuration"]["model"], "client": client } 57 | -------------------------------------------------------------------------------- /modules/salesforce/blip2-opt/blip2-opt.py: -------------------------------------------------------------------------------- 1 | from transformers import Blip2Processor, Blip2ForConditionalGeneration 2 | from application.base_handler import BaseHandler 3 | 
from PIL import Image 4 | import requests 5 | import torch 6 | 7 | class Blip2Opt27b(BaseHandler): 8 | def __init__(self): 9 | super().__init__() 10 | 11 | def validate(self, request): 12 | is_valid, errors = self.validate_request(request, 'visual-qa') 13 | return is_valid, errors 14 | 15 | def execute(self, model, request): 16 | img_url = request["img_url"] 17 | image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB') 18 | prompt = request["text"] 19 | 20 | # build the input tensor 21 | if self.use_precision == 'half': 22 | inputs = model["processor"](images=image, text=prompt, return_tensors="pt").to(model["device"], torch.float16) 23 | else: 24 | inputs = model["processor"](images=image, text=prompt, return_tensors="pt").to(model["device"]) 25 | 26 | generated_ids = model["model"].generate(**inputs, max_new_tokens=256) 27 | generated_text = model["processor"].batch_decode(generated_ids, skip_special_tokens=True)[0].strip() 28 | return {"text":generated_text} 29 | 30 | def load(self, model, model_options, local_path): 31 | 32 | # load the processor 33 | processor = Blip2Processor.from_pretrained(local_path) 34 | self.use_precision = model_options["use_precision"] 35 | 36 | # load the model 37 | if self.use_precision == "full": 38 | blip2_model = Blip2ForConditionalGeneration.from_pretrained(local_path) 39 | elif model_options["use_precision"] == "half": 40 | blip2_model = Blip2ForConditionalGeneration.from_pretrained(local_path, torch_dtype=torch.float16) 41 | 42 | return {"model": blip2_model, "device": model_options["device"], "processor": processor, "device_memory": model["memory_usage"][self.use_precision]} -------------------------------------------------------------------------------- /modules/salesforce/blip2-opt/golem.json: -------------------------------------------------------------------------------- 1 | { 2 | "label": "Salesforce Blip2", 3 | "description": "Handler for loading Salesforce Blip 2 models.", 4 | "unique_key": "salesforce_blip2", 5 | "script": "blip2-opt.py", 6 | "skills": [ 7 | { 8 | "name": "Salesforce/blip2-opt-2.7b", 9 | "label": "BLIP v2 OPT 2.7b", 10 | "routing_key": "blip2_opt_27b", 11 | "use": ["visual_qa"], 12 | "available_precision": { "cuda": ["full", "half"], "cpu": ["full"] }, 13 | "memory_usage": { "full": 16100, "half": 8900 }, 14 | "model": [{ 15 | "name": "Salesforce/blip2-opt-2.7b", 16 | "provider": "huggingface" 17 | }] 18 | } 19 | ] 20 | } -------------------------------------------------------------------------------- /modules/turboderp/exllama/golem-generator.py: -------------------------------------------------------------------------------- 1 | from application.llm_handler import LlmHandler 2 | import sys 3 | import os 4 | import glob 5 | import time 6 | import logging 7 | import math 8 | sys.path.append(os.path.dirname(os.path.realpath(__file__))) 9 | from model import ExLlama, ExLlamaCache, ExLlamaConfig 10 | from tokenizer import ExLlamaTokenizer 11 | from generator import ExLlamaGenerator 12 | from lora import ExLlamaLora 13 | from application.system_info import get_gpu_memory_usage 14 | from huggingface_hub import snapshot_download 15 | 16 | logger = logging.getLogger(__name__) 17 | 18 | class GolemExLlamaGenerator(LlmHandler): 19 | def __init__(self): 20 | super().__init__() 21 | self.loras = {} 22 | 23 | def update_config(self, config_data): 24 | current_config = self.model_config 25 | merged_config = {**current_config, **config_data} 26 | self.model_config = merged_config 27 | 28 | def validate(self, request): 29 | 
is_valid, errors = self.validate_request(request, 'llm') 30 | return is_valid, errors 31 | 32 | def get_token_count(self, input_text): 33 | ids = self.generator.tokenizer.encode(input_text) 34 | input_token_count = len(ids[0]) 35 | return input_token_count 36 | 37 | def stream(self, generator, tokenizer, model, prompt, channel, incoming_headers, 38 | outgoing_properties, stops, model_data, request): 39 | 40 | # setup stop conditions 41 | check_stop_token, stop_conditions = self.build_stop_conditions(stops) 42 | 43 | res_line = "" 44 | held_text = "" 45 | response = "" 46 | unicode_hold = False 47 | finish_reason = "stop" 48 | stop_condition = False 49 | new_tokens = 0 50 | stop_generation_counter = 0 51 | ids = generator.tokenizer.encode(prompt) 52 | input_token_count = len(ids[0]) 53 | 54 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 55 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 56 | 57 | if debug: 58 | print('\033[94m') 59 | print(request) 60 | print(prompt) 61 | print('\033[0m') 62 | 63 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 64 | generator.settings.temperature = temperature 65 | generator.settings.top_p = top_p 66 | begin_time = time.time() 67 | 68 | if "start_response" in request and stream_output: 69 | channel.basic_publish( 70 | exchange=incoming_headers['return_exchange'], 71 | routing_key=incoming_headers['return_routing_key'], 72 | body=request["start_response"], properties=outgoing_properties) 73 | 74 | generator.gen_begin(ids) 75 | generator.begin_beam_search() 76 | for i in range(max_new_tokens): 77 | new_tokens += 1 78 | 79 | # check if stop generation was requested 80 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 81 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 82 | 83 | if stop_generation: 84 | finish_reason = "abort" 85 | break 86 | 87 | token = generator.beam_search() 88 | prev_res_line = res_line 89 | res_line = tokenizer.decode(generator.sequence_actual[0, -new_tokens:]) 90 | new_text = res_line[len(prev_res_line):] 91 | 92 | # new text 93 | chunk = held_text + new_text 94 | 95 | # check if we should hold off on streaming this text 96 | hold_text = False 97 | for stop_string in stop_conditions: 98 | if stop_string.startswith(chunk.lower()): hold_text = True 99 | 100 | if len(res_line): 101 | check_ord = ord(res_line[-1]) 102 | if check_ord == 65533 or check_ord == 55356 or check_ord == 55357: 103 | hold_text = True 104 | unicode_hold = True 105 | 106 | if not hold_text: 107 | if unicode_hold is True: 108 | unicode_hold = False 109 | chunk = res_line[-1:] 110 | 111 | # send chunk to front end 112 | if stream_output: 113 | if debug: 114 | print('\033[96m' + chunk, end="") 115 | 116 | channel.basic_publish( 117 | exchange=incoming_headers['return_exchange'], 118 | routing_key=incoming_headers['return_routing_key'], 119 | body=chunk, properties=outgoing_properties) 120 | else: 121 | response += chunk 122 | 123 | prompt += chunk 124 | held_text = "" 125 | else: 126 | held_text += new_text 127 | 128 | # check stop conditions 129 | stop_condition = self.check_stop_conditions(token, res_line, tokenizer.eos_token_id, 130 | check_stop_token, stop_conditions) 131 | if stop_condition: break 132 | 133 | end_time = time.time() 134 | elapsed = end_time - begin_time 135 | token_rate = 0 if elapsed == 0 else (new_tokens / elapsed) 136 | 
generator.end_beam_search() 137 | 138 | if debug and stream_output: 139 | print('\033[0m' + "") 140 | 141 | if new_tokens == max_new_tokens: 142 | finish_reason = "length" 143 | 144 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 145 | resp = self.finish_response(stop_key, response, request, stream_output, finish_reason, 146 | token_rate, new_tokens, input_token_count, model_name, elapsed, debug) 147 | return resp 148 | 149 | def load_lora(self, request, model, config): 150 | # load lora from config and override w/ request if present 151 | lora_name = config["default_lora"] if "default_lora" in config else None 152 | if "lora" in request: 153 | lora_name = request["lora"] 154 | 155 | if lora_name != None: 156 | if lora_name not in self.loras: 157 | logger.info(f"loading lora {lora_name}") 158 | lora_dir = os.path.join(f"data/loras/", lora_name) 159 | if not os.path.exists(lora_dir): 160 | logger.info("downloading lora {lora_name} from huggingface") 161 | snapshot_download(repo_id=lora_name, local_dir=lora_dir, cache_dir='data/cache', local_dir_use_symlinks=False) 162 | 163 | lora_path = os.path.join(f"data/loras/", lora_name, "adapter_model.bin") 164 | lora_config_path = os.path.join(f"data/loras/{lora_name}", "adapter_config.json") 165 | 166 | lora = ExLlamaLora(model["model_loaded"], lora_config_path, lora_path) 167 | self.loras[lora_name] = lora 168 | else: 169 | logger.info(f"using lora {lora_name}") 170 | 171 | model["generator"].lora = self.loras[lora_name] 172 | else: 173 | model["generator"].lora = None 174 | 175 | def execute(self, model, request): 176 | # load lora 177 | config = self.model_config 178 | self.load_lora(request, model, config) 179 | 180 | # build prompt 181 | prompt = self.build_prompt(request, config, model) 182 | 183 | # copy amqp headers 184 | incoming_headers = model["amqp_headers"] 185 | outgoing_properties = self.copy_queue_headers(incoming_headers) 186 | 187 | stream_resp = self.stream( 188 | model["generator"], 189 | model["tokenizer"], 190 | model["model_loaded"], 191 | prompt, 192 | model["amqp_channel"], 193 | incoming_headers, 194 | outgoing_properties, 195 | config["stop_on"], 196 | model, 197 | request) 198 | 199 | return stream_resp 200 | 201 | def load(self, model, model_options, local_path): 202 | self.model_config = model["configuration"] 203 | 204 | # get paths 205 | logger.info(f"starting module {local_path}") 206 | tokenizer_path = os.path.join(local_path, "tokenizer.model") 207 | model_config_path = os.path.join(local_path, "config.json") 208 | st_pattern = os.path.join(local_path, "*.safetensors") 209 | model_path = glob.glob(st_pattern)[0] 210 | 211 | # Create config, model, tokenizer and generator 212 | config = ExLlamaConfig(model_config_path) 213 | config.model_path = model_path 214 | config.compress_pos_emb = model["configuration"].get("compress_pos_emb", 1.0) 215 | config.max_seq_len = model["configuration"].get("max_seq_len", 2048) 216 | config.matmul_recons_thd = 8 217 | config.fused_mlp_thd = 2 218 | config.sdp_thd = 8 219 | 220 | # set model device 221 | if model_options["device"].startswith("split"): 222 | device_map = model_options["device"].split(':')[1] 223 | config.set_auto_map(device_map) 224 | elif model_options["device"].startswith("cuda"): 225 | device_number = int(model_options["device"].split(':')[1]) 226 | device_array = [0]*12 227 | used_memory, free_memory, total_memory = get_gpu_memory_usage(device_number) 228 | device_array[device_number] = math.floor(total_memory 
/ 1024) 229 | last_non_zero = len(device_array) - 1 230 | while last_non_zero > 0 and device_array[last_non_zero] == 0: 231 | last_non_zero -= 1 232 | device_array = device_array[:last_non_zero + 1] 233 | device_map = ','.join(map(str, device_array)) 234 | config.set_auto_map(device_map) 235 | 236 | load_error = False 237 | try: 238 | load_model = ExLlama(config) 239 | tokenizer = ExLlamaTokenizer(tokenizer_path) 240 | cache = ExLlamaCache(load_model) 241 | generator = ExLlamaGenerator(load_model, tokenizer, cache) 242 | 243 | # Configure generator 244 | self.generator = generator 245 | generator.settings.min_p = 0.0 246 | generator.settings.top_k = 0 247 | generator.settings.typical = 0.25 248 | generator.settings.token_repetition_penalty_max = 1.15 249 | generator.settings.token_repetition_penalty_sustain = 2048 250 | generator.settings.token_repetition_penalty_decay = 512 251 | 252 | logger.info(f'skill {model["routing_key"]} loaded to {model_options["device"]}') 253 | return { "model_loaded": load_model, "generator": generator, "tokenizer": tokenizer, "error": load_error } 254 | except Exception as e: 255 | logger.error(f"error loading model") 256 | load_error = True 257 | print(e) 258 | return { "error": load_error } -------------------------------------------------------------------------------- /modules/turboderp/exllamav2/handler.py: -------------------------------------------------------------------------------- 1 | from exllamav2 import ( 2 | ExLlamaV2, 3 | ExLlamaV2Config, 4 | ExLlamaV2Cache, 5 | ExLlamaV2Tokenizer, 6 | ExLlamaV2Lora 7 | ) 8 | from exllamav2.generator import ( 9 | ExLlamaV2StreamingGenerator, 10 | ExLlamaV2Sampler 11 | ) 12 | from application.system_info import get_gpu_memory_usage 13 | from huggingface_hub import snapshot_download 14 | from application.llm_handler import LlmHandler 15 | import torch 16 | import time 17 | import logging 18 | import sys 19 | import os 20 | import math 21 | import random 22 | 23 | logger = logging.getLogger(__name__) 24 | 25 | class ExllamaV2Generator(LlmHandler): 26 | def __init__(self): 27 | super().__init__() 28 | self.loras = {} 29 | 30 | def update_config(self, config_data): 31 | current_config = self.model_config 32 | merged_config = {**current_config, **config_data} 33 | self.model_config = merged_config 34 | 35 | def validate(self, request): 36 | is_valid, errors = self.validate_request(request, 'llm') 37 | return is_valid, errors 38 | 39 | def get_token_count(self, input_text): 40 | input_ids = self.tokenizer.encode(input_text) 41 | return input_ids.shape[-1] 42 | 43 | def stream(self, generator, tokenizer, model, prompt, channel, incoming_headers, 44 | outgoing_properties, stops, request, model_data, lora): 45 | 46 | # setup stop conditions 47 | check_stop_token, stop_conditions = self.build_stop_conditions(stops) 48 | 49 | # get starting time 50 | begin_time = time.time() 51 | 52 | # tokenize the prompt 53 | input_ids = tokenizer.encode(prompt) 54 | input_token_count = input_ids.shape[-1] 55 | 56 | # set max new tokens and other params 57 | max_new_tokens, top_p, top_k, seed, temperature, stream_output, debug, stop_key, \ 58 | min_p, mirostat, mirostat_eta, mirostat_tau = self.load_config_settings(input_token_count, request) 59 | 60 | if debug: 61 | print('\033[94m') 62 | print(request) 63 | print(prompt) 64 | print('\033[0m') 65 | 66 | if check_stop_token: 67 | stop_conditions.append(tokenizer.eos_token_id) 68 | 69 | if seed != -1: random.seed(seed) 70 | generator.warmup() 71 | 
generator.set_stop_conditions(stop_conditions) 72 | settings = ExLlamaV2Sampler.Settings() 73 | settings.temperature = temperature 74 | settings.top_k = top_k 75 | settings.top_p = top_p 76 | settings.min_p = min_p 77 | if mirostat != 0: 78 | settings.mirostat = True 79 | settings.mirostat_tau = mirostat_tau 80 | settings.mirostat_eta = mirostat_eta 81 | 82 | #settings.token_repetition_penalty = 1.05 83 | socket_id = incoming_headers["socket_id"] if "socket_id" in incoming_headers else None 84 | 85 | if "start_response" in request and stream_output: 86 | channel.basic_publish( 87 | exchange=incoming_headers['return_exchange'], 88 | routing_key=incoming_headers['return_routing_key'], 89 | body=request["start_response"], properties=outgoing_properties) 90 | finish_reason = "stop" 91 | generated_tokens = 0 92 | stop_generation_counter = 0 93 | generator.begin_stream(input_ids, settings, loras = lora) 94 | response = "" 95 | while True: 96 | chunk, eos, _ = generator.stream() 97 | if eos: break 98 | 99 | generated_tokens += 1 100 | stop_generation, stop_generation_counter = self.check_stop_generation(stop_generation_counter, 101 | model_data["stop_generation_event"], model_data["stop_generation_filter"], socket_id) 102 | if stop_generation: 103 | finish_reason = "abort" 104 | break 105 | 106 | if generated_tokens >= max_new_tokens: 107 | finish_reason = 'length' 108 | break 109 | 110 | # send chunk to front end 111 | if stream_output: 112 | if debug: 113 | print('\033[96m' + chunk, end="") 114 | sys.stdout.flush() 115 | 116 | channel.basic_publish( 117 | exchange=incoming_headers['return_exchange'], 118 | routing_key=incoming_headers['return_routing_key'], 119 | body=chunk, properties=outgoing_properties) 120 | else: 121 | response += chunk 122 | 123 | if debug and stream_output: 124 | print('\033[0m' + "") 125 | 126 | # finish_reason keeps "abort" or "length" if the loop exited early 127 | end_time = time.time() 128 | elapsed = end_time - begin_time 129 | token_rate = 0 if elapsed == 0 else (generated_tokens / elapsed) 130 | model_name = incoming_headers["model_name"] if "model_name" in incoming_headers else "not_provided" 131 | return self.finish_response(stop_key, response, request, stream_output, finish_reason, 132 | token_rate, generated_tokens, input_token_count, model_name, elapsed, debug) 133 | 134 | 135 | def load_lora(self, request, model, config): 136 | 137 | # load lora from config and override w/ request if present 138 | lora_name = config["default_lora"] if "default_lora" in config else None 139 | if "lora" in request: 140 | lora_name = request["lora"] 141 | 142 | if lora_name is not None: 143 | if lora_name not in self.loras: 144 | 145 | logger.info(f"loading lora {lora_name}") 146 | lora_dir = os.path.join(f"data/loras/", lora_name) 147 | if not os.path.exists(lora_dir): 148 | logger.info(f"downloading lora {lora_name} from huggingface") 149 | snapshot_download(repo_id=lora_name, local_dir=lora_dir, cache_dir='data/cache', local_dir_use_symlinks=False) 150 | 151 | lora = ExLlamaV2Lora.from_directory(model["model_loaded"], lora_dir) 152 | self.loras[lora_name] = lora 153 | else: 154 | logger.info(f"using lora {lora_name}") 155 | 156 | return self.loras[lora_name] 157 | 158 | return None 159 | 160 | def execute(self, model, request): 161 | config = self.model_config 162 | 163 | # build the prompt 164 | prompt = self.build_prompt(request, config, model) 165 | 166 | incoming_headers = model["amqp_headers"] 166 | outgoing_properties = self.copy_queue_headers(incoming_headers) 167 | 168 | # lora code 169 | lora = self.load_lora(request, model,
self.model_config) 170 | 171 | # last string to send after done streaming output 172 | stream_resp = self.stream( 173 | model["generator"], 174 | model["tokenizer"], 175 | model["model_loaded"], 176 | prompt, 177 | model["amqp_channel"], 178 | incoming_headers, 179 | outgoing_properties, 180 | config["stop_on"], 181 | request, 182 | model, 183 | lora) 184 | 185 | return stream_resp 186 | 187 | def load(self, model, model_options, local_path): 188 | self.model_config = model["configuration"] 189 | load_error = False 190 | try: 191 | model_path = local_path 192 | if "branch" in model["model"][0] and model_options["use_precision"] in model["model"][0]["branch"]: 193 | branch_path = model["model"][0]["branch"][model_options["use_precision"]] 194 | model_path = f"{local_path}/{branch_path}" 195 | 196 | config = ExLlamaV2Config() 197 | config.model_dir = model_path 198 | config.prepare() 199 | 200 | if model_options["device"].startswith("split"): 201 | device_map = model_options["device"].split(':')[1].split(",") 202 | device_map = list(map(int, device_map)) 203 | elif model_options["device"].startswith("cuda"): 204 | device_number = int(model_options["device"].split(':')[1]) 205 | device_array = [0]*12 206 | used_memory, free_memory, total_memory = get_gpu_memory_usage(device_number) 207 | device_array[device_number] = math.floor(total_memory / 1024) 208 | last_non_zero = len(device_array) - 1 209 | while last_non_zero > 0 and device_array[last_non_zero] == 0: 210 | last_non_zero -= 1 211 | device_array = device_array[:last_non_zero + 1] 212 | device_map = device_array 213 | 214 | logger.info(f"starting module {model_path}") 215 | model_loaded = ExLlamaV2(config) 216 | model_loaded.load(gpu_split=device_map) 217 | cache = ExLlamaV2Cache(model_loaded) 218 | tokenizer = ExLlamaV2Tokenizer(config) 219 | generator = ExLlamaV2StreamingGenerator(model_loaded, cache, tokenizer) 220 | self.tokenizer = tokenizer 221 | 222 | logger.info(f'skill {model["routing_key"]} loaded to {model_options["device"]}, precision: {model_options["use_precision"]}') 223 | return { "model_loaded": model_loaded, "generator": generator, "tokenizer": tokenizer, "error": load_error } 224 | except Exception as e: 225 | logger.error(f"error loading model") 226 | print(e) 227 | load_error = True 228 | return { "error": load_error } -------------------------------------------------------------------------------- /requirements-nogpu.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.0 2 | torchaudio==2.1.0 3 | safetensors==0.3.2 4 | sentencepiece>=0.1.97 5 | ninja==1.11.1 6 | tiktoken==0.3.3 7 | numpy==1.22 8 | ninja 9 | hvac 10 | pynvml 11 | psutil 12 | pika 13 | transformers 14 | bitsandbytes 15 | scipy 16 | transformers-stream-generator 17 | jsonschema 18 | omegaconf 19 | Pillow 20 | einops 21 | protobuf 22 | accelerate 23 | diffusers 24 | timm 25 | openai 26 | sseclient-py 27 | TTS==0.22.0 28 | soundfile 29 | llama-cpp-python 30 | compel -------------------------------------------------------------------------------- /requirements.txt: -------------------------------------------------------------------------------- 1 | torch==2.1.0 2 | torchaudio==2.1.0 3 | safetensors==0.3.2 4 | sentencepiece>=0.1.97 5 | ninja==1.11.1 6 | tiktoken==0.3.3 7 | numpy==1.22 8 | ninja 9 | hvac 10 | pynvml 11 | psutil 12 | pika 13 | transformers 14 | bitsandbytes 15 | scipy 16 | transformers-stream-generator 17 | jsonschema 18 | omegaconf 19 | Pillow 20 | einops 21 | protobuf 22 | accelerate 23 | 
diffusers 24 | timm 25 | openai 26 | sseclient-py 27 | TTS==0.22.0 28 | soundfile 29 | exllamav2 30 | compel 31 | chardet -------------------------------------------------------------------------------- /schema/audio-gen.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "prompt": { 5 | "type": "string" 6 | }, 7 | "seconds": { 8 | "type": "number" 9 | }, 10 | "guidance_scale": { 11 | "type": "number" 12 | }, 13 | "progress": { 14 | "type": "boolean", 15 | "default": false 16 | } 17 | }, 18 | "required": ["prompt"] 19 | } -------------------------------------------------------------------------------- /schema/audio-url.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "audio_url": { "type": "string" } 5 | }, 6 | "required": ["audio_url"] 7 | } -------------------------------------------------------------------------------- /schema/img-gen.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "prompt": { 5 | "type": "string" 6 | }, 7 | "height": { 8 | "type": "integer", 9 | "default": 512 10 | }, 11 | "width": { 12 | "type": "integer", 13 | "default": 512 14 | }, 15 | "steps": { 16 | "type": "integer", 17 | "default": 50 18 | }, 19 | "seed": { 20 | "type": "integer", 21 | "default": -1 22 | }, 23 | "progress": { 24 | "type": "boolean", 25 | "default": false 26 | }, 27 | "negative_prompt": { 28 | "type": "string", 29 | "default": "" 30 | }, 31 | "guidance_scale": { 32 | "type": "number", 33 | "default": 7.5 34 | } 35 | }, 36 | "required": ["prompt"] 37 | } 38 | -------------------------------------------------------------------------------- /schema/img-url.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "img_url": { "type": "string" } 5 | }, 6 | "required": ["img_url"] 7 | } -------------------------------------------------------------------------------- /schema/instructor.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "text": { "type": "string" }, 5 | "instruction": { "type": "string" } 6 | }, 7 | "required": ["text", "instruction"] 8 | } -------------------------------------------------------------------------------- /schema/llm.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "max_new_tokens": { "type": "number", "default": 512 }, 5 | "top_p": { "type": "number", "default": 0.9 }, 6 | "top_k": { "type": "number", "default": 50 }, 7 | "min_p": { "type": "number", "default": 0.05 }, 8 | "mirostat": { "type": "number", "default": 0 }, 9 | "mirostat_eta": { "type": "number", "default": 0.1 }, 10 | "mirostat_tau": { "type": "number", "default": 5 }, 11 | "temperature": { "type": "number", "default": 0.9 }, 12 | "seed": { "type": "number", "default": -1 }, 13 | "stream": { "type": "boolean", "default": true }, 14 | "debug": { "type": "boolean" }, 15 | "stop_key": { "type": "string" }, 16 | "lora": { "type": "string" }, 17 | "ai_role": { "type": "string" }, 18 | "user_role": { "type": "string" }, 19 | "start_response": { "type": "string"}, 20 | "raw": { "type": "string"}, 21 | "messages": { 22 | "type": "array", 23 | "items": { 24 | 
"type": "object", 25 | "properties": { 26 | "role": { "type": "string" }, 27 | "content": { "type": "string"} 28 | }, 29 | "required": ["role", "content"] 30 | } 31 | } 32 | }, 33 | "required": ["messages"] 34 | } -------------------------------------------------------------------------------- /schema/visual-qa.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "img_url": { "type": "string" }, 5 | "text": { "type": "string" } 6 | }, 7 | "required": ["img_url", "text"] 8 | } -------------------------------------------------------------------------------- /schema/voice-gen.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "prompt": { 5 | "type": "string" 6 | }, 7 | "progress": { 8 | "type": "boolean", 9 | "default": false 10 | }, 11 | "voice": { 12 | "type": "string" 13 | } 14 | }, 15 | "required": ["prompt"] 16 | } 17 | -------------------------------------------------------------------------------- /schema/zero-shot-img.jsonschema: -------------------------------------------------------------------------------- 1 | { 2 | "type": "object", 3 | "properties": { 4 | "img_url": { "type": "string" }, 5 | "labels": { 6 | "type": "array", 7 | "items": { 8 | "type": "string" 9 | } 10 | } 11 | }, 12 | "required": ["img_url", "labels"] 13 | } --------------------------------------------------------------------------------